@comment{Rendered citation (for reference only; the entry below is authoritative):
Md. Wasi-ur-Rahman, Nusrat Sharmin Islam, Xiaoyi Lu, and Dhabaleswar K. Panda. A comprehensive study of mapreduce over lustre for intermediate data placement and shuffle strategies on hpc clusters. IEEE Transactions on Parallel and Distributed Systems, 28(3):633-646, March 2017. doi:10.1109/TPDS.2016.2591947.}
@article{journals-tpds-Wasi-ur-RahmanI17,
  author   = {Wasi-ur-Rahman, Md. and Islam, Nusrat Sharmin and Lu, Xiaoyi and Panda, Dhabaleswar K.},
  title    = {A Comprehensive Study of {MapReduce} Over {Lustre} for Intermediate Data Placement and Shuffle Strategies on {HPC} Clusters},
  journal  = {IEEE Transactions on Parallel and Distributed Systems},
  year     = {2017},
  volume   = {28},
  number   = {3},
  pages    = {633--646},
  month    = mar,
  doi      = {10.1109/TPDS.2016.2591947},
  issn     = {1558-2183},
  abstract = {With high performance interconnects and parallel file systems, running MapReduce over modern High Performance Computing (HPC) clusters has attracted much attention due to its uniqueness of solving data analytics problems with a combination of Big Data and HPC technologies. Since the MapReduce architecture relies heavily on the availability of local storage media, the Lustre-based global storage in HPC clusters poses many new opportunities and challenges. In this paper, we perform a comprehensive study on different MapReduce over Lustre deployments and propose a novel high-performance design of YARN MapReduce on HPC clusters by utilizing Lustre as the additional storage provider for intermediate data. With a deployment architecture where both local disks and Lustre are utilized for intermediate data storage, we propose a novel priority directory selection scheme through which RDMA-enhanced MapReduce can choose the best intermediate storage during runtime by on-line profiling. Our results indicate that, we can achieve 44 percent performance benefit for shuffle-intensive workloads in leadership-class HPC systems. Our priority directory selection scheme can improve the job execution time by 63 percent over default MapReduce while executing multiple concurrent jobs. To the best of our knowledge, this is the first such comprehensive study for YARN MapReduce with Lustre and RDMA.},
  keywords = {Big Data;distributed databases;network operating systems;parallel architectures;resource allocation;storage management;storage media;workstation clusters;intermediate data placement;HPC clusters;high performance interconnects;parallel file systems;high performance computing clusters;data analytics;Big Data;HPC technologies;local storage media;Lustre-based global storage;MapReduce over Lustre deployments;high-performance YARN MapReduce design;storage provider;intermediate data storage;priority directory selection;RDMA-enhanced MapReduce;shuffle-intensive workloads;leadership-class HPC systems;job execution;Computer architecture;Servers;High performance computing;Data analysis;Big data;Memory;Big data;high performance computing;RDMA;MapReduce;lustre},
}