Ching-Hsiang Chu, Xiaoyi Lu, Ammar Awan, Hari Subramoni, Bracy Elton, and Dhabaleswar K. Panda. Exploiting hardware multicast and GPUDirect RDMA for efficient broadcast. IEEE Transactions on Parallel and Distributed Systems, 30(3):575–588, March 2019. doi:10.1109/TPDS.2018.2867222.
@article{journals-debu-LuSP17,
  author   = {Chu, Ching-Hsiang and Lu, Xiaoyi and Awan, Ammar and Subramoni, Hari and Elton, Bracy and Panda, Dhabaleswar K.},
  title    = {Exploiting Hardware Multicast and {GPUDirect} {RDMA} for Efficient Broadcast},
  journal  = {IEEE Transactions on Parallel and Distributed Systems},
  year     = {2019},
  volume   = {30},
  number   = {3},
  pages    = {575--588},
  month    = mar,
  doi      = {10.1109/TPDS.2018.2867222},
  issn     = {1558-2183},
  abstract = {Broadcast is a widely used operation in many streaming and deep learning applications to disseminate large amounts of data on emerging heterogeneous High-Performance Computing (HPC) systems. However, traditional broadcast schemes do not fully utilize hardware features for Graphics Processing Unit (GPU)-based applications. In this paper, a model-oriented analysis is presented to identify performance bottlenecks of existing broadcast schemes on GPU clusters. Next, streaming-based broadcast schemes are proposed to exploit InfiniBand hardware multicast (IB-MCAST) and NVIDIA GPUDirect technology for efficient message transmission. The proposed designs are evaluated in the context of using Message Passing Interface (MPI) based benchmarks and applications. The experimental results indicate improved scalability and up to 82 percent reduction of latency compared to the state-of-the-art solutions in the benchmark-level evaluation. Furthermore, compared to the state-of-the-art, the proposed design yields stable higher throughput for a synthetic streaming workload, and 1.3x faster training time for a deep learning framework.},
  keywords = {graphics processing units;learning (artificial intelligence);message passing;neural nets;parallel processing;performance evaluation;model-oriented analysis;GPU clusters;streaming-based broadcast schemes;InfiniBand hardware multicast;IB-MCAST;NVIDIA GPUDirect technology;streaming learning applications;message transmission;deep learning;high-performance computing;graphics processing unit-based applications;message passing interface;remote direct memory access technology;GPUDirect RDMA technology;HPC clusters;performance evaluation;Graphics processing units;Hardware;Analytical models;Machine learning;Clustering algorithms;Scalability;Bandwidth;Broadcast;deep learning;hardware multicast;GPU;GPUDirect RDMA;heterogeneous broadcast;streaming},
}