Adam Weingram, Yuke Li, Hao Qi, Darren Ng, Liuyao Dai, and Xiaoyi Lu. xCCL: a survey of industry-led collective communication libraries for deep learning. Journal of Computer Science and Technology, 38(1):166, 2023. Invited Paper for the Special Issue in Honor of Professor Kai Hwang’s 80th Birthday. URL: https://jcst.ict.ac.cn/EN/abstract/article_2965.shtml, doi:10.1007/s11390-023-2894-6.
@comment{
  Journal article: survey of industry-led collective communication
  libraries (xCCL) for deep learning, published in JCST 38(1), 2023.
  The acronym in the title is brace-protected so sentence-casing styles
  do not render it as "Xccl".
  NOTE(review): "pages" records only the first page and "eid" repeats the
  same number; numpages = 29 suggests a multi-page range. TODO confirm the
  last page against the DOI landing page and store a first--last range.
}
@article{journals-jcst-Weingram23,
  author   = {Weingram, Adam and Li, Yuke and Qi, Hao and Ng, Darren and Dai, Liuyao and Lu, Xiaoyi},
  title    = {{xCCL}: A Survey of Industry-Led Collective Communication Libraries for Deep Learning},
  journal  = {Journal of Computer Science and Technology},
  volume   = {38},
  number   = {1},
  pages    = {166},
  eid      = {166},
  numpages = {29},
  year     = {2023},
  doi      = {10.1007/s11390-023-2894-6},
  url      = {https://jcst.ict.ac.cn/EN/abstract/article_2965.shtml},
  keywords = {collective; deep learning; distributed training; GPUDirect; RDMA (remote direct memory access)},
  abstract = {Machine learning techniques have become ubiquitous both in industry and academic applications. Increasing model sizes and training data volumes necessitate fast and efficient distributed training approaches. Collective communications greatly simplify inter- and intra-node data transfer and are an essential part of the distributed training process as information such as gradients must be shared between processing nodes. In this paper, we survey the current state-of-the-art collective communication libraries (namely xCCL, including NCCL, oneCCL, RCCL, MSCCL, ACCL, and Gloo), with a focus on the industry-led ones for deep learning workloads. We investigate the design features of these xCCLs, discuss their use cases in the industry deep learning workloads, compare their performance with industry-made benchmarks (i.e., NCCL Tests and PARAM), and discuss key take-aways and interesting observations. We believe our survey sheds light on potential research directions of future designs for xCCLs.},
  note     = {Invited Paper for the Special Issue in Honor of Professor Kai Hwang's 80th Birthday},
}