Yafan Huang, Sheng Di, Zhaorui Zhang, Xiaoyi Lu, and Guanpeng Li. Versatile Datapath Soft Error Detection on the Cheap for HPC Applications. In Proceedings of the International Conference for High Performance Computing, Networking, Storage, and Analysis, SC '24. IEEE Press, 2024. URL: https://doi.org/10.1109/SC41406.2024.00061, doi:10.1109/SC41406.2024.00061.
% SC '24 conference paper proposing CONDA, a datapath soft error detection
% technique based on code transformation and static program analysis.
% Only the acronym HPC is brace-protected in the title, so bibliography
% styles remain free to apply their own title casing to the other words.
@inproceedings{conf-sc24-yafan,
  author    = {Huang, Yafan and Di, Sheng and Zhang, Zhaorui and Lu, Xiaoyi and Li, Guanpeng},
  title     = {Versatile Datapath Soft Error Detection on the Cheap for {HPC} Applications},
  booktitle = {Proceedings of the International Conference for High Performance Computing, Networking, Storage, and Analysis},
  series    = {SC '24},
  location  = {Atlanta, GA, USA},
  publisher = {IEEE Press},
  year      = {2024},
  articleno = {55},
  numpages  = {15},
  isbn      = {9798350352917},
  doi       = {10.1109/SC41406.2024.00061},
  url       = {https://doi.org/10.1109/SC41406.2024.00061},
  keywords  = {Code Transformation, Compiler, Datapath Protection, High-Performance Computing (HPC), Reliability, Soft Errors},
  abstract  = {With the ongoing reduction in technology sizes and voltage levels, modern microprocessors are increasingly susceptible to soft errors, corrupting datapath units during program execution. While these error types have received considerable attention recently, existing solutions either confine themselves to limited scopes or incur massive overheads in performance and power consumption, hindering practical usage. In this work, we propose CONDA, a novel error detection technique based on code transformation and static program analysis, achieving versatile datapath protection at low cost. At compile time, CONDA analyzes program characteristics and transforms the original program code without complicating its control-flow and memory access patterns. At runtime, CONDA detects datapath errors with low overhead and latency. The evaluation of 38 benchmarks and a parallel HPC simulation reveals that CONDA only incurs 57.79\% runtime overhead, which is 41.84\% faster than existing state-of-the-art, with the same level of error detection effectiveness and low detection latency.},
}