#!/usr/bin/env bash
#
# Launch gpu_bandwidth_test.py under torchrun.
#
# Usage:
#   Run with no arguments to start 2 worker processes (the default).
#   Set NPROC_PER_NODE in the environment to override the worker count,
#   e.g. NPROC_PER_NODE=4.
#
# Environment:
#   NPROC_PER_NODE  Value passed to torchrun's --nproc_per_node option.
#                   Defaults to 2, the value that used to be hard-coded.
#
# Exit status:
#   127 if torchrun is not on PATH; otherwise torchrun's own exit status.
#
# NOTE(review): gpu_bandwidth_test.py is given as a relative path, so it is
# looked up in the current working directory; run this from the directory
# that contains it.
# NOTE(review): presumably each worker drives one GPU, so the count should not
# exceed the number of visible GPUs -- confirm against gpu_bandwidth_test.py.
set -euo pipefail

# Worker count handed to torchrun; an unset or empty variable falls back to 2.
readonly nproc_per_node="${NPROC_PER_NODE:-2}"

# Fail with a clear message, keeping the shell's usual "command not found"
# exit code, instead of relying on the generic shell error.
if ! command -v torchrun >/dev/null 2>&1; then
  printf 'error: torchrun not found in PATH (is PyTorch installed?)\n' >&2
  exit 127
fi

torchrun --nproc_per_node="${nproc_per_node}" gpu_bandwidth_test.py
