import torch

def test_nccl():
    """Smoke-test the NCCL backend on a single GPU.

    Creates a one-rank process group over the NCCL backend, runs an
    ``all_reduce`` on a small random CUDA tensor, and prints the outcome of
    each step. Failures are reported on stdout rather than raised.

    The process group is torn down afterwards only if this function created
    it; a default group that was already initialized by the caller is left
    untouched (in that case initialization fails and is reported).

    Returns:
        None. Results are reported via ``print``.
    """
    if not torch.cuda.is_available():
        print("CUDA is not available.")
        return

    dist = torch.distributed
    # When the distributed package is not compiled in, torch.distributed
    # exposes no API other than is_available(), so bail out before touching
    # init_process_group / is_initialized.
    if not dist.is_available():
        print("NCCL test failed:", "torch.distributed is not available in this PyTorch build.")
        return

    device = torch.device('cuda:0')
    tensor = torch.randn(10, device=device)

    # Set only after init_process_group returns, so cleanup never destroys a
    # process group that existed before this function was called.
    created_group = False
    try:
        # Create a process group; NCCL initialization is invoked even with a
        # single GPU.
        dist.init_process_group(backend='nccl', init_method='tcp://127.0.0.1:29500', rank=0, world_size=1)
        created_group = True
        print("NCCL init success!")

        # Exercise the all_reduce collective.
        dist.all_reduce(tensor)
        print("NCCL all_reduce success:", tensor)

    except Exception as e:
        # Deliberately broad: this is a diagnostic script and any failure
        # should be reported, not propagated.
        print("NCCL test failed:", e)

    finally:
        if created_group and dist.is_initialized():
            dist.destroy_process_group()

# Run the NCCL smoke test only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    test_nccl()