{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import time\n",
    "\n",
    "import torch\n",
    "import torch.distributed as dist\n",
    "\n",
    "# Benchmark an NCCL point-to-point transfer of size_gb GiB from cuda:0 to cuda:1.\n",
    "# send/recv pair up two different ranks, so this code has to run as two\n",
    "# processes (rank 0 = sender, rank 1 = receiver) with RANK and WORLD_SIZE set\n",
    "# in the environment by the launcher. The defaults mirror the previous\n",
    "# hard-coded rank=0 / world_size=1 setup, which has no rank 1 to send to.\n",
    "rank = int(os.environ.get('RANK', '0'))\n",
    "world_size = int(os.environ.get('WORLD_SIZE', '1'))\n",
    "if world_size < 2:\n",
    "    # Fail before touching NCCL: a lone process has no peer rank to pair with.\n",
    "    raise RuntimeError(f'NCCL send/recv needs at least 2 processes, got WORLD_SIZE={world_size}')\n",
    "\n",
    "# Payload: size_gb GiB of float32 values (4 bytes per element).\n",
    "size_gb = 10\n",
    "size_bytes = size_gb * (1024**3)\n",
    "elements = size_bytes // 4\n",
    "\n",
    "# Bind this process to the GPU matching its rank, then rendezvous over TCP on\n",
    "# the local host (every rank must use the same address and port).\n",
    "torch.cuda.set_device(rank)\n",
    "dist.init_process_group(backend='nccl', init_method='tcp://localhost:23456', rank=rank, world_size=world_size)\n",
    "\n",
    "# Pre-bind both buffers so the cleanup in `finally` is safe even when an\n",
    "# allocation fails (a bare `del` on an unbound name would raise NameError and\n",
    "# skip destroy_process_group()).\n",
    "data = None\n",
    "data_gpu1 = None\n",
    "try:\n",
    "    if rank == 0:\n",
    "        # Sender: build the payload on cuda:0 and wait until it is materialized\n",
    "        # so random-number generation is not counted in the timing.\n",
    "        data = torch.randn(elements, dtype=torch.float32, device='cuda:0')\n",
    "        torch.cuda.synchronize()\n",
    "\n",
    "        # NOTE(review): the first NCCL call may include one-time communicator\n",
    "        # setup; a small warm-up transfer would exclude it from the timing.\n",
    "        start = time.perf_counter()\n",
    "        dist.send(data, dst=1)\n",
    "        # Wait for outstanding GPU work before stopping the clock.\n",
    "        torch.cuda.synchronize()\n",
    "        elapsed = time.perf_counter() - start\n",
    "\n",
    "        print(f'使用NCCL传输 {size_gb}GB 数据从GPU0到GPU1耗时: {elapsed:.3f} 秒')\n",
    "        print(f'传输速度: {size_gb/elapsed:.2f} GB/s')\n",
    "    elif rank == 1:\n",
    "        # Receiver: allocate the destination buffer on cuda:1 and fill it from rank 0.\n",
    "        data_gpu1 = torch.empty(elements, dtype=torch.float32, device='cuda:1')\n",
    "        dist.recv(data_gpu1, src=0)\n",
    "        torch.cuda.synchronize()\n",
    "\n",
    "        # The source tensor lives in the sender process; its shape is known by\n",
    "        # construction because both ranks size their buffer from `elements`.\n",
    "        print(\"\\n验证传输结果：\")\n",
    "        print(f\"源数据形状: {torch.Size([elements])}, 目标数据形状: {data_gpu1.shape}\")\n",
    "finally:\n",
    "    # Drop the tensor references, return cached GPU memory, and always tear\n",
    "    # down the process group.\n",
    "    data = None\n",
    "    data_gpu1 = None\n",
    "    torch.cuda.empty_cache()\n",
    "    dist.destroy_process_group()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Allocating tensor on cuda:2...\n",
      "Starting transfer...\n",
      "Transfer completed in 0.440 seconds.\n",
      "Verifying tensor content...\n",
      "Data verification: Success\n",
      "Memory released.\n"
     ]
    }
   ],
   "source": [
    "import time\n",
    "\n",
    "import torch\n",
    "\n",
    "# Time a direct GPU-to-GPU copy of 10 GiB of float32 data made with Tensor.to().\n",
    "NUM_ELEMENTS = 10 * 1024 ** 3 // 4  # 10 GiB / 4 bytes per float32 element\n",
    "\n",
    "# Source and destination GPUs for the copy.\n",
    "src_device = torch.device('cuda:2')\n",
    "dst_device = torch.device('cuda:3')\n",
    "\n",
    "# Build the payload on the source GPU.\n",
    "print(f\"Allocating tensor on {src_device}...\")\n",
    "src_tensor = torch.randn(NUM_ELEMENTS, dtype=torch.float32, device=src_device)\n",
    "\n",
    "# Drain pending work on both GPUs so the timer only covers the copy itself.\n",
    "torch.cuda.synchronize(src_device)\n",
    "torch.cuda.synchronize(dst_device)\n",
    "\n",
    "# perf_counter is monotonic and high-resolution, unlike wall-clock time.time().\n",
    "print(\"Starting transfer...\")\n",
    "start_time = time.perf_counter()\n",
    "\n",
    "# Tensor.to() performs a plain CUDA device-to-device copy; NCCL is only used\n",
    "# through torch.distributed and is not involved here.\n",
    "dst_tensor = src_tensor.to(dst_device)\n",
    "\n",
    "# The copy may still be in flight when .to() returns; wait on both devices\n",
    "# before stopping the clock.\n",
    "torch.cuda.synchronize(src_device)\n",
    "torch.cuda.synchronize(dst_device)\n",
    "end_time = time.perf_counter()\n",
    "\n",
    "print(f\"Transfer completed in {end_time - start_time:.3f} seconds.\")\n",
    "\n",
    "# A copy must be bit-exact, so compare with torch.equal instead of the\n",
    "# tolerance-based allclose (which would accept small corruptions).\n",
    "print(\"Verifying tensor content...\")\n",
    "is_correct = torch.equal(src_tensor.cpu(), dst_tensor.cpu())\n",
    "print(\"Data verification:\", \"Success\" if is_correct else \"Failed\")\n",
    "\n",
    "# Drop the references and return cached GPU memory.\n",
    "del src_tensor\n",
    "del dst_tensor\n",
    "torch.cuda.empty_cache()\n",
    "print(\"Memory released.\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "recv init:29.266769647598267\n",
      "send init:27.47691249847412\n",
      "recv begin:1743753818.250363\n",
      "发送方数据校验和: -54623.12109375\n",
      "send begin:1743753818.2742946\n",
      "recv end:1743753818.8226726\n",
      "send end:1743753818.8230922\n",
      "从GPU2发送 5GB 数据耗时: 0.549 秒\n",
      "发送速度: 9.11 GB/s\n",
      "GPU3成功接收数据，张量形状: torch.Size([1342177280])\n",
      "原始校验和: -54623.12109375\n",
      "接收数据校验和: -54623.12109375\n",
      "数据传输验证成功！\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import subprocess\n",
    "import sys\n",
    "import time\n",
    "\n",
    "# Run receiver.py and sender.py as two separate processes and wait for both.\n",
    "# Both scripts are resolved relative to the current working directory.\n",
    "children = []\n",
    "try:\n",
    "    # Start the receiver first; sys.executable keeps the children on the same\n",
    "    # interpreter (and environment) as this kernel.\n",
    "    receiver = subprocess.Popen([sys.executable, 'receiver.py'])\n",
    "    children.append(receiver)\n",
    "    time.sleep(2)  # give the receiver a head start before the sender connects\n",
    "\n",
    "    sender = subprocess.Popen([sys.executable, 'sender.py'])\n",
    "    children.append(sender)\n",
    "\n",
    "    # Block until both sides finish and surface non-zero exit codes.\n",
    "    sender_rc = sender.wait()\n",
    "    receiver_rc = receiver.wait()\n",
    "    if sender_rc != 0 or receiver_rc != 0:\n",
    "        print(f\"发生错误: sender 退出码={sender_rc}, receiver 退出码={receiver_rc}\")\n",
    "\n",
    "except Exception as e:\n",
    "    print(f\"发生错误: {e}\")\n",
    "finally:\n",
    "    # Kill only the processes started here. A blanket `pkill -9 python` would\n",
    "    # also take down this kernel and any unrelated Python jobs. Running this in\n",
    "    # `finally` also covers a KeyboardInterrupt while waiting.\n",
    "    for proc in children:\n",
    "        if proc.poll() is None:\n",
    "            proc.kill()\n",
    "            proc.wait()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import time\n",
    "\n",
    "import torch\n",
    "import torch.distributed as dist\n",
    "\n",
    "# Measure how long NCCL process-group initialization takes for this rank.\n",
    "# RANK and WORLD_SIZE must already be present in the environment.\n",
    "init_start = time.time()\n",
    "rank, world_size = (int(os.environ[key]) for key in (\"RANK\", \"WORLD_SIZE\"))\n",
    "# Use the GPU whose index equals this process's rank.\n",
    "torch.cuda.set_device(rank)\n",
    "\n",
    "print(f'rank{rank}: {world_size}')\n",
    "# Join the process group; the elapsed time also covers the env parsing and\n",
    "# device selection above.\n",
    "dist.init_process_group(backend='nccl', rank=rank, world_size=world_size)\n",
    "init_seconds = time.time() - init_start\n",
    "print(f\"rank{rank}: init:{init_seconds}\")\n",
    "\n",
    "dist.destroy_process_group()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "\n",
    "# Boolean mask of length 10 with only the first two entries set.\n",
    "mask = torch.zeros(10, dtype=torch.bool)\n",
    "mask[:2] = True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Index of the first True entry; `mask` comes from an earlier cell.\n",
    "first_true = mask.nonzero()[0, 0]\n",
    "first_true.item()\n"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
