#!/bin/bash

# Interrupt handling: signal the direct children of this shell (pkill -P $$),
# reap them, and leave with a non-zero status so callers can tell the run was
# interrupted. A bare `exit` here would report the status of the preceding
# echo, i.e. success.
#   $1 - exit status to terminate with (128 + signal number by convention).
handle_interrupt() {
  echo "收到中断信号，正在清理所有子进程..."
  pkill -P "$$"
  wait
  echo "清理完成，退出脚本"
  exit "$1"
}
trap 'handle_interrupt 130' INT
trap 'handle_interrupt 143' TERM

# Inference scripts to run (currently ten), in scheduling order.
SCRIPTS=(
  "infer_ckpt_gsm_base_50_4_0.sh"
  "infer_ckpt_gsm_base_100_8_0.sh"
  "infer_ckpt_gsm_base_150_12_0.sh"
  "infer_ckpt_gsm_base_200_16_0.sh"
  "infer_ckpt_gsm_layer_50_4_4.sh"
  "infer_ckpt_gsm_layer_50_4_8.sh"
  "infer_ckpt_gsm_layer_50_4_12.sh"
  "infer_ckpt_gsm_layer_100_8_8.sh"
  "infer_ckpt_gsm_layer_100_8_16.sh"
  "infer_ckpt_gsm_layer_100_8_24.sh"
)

# Directory that receives the per-task and per-GPU logs.
mkdir -p logs

# Report how many scripts were configured.
SCRIPT_COUNT=${#SCRIPTS[@]}
echo "检测到 ${SCRIPT_COUNT} 个脚本需要运行"

# Rewrite the relative igsm dataset path inside the Python tool to an absolute
# one. The file is edited in place, so a backup is taken first.
RUN_LOGITS_PY=/workspace/megatron/tools/run_logits_gsm.py
echo "修复Python脚本中的路径..."
# Only take the backup once: on a later run the file is already patched, and
# copying it again would replace the pristine backup with the patched version.
if [ ! -e "${RUN_LOGITS_PY}.backup" ]; then
  cp "${RUN_LOGITS_PY}" "${RUN_LOGITS_PY}.backup"
fi
# Announce success only when sed actually succeeded; otherwise warn and carry
# on, as the original flow did not abort here either.
if sed -i 's#\./datasets/igsm/#/workspace/megatron/datasets/igsm/#g' "${RUN_LOGITS_PY}"; then
  echo "Python脚本路径修复完成"
else
  echo "警告: 无法修复 ${RUN_LOGITS_PY} 中的路径" >&2
fi

#######################################
# Write a patched copy of an inference script in which relative results/,
# datasets/ and tools/ paths point below /workspace/megatron, then pre-create
# the directories that the patched copy mentions.
# Arguments:
#   $1 - path of the original script (never modified)
#   $2 - path of the patched copy to create
# Returns:
#   1 when the original cannot be copied; otherwise the status of the last
#   directory-creation pipeline.
#######################################
fix_script_paths() {
  local script_path=$1
  local tmp_script_path=$2
  local dir ckpt_dir

  # Work on a copy; without it none of the following steps can succeed.
  cp "${script_path}" "${tmp_script_path}" || return 1

  # 1+2. results/ and datasets/: the dot is escaped so that only a literal
  # "./" prefix is rewritten. Unescaped, "." matches any character and mangles
  # paths that are already absolute (".../megatron/results/").
  sed -i \
    -e 's|\./results/|/workspace/megatron/results/|g' \
    -e 's|\./datasets/|/workspace/megatron/datasets/|g' \
    "${tmp_script_path}"

  # 3. tools/: rewrite only when "tools/" (optionally "./tools/") starts a
  # path, i.e. sits at line start or after a character that cannot belong to a
  # path. This leaves "mytools/", "$DIR/tools/" and absolute tools paths alone.
  sed -i -E \
    's#(^|[^[:alnum:]_./-])(\./)?tools/#\1/workspace/megatron/tools/#g' \
    "${tmp_script_path}"

  # 4. Create the first-level directories referenced under results/. Quotes
  # are excluded from the match so "…/results/foo" in a quoted assignment does
  # not yield a directory whose name ends in a quote character.
  grep -o "/workspace/megatron/results/[^/[:space:]\"']*" "${tmp_script_path}" \
    | sort -u \
    | while IFS= read -r dir; do
        mkdir -p -- "${dir}"
      done

  # Same for datasets/, skipping entries that name .txt/.json files rather
  # than directories.
  grep -o "/workspace/megatron/datasets/[^/[:space:]\"']*" "${tmp_script_path}" \
    | grep -v "\.txt\|\.json" \
    | sort -u \
    | while IFS= read -r dir; do
        mkdir -p -- "${dir}"
      done

  # 5. Create every directory assigned to CHECKPOINT_BASE="…"; an empty value
  # is skipped because mkdir rejects an empty path.
  grep -o "CHECKPOINT_BASE=\"[^\"]*\"" "${tmp_script_path}" \
    | sed -e 's/^CHECKPOINT_BASE="//' -e 's/"$//' \
    | while IFS= read -r ckpt_dir; do
        if [ -n "${ckpt_dir}" ]; then
          mkdir -p -- "${ckpt_dir}"
        fi
      done
}

# PIDs of the background wrapper processes, one per GPU.
declare -a CHILD_PIDS=()

# Number of scripts assigned to each GPU, indexed by GPU id:
# GPUs 0 and 1 run three tasks each, GPUs 2 and 3 run two each.
SCRIPTS_PER_GPU=(3 3 2 2)

echo "GPU分配: GPU0: ${SCRIPTS_PER_GPU[0]}个任务, GPU1: ${SCRIPTS_PER_GPU[1]}个任务, GPU2: ${SCRIPTS_PER_GPU[2]}个任务, GPU3: ${SCRIPTS_PER_GPU[3]}个任务"

# Scripts beyond the combined per-GPU quota are never scheduled by the loop
# below, so warn about it instead of dropping them silently.
TOTAL_SLOTS=0
for SLOTS in "${SCRIPTS_PER_GPU[@]}"; do
  TOTAL_SLOTS=$((TOTAL_SLOTS + SLOTS))
done
if [ "${TOTAL_SLOTS}" -lt "${#SCRIPTS[@]}" ]; then
  echo "警告: GPU任务配额总数 (${TOTAL_SLOTS}) 小于脚本数量 (${#SCRIPTS[@]})，多余的脚本不会被运行" >&2
fi

# Index into SCRIPTS of the next script that has not been assigned yet.
CURRENT_SCRIPT_IDX=0

# For every GPU: generate a wrapper script that runs its share of the scripts
# one after another, start the wrapper in the background, and report what was
# scheduled. The here-documents are unquoted on purpose: plain ${...} is
# expanded now (while generating), \$... is left for the wrapper to expand.
for GPU_ID in "${!SCRIPTS_PER_GPU[@]}"; do
  TASKS_FOR_THIS_GPU=${SCRIPTS_PER_GPU[$GPU_ID]}
  # First SCRIPTS index handled by this GPU and how many were really assigned;
  # both are needed for the summary line after the launch.
  START_IDX=${CURRENT_SCRIPT_IDX}
  ASSIGNED=0

  WRAPPER_SCRIPT="gpu${GPU_ID}_wrapper.sh"

  # Wrapper header: pins the GPU, installs a trap, derives a per-GPU port and
  # defines fix_script_paths once for all tasks of this wrapper.
  # NOTE(review): bash runs a trap only after the command it is currently
  # waiting for has returned, so the generated trap does not stop a task that
  # is already running - confirm whether that is acceptable.
  cat > "${WRAPPER_SCRIPT}" << EOF
#!/bin/bash

# 设置环境变量
export CUDA_VISIBLE_DEVICES=${GPU_ID}

# 添加信号处理
trap 'echo "包装脚本收到中断信号，正在清理..."; exit' INT TERM

# 创建日志目录
mkdir -p logs

# 修改master_port以避免冲突
PORT=$((6000 + GPU_ID))

$(declare -f fix_script_paths)

EOF

  # One section per assigned script: patch a temporary copy, switch it to this
  # GPU's port, run it with its own log file and record the exit status.
  for ((i = 1; i <= TASKS_FOR_THIS_GPU; i++)); do
    if [ "${CURRENT_SCRIPT_IDX}" -ge "${#SCRIPTS[@]}" ]; then
      break
    fi
    SCRIPT=${SCRIPTS[$CURRENT_SCRIPT_IDX]}

    cat >> "${WRAPPER_SCRIPT}" << EOF
# 为任务 $i 创建修改后的临时脚本
TMP_SCRIPT$i="tmp_${SCRIPT}"

# 复制并修改脚本的路径
echo "修复 ${SCRIPT} 中的路径..."
fix_script_paths "${SCRIPT}" "\${TMP_SCRIPT$i}"

# 修改master_port以避免冲突
sed -i "s/--master_port 6000/--master_port \${PORT}/" "\${TMP_SCRIPT$i}"

# 运行脚本
echo "[\$(date)] 开始在 GPU ${GPU_ID} 上运行脚本 ${SCRIPT}..." | tee -a "logs/gpu${GPU_ID}_tasks.log"
bash "\${TMP_SCRIPT$i}" > "logs/${SCRIPT%.sh}_gpu${GPU_ID}.log" 2>&1
SCRIPT${i}_EXIT_CODE=\$?
echo "[\$(date)] 脚本 ${SCRIPT} 在 GPU ${GPU_ID} 上完成 (退出代码: \$SCRIPT${i}_EXIT_CODE)" | tee -a "logs/gpu${GPU_ID}_tasks.log"

# 检查脚本是否成功
if [ \$SCRIPT${i}_EXIT_CODE -ne 0 ]; then
  echo "[\$(date)] 警告: 脚本 ${SCRIPT} 出错，退出代码: \$SCRIPT${i}_EXIT_CODE" | tee -a "logs/gpu${GPU_ID}_tasks.log"
fi

EOF
    CURRENT_SCRIPT_IDX=$((CURRENT_SCRIPT_IDX + 1))
    ASSIGNED=$i
  done

  # Wrapper footer: remove the temporary copies that were actually created.
  cat >> "${WRAPPER_SCRIPT}" << EOF
# 清理临时文件
EOF

  for ((i = 1; i <= ASSIGNED; i++)); do
    cat >> "${WRAPPER_SCRIPT}" << EOF
rm -f "\${TMP_SCRIPT$i}"
EOF
  done

  cat >> "${WRAPPER_SCRIPT}" << EOF

echo "[\$(date)] GPU ${GPU_ID} 的所有任务已完成" | tee -a "logs/gpu${GPU_ID}_tasks.log"
EOF

  chmod +x "${WRAPPER_SCRIPT}"

  # Start the wrapper in the background and remember its PID for the wait loop.
  ./"${WRAPPER_SCRIPT}" &
  CHILD_PID=$!
  CHILD_PIDS+=("${CHILD_PID}")

  # Summary of what this GPU was given. START_IDX is a plain global here:
  # `local` is only valid inside a function and would leave it unset.
  echo -n "已在 GPU ${GPU_ID} 上安排脚本："
  for SCRIPT in "${SCRIPTS[@]:${START_IDX}:${ASSIGNED}}"; do
    echo -n " ${SCRIPT}"
  done
  echo " (PID: $CHILD_PID)"
done

echo "所有GPU任务已启动。您可以使用以下命令监控进度："
echo "  - 查看GPU使用情况: nvidia-smi"
echo "  - 查看总体进度: tail -f logs/gpu*_tasks.log"
echo "  - 查看特定任务: tail -f logs/具体任务名.log"

# Block until every wrapper has finished.
echo "等待所有子进程完成..."
for pid in "${CHILD_PIDS[@]}"; do
  wait "${pid}"
  echo "子进程 $pid 已完成"
done

echo "所有任务已完成!"

# Remove the generated wrapper scripts.
rm -f gpu*_wrapper.sh

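################################################################################################################################
# Everything below this line is a commented-out earlier variant of the script
# above (12 scripts, three per GPU). It is not executed.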


# #!/bin/bash

# # 添加信号处理，捕获中断并清理所有子进程
# trap 'echo "收到中断信号，正在清理所有子进程..."; pkill -P $$; wait; echo "清理完成，退出脚本"; exit' INT TERM

# # 定义脚本文件列表
# SCRIPTS=(
#   "infer_ckpt_gsm_base_50_4_0.sh"
#   "infer_ckpt_gsm_base_100_8_0.sh"
#   "infer_ckpt_gsm_base_150_12_0.sh"
#   "infer_ckpt_gsm_base_200_16_0.sh"
#   "infer_ckpt_gsm_cycle_50_4_4.sh"
#   "infer_ckpt_gsm_cycle_50_4_8.sh"
#   "infer_ckpt_gsm_cycle_50_4_12.sh"
#   "infer_ckpt_gsm_cycle_100_8_8.sh"
#   "infer_ckpt_gsm_layer_50_4_4.sh"
#   "infer_ckpt_gsm_layer_50_4_8.sh"
#   "infer_ckpt_gsm_layer_50_4_12.sh"
#   "infer_ckpt_gsm_layer_100_8_8.sh"
# )

# # 创建日志目录
# mkdir -p logs

# # 检查是否有12个脚本
# if [ ${#SCRIPTS[@]} -ne 12 ]; then
#   echo "错误：脚本数量必须为12个，当前有 ${#SCRIPTS[@]} 个"
#   exit 1
# fi

# # 修复Python脚本中的路径
# echo "修复Python脚本中的路径..."
# cp /workspace/megatron/tools/run_logits_gsm.py /workspace/megatron/tools/run_logits_gsm.py.backup
# sed -i 's#\./datasets/igsm/#/workspace/megatron/datasets/igsm/#g' /workspace/megatron/tools/run_logits_gsm.py
# echo "Python脚本路径修复完成"

# # 创建一个修复每个原始脚本中路径的函数
# fix_script_paths() {
#   local script_path=$1
#   local tmp_script_path=$2
  
#   # 复制原始脚本
#   cp "${script_path}" "${tmp_script_path}"
  
#   # 修改路径引用
#   # 1. 修改results路径
#   sed -i 's|./results/|/workspace/megatron/results/|g' "${tmp_script_path}"
  
#   # 2. 修改datasets路径
#   sed -i 's|./datasets/|/workspace/megatron/datasets/|g' "${tmp_script_path}"
  
#   # 3. 修改tools路径，确保tools路径是相对于当前目录的
#   sed -i 's|tools/|/workspace/megatron/tools/|g' "${tmp_script_path}"
  
#   # 4. 确保目录存在，但不要尝试为文件创建目录
#   grep -o "/workspace/megatron/results/[^/ ]*" "${tmp_script_path}" | sort -u | while read dir; do
#     mkdir -p "${dir}"
#   done
  
#   # 只为实际目录创建mkdir命令，排除文件路径（如.txt和.json文件）
#   grep -o "/workspace/megatron/datasets/[^/ ]*" "${tmp_script_path}" | grep -v "\.txt\|\.json" | sort -u | while read dir; do
#     mkdir -p "${dir}"
#   done
  
#   # 5. 创建必要的ckpts子目录
#   grep -o "CHECKPOINT_BASE=\"[^\"]*\"" "${tmp_script_path}" | sed 's/CHECKPOINT_BASE="//' | sed 's/"//' | while read ckpt_dir; do
#     mkdir -p "${ckpt_dir}"
#   done
# }

# # 存储所有子进程PID
# declare -a CHILD_PIDS

# # 计算每个GPU运行的脚本数量
# SCRIPTS_PER_GPU=3

# # 为每个GPU创建运行脚本
# for GPU_ID in {0..3}; do
#   # 计算该GPU要运行的脚本起始索引
#   START_IDX=$((GPU_ID * SCRIPTS_PER_GPU))
  
#   # 创建一个运行这些脚本的包装脚本
#   WRAPPER_SCRIPT="gpu${GPU_ID}_wrapper.sh"
  
#   cat > "${WRAPPER_SCRIPT}" << EOF
# #!/bin/bash

# # 设置环境变量
# export CUDA_VISIBLE_DEVICES=${GPU_ID}

# # 添加信号处理
# trap 'echo "包装脚本收到中断信号，正在清理..."; exit' INT TERM

# # 创建日志目录
# mkdir -p logs

# # 修改master_port以避免冲突
# PORT=$((6000 + GPU_ID))

# EOF

#   # 为每个脚本添加运行命令
#   for i in $(seq 0 $((SCRIPTS_PER_GPU-1))); do
#     SCRIPT_IDX=$((START_IDX + i))
#     if [ $SCRIPT_IDX -lt ${#SCRIPTS[@]} ]; then
#       SCRIPT=${SCRIPTS[$SCRIPT_IDX]}
      
#       cat >> "${WRAPPER_SCRIPT}" << EOF
# # 为任务 $((i+1)) 创建修改后的临时脚本
# TMP_SCRIPT$((i+1))="tmp_${SCRIPT}"

# # 复制并修改脚本的路径
# echo "修复 ${SCRIPT} 中的路径..."
# $(declare -f fix_script_paths)
# fix_script_paths "${SCRIPT}" "\${TMP_SCRIPT$((i+1))}"

# # 修改master_port以避免冲突
# sed -i "s/--master_port 6000/--master_port \${PORT}/" "\${TMP_SCRIPT$((i+1))}"

# # 运行脚本
# echo "[\$(date)] 开始在 GPU ${GPU_ID} 上运行脚本 ${SCRIPT}..." | tee -a "logs/gpu${GPU_ID}_tasks.log"
# bash "\${TMP_SCRIPT$((i+1))}" > "logs/${SCRIPT%.sh}_gpu${GPU_ID}.log" 2>&1
# SCRIPT$((i+1))_EXIT_CODE=\$?
# echo "[\$(date)] 脚本 ${SCRIPT} 在 GPU ${GPU_ID} 上完成 (退出代码: \$SCRIPT$((i+1))_EXIT_CODE)" | tee -a "logs/gpu${GPU_ID}_tasks.log"

# # 检查脚本是否成功
# if [ \$SCRIPT$((i+1))_EXIT_CODE -ne 0 ]; then
#   echo "[\$(date)] 警告: 脚本 ${SCRIPT} 出错，退出代码: \$SCRIPT$((i+1))_EXIT_CODE" | tee -a "logs/gpu${GPU_ID}_tasks.log"
# fi

# EOF
#     fi
#   done
  
#   # 添加清理临时文件的命令
#   cat >> "${WRAPPER_SCRIPT}" << EOF
# # 清理临时文件
# EOF

#   for i in $(seq 0 $((SCRIPTS_PER_GPU-1))); do
#     SCRIPT_IDX=$((START_IDX + i))
#     if [ $SCRIPT_IDX -lt ${#SCRIPTS[@]} ]; then
#       cat >> "${WRAPPER_SCRIPT}" << EOF
# rm -f "\${TMP_SCRIPT$((i+1))}"
# EOF
#     fi
#   done
  
#   cat >> "${WRAPPER_SCRIPT}" << EOF

# echo "[\$(date)] GPU ${GPU_ID} 的所有任务已完成" | tee -a "logs/gpu${GPU_ID}_tasks.log"
# EOF
  
#   # 添加执行权限
#   chmod +x "${WRAPPER_SCRIPT}"
  
#   # 在后台启动包装脚本并记录PID
#   ./"${WRAPPER_SCRIPT}" &
#   CHILD_PID=$!
#   CHILD_PIDS+=($CHILD_PID)
  
#   echo -n "已在 GPU ${GPU_ID} 上安排脚本："
#   for i in $(seq 0 $((SCRIPTS_PER_GPU-1))); do
#     SCRIPT_IDX=$((START_IDX + i))
#     if [ $SCRIPT_IDX -lt ${#SCRIPTS[@]} ]; then
#       echo -n " ${SCRIPTS[$SCRIPT_IDX]}"
#     fi
#   done
#   echo " (PID: $CHILD_PID)"
# done

# echo "所有GPU任务已启动。您可以使用以下命令监控进度："
# echo "  - 查看GPU使用情况: nvidia-smi"
# echo "  - 查看总体进度: tail -f logs/gpu*_tasks.log"
# echo "  - 查看特定任务: tail -f logs/具体任务名.log"

# # 等待所有子进程完成
# echo "等待所有子进程完成..."
# for pid in "${CHILD_PIDS[@]}"; do
#   wait $pid
#   echo "子进程 $pid 已完成"
# done

# echo "所有任务已完成!"

# # 清理临时包装脚本
# rm -f gpu*_wrapper.sh

# # 脚本正常退出时不会执行到这里，因为wait命令会一直等待


