#!/bin/bash

# Restrict every child process started by this wrapper to GPU 1.
export CUDA_VISIBLE_DEVICES=1

# Temporary, path-corrected copy of the task 1 script. Assigned before the
# signal traps are installed so the handler always knows which file to delete.
TMP_SCRIPT1="tmp_infer_ckpt_gsm_layer_100_8_16.sh"

#######################################
# Signal handler: announce the interruption, delete the temporary script and
# terminate the wrapper with a non-zero status.
# Globals:   TMP_SCRIPT1 (read)
# Arguments: $1 - exit status to terminate with
# Outputs:   writes a notice to stdout
#######################################
abort_on_signal() {
  echo "包装脚本收到中断信号，正在清理..."
  if [[ -n "${TMP_SCRIPT1:-}" ]]; then
    rm -f -- "${TMP_SCRIPT1}"
  fi
  exit "$1"
}

# Use the conventional 128 + signal number so callers can tell an interrupted
# run from a successful one (a bare `exit` would return the status of `echo`).
trap 'abort_on_signal 130' INT
trap 'abort_on_signal 143' TERM

# Every later step writes into logs/, so stop right away if it cannot exist.
if ! mkdir -p logs; then
  echo "错误: 无法创建日志目录 logs" >&2
  exit 1
fi

# Master port this wrapper substitutes into the task script to avoid conflicts.
PORT=6001

# Announce the path-fixing step that follows.
echo "修复 infer_ckpt_gsm_layer_100_8_16.sh 中的路径..."
#######################################
# Copy a launcher script and rewrite its project-relative paths to absolute
# ones, then pre-create the directories the rewritten copy refers to.
#
# Rewrites are applied only where the path starts a word, so text that is
# already part of a longer path ("/abs/results/", "../tools/", "mytools/")
# is left alone and running the function twice is harmless:
#   ./results/          -> <root>/results/
#   ./datasets/         -> <root>/datasets/
#   tools/ or ./tools/  -> <root>/tools/
#
# Arguments:
#   $1 - script to copy (left untouched)
#   $2 - destination of the rewritten copy (overwritten)
#   $3 - optional project root; defaults to /workspace/megatron
# Outputs:   diagnostics from cp/sed/mkdir on stderr
# Returns:   0 once the copy has been rewritten (directory creation is
#            best-effort); 1 if the copy or the rewrite fails
# Note:      relies on GNU sed (-i, -E) and GNU grep (-o).
#######################################
fix_script_paths() {
  local script_path=$1
  local tmp_script_path=$2
  local root=${3:-/workspace/megatron}
  local root_repl root_re dir

  # A word start: beginning of line or a character that cannot belong to a
  # path component.
  local -r boundary='(^|[^[:alnum:]_./-])'
  # One path component; stops at "/", whitespace, quotes and shell separators
  # so trailing syntax such as a closing quote never ends up in a dir name.
  local -r token='[^/[:space:]"'\'';)|&<>]*'

  cp -- "${script_path}" "${tmp_script_path}" || return 1

  # Escape the root for use in a sed replacement ("#" is the delimiter) and
  # in an extended regular expression respectively.
  root_repl=$(printf '%s' "${root}" | sed 's/[\\&#]/\\&/g') || return 1
  root_re=$(printf '%s' "${root}" | sed 's/[][\\.*^$+?(){}|]/\\&/g') || return 1

  sed -i -E \
    -e "s#${boundary}\./results/#\1${root_repl}/results/#g" \
    -e "s#${boundary}\./datasets/#\1${root_repl}/datasets/#g" \
    -e "s#${boundary}(\./)?tools/#\1${root_repl}/tools/#g" \
    -- "${tmp_script_path}" || return 1

  # Collect the first-level results/datasets directories plus every
  # CHECKPOINT_BASE value and create them. Dataset entries that name a
  # .txt/.json file are not directories; entries that still contain an
  # unexpanded shell expansion ($ or backtick) cannot be resolved here.
  {
    grep -oE -- "${root_re}/results/${token}" "${tmp_script_path}"
    grep -oE -- "${root_re}/datasets/${token}" "${tmp_script_path}" \
      | grep -vE '/datasets/[^/]*\.(txt|json)[^/]*$'
    grep -o -- 'CHECKPOINT_BASE="[^"]*"' "${tmp_script_path}" \
      | sed -e 's/^CHECKPOINT_BASE="//' -e 's/"$//'
  } \
    | grep -v '[$`]' \
    | sort -u \
    | while IFS= read -r dir; do
      [[ -n "${dir}" ]] || continue
      # Best-effort: mkdir reports its own error and the loop carries on.
      mkdir -p -- "${dir}"
    done

  return 0
}
# Build the path-corrected temporary copy of the task 1 script.
fix_script_paths "infer_ckpt_gsm_layer_100_8_16.sh" "${TMP_SCRIPT1}"

# Swap the stock master port (6000) in the copy for this wrapper's port to
# avoid conflicts.
sed -i "s/--master_port 6000/--master_port ${PORT}/" "${TMP_SCRIPT1}"

# Print one timestamped line and append it to the GPU 1 task log.
log_gpu1_task() {
  echo "[$(date)] $1" | tee -a "logs/gpu1_tasks.log"
}

# Run the task; its stdout and stderr both go to a dedicated log file, and the
# exit status is kept for the report below.
log_gpu1_task "开始在 GPU 1 上运行脚本 infer_ckpt_gsm_layer_100_8_16.sh..."
SCRIPT1_EXIT_CODE=0
bash "${TMP_SCRIPT1}" > "logs/infer_ckpt_gsm_layer_100_8_16_gpu1.log" 2>&1 \
  || SCRIPT1_EXIT_CODE=$?
log_gpu1_task "脚本 infer_ckpt_gsm_layer_100_8_16.sh 在 GPU 1 上完成 (退出代码: $SCRIPT1_EXIT_CODE)"

# A failing task is only reported as a warning; the wrapper carries on.
if (( SCRIPT1_EXIT_CODE != 0 )); then
  log_gpu1_task "警告: 脚本 infer_ckpt_gsm_layer_100_8_16.sh 出错，退出代码: $SCRIPT1_EXIT_CODE"
fi

# The temporary copy is no longer needed.
rm -f "${TMP_SCRIPT1}"

log_gpu1_task "GPU 1 的所有任务已完成"
