# Usage: <this script> [BASE_PATH] [CKPT_NAME] [CKPT_DIR] [GPUS_PER_NODE]
#
# Every argument is optional. An argument that is omitted OR passed as an empty
# string falls back to its default, so "" can be used as a placeholder to reach
# a later positional argument.

# 1st arg: project root; tools/, scripts/, data/, processed_data/ and
# checkpoints/ are all addressed relative to it
BASE_PATH="${1:-/home/spectrumKD}"
# 2nd arg: logical checkpoint name for naming outputs (e.g., gpt2-xl)
CKPT_NAME="${2:-gpt2-xl}"
# 3rd arg (optional): absolute path to finetuned model directory to load;
# empty means "use the base checkpoint selected by CKPT_NAME"
CKPT_DIR="${3:-}"
# 4th arg (optional): number of GPUs to use for generation
GPUS_PER_NODE="${4:-1}"

# Exported for the Python tools launched below (presumably to quiet
# TensorFlow's native logging -- confirm if TensorFlow is actually imported).
export TF_CPP_MIN_LOG_LEVEL=3

# Resolve effective checkpoint directory to load.
#
# resolve_ckpt_dir DIR
#   Prints the directory that model weights should be loaded from, or prints
#   nothing when DIR is empty or no checkpoint can be found under it:
#     - DIR itself, if it directly contains one of the expected HF weight files;
#     - otherwise DIR/<N>, where <N> is the numerically largest subdirectory of
#       DIR whose name consists only of digits.
#   Always returns 0; callers distinguish the cases by the (empty) output.
resolve_ckpt_dir() {
  local dir="$1" candidate name best=""

  [ -n "${dir}" ] || return 0

  # Leaf model folder: single-file weights or a sharded-weights index file.
  for candidate in model.safetensors model.safetensors.index.json \
                   pytorch_model.bin pytorch_model.bin.index.json; do
    if [ -f "${dir}/${candidate}" ]; then
      printf '%s\n' "${dir}"
      return 0
    fi
  done

  # Leaf model folder holding safetensors shards (model-*.safetensors).
  for candidate in "${dir}"/model-*.safetensors; do
    # An unmatched glob is left as the literal pattern, so confirm existence.
    if [ -e "${candidate}" ]; then
      printf '%s\n' "${dir}"
      return 0
    fi
  done

  # Otherwise, assume it's a run directory with numeric subfolders; pick the
  # largest. The trailing slash in the glob restricts matches to directories,
  # so a numeric *file* can never shadow a valid numeric subdirectory.
  for candidate in "${dir}"/*/; do
    [ -d "${candidate}" ] || continue
    name="${candidate%/}"
    name="${name##*/}"
    case "${name}" in
      '' | *[!0-9]*) continue ;;
    esac
    if [ -z "${best}" ] || [ "${name}" -ge "${best}" ]; then
      best="${name}"
    fi
  done

  if [ -n "${best}" ]; then
    printf '%s\n' "${dir}/${best}"
  fi
  return 0
}

CKPT_EFF="$(resolve_ckpt_dir "${CKPT_DIR}")"
if [ -n "${CKPT_DIR}" ] && [ -z "${CKPT_EFF}" ]; then
  # Do not fail silently: later steps use the base checkpoint when CKPT_EFF is
  # empty, which is not what was asked for on the command line.
  printf "Warning: no checkpoint found under '%s'; falling back to base checkpoint '%s'\n" \
    "${CKPT_DIR}" "${CKPT_NAME}" >&2
fi

# Run tools/process_data_dolly.py over ${BASE_PATH}/data/dolly/, writing to
# ${BASE_PATH}/processed_data/dolly/full. --model-path is the resolved
# checkpoint directory when there is one, otherwise the base checkpoint
# ${BASE_PATH}/checkpoints/${CKPT_NAME}. All expansions are quoted so paths
# containing whitespace or glob characters are passed through as single
# arguments.
PYTHONPATH="${BASE_PATH}" python3 "${BASE_PATH}/tools/process_data_dolly.py" \
    --data-dir "${BASE_PATH}/data/dolly/" \
    --processed-data-dir "${BASE_PATH}/processed_data/dolly/full" \
    --model-path "${CKPT_EFF:-${BASE_PATH}/checkpoints/${CKPT_NAME}}" \
    --data-process-workers 32 \
    --max-prompt-length 256 \
    --dev-num -1 \
    --model-type gpt2

# Run generation using resolved checkpoint (if any).
# Positional arguments handed to generate_data_dolly.sh, in order:
#   BASE_PATH, 2112, GPUS_PER_NODE, CKPT_NAME [, resolved checkpoint dir]
# NOTE(review): 2112 is presumably the distributed master port -- confirm
# against generate_data_dolly.sh.
# Every argument is quoted so that no value is word-split and an empty value
# cannot silently shift the arguments that follow it.
if [ -n "${CKPT_EFF}" ]; then
  echo "Using checkpoint: ${CKPT_EFF} with ${GPUS_PER_NODE} GPU(s)"
  bash "${BASE_PATH}/scripts/gpt2/tools/generate_data_dolly.sh" \
    "${BASE_PATH}" 2112 "${GPUS_PER_NODE}" "${CKPT_NAME}" "${CKPT_EFF}"
else
  # Fall back to base checkpoints by name
  bash "${BASE_PATH}/scripts/gpt2/tools/generate_data_dolly.sh" \
    "${BASE_PATH}" 2112 "${GPUS_PER_NODE}" "${CKPT_NAME}"
fi

# Derive assessment save and input path names.
#
# assess_name_for CKPT_NAME CKPT_DIR CKPT_EFF
#   Prints the name used to locate generated answers and to store assessment
#   results:
#     - "<parent>/<basename>" of CKPT_DIR when a checkpoint directory was
#       actually resolved (CKPT_EFF non-empty), to avoid collisions between
#       runs that share the same CKPT_NAME;
#     - CKPT_NAME otherwise. This includes the case where CKPT_DIR was given
#       but nothing could be resolved from it: generation then runs against
#       the base checkpoint by name, so assessment must use that name too.
#   NOTE(review): assumes generate_data_dolly.sh writes its output under the
#   same name -- confirm against that script.
assess_name_for() {
  local ckpt_name="$1" ckpt_dir="$2" ckpt_eff="$3"

  if [ -z "${ckpt_dir}" ] || [ -z "${ckpt_eff}" ]; then
    printf '%s\n' "${ckpt_name}"
    return 0
  fi
  printf '%s/%s\n' \
    "$(basename -- "$(dirname -- "${ckpt_dir}")")" \
    "$(basename -- "${ckpt_dir}")"
}

ASSESS_NAME="$(assess_name_for "${CKPT_NAME}" "${CKPT_DIR}" "${CKPT_EFF}")"

# Run tools/assess.py on the generated answers for ${ASSESS_NAME}, comparing
# against processed_data/dolly/full/gpt2/train.jsonl and saving results under
# processed_data/dolly/full/gpt2/${ASSESS_NAME}/. All expansions are quoted
# because ASSESS_NAME may be built from directory names, which can contain
# whitespace.
PYTHONPATH="${BASE_PATH}" python3 "${BASE_PATH}/tools/assess.py" \
    --generate-data-dir "${BASE_PATH}/processed_data/dolly/full/generate_data/dolly-512/${ASSESS_NAME}/10/answers.jsonl" \
    --truth-data-dir "${BASE_PATH}/processed_data/dolly/full/gpt2/train.jsonl" \
    --save-dir "${BASE_PATH}/processed_data/dolly/full/gpt2/${ASSESS_NAME}/" \
    --model-name "${ASSESS_NAME}"





	


    