#!/usr/bin/env bash
# GRPO launcher: prepares the environment and run naming, then starts
# python/grpo_v2.py through `accelerate launch`.

# Strict mode: abort on command failure, on use of an unset variable, and on
# a failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Weights & Biases project for this run.
WANDB_PROJECT="stlm-logic-reasoning"
export WANDB_PROJECT

# Launch timestamp (YYYY-MM-DD_HH-MM-SS); embedded in the run name so every
# launch gets a distinct identifier.
DATE_MARKER_STRING="$(date '+%Y-%m-%d_%H-%M-%S')"

######## user knobs ########
# Model to train. Exactly one MODEL assignment should be active; the other
# candidates are kept commented out for quick switching.
# MODEL="Qwen/Qwen2.5-0.5B-Instruct"
# MODEL="Qwen/Qwen2.5-1.5B-Instruct"
# MODEL="meta-llama/Llama-3.2-1B-Instruct"
# MODEL="Qwen/Qwen2-1.5B-Instruct"

# MODEL="Qwen/Qwen2.5-0.5B-Instruct-canonical-legal-move-ckpt-1800-cont"
# MODEL="Qwen/Qwen2.5-0.5B-Instruct-random-legal-move-ckpt-600-cont"

# MODEL="Qwen/Qwen2.5-1.5B-Instruct-pretrained_legal-move-ckpt-750"

MODEL="meta-llama/Llama-3.2-1B-Instruct_canonical-legal-move-ckpt-600"
# MODEL="meta-llama/Llama-3.2-1B-Instruct_random_legal-move-ckpt-450"


# GRPO recipe YAML. config.yaml is the legal-move recipe (disabled);
# config_best_move.yaml is the one currently in use.
# CFG_YAML="/home/data/stlm-game-logic/configs_v2/grpo/config.yaml"
CFG_YAML="/home/data/stlm-game-logic/configs_v2/grpo/config_best_move.yaml"
# DS_CFG="configs/deepspeed/zero3_bf16_offload.json"

# Port handed to `accelerate launch --main_process_port`.
MASTER_PORT=29516
ACCELERATE_LOG_LEVEL="info"
# Also embedded verbatim in the run name (…_DEBUG_<value>_…).
DEBUG="False"

# Publish the knobs to child processes.
export MODEL CFG_YAML MASTER_PORT ACCELERATE_LOG_LEVEL DEBUG
############################

################### runtime env ###################
# Disabled overrides, kept for reference:
# export CUDA_VISIBLE_DEVICES=${GPU_FOR_GRPO}
# export TOKENIZERS_PARALLELISM=false
# export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:64
# export NCCL_DEBUG=INFO

# NCCL / torch.distributed / CUDA allocator settings for the training process.
#  - NCCL_P2P_DISABLE: author's note - saves GPU RAM on single-node jobs.
#  - NOTE(review): NCCL_DEBUG_SUBSYS is set while NCCL_DEBUG is commented out
#    above; it presumably has no visible effect until NCCL_DEBUG is enabled -
#    confirm.
#  - TRL_VLLM_GROUP_PORT: fixed port, presumably for the TRL <-> vLLM process
#    group - confirm against the TRL version in use.
export \
  NCCL_P2P_DISABLE=1 \
  NCCL_IB_DISABLE=1 \
  TORCH_NCCL_BLOCKING_WAIT=1 \
  TORCH_NCCL_ASYNC_ERROR_HANDLING=1 \
  NCCL_DEBUG_SUBSYS=ALL \
  TORCH_DISTRIBUTED_DEBUG=INFO \
  PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
  TRL_VLLM_GROUP_PORT=61036
###################################################

# Workaround for CUDA version-mismatch issues: point CUDA_HOME at the active
# conda environment so its CUDA toolkit is the one that gets used.
# With `set -u` in effect, an unset CONDA_PREFIX aborts the script here.
export CUDA_HOME="$CONDA_PREFIX"

# Uncomment to verify the value while debugging:
# echo "--- SCRIPT DEBUG: CUDA_HOME is set to: $CUDA_HOME ---"

# Link-time library search path: prepend the environment's CUDA libraries.
# ${VAR:+:${VAR}} appends the previous value only when it is non-empty. A bare
# "dir:${VAR:-}" would leave a trailing ':' when VAR is unset, and an empty
# path entry is interpreted as the current working directory.
export LIBRARY_PATH="$CUDA_HOME/lib${LIBRARY_PATH:+:${LIBRARY_PATH}}"

# Run-time (dynamic loader) search path: same treatment, same reason.
export LD_LIBRARY_PATH="$CUDA_HOME/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"


# Accelerate/DeepSpeed config file handed to `accelerate launch --config_file`.
DEEPSPEED_CONFIG="/home/data/stlm-game-logic/configs_v2/deepspeed_zero3_grpo.yaml"

# Descriptive tags for this experiment. In this script they are only used to
# build the run name below.
DATASET_NAME="canonical-symmetry-grouping"
# DATASET_NAME="random-80-10-10"
DATASET_TYPE="nl"
# DATASET_TYPE="special"
MISC_INFO="lr_1e-6"
# MISC_INFO="lr_1e-6-from-legal"
EXPERIMENT_MODE="best_move"

# Run identifier:
#   <model>_<dataset>_<type>_<mode>_<timestamp>_DEBUG_<debug>_<misc>
# Every '/' in MODEL is replaced by '-' so the model id becomes a single token.
printf -v MODEL_MARK '%s_%s_%s_%s_%s_DEBUG_%s_%s' \
  "${MODEL//\//-}" \
  "${DATASET_NAME}" \
  "${DATASET_TYPE}" \
  "${EXPERIMENT_MODE}" \
  "${DATE_MARKER_STRING}" \
  "${DEBUG}" \
  "${MISC_INFO}"

# Name passed to the trainer via --run_name.
RUN_NAME="H100:/${MODEL_MARK}"

# Name shown in Weights & Biases.
WANDB_NAME="${MODEL_MARK}"
export WANDB_NAME

echo "Starting GRPO…"

# Start GRPO training as a single process pinned to GPU 3.
#  - Options before python/grpo_v2.py are given to `accelerate launch`;
#    everything after the script path is passed to the training script.
#  - The script-level flags below are intended to override the matching values
#    in the YAML passed via --config ($CFG_YAML). The earlier note here named
#    configs/recipes/Qwen2.5-1.5B-Instruct/grpo/config.yaml, which is not the
#    file this script uses.
#  - All expansions are quoted so that a path or run name containing spaces or
#    glob characters stays a single argument.
CUDA_VISIBLE_DEVICES=3 ACCELERATE_LOG_LEVEL=info accelerate launch \
    --main_process_port "$MASTER_PORT" \
    --config_file "$DEEPSPEED_CONFIG" \
    --num_processes 1 \
    python/grpo_v2.py \
    --config "$CFG_YAML" \
    --use_liger_kernel true \
    --eval_accumulation_steps 8 \
    --torch_empty_cache_steps 8 \
    --run_name "$RUN_NAME"

# Optional flags, currently disabled:
#
#   --ds3_gather_for_generation False
#       This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model
#       weights are gathered for generation, improving generation speed.
#       However, disabling this option allows training models that exceed the
#       VRAM capacity of a single GPU, albeit at the cost of slower generation.
#       Disabling this option is not compatible with vLLM generation.
#
#   --torch_compile true
