#!/bin/bash
# New version of code
export BASE_DATA="${BASE_DATA:-/home//d1/data}"
echo "Saving to $BASE_DATA"

export VAR_DATA=$BASE_DATA/var_diff

export HF_DATASETS_CACHE=$BASE_DATA/cache_hugg
export HF_HOME=$BASE_DATA/cache_hugg
export HF_HUB_CACHE=$BASE_DATA/cache_hugg
export WANDB_DIR=$BASE_DATA/wandb

export LOGDIR=$BASE_DATA/var_diff/logs
mkdir -p $LOGDIR


export WANDB_PROJECT=var-diff-v2

MODEL_NAME=diffusion-reasoning/LLaDA-8B-Instruct-SFT
DATASET="sudoku"
RUN_NAME=d1_SFT_us_${DATASET}
NUM_ITER=8 # number of policy gradient inner updates iterations
RL_RUN_NAME=${RUN_NAME}
# --eval_strategy: steps
# run_train.py
CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch \
    --config_file "wd1/accelerate.yaml" \
    --num_processes 4 \
    --main_process_port 12344 wd1/run_train.py \
    --config "wd1/train.yaml" \
    --model_path $MODEL_NAME \
    --num_iterations $NUM_ITER \
    --dataset $DATASET \
    --trainer_type d1 \
    --run_name $RL_RUN_NAME \
    --wandb_project $WANDB_PROJECT \
    --output_dir $VAR_DATA/checkpoints/${RL_RUN_NAME} \
    > $LOGDIR/$RUN_NAME.log 2>&1 &