#!/bin/bash
#SBATCH --gpus=4
#SBATCH -p gpu_h200
#
# Environment setup for the LIBERO training job.
#
# Overridable environment variables (defaults in parentheses):
#   WORK_DIR      directory to run from (current directory)
#   SLURM_NNODES  node count passed to the launcher (1)
#   SLURM_PROCID  node rank passed to the launcher (0)
#   MASTER_ADDR   rendezvous address (127.0.0.1)
#   MASTER_PORT   rendezvous port (29501)
#   model_path    model directory (./models/BAGEL-7B-MoT)
#   results_dir   results directory (./results/libero/libero)
#   ckpt_dir      checkpoint directory (./results/libero/libero/checkpoints)
#   WANDB_API_KEY exported as-is, or as an empty string when unset

WORK_DIR=${WORK_DIR:-"$(pwd)"}
# Every path below is relative, so running from the wrong directory is fatal.
cd "$WORK_DIR" || {
    printf 'error: cannot cd to WORK_DIR=%s\n' "$WORK_DIR" >&2
    exit 1
}

# Best effort: use the miniforge3 module when the module system provides it,
# otherwise fall back to whatever ~/.bashrc sets up.
if ! module load miniforge3/25.11.0-1 >/dev/null 2>&1; then
    source ~/.bashrc 2>/dev/null || true
fi

# Load conda's shell integration so that `conda activate` is available.
if conda_base=$(conda info --base) && [[ -f "${conda_base}/etc/profile.d/conda.sh" ]]; then
    source "${conda_base}/etc/profile.d/conda.sh"
else
    printf 'warning: could not locate conda.sh; environment activation may fail\n' >&2
fi

# Best effort: the CUDA module is optional.
module load cuda/12.8 2>/dev/null || true

# Prefer `conda activate`; keep the legacy `source activate` as a fallback.
# Failure stays non-fatal so an already-prepared environment still works.
if ! conda activate uni-plan; then
    source activate uni-plan \
        || printf 'warning: failed to activate conda environment uni-plan\n' >&2
fi

# Rendezvous settings, defaulting to a single-node run on localhost.
export SLURM_NNODES=${SLURM_NNODES:-1}
export SLURM_PROCID=${SLURM_PROCID:-0}
export MASTER_ADDR=${MASTER_ADDR:-'127.0.0.1'}
export MASTER_PORT=${MASTER_PORT:-29501}

model_path=${model_path:-"./models/BAGEL-7B-MoT"}
results_dir=${results_dir:-"./results/libero/libero"}
ckpt_dir=${ckpt_dir:-"./results/libero/libero/checkpoints"}

# NOTE(review): this unconditionally overrides any device list already set by
# the scheduler; it matches the 4 GPUs requested above.
export CUDA_VISIBLE_DEVICES=0,1,2,3
export WANDB_API_KEY=${WANDB_API_KEY:-""}

# Launch train/pretrain_unified_navit.py through torchrun with 4 worker
# processes on this node. Node count, node rank and the rendezvous
# address/port are taken from SLURM_NNODES, SLURM_PROCID, MASTER_ADDR and
# MASTER_PORT. $model_path is passed as both --model_path and --resume-from;
# $results_dir and $ckpt_dir are passed as the results and checkpoint
# directories.
# The variables expand unquoted, so their values must not contain whitespace.
# Every line ends with a continuation backslash (including the last one shown
# here), so no comment or blank line may be inserted between the arguments.
torchrun \
  --nnodes=$SLURM_NNODES \
  --node_rank=$SLURM_PROCID \
  --nproc_per_node=4 \
  --master_addr=$MASTER_ADDR \
  --master_port=$MASTER_PORT \
  train/pretrain_unified_navit.py \
  --dataset_config_file ./data/configs/libero/object.yaml \
  --wandb_name "libero_object_scratch" \
  --wandb_runid "0" \
  --wandb_offline True \
  --model_path $model_path \
  --results_dir $results_dir \
  --checkpoint_dir $ckpt_dir \
  --layer_module Qwen2MoTDecoderLayer \
  --max_latent_size 64 \
  --finetune_from_hf True \
  --auto_resume False \
  --resume-model-only True \
  --resume-from $model_path \
  --finetune-from-ema True \
  --log_every 10 \
  --ce_weight 0.01 \
  --lr 2e-5 \
  --num_shard 4 \
  --warmup_steps 500 \
  --total_steps 5000 \
  --save_every 1000 \
  --expected_num_tokens 32768 \
  --max_num_tokens 32768 \
  --max_num_tokens_per_sample 32768 \
  --sharding_strategy "FULL_SHARD" \
  --freeze_vit True \
  --freeze_vae True \
  --visual_und True \
  --visual_gen True \