#!/usr/bin/env bash
#
# Environment for the math advisor training launch. The variables exported
# here are read by the launch command further down in this file (DATA_DIR,
# NUM_GPUS, LOGGER) or inherited by the Python process it starts.

# Ray runtime-env hook; the value names Ray's uv hook module.
# NOTE(review): presumably makes Ray workers reuse the uv-managed environment;
# confirm against the Ray documentation.
export RAY_RUNTIME_ENV_HOOK=ray._private.runtime_env.uv_runtime_env_hook.hook

# Put the skyrl-train checkout first on the module search path. The
# ${VAR:+...} expansion appends ":<old value>" only when PYTHONPATH is already
# set and non-empty. A bare "path:$PYTHONPATH" would leave a trailing ':' when
# it is not, i.e. an empty entry, which Python resolves to the current working
# directory.
export PYTHONPATH="/root/advisor-models/SkyRL/skyrl-train${PYTHONPATH:+:${PYTHONPATH}}"

# Directory holding train.parquet and validation.parquet.
export DATA_DIR="/root/advisor-models/data/math"
# GPU count; reused below for the per-node policy/ref placement and for the
# number of inference engines.
export NUM_GPUS=8
export LOGGER="wandb"  # change to "console" to print to stdout
export ADVISOR_MODELS_MODE="advisor"  # "advisor" or "baseline"
export AGENT_MODEL="openai/gpt-4o-mini"  # Student model for advisor mode

# Launch the math advisor training entry point (advisor_models.math.main_math)
# with the interpreter from the skyrl-train virtualenv.
#
# Reads DATA_DIR, NUM_GPUS and LOGGER (exported earlier in this file) and HOME
# (for the checkpoint and export paths). The key=value overrides are collected
# in the positional parameters, in the order they are passed, so that related
# settings can be grouped under a comment. Arguments given to this script
# itself are discarded by the first `set --`; they were never forwarded.
python_bin=/root/advisor-models/SkyRL/skyrl-train/.venv/bin/python
run_name="math_qwen2.5_7b_20ep_paper"

# Dataset files.
set -- \
    "data.train_data=['${DATA_DIR}/train.parquet']" \
    "data.val_data=['${DATA_DIR}/validation.parquet']"

# Algorithm, policy model and training strategy.
set -- "$@" \
    trainer.algorithm.advantage_estimator="grpo" \
    trainer.policy.model.path="Qwen/Qwen2.5-7B-Instruct" \
    trainer.placement.colocate_all=true \
    trainer.strategy=fsdp2

# GPU placement and inference-engine layout, all sized from NUM_GPUS.
set -- "$@" \
    "trainer.placement.policy_num_gpus_per_node=${NUM_GPUS}" \
    "trainer.placement.ref_num_gpus_per_node=${NUM_GPUS}" \
    "generator.num_inference_engines=${NUM_GPUS}" \
    generator.inference_engine_tensor_parallel_size=1

# Epochs, evaluation cadence, batch sizes and checkpoint interval.
set -- "$@" \
    trainer.epochs=20 \
    trainer.eval_batch_size=50 \
    trainer.eval_before_train=true \
    trainer.eval_interval=10 \
    trainer.update_epochs_per_batch=1 \
    trainer.train_batch_size=8 \
    trainer.policy_mini_batch_size=4 \
    trainer.micro_forward_batch_size_per_gpu=1 \
    trainer.micro_train_batch_size_per_gpu=1 \
    trainer.ckpt_interval=10

# Prompt/generation length limits and sampling.
set -- "$@" \
    trainer.max_prompt_length=28672 \
    generator.sampling_params.max_generate_length=4096 \
    generator.sampling_params.top_p=0.999

# Optimizer and KL loss.
set -- "$@" \
    trainer.policy.optimizer_config.lr=1.0e-6 \
    trainer.algorithm.use_kl_loss=true

# Generator backend and environment.
set -- "$@" \
    generator.backend=vllm \
    generator.run_engines_locally=true \
    generator.weight_sync_backend=nccl \
    generator.async_engine=true \
    generator.batched=false \
    environment.env_class=math \
    generator.n_samples_per_prompt=8 \
    generator.gpu_memory_utilization=0.8

# Logging, run identity and checkpoint location.
set -- "$@" \
    "trainer.logger=${LOGGER}" \
    trainer.project_name="advisor-models" \
    "trainer.run_name=${run_name}" \
    trainer.resume_mode=null \
    "trainer.ckpt_path=${HOME}/ckpts/${run_name}_ckpt"

# Reward handling and HF export.
set -- "$@" \
    generator.zero_reward_on_non_stop=true \
    trainer.hf_save_interval=10 \
    "trainer.export_path=${HOME}/exports/${run_name}"

# Last command in the script, so its exit status becomes the script's.
"${python_bin}" -m advisor_models.math.main_math "$@"
