#!/usr/bin/env bash
set -euo pipefail

# =============================================================================
# DAPO AIME2024 Baseline Training Script
# =============================================================================

# -----------------------------------------------------------------------------
# Experiment identity
# -----------------------------------------------------------------------------
# exp_name embeds the launch timestamp, so each invocation of this script
# produces a distinct experiment name.
project_name="AIME2024-Qwen3-8B"
TIMESTAMP="$(date '+%Y%m%d_%H%M%S')"
exp_name="aime2024_dapo_${TIMESTAMP}"

# -----------------------------------------------------------------------------
# Environment
# -----------------------------------------------------------------------------
# Enable the IB network.
# NOTE(review): no IB-related variables are actually exported in this script;
# confirm whether the corresponding settings were removed on purpose.

export HYDRA_FULL_ERROR=1
# Honour a GPU list supplied by the caller; otherwise expose devices 0-7.
export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}"

# -----------------------------------------------------------------------------
# Input data, base model and checkpoint locations
# -----------------------------------------------------------------------------
TRAIN_FILE="/primus_datasets/primus_data/clpo_SKYRTP/DAPO-Math-17k/data/dapo-math-17k.parquet"
VAL_FILE="/primus_datasets/primus_data/aime_2B4pCq/train-00000-of-00001-fixed.parquet"
MODEL_PATH="/primus_datasets/primus_data/Qwen3_rNrLUi/Qwen3-8B"
CKPTS_DIR="/primus_oss/_checkpoint/0910-Qwen3-8B-AIME2024"

# -----------------------------------------------------------------------------
# data.* settings
# -----------------------------------------------------------------------------
max_prompt_length=2048
max_response_length=8192
gen_batch_size=192
train_batch_size=64
val_batch_size=32
truncation="error"
filter_overlong_prompts=true
dataloader_num_workers=4

# -----------------------------------------------------------------------------
# algorithm.* settings
# -----------------------------------------------------------------------------
adv_estimator=grpo
use_kl_in_reward=false
filter_groups_enable=True
filter_groups_metric=acc
max_num_gen_batches=10

# -----------------------------------------------------------------------------
# actor_rollout_ref.model.* settings
# -----------------------------------------------------------------------------
enable_gradient_checkpointing=true
use_remove_padding=true

# -----------------------------------------------------------------------------
# actor_rollout_ref.actor.* settings
# (param_offload is also forwarded to actor_rollout_ref.ref.fsdp_config)
# -----------------------------------------------------------------------------
actor_lr=1e-6
ppo_mini_batch_size=8
ppo_micro_batch_size_per_gpu=1
use_kl_loss=false
kl_loss_coef=0.001
clip_ratio_low=0.2
clip_ratio_high=0.28
clip_ratio_c=10.0
entropy_coeff=0
param_offload=false
optimizer_offload=false

# -----------------------------------------------------------------------------
# actor_rollout_ref.rollout.* settings
# (log_prob_micro_batch_size_per_gpu is shared by the rollout and ref workers)
# -----------------------------------------------------------------------------
rollout_name=vllm
n_resp_per_prompt=4
tensor_model_parallel_size=1
gpu_memory_utilization=0.5
log_prob_micro_batch_size_per_gpu=1
# 10240 equals max_prompt_length + max_response_length (2048 + 8192).
max_model_len=10240
max_num_batched_tokens=10240

# -----------------------------------------------------------------------------
# DAPO overlong-buffer settings (reward_model.reward_kwargs.overlong_buffer_cfg)
# -----------------------------------------------------------------------------
enable_overlong_buffer=True
overlong_buffer_len=$((4 * 1024))
overlong_penalty_factor=1.0

# -----------------------------------------------------------------------------
# trainer.* settings
# -----------------------------------------------------------------------------
total_epochs=1
critic_warmup=0
test_freq=10
save_freq=50
val_before_train=true
# Cluster shape may be overridden from the environment; defaults are
# 8 GPUs per node on a single node.
: "${NGPUS_PER_NODE:=8}"
: "${NNODES:=1}"

# Print a one-line-per-setting summary of the run (project, experiment name,
# data files, model and checkpoint directory) to stdout before launching.
printf '%s\n' \
    "==== DAPO AIME2024 BASELINE TRAINING CONFIGURATION ====" \
    "Project: ${project_name}" \
    "Experiment: ${exp_name}" \
    "Train Data: ${TRAIN_FILE}" \
    "Val Data: ${VAL_FILE}" \
    "Model Path: ${MODEL_PATH}" \
    "Output Dir: ${CKPTS_DIR}"


# Use the DAPO-specific configuration.
# Launch training through the recipe.dapo.main_dapo entry point.
#
# Every setting defined earlier in this script is forwarded as a `key=value`
# override on the command line. Keys prefixed with `+` use Hydra's "add key"
# syntax (presumably because they are absent from the base config -- confirm
# against the recipe's config files). Any extra arguments given to this script
# are appended last via "$@".
#
# The script runs under `set -e`, so a bare command that fails would terminate
# the script on the spot and the result report below would never run for a
# failed job. Capturing the status with `|| TRAINING_EXIT_CODE=$?` keeps the
# script alive so the failure can be reported and the real exit code returned.
TRAINING_EXIT_CODE=0
python3 -m recipe.dapo.main_dapo \
    algorithm.adv_estimator="${adv_estimator}" \
    algorithm.use_kl_in_reward="${use_kl_in_reward}" \
    algorithm.filter_groups.enable="${filter_groups_enable}" \
    algorithm.filter_groups.metric="${filter_groups_metric}" \
    algorithm.filter_groups.max_num_gen_batches="${max_num_gen_batches}" \
    data.train_files="${TRAIN_FILE}" \
    data.val_files="${VAL_FILE}" \
    data.gen_batch_size="${gen_batch_size}" \
    data.train_batch_size="${train_batch_size}" \
    data.val_batch_size="${val_batch_size}" \
    data.max_prompt_length="${max_prompt_length}" \
    data.max_response_length="${max_response_length}" \
    data.filter_overlong_prompts="${filter_overlong_prompts}" \
    data.truncation="${truncation}" \
    data.dataloader_num_workers="${dataloader_num_workers}" \
    actor_rollout_ref.model.path="${MODEL_PATH}" \
    actor_rollout_ref.model.enable_gradient_checkpointing="${enable_gradient_checkpointing}" \
    actor_rollout_ref.model.use_remove_padding="${use_remove_padding}" \
    actor_rollout_ref.actor.optim.lr="${actor_lr}" \
    actor_rollout_ref.actor.ppo_mini_batch_size="${ppo_mini_batch_size}" \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu="${ppo_micro_batch_size_per_gpu}" \
    actor_rollout_ref.actor.use_kl_loss="${use_kl_loss}" \
    actor_rollout_ref.actor.kl_loss_coef="${kl_loss_coef}" \
    actor_rollout_ref.actor.clip_ratio_low="${clip_ratio_low}" \
    actor_rollout_ref.actor.clip_ratio_high="${clip_ratio_high}" \
    actor_rollout_ref.actor.clip_ratio_c="${clip_ratio_c}" \
    actor_rollout_ref.actor.entropy_coeff="${entropy_coeff}" \
    actor_rollout_ref.actor.fsdp_config.param_offload="${param_offload}" \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload="${optimizer_offload}" \
    actor_rollout_ref.rollout.name="${rollout_name}" \
    actor_rollout_ref.rollout.n="${n_resp_per_prompt}" \
    actor_rollout_ref.rollout.tensor_model_parallel_size="${tensor_model_parallel_size}" \
    actor_rollout_ref.rollout.gpu_memory_utilization="${gpu_memory_utilization}" \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu="${log_prob_micro_batch_size_per_gpu}" \
    actor_rollout_ref.rollout.max_model_len="${max_model_len}" \
    actor_rollout_ref.rollout.max_num_batched_tokens="${max_num_batched_tokens}" \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu="${log_prob_micro_batch_size_per_gpu}" \
    actor_rollout_ref.ref.fsdp_config.param_offload="${param_offload}" \
    reward_model.reward_manager=dapo \
    +reward_model.reward_kwargs.overlong_buffer_cfg.enable="${enable_overlong_buffer}" \
    +reward_model.reward_kwargs.overlong_buffer_cfg.len="${overlong_buffer_len}" \
    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor="${overlong_penalty_factor}" \
    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
    +reward_model.reward_kwargs.max_resp_len="${max_response_length}" \
    trainer.logger='["console", "swanlab"]' \
    trainer.project_name="${project_name}" \
    trainer.experiment_name="${exp_name}" \
    trainer.total_epochs="${total_epochs}" \
    trainer.critic_warmup="${critic_warmup}" \
    trainer.test_freq="${test_freq}" \
    trainer.save_freq="${save_freq}" \
    trainer.val_before_train="${val_before_train}" \
    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
    trainer.nnodes="${NNODES}" \
    trainer.default_local_dir="${CKPTS_DIR}" \
    "$@" || TRAINING_EXIT_CODE=$?

# Report the outcome of the training command recorded in TRAINING_EXIT_CODE,
# then terminate the script with that same status.
printf '\n%s\n' "🏁 ===== TRAINING COMPLETED ====="

case "${TRAINING_EXIT_CODE}" in
    0)
        printf '%s\n' \
            "✅ DAPO Baseline training completed successfully!" \
            "📁 Checkpoints saved to: ${CKPTS_DIR}" \
            "📊 Experiment name: ${exp_name}"

        # Listing the checkpoint directory is purely informational, so a
        # failing `ls` must not turn a successful run into a failure.
        if [[ -d "${CKPTS_DIR}" ]]; then
            printf '\n%s\n' "📋 Output directory contents:"
            ls -la "${CKPTS_DIR}" || true
        fi

        printf '\n%s\n' "🎉 DAPO baseline training finished successfully!"
        ;;
    *)
        printf '%s\n' \
            "❌ Training failed with exit code: ${TRAINING_EXIT_CODE}" \
            "💡 Please check the logs above for error details" \
            "🔧 Common issues:" \
            "   - Check if data files exist and are readable" \
            "   - Verify GPU memory is sufficient" \
            "   - Ensure all dependencies are installed" \
            "   - Check CUDA_VISIBLE_DEVICES setting"
        ;;
esac

exit "${TRAINING_EXIT_CODE}"
