training:
  # --- model ------------------------------------------------------
  # Identifier of the model being trained.
  training_model_name: 'Qwen/Qwen2.5-VL-3B-Instruct'

  # --- method -----------------------------------------------------
  # Training algorithm and the scaffold it is applied to.
  training_method: grpo
  scaffold_type: two_stage

  # --- data -------------------------------------------------------
  # Datasets are read in the pre-converted HuggingFace format (much faster
  # than converting at load time, per the original note).
  dataset_type: hf
  dataset_path: /scratch/<ANONYMIZED>/trajectory_generation/trajectories_hf_fixed
  # NOTE(review): evaluation points at the same path as training - confirm
  # that a separate split is selected downstream.
  eval_dataset_path: /scratch/<ANONYMIZED>/trajectory_generation/trajectories_hf_fixed
  # Sized to support longer multimodal sequences.
  max_sequence_length: 8192

  # --- optimisation ----------------------------------------------
  # Optimizer and schedule settings for the run.
  learning_rate: 2.0e-6
  batch_size: 4
  # batch_size x gradient_accumulation_steps = 4 x 8 = 32; presumably the
  # number of samples per optimizer step on each device - confirm against
  # the trainer.
  gradient_accumulation_steps: 8
  max_epochs: 1.0
  warmup_ratio: 0.03
  weight_decay: 0.01
  gradient_checkpointing: true
  # NOTE(review): the disabled override below cites InternVL3, while this
  # config trains Qwen2.5-VL - it looks carried over from another config.
  # torch_compile: false            # Disable torch compile to avoid dynamo issues with InternVL3

  # LoRA adapter settings; lora_alpha is 2 x lora_r here.
  use_lora: true
  lora_r: 256
  lora_alpha: 512
  lora_dropout: 0.05

  # --- logging ----------------------------------------------------
  # The directory and run names encode hyperparameters. "2e-6_lr", "256r",
  # "512a", "beta0.001" and "8gen" agree with learning_rate, lora_r,
  # lora_alpha, beta and num_generations in this file.
  # NOTE(review): the "8bs_4ga" suffix does NOT agree with batch_size: 4 and
  # gradient_accumulation_steps: 8 - confirm which pair is intended and
  # align the names or the values.
  output_dir: /scratch/<ANONYMIZED>/qwen2.5_vl_3b_grpo_two_stage_hf_2e-6_lr_256r_512a_beta0.001_8gen_8bs_4ga
  logging_steps: 1
  save_steps: 100
  # NOTE(review): presumably evaluation runs every step with this value -
  # confirm that the cost is intended.
  eval_steps: 1
  run_name: qwen2.5_vl_3b_grpo_two_stage_hf_2e-6_lr_256r_512a_beta0.001_8gen_8bs_4ga
  report_to: wandb

  # --- GRPO specific ---------------------------------------------
  # Core hyperparameters of the GRPO objective.
  beta: 0.001                 # small KL penalty
  # presumably the number of policy updates per generated batch - TODO confirm
  grpo_num_iterations: 6
  # presumably the clipping range of the policy ratio - TODO confirm
  epsilon: 0.2
  loss_type: bnpo
  scale_rewards: true
  
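  # Decouple the generation-time batch from optimizer accumulation to avoid OOM.
  # The disabled override below was written for gradient_accumulation_steps=16;
  # this file currently sets gradient_accumulation_steps to 8.
  # grpo_steps_per_generation: 8  # Cap generation batch size independently of gradient_accumulation_steps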

  # --- curriculum learning ---------------------------------------
  # The master switch is off, so the curriculum_* settings below are
  # presumably inactive for this run - confirm how the trainer treats them
  # when curriculum learning is disabled.
  use_curriculum_learning: false
  curriculum_num_bins: 15
  curriculum_binning_strategy: equal_width
  curriculum_adaptive_thresholds: true
  # 16 edges delimiting the 15 bins: evenly spaced from 0.0625 to 0.9375
  # (step 0.875 / 15), consistent with the equal_width strategy.
  curriculum_bin_edges:
    - 0.0625
    - 0.12083333333333333
    - 0.17916666666666667
    - 0.2375
    - 0.29583333333333334
    - 0.3541666666666667
    - 0.4125
    - 0.4708333333333333
    - 0.5291666666666667
    - 0.5875
    - 0.6458333333333334
    - 0.7041666666666667
    - 0.7625
    - 0.8208333333333333
    - 0.8791666666666667
    - 0.9375
  # 15 entries (matches curriculum_num_bins, presumably one per bin),
  # decreasing in equal steps from 0.98 to 0.85.
  curriculum_success_thresholds:
    - 0.98
    - 0.9707142857142856
    - 0.9614285714285714
    - 0.9521428571428572
    - 0.9428571428571428
    - 0.9335714285714285
    - 0.9242857142857143
    - 0.915
    - 0.9057142857142857
    - 0.8964285714285714
    - 0.8871428571428571
    - 0.8778571428571429
    - 0.8685714285714285
    - 0.8592857142857142
    - 0.85
  # 15 entries (matches curriculum_num_bins, presumably one per bin),
  # decreasing in equal steps from 0.95 to 0.75.
  curriculum_flat_thresholds:
    - 0.95
    - 0.9357142857142857
    - 0.9214285714285714
    - 0.9071428571428571
    - 0.8928571428571428
    - 0.8785714285714286
    - 0.8642857142857143
    - 0.85
    - 0.8357142857142857
    - 0.8214285714285714
    - 0.8071428571428572
    - 0.7928571428571428
    - 0.7785714285714286
    - 0.7642857142857142
    - 0.75
  curriculum_stats_window: 150
  curriculum_min_weight: 0.02
  curriculum_decay_factor: 0.2
  # NOTE(review): not a curriculum_* key; presumably independent of
  # use_curriculum_learning - confirm.
  enable_dapo_filtering: true

  # # --- Static filtering configuration ---
  # static_filtering_enabled: false  # Enable static filtering based on difficulty scores
  # difficulty_0_leakage_percent: 0.0  # Percentage (0.0-1.0) of difficulty 0 samples to leak back
  # difficulty_1_leakage_percent: 0.0  # Percentage (0.0-1.0) of difficulty 1 samples to leak back

  # --- GRPO generation -------------------------------------------
  # Upper bound on the length of each sampled completion.
  max_completion_length: 800
  # Number of generations sampled (the "8gen" in run_name).
  num_generations: 8

  # --- vLLM generation backend ------------------------------------
  # Generation goes through vLLM, configured in "colocate" mode.
  use_vllm: true
  vllm_mode: colocate
  # Server endpoint. The host is expected to be overridden from the CLI;
  # port 8000 was labelled the captioner server port.
  vllm_server_host: '0.0.0.0'
  vllm_server_port: 8000
  # Paged attention in transformers stays disabled. The original reason
  # given was an incompatibility with InternVL3.
  use_transformers_paged: false

  # # VLM configuration for three-stage (needed for clarifying questions)
  # vlm_model_type: "openai"
  # vlm_api_base: "http://10.0.1.9:8000/v1"
  # vlm_model_name: "OpenGVLab/InternVL3-8B-Instruct"
  # vlm_api_key: "EMPTY"
  # # vlm_timeout: 900.0
  # vlm_temperature: 1.0
  # vlm_top_p: 0.001
  # vlm_top_k: 1
  # vlm_max_tokens: 8192

  # --- Reasoner configuration ------------------------------------
  # Endpoint of the reasoner model, addressed with the "openai" client type.
  # The port inside reasoner_api_base and reasoner_server_port are both 8001;
  # keep them in sync when changing either.
  reasoner_model_type: "openai"
  reasoner_api_base: "http://10.0.1.14:8001/v1"
  reasoner_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
  reasoner_api_key: "EMPTY"
  reasoner_server_port: 8001

  # --- Reasoner generation parameters (match trajectory generation) -------
  # Each key is defined exactly once. These keys used to appear twice in this
  # mapping; duplicate keys are invalid YAML and only the later definitions
  # took effect under last-wins loaders. The later values are kept here, so
  # reasoner_max_tokens stays 100000 (the shadowed 32768 was never applied).
  reasoner_max_tokens: 100000    # Long reasoning chains
  reasoner_temperature: 0.6      # Match virl_generation config
  reasoner_top_p: 0.95           # Match virl_generation config
  reasoner_top_k: -1             # Disabled (match virl_generation config)

  # --- vLLM engine resources --------------------------------------
  vllm_gpu_memory_utilization: 0.8
  vllm_tensor_parallel_size: 1

  # --- Prompt templating ----------------------------------------
  # REQUIRED: prompt template for consistent training/evaluation. This is the
  # two-stage template, in line with scaffold_type: two_stage.
  prompt_template_name: "two_stage_math_v1"

  # --- Parallel reward computation (leverage server concurrency) --------
  # Parallel reward computation is switched on (the earlier threading issue
  # is reported as fixed).
  reward_enable_parallel: true
  # Worker count for the parallel reasoning calls; math verification is
  # done sequentially.
  reward_parallel_workers: 32

  # --- Output configuration -------------------------------------
  # debug_data_dir: "/scratch/<ANONYMIZED>/debug_data_three_stage_grpo_training/"

  # --- framework specific ----------------------------------------
  framework_config:
    # Dotted path of the reward function. The two-stage reward function uses
    # only the reasoner server parameters.
    reward_function: 'reasoning_frameworks.training.reward_functions.two_stage_math_correctness_reward'
