training:
  # --- model ------------------------------------------------------
  # Model being trained; the commented-out entries are alternative checkpoints.
  training_model_name: "Qwen/Qwen2.5-VL-7B-Instruct"
  # training_model_name: "OpenGVLab/InternVL3-2B-hf"
  # training_model_name: "/scratch/<ANONYMIZED>/model_conversions/InternVL3-2B-Instruct-converted-hf"
  # training_model_name: "/scratch/<ANONYMIZED>/hf_cache/InternVL3-8B-Instruct-hf"
  # Disabled alternative reasoner checkpoint:
  # reasoner_model_name: "JamAndTeaStudios/DeepSeek-R1-Distill-Qwen-32B-FP8-Dynamic"

  # --- method -----------------------------------------------------
  training_method: grpo
  scaffold_type: three_stage
  # --- data -------------------------------------------------------
  # Use pre-converted HuggingFace datasets (much faster!)
  # NOTE(review): eval_dataset_path is identical to dataset_path, so evaluation
  # reads the same location as training — confirm that a held-out split is
  # selected elsewhere.
  dataset_path: /scratch/<ANONYMIZED>/trajectory_generation/trajectories_hf_fixed
  eval_dataset_path: /scratch/<ANONYMIZED>/trajectory_generation/trajectories_hf_fixed
  dataset_type: hf             # Pre-converted HuggingFace format
  max_sequence_length: 8192    # Support longer multimodal sequences

  # --- optimisation ----------------------------------------------
  # Keep the decimal point and the signed exponent: some YAML 1.1 loaders read
  # shorter forms such as 1e-6 as a string instead of a float.
  learning_rate: 1.0e-6
  # batch_size x gradient_accumulation_steps = 4 x 8 = 32.
  # NOTE(review): presumably sequences per optimizer step per device — confirm
  # how the trainer interprets batch_size.
  batch_size: 4
  gradient_accumulation_steps: 8
  max_epochs: 1.0
  warmup_ratio: 0.03
  weight_decay: 0.01
  gradient_checkpointing: true
  # torch_compile: false            # Disable torch compile to avoid dynamo issues with InternVL3

  # LoRA adapter settings (lora_alpha / lora_r = 256 / 128 = 2).
  use_lora: true
  lora_r: 128
  lora_alpha: 256
  lora_dropout: 0.05

  # --- logging ----------------------------------------------------
  # output_dir and run_name share one suffix that spells out the
  # hyperparameters of this run (lr 1.0e-6, r 128, alpha 256, beta 0.001,
  # 8 generations, batch size 4, grad accumulation 8); update both when any of
  # those values change.
  # NOTE(review): the suffix ends in "curriculum" while use_curriculum_learning
  # is false in this file — confirm the name is still accurate.
  output_dir: /scratch/<ANONYMIZED>/qwen2.5_vl_7b_grpo_three_stage_hf_1.0e-6_lr_128r_256a_beta0.001_8gen_4bs_8ga_curriculum/
  logging_steps: 1
  save_steps: 100
  # NOTE(review): eval_steps of 1 presumably triggers evaluation at every
  # step — confirm the cost is intended.
  eval_steps: 1
  run_name: qwen2.5_vl_7b_grpo_three_stage_hf_1.0e-6_lr_128r_256a_beta0.001_8gen_4bs_8ga_curriculum
  report_to: wandb

  # --- GRPO specific ---------------------------------------------
  beta: 0.001                 # small KL penalty
  grpo_num_iterations: 6
  epsilon: 0.2
  loss_type: bnpo
  scale_rewards: true

  # Decouple generation-time batch from optimizer accumulation to avoid OOM
  grpo_steps_per_generation: 8  # Cap generation batch size while keeping gradient_accumulation_steps=8

  # Curriculum learning is switched off here.
  # NOTE(review): the curriculum_* keys in this file are presumably ignored
  # while this flag is false — confirm with the trainer.
  use_curriculum_learning: false
  curriculum_num_bins: 15
  curriculum_binning_strategy: equal_width
  curriculum_adaptive_thresholds: true
  # 16 edges delimiting 15 bins (the same count as curriculum_num_bins),
  # evenly spaced from 0.0625 to 0.9375.
  curriculum_bin_edges:
    - 0.0625
    - 0.12083333333333333
    - 0.17916666666666667
    - 0.2375
    - 0.29583333333333334
    - 0.3541666666666667
    - 0.4125
    - 0.4708333333333333
    - 0.5291666666666667
    - 0.5875
    - 0.6458333333333334
    - 0.7041666666666667
    - 0.7625
    - 0.8208333333333333
    - 0.8791666666666667
    - 0.9375
  # 15 values decreasing in equal steps from 0.98 to 0.85.
  # NOTE(review): presumably one threshold per curriculum bin, in bin order —
  # confirm with the trainer.
  curriculum_success_thresholds:
    - 0.98
    - 0.9707142857142856
    - 0.9614285714285714
    - 0.9521428571428572
    - 0.9428571428571428
    - 0.9335714285714285
    - 0.9242857142857143
    - 0.915
    - 0.9057142857142857
    - 0.8964285714285714
    - 0.8871428571428571
    - 0.8778571428571429
    - 0.8685714285714285
    - 0.8592857142857142
    - 0.85
  # 15 values decreasing in equal steps from 0.95 to 0.75.
  # NOTE(review): presumably one threshold per curriculum bin, in bin order —
  # confirm with the trainer.
  curriculum_flat_thresholds:
    - 0.95
    - 0.9357142857142857
    - 0.9214285714285714
    - 0.9071428571428571
    - 0.8928571428571428
    - 0.8785714285714286
    - 0.8642857142857143
    - 0.85
    - 0.8357142857142857
    - 0.8214285714285714
    - 0.8071428571428572
    - 0.7928571428571428
    - 0.7785714285714286
    - 0.7642857142857142
    - 0.75
  # Remaining curriculum tuning values.
  curriculum_stats_window: 150
  curriculum_min_weight: 0.02
  curriculum_decay_factor: 0.2
  # NOTE(review): not a curriculum_* key; presumably a separate DAPO-style
  # filtering switch — confirm whether it depends on use_curriculum_learning.
  enable_dapo_filtering: true

  # Points at the checkpoint-600 directory inside this run's output_dir.
  resume_from_checkpoint: /scratch/<ANONYMIZED>/qwen2.5_vl_7b_grpo_three_stage_hf_1.0e-6_lr_128r_256a_beta0.001_8gen_4bs_8ga_curriculum/checkpoint-600

  # --- Static filtering configuration ---
  static_filtering_enabled: false  # Set to true to enable static filtering based on difficulty scores (currently off)
  difficulty_0_leakage_percent: 0.0  # Fraction (0.0-1.0) of difficulty 0 samples to leak back
  difficulty_1_leakage_percent: 0.0  # Fraction (0.0-1.0) of difficulty 1 samples to leak back

  # GRPO generation parameters
  # num_generations matches the "8gen" tag in run_name / output_dir.
  num_generations: 8
  max_completion_length: 800

  # --- vLLM servers for three-stage (needs both VLM and reasoner) -----
  use_vllm: true
  # NOTE(review): the stated reason is InternVL3, but the active training model
  # is Qwen2.5-VL — confirm this setting is still required.
  use_transformers_paged: false   # Disable paged attention - not compatible with InternVL3
  vllm_mode: server
  # NOTE(review): this host is on 10.0.17.x while vlm_api_base and
  # reasoner_api_base use 10.0.1.x — confirm it is not a typo.
  vllm_server_host: 10.0.17.13     # Will be overridden by CLI
  # Alternative port / colocated setup, currently disabled:
  # vllm_server_port: 8000          # Captioner server port
  # vllm_mode: colocate
  # vllm_server_host: 0.0.0.0     # Will be overridden by CLI
  vllm_server_port: 8004          # Captioner server port

  # VLM configuration for three-stage (needed for clarifying questions)
  # vlm_model_name is the same model as training_model_name.
  vlm_model_type: "openai"
  vlm_api_base: "http://10.0.1.13:8002/v1"
  vlm_model_name: "Qwen/Qwen2.5-VL-7B-Instruct"
  vlm_api_key: "EMPTY"
  # vlm_timeout: 900.0
  # NOTE(review): top_p 0.001 together with top_k 1 presumably makes decoding
  # effectively greedy despite temperature 1.0 — confirm this is intended.
  vlm_temperature: 1.0
  vlm_top_p: 0.001
  vlm_top_k: 1
  vlm_max_tokens: 8192

  # Reasoner configuration (endpoint and model identity).
  # The generation parameters (reasoner_max_tokens, reasoner_temperature,
  # reasoner_top_p, reasoner_top_k) are defined once, in the
  # "Reasoner generation parameters" section further down. Copies of those
  # keys used to be repeated here; duplicate mapping keys are invalid YAML, and
  # loaders that tolerate them keep only the last occurrence, so the copies
  # here had no effect (and reasoner_max_tokens disagreed: 32768 vs 100000).
  reasoner_model_type: "openai"
  reasoner_api_base: "http://10.0.1.14:8001/v1"
  reasoner_model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
  reasoner_api_key: "EMPTY"
  # Same port as in reasoner_api_base.
  reasoner_server_port: 8001
  
  # GPU memory utilization override, currently disabled:
  # vllm_gpu_memory_utilization: 0.98
  vllm_tensor_parallel_size: 2
  
  # --- Three-stage scaffold configuration ---
  # Fixed at 3 for three-stage (Stage 1 + Stage 2 + Optional Stage 2.5)
  scaffold_max_iterations: 3
  # Penalty applied when the reasoner asks clarifying questions
  # (incentivizes self-contained descriptions)
  question_penalty: 0.3

  # --- Reasoner generation parameters (match trajectory generation) -------
  # NOTE(review): 100000 tokens must fit within the context length the reasoner
  # server is launched with — confirm.
  reasoner_max_tokens: 100000    # Long reasoning chains
  reasoner_temperature: 0.6      # Match virl_generation config
  reasoner_top_p: 0.95           # Match virl_generation config
  reasoner_top_k: -1             # Disabled (match virl_generation config)

  # --- Prompt templating ----------------------------------------
  # REQUIRED: three-stage prompt template, so that training and evaluation use
  # the same prompts.
  prompt_template_name: "three_stage_math_v1"

  # --- Parallel reward computation (leverage server concurrency) --------
  # reward_parallel_workers is presumably only used while
  # reward_enable_parallel is true — confirm with the trainer.
  reward_parallel_workers: 32        # Parallel reasoning (math verification done sequentially)
  reward_enable_parallel: true       # Enable parallel reward computation (threading issue fixed)

  # --- Output configuration -------------------------------------
  # debug_data_dir: "/scratch/<ANONYMIZED>/debug_data_three_stage_grpo_training/"

  # --- framework specific ----------------------------------------
  framework_config:
    # NOTE(review): presumably resolved as a dotted Python import path
    # (module path followed by the function name) — confirm with the loader.
    reward_function: "reasoning_frameworks.training.reward_functions.three_stage_math_correctness_reward"
