# Hydra defaults list: this config is composed from the config-group options
# named below. Each entry has the form `group@target.package: option`, i.e. the
# chosen option from `group` is placed under `target.package` in the final
# config.
defaults:
  - actor@actor_rollout_ref.actor: dp_actor
  - data@data: legacy_data
  - ref@actor_rollout_ref.ref: dp_ref
  - rollout@actor_rollout_ref.rollout: rollout
  - model@actor_rollout_ref.model: hf_model
  - critic@critic: dp_critic
  - reward_model@reward_model: dp_reward_model
  # `_self_` is listed last, so the values written in this file are merged
  # after the defaults above and override them on conflict.
  - _self_

# Algorithm-level switches.
algorithm:
  # Advantage estimator to use.
  adv_estimator: grpo
  # KL-in-reward is disabled here; note that
  # actor_rollout_ref.actor.use_kl_loss is set to true further down.
  use_kl_in_reward: false

# Dataset settings. `${hydra:runtime.cwd}` is Hydra's resolver for the
# directory the job was launched from, so all paths below are relative to the
# launch directory rather than to Hydra's per-run output directory.
data:
  # Two training shards and one validation file, all JSONL.
  train_files:
    - ${hydra:runtime.cwd}/data/op13-50k.jsonl
    - ${hydra:runtime.cwd}/data/op14-50k.jsonl
  val_files:
    - ${hydra:runtime.cwd}/data/test-200.jsonl
  train_batch_size: 1024
  # NOTE(review): lengths are presumably counted in tokens; confirm.
  max_prompt_length: 1024
  max_response_length: 1024
  # NOTE(review): looks like a recipe-specific key, presumably read by the
  # custom dataset class configured below; confirm against dataset.py.
  id_max_op: 14
  filter_overlong_prompts: true
  truncation: error
  # Dataset class loaded from a file in the recipe directory.
  custom_cls:
    path: ${hydra:runtime.cwd}/recipe/physics_rl/dataset.py
    name: CustomRLHFDataset

# Reward function, referenced by file path and name. The path is the same
# dataset.py file that data.custom_cls.path points at.
custom_reward_function:
  path: ${hydra:runtime.cwd}/recipe/physics_rl/dataset.py
  name: compute_score

# Overrides for the `model`, `actor`, `rollout` and `ref` sub-configs that the
# defaults list places under `actor_rollout_ref`.
actor_rollout_ref:
  model:
    # Starting checkpoint, resolved relative to the launch directory.
    path: ${hydra:runtime.cwd}/ckpt/checkpoint-20293
    enable_gradient_checkpointing: true
    use_remove_padding: true
  actor:
    optim:
      # Written with an explicit mantissa dot on purpose: YAML 1.1 loaders
      # (e.g. plain PyYAML) read a bare `1e-6` as the string '1e-6', whereas
      # `1.0e-6` is a float under every loader. The numeric value is unchanged.
      lr: 1.0e-6
    ppo_mini_batch_size: 256
    ppo_micro_batch_size_per_gpu: 16
    # KL is applied as a loss term here (algorithm.use_kl_in_reward is false).
    use_kl_loss: true
    kl_loss_coef: 0.001
    kl_loss_type: low_var_kl
    entropy_coeff: 0.0
    fsdp_config:
      param_offload: false
      optimizer_offload: false
  rollout:
    name: vllm
    gpu_memory_utilization: 0.6
    # NOTE(review): presumably the number of sampled responses per prompt;
    # trainer.experiment_name embeds this value as "rollout6".
    n: 6
    tensor_model_parallel_size: 1
    log_prob_micro_batch_size_per_gpu: 16
  ref:
    log_prob_micro_batch_size_per_gpu: 16
    fsdp_config:
      # Unlike the actor (param_offload: false), `ref` has offload enabled.
      param_offload: true

# Trainer / run-level settings.
trainer:
  critic_warmup: 0
  # Logging backends.
  logger:
    - console
    - wandb
  project_name: pr-difficulty-rl
  # The name embeds several settings that mirror values in this file
  # ("ckpt20293" ~ model.path, "rollout6" ~ rollout.n, "bs1024" ~
  # data.train_batch_size, "ep2" ~ total_epochs); keep it in sync when
  # changing any of them.
  experiment_name: diff2_14-tok5B-lr2e4-bs250k-schedcos-minlr3e-5-250910-ckpt20293_rl-diff13_14-rollout6-bs1024-ep2
  # Single node with 6 GPUs.
  n_gpus_per_node: 6
  nnodes: 1
  # NOTE(review): the units of the two frequencies are not visible here
  # (presumably training steps); confirm.
  save_freq: 20
  test_freq: 5
  log_val_generations: 100
  total_epochs: 2
