# @package _global_

# Advantage-estimation settings.
# NOTE(review): key names look like an RL trainer schema supplied by a base
# config that this file overrides; confirm against that base config.
algorithm:
  # KL is not folded into the reward here; this file turns on a KL loss term
  # for the actor instead (actor_rollout_ref.actor.use_kl_loss).
  use_kl_in_reward: false
  adv_estimator: grpo
# Settings for the three model roles configured together: the actor being
# optimised, the rollout (generation) engine, and the reference model.
actor_rollout_ref:
  # Model-level switches.
  model:
    enable_gradient_checkpointing: true
    use_remove_padding: true

  # Actor: optimiser, batch sizing, loss terms and FSDP offloading.
  actor:
    optim:
      lr: 1.0e-6

    # Batch sizing.
    # NOTE(review): presumably the mini-batch size is a global count while the
    # micro-batch size is per GPU (per its name); confirm with the trainer.
    ppo_mini_batch_size: 256
    ppo_micro_batch_size_per_gpu: 40

    # KL regularisation is applied as a loss term here
    # (algorithm.use_kl_in_reward is false in this file).
    use_kl_loss: true
    kl_loss_type: low_var_kl
    kl_loss_coef: 0.001

    # Coefficient is zero, so presumably no entropy term contributes.
    entropy_coeff: 0

    # Neither parameters nor optimizer state are offloaded for the actor.
    fsdp_config:
      param_offload: false
      optimizer_offload: false

  # Rollout: generation backend and its resource settings.
  rollout:
    name: vllm
    tensor_model_parallel_size: 2
    gpu_memory_utilization: 0.6
    # Presumably the number of sampled responses per prompt; confirm.
    n: 5
    log_prob_micro_batch_size_per_gpu: 40

  # Reference model: parameters are offloaded, unlike the actor's.
  ref:
    log_prob_micro_batch_size_per_gpu: 40
    fsdp_config:
      param_offload: true
# Run-level trainer settings, grouped as hardware / schedule / logging.
trainer:
  # Hardware layout: one node, four GPUs on it.
  nnodes: 1
  n_gpus_per_node: 4

  # Schedule.
  # NOTE(review): the units of the *_freq values (steps vs. epochs) are not
  # visible in this file; confirm with the trainer implementation.
  total_epochs: 1000
  test_freq: 5
  vis_freq: 1

  # Logging and run identity.
  logger: [console, wandb]
  project_name: sqlm
  # Left null here; presumably supplied by an override at launch; confirm.
  experiment_name: null