# Hydra-style defaults list: the configs named below are composed together
# with this file.
# NOTE(review): with `_self_` listed last, keys defined in this file should
# take precedence over the same keys coming from the listed configs —
# confirm against the Hydra version in use.
defaults:
  - reil_trainer
  - envs
  - evaluation
  - _self_

# Dataset locations, column keys, and batching options.
data:
  type: standard
  train_batch_size: 256
  # Slated for deprecation; use micro_batch_size_per_gpu instead.
  micro_batch_size: null
  # Also used as the validation batch size.
  micro_batch_size_per_gpu: 4
  # Quoted so the leading `~` is unmistakably part of a string path.
  train_files: '~/data/gsm8k/train.parquet'
  val_files: '~/data/gsm8k/test.parquet'
  prompt_key: question
  response_key: answer
  max_length: 1024
  truncation: error
  balance_dp_token: false
  chat_template: false
  custom_cls:
    path: null
    name: null
  # Settings for the reasoning_gym data source; train and val are
  # configured separately (same seed, different dataset_size).
  reasoning_gym:
    max_length: 2048
    developer_prompt: DeepSeekZero
    train:
      seed: 42
      dataset_size: 5000
    val:
      seed: 42
      dataset_size: 1000

# Model checkpoint, FSDP, and LoRA settings.
model:
  # Quoted so the leading `~` is unmistakably part of a string path.
  partial_pretrain: '~/models/gemma-1.1-7b-it'
  fsdp_config:
    wrap_policy:
      min_num_params: 0
    cpu_offload: false
    offload_params: false
  external_lib: null
  enable_gradient_checkpointing: false
  trust_remote_code: true
  # LoRA rank; set to a positive value (e.g. 32) to enable LoRA.
  lora_rank: 0
  # LoRA scaling factor.
  lora_alpha: 16
  # Target modules for LoRA adaptation.
  target_modules: all-linear
  use_liger: false

# Optimizer hyperparameters.
optim:
  # Written with an explicit mantissa dot: YAML 1.1 loaders such as PyYAML
  # resolve a bare `1e-5` as the string "1e-5" instead of a float.
  lr: 1.0e-5
  betas: [0.9, 0.95]
  weight_decay: 0.01
  warmup_steps_ratio: 0.1
  clip_grad: 1.0
# Top-level switches (siblings of `optim`, not nested inside it).
ulysses_sequence_parallel_size: 1
use_remove_padding: false

# Run bookkeeping (output locations, naming, logging, schedule) plus the
# SFT-variant and regularization options.
trainer:
  default_local_dir: /tmp/sft_model
  # Change the HDFS path here.
  default_hdfs_dir: hdfs://tmp/experiments/gsm8k/gemma-1.1-7b-it/
  resume_path: null
  project_name: gsm8k-sft
  experiment_name: test
  total_epochs: 4
  total_training_steps: null
  logger: ['console', 'wandb']
  seed: 1
  policy_eval: true
  sft_type: standard
  aft_power: 1.0
  kl_regularization:
    enabled: false
    kl_coef: 0.05

  anchor_regularization:
    enabled: false
    l2_anchor_coeff: 0.0
    include_bias: false
    include_layernorm: false
    # This is the plain string `none`, not a YAML null.
    # NOTE(review): the previous comment read "or none", which only repeated
    # the value; the other accepted settings still need to be listed here.
    normalize: none
    # gpu_mirror or cpu_mirror (not used if disabled).
    mode: gpu_mirror
    # Compute every step by default.
    every_n_steps: 1

