# Base model; presumably a Hugging Face hub ID — TODO confirm against the loader.
model-name: meta-llama/Llama-3.2-3B-Instruct
# Disabled alternative: the same dataset directory name without the `-smooth` suffix.
# train-file: /home/<anonymized>/data/math-hard-distr-llama3.2-3B/train.parquet
# val-file: /home/<anonymized>/data/math-hard-distr-llama3.2-3B/val.parquet
# Active training / validation Parquet files (the `-smooth` dataset directory).
train-file: /home/<anonymized>/data/math-hard-distr-llama3.2-3B-smooth/train.parquet
val-file: /home/<anonymized>/data/math-hard-distr-llama3.2-3B-smooth/val.parquet
# Sample caps for the training and validation sets (semantics inferred from
# the key names — confirm with the data loader).
# Written without `_` digit separators on purpose: `200_000` resolves to an
# int only under YAML 1.1 resolvers (e.g. PyYAML); YAML 1.2 core-schema
# loaders read it as the string "200_000".
max-train-samples: 200000
max-val-samples: 1000
# Output location; `{loss}` looks like a placeholder filled in from the `loss`
# key of this file — TODO confirm where the substitution happens.
output: /beegfs/scratch/user/<anonymized>/huggingface/math_hard_distr_{loss}_llama3.2-3B
generation-length: 1024
# A list rather than a single value — presumably swept over, one run per
# entry; verify against the launcher.
batch-size: [128, 256, 512]
epochs: 1
# Learning rate. Written with an explicit mantissa dot on purpose: PyYAML's
# YAML 1.1 float resolver requires a `.`, so a bare `1e-7` is loaded as the
# string "1e-7" instead of a float. `1.0e-7` is a float under both YAML 1.1
# and YAML 1.2 loaders.
learning-rate: 1.0e-7
scheduler-type: 'constant'
# NOTE(review): a plateau patience is configured alongside a 'constant'
# scheduler; confirm which of the two the trainer actually honours.
reduce-lr-on-plateau-patience: 1
# Micro-batch sizes for training and validation. With a training micro-batch
# of 1, `batch-size` is presumably reached via accumulation — TODO confirm.
micro-batch-size: 1
val-micro-batch-size: 8
# NOTE(review): the unit of this interval (steps vs. samples) is not visible
# in this file.
validation-interval: 16
# Nested list: a list of feature-name lists (grouping semantics are an
# assumption — confirm with the consumer).
track-features: [["reward"]]
metric-accumulation-samples: 2048
# Loss selector; currently the plain scalar `rkl`.
# NOTE(review): the indented lines below are a disabled alternative. They
# cannot be uncommented as-is while `loss` holds the scalar `rkl` (a nested
# mapping under a scalar value is not valid YAML); presumably `rkl` is removed
# so that `loss` becomes a mapping — confirm the expected schema.
loss: rkl
  # alpha:
  #   alpha: [0.75, 0.85, 0.95]