# Fine-tuning configuration (presumably consumed by a Hugging Face Trainer-based script).
model_name: microsoft/deberta-v2-xlarge
learning_rate: 1e-5
freeze_layer: 15                  # presumably: freeze the first 15 of deberta-v2-xlarge's 24 encoder layers
scheduler: cosine                 # cosine learning-rate decay
gradient_checkpointing: false
gradient_accumulation_steps: 16   # effective batch size per device: 1 * 16 = 16
per_device_train_batch_size: 1
warmup_steps: 600
eval_steps: 200                   # evaluate every 200 optimizer steps
save_steps: 500                   # checkpoint every 500 optimizer steps
max_length: 512                   # maximum tokenized sequence length
num_train_epochs: 2
datasets:                         # dataset identifiers resolved by the training code
  - webgpt
  - hfsummary
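
For context, a minimal sketch of how a Trainer-based script might consume this file. The file name `config.yaml`, the key-to-`TrainingArguments` mapping, and the `freeze_layer` semantics (freezing the embeddings and the first N encoder layers) are assumptions, not taken from the original config.

```python
# Loader sketch; config.yaml, the key mapping, and the freeze_layer
# semantics are assumptions, not the original training script.
import yaml
from transformers import AutoModelForSequenceClassification, TrainingArguments

with open("config.yaml") as f:
    cfg = yaml.safe_load(f)

# Single-logit head, as used for scoring/reward-style objectives.
model = AutoModelForSequenceClassification.from_pretrained(
    cfg["model_name"], num_labels=1
)

# Assumed semantics of freeze_layer: freeze the embeddings and the
# first `freeze_layer` encoder layers of the DeBERTa backbone.
model.deberta.embeddings.requires_grad_(False)
for layer in model.deberta.encoder.layer[: cfg["freeze_layer"]]:
    layer.requires_grad_(False)

args = TrainingArguments(
    output_dir="output",
    # PyYAML (YAML 1.1) reads "1e-5" as a string, so cast explicitly.
    learning_rate=float(cfg["learning_rate"]),
    lr_scheduler_type=cfg["scheduler"],
    gradient_checkpointing=cfg["gradient_checkpointing"],
    gradient_accumulation_steps=cfg["gradient_accumulation_steps"],
    per_device_train_batch_size=cfg["per_device_train_batch_size"],
    warmup_steps=cfg["warmup_steps"],
    evaluation_strategy="steps",
    eval_steps=cfg["eval_steps"],
    save_steps=cfg["save_steps"],
    num_train_epochs=cfg["num_train_epochs"],
)
# max_length is not a TrainingArguments field; it would be applied at the
# tokenizer (truncation to 512 tokens) during dataset preprocessing.
```

Note the batch-size trade-off encoded here: with a per-device batch of 1 and 16 accumulation steps, the effective batch size is 16, which keeps the roughly 900M-parameter xlarge model within typical single-GPU memory even with gradient checkpointing disabled.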
