datasets:
  # Single dataset entry named "debug", backed by the DebugDataset class.
  # NOTE(review): field semantics below are inferred from the key names;
  # confirm against the consuming framework's dataset schema.
  - data_name: debug
    class_name: DebugDataset
    # presumably forwarded to DebugDataset as constructor arguments - confirm
    class_args:
      num_examples: 100
    data_sampling_ratio: 1
    # Both token limits are set to the same value (4096).
    max_input_tokens: 4096
    max_output_tokens: 4096

# Model selection: bigcode/starcoder with model class AutoModelForCausalLM.
model_args:
  model_class: AutoModelForCausalLM
  model_name: bigcode/starcoder
  # NOTE(review): these two options presumably need to be compatible with
  # each other (padding-free execution on top of flash attention) - confirm
  # against the consuming framework before changing either one.
  use_padding_free_transformer: true
  attention_implementation: flash_attention_2

# Tuning method for this run: full_finetuning.
# NOTE(review): the set of other accepted values is not visible in this file.
tuning_args:
  tuning_method: full_finetuning

# Checkpoint saving settings.
save_args:
  # presumably measured in training steps, like num_training_steps - confirm
  save_interval: 50
  # Relative path; NOTE(review): likely resolved against the launch
  # directory - confirm.
  save_path: checkpoints

# Run length, batch size and evaluation cadence.
training_parameters:
  # NOTE(review): presumably a per-device batch size - confirm.
  micro_batch_size: 6
  num_training_steps: 100
  # Same value as save_args.save_interval (50).
  eval_interval: 50

# Optimizer selection and its hyperparameters.
optimizer_args:
  class_name: TorchAdamW
  # presumably forwarded to the optimizer class as keyword arguments - confirm
  class_args:
    # Exponent-notation values are written with an explicit mantissa dot
    # (1.0e-5, not 1e-5): YAML 1.1 loaders such as PyYAML only resolve a
    # scalar as a float when it contains a ".", and would otherwise load
    # the value as a string.
    lr: 1.0e-5
    weight_decay: 0.1
    betas:
      - 0.9
      - 0.95
    eps: 1.0e-10

# Learning-rate schedule: cosine decay style.
# NOTE(review): no other scheduler options are set in this file, so any
# remaining settings presumably fall back to the consumer's defaults.
lr_scheduler_args:
  lr_decay_style: cosine

# Mixed-precision dtype for this run: bf16.
# NOTE(review): bf16 presumably requires hardware with bfloat16 support -
# confirm for the target machines.
mixed_precision_args:
  dtype: bf16

# Distributed-training settings.
distributed_args:
  # NOTE(review): "block" presumably means checkpointing at transformer-block
  # granularity - confirm against the consuming framework.
  gradient_checkpointing_method: block
  distributed_backend: deepspeed
