# @package _group_
# Run-wide settings: numeric precision, log output, and TensorBoard directory.
common:
  # Half precision is switched on for this run.
  fp16: true
  log_format: json
  # Presumably the number of updates between log records -- confirm with the
  # consuming trainer.
  log_interval: 200
  use_plasma_view: false
  # Relative path; presumably resolved against the run's working directory
  # -- TODO confirm.
  tensorboard_logdir: tb

# Checkpoint writing and resume settings.
checkpoint:
  no_epoch_checkpoints: true
  # Update-based saving: interval of 2000 updates, with keep_interval_updates
  # set to 2 (presumably the number of such checkpoints retained -- confirm).
  save_interval_updates: 2000
  keep_interval_updates: 2
  # Very large sentinel value; presumably chosen so the epoch-based save
  # interval is never reached -- confirm.
  save_interval: 9999999
  # Checkpoint file name used for restoring.
  restore_file: checkpoint_last.pt
  # All three reset_* flags are off.
  reset_optimizer: false
  reset_dataloader: false
  reset_meters: false

# Task selection plus data location, task filtering, and length limits.
task:
  _name: t5_seq2seq_mixture
  # `???` is the Hydra/OmegaConf marker for a mandatory value with no default:
  # the data location must be supplied when the run is launched.
  data: ???
  # Presumably a JSON file of per-task example caps -- verify against the task
  # implementation.
  data_cap_file: data_cap_100k.json
  # A single task name is listed here; whether a comma-separated list is
  # accepted cannot be told from this file.
  exclude_tasks: code_x_glue_ct_code_to_text_python
  # Truncation is enabled for both the source and the target side.
  truncate_source: true
  truncate_target: true
  # Note: 1024 + 256 = 1280, which equals model.max_positions below
  # (presumably intentional -- confirm before changing either value).
  tokens_per_sample: 1024
  tokens_per_sample_target: 256

# Training loss selection.
criterion:
  # NOTE(review): no smoothing value is set in this file, so the criterion's
  # own default applies -- confirm that default is the intended one.
  _name: label_smoothed_cross_entropy
  report_accuracy: true

# Data loading, batch size limits, and validation settings.
dataset:
  num_workers: 2
  # Two batch limits are set together. 8192 / 1024 = 8, which is where the
  # "min seq_len = 8" remark on batch_size comes from: presumably the sample
  # cap only binds when samples are about 8 tokens long -- confirm.
  max_tokens: 8192
  batch_size: 1024  # min seq_len = 8
  # Validation is switched off, and validate_interval is a very large
  # sentinel value.
  disable_validation: true
  validate_interval: 9999999
  # NOTE(review): two validation splits are still listed although validation
  # is disabled above; presumably unused in this run -- confirm whether they
  # are still loaded.
  valid_subset: valid_code_x_glue_cc_code_refinement_small,valid_code_x_glue_cc_code_refinement_medium

# Optimizer selection and hyper-parameters.
optimizer:
  _name: adam
  weight_decay: 0.01
  # YAML has no tuple syntax, so this value is a plain string; presumably the
  # consumer parses it into a pair of floats -- confirm.
  adam_betas: (0.9,0.98)
  # NOTE(review): `1e-06` has no decimal point. YAML 1.1 loaders (e.g. plain
  # PyYAML) read that form as a string, not a float -- confirm that the loader
  # used for this config resolves it to a float.
  adam_eps: 1e-06

# Learning-rate schedule selection.
lr_scheduler:
  # NOTE(review): neither an end learning rate nor a total update count is set
  # here; presumably both come from the scheduler's defaults -- confirm.
  _name: polynomial_decay
  # 2500 is 10% of optimization.max_update (25000).
  warmup_updates: 2500

# Optimization loop settings: clipping, learning rate, and update budget.
optimization:
  clip_norm: 2.0
  # Single-element list (0.00002 = 2e-5).
  lr: [0.00002]
  max_update: 25000
  # Single-element list. The GPU count is only an assumption recorded in the
  # comment; it is not set anywhere in this file. Presumably update_freq should
  # be revisited if the run uses a different number of GPUs -- confirm.
  update_freq: [2]  # Assuming 64 GPUs

# Model architecture selection and position-related sizes.
model:
  _name: transformer_t5_base_rpe
  # 1280 = task.tokens_per_sample (1024) + task.tokens_per_sample_target (256);
  # presumably intentional -- confirm before changing any of the three values.
  max_positions: 1280
  # "rp" presumably stands for relative position (the model name ends in
  # "_rpe"); the exact meaning of bins / max_dist is defined by the model
  # implementation -- TODO confirm.
  encoder_rp_bins: 128
  encoder_rp_max_dist: 256
