# Hydra defaults list: composes this file with the selected config-group options.
defaults:
  # Position of this file's own keys in the composition order. Because it is
  # listed first, the entries below are merged after it and win on conflicts.
  - _self_
  # Config-group selections. The `11b` options presumably pair with the 11B
  # (flan-t5-xxl) base model -- TODO confirm against the group files.
  - batch_size: 11b
  - device_map: 11b
  - data_formats: normal
  # Select the `disabled` option for Hydra's own logging and for job logging.
  - override hydra/hydra_logging: disabled
  - override hydra/job_logging: disabled

# Run and training settings. How each value is consumed is defined by the
# training code, which is not part of this file.
# model_version and gpu_id are quoted, so they load as strings, not numbers.
model_version: "0.0.2.2"
gpu_id: '0'
grad_accum_steps: 15
# Disabled alternative to save_per_step:
# save_steps: [50000]
save_per_step: 10000 # save the model every this many steps
lr: 0.0005
num_warmup_steps: 10
num_epoch: 10
log_interval: 2
neg_loss_alpha: 1
pos_neg_ratio: 5 # for every 5 positive batches, mix in one negative batch
num_in_context_sample: 4
save_path: '/mnt/data_10t/flan_t5_distill/checkpoints/'
# Options: google/flan-t5-xxl, google/flan-t5-xl, t5-3b, t5-large
# (xxl is the 11B model, xl is the 3B model). TODO: OPT 66B
base_model: 'google/flan-t5-xxl'
# Other listed options: match_generation, match_distribution, contrastive.
# NOTE(review): the active value match_sample is not in that list -- confirm
# which values the training code accepts.
loss_type: match_sample
batch_mix_mode: fully_random # options: same_question, fully_random
gradient_clip_val: 5.
# Disabled option; listed values: uniform, emphasize_transition, emphasize_equation
# generation_importance: "uniform"

# Hydra runtime settings.
hydra:
  # null: do not write the per-run Hydra config subdirectory (default `.hydra`).
  output_subdir: null
  run:
    # Use the current directory as the run output directory.
    dir: .