# Dataset settings, anchored as `en_te_text` so other sections can pull them
# in through a merge key.
en_te_text: &en_te_text
  dataset_name: xnli
  # NOTE(review): presumably makes the loader feed both sentences of a pair as
  # one joint input — confirm against the consuming code.
  joint_input: true

# Model settings, anchored as `model` for reuse through merge keys.
model: &model
  model_type: routing
  # Presumably the number of output classes — confirm against the consumer.
  n_labels: 3

# Optimizer and LR-schedule hyperparameters, anchored as `optim` for reuse
# through merge keys.
optim: &optim
  optim: adam
  scheduler: cosine
  # lr: 5.0e-5
  clip: 1.0
  weight_decay: 0
  adam_b1: 0.9
  adam_b2: 0.999
  # Written with an explicit decimal point: YAML 1.1 loaders such as PyYAML
  # resolve a bare `1e-8` as a string instead of a float.
  adam_eps: 1.0e-8

# Training-loop settings, anchored as `train` for reuse through merge keys.
train: &train
  # batch_size: 32
  num_train_epochs: 5
  seq_len: 512
  gradient_accumulation_steps: 1
  checkpointing_steps: 'epoch'
  # NOTE(review): warmup is given both as a ratio and as an absolute step
  # count; which one the consumer honours is not visible here — confirm.
  warmup_ratio: 0.1
  num_warmup_steps: 1500

# Default profile: `default.train` is the flat union of the dataset, model,
# optimizer and training-loop mappings anchored earlier in this file.
default:
  train:
    # A single merge key with a sequence of aliases; repeating `<<` would be a
    # duplicate mapping key, which strict loaders reject. The merged mappings
    # have no keys in common, so merge precedence does not affect the result.
    <<: [*en_te_text, *model, *optim, *train]