# Config selections that are composed into this file.
# NOTE(review): this has the shape of a Hydra defaults list - confirm against the loader.
defaults:
  - optim: adam
  # Tasks that are part of this benchmark run; presumably each entry names a
  # task config of the same name - verify against the tasks config group.
  - tasks:
      - axb
      - axg
      - cb
      - copa
      - multirc
      - rte
      - wic
      - wsc_fixed
      - boolq
      # Disabled on purpose:
      # - record # this is a bit of a hack without the usual special engineering

# Learning rate set in this file for the `optim` section.
# Written with an explicit decimal point: YAML 1.1 loaders such as plain PyYAML
# only resolve exponent notation as a float when the mantissa contains a ".",
# and would otherwise read the value as a string.
optim:
  lr: 1.0e-5 # superGLUE paper

name: superGLUE
evaluation_set: validation # always keep this at "validation" except for the final run

# Checkpoint name:
# Either "latest" or a reference to a specific checkpoint in a subfolder.
checkpoint: latest
path: ${impl.path} # Path for caches of datasets and tokenizers (interpolated from impl.path)
max_seq_length: 128

# Default options:
# These can be overridden by specific tasks.
batch_size: 16
batch_size_ramp: 0

# Explicit null: no value is configured for gradient clipping in this file.
gradient_clipping: null
limited_decay_keys: [bias, LayerNorm.bias, LayerNorm.weight, norm]
scheduler: cosine-decay
optim_mod:
  name: none # the plain string "none", not a YAML null
eval_in_train_mode: true # Turn on dropout (if existing in the model) during finetuning

epochs: 10 # all superGLUE tasks are short

# These options are only used for scheduling:
# NOTE(review): warmup_steps is fractional - presumably a fraction of the total
# step count rather than an absolute number of steps; confirm with the scheduler.
warmup_steps: 0.1
cooldown_steps: 0
# Explicit null: no step count is fixed in this file.
steps: null

# Options for the `testing` section; only the batch size is set here.
# NOTE(review): presumably used for evaluation passes - confirm with the consumer.
testing:
  batch_size: 128

# Architecture settings changed by this config; only the classification head
# is touched. The commented-out keys are left unset in this file.
arch_modifications:
  classification_head:
    pooler: zero_index
    include_ff_layer: true
    # head_dim: ${arch.hidden_size}
    nonlin: Tanh
    # classifier_dropout: ${arch.hidden_dropout_prob}
