# Note that some of the fields in this template haven't been filled in yet.
# Please resolve any `null` fields before launching!

# Run the per-task GLUE jobs in parallel (`parallel: true`) or serially
# (`parallel: false`). Parallel mode takes advantage of multiple GPUs.
parallel: true
# max_seq_len: 256
# Basic run configuration. Additional details are appended to this name for
# each GLUE task and each random seed.
base_run_name: hydra_mod-large-uncased-glue-finetuning-lr0.0002-mlm0.3 # Determines how runs are saved and logged in W&B
# presumably the seed used for tasks that do not list their own `seeds` -- confirm
default_seed: 19
precision: amp_bf16 # changed (yt) from plain `bf16`, which does not work



# Tokenizer for dataset creation. The `model` block interpolates this value
# into both `pretrained_model_name` and `tokenizer_name`.
tokenizer_name: bert-large-uncased

# Model
model:
  name: hydra
  pretrained_model_name: ${tokenizer_name}
  tokenizer_name: ${tokenizer_name}
  # This implementation generally uses the default architecture values from
  # the Hugging Face BertConfig object. The values below override them and
  # should only be changed when pretraining from scratch
  # (`use_pretrained: false`); otherwise the model will not be loaded properly.
  model_config:
    num_attention_heads: 16 # bert-large default
    # Not the bert-large default (24). Presumably the depth of the pretrained
    # hydra checkpoint -- confirm it matches the checkpoint before launching.
    num_hidden_layers: 45
    hidden_size: 1024 # bert-large default
    # NOTE(review): Hugging Face BertConfig spells this key
    # `max_position_embeddings` (plural). Confirm which spelling the hydra
    # model code actually reads before renaming it.
    max_position_embedding: 256
    use_position_embeddings: false

# Loading
# `starting_checkpoint_load_path` points at a file inside
# `local_pretrain_checkpoint_folder`; keep the two in sync when moving checkpoints.
starting_checkpoint_load_path: ./local-bert-checkpoints/hydra-large-uncased-lr0.0002-mlm0.3/latest-rank0.pt      # Composer checkpoint from the end of pre-training
# presumably the local folder holding pre-training checkpoints -- confirm how the launcher uses it
local_pretrain_checkpoint_folder: ./local-bert-checkpoints/

# Saving
# The prefix carries no trailing slash: `save_finetune_checkpoint_folder`
# supplies the `/` separator itself, so a trailing slash here would resolve to
# a path containing `//`.
save_finetune_checkpoint_prefix: ./local-finetune-checkpoints # (local)
# save_finetune_checkpoint_prefix: s3://<bucket>/remote-finetune-checkpoints # (remote)
save_finetune_checkpoint_folder: ${save_finetune_checkpoint_prefix}/${base_run_name}

# (Optional) W&B logging
loggers:
  wandb:
    project: nova_finetune      # W&B project name; replace with your own project
    # `len128` presumably refers to the max_sequence_length of 128 used by every task
    name: hydra-large-uncased-len128-lr0.0002-mlm0.3

# Callbacks
# Each key names one callback; `{}` is an empty mapping, i.e. no options are
# given for that callback here.
callbacks:
  lr_monitor: {}
  speed_monitor: {}

# Scheduler
# Learning-rate schedule, selected by `name`.
scheduler:
  name: linear_decay_with_warmup
  # presumably a Composer time string: warm up over 6% of the run duration -- confirm
  t_warmup: 0.06dur
  # presumably the final LR multiplier (0.0 = decay to zero) -- confirm
  alpha_f: 0.0

# Task configuration
# One entry per GLUE task. Tasks that list no `seeds` (mnli, qqp, qnli)
# presumably run with `default_seed` -- confirm against the launcher.
# Lines of the form `# lr: ...` are commented-out alternative learning rates.
tasks:
  mnli:
    # Extra task-specific arguments for the trainer go under `trainer_kwargs`
    trainer_kwargs:
      # One MNLI checkpoint is kept locally so that fine-tuning of RTE, MRPC
      # and STS-B can start from the MNLI checkpoint
      save_num_checkpoints_to_keep: 1
      max_sequence_length: 128
      # lr: 5.0e-5
      lr: 1.0e-5
      wd: 5.0e-6
  rte:
    seeds: [19, 8364, 717, 10536, 90166]
    trainer_kwargs:
      save_num_checkpoints_to_keep: 0
      max_sequence_length: 128
      # lr: 5.0e-5
      lr: 1.0e-5
      wd: 1.0e-6
      max_duration: 2ep
      optim_name: adamw
  qqp:
    trainer_kwargs:
      save_num_checkpoints_to_keep: 0
      max_sequence_length: 128
      # lr: 3.0e-5
      lr: 1.0e-5
      wd: 3.0e-6
  qnli:
    trainer_kwargs:
      save_num_checkpoints_to_keep: 0
      max_sequence_length: 128
      # lr: 5.0e-5
      lr: 1.0e-5
      wd: 1.0e-6
      # pool_all: True
  sst2:
    seeds: [19, 8364, 717]
    trainer_kwargs:
      save_num_checkpoints_to_keep: 0
      max_sequence_length: 128
      # lr: 3.0e-5
      lr: 1.0e-5
      wd: 3.0e-6
  stsb:
    seeds: [19, 8364, 717, 10536, 90166]
    trainer_kwargs:
      save_num_checkpoints_to_keep: 0
      max_sequence_length: 128
      # lr: 8.0e-5
      lr: 1.0e-5
      wd: 3.0e-6
      max_duration: 8ep
  mrpc:
    seeds: [19, 8364, 717, 10536, 90166]
    trainer_kwargs:
      save_num_checkpoints_to_keep: 0
      max_sequence_length: 128
      # lr: 8.0e-5
      lr: 1.0e-5
      wd: 8.0e-6
  cola:
    seeds: [19, 8364, 717, 10536]
    trainer_kwargs:
      save_num_checkpoints_to_keep: 0
      max_sequence_length: 128
      # lr: 5.0e-5
      lr: 1.0e-5
      wd: 1.0e-6