# Dataset settings, pulled into `default.train` through the `en_te_text` anchor.
# NOTE(review): the anchor name mentions only en/te, but nine language codes
# are listed below - confirm the name is still accurate.
en_te_text: &en_te_text
   # Placeholder path: substitute `project_account` and `your_username`.
   data: /fs/scratch/project_account/your_username/experiments/fxt/data/data/
   # Every language code currently maps to itself.
   # Reminder: a code that looks like a YAML boolean (e.g. `no`) would have to
   # be quoted on both sides of the colon.
   language_to_script:
      en: en
      fr: fr
      es: es
      te: te
      hi: hi
      bn: bn
      uk: uk
      ru: ru
      be: be
   streaming: true
   # Placeholder path: substitute `project_account` and `your_username`.
   cache_dir: /fs/scratch/project_account/your_username/.cache/
   # Streaming must be false to use slice; streaming is enabled above.
   slice: '100%'
   # NOTE(review): hard-coded process count - confirm it suits the target host.
   num_proc: 80


# Model hyper-parameters, pulled into `default.train` through the `model`
# anchor. Booleans use the canonical lowercase spelling throughout.
model: &model
   d_model: 768
   # n_head * d_head = 12 * 64 = 768, the same value as d_model.
   n_head: 12
   d_head: 64
   # 3072 = 4 * d_model.
   d_inner: 3072
   dropout: 0.1
   dropatt: 0.1
   pre_lnorm: false
   # Stored as a string, not a YAML sequence; the consumer is presumably
   # responsible for parsing it - confirm the expected format there.
   model_config: '[2, (12,), 2]'
   activation_function: gelu
   shuffle: true
   roll: true
   # NOTE(review): presumably a worker count - confirm against the consumer.
   nw: 8
   fp16: true
   boundary_supervision: false
   num_predictors: 1
   # The `train` block defines `seq_len` as well (also 512); both end up in
   # `default.train`, so keep the two values in sync.
   seq_len: 512
   learn_prior: true



# Boundary-predictor settings, pulled into `default.train` through the
# `boundaries` anchor.
boundaries: &boundaries
   boundaries_type: 'gumbel'
   fixed_sf: 2
   spikes_left: 2
   temp: 0.5
   prior_one: 0.5
   prior_two: 0.2
   # The three comma-separated strings below each hold nine entries.
   # NOTE(review): presumably entry i of `prior_list` / `prior_std` belongs to
   # entry i of `script_tokens` - confirm, and keep the lengths equal.
   script_tokens: 'en,es,fr,ru,uk,be,hi,bn,te'
   prior_list: '0.333,0.275,0.269,0.178,0.167,0.161,0.13,0.127,0.124'
   prior_std: '0.07,0.057,0.055,0.035,0.033,0.031,0.026,0.026,0.025'

# Evaluation settings, pulled into `default.train` through the `eval` anchor.
eval: &eval
   # eval_interval and eval_max_steps are currently set to the same value.
   eval_interval: 20000
   eval_max_steps: 20000
   # eval_total_len (2048) is 4 * eval_tgt_len (512).
   eval_tgt_len: 512
   eval_total_len: 2048
   # NOTE(review): twice the training `batch_size` (128) - confirm it fits
   # in memory on the target hardware.
   eval_batch_size: 256

# Optimizer and learning-rate schedule settings, pulled into `default.train`
# through the `optim` anchor.
#
# Floats in exponent notation are written with an explicit mantissa dot
# (5.0e-5, not 5e-5): YAML 1.1 resolvers such as PyYAML only recognize a
# float when the dot is present and otherwise load the value as a string.
optim: &optim
   optim: adam
   scheduler: cosine
   # Alternative value kept for reference:
   # lr: 0.00025
   lr: 5.0e-5
   # NOTE(review): the `train` block also sets `num_warmup_steps: 9000` -
   # confirm which of the two the consumer actually reads.
   warmup_step: 6000
   clip: 0.25
   weight_decay: 0
   adam_b1: 0.9
   adam_b2: 0.999
   adam_eps: 1.0e-8

# Training-loop settings, pulled into `default.train` through the `train`
# anchor.
train: &train
   # NOTE(review): both a step limit and an epoch count (`num_train_epochs`)
   # are set - confirm which bound the consumer applies.
   max_train_steps: 300000
   # batch_size * gradient_accumulation_steps = 128 * 2 = 256.
   batch_size: 128
   # The `model` block defines `seq_len` as well (also 512); both end up in
   # `default.train`, so keep the two values in sync.
   seq_len: 512
   line_by_line: true
   gradient_accumulation_steps: 2
   # NOTE(review): the `optim` block also sets `warmup_step: 6000` - confirm
   # which of the two the consumer actually reads.
   num_warmup_steps: 9000
   # Quoted on purpose: this value is loaded as a string, not an integer.
   checkpointing_steps: "2000"
   num_train_epochs: 2

# Composed configuration: `default.train` is the shallow merge of the anchored
# top-level blocks of this file.
default:
   train:
      # A single merge key with a sequence of aliases is used because a
      # mapping may not repeat the `<<` key (duplicate keys are invalid YAML
      # and are rejected by strict loaders and linters).
      # On a key collision the mapping listed earlier wins. The only key
      # shared between blocks is `seq_len` (`model` and `train`, both 512).
      <<:
         - *en_te_text
         - *model
         - *boundaries
         - *eval
         - *optim
         - *train
