# @package _global_
# For use in NLB
# Hydra defaults list: base config-group options that the `model`, `train` and
# `dataset` sections of this file then override.
defaults:
  - /model: flat_enc_dec
  - /train: pretrain
  - /dataset: flat
# Model overrides layered on the `flat_enc_dec` option selected in `defaults`.
model:
  session_embed_token_count: 8
  # Enum-style values; presumably resolved to `EmbedStrat` members by the
  # config schema -- confirm.
  task_embed_strategy: EmbedStrat.token
  subject_embed_strategy: EmbedStrat.token

  # Canonical lowercase boolean (previously `False`); same parsed value.
  causal: false

  transformer:
    n_layers: 32  # 24 -> 32 layers at this scale; see design notes at end of file
    n_heads: 4
    pre_norm: true  # see design notes at end of file
    debug_force_nonlearned_position: true
  # NOTE: `hidden_size` is a direct child of `model`, not of `transformer`.
  hidden_size: 512

  task:
    mask_ratio: 0.3  # see design notes at end of file
  # NOTE: `neurons_per_token` is a direct child of `model`, not of `task`.
  # It carries the same value as `dataset.neurons_per_token`; presumably the
  # two must be kept in sync -- confirm.
  neurons_per_token: 16

  weight_decay: 0.01
  # NOTE(review): the design notes discuss the decay horizon in epochs, so
  # these `*_steps` values are presumably counted in epochs -- confirm.
  lr_ramp_steps: 100
  lr_decay_steps: 300
  # Written with an explicit mantissa dot: YAML 1.1 resolvers (e.g. stock
  # PyYAML) load a bare `1e-5` as the string "1e-5" instead of a float.
  lr_min: 1.0e-5

  # force_zero_mask: true
# Data overrides layered on the `flat` option selected in `defaults`.
dataset:
  neurons_per_token: 16  # same value as `model.neurons_per_token`
  max_tokens: 8192
  max_channels: 288
  max_arrays: 2

  # Dataset selectors. The entries look like regular expressions (`.*`
  # wildcards, `$` end anchor) -- presumably matched against dataset names;
  # confirm against the loader. Quoted so they are always read as strings.
  datasets:
    - 'churchland_maze.*'
    - 'churchland_misc.*'
    - 'observation_.*'
    - 'ortho_.*'
    - 'fbc_.*'
    - 'unstructured_.*'
    - 'REDACT_co.*'
    - 'odoherty_rtt.*'
    - 'mc_maze$'
    - 'dyer_co.*'
    - 'gallego_co.*'
  # Selectors listed for evaluation; only `mc_maze$` is given here.
  eval_datasets:
    - 'mc_maze$'
# Training overrides layered on the `pretrain` option selected in `defaults`.
train:
  accumulate_batches: 4
  autoscale_batch_size: false
  # NOTE(review): if `batch_size` is per GPU, 64 x 4 accumulated batches x 2 GPUs
  # gives 512, not the 1024 quoted in the comment below -- confirm the intended
  # GPU count / accumulation factor.
  batch_size: 64 # for A100 80G. 2x GPU - effective batch size 1024
# Free-form description of this run.
notes: "Data ~100x -> Model ~100x (0.3M -> 30M) 1. Sched. to 1K epochs"
# Identifier of an earlier run; presumably its checkpoint is used to
# initialize this one -- confirm against the training entry point.
load_from_id: m3_250k-2txblr93
# Design notes
# 150K -> 250K: 24 -> 32 layers. (Prefer deeper models, following Tay 22.)
# pre-norm: stability was never an issue after enabling this, and performance is comparable (and it matches the literature).
# mask-ratio: 0.3 was optimal in experiments.
# chunk: 16 - a chunk size of 8 seemed better but gets too expensive.
# decay: 300 - Chinchilla recommendation; our 150K run converged by ~200 epochs, and there is no reason to think it is much different for 250K.
# weight_decay: 0.01
# ? Is our model big enough? We never reached parity on `mc_maze` -- are we "overfit", or is the model not large enough to contain it? Presumably the latter.