# @package _global_
# For use in NLB (Neural Latents Benchmark)
# Hydra defaults list: each entry selects one option from a config group
# (a leading `/` makes the group path absolute).
defaults:
  - /model: flat_enc_dec
  - /model/task: joint_heldout_decode
  - /train: pretrain
  - /dataset: flat
# Model settings for this experiment (base option: flat_enc_dec, with the
# joint_heldout_decode task, per the defaults list).
model:
  session_embed_token_count: 8
  # Task and subject context both use the `token` embedding strategy.
  task_embed_strategy: EmbedStrat.token
  subject_embed_strategy: EmbedStrat.token

  # Booleans are written in canonical lowercase, matching `pre_norm` below.
  causal: false

  transformer:
    n_layers: 12
    pre_norm: true
  # NOTE: `hidden_size` is a direct child of `model`, not of `transformer`.
  hidden_size: 384

  task:
    # presumably one weight per task in joint_heldout_decode -- TODO confirm
    task_weights: [0, 1.0]
    mask_ratio: 0.3
    # presumably the number of held-out units queried -- TODO confirm
    query_heldout: 45
    decode_time_pool: 'mean'
    decode_strategy: EmbedStrat.project
    decode_use_shuffle_backbone: true
  # NOTE: direct child of `model`; same value as `dataset.neurons_per_token`.
  neurons_per_token: 32

  # Mantissa carries a decimal point so YAML 1.1 loaders also parse a float
  # (a bare `1e-5` is read as a string by such loaders).
  lr_init: 1.0e-5
  accelerate_new_params: 1.0
  lr_schedule: 'fixed'  # per chinchilla...
  # tune_decay: 0.75

# Dataset settings for this experiment (base option: flat, per the defaults
# list).
dataset:
  # Same value as `model.neurons_per_token`.
  neurons_per_token: 32
  max_tokens: 8192
  max_channels: 288
  max_arrays: 2

  # Data keys requested: spikes plus held-out spikes.
  data_keys:
    - DataKey.spikes
    - DataKey.heldout_spikes
  # A single dataset is listed.
  datasets:
    - mc_maze_small
  # No eval set, just compare directly with the benchmark baselines
# Training settings for this experiment (base option: pretrain, per the
# defaults list).
train:
  # Batch size is pinned at 64 with automatic batch-size scaling turned off.
  batch_size: 64
  autoscale_batch_size: false
  # presumably max epochs and early-stopping patience -- TODO confirm
  epochs: 10000
  patience: 500
# Run-level settings: the run to initialize from and the probing mode.
# presumably `init_tag` picks which checkpoint of that run -- TODO confirm
init_from_id: 'm3_150k-hodo53b1'
init_tag: 'val_loss'
probe_finetune: true
# Free-text description of the experiment.
notes: 'trying constant LR probing on novel test set'