# @package _global_

# Hydra defaults list: the config-group options this experiment is composed
# from. The leading "/" addresses each group from the config root. Entry order
# is significant to Hydra's composition, so keep it as written.
# NOTE(review): there is no `_self_` entry, so whether the overrides in this
# file win over these defaults depends on the Hydra version's implicit
# ordering -- confirm against the version in use.
defaults:
  - /model: flat_enc_dec
  - /train: pretrain
  - /dataset: flat
# Model overrides for this experiment. Scalar settings come first and nested
# sub-configs last, so model-level keys cannot be misread as belonging to the
# nested mapping printed directly above them.
model:
  # Context-embedding settings. Values are enum member names that are resolved
  # by the consuming code.
  session_embed_token_count: 8
  task_embed_strategy: EmbedStrat.token
  subject_embed_strategy: EmbedStrat.token
  array_embed_strategy: EmbedStrat.none

  causal: true

  # Model-level keys (siblings of `transformer` and `task`, not children).
  hidden_size: 512
  # Same value as dataset.neurons_per_token in this file.
  # NOTE(review): presumably the two must stay in sync -- confirm.
  neurons_per_token: 16

  transformer:
    n_layers: 12

  task:
    mask_ratio: 0.5

# Dataset overrides for this experiment.
dataset:
  # Tokenization limits. neurons_per_token has the same value as
  # model.neurons_per_token in this file.
  # NOTE(review): presumably the two must stay in sync -- confirm.
  neurons_per_token: 16
  max_tokens: 8192
  max_arrays: 2
  max_channels: 288

  # Time-related settings, in milliseconds per the key names.
  max_length_ms: 2000
  bin_size_ms: 20

  # Dataset selectors included in this run; the trailing ".*" suggests they
  # are patterns -- TODO confirm how the loader matches them.
  datasets:
    - churchland_maze.*
    - churchland_misc.*
    # - CRS02bHome.data.*
    - odoherty_rtt-Loco.*
    - mc_maze.*
    - dyer_co.*
    - gallego_co.*

# Training overrides: a fixed batch size of 32 with batch-size autoscaling
# switched off.
train:
  batch_size: 32
  autoscale_batch_size: false
# Free-text description of the experiment. The folded scalar joins the lines
# below with single spaces and drops the trailing newline.
notes: >-
  Want to figure how to aggregate without task interference
  (which we observe in 125K base), testing capacity.
