# @package _global_

# Really just a test of throughput on heterogeneous data rather than any specific win; scores can be compared with those in just REDACT.

# Defaults list: the base model and training configs this experiment builds on.
defaults:
  - /model: flat_enc_dec
  - /train: pretrain
model:
  # Embedding strategies: task and subject use EmbedStrat.token, while the
  # array strategy is set to EmbedStrat.none.
  task_embed_strategy: EmbedStrat.token
  subject_embed_strategy: EmbedStrat.token
  array_embed_strategy: EmbedStrat.none
  # Commented-out alternatives kept for quick toggling:
  # array_embed_strategy: EmbedStrat.token_add
  # causal: True

  transformer:
    # Sizing note: 6 layers used for 30K, 9 layers for ~60K, 15 for 120K.
    n_layers: 12

  # Same value as dataset.neurons_per_token in this file.
  neurons_per_token: 8
  hidden_size: 384

  task:
    # NOTE(review): presumably the fraction masked for the infill task - confirm.
    mask_ratio: 0.4
    tasks:
      - ModelTask.shuffle_infill
      # - ModelTask.kinematic_decoding
    metrics: []
    # Commented-out metric entry (would replace the empty list above):
    # - Metric.kinematic_r2
dataset:
  # Size limits and binning.
  max_tokens: 4096
  max_length_ms: 2000
  max_arrays: 2
  bin_size_ms: 20

  # Tokenized serving; neurons_per_token has the same value as
  # model.neurons_per_token in this file.
  serve_tokenized: true
  serve_tokenized_flat: true
  neurons_per_token: 8

  # 192 = 2 x 96... this should also cover Dyer.
  max_channels: 192

  # Training datasets.
  # NOTE(review): entries look like patterns matched against dataset names -
  # confirm the matching semantics with the loader.
  datasets:
    - 'churchland_maze.*'
    - 'churchland_misc.*'
    - 'CRS02bHome.data.*'
    - 'odoherty_rtt.*'
    - 'mc_maze.*'
    - 'dyer_co.*'
    # - mc_rtt

  eval_datasets:
    # Using this as a convenient marker where we already have 20ms experiments.
    - 'CRS02bHome.data.00437'
    # - 'mc_rtt'
    # - 'odoherty_rtt-Indy-20161005_06'

  # Arrays listed for the odoherty_rtt datasets.
  odoherty_rtt:
    arrays:
      - Indy-M1
      - Loco-M1

  data_keys:
    - DataKey.spikes

  meta_keys:
    - MetaKey.unique
    - MetaKey.session
    - MetaKey.array
    - MetaKey.subject
    - MetaKey.task

train:
  # Training settings specific to this experiment; the base training config is
  # named by `/train: pretrain` in the defaults list.
  batch_size: 5 # 3 by 5, close enough to 4x4
  # accumulate_batches: 2 # only 2 nodes
# load_from_id: base_20-yjhvosga
# init_tag: val_loss