# @package _global_
# Controlled evaluation
# Hydra defaults list: the config-group presets this experiment is built from.
# Entries that are commented out are disabled options kept for quick toggling.
defaults:
  - /model:
    - flat_enc_dec
    # - accel_tune
  # - /model/task:
    # - bhvr_decode_flat
  # - /train: finetune
  - /dataset: flat
# Model overrides for this run: embedding options, tokenization, task settings
# and the learning-rate schedule.
model:
  # Early scaling experiments already showed that embed strategies promote faster
  # convergence and do not hurt peak performance. Any benefit must come explicitly
  # through transfer mechanisms, so there is no reason to disallow them here.
  session_embed_token_count: 8
  subject_embed_strategy: EmbedStrat.token
  # Same value as dataset.neurons_per_token in this file — presumably the two
  # must stay in sync; confirm before changing only one of them.
  neurons_per_token: 16
  causal: true

  task:
    behavior_lag: 120 # NOTE(review): units not stated here (presumably ms) — confirm
    decode_time_pool: "mean"

  # Written with an explicit mantissa dot so that YAML 1.1 loaders (e.g. plain
  # PyYAML) also resolve it as a float instead of the string "5e-5".
  lr_init: 5.0e-5
  lr_ramp_steps: 1000
  lr_decay_steps: 10000
  accelerate_new_params: 10.0
  tune_decay: 0.75 # per Kaiming MAE

# Dataset overrides: size limits, tokenization and which sessions are used for
# training and for evaluation.
dataset:
  max_tokens: 8192
  max_length_ms: 2000 # fit
  max_arrays: 1

  # NOTE(review): the inline remark says "no limit" although the value is 800 —
  # presumably 800 is never reached for the single session used here; confirm.
  scale_limit_per_session: 800 # no limit

  # Same value as model.neurons_per_token in this file — presumably the two
  # must stay in sync; confirm before changing only one of them.
  neurons_per_token: 16
  max_channels: 96

  data_keys:
  - DataKey.spikes
  # - DataKey.bhvr_vel
  datasets:
  # This session was selected for being the "biggest" and for having a substantial
  # number of nearby sessions (16 within +- 2 months).
  # This lets us evaluate in-session and out-of-session scaling 1:1.
  # We would like to see how much out-of-session performance degrades, where the
  # baseline is still single-session performance.
  - odoherty_rtt-Indy-20160627_01

  # Nearby sessions, currently disabled:
  # - odoherty_rtt-Indy-20160624_03
  # - odoherty_rtt-Indy-20160630_01
  # - odoherty_rtt-Indy-20160622_01
  # - odoherty_rtt-Indy-20160426_01
  # - odoherty_rtt-Indy-20160420_01
  # - odoherty_rtt-Indy-20160419_01
  # - odoherty_rtt-Indy-20160418_01


  # Other multi-dataset configs scale radially out from this session, preferring
  # backward (presumably earlier recording dates — confirm).
  # Evaluation uses the same single session that is listed under `datasets`.
  eval_datasets:
  - odoherty_rtt-Indy-20160627_01
  # odoherty_rtt:
  #   arrays: ["Indy-M1", "Loco-M1"]
  #   include_sorted: False
# Training-loop overrides; the batch-size settings are left commented out.
train:
  # autoscale_batch_size: false
  # batch_size: 128
  # NOTE(review): presumably early-stopping patience; the unit (epochs vs.
  # validation checks) is not visible from this file — confirm.
  patience: 200
# NOTE(review): presumably the id of the pretrained run to initialize from and
# the checkpoint tag to load from it — confirm against the loading code.
init_from_id: multi_800-4h3svga0
init_tag: val_loss