# @package _global_

# Config groups composed into this experiment (Hydra-style defaults list).
defaults:
  - /model: flat_enc_dec
  # Alternative model preset, currently disabled:
  # - /model: pretrain_small
  - /train: nlb
  - /dataset: flat
  # Alternative dataset preset, currently disabled:
  # - /dataset: maze_nlb
# Dataset settings for this experiment.
dataset:
  # NOTE(review): the training list says `mc_maze_med` while the eval list
  # says `mc_maze_medium`. Confirm both names resolve to the same dataset
  # (e.g. if names are matched as patterns), or align the spelling.
  datasets:
    - mc_maze_med
  eval_datasets:
    - mc_maze_medium
  eval_ratio: 0.2
  nlb_maze:
    # Same count as model.task.query_heldout.
    heldout_neurons: 45
  data_keys:
    - DataKey.spikes
    - DataKey.heldout_spikes
  bin_size_ms: 20
  # Both values are 128, the same as model.neurons_per_token.
  neurons_per_token: 128
  max_channels: 128
# Model settings for this experiment.
model:
  # Same value as dataset.neurons_per_token.
  neurons_per_token: 128
  session_embed_token_count: 1
  # session_embed_token_count: 8
  # Task and subject embeddings are disabled; the token variants are kept
  # below as commented-out alternatives.
  task_embed_strategy: EmbedStrat.none
  # task_embed_strategy: EmbedStrat.token
  subject_embed_strategy: EmbedStrat.none
  # subject_embed_strategy: EmbedStrat.token
  hidden_size: 128

  # NOTE(review): dropout is 0.8 both here and in the transformer section,
  # which is a high rate -- confirm this is intentional.
  dropout: 0.8
  transformer:
    dropout: 0.8
    n_layers: 6  # since we use 2 decoder layers
    n_heads: 2
    debug_force_nonlearned_position: true
  task:
    # One weight per entry in `tasks`, in the same order.
    task_weights: [1.0, 1.0]
    tasks:
      - ModelTask.shuffle_infill
      - ModelTask.heldout_decoding
    mask_ratio: 0.25
    # Same count as dataset.nlb_maze.heldout_neurons.
    query_heldout: 45  # mc_maze maxed at this
    metrics:
      - Metric.co_bps
      - Metric.block_co_bps
    decode_time_pool: 'mean'  # shouldn't matter
    decode_use_shuffle_backbone: true

  # Learning-rate schedule lengths, in steps.
  lr_ramp_steps: 3000
  lr_decay_steps: 100000

# Training settings for this experiment.
train:
  patience: 5000

# Free-form description of the run. This switches masking strategy.
notes: 'Reference 1. Switch to shuffle, without changing anything else.'