# Switching to a new layout where each task has its own config, so that I don't have to worry about changing 100 defaults every time.
# Hydra defaults list: the config-group options composed into this config.
# `_self_` is listed first, so on key collisions the entries listed after it
# take precedence over the values defined in this file.
# Commented-out entries are currently disabled.
defaults:
  - _self_
#   - tokenizer: physion
  - dvae: default
  - path_constants: path_constants
#   - dataset: physion-training
#   - transformer: physion
  - task_worldmodeling: default

# Store Hydra's per-run config output in a visible `hydra` subdirectory
# (Hydra's built-in default name for this folder is the hidden `.hydra`).
hydra:
  output_subdir: hydra

# Placeholder default that is handled by a special decorator in the code.
# This value can be overridden.
device: CUDA_IF_AVAILABLE
# `???` marks the value as mandatory but not yet set. The same decorator sets
# it based on `device`; this should NOT be overridden.
device_type: ???

# TODO: add a random-seed setting (none is defined in this file yet).

# Feature toggles (booleans use the canonical lowercase YAML spelling).
enable_profiling: false
# Not implemented in all training scripts (default is true).
enable_videos: true
# Currently only implemented in train_three_transformers, where it causes the
# script to write a simple memory report and exit after 2 epochs
# (unless this comment has become outdated since then).
report_memory: false

# Values grouped under `the_global`; presumably shared by the other config
# groups through interpolation — TODO confirm against the group configs.
the_global:
  embedding_dim: 768
  # An upper bound. The actual sequence length depends on the tokenizer config.
  tokimg_sequence_length: 65
  # GPT-2's vocab_size is 50257; nanoGPT pads it up to the nearest multiple of
  # 64 (i.e. 50304) for efficiency.
  vocab_size: 50304

# Dataset sizes, for reference: phyre dataset 00002 has exactly 50000 videos;
# MOVi-E has 9749.
training:
  train_split: 0.95
  batch_size: 10
  epochs: 200
  batches_per_epoch: 50
  # Because of memory constraints, only a small number of samples is
  # parallelized at a time. Loss is accumulated so that the gradient update is
  # still per-batch.
  subbatch_size: 2
  # TODO: this is the value used when the tokenizer is trained on its own. It
  # is a good starting point, but it may need adjusting for joint training.
  # Not referenced by any interpolation in this file.
  _tokenizer_lr: 0.0001
  # Shared learning rate, interpolated by the sections below. Floats in
  # exponent notation are written with an explicit mantissa dot so that plain
  # YAML 1.1 loaders also read them as floats rather than as strings.
  _transformer_lr: 6.0e-4
  # Per-component optimizer hyperparameters.
  crosscoder:
    learning_rate: ${training._transformer_lr}
    weight_decay: 1.0e-1
    betas: [0.9, 0.95]
  decoder:
    learning_rate: ${training._transformer_lr}
    weight_decay: 1.0e-1
    betas: [0.9, 0.95]
  encoder:
    learning_rate: ${training._transformer_lr}
    weight_decay: 1.0e-1
    betas: [0.9, 0.95]

# Evaluation reuses the training batch sizes through interpolation, so changing
# `training.batch_size` or `training.subbatch_size` changes these values too.
eval:
  batch_size: ${training.batch_size}
  subbatch_size: ${training.subbatch_size}
