# Experiment config layered on top of a parent config file.
# NOTE(review): the hash below presumably pins the expected contents of the
# parent file; confirm with the launcher before editing either one.
parent: 'research/conditional/train/configs/baselines/gpt/small.yaml'
md5_parent_hash: 'cbdf4e5992f41713488c25c819086e51'
interactive_debug: false
# Quoted so it always stays a string.
# NOTE(review): looks like a D-HH:MM:SS duration (2 days); confirm.
time: '2-00:00:00'
params:
  ff_mode: 'cont_moe'
  n_experts: 256
  # NOTE(review): keys prefixed with `^` carry a list of values; presumably
  # the launcher expands them into one run per value. Confirm.
  ^group_size: [32, 64, 128]
  flop_matched: true
  sparsity_dim: 0
  learning_rate: 0.001
  temperature: 1.0
  # NOTE(review): the spelling "chungs" is kept verbatim because the key is
  # read by code outside this file; verify the consumer before renaming.
  ^loss_checkpoint_chungs: [8]
  name: 'dynamic_group_size_multiply_by_gs'
  should_evaluate_dynamic_groupsize: true
  n_eval_batches: 3
  decoding_interval: 0
  flash_attention: true
