---
# Sweep definition. The ${...} macros and the program/command/method/metric
# keys look like the Weights & Biases sweep schema -- TODO confirm.

# Script substituted for the ${program} macro in the command below.
program: main.py

# Launch command, one argv token per item; item order is significant.
command:
  - '${env}'
  - python3
  - '${program}'
  - '${args}'

# Search strategy over the entries listed under `parameters`.
method: grid

# Metric the sweep reports on, and the direction that counts as better.
metric:
  goal: maximize
  name: validation/mean_accuracy
# Every entry below pins a single `value`, so nothing in this mapping is varied.
# Entries are grouped by key prefix / name for readability only; YAML mapping
# order carries no meaning. What each key does is defined by the program
# (main.py), which is not visible here -- group titles are a reading aid.
parameters:
  # --- Task, logging and run schedule ---
  task:
    value: slimpajama_transformer
  moe_name:
    value: deepseekv3
  log:
    value: wandb
  test_interval:
    value: 1000
  details_log_interval:
    value: 500
  save_interval:
    value: 20000
  stop_after:
    value: 200000

  # --- Checkpoint locations ---
  # `restore` points at a file located underneath `save_dir`.
  save_dir:
    value: ./pretrain_final/1BL3/deepseekv3
  restore:
    value: ./pretrain_final/1BL3/deepseekv3/slimpajama_moe_no_attmoe_1B_deepseek/checkpoint/model-20000.pth

  # --- Model size and `transformer.*` keys ---
  state_size:
    value: 1024
  dropout:
    value: 0.0
  transformer.variant:
    value: preln_moe
  transformer.encoder_n_layers:
    value: 24
  transformer.n_heads:
    value: 32
  transformer.head_projection_size:
    value: 128
  transformer.universal.group_size:
    value: 24
  transformer.p_drop_layer:
    value: 0.0

  # --- `moe.*` keys, plus unprefixed keys that presumably belong to the ---
  # --- same mixture-of-experts setup (TODO confirm against main.py)     ---
  moe.n_experts:
    value: 23
  moe.expert_size:
    value: 512
  moe.selection_mode:
    value: gate
  moe.drop_expert:
    value: 0.0
  moe.perplexity_reg_mode:
    value: step
  moe.perplexity_reg:
    value: 0.01
  pkm.n_heads:
    value: 7
  max_compete_in_iter:
    value: 3
  balance_loss_coef:
    value: 0.01
  balance_loss_coef_comp:
    value: 0.001
  balance_affinity:
    value: 1

  # --- `lm.*` / `lmds.*` keys ---
  lm.unroll:
    value: 1024
  lm.trafo.context_blocks:
    value: 0
  lm.eval.enabled:
    value: 0
  lmds.valid_ratio:
    value: 0.005

  # --- Optimizer and learning-rate schedule ---
  optimizer:
    value: adamw
  lr:
    value: 0.00025
  wd:
    value: 0.01
  grad_clip:
    value: 0.25
  amp:
    value: 1
  lr_sched.type:
    value: cos
  lr_warmup:
    value: 4000
  min_lr_multiplier:
    value: 0.1

  # --- Batch sizing ---
  batch_size:
    value: 64
  per_device_batch_size:
    value: 16
  # Explicit null: no value is supplied for this key.
  n_microbatch:
    value: null