# Trainer settings for a minimal smoke-test run (note the tiny step counts and
# batch sizes of 1).
# NOTE(review): most keys look like Hugging Face `TrainingArguments` fields —
# confirm against the script that consumes this file.
training_args:
  run_name: test_fsm_model
  output_dir: output_fsm_test
  overwrite_output_dir: true
  do_train: true
  do_eval: true
  per_device_train_batch_size: 1
  per_device_eval_batch_size: 1
  # Written with an explicit mantissa dot so YAML 1.1 loaders (e.g. PyYAML)
  # resolve it as a float; a bare `1e-3` is read as a string by those loaders.
  learning_rate: 1.0e-3
  warmup_ratio: 0.0
  lr_scheduler_type: constant
  max_steps: 2
  evaluation_strategy: steps
  eval_steps: 2
  save_steps: 2
  ddp_find_unused_parameters: false
  # Quoted to make clear this is the literal string "none", not a YAML null.
  report_to: "none"
  use_cpu: true

  # NOTE(review): the keys below are presumably project-specific additions
  # rather than stock trainer options — confirm.
  train_model_params: true
  model_lr_factor: 1
  codebook_reg_p: null
  codebook_weight_decay: 0.01

# Codebook settings. The four list-valued keys below each have two entries.
# NOTE(review): presumably the lists are matched up by position (entry i of
# each list describes the same codebook placement) — confirm.
codebook_args:
  codebook_at: ["attn_preproj", "mlp"]
  codebook_type: ["group", "vanilla"]
  num_codebooks: [-1, 1]
  k_codebook: [1, 2]
  num_codes: 10
  layers_to_snap: all
  similarity_metric: inner_product
  loss: aeloss
  kmeans_init: false
  codebook_kwargs:
    replace_after_steps: 2
    # Written with an explicit mantissa dot so YAML 1.1 loaders (e.g. PyYAML)
    # resolve it as a float; a bare `1e-4` is read as a string by those loaders.
    replace_rho: 1.0e-4

# Explicitly null in this config.
# NOTE(review): presumably null means no k-scheduler is configured — confirm.
k_scheduler_kwargs: null

# FSM dataset settings; all sizes here are very small.
# NOTE(review): with `path` null the dataset is presumably generated from
# `num_states` / `num_edges` / `seed` rather than loaded from disk — confirm.
fsm_dataset_args:
  path: null
  num_states: 4
  num_edges: 1
  seed: 42
  max_eval_samples: 2

# Model settings: a very small configuration (2 hidden layers, hidden size 16).
# NOTE(review): with `model_path` null the model is presumably built from the
# fields below instead of being loaded from a checkpoint — confirm.
model_config_args:
  model_path: null
  model_type: gptneox
  hidden_size: 16
  intermediate_size: 64
  num_hidden_layers: 2
  num_attention_heads: 1
  rotary_emb_base: 10
  seq_len: 32
  vocab_size: 3

# Top-level run switches and experiment tagging.
apply_codebook: true
enable_logging: false
wandb_charts: false
pretrained_path: null
get_baseline: false
# Each entry matches a key name under `codebook_args`.
# NOTE(review): presumably the values of those keys are turned into run tags —
# confirm.
tag_keys:
  - codebook_type
  - num_codebooks
  - k_codebook
  - loss
tags:
  - test_fsm
