# Experiment settings: run identifiers and the top-level seed.
# NOTE(review): the wandb_* keys presumably feed Weights & Biases logging;
# confirm against the code that loads this config.
wandb_project: "gpt2-small"
wandb_run_name: "probabilistic_topk"
# A single tag, written as a block sequence.
wandb_tags:
  - "hc_topk"
seed: 42

# Model Configuration
# The model is selected by name; no explicit path is given (null).
# NOTE(review): the "tlens_" prefix presumably refers to TransformerLens;
# confirm against the code that loads this config.
tlens_model_name: "gpt2-small"
tlens_model_path: null

# Training Configuration
# Save/eval cadence and warmup are expressed in samples; logging cadence is in
# gradient steps. save_every_n_samples has no value set here (null).
# NOTE: `40_000`-style digit separators are integers under YAML 1.1 loaders
# (e.g. PyYAML); a strict YAML 1.2 core-schema loader reads them as strings.
save_every_n_samples: null
eval_every_n_samples: 40_000
gradient_accumulation_steps: 1
# Written with an explicit mantissa dot: YAML 1.1 loaders such as PyYAML
# resolve a dot-less `5e-4` as the string "5e-4" rather than a float.
lr: 5.0e-4
lr_schedule: cosine
min_lr_factor: 0.1
warmup_samples: 20_000
max_grad_norm: 10.0
log_every_n_grad_steps: 20

# Data Configuration
data:
  # Dataset source and how it is read.
  dataset_name: "apollo-research/Skylion007-openwebtext-tokenizer-gpt2"
  split: "train"
  streaming: true
  # The dataset is flagged as already tokenized.
  # NOTE(review): column_name presumably names the token-id column; confirm
  # against the data loader.
  is_tokenized: true
  column_name: "input_ids"
  tokenizer_name: "gpt2"

  # Sequence length and sample budgets.
  # NOTE: `300_000`-style digit separators are integers under YAML 1.1 loaders
  # (e.g. PyYAML); a strict YAML 1.2 core-schema loader reads them as strings.
  context_length: 1024
  n_train_samples: 300_000
  n_eval_samples: 1_000

  # Batch sizes (train and eval are equal here).
  train_batch_size: 8
  eval_batch_size: 8

  # No data-specific seed is set in this file.
  seed: null

# SAE Configuration - Hard Concrete SAE with learned gates
# Keys set to null carry no value in this file; how the consumer treats them
# is not visible here.
saes:
  name: "hardconcrete_topk_sae"
  sae_type: "hard_concrete_topk"
  dict_size_to_input_ratio: 32.0
  pretrained_sae_paths: null
  retrain_saes: false
  # Five positions: every second block from 2 to 10, each at hook_resid_pre.
  sae_positions:
    - blocks.2.hook_resid_pre
    - blocks.4.hook_resid_pre
    - blocks.6.hook_resid_pre
    - blocks.8.hook_resid_pre
    - blocks.10.hook_resid_pre
  init_decoder_orthogonal: true
  sparsity_coeff: null
  k: 32
  tied_encoder_init: true
  aux_k: null
  aux_coeff: null
  initial_beta: 5.0
  # Written with an explicit mantissa dot: YAML 1.1 loaders such as PyYAML
  # resolve a dot-less `1e-4` as the string "1e-4" rather than a float.
  final_beta: 1.0e-4
  use_magnitude: true
  straight_through: false
  tau: null
  anneal_ratio: null
  normalize_scores: true
  normalize_magnitude: false
  add_magnitude_to_scores: true
  z_scale: 1.0
  # Same explicit-dot float form as final_beta, for the same reason.
  magnitude_scale: 1.0e-4
  use_hard_concrete: true
  use_layer_norm: true