# Experiment Settings
wandb_project: "gpt2-small"
# No run name is set here; presumably one is derived at launch — TODO confirm.
wandb_run_name: null
wandb_tags:
  - topk
# Top-level seed. Note that data.seed is a separate key and is null in this file.
seed: 42

# Model Configuration
tlens_model_name: gpt2-small
# No local path is given; presumably the model is resolved from
# tlens_model_name instead — TODO confirm against the loader.
tlens_model_path: null

# Training Configuration
# Cadences below are expressed in samples. The underscore digit separators
# (40_000, 20_000) are YAML 1.1 integer syntax, as accepted by PyYAML.
# save_every_n_samples is null; presumably that disables periodic
# checkpointing — TODO confirm.
save_every_n_samples: null
eval_every_n_samples: 40_000
gradient_accumulation_steps: 1
# Written with an explicit decimal point on purpose: YAML 1.1 resolvers such
# as PyYAML only recognise exponent-form floats that contain a ".", so a bare
# 5e-4 is loaded as the string "5e-4" rather than the float 0.0005.
lr: 5.0e-4
lr_schedule: cosine
# Presumably the schedule's floor as a fraction of lr — TODO confirm.
min_lr_factor: 0.1
warmup_samples: 20_000
max_grad_norm: 10.0
log_every_n_grad_steps: 20

# Data Configuration
data:
  # Dataset and tokenizer identifiers; presumably Hugging Face Hub ids —
  # TODO confirm against the data loader.
  dataset_name: 'apollo-research/Skylion007-openwebtext-tokenizer-gpt2'
  tokenizer_name: 'gpt2'
  context_length: 1024
  # Sample budgets and batch sizes for the train and eval passes.
  n_train_samples: 300_000
  n_eval_samples: 1_000
  train_batch_size: 8
  eval_batch_size: 4
  streaming: true
  # Left null here, independently of the top-level seed key.
  seed: null
  # The dataset is flagged as already tokenized; column_name presumably names
  # the column holding the token ids — TODO confirm.
  is_tokenized: true
  column_name: 'input_ids'
  split: 'train'

# SAE Configuration - TopK SAE (sae_type: topk, k: 32)
saes:
  name: topk_sae
  sae_type: topk
  dict_size_to_input_ratio: 32.0
  # No pretrained SAE checkpoints are referenced and retraining is off.
  pretrained_sae_paths: null
  retrain_saes: false
  # Hook points the SAEs are attached to: the residual stream entering
  # blocks 2, 4, 6, 8 and 10.
  sae_positions:
    - 'blocks.2.hook_resid_pre'
    - 'blocks.4.hook_resid_pre'
    - 'blocks.6.hook_resid_pre'
    - 'blocks.8.hook_resid_pre'
    - 'blocks.10.hook_resid_pre'
  init_decoder_orthogonal: true
  # Null here; presumably unused when sae_type is topk — TODO confirm.
  sparsity_coeff: null
  # Presumably the number of latents kept active per input — TODO confirm.
  k: 32
  tied_encoder_init: true
  # Presumably the settings for an auxiliary loss term (latent count and
  # loss weight) — TODO confirm against the SAE implementation.
  aux_k: 64
  aux_coeff: 0.03