# Training configuration: a flat mapping of hyperparameters.
# NOTE(review): the consumer of this file is not visible here; comments below
# describe only what the values themselves show, with assumptions marked.

# --- Model and hook point ---
architecture: standard
model_name: pythia-70m-deduped
hook_name: blocks.4.hook_resid_post
# Same index as the block number embedded in hook_name above; keep in sync.
hook_layer: 4
context_size: 128

# --- Activation source ---
use_cached_activations: true
dataset_path: monology/pile-uncopyrighted
# Placeholder value: replace with a real path before running, since
# use_cached_activations is enabled above.
cached_activations_path: <PATH_TO_CACHED_ACTIVATIONS>
# NOTE(review): presumably the activation width at hook_name — confirm it
# matches the model.
d_in: 512

# --- Training schedule and optimisation ---
# 100 million, written without digit separators: `100_000_000` is an integer
# only under YAML 1.1 resolvers and becomes a string under YAML 1.2.
training_tokens: 100000000
train_batch_size_tokens: 16384
seed: 42
expansion_factor: 64
lr: 0.0001
l1_coefficient: 0.1

# --- Logging and evaluation ---
log_to_wandb: true
wandb_log_frequency: 10
# NOTE(review): very large value; presumably chosen so that periodic
# evaluation effectively never triggers — confirm this is intended.
eval_every_n_wandb_logs: 10000000

# --- Hardware ---
device: cuda
