# Run train.py once with a fixed set of command-line options. Every value
# below is a literal; nothing is read from the environment or from arguments
# passed to this script.
#
# NOTE(review): the meaning of each flag is defined by train.py, which is not
# visible from this file. Going by the names alone, --hook_name/--hook_layer
# both point at layer 8 and --device selects CUDA device index 2 — confirm
# against train.py's argument parser before changing one without the others.
python train.py \
  --model_name_or_path gpt2-small \
  --d_in 768 \
  --device cuda:2 \
  --expansion_factor 32 \
  --hook_name blocks.8.hook_resid_pre \
  --hook_layer 8 \
  --batch_size 8192 \
  --dataset_path togethercomputer/RedPajama-Data-1T \
  --l1_coefficient 5 \
  --datadir default \
  --use_ghost_grads