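## Archived training invocations: every command in this section is commented
## out, so nothing here runs until a block is re-enabled.
##
## NOTE(review): the run below hooks blocks.0.hook_mlp_out (--hook_layer 0) but
## writes to ./wic/layer_8/ghost_grads, while the next run hooks
## blocks.0.hook_resid_pre and writes to ./wic/layer_0/no_ghost_grads.
## The same layer_8/ghost_grads directory is also the target of the blocks.9
## ghost-grads run further down, so the two would share an output directory.
## Confirm the intended hook point and output directory before re-enabling.
# python train.py --model_name_or_path gpt2-small --d_in 768 --device cuda:0 --expansion_factor 32 --hook_name blocks.0.hook_mlp_out --hook_layer 0 --batch_size 1024 --dataset_path pasinit/xlwic  --l1_coefficient 0.05 \
#     --datadir xlwic_en_de xlwic_en_bg xlwic_en_da xlwic_en_et xlwic_en_fa xlwic_en_fr xlwic_en_hr xlwic_en_it xlwic_en_ja xlwic_en_nl xlwic_en_ko xlwic_en_zh \
#     --use_ghost_grads \
#     --output_dir ./wic/layer_8/ghost_grads --total_training_steps 200000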
# python train.py --model_name_or_path gpt2-small --d_in 768 --device cuda:0 --expansion_factor 32 --hook_name blocks.0.hook_resid_pre --hook_layer 0 --batch_size 1024 --dataset_path pasinit/xlwic  --l1_coefficient 0.05 \
#      --datadir xlwic_en_de xlwic_en_bg xlwic_en_da xlwic_en_et xlwic_en_fa xlwic_en_fr xlwic_en_hr xlwic_en_it xlwic_en_ja xlwic_en_nl xlwic_en_ko xlwic_en_zh \
#     --output_dir ./wic/layer_0/no_ghost_grads --total_training_steps 200000
# python train.py --model_name_or_path gpt2-small --d_in 768 --device cuda:0 --expansion_factor 32 --hook_name blocks.1.hook_resid_pre --hook_layer 1 --batch_size 1024 --dataset_path pasinit/xlwic  --l1_coefficient 0.05 \
#     --datadir xlwic_en_de xlwic_en_bg xlwic_en_da xlwic_en_et xlwic_en_fa xlwic_en_fr xlwic_en_hr xlwic_en_it xlwic_en_ja xlwic_en_nl xlwic_en_ko xlwic_en_zh \
#     --use_ghost_grads \
#     --output_dir ./wic/layer_1/ghost_grads --total_training_steps 200000
# python train.py --model_name_or_path gpt2-small --d_in 768 --device cuda:0 --expansion_factor 32 --hook_name blocks.1.hook_resid_pre --hook_layer 1 --batch_size 1024 --dataset_path pasinit/xlwic  --l1_coefficient 0.05 \
#     --datadir xlwic_en_de xlwic_en_bg xlwic_en_da xlwic_en_et xlwic_en_fa xlwic_en_fr xlwic_en_hr xlwic_en_it xlwic_en_ja xlwic_en_nl xlwic_en_ko xlwic_en_zh \
#     --output_dir ./wic/layer_1/no_ghost_grads --total_training_steps 200000
# python train.py --model_name_or_path gpt2-small --d_in 768 --device cuda:0 --expansion_factor 32 --hook_name blocks.2.hook_resid_pre --hook_layer 2 --batch_size 1024 --dataset_path pasinit/xlwic  --l1_coefficient 0.05 \
#     --datadir xlwic_en_de xlwic_en_bg xlwic_en_da xlwic_en_et xlwic_en_fa xlwic_en_fr xlwic_en_hr xlwic_en_it xlwic_en_ja xlwic_en_nl xlwic_en_ko xlwic_en_zh \
#     --use_ghost_grads \
#     --output_dir ./wic/layer_2/ghost_grads --total_training_steps 200000
# python train.py --model_name_or_path gpt2-small --d_in 768 --device cuda:0 --expansion_factor 32 --hook_name blocks.2.hook_resid_pre --hook_layer 2 --batch_size 1024 --dataset_path pasinit/xlwic  --l1_coefficient 0.05 \
#     --datadir xlwic_en_de xlwic_en_bg xlwic_en_da xlwic_en_et xlwic_en_fa xlwic_en_fr xlwic_en_hr xlwic_en_it xlwic_en_ja xlwic_en_nl xlwic_en_ko xlwic_en_zh \
#     --output_dir ./wic/layer_2/no_ghost_grads --total_training_steps 200000
# python train.py --model_name_or_path gpt2-small --d_in 768 --device cuda:0 --expansion_factor 32 --hook_name blocks.3.hook_resid_pre --hook_layer 3 --batch_size 1024 --dataset_path pasinit/xlwic  --l1_coefficient 0.05 \
#     --datadir xlwic_en_de xlwic_en_bg xlwic_en_da xlwic_en_et xlwic_en_fa xlwic_en_fr xlwic_en_hr xlwic_en_it xlwic_en_ja xlwic_en_nl xlwic_en_ko xlwic_en_zh \
#     --use_ghost_grads \
#     --output_dir ./wic/layer_3/ghost_grads --total_training_steps 200000
# python train.py --model_name_or_path gpt2-small --d_in 768 --device cuda:0 --expansion_factor 32 --hook_name blocks.3.hook_resid_pre --hook_layer 3 --batch_size 1024 --dataset_path pasinit/xlwic  --l1_coefficient 0.05 \
#     --datadir xlwic_en_de xlwic_en_bg xlwic_en_da xlwic_en_et xlwic_en_fa xlwic_en_fr xlwic_en_hr xlwic_en_it xlwic_en_ja xlwic_en_nl xlwic_en_ko xlwic_en_zh \
#     --output_dir ./wic/layer_3/no_ghost_grads --total_training_steps 200000
# python train.py --model_name_or_path gpt2-small --d_in 768 --device cuda:0 --expansion_factor 32 --hook_name blocks.4.hook_resid_pre --hook_layer 4 --batch_size 1024 --dataset_path pasinit/xlwic  --l1_coefficient 0.05 \
#     --datadir xlwic_en_de xlwic_en_bg xlwic_en_da xlwic_en_et xlwic_en_fa xlwic_en_fr xlwic_en_hr xlwic_en_it xlwic_en_ja xlwic_en_nl xlwic_en_ko xlwic_en_zh \
#     --use_ghost_grads \
#     --output_dir ./wic/layer_4/ghost_grads --total_training_steps 200000
# python train.py --model_name_or_path gpt2-small --d_in 768 --device cuda:1 --expansion_factor 32 --hook_name blocks.4.hook_resid_pre --hook_layer 4 --batch_size 1024 --dataset_path pasinit/xlwic  --l1_coefficient 0.05 \
#     --datadir xlwic_en_de xlwic_en_bg xlwic_en_da xlwic_en_et xlwic_en_fa xlwic_en_fr xlwic_en_hr xlwic_en_it xlwic_en_ja xlwic_en_nl xlwic_en_ko xlwic_en_zh \
#     --output_dir ./wic/layer_4/no_ghost_grads --total_training_steps 200000
# python train.py --model_name_or_path gpt2-small --d_in 768 --device cuda:1 --expansion_factor 32 --hook_name blocks.5.hook_resid_pre --hook_layer 5 --batch_size 1024 --dataset_path pasinit/xlwic  --l1_coefficient 0.05 \
#     --datadir xlwic_en_de xlwic_en_bg xlwic_en_da xlwic_en_et xlwic_en_fa xlwic_en_fr xlwic_en_hr xlwic_en_it xlwic_en_ja xlwic_en_nl xlwic_en_ko xlwic_en_zh \
#     --use_ghost_grads \
#     --output_dir ./wic/layer_5/ghost_grads --total_training_steps 200000
# python train.py --model_name_or_path gpt2-small --d_in 768 --device cuda:1 --expansion_factor 32 --hook_name blocks.5.hook_resid_pre --hook_layer 5 --batch_size 1024 --dataset_path pasinit/xlwic  --l1_coefficient 0.05 \
#     --datadir xlwic_en_de xlwic_en_bg xlwic_en_da xlwic_en_et xlwic_en_fa xlwic_en_fr xlwic_en_hr xlwic_en_it xlwic_en_ja xlwic_en_nl xlwic_en_ko xlwic_en_zh \
#     --output_dir ./wic/layer_5/no_ghost_grads --total_training_steps 200000
# python train.py --model_name_or_path gpt2-small --d_in 768 --device cuda:1 --expansion_factor 32 --hook_name blocks.6.hook_resid_pre --hook_layer 6 --batch_size 1024 --dataset_path pasinit/xlwic  --l1_coefficient 0.05 \
#     --datadir xlwic_en_de xlwic_en_bg xlwic_en_da xlwic_en_et xlwic_en_fa xlwic_en_fr xlwic_en_hr xlwic_en_it xlwic_en_ja xlwic_en_nl xlwic_en_ko xlwic_en_zh \
#     --use_ghost_grads \
#     --output_dir ./wic/layer_6/ghost_grads --total_training_steps 200000
# python train.py --model_name_or_path gpt2-small --d_in 768 --device cuda:1 --expansion_factor 32 --hook_name blocks.6.hook_resid_pre --hook_layer 6 --batch_size 1024 --dataset_path pasinit/xlwic  --l1_coefficient 0.05 \
#     --datadir xlwic_en_de xlwic_en_bg xlwic_en_da xlwic_en_et xlwic_en_fa xlwic_en_fr xlwic_en_hr xlwic_en_it xlwic_en_ja xlwic_en_nl xlwic_en_ko xlwic_en_zh \
#     --output_dir ./wic/layer_6/no_ghost_grads --total_training_steps 200000
# python train.py --model_name_or_path gpt2-small --d_in 768 --device cuda:1 --expansion_factor 32 --hook_name blocks.7.hook_resid_pre --hook_layer 7 --batch_size 1024 --dataset_path pasinit/xlwic  --l1_coefficient 0.05 \
#     --datadir xlwic_en_de xlwic_en_bg xlwic_en_da xlwic_en_et xlwic_en_fa xlwic_en_fr xlwic_en_hr xlwic_en_it xlwic_en_ja xlwic_en_nl xlwic_en_ko xlwic_en_zh \
#     --use_ghost_grads \
#     --output_dir ./wic/layer_7/ghost_grads --total_training_steps 200000
# python train.py --model_name_or_path gpt2-small --d_in 768 --device cuda:1 --expansion_factor 32 --hook_name blocks.7.hook_resid_pre --hook_layer 7 --batch_size 1024 --dataset_path pasinit/xlwic  --l1_coefficient 0.05 \
#     --datadir xlwic_en_de xlwic_en_bg xlwic_en_da xlwic_en_et xlwic_en_fa xlwic_en_fr xlwic_en_hr xlwic_en_it xlwic_en_ja xlwic_en_nl xlwic_en_ko xlwic_en_zh \
#     --output_dir ./wic/layer_7/no_ghost_grads --total_training_steps 200000
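## NOTE(review): from this run to the end of the section the hook index and
## the output directory differ by one: blocks.9 / blocks.10 / blocks.11 write to
## layer_8 / layer_9 / layer_10, whereas the runs for blocks.1 through blocks.7
## use matching numbers. No run here hooks blocks.8 and none writes to
## layer_11. Confirm which side is intended before re-enabling.
# python train.py --model_name_or_path gpt2-small --d_in 768 --device cuda:1 --expansion_factor 32 --hook_name blocks.9.hook_resid_pre --hook_layer 9 --batch_size 1024 --dataset_path pasinit/xlwic  --l1_coefficient 0.05 \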
#     --datadir xlwic_en_de xlwic_en_bg xlwic_en_da xlwic_en_et xlwic_en_fa xlwic_en_fr xlwic_en_hr xlwic_en_it xlwic_en_ja xlwic_en_nl xlwic_en_ko xlwic_en_zh \
#     --use_ghost_grads \
#     --output_dir ./wic/layer_8/ghost_grads --total_training_steps 200000
# python train.py --model_name_or_path gpt2-small --d_in 768 --device cuda:1 --expansion_factor 32 --hook_name blocks.9.hook_resid_pre --hook_layer 9 --batch_size 1024 --dataset_path pasinit/xlwic  --l1_coefficient 0.05 \
#     --datadir xlwic_en_de xlwic_en_bg xlwic_en_da xlwic_en_et xlwic_en_fa xlwic_en_fr xlwic_en_hr xlwic_en_it xlwic_en_ja xlwic_en_nl xlwic_en_ko xlwic_en_zh \
#     --output_dir ./wic/layer_8/no_ghost_grads --total_training_steps 200000
# python train.py --model_name_or_path gpt2-small --d_in 768 --device cuda:1 --expansion_factor 32 --hook_name blocks.10.hook_resid_pre --hook_layer 10 --batch_size 1024 --dataset_path pasinit/xlwic  --l1_coefficient 0.05 \
#     --datadir xlwic_en_de xlwic_en_bg xlwic_en_da xlwic_en_et xlwic_en_fa xlwic_en_fr xlwic_en_hr xlwic_en_it xlwic_en_ja xlwic_en_nl xlwic_en_ko xlwic_en_zh \
#     --use_ghost_grads \
#     --output_dir ./wic/layer_9/ghost_grads --total_training_steps 200000
# python train.py --model_name_or_path gpt2-small --d_in 768 --device cuda:1 --expansion_factor 32 --hook_name blocks.10.hook_resid_pre --hook_layer 10 --batch_size 1024 --dataset_path pasinit/xlwic  --l1_coefficient 0.05 \
#     --datadir xlwic_en_de xlwic_en_bg xlwic_en_da xlwic_en_et xlwic_en_fa xlwic_en_fr xlwic_en_hr xlwic_en_it xlwic_en_ja xlwic_en_nl xlwic_en_ko xlwic_en_zh \
#     --output_dir ./wic/layer_9/no_ghost_grads --total_training_steps 200000
# python train.py --model_name_or_path gpt2-small --d_in 768 --device cuda:1 --expansion_factor 32 --hook_name blocks.11.hook_resid_pre --hook_layer 11 --batch_size 1024 --dataset_path pasinit/xlwic  --l1_coefficient 0.05 \
#     --datadir xlwic_en_de xlwic_en_bg xlwic_en_da xlwic_en_et xlwic_en_fa xlwic_en_fr xlwic_en_hr xlwic_en_it xlwic_en_ja xlwic_en_nl xlwic_en_ko xlwic_en_zh \
#     --use_ghost_grads \
#     --output_dir ./wic/layer_10/ghost_grads --total_training_steps 200000
# python train.py --model_name_or_path gpt2-small --d_in 768 --device cuda:1 --expansion_factor 32 --hook_name blocks.11.hook_resid_pre --hook_layer 11 --batch_size 1024 --dataset_path pasinit/xlwic  --l1_coefficient 0.05 \
#     --datadir xlwic_en_de xlwic_en_bg xlwic_en_da xlwic_en_et xlwic_en_fa xlwic_en_fr xlwic_en_hr xlwic_en_it xlwic_en_ja xlwic_en_nl xlwic_en_ko xlwic_en_zh \
#     --output_dir ./wic/layer_10/no_ghost_grads --total_training_steps 200000
    