                                     
# python pretrain_umd/train_retrieval_w_anticausal.py --model_checkpoint /fs/XXXX-37/llm-pretraining/llm-retrieval/checkpoints/EleutherAI/pythia-160m --tokenizer_path /fs/XXXX-37/llm-pretraining/llm-retrieval/checkpoints/EleutherAI/pythia-160m --out_dir /fs/XXXX-37/llm-pretraining/llm-retrieval/out/dolma-retrieval-dual-causal-pythia-160m-worldbsz-16-ctx-rand-batch_negative_ddp_RR_lr_3e-4_debug --resume True --seed 1337 --max_tokens 25_000_000_000 --model_name pythia-160m --run_name dolma-retrieval-dual-causal-pythia-160m-worldbsz-16-ctx-rand-batch_negative_ddp_RR_lr_3e-4_debug --logger_name wandb --compile_model False --fabric_precision bf16-mixed --world_batch_size 16 --micro_batch_size 8 --block_size 2048 --n_chunks 4 --warmup_steps 2000 --log_step_interval 1 --eval_iters 100 --save_and_eval_interval 2000 --grad_clip 1.0 --decay_lr True --learning_rate 3e-4 --min_lr 3e-5 --data_telemetry False --data_config /XXXX-36/XXXX-22/XXXX-40/launch_scripts/XXXX-22/sample_hfds_only.json --fabric_strategy ddp --pretrained_prefix_model False --pretrained_suffix_model False 
# python pretrain_umd/train_retrieval_w_anticausal.py --model_checkpoint /fs/XXXX-37/llm-pretraining/llm-retrieval/checkpoints/EleutherAI/pythia-160m --tokenizer_path /fs/XXXX-37/llm-pretraining/llm-retrieval/checkpoints/EleutherAI/pythia-160m --out_dir /fs/XXXX-37/llm-pretraining/llm-retrieval/out/dolma-retrieval-dual-causal-pythia-160m-worldbsz-24-ctx-rand-batch_negative_ddp_RR_lr_3e-4_debug --resume True --seed 1337 --max_tokens 25_000_000_000 --model_name pythia-160m --run_name dolma-retrieval-dual-causal-pythia-160m-worldbsz-24-ctx-rand-batch_negative_ddp_RR_lr_3e-4_debug --logger_name wandb --compile_model False --fabric_precision bf16-mixed --world_batch_size 24 --micro_batch_size 8 --block_size 2048 --n_chunks 4 --warmup_steps 2000 --log_step_interval 1 --eval_iters 100 --save_and_eval_interval 2000 --grad_clip 1.0 --decay_lr True --learning_rate 3e-4 --min_lr 3e-5 --data_telemetry False --data_config /XXXX-36/XXXX-22/XXXX-40/launch_scripts/XXXX-22/sample_hfds_only.json --fabric_strategy ddp --pretrained_prefix_model False --pretrained_suffix_model False 
# python pretrain_umd/train_retrieval_w_anticausal.py --model_checkpoint /fs/XXXX-37/llm-pretraining/llm-retrieval/checkpoints/EleutherAI/pythia-160m --tokenizer_path /fs/XXXX-37/llm-pretraining/llm-retrieval/checkpoints/EleutherAI/pythia-160m --out_dir /fs/XXXX-37/llm-pretraining/llm-retrieval/out/dolma-retrieval-dual-causal-pythia-160m-worldbsz-32-ctx-rand-batch_negative_ddp_RR_lr_3e-4_debug --resume True --seed 1337 --max_tokens 25_000_000_000 --model_name pythia-160m --run_name dolma-retrieval-dual-causal-pythia-160m-worldbsz-32-ctx-rand-batch_negative_ddp_RR_lr_3e-4_debug --logger_name wandb --compile_model False --fabric_precision bf16-mixed --world_batch_size 32 --micro_batch_size 8 --block_size 2048 --n_chunks 4 --warmup_steps 2000 --log_step_interval 1 --eval_iters 100 --save_and_eval_interval 2000 --grad_clip 1.0 --decay_lr True --learning_rate 3e-4 --min_lr 3e-5 --data_telemetry False --data_config /XXXX-36/XXXX-22/XXXX-40/launch_scripts/XXXX-22/sample_hfds_only.json --fabric_strategy ddp --pretrained_prefix_model False --pretrained_suffix_model False 
# python pretrain_umd/train_retrieval_w_anticausal.py --model_checkpoint /fs/XXXX-37/llm-pretraining/llm-retrieval/checkpoints/EleutherAI/pythia-160m --tokenizer_path /fs/XXXX-37/llm-pretraining/llm-retrieval/checkpoints/EleutherAI/pythia-160m --out_dir /fs/XXXX-37/llm-pretraining/llm-retrieval/out/dolma-retrieval-dual-causal-pythia-160m-worldbsz-40-ctx-rand-batch_negative_ddp_RR_lr_3e-4_debug_3 --resume True --seed 1337 --max_tokens 25_000_000_000 --model_name pythia-160m --run_name dolma-retrieval-dual-causal-pythia-160m-worldbsz-40-ctx-rand-batch_negative_ddp_RR_lr_3e-4_debug_3 --logger_name wandb --compile_model False --fabric_precision bf16-mixed --world_batch_size 40 --micro_batch_size 8 --block_size 2048 --n_chunks 4 --warmup_steps 2000 --log_step_interval 1 --eval_iters 100 --save_and_eval_interval 2000 --grad_clip 1.0 --decay_lr True --learning_rate 3e-4 --min_lr 3e-5 --data_telemetry False --data_config /XXXX-36/XXXX-22/XXXX-40/launch_scripts/XXXX-22/sample_hfds_only.json --fabric_strategy ddp --pretrained_prefix_model False --pretrained_suffix_model False 
# python pretrain_umd/train_retrieval_w_anticausal.py --model_checkpoint /fs/XXXX-37/llm-pretraining/llm-retrieval/checkpoints/EleutherAI/pythia-160m --tokenizer_path /fs/XXXX-37/llm-pretraining/llm-retrieval/checkpoints/EleutherAI/pythia-160m --out_dir /fs/XXXX-37/llm-pretraining/llm-retrieval/out/dolma-retrieval-dual-causal-pythia-160m-worldbsz-48-ctx-rand-batch_negative_ddp_RR_lr_3e-4_debug --resume True --seed 1337 --max_tokens 25_000_000_000 --model_name pythia-160m --run_name dolma-retrieval-dual-causal-pythia-160m-worldbsz-48-ctx-rand-batch_negative_ddp_RR_lr_3e-4_debug --logger_name wandb --compile_model False --fabric_precision bf16-mixed --world_batch_size 48 --micro_batch_size 8 --block_size 2048 --n_chunks 4 --warmup_steps 2000 --log_step_interval 1 --eval_iters 100 --save_and_eval_interval 2000 --grad_clip 1.0 --decay_lr True --learning_rate 3e-4 --min_lr 3e-5 --data_telemetry False --data_config /XXXX-36/XXXX-22/XXXX-40/launch_scripts/XXXX-22/sample_hfds_only.json --fabric_strategy ddp --pretrained_prefix_model False --pretrained_suffix_model False 
# Launch pretrain_umd/train_retrieval_w_anticausal.py for the pythia-160m
# dual-causal retrieval run.
#
# The commented-out sweep entries in this file differ mainly in the world
# batch size, so that value is read from the environment instead of being
# hard-coded in three places on one line:
#
#   WORLD_BATCH_SIZE  value passed to --world_batch_size (default: 56)
#
# The run name embeds the batch size and --out_dir is derived from the run
# name, so the three can no longer drift apart. With WORLD_BATCH_SIZE unset or
# empty the argument vector is exactly the one this script used before.
#
# Example:  WORLD_BATCH_SIZE=32 sh <this-script>
#
# The body runs in a subshell ("( ... )" instead of "{ ... }") so the helper
# variables do not leak into the caller if this file is sourced. The exit
# status is that of the python process.
launch_training() (
  world_batch_size="${WORLD_BATCH_SIZE:-56}"

  # The same checkpoint directory supplies both the model weights and the tokenizer.
  checkpoint_dir="/fs/XXXX-37/llm-pretraining/llm-retrieval/checkpoints/EleutherAI/pythia-160m"
  run_name="dolma-retrieval-dual-causal-pythia-160m-worldbsz-${world_batch_size}-ctx-rand-batch_negative_ddp_RR_lr_3e-4_debug"
  out_dir="/fs/XXXX-37/llm-pretraining/llm-retrieval/out/${run_name}"

  python pretrain_umd/train_retrieval_w_anticausal.py \
    --model_checkpoint "${checkpoint_dir}" \
    --tokenizer_path "${checkpoint_dir}" \
    --out_dir "${out_dir}" \
    --resume True \
    --seed 1337 \
    --max_tokens 25_000_000_000 \
    --model_name pythia-160m \
    --run_name "${run_name}" \
    --logger_name wandb \
    --compile_model False \
    --fabric_precision bf16-mixed \
    --world_batch_size "${world_batch_size}" \
    --micro_batch_size 8 \
    --block_size 2048 \
    --n_chunks 4 \
    --warmup_steps 2000 \
    --log_step_interval 1 \
    --eval_iters 100 \
    --save_and_eval_interval 2000 \
    --grad_clip 1.0 \
    --decay_lr True \
    --learning_rate 3e-4 \
    --min_lr 3e-5 \
    --data_telemetry False \
    --data_config /XXXX-36/XXXX-22/XXXX-40/launch_scripts/XXXX-22/sample_hfds_only.json \
    --fabric_strategy ddp \
    --pretrained_prefix_model False \
    --pretrained_suffix_model False
)

launch_training
# python pretrain_umd/train_retrieval_w_anticausal.py --model_checkpoint /fs/XXXX-37/llm-pretraining/llm-retrieval/checkpoints/EleutherAI/pythia-160m --tokenizer_path /fs/XXXX-37/llm-pretraining/llm-retrieval/checkpoints/EleutherAI/pythia-160m --out_dir /fs/XXXX-37/llm-pretraining/llm-retrieval/out/dolma-retrieval-dual-causal-pythia-160m-worldbsz-64-ctx-rand-batch_negative_ddp_RR_lr_3e-4_debug --resume True --seed 1337 --max_tokens 25_000_000_000 --model_name pythia-160m --run_name dolma-retrieval-dual-causal-pythia-160m-worldbsz-64-ctx-rand-batch_negative_ddp_RR_lr_3e-4_debug --logger_name wandb --compile_model False --fabric_precision bf16-mixed --world_batch_size 64 --micro_batch_size 8 --block_size 2048 --n_chunks 4 --warmup_steps 2000 --log_step_interval 1 --eval_iters 100 --save_and_eval_interval 2000 --grad_clip 1.0 --decay_lr True --learning_rate 3e-4 --min_lr 3e-5 --data_telemetry False --data_config /XXXX-36/XXXX-22/XXXX-40/launch_scripts/XXXX-22/sample_hfds_only.json --fabric_strategy ddp --pretrained_prefix_model False --pretrained_suffix_model False