

# Sanity checks for pytorch issue https://github.com/pytorch/pytorch/issues/96693
# python pretrain.py name=DA6000amp_b8192_cb_o4_premade_base arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 seed=233
# python pretrain.py name=DA6000amp_b8192_cb_o4_premade_simplecomp arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null seed=233
# python pretrain.py name=DA6000amp_b8192_cb_o4_premade_max_autotune_gemm arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null +impl._inductor_vars.max_autotune_gemm=True seed=233
# python pretrain.py name=DA6000amp_b8192_cb_o4_premade_max_autotune_pw arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null +impl._inductor_vars.max_autotune_pointwise=True seed=233
# python pretrain.py name=DA6000amp_b8192_cb_o4_premade_max_autotune_default arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null impl.mode=max-autotune seed=233
# python pretrain.py name=DA6000amp_b8192_cb_o4_premade_max_autotune_no_cudagraphs arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null impl.mode=max-autotune-no-cudagraphs seed=233

# python pretrain.py name=DA6000amp_b8192_cb_o4_premade_base_tfoff arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl.tf32_allowed=False seed=233
# python pretrain.py name=DA6000amp_b8192_cb_o4_premade_simplecomp_tfoff arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null impl.tf32_allowed=False seed=233
# python pretrain.py name=DA6000amp_b8192_cb_o4_premade_max_autotune_gemm_tfoff arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null +impl._inductor_vars.max_autotune_gemm=True impl.tf32_allowed=False seed=233
# python pretrain.py name=DA6000amp_b8192_cb_o4_premade_max_autotune_pw_tfoff arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null +impl._inductor_vars.max_autotune_pointwise=True impl.tf32_allowed=False seed=233
# python pretrain.py name=DA6000amp_b8192_cb_o4_premade_max_autotune_default_tfoff arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null impl.mode=max-autotune impl.tf32_allowed=False seed=233
# python pretrain.py name=DA6000amp_b8192_cb_o4_premade_max_autotune_no_cudagraphs_tfoff arch=crammed-bert train=bert-o4 data=pile-readymade data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 impl.microbatch_size=512 impl._inductor_vars=null impl.mode=max-autotune-no-cudagraphs impl.tf32_allowed=False seed=233

# all follow the same curve:
# Launch one 40k-step "premade" run.
#   $1 - tag spliced into the run name (underscore spelling)
#   $2 - value passed as the impl.mode override (hyphen spelling)
# Every other override is shared by all four runs below:
# impl.deterministic=True, train.batch_size_ramp=0, train.scheduler=triangle2,
# budget=2.4, seed=233.
run_premade_40k() {
  python pretrain.py \
    "name=DA6000amp_b8192_cb_o4_premade_${1}_40k" \
    arch=crammed-bert \
    train=bert-o4 \
    data=pile-readymade \
    data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 \
    impl.microbatch_size=512 \
    impl._inductor_vars=null \
    "impl.mode=${2}" \
    seed=233 \
    train.steps=40000 \
    budget=2.4 \
    impl.deterministic=True \
    train.batch_size_ramp=0 \
    train.scheduler=triangle2
}

# The runs are sequential and differ only in run name and impl.mode.
run_premade_40k max_autotune               max-autotune
run_premade_40k max_autotune_no_cudagraphs max-autotune-no-cudagraphs
run_premade_40k default                    default
run_premade_40k reduce_overhead            reduce-overhead


# Launch one 40k-step "with_nondet" run (impl.deterministic=False).
#   $1 - value for CUDA_VISIBLE_DEVICES
#   $2 - tag spliced into the run name (underscore spelling)
#   $3 - value passed as the impl.mode override (hyphen spelling)
# TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 is set for every run in this group.
run_with_nondet_40k() {
  CUDA_VISIBLE_DEVICES="$1" TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 python pretrain.py \
    "name=DA6000amp_b8192_cb_o4_with_nondet_${2}_40k" \
    arch=crammed-bert \
    train=bert-o4 \
    data=pile-readymade \
    data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 \
    impl.microbatch_size=512 \
    impl._inductor_vars=null \
    "impl.mode=${3}" \
    seed=233 \
    train.steps=40000 \
    budget=2.4 \
    impl.deterministic=False \
    train.batch_size_ramp=0 \
    train.scheduler=triangle2
}

# One device id per mode: 3, 4, 5, 6.
run_with_nondet_40k 3 max_autotune               max-autotune
run_with_nondet_40k 4 max_autotune_no_cudagraphs max-autotune-no-cudagraphs
run_with_nondet_40k 5 default                    default
run_with_nondet_40k 6 reduce_overhead            reduce-overhead


# Launch one 40k-step "with_ramp" run (train.batch_size_ramp=20000,
# impl.deterministic=True).
#   $1 - value for CUDA_VISIBLE_DEVICES
#   $2 - tag spliced into the run name (underscore spelling)
#   $3 - value passed as the impl.mode override (hyphen spelling)
run_with_ramp_40k() {
  CUDA_VISIBLE_DEVICES="$1" python pretrain.py \
    "name=DA6000amp_b8192_cb_o4_with_ramp_${2}_40k" \
    arch=crammed-bert \
    train=bert-o4 \
    data=pile-readymade \
    data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 \
    impl.microbatch_size=512 \
    impl._inductor_vars=null \
    "impl.mode=${3}" \
    seed=233 \
    train.steps=40000 \
    budget=2.4 \
    impl.deterministic=True \
    train.batch_size_ramp=20000 \
    train.scheduler=triangle2
}

# Two runs on device 5, then two on device 6, all sequential.
run_with_ramp_40k 5 max_autotune               max-autotune
run_with_ramp_40k 5 max_autotune_no_cudagraphs max-autotune-no-cudagraphs
run_with_ramp_40k 6 default                    default
run_with_ramp_40k 6 reduce_overhead            reduce-overhead



# invoke cache skip + cudagraphs: +impl._inductor_vars.autotune_local_cache=False +impl._inductor_vars.triton.cudagraphs=True

# Launch one 40k-step triangle2 run with budget=100 that passes no impl.mode
# override; instead it appends the two inductor overrides
# autotune_local_cache=False and triton.cudagraphs=True.
#   $1 - value for CUDA_VISIBLE_DEVICES
#   $2 - "det" or "nondet", spliced into the run name
#   $3 - value passed as impl.deterministic (True / False)
# NOTE(review): the nondet run name below is identical to the name of the
# with_nondet reduce-overhead run launched earlier in this file (budget=2.4,
# impl.mode=reduce-overhead) -- confirm the two runs do not clash on outputs.
run_cudagraphs_nocache_40k() {
  CUDA_VISIBLE_DEVICES="$1" TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 python pretrain.py \
    "name=DA6000amp_b8192_cb_o4_with_${2}_reduce_overhead_40k" \
    arch=crammed-bert \
    train=bert-o4 \
    data=pile-readymade \
    data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 \
    impl.microbatch_size=512 \
    impl._inductor_vars=null \
    seed=233 \
    train.steps=40000 \
    budget=100 \
    "impl.deterministic=${3}" \
    train.batch_size_ramp=0 \
    train.scheduler=triangle2 \
    +impl._inductor_vars.autotune_local_cache=False \
    +impl._inductor_vars.triton.cudagraphs=True
}

run_cudagraphs_nocache_40k 3 det    True
run_cudagraphs_nocache_40k 4 nondet False


# Launch one 40k-step run with train.scheduler=constant and budget=100.
#   $1 - value for CUDA_VISIBLE_DEVICES
#   $2 - tag spliced into the run name after "const_with_"
#   $3 - value passed as impl.deterministic (True / False)
#   $4... - optional extra overrides, appended after autotune_local_cache=False
# No impl.mode override is passed; the "reduce_overhead" variants differ from
# the "default" ones only by the extra triton.cudagraphs=True override.
run_const_40k() {
  const_gpu=$1
  const_tag=$2
  const_det=$3
  shift 3
  CUDA_VISIBLE_DEVICES="$const_gpu" TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 python pretrain.py \
    "name=DA6000amp_b8192_const_with_${const_tag}_40k" \
    arch=crammed-bert \
    train=bert-o4 \
    data=pile-readymade \
    data.hf_location=JonasGeiping/the_pile_WordPiecex32768_53b28db05413b6497e702f178268e1e2 \
    impl.microbatch_size=512 \
    impl._inductor_vars=null \
    seed=233 \
    train.steps=40000 \
    budget=100 \
    "impl.deterministic=${const_det}" \
    train.batch_size_ramp=0 \
    train.scheduler=constant \
    +impl._inductor_vars.autotune_local_cache=False \
    "$@"
}

# Deterministic pair on device 5, non-deterministic pair on device 6.
run_const_40k 5 det_default            True
run_const_40k 5 det_reduce_overhead    True  +impl._inductor_vars.triton.cudagraphs=True
run_const_40k 6 nondet_default         False
run_const_40k 6 nondet_reduce_overhead False +impl._inductor_vars.triton.cudagraphs=True

# torch._dynamo.reset()?
# torch.compiler.reset?