# Run identity and seed for this debug configuration.
run_name: test-olmo-debug-precision
seed: 0
dry_run: false

# Training length and batch sizing.
# NOTE(review): assuming max_duration counts optimizer steps and each sequence
# holds model.context_length (512) tokens, 48000 steps x 256 sequences x 512
# tokens is ~6.3B tokens. The earlier "~12B tokens for batches of 256 x 2048"
# note does not match the current context length -- confirm the intended budget.
max_duration: 48000
training:
  batch_size: 256 # global_train_batch_size
# Lives at the root of the file: device_eval_batch_size interpolates
# ${device_train_microbatch_size} from the top level.
device_train_microbatch_size: 32 # For H100 w/color, need smaller for RHO methods

model:
  # NOTE(review): the original "151m non-embedding params" figure matches
  # n_layers: 12 (roughly 12 * d_model^2 weights per layer with mlp_ratio 4 and
  # include_bias: false). With n_layers: 6 the same estimate gives ~75M --
  # confirm which size is intended.
  d_model: 1024
  mlp_ratio: 4
  # mlp_hidden_size: 4096
  n_heads: 16
  n_layers: 6 # 12
  context_length: 512 # 2048
  rope: true
  attention_layer_norm: true
  attention_layer_norm_with_affine: false
  multi_query_attention: false
  include_bias: false
  block_type: sequential
  layer_norm_type: default
  layer_norm_with_affine: false
  bias_for_layer_norm: false
  activation_type: gelu
  attention_dropout: 0.0
  residual_dropout: 0.0
  embedding_dropout: 0.0
  # Llama-2-7b
  vocab_size: 32000
  # NOTE(review): 32100 is larger than vocab_size and is not a multiple of 128;
  # it may be carried over from the earlier t5-base setup -- confirm intended.
  embedding_size: 32100
  # Llama 3
  # vocab_size: 128256
  # embedding_size: 128256
  eos_token_id: 2
  pad_token_id: 0 # 2
  init_device: meta
  init_fn: mitchell
  weight_tying: false

  # Quantization settings (presumably w_ = weights, a_ = activations -- confirm).
  # Format names:
  # "float16", "fp16", "bfloat16", "bf16", "fp8_e5m2", "fp8_e4m3",
  # "fp6_e3m2", "fp6_e2m3", "fp4_e2m1", "fp4", "int8", "int4"
  w_mx_format: 'fp8_e4m3' # 'bfloat16'
  a_mx_format: 'fp8_e4m3' # 'bfloat16'
  block_size: 32


# TODO: replace the entity/project below with your own wandb information.
wandb:
  entity: harvardml
  project: test-precision-MX-fineweb-log-clip
  group: debug
  # Run name is assembled from the two MX formats configured under `model`.
  name: "${model.w_mx_format}_${model.a_mx_format}_debug"
  # Follows the top-level console_log_interval.
  log_interval: "${console_log_interval}"

# Alternative kept for reference; swap it in for the mapping below to set
# compile to null instead.
# compile: null

# Compile settings (presumably forwarded to torch.compile -- confirm).
compile:
  mode: default

precision: amp_bf16
max_grad_norm: 1.0

# FSDP settings. FULL_SHARD is the name of a
# torch.distributed.fsdp.ShardingStrategy member (presumably mapped onto it --
# confirm against the trainer).
fsdp:
  precision: mixed
  sharding_strategy: FULL_SHARD

# Learning-rate schedule.
scheduler:
  name: cosine_with_warmup
  # 1000 of the 48000 in max_duration, if both are counted in steps (confirm).
  t_warmup: 1000
  alpha_f: 0.1

# Optimizer settings.
optimizer:
  name: adamw
  learning_rate: 2.0e-4 # 8.0e-4 # 1.0e-3
  weight_decay: 0.0
  # Written with an explicit decimal point (like learning_rate above) so that
  # YAML 1.1 loaders such as PyYAML resolve it as a float; a bare `1e-15` is
  # read by those loaders as the string "1e-15".
  eps: 1.0e-15
  # Sequence indented to match the `evaluators` list elsewhere in this file.
  betas:
    - 0.9
    - 0.95
  metrics_log_interval: 100

# Optional training features; both are switched off in this configuration.
activation_checkpointing: false
softmax_auxiliary_loss: false

# Alternative dataset block, kept commented out:
# datasets:
#   name: algebraic-stack
#   path:
#   tokenizer_used: t5-base

# Training data: tokenized fineweb-edu (Llama 2 tokenizer, per the path and
# tokenizer_used values below).
datasets:
  name: fineweb-edu # algebraic-stack
  paths: "/n/holylfs06/LABS/kempner_shared/Everyone/testbed/text/fineweb-edu/tokenized/meta-llama-2-sapphire-low-resource/default"
  tokenizer_used: llama2 # t5-base
  # Llama 3 alternative (path suffix and tokenizer):
  # /meta-llama-3/default"
  # tokenizer_used: baseten/Meta-Llama-3-tokenizer
  # NOTE(review): uint16 tops out at 65535, which covers model.vocab_size 32000
  # but not the commented-out Llama 3 vocab_size of 128256. Assuming this is the
  # dtype of the stored token ids, revisit it if the tokenizer is switched.
  memmap_dtype: uint16
  pad_direction: right
  # The keys from num_workers through timeout share names with
  # torch.utils.data.DataLoader arguments (presumably passed through -- confirm).
  num_workers: 16
  drop_last: true
  pin_memory: true
  prefetch_factor: 16
  persistent_workers: true
  timeout: 0
  extra_data_paths: null

# Tokenizer. The Llama 2 identifier lines up with datasets.tokenizer_used
# (llama2) and the "Llama-2-7b" vocabulary settings under `model`.
tokenizer:
  identifier: KoboldAI/llama2-tokenizer # google-t5/t5-base
  # identifier: baseten/Meta-Llama-3-tokenizer # meta-llama/Llama-2-7b-hf
  truncate_direction: right

# Checkpoint location. CHECKPOINTS_PATH has to be present in the environment:
# the oc.env resolver is used here without a fallback value.
# NOTE(review): the folder name is built only from the two MX formats, not from
# run_name, and save_overwrite is true -- runs that share formats share a
# folder. Confirm that is intended.
save_folder: "${oc.env:CHECKPOINTS_PATH}/${model.w_mx_format}_${model.a_mx_format}"
save_overwrite: true
load_path: null

# Sharded checkpoints (best for restarts); only the latest one is kept.
save_num_checkpoints_to_keep: 1
save_interval: 6000

# Unsharded checkpoints (for final storage). The cap of 100 is above the 8
# saves implied by max_duration 48000 / interval 6000 (if both count steps),
# so in practice all of them are kept.
save_num_unsharded_checkpoints_to_keep: 100
save_interval_unsharded: 6000


# Throughput monitor settings.
speed_monitor:
  window_size: 1

# Also interpolated into wandb.log_interval.
console_log_interval: 10

# Evaluation cadence and batch sizing.
eval_interval: 1000
eval_on_load: false
eval_subset_num_batches: 200
# Reuses the top-level training microbatch size.
device_eval_batch_size: ${device_train_microbatch_size}
evaluators:
  ##########################
  # Perplexity evaluations #
  ##########################
  - label: fineweb
    data:
      datasets:
        # NOTE(review): `{username}` looks like an unfilled template
        # placeholder. YAML keeps the braces as literal text in both the key
        # and the value -- confirm whether a real validation-set name should
        # be substituted here.
        fineweb_{username}_val: fineweb-{username}-val
      drop_last: true
  # - label: fineweb
  #   data:
  #     datasets:
  #       fineweb: fineweb
  #     drop_last: true
#   - label: c4
#     data:
#       datasets:
#         c4_val: c4-val
#       drop_last: true
#   - label: starcoder
#     data:
#       datasets:
#         starcoder_val: starcoder-val
#       drop_last: true
#   - label: proof_pile_2
#     data:
#       datasets:
#         proof_pile_2_val: proof-pile-2-val
#       drop_last: true
#   - label: fineweb_100b
#     data:
#       datasets:
#         fineweb_100b_val: fineweb-100b-val
#       drop_last: true
#   - label: fineweb_1T
#     data:
#       datasets:
#         fineweb_1T_val: fineweb-1T-val
#       drop_last: true
#   - label: fineweb_edu_100b
#     data:
#       datasets:
#         fineweb_edu_100b_val: fineweb-edu-100b-val
#       drop_last: true
#   - label: slimpajama
#     data:
#       datasets:
#         slimpajama_val: slimpajama-chunk1-val
#       drop_last: true
#   - label: smollm
#     data:
#       datasets:
#         smollm_val: smollm-corpus-val
#       drop_last: true



  ##########################
  # Train #
  ##########################
  
  # - label: piqa_train
  #   type: downstream

  # - label: openbook_qa_train
  #   type: downstream

  # - label: hellaswag_train
  #   type: downstream

  # - label: winogrande_train
  #   type: downstream
  
  # - label: arc_easy_train
  #   type: downstream
  
  # - label: arc_challenge_train
  #   type: downstream
  
  # - label: boolq_train
  #   type: downstream

  # - label: sciq_train
  #   type: downstream


#   ##########################
#   # Downstream evaluations from OLMO #
#   ##########################

#   - label: piqa_test
#     type: downstream
#     subset_num_batches: 1000 # do not subset here

#   - label: openbook_qa_test
#     type: downstream
#     subset_num_batches: 1000 # do not subset here

#   - label: hellaswag_test     
#     type: downstream
#     subset_num_batches: 1000 # do not subset here

#   - label: winogrande_test
#     type: downstream
#     subset_num_batches: 1000 # do not subset here

#   - label: arc_easy_test
#     type: downstream
#     subset_num_batches: 1000 # do not subset here

#   - label: arc_challenge_test
#     type: downstream
#     subset_num_batches: 1000 # do not subset here

#   - label: boolq_test
#     type: downstream
#     subset_num_batches: 1000 # do not subset here

#   - label: sciq_test
#     type: downstream
#     subset_num_batches: 1000 # do not subset here
  
# #   ##########################
# #   # Soft (ce_wrapper / ctx_ce_wrapper) downstream evaluations from OLMo #
# #   ##########################

#   - label: piqa_test
#     type: downstream
#     subset_num_batches: 1000 # do not subset here
#     ce_wrapper: true

#   - label: openbook_qa_test
#     type: downstream
#     subset_num_batches: 1000 # do not subset here
#     ce_wrapper: true

#   - label: hellaswag_test     
#     type: downstream
#     subset_num_batches: 1000 # do not subset here
#     ce_wrapper: true

#   - label: winogrande_test
#     type: downstream
#     subset_num_batches: 1000 # do not subset here
#     ce_wrapper: true

#   - label: arc_easy_test
#     type: downstream
#     subset_num_batches: 1000 # do not subset here
#     ce_wrapper: true

#   - label: arc_challenge_test
#     type: downstream
#     subset_num_batches: 1000 # do not subset here
#     ce_wrapper: true

#   - label: boolq_test
#     type: downstream
#     subset_num_batches: 1000 # do not subset here
#     ce_wrapper: true

#   - label: sciq_test
#     type: downstream
#     subset_num_batches: 1000 # do not subset here
#     ce_wrapper: true


#   - label: piqa_test
#     type: downstream
#     subset_num_batches: 1000 # do not subset here
#     ctx_ce_wrapper: true

#   - label: openbook_qa_test
#     type: downstream
#     subset_num_batches: 1000 # do not subset here
#     ctx_ce_wrapper: true

#   - label: hellaswag_test     
#     type: downstream
#     subset_num_batches: 1000 # do not subset here
#     ctx_ce_wrapper: true

# #   # Some sort of bug?
# #   # - label: winogrande_test
# #   #   type: downstream
# #   #   subset_num_batches: 1000 # do not subset here
# #   #   ctx_ce_wrapper: true

#   - label: arc_easy_test
#     type: downstream
#     subset_num_batches: 1000 # do not subset here
#     ctx_ce_wrapper: true

#   - label: arc_challenge_test
#     type: downstream
#     subset_num_batches: 1000 # do not subset here
#     ctx_ce_wrapper: true

#   - label: boolq_test
#     type: downstream
#     subset_num_batches: 1000 # do not subset here
#     ctx_ce_wrapper: true

#   - label: sciq_test
#     type: downstream
#     subset_num_batches: 1000 # do not subset here
#     ctx_ce_wrapper: true
  

# #   ##########################
# #   # Imbue evals
# #   ##########################
#   - label: imbue_code_comprehension
#     type: downstream
#     subset_num_batches: 1000 # do not subset here

#   - label: imbue_gsm8k
#     type: downstream
#     subset_num_batches: 1000 # do not subset here
  
#   - label: imbue_codegen
#     type: downstream
#     subset_num_batches: 1000 # do not subset here

# #   # And soft versions
#   - label: imbue_code_comprehension
#     type: downstream
#     subset_num_batches: 1000 # do not subset here
#     ce_wrapper: true

#   - label: imbue_gsm8k
#     type: downstream
#     subset_num_batches: 1000 # do not subset here
#     ce_wrapper: true

#   - label: imbue_codegen
#     type: downstream
#     subset_num_batches: 1000 # do not subset here
#     ce_wrapper: true

# #   ##########################
# #   # MMLU evals
# #   ##########################
#   - label: mmlu_stem_test
#     type: downstream
#     subset_num_batches: 1000 # do not subset here

#   - label: mmlu_humanities_test
#     type: downstream
#     subset_num_batches: 1000 # do not subset here

#   - label: mmlu_social_sciences_test
#     type: downstream
#     subset_num_batches: 1000 # do not subset here

#   - label: mmlu_other_test
#     type: downstream
#     subset_num_batches: 1000 # do not subset here

# #   # And soft versions
#   - label: mmlu_stem_test
#     type: downstream
#     subset_num_batches: 1000 # do not subset here
#     ce_wrapper: true

#   - label: mmlu_humanities_test
#     type: downstream
#     subset_num_batches: 1000 # do not subset here
#     ce_wrapper: true

#   - label: mmlu_social_sciences_test
#     type: downstream
#     subset_num_batches: 1000 # do not subset here
#     ce_wrapper: true

#   - label: mmlu_other_test
#     type: downstream
#     subset_num_batches: 1000 # do not subset here
#     ce_wrapper: true


# #   # MC versions
#   - label: mmlu_stem_mc_5shot_test
#     type: downstream
#     subset_num_batches: 1000 # do not subset here

#   - label: mmlu_humanities_mc_5shot_test
#     type: downstream
#     subset_num_batches: 1000 # do not subset here

#   - label: mmlu_social_sciences_mc_5shot_test
#     type: downstream
#     subset_num_batches: 1000 # do not subset here

#   - label: mmlu_other_mc_5shot_test
#     type: downstream
#     subset_num_batches: 1000 # do not subset here

# #   # And soft versions
#   - label: mmlu_stem_mc_5shot_test
#     type: downstream
#     subset_num_batches: 1000 # do not subset here
#     ce_wrapper: true  

#   - label: mmlu_humanities_mc_5shot_test
#     type: downstream
#     subset_num_batches: 1000 # do not subset here
#     ce_wrapper: true

#   - label: mmlu_social_sciences_mc_5shot_test
#     type: downstream
#     subset_num_batches: 1000 # do not subset here
#     ce_wrapper: true

#   - label: mmlu_other_mc_5shot_test 
#     type: downstream
#     subset_num_batches: 1000 # do not subset here
#     ce_wrapper: true
    