# Run identity and the container image the job starts from.
run_name: 'olmo-7b-final'
image: 'mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04'

# Compute allocation. The launch command starts 8 processes per node
# (--nproc_per_node 8), so 64 GPUs presumably means 8 nodes -- confirm
# against the scheduler's NUM_NODES.
gpu_num: 64
gpu_type: 'a100_40gb'
cluster: 'r7z2'
# Alternative cluster, currently disabled:
# cluster: r12z3

# Source checkout: clone allenai/LLM (branch main) over SSH and install it
# in editable mode. The launch command below does `cd LLM` to enter it.
integrations:
  - integration_type: 'git_repo'
    git_repo: 'allenai/LLM'
    git_branch: 'main'
    pip_install: '-e .'
    ssh_clone: true
command: |-
  # Checkpoint to resume from, run label, and training config
  # (the config path is relative to the repository root).
  checkpoint=s3://ai2-llm/checkpoints/7b/mitchish-lumi-2T-final/step458000
  run_name=mitchish-lumi-2T-final
  config=configs/v1_5-mix-medium-mitch-ish-s3.yaml

  # Abort if the checkout is missing rather than launching training from
  # the wrong directory.
  cd LLM || exit 1

  # Record the installed package versions in the job log.
  pip freeze

  # Make sure the torch cache directory exists.
  mkdir -p /root/.cache/torch

  export OMP_NUM_THREADS=8
  export LOG_FILTER_TYPE=local_rank0_only

  # NOTE(review): --remote_save_folder resolves to the same S3 prefix the
  # checkpoint is loaded from, and --save_overwrite is set -- confirm that
  # overwriting in place is intended.
  # NOTE(review): the checkpoint is step 458000 while t_warmup is 456000 and
  # t_max is 458223 -- confirm the schedule matches the resume step.
  #
  # Bracketed list arguments are single-quoted so the shell passes them
  # through literally instead of treating [...] as a glob pattern.
  torchrun \
    --master_addr "$MASTER_ADDR" \
    --master_port "$MASTER_PORT" \
    --nnodes "$NUM_NODES" \
    --node_rank "$NODE_RANK" \
    --nproc_per_node 8 \
    scripts/train.py "${config}" \
      --run_name="${run_name}" \
      --save_overwrite \
      --save_interval_unsharded=10000 \
      --load_path="${checkpoint}" \
      --compile=null \
      --model.flash_attention=true \
      --activation_checkpointing=fine_grained \
      --fsdp.wrapping_strategy=size_based \
      --remote_save_folder="s3://ai2-llm/checkpoints/7b/${run_name}" \
      '--data.paths=[s3://ai2-llm/preprocessed/olmo-mix/v1_5-sample-9B/gpt-neox-20b-pii-special/data.npy,s3://ai2-llm/preprocessed/tulu-v2-sft-mixture/gpt-neox-20b-pii-special/data.npy]' \
      '--evaluators=[]' \
      --optimizer.learning_rate=0.000023 \
      --scheduler.alpha_f=0.001 \
      --scheduler.t_warmup=456000 \
      --scheduler.t_max=458223  # = t_warmup + 2223

  # Alternative schedule, currently disabled (434633 = 432410 + 2223):
  #   --scheduler.t_warmup=432410
  #   --scheduler.t_max=434633
