# Launch configuration for a multi-GPU training job: names the run, picks the
# container image and the compute to request, and lists the code to check out.
run_name: v1-5-mix-medium-mitch-ish  # can't have "_" or "." here
image: mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04
# 216 = 27 x 8, and the command below starts 8 processes per node
# (--nproc_per_node 8); presumably 27 nodes with 8 GPUs each -- confirm.
gpu_num: 216
cluster: r12z3
gpu_type: a100_40gb
integrations:
  # Source checkout of allenai/LLM, pinned to an exact commit (the branch
  # selector is left commented out). The command below does `cd LLM`, so the
  # checkout is expected to land in a directory named LLM.
  - integration_type: git_repo
    git_repo: allenai/LLM
    # git_branch: mitchish
    git_commit: 148ca062e7f1f7667d7fc0f4346e97467e66ce87
    # Presumably passed to `pip install` inside the checkout (editable
    # install of the repo) -- confirm against the launcher's documentation.
    pip_install: -e .
    ssh_clone: true
# Shell script executed for the job. It prints the installed packages, creates
# the torch cache directory, exports OMP_NUM_THREADS and LOG_FILTER_TYPE, then
# launches scripts/train.py from the LLM checkout through torchrun with 8
# processes per node. MASTER_ADDR, MASTER_PORT, NUM_NODES and NODE_RANK are not
# set in this file; presumably the launch platform injects them -- confirm.
# The trailing --flags override values from the referenced training config;
# --time_limit=169200 is presumably in seconds (47 hours) -- confirm.
# NOTE(review): the script has no `set -e`, so a failing `cd LLM` would not
# stop torchrun from being invoked in the wrong directory.
command: |-
  pip freeze
  mkdir -p /root/.cache/torch/

  export OMP_NUM_THREADS=8
  export LOG_FILTER_TYPE=local_rank0_only
  #export OLMO_NO_SSL=1
  #export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128

  cd LLM

  torchrun \
  --master_addr $MASTER_ADDR \
  --master_port $MASTER_PORT \
  --nnodes $NUM_NODES \
  --node_rank $NODE_RANK \
  --nproc_per_node 8 \
  scripts/train.py configs/v1_5-mix-medium-mitch-ish-s3.yaml \
    --run_name=v1_5-mix-mitch-ish \
    --wandb.name=v1_5-mix-mitch-ish-mcli-final \
    --global_train_batch_size=2160 \
    --model.flash_attention=true \
    --time_limit=169200

# The flags below (currently commented out) are the extra flags we added to the training command in order to get a final checkpoint where we decayed the LR down to 0.
#    --eval_interval=100 \
#    --save_interval=500 \
#    --load_path=s3://ai2-llm/checkpoints/7b/v1_5-mix-mitch-ish/step556000 \
#    --remote_save_folder=s3://ai2-llm/checkpoints/7b/v1_5-mix-mitch-ish-final \
#    --epoch=1 \
#    --optimizer.learning_rate=0.000023 \
#    --scheduler.t_warmup=556000 \
#    --scheduler.t_max=557000 \
#    --scheduler.alpha_f=0.001 \
#    --stop_at=557001
