# Run spec for the v1-mix-medium training job.
run_name: v1-mix-medium
image: mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04

# Compute request. The launch command below starts 8 processes per node, so
# 216 GPUs corresponds to 27 nodes if each node exposes 8 GPUs.
cluster: r12z3
gpu_type: a100_40gb
gpu_num: 216

# Code checkout: the repository is cloned over SSH and pip-installed in
# editable mode.
integrations:
  - integration_type: git_repo
    git_repo: allenai/LLM
    # Make sure to point this at the branch you want to train from!
    git_branch: main
    ssh_clone: true
    pip_install: '-e .'
command: |-
  # Record the installed package versions in the job log.
  pip freeze
  mkdir -p /root/.cache/torch/

  export OMP_NUM_THREADS=8
  # NOTE(review): presumably restricts log output to local rank 0 on each
  # node; confirm against the training code.
  export LOG_FILTER_TYPE=local_rank0_only
  export OLMO_NO_SSL=1  # we get SSLErrors all the time on this cluster
  # export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128

  # Stop here instead of launching from the wrong directory if the checkout
  # is missing.
  cd LLM || exit 1

  # MASTER_ADDR, MASTER_PORT, NUM_NODES and NODE_RANK are read from the job
  # environment. They are quoted so an empty value cannot shift the remaining
  # arguments onto the wrong flags.
  # The global batch size of 2160 is 10 per GPU for the 216 GPUs requested in
  # gpu_num (assuming an even split across ranks); keep the two in sync.
  torchrun \
    --master_addr "$MASTER_ADDR" \
    --master_port "$MASTER_PORT" \
    --nnodes "$NUM_NODES" \
    --node_rank "$NODE_RANK" \
    --nproc_per_node 8 \
    scripts/train.py configs/v1-mix-medium-s3.yaml \
      --run_name=v1-mix-medium \
      --model.flash_attention=true \
      --scheduler.name=linear_with_warmup \
      --global_train_batch_size=2160
