# Experiment spec for an open-instruct finetuning run.
# NOTE(review): presumably a Beaker spec, given the `beaker:` image source
# used by the task below — confirm against the submitting tool.
version: v2
description: open-instruct-finetune
# Budget/account identifier attached to this experiment.
budget: ai2/oe-adapt
# List of tasks in this experiment; this file defines a single task.
tasks:
  # The finetuning task: which image to run and how to start it.
  - name: open-instruct-finetune
    image:
      beaker: nathanl/open_instruct_auto
    # Entrypoint is `/bin/sh -c`, so the single string under `arguments`
    # is interpreted by the shell as one complete command line.
    command:
      - '/bin/sh'
      - '-c'
    # The whole launch command is ONE string (a one-item list) passed to
    # `/bin/sh -c`. The `>-` folded scalar joins the lines below with single
    # spaces and strips the trailing newline, so the shell sees a single
    # command line with no stray trailing whitespace. Keep every line at the
    # same indentation: more-indented lines would not be folded.
    #
    # Things to keep in sync when editing:
    #   * `--num_processes 4` should equal `resources.gpuCount` of this task.
    #   * `--output_dir /output/` should stay under `result.path`.
    #   * `/hf_llama_models` (model and tokenizer path) is only provided by the
    #     commented-out `datasets` example in this file.
    #     NOTE(review): confirm a mount at that path is supplied at submit time.
    arguments:
      - >-
        PYTHONPATH="/stage:$PYTHONPATH" accelerate launch
        --mixed_precision bf16
        --num_machines 1
        --num_processes 4
        --use_deepspeed
        --deepspeed_config_file configs/ds_configs/stage3_no_offloading_accelerate.conf
        open_instruct/finetune.py
        --model_name_or_path /hf_llama_models
        --use_flash_attn
        --tokenizer_name /hf_llama_models
        --max_seq_length 2048
        --preprocessing_num_workers 16
        --per_device_train_batch_size 2
        --gradient_accumulation_steps 16
        --learning_rate 2e-5
        --lr_scheduler_type linear
        --warmup_ratio 0.03
        --weight_decay 0.
        --num_train_epochs 2
        --output_dir /output/
        --with_tracking
        --report_to tensorboard
        --logging_steps 1
    # Environment variables set for the task. Each entry supplies either a
    # literal `value` or a `secret` name.
    envVars:
      - name: CUDA_DEVICE_ORDER
        value: PCI_BUS_ID
      - name: TRANSFORMERS_CACHE
        value: ./cache/
      # Read from the secret named WANDB_API_KEY instead of being stored inline.
      - name: WANDB_API_KEY
        secret: WANDB_API_KEY
      - name: WANDB_PROJECT
        value: open-instruct
      # Boolean-looking values are quoted on purpose: environment variables
      # are strings, and an unquoted true/false is parsed as a YAML boolean,
      # which some loaders render as "True"/"False" or reject outright.
      - name: WANDB_WATCH
        value: 'false'
      - name: WANDB_LOG_MODEL
        value: 'false'
      - name: WANDB_DISABLED
        value: 'true'
      # Read from the secret named HF_TOKEN instead of being stored inline.
      - name: HF_TOKEN
        secret: HF_TOKEN
    # datasets: # example of how to mount datasets into the task container
    #   - mountPath: /data
    #     source:
    #       beaker: Yizhongw03/processed_open_instruct_data
    #   - mountPath: /mmlu
    #     source:
    #       beaker: Yizhongw03/mmlu
    #   - mountPath: /hf_llama_models
    #     source:
    #       beaker: Yizhongw03/hf_llama_model_7B
    # Result directory for the task; the launch command writes its output
    # here via `--output_dir /output/`.
    result:
      path: /output
    resources:
      # Keep equal to `--num_processes` in the launch command (4).
      gpuCount: 4
    # Scheduling settings: target cluster, queue priority, and whether the
    # job is marked preemptible.
    context:
      cluster: ai2/allennlp-cirrascale
      priority: high
      preemptible: false