#!/bin/bash
# Shell behaviour: do not abort on a failing command (errexit off) and
# print every command before it is executed (xtrace on).
set +o errexit
set -o xtrace

# General process environment: temp directory, C++ compiler name, OpenMP
# thread count, and TRANSFORMERS_OFFLINE=1 (the Hugging Face switch that
# disables network lookups).
export TMPDIR="/tmp" CXX=g++ OMP_NUM_THREADS=20 TRANSFORMERS_OFFLINE=1

# Distributed-training / NCCL settings: verbose diagnostics from
# torch.distributed and NCCL, plus async error handling and a single CUDA
# device connection.
export TORCH_DISTRIBUTED_DEBUG=INFO NCCL_DEBUG=INFO
export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 CUDA_DEVICE_MAX_CONNECTIONS=1

# ---------------------------------------------------------------------------
# Run configuration.
#
# The three "<...>" values are placeholders that MUST be replaced before the
# script is run. They are quoted so the script still parses (a bare <name> is
# read by the shell as a redirection and is a syntax error).
# ---------------------------------------------------------------------------
MODEL_NAME_OR_PATH="<base_model>"     # passed as tokenizer and model path
MAX_SEQ_LENGTH=16384                  # passed to preprocess.py --max_seq_length
PREPROCESSING_NUM_WORKERS=64          # passed to --preprocessing_num_workers
RAW_INPUT_FILE="<data_path.jsonl>"    # raw input handed to preprocess.py
TOTAL_BATCH_SIZE=128                  # used later to derive grad-accum steps
LEARNING_RATE=7e-5
NUM_TRAIN_EPOCHS=20
TRAIN_TAG="<some_tag>"                # suffix used in output file/dir names
TOKENIZED_FILE="sft_outputs/preprocessed.$TRAIN_TAG.jsonl"

# Refuse to run with a placeholder that was never filled in.
for _cfg_name in MODEL_NAME_OR_PATH RAW_INPUT_FILE TRAIN_TAG; do
  if [[ "${!_cfg_name}" == "<"*">" ]]; then
    printf 'error: %s is still the placeholder %s; edit this script first\n' \
      "$_cfg_name" "${!_cfg_name}" >&2
    exit 1
  fi
done
unset _cfg_name

# Make sure the directory for the tokenized output exists.
mkdir -p -- "$(dirname -- "$TOKENIZED_FILE")"

# Tokenize the raw data. errexit is off for this script, so the failure is
# checked explicitly: training must not start on a missing or stale file.
if ! python sft/preprocess.py \
    --input_file "$RAW_INPUT_FILE" \
    --output_file "$TOKENIZED_FILE" \
    --tokenizer_name_or_path "$MODEL_NAME_OR_PATH" \
    --max_seq_length "$MAX_SEQ_LENGTH" \
    --preprocessing_num_workers "$PREPROCESSING_NUM_WORKERS"; then
  printf 'error: preprocessing failed; not starting training\n' >&2
  exit 1
fi

# Directory handed to the trainer as --output_dir.
OUTPUT_DIR="sft_outputs/model.$TRAIN_TAG"

# Launch topology used to derive the gradient-accumulation step count.
WORLD_SIZE=1            # number of nodes assumed by the arithmetic below
NUM_GPUS=8              # processes per node (torchrun --nproc_per_node)
BATCH_SIZE_PER_GPU=1    # per-device micro-batch size

# Samples consumed by one forward/backward pass across all processes.
samples_per_micro_step=$(( WORLD_SIZE * NUM_GPUS * BATCH_SIZE_PER_GPU ))

# Integer division would silently truncate (or yield 0), changing the
# effective batch size; fail loudly instead.
if (( TOTAL_BATCH_SIZE <= 0 || samples_per_micro_step <= 0 \
      || TOTAL_BATCH_SIZE % samples_per_micro_step != 0 )); then
  printf 'error: TOTAL_BATCH_SIZE (%s) must be a positive multiple of WORLD_SIZE*NUM_GPUS*BATCH_SIZE_PER_GPU (%s)\n' \
    "$TOTAL_BATCH_SIZE" "$samples_per_micro_step" >&2
  exit 1
fi
GRADIENT_ACC_STEPS=$(( TOTAL_BATCH_SIZE / samples_per_micro_step ))

# Start one training process per GPU on this node. All expansions are quoted
# so paths containing spaces or glob characters are passed through intact.
torchrun \
    --nproc_per_node "$NUM_GPUS" \
    -m sft.finetune \
    --model_name_or_path "$MODEL_NAME_OR_PATH" \
    --train_tokenized_file "$TOKENIZED_FILE" \
    --output_dir "$OUTPUT_DIR" \
    --per_device_train_batch_size "$BATCH_SIZE_PER_GPU" \
    --gradient_accumulation_steps "$GRADIENT_ACC_STEPS" \
    --evaluation_strategy "no" \
    --save_strategy "epoch" \
    --save_total_limit 20 \
    --ddp_timeout 14400 \
    --learning_rate "$LEARNING_RATE" \
    --lr_scheduler_type cosine \
    --warmup_ratio 0.1 \
    --num_train_epochs "$NUM_TRAIN_EPOCHS" \
    --logging_steps 1 \
    --report_to "none" \
    --gradient_checkpointing True \
    --deepspeed configs/ds_configs/stage_3.json \
    --overwrite_output_dir \
    --bf16 True

# CUDA_VISIBLE_DEVICES=0,1,2,3 bash evaluation/eval_local_model.sh $OUTPUT_DIR
