#!/bin/bash
#
# Launch supervised fine-tuning (train.py via `accelerate launch`) for a single
# dataset. Hyper-parameters are chosen per dataset, and the combined
# stdout/stderr of the run is shown on the terminal and copied to a log file
# under ./logs inside ../LLMPredictor/InstructionTuning/.
#
# Usage: bash run_sft.sh <dataset_name> [seed] [llm_model] [prompt_type]
#   dataset_name  required; one of the names in SUPPORTED_DATASETS
#   seed          optional, default 0
#   llm_model     optional, default Mistral-7B
#   prompt_type   optional, default neighbor
#
# Exit status:
#   0   training finished successfully
#   1   bad usage, unsupported dataset, or a required directory is unavailable
#   N   the non-zero exit status of `accelerate launch`, passed through

# Treat references to unset variables as errors.
set -u

readonly SUPPORTED_DATASETS="cora, citeseer, wikics, instagram, pubmed, reddit, photo, computer, history, arxiv"
# Resolved against the caller's current working directory.
readonly TRAIN_DIR="../LLMPredictor/InstructionTuning/"

# Print the command-line help to stderr.
usage() {
    {
        echo "Usage: bash run_sft.sh <dataset_name> [seed] [llm_model] [prompt_type]"
        echo "Supported datasets: $SUPPORTED_DATASETS"
        echo "Supported LLMs: Mistral-7B (default)"
        echo "Default prompt_type: neighbor"
        echo "Example: bash run_sft.sh cora 0 Mistral-7B neighbor"
    } >&2
}

# Parse the positional arguments, pick the dataset configuration, and run the
# training command. Arguments: see the usage block at the top of the file.
main() {
    if [[ $# -eq 0 ]]; then
        usage
        exit 1
    fi

    local dataset=$1
    local seed=${2:-0}             # Default seed is 0
    local llm=${3:-Mistral-7B}     # Default LLM is Mistral-7B
    local prompt_type=${4:-neighbor}

    # Values shared by most datasets; the case arms below override the rest.
    local num_epoch=1
    local batch_size=4
    local re_split=2
    local max_txt_length=80
    local max_origin_txt_length=200
    local log_prefix="inductive_${dataset}"
    local banner=""

    # Validate the dataset before touching the filesystem so that a typo does
    # not leave a stray logs directory behind.
    case "$dataset" in
        cora | citeseer)
            num_epoch=4
            ;;
        wikics | pubmed)
            # Middle-scale datasets (wikics, pubmed)
            # Pubmed: Avg Query Prompt Length 51.0000 | Avg OriginTxT Length 425.6667 | Avg Output Length 4.3333
            # WikiCS: Avg Query Prompt Length 76.0000 | Avg OriginTxT Length 597.9836 | Avg Output Length 3.1882
            banner="Running $dataset with middle-scale configuration (wikics/pubmed)..."
            num_epoch=2
            ;;
        instagram)
            # Instagram special configuration
            # Instagram: Avg Query Prompt Length 37.0000 | Avg OriginTxT Length 54.8707 | Avg Output Length 2.0000
            banner="Running $dataset with instagram configuration..."
            num_epoch=2
            batch_size=12
            max_origin_txt_length=100
            ;;
        reddit | photo | computer | history)
            # Large-scale datasets
            banner="Running $dataset with large-scale configuration..."
            ;;
        arxiv)
            # ArXiv special configuration; its log file has no "inductive_" prefix.
            banner="Running $dataset with arxiv configuration..."
            re_split=0
            max_txt_length=256
            log_prefix="arxiv"
            ;;
        *)
            echo "Error: Unsupported dataset '$dataset'" >&2
            echo "Supported datasets: $SUPPORTED_DATASETS" >&2
            exit 1
            ;;
    esac

    # train.py and ./logs are addressed relative to the InstructionTuning
    # directory, so refuse to continue if it cannot be entered.
    if ! cd "$TRAIN_DIR"; then
        echo "Error: cannot change to '$TRAIN_DIR' (resolved relative to the current working directory: $PWD)" >&2
        exit 1
    fi

    echo "Starting Supervised Fine-tuning at $(date)..."
    echo "Dataset: $dataset, Seed: $seed, LLM: $llm, Prompt Type: $prompt_type"

    # Create the log directory if it doesn't exist. Using the parent of the
    # log file (rather than a fixed ./logs) also covers LLM names that contain
    # a slash.
    local log_file="./logs/${log_prefix}_${llm}_${prompt_type}_seed${seed}.log"
    if ! mkdir -p -- "${log_file%/*}"; then
        echo "Error: cannot create log directory '${log_file%/*}'" >&2
        exit 1
    fi

    if [[ -n "$banner" ]]; then
        echo "$banner"
    fi

    # Build the argument list as an array so every value is passed as exactly
    # one word, whatever characters it contains.
    local -a train_args=(
        "--num_epoch=${num_epoch}"
        "--llm=${llm}"
        "--batch_size=${batch_size}"
        "--re_split=${re_split}"
        "--dataset=${dataset}"
        "--max_txt_length=${max_txt_length}"
        "--max_origin_txt_length=${max_origin_txt_length}"
        "--seed=${seed}"
        "--prompt_type=${prompt_type}"
    )

    local -a stage_status
    accelerate launch train.py "${train_args[@]}" 2>&1 | tee "$log_file"
    # A pipeline reports only its last stage (tee) by default; PIPESTATUS
    # keeps the trainer's own exit status so failures are not masked.
    stage_status=("${PIPESTATUS[@]}")

    if ((stage_status[0] != 0)); then
        echo "Error: Supervised Fine-tuning failed with exit status ${stage_status[0]} at $(date). See $log_file" >&2
        exit "${stage_status[0]}"
    fi
    if ((stage_status[1] != 0)); then
        echo "Warning: tee exited with status ${stage_status[1]}; the log at $log_file may be incomplete" >&2
    fi

    echo "Supervised Fine-tuning completed at $(date). Check log files for results."
}

main "$@"