#!/bin/bash
#
# Launch supervised fine-tuning (train.py via `accelerate launch`) for a
# single dataset, using per-dataset hyperparameters.
#
# Usage:   bash run_sft.sh <dataset_name> [seed] [llm_model] [prompt_type]
# Example: bash run_sft.sh cora 0 Mistral-7B neighbor_label
#
# Arguments:
#   $1  dataset name (required; see SUPPORTED_DATASETS below)
#   $2  seed                    (default: 0)
#   $3  LLM name                (default: Mistral-7B)
#   $4  prompt type             (default: neighbor_label)
#
# The training directory is resolved relative to the *current working
# directory*, so the script must be invoked from a directory that is a
# sibling of LLMPredictor/. Trainer output (stdout and stderr) is mirrored
# to ./logs/transductive_<dataset>_<llm>_<prompt_type>_seed<seed>.log inside
# the training directory.
#
# Exit status:
#   0  training command succeeded
#   1  bad usage, unsupported dataset, or setup (cd / mkdir) failure
#   otherwise the exit status of the `accelerate launch` command

readonly SUPPORTED_DATASETS="cora, citeseer, wikics, instagram, pubmed, reddit, photo, computer, history"
readonly TRAIN_DIR="../LLMPredictor/InstructionTuning/"

# Check if dataset name is provided
if [[ $# -eq 0 ]]; then
    {
        echo "Usage: bash run_sft.sh <dataset_name> [seed] [llm_model] [prompt_type]"
        echo "Supported datasets: $SUPPORTED_DATASETS"
        echo "Supported LLMs: Mistral-7B (default)"
        echo "Prompt type: neighbor_label (default)"
        echo "Example: bash run_sft.sh cora 0 Mistral-7B neighbor_label"
    } >&2
    exit 1
fi

DATASET=$1
SEED=${2:-0}                     # Default seed is 0
LLM=${3:-Mistral-7B}             # Default LLM is Mistral-7B
prompt_type=${4:-neighbor_label} # Default prompt type is neighbor_label

# Select hyperparameters for the dataset. This runs before any side effect
# (cd, mkdir) so that an unsupported dataset fails without touching the disk.
case "$DATASET" in
    cora | citeseer)
        config_label="small-scale configuration"
        num_epoch=16
        batch_size=4
        max_origin_txt_length=200
        ;;
    wikics | pubmed)
        config_label="middle-scale configuration (wikics/pubmed)"
        num_epoch=4
        batch_size=4
        max_origin_txt_length=200
        ;;
    instagram)
        config_label="instagram configuration"
        num_epoch=8
        batch_size=12
        max_origin_txt_length=100
        ;;
    reddit | photo | computer | history)
        config_label="large-scale configuration"
        num_epoch=2
        batch_size=4
        max_origin_txt_length=200
        ;;
    *)
        echo "Error: Unsupported dataset '$DATASET'" >&2
        echo "Supported datasets: $SUPPORTED_DATASETS" >&2
        exit 1
        ;;
esac

# Change to InstructionTuning directory; abort instead of training (and
# writing logs) from whatever directory we happen to be in.
if ! cd -- "$TRAIN_DIR"; then
    echo "Error: cannot change directory to '$TRAIN_DIR' (from $PWD)" >&2
    exit 1
fi

echo "Starting Supervised Fine-tuning at $(date)..."
echo "Dataset: $DATASET, Seed: $SEED, LLM: $LLM, Prompt Type: $prompt_type"

# Create logs directory if it doesn't exist
if ! mkdir -p ./logs; then
    echo "Error: cannot create log directory './logs' in $PWD" >&2
    exit 1
fi

log_file="./logs/transductive_${DATASET}_${LLM}_${prompt_type}_seed${SEED}.log"

# Build the argument list as an array so values containing spaces or glob
# characters reach train.py as single, unmodified arguments.
train_args=(
    --num_epoch="$num_epoch"
    --llm="$LLM"
    --batch_size="$batch_size"
    --re_split=1
    --dataset="$DATASET"
    --max_txt_length=80
    --max_origin_txt_length="$max_origin_txt_length"
    --seed="$SEED"
    --prompt_type="$prompt_type"
)

echo "Running $DATASET with ${config_label}..."
accelerate launch train.py "${train_args[@]}" 2>&1 | tee "$log_file"
# The pipeline's own status is tee's; PIPESTATUS[0] is the trainer's and
# must be read immediately, before any other command overwrites it.
train_status=${PIPESTATUS[0]}

if [[ "$train_status" -ne 0 ]]; then
    echo "Error: Supervised Fine-tuning failed with exit status $train_status at $(date). See $log_file for details." >&2
    exit "$train_status"
fi

echo "Supervised Fine-tuning completed at $(date). Check log files for results."