#!/bin/bash
# preprocess_data.sh: Script to preprocess datasets for DGSM-SCAM-GAT and MMT-ViT
# Note: "bash scripts/preprocess_data.sh" only needs to be run once.

# Abort on the first command that fails.
set -e

# Define project directory (relative to script location)
# Resolve it as an absolute path: the parent of the directory that contains
# this script. Anchoring on the script's own location (instead of applying
# dirname twice to "$0") keeps the result correct no matter where the script
# is launched from, e.g. both "bash scripts/preprocess_data.sh" and
# "cd scripts && bash preprocess_data.sh".
# CDPATH is cleared for the cd so that it can neither redirect the lookup nor
# print the directory name into the captured output.
PROJECT_DIR="$(CDPATH= cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.." && pwd)"
# The Python entry points are looked up in CODE_DIR, which is the project root itself.
CODE_DIR="$PROJECT_DIR"

# Check Python
# The interpreter is invoked as `python` further down, so that exact command
# name must be resolvable.
if ! command -v python &> /dev/null; then
    echo "Error: Python is not installed. Please ensure Python 3.12 or higher is installed."
    echo "If using Conda, activate your environment (e.g., 'conda activate your_env'). See README.md."
    exit 1
fi

# Check Python version
# Take the second space-separated field of the `python --version` output.
# stderr is merged in so the version is captured even when the interpreter
# prints it there (as Python 2 does).
PYTHON_VERSION=$(python --version 2>&1 | cut -d ' ' -f 2)
# Extract numeric major/minor components. Validating them here matters: with
# empty or non-numeric values the numeric tests below would error out inside
# the `if`, evaluate as false, and let an unknown interpreter pass the check.
if [[ "$PYTHON_VERSION" =~ ^([0-9]+)\.([0-9]+) ]]; then
    PYTHON_MAJOR=${BASH_REMATCH[1]}
    PYTHON_MINOR=${BASH_REMATCH[2]}
else
    echo "Error: Could not determine the Python version (got '$PYTHON_VERSION')."
    echo "If using Conda, activate an environment with Python 3.12+. See README.md."
    exit 1
fi
# Reject anything older than 3.12.
if [ "$PYTHON_MAJOR" -lt 3 ] || { [ "$PYTHON_MAJOR" -eq 3 ] && [ "$PYTHON_MINOR" -lt 12 ]; }; then
    echo "Error: Python 3.12 or higher is required. Found Python $PYTHON_VERSION."
    echo "If using Conda, activate an environment with Python 3.12+. See README.md."
    exit 1
fi
echo "Python version: $PYTHON_VERSION"

# Set PYTHONPATH
# Prepend the project root so it is searched first for imports. The previous
# value is appended only when it is non-empty: an unconditional
# "$PROJECT_DIR:$PYTHONPATH" leaves a trailing ":" when PYTHONPATH is unset,
# i.e. an empty path entry, which Python treats as the current working
# directory.
export PYTHONPATH="$PROJECT_DIR${PYTHONPATH:+:$PYTHONPATH}"
echo "PYTHONPATH: $PYTHONPATH"

# Guard: the directory that holds the Python entry points must be present.
[ -d "$CODE_DIR" ] || {
    echo "Error: Code directory $CODE_DIR does not exist."
    exit 1
}

# Guard: requirements.txt must sit in the project root. Nothing is installed
# here; the user is only reminded of the command to run.
[ -f "$PROJECT_DIR/requirements.txt" ] || {
    echo "Error: requirements.txt not found in $PROJECT_DIR."
    exit 1
}
echo "Ensure dependencies are installed: pip install -r $PROJECT_DIR/requirements.txt"


# Guard: the MMT-ViT preprocessing script must be a regular file in CODE_DIR.
test -f "$CODE_DIR/mmt-ViT_data_preprocessing.py" || {
    echo "Error: mmt-ViT_data_preprocessing.py not found in $CODE_DIR."
    exit 1
}


# Verify, in order, that each required dataset directory exists; stop at the
# first one that is missing.
# Each entry is "<path below data/>|<dataset name shown in the hint message>".
for dataset_entry in \
    "big2015/dataset_big2015|big2015" \
    "big2015_yz/malimg_25|malimg_25" \
    "big2015_yz/Malevis_malimg_31|Malevis_malimg_31"; do
    dataset_dir="$PROJECT_DIR/data/${dataset_entry%%|*}"
    dataset_name="${dataset_entry##*|}"
    if [ ! -d "$dataset_dir" ]; then
        echo "Error: Dataset directory $dataset_dir does not exist."
        echo "Please ensure $dataset_name dataset is prepared. See README.md."
        exit 1
    fi
done

#*************************************************************
# The mal_api_2019 data has already been preprocessed and placed in the folder.
# If you want to preprocess the data yourself, use the dgsm-scam-gat_yz_api_data_2019.py
# script: download the dataset, rename the dataset text file to "mal_api_2019.txt"
# and the labels text file to "mal_api_2019_lables.txt", then place them in the folder
# at the path "ProgectPytorch\data\mal_api_2019". Ensure the file names are correct.
# Additionally, ensure that the API_name_307.xlsx file is placed in the folder at the
# path "ProgectPytorch\data\mal_api_2019".

#echo "Preprocessing data for mal_api_2019..."

#python "$CODE_DIR/dgsm-scam-gat_yz_api_data_2019.py"
#if [ $? -ne 0 ]; then
#    echo "Error: Failed to preprocess mal_api_2019 data."
#    exit 1
#fi
#*************************************************************

echo "Preprocessing data for MMT-ViT (big2015, malimg, Malevis_malimg)..."
# Run the preprocessing script as the condition of the `if`. With `set -e`
# active, a bare failing command terminates the script on the spot, so a
# separate `$?` check placed after it would never get to print its error
# message; a command used as an `if` condition is exempt from `set -e`.
if ! python "$CODE_DIR/mmt-ViT_data_preprocessing.py"; then
    echo "Error: Failed to preprocess MMT-ViT data."
    exit 1
fi

echo "Data preprocessing completed successfully."