"""Download datasets from TextTabBench (TTB)"""

import zipfile
import subprocess
from pathlib import Path
import pandas as pd
import json
import shutil

# Root of the raw tabular data tree: "data/tabular/raw" under the directory
# two levels above the one that contains this file.
RAW_DIR = Path(__file__).parents[2] / "data/tabular/raw"


def download_kaggle_dataset(dataset_config: dict, download_path: Path):
    """Fetch one Kaggle dataset or competition archive via the ``kaggle`` CLI.

    ``dataset_config["remote_path"]`` is handed to the CLI as the remote
    identifier. A truthy ``dataset_config["competition"]`` selects
    ``kaggle competitions download -c``; a missing or falsy value selects
    ``kaggle datasets download -d``. The CLI is told to write into
    ``download_path`` through its ``-p`` option.

    Raises:
        subprocess.CalledProcessError: If the CLI exits with a non-zero status.
    """
    if dataset_config.get("competition"):
        subcommand, id_flag = "competitions", "-c"
    else:
        subcommand, id_flag = "datasets", "-d"

    command = [
        "kaggle",
        subcommand,
        "download",
        id_flag,
        dataset_config["remote_path"],
        "-p",
        str(download_path),
    ]
    subprocess.run(command, check=True)


def extract_zip_files(download_path: Path):
    """Unpack every ``*.zip`` found directly in ``download_path``, then delete it.

    Each archive is extracted into ``download_path`` itself. An archive is
    removed only after its extraction finished without raising.
    """
    for archive_path in download_path.glob("*.zip"):
        with zipfile.ZipFile(archive_path) as archive:
            archive.extractall(path=download_path)
        # The archive is no longer needed once its contents are on disk.
        archive_path.unlink()


def _load_json_config(config_path: Path):
    """Parse one JSON configuration file, always decoding it as UTF-8."""
    # An explicit encoding keeps parsing independent of the platform locale.
    with open(config_path, "r", encoding="utf-8") as f:
        return json.load(f)


def _resolve_download_path(dataset_config: dict) -> Path:
    """Return the raw-data directory for a dataset, grouped by its task type."""
    task = dataset_config["task"]
    if task == "clf":
        task_folder = "classification"
    elif task == "reg":
        task_folder = "regression"
    else:
        raise ValueError(f"Unknown task: {task}")
    dataset_subfolder = task_folder + "/" + dataset_config["dataset_name"]
    return RAW_DIR / "texttabbench" / dataset_subfolder


def _remove_unlisted_entries(download_path: Path, keep_names) -> None:
    """Delete every entry of ``download_path`` whose name is not in ``keep_names``."""
    # Snapshot the listing first so nothing is deleted while the directory
    # is still being iterated.
    for entry in list(download_path.iterdir()):
        if entry.name in keep_names:
            continue
        if entry.is_dir():
            shutil.rmtree(entry)
        else:
            entry.unlink()


def _install_table_file(download_path: Path, target_csv: Path, dataset_name) -> None:
    """Make ``target_csv`` exist by renaming a CSV or converting a TSV."""
    if target_csv.exists():
        # Renaming another file onto the target could overwrite data that is
        # already stored under the expected name.
        print(f"{target_csv.name} is already in place, nothing to rename")
        return

    # Path.glob yields entries in no particular order; sorting makes the
    # choice reproducible when more than one candidate is present.
    csv_files = sorted(download_path.glob("*.csv"))
    if csv_files:
        original_csv = csv_files[0]
        original_csv.rename(target_csv)
        print(f"Renamed {original_csv.name} to {target_csv.name}")
        return

    tsv_files = sorted(download_path.glob("*.tsv"))
    if tsv_files:
        original_tsv = tsv_files[0]
        df = pd.read_csv(original_tsv, sep="\t")
        df.to_csv(target_csv, index=False)
        original_tsv.unlink()
        print(f"Converted {original_tsv.name} to {target_csv.name}")
    else:
        print(f"No CSV/TSV file found for dataset: {dataset_name}")


def main():
    """Download, unpack and normalise every configured dataset.

    Configurations are read from ``ttb_data_config.json`` and
    ``ttb_extra_config.json`` located next to this script; on a key
    collision the entry from the extra file wins. For each dataset:

    0. Skip it when ``rename_files[0]`` already exists in its directory.
    1. Download it with the Kaggle CLI.
    2. Extract the downloaded zip archives.
    3. Delete every entry whose name is not listed under ``files``.
    4. Rename the first CSV (sorted by path) to ``rename_files[0]``, or,
       when no CSV is present, convert the first TSV to that CSV.

    Raises:
        ValueError: If a dataset's ``task`` is neither ``"clf"`` nor ``"reg"``.
        subprocess.CalledProcessError: If a Kaggle CLI download fails.
    """
    config_dir = Path(__file__).parent
    data_configs = _load_json_config(config_dir / "ttb_data_config.json")
    extra_configs = _load_json_config(config_dir / "ttb_extra_config.json")

    all_configs = {**data_configs, **extra_configs}

    for dataset_name, dataset_config in all_configs.items():
        print(f"Processing dataset: {dataset_name}")
        download_path = _resolve_download_path(dataset_config)
        download_path.mkdir(parents=True, exist_ok=True)

        # Step 0: Check if dataset already exists
        target_csv = download_path / dataset_config["rename_files"][0]
        if target_csv.exists():
            print("Dataset already exists, skipping download.")
            continue

        # Step 1: Download dataset
        download_kaggle_dataset(dataset_config, download_path)

        # Step 2: Extract dataset
        extract_zip_files(download_path)

        # Step 3: Delete unnecessary files
        _remove_unlisted_entries(download_path, dataset_config["files"])

        # Step 4: Rename CSV file (or convert TSV) if necessary
        _install_table_file(download_path, target_csv, dataset_name)


# Run the download pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()
