"""Download datasets from CARTE: https://huggingface.co/datasets/inria-soda/carte-benchmark."""

import requests
from pathlib import Path
from zipfile import ZipFile
import json
import shutil

# Both data locations hang off the directory two levels above the one that
# contains this file.
_BASE_DIR = Path(__file__).parents[2]

# Where the untouched upstream archives are unpacked.
RAW_DIR = _BASE_DIR.joinpath("data/tabular/raw")
# Where the ready-to-use tables are stored.
PROCESSED_DIR = _BASE_DIR.joinpath("data/tabular/processed")


def _download_with_request(url, download_path, timeout=60):
    """Stream the resource at ``url`` into the file ``download_path``.

    Args:
        url: Address of the file to download.
        download_path: Destination file (``str`` or ``Path``). Missing parent
            directories are created.
        timeout: Seconds to wait for the connection and for each chunk of
            data before giving up. Forwarded to ``requests.get``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    download_path = Path(download_path)
    # open() does not create directories, so make sure the target one exists.
    download_path.parent.mkdir(parents=True, exist_ok=True)

    # The context manager releases the connection even if writing fails.
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Fail before touching the disk: otherwise an HTML error page would be
        # saved under the archive name and only blow up later in ZipFile.
        response.raise_for_status()
        with open(download_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)


def _download_raw():
    """Download the raw CARTE archive and unpack it into ``RAW_DIR / "carte"``.

    The zip file is stored temporarily inside the extraction directory and is
    deleted once extraction has finished, whether or not it succeeded.
    """
    url = "https://huggingface.co/datasets/inria-soda/carte-benchmark/resolve/main/data_raw.zip"
    extract_dir = RAW_DIR / "carte"
    # The archive is written into this directory, so it must exist first.
    extract_dir.mkdir(parents=True, exist_ok=True)
    download_path = extract_dir / "data_raw.zip"

    _download_with_request(url, download_path)
    try:
        with ZipFile(download_path, "r") as archive:
            archive.extractall(path=extract_dir)
    finally:
        # Do not leave the (possibly corrupt) archive behind on failure.
        download_path.unlink()


def _download_preprocessed(include_llm=False):
    """Download the preprocessed CARTE archive and unpack it into ``PROCESSED_DIR``.

    Args:
        include_llm: If true, fetch ``data_singletable.zip``; otherwise fetch
            ``data_singletable_light.zip``.

    The zip file is stored temporarily in ``PROCESSED_DIR`` and is deleted once
    extraction has finished, whether or not it succeeded.
    """
    if include_llm:
        url = "https://huggingface.co/datasets/inria-soda/carte-benchmark/resolve/main/data_singletable.zip"
    else:
        url = "https://huggingface.co/datasets/inria-soda/carte-benchmark/resolve/main/data_singletable_light.zip"

    # The archive is written into this directory, so it must exist first.
    PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
    download_path = PROCESSED_DIR / "data_singletable.zip"

    _download_with_request(url, download_path)
    try:
        with ZipFile(download_path, "r") as archive:
            archive.extractall(path=PROCESSED_DIR)
    finally:
        # Do not leave the (possibly corrupt) archive behind on failure.
        download_path.unlink()


def _process_carte_data():
    """Sort the extracted CARTE datasets by task and merge their configs.

    Every sub-directory of ``PROCESSED_DIR / "data_singletable"`` that holds
    both ``config_data.json`` and ``raw.parquet`` is handled as one dataset:

    * its config is stored under the dataset name in a combined
      ``carte_config.json`` written next to this file;
    * its parquet file is moved to ``PROCESSED_DIR / "regression"`` or
      ``PROCESSED_DIR / "classification"`` (chosen by the config's ``"task"``
      entry) and renamed to ``carte_<dataset_name>.parquet``.

    Datasets with any other task value are reported and not moved, but their
    config is still included in the combined file. The extracted source
    directory is removed at the end. If it does not exist, a message is
    printed and nothing else happens.
    """
    source_dir = PROCESSED_DIR / "data_singletable"
    if not source_dir.exists():
        print(f"Source directory {source_dir} not found. Skipping processing.")
        return

    regression_dir = PROCESSED_DIR / "regression"
    classification_dir = PROCESSED_DIR / "classification"
    regression_dir.mkdir(parents=True, exist_ok=True)
    classification_dir.mkdir(parents=True, exist_ok=True)

    all_configs = {}

    # iterdir() yields entries in arbitrary order; sorting keeps the key order
    # of the combined config file stable from one run to the next.
    for dataset_dir in sorted(source_dir.iterdir()):
        if not dataset_dir.is_dir():
            continue

        dataset_name = dataset_dir.name
        config_path = dataset_dir / "config_data.json"
        parquet_path = dataset_dir / "raw.parquet"

        if not config_path.exists() or not parquet_path.exists():
            continue

        # Explicit encoding: the platform default is not UTF-8 everywhere.
        with open(config_path, "r", encoding="utf-8") as f:
            config = json.load(f)
        all_configs[dataset_name] = config

        task_type = config.get("task")
        if task_type == "regression":
            dest_dir = regression_dir
        elif task_type == "classification":
            dest_dir = classification_dir
        else:
            print(f"Unknown task type for {dataset_name}. Skipping.")
            continue

        new_parquet_path = dest_dir / f"carte_{dataset_name}.parquet"
        # replace() overwrites an existing target on every platform, whereas
        # rename() raises FileExistsError on Windows when re-running.
        parquet_path.replace(new_parquet_path)

    # Save the concatenated config file
    with open(Path(__file__).parent / "carte_config.json", "w", encoding="utf-8") as f:
        json.dump(all_configs, f, indent=4)

    # Clean up the original extracted folder
    shutil.rmtree(source_dir)


# Script entry point. The calls are order-dependent: the processing step
# works on the folder produced by the preprocessed download.
if __name__ == "__main__":
    # Fetch and unpack the raw archive under RAW_DIR / "carte".
    _download_raw()
    # Fetch and unpack the "light" preprocessed archive under PROCESSED_DIR.
    _download_preprocessed(include_llm=False)
    # Split the extracted datasets by task and write the combined config.
    _process_carte_data()
