"""Download tables from WikiDBs."""

from pathlib import Path
import requests
import zipfile
import shutil

# Directory where the WikiDBs archives are downloaded and extracted:
# ``data/tabular/raw/wikidbs`` under the directory two levels above this
# file's own folder (presumably the repository root -- confirm against the
# project layout).
WIKIDBS_DIR = Path(__file__).parents[2] / "data/tabular/raw/wikidbs"


def download_and_extract_wikidbs() -> None:
    """Download the WikiDBs archives from Zenodo and extract them into WIKIDBS_DIR.

    Each of the five ``part-N.zip`` archives is streamed to disk, extracted
    into ``WIKIDBS_DIR`` and then deleted, so at most one archive is kept on
    disk at a time. The archive file is also removed when the download or the
    extraction fails, so no truncated zip is left behind.

    Raises:
        requests.RequestException: If a download fails, times out, or the
            server answers with an HTTP error status.
        zipfile.BadZipFile: If a downloaded file is not a valid zip archive.
    """
    archive_urls = [
        "https://zenodo.org/records/11559814/files/part-0.zip?download=1",
        "https://zenodo.org/records/11559814/files/part-1.zip?download=1",
        "https://zenodo.org/records/11559814/files/part-2.zip?download=1",
        "https://zenodo.org/records/11559814/files/part-3.zip?download=1",
        "https://zenodo.org/records/11559814/files/part-4.zip?download=1",
    ]
    # (connect, read) timeouts in seconds. The read timeout bounds the wait
    # between two chunks, not the whole transfer, so large archives still
    # download fine while a stalled connection no longer hangs forever.
    timeout = (10, 60)

    out_dir = WIKIDBS_DIR
    out_dir.mkdir(parents=True, exist_ok=True)

    for idx, archive_url in enumerate(archive_urls):
        archive_path = out_dir / f"wikidbs_part_{idx}.zip"
        try:
            print(f"Downloading WikiDBs archive from {archive_url} to {archive_path}...")
            # The context manager releases the streamed connection even when
            # the transfer is interrupted half-way.
            with requests.get(archive_url, stream=True, timeout=timeout) as response:
                response.raise_for_status()
                with open(archive_path, "wb") as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:  # skip keep-alive chunks
                            f.write(chunk)
            print("Archive download complete.")

            print("Extracting archive...")
            with zipfile.ZipFile(archive_path, "r") as zip_ref:
                zip_ref.extractall(out_dir)
            print("Extraction complete.")
        finally:
            # Remove the zip after extraction, and also after a failure so a
            # partial or corrupt archive does not linger on disk.
            if archive_path.exists():
                archive_path.unlink()


def remove_unecessary_files(wikidbs_dir=None) -> None:
    """Keep only the allow-listed database folders and delete the rest.

    Every ``part-*`` directory directly inside the data directory is scanned.
    Sub-directories whose name starts with one of the five-character ids in
    ``necessary_folders`` are moved up into the data directory (unless a
    folder with the same name already exists there); afterwards the whole
    ``part-*`` directory, with everything left in it, is deleted.

    Args:
        wikidbs_dir: Directory containing the extracted ``part-*`` folders,
            as a ``Path`` or string. Defaults to ``WIKIDBS_DIR``.

    Raises:
        FileNotFoundError: If the data directory does not exist.
    """
    root = WIKIDBS_DIR if wikidbs_dir is None else Path(wikidbs_dir)

    necessary_folders = {
        "00473",
        "02053",
        "03977",
        "07136",
        "07310",
        "07900",
        "09510",
        "14012",
        "14976",
        "15542",
        "19664",
        "28146",
        "28324",
        "29832",
        "30417",
        "36100",
        "42562",
        "46159",
        "47746",
        "53353",
        "56474",
        "62826",
        "63797",
        "64477",
        "65102",
        "66610",
        "66643",
        "67195",
        "70780",
        "70942",
        "73376",
        "82939",
        "87283",
        "88197",
        "89039",
        "89439",
        "90741",
        "90930",
        "92415",
        "94062",
        "97229",
        "97297",
    }

    # Snapshot the part-* directories first: folders are moved into ``root``
    # below, so the listing must not be taken while it is being modified.
    part_dirs = [
        item
        for item in root.iterdir()
        if item.is_dir() and item.name.startswith("part-")
    ]

    for part_dir in part_dirs:
        for sub_item in part_dir.iterdir():
            # Only the first five characters of the folder name are compared
            # against the allow-list.
            if sub_item.is_dir() and sub_item.name[:5] in necessary_folders:
                destination = root / sub_item.name
                # Never overwrite a folder kept by an earlier run or part.
                if not destination.exists():
                    sub_item.rename(destination)

        # Everything still inside the part directory is unwanted. A single
        # rmtree also copes with symlinked entries, which a per-entry
        # is_dir()/rmtree check would trip over.
        shutil.rmtree(part_dir)


if __name__ == "__main__":
    # Run as a script: first fetch and unpack every archive, then prune the
    # extracted part-* directories down to the allow-listed database folders.
    download_and_extract_wikidbs()
    remove_unecessary_files()
