import os
import openml
from tqdm import tqdm
from multiprocessing import Pool, cpu_count
from constants import OUTPUT_DIR


# Create the output directory at import time (no error if it already exists)
# so that every later CSV write has a directory to land in.
os.makedirs(OUTPUT_DIR, exist_ok=True)


def download_dataset(args: tuple[int, str]) -> None:
    """Download one OpenML dataset and store it as ``{OUTPUT_DIR}/{name}.csv``.

    Nothing is done when the target CSV already exists. Otherwise the data is
    first written to a process-specific temporary file next to the target and
    then moved into place with ``os.replace``. This way an interrupted or
    failed write never leaves a truncated ``.csv`` behind (which the existence
    check would treat as a finished download on the next run), and two worker
    processes that receive the same ``name`` cannot interleave their output
    in a single file.

    Args:
        args (tuple[int, str]): The dataset id and name.
    """
    did, name = args

    csv_path = f"{OUTPUT_DIR}/{name}.csv"
    if os.path.exists(csv_path):
        return

    dataset = openml.datasets.get_dataset(
        did,
        download_data=False,
        download_qualities=False,
        download_features_meta_data=False,
    )
    # Only the first element returned by get_data() is written out.
    df = dataset.get_data()[0]

    # The pid makes the temporary name unique per worker process; the ".tmp"
    # suffix keeps it from ever being mistaken for a finished CSV.
    tmp_path = f"{csv_path}.{os.getpid()}.tmp"
    try:
        df.to_csv(tmp_path, index=False)
        os.replace(tmp_path, csv_path)
    finally:
        # After a successful replace the temp file is gone; if anything failed
        # before that point, remove the partial file instead of leaking it.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


def main() -> None:
    """List OpenML datasets, filter them, and download the matches in parallel.

    The dataset listing is fetched as a dataframe, filtered, and each
    remaining ``(did, name)`` pair is handed to ``download_dataset`` through a
    process pool with one worker per CPU. A progress bar advances once per
    finished dataset.
    """
    datasets = openml.datasets.list_datasets(output_format="dataframe")
    # Keep datasets where all features but one are numeric and there are
    # exactly two classes (presumably the one remaining feature is the class
    # label -- verify against the OpenML metadata if this matters).
    datasets = datasets.query(
        "NumberOfNumericFeatures + 1 == NumberOfFeatures and NumberOfClasses == 2"
    )

    print(f"Filtered datasets: {len(datasets)}")

    jobs = datasets[["did", "name"]].values
    # Managing the progress bar with `with` ensures it is closed even when a
    # worker raises and imap_unordered re-raises the error in this process.
    with Pool(cpu_count()) as p, tqdm(total=len(datasets)) as t:
        for _ in p.imap_unordered(download_dataset, jobs):
            t.update(1)


if __name__ == "__main__":
    main()
