import os.path
from pathlib import Path

import polars as pl
import pytest
from polars.testing import assert_frame_equal

from src.pipelines.base_pipeline import BasePipeline
from src.pipelines.chemspace_pipeline import ChemSpacePipeline
from src.pipelines.mcule_pipeline import MculePipeline
from src.pipelines.pubchem_pipeline import PubChemPipeline
from src.pipelines.unichem_pipeline import UniChemPipeline


@pytest.mark.parametrize(
    ("filename", "pipeline_cls", "download_url"),
    [
        (
            "chemspace.sdf",
            ChemSpacePipeline,
            "https://paste.c-net.org/JunkiesPredator",
        ),
        (
            "mcule.tsv",
            MculePipeline,
            "https://paste.c-net.org/ThompsonJackson",
        ),
        (
            "pubchem.tsv",
            PubChemPipeline,
            "https://paste.c-net.org/TriedSpilled",
        ),
        (
            "unichem.tsv",
            UniChemPipeline,
            "https://paste.c-net.org/HummelQuick",
        ),
    ],
)
def test_process(
    filename: str,
    pipeline_cls: type[BasePipeline],
    download_url: str,
    tmp_path,
):
    """Run a pipeline end to end and compare its output with a stored reference.

    The pipeline instance gets ``url`` set to ``download_url`` and
    ``output_dir`` set to ``tmp_path`` before ``process()`` is called.
    Afterwards ``<name>_preprocessed.parquet`` must exist in ``tmp_path``,
    where ``<name>`` is ``filename`` up to its first dot.

    The Parquet output is re-exported as CSV and read back, so both frames
    in the comparison are produced by ``pl.read_csv``. The reference file is
    ``mock/<name>_preprocessed.csv`` next to this test module. Row order is
    ignored when comparing.
    """
    # Everything before the first "." names both the output and the reference.
    stem = filename.partition(".")[0]

    mock_dir = Path(__file__).resolve().parent / "mock"
    expected_csv = mock_dir / f"{stem}_preprocessed.csv"
    produced_parquet = tmp_path / f"{stem}_preprocessed.parquet"
    produced_csv = tmp_path / f"{stem}_preprocessed.csv"

    runner = pipeline_cls()
    # TODO(#40): Use "better" files hosting
    runner.url = download_url
    runner.output_dir = tmp_path
    runner.process()

    assert os.path.exists(produced_parquet)

    # Round-trip through CSV so the produced frame is parsed the same way as
    # the reference frame.
    pl.read_parquet(produced_parquet).write_csv(produced_csv)

    actual = pl.read_csv(produced_csv)
    expected = pl.read_csv(expected_csv)

    assert_frame_equal(actual, expected, check_row_order=False)
