import os

import duckdb

from src.pipelines.base_pipeline import BasePipeline


class UniChemPipeline(BasePipeline):
    """Download the UniChem structure table dump and convert it to Parquet.

    The upstream archive is a single gzipped TSV file with the columns
    ``[UCI, STANDARDINCHI, STANDARDINCHIKEY]``.
    """

    def __init__(self) -> None:
        super().__init__(
            source_name="UniChem",
            filename="unichem.tsv",
            archive_name="structure.tsv.gz",
        )
        self.url = "https://ftp.ebi.ac.uk/pub/databases/chembl/UniChem/data/table_dumps/structure.tsv.gz"

    @staticmethod
    def _sql_string_literal(value: str) -> str:
        """Return *value* as a single-quoted SQL string literal.

        Embedded single quotes are doubled so that a path containing ``'``
        cannot terminate the literal early and corrupt the statement.
        """
        return "'" + value.replace("'", "''") + "'"

    def download(self, force_download: bool = False) -> None:
        """Fetch the UniChem archive and rename the extracted TSV.

        The extracted file is expected at ``structure.tsv`` inside
        ``self.output_dir`` and is moved to ``self.filename``.

        Args:
            force_download: Forwarded unchanged to
                ``_download_single_file_archive``.
        """
        downloaded_file = self._download_single_file_archive(
            url=self.url,
            force_download=force_download,
        )
        # NOTE(review): presumably the helper returns a falsy value when it
        # did not download/extract anything - confirm against BasePipeline.
        if downloaded_file:
            # os.replace overwrites an existing target on every platform,
            # whereas os.rename raises on Windows if the target exists.
            os.replace(
                os.path.join(self.output_dir, "structure.tsv"),
                os.path.join(self.output_dir, self.filename),
            )

    def preprocess(self) -> None:
        """Convert the downloaded TSV into a two-column Parquet file.

        Only ``UCI`` (renamed to ``id``) and ``STANDARDINCHI`` (renamed to
        ``InChI``) are kept. If the preprocessed file already exists, the
        step is skipped.

        The Parquet output is first written to a temporary sibling file and
        then moved into place, so an interrupted or failed conversion never
        leaves a partial file that a later run would mistake for a finished
        result.
        """
        input_file_path = os.path.join(self.output_dir, self.filename)
        output_file_path = os.path.join(self.output_dir, self.preprocessed_filename)

        # check if preprocessed file already exists
        if os.path.exists(output_file_path):
            print("Found preprocessed dataset, skipping")
            return

        tmp_output_path = f"{output_file_path}.tmp"
        input_literal = self._sql_string_literal(input_file_path)
        tmp_output_literal = self._sql_string_literal(tmp_output_path)

        try:
            # The '\t' below sits in a non-raw f-string, so Python turns it
            # into a real tab character before DuckDB sees the statement.
            duckdb.sql(
                f"""
                COPY (
                    SELECT
                        UCI AS id,
                        STANDARDINCHI AS InChI
                    FROM
                        read_csv({input_literal}, delim='\t')
                )
                TO {tmp_output_literal} (FORMAT parquet);
                """
            )
            os.replace(tmp_output_path, output_file_path)
        finally:
            # After a successful replace the temp file is gone and this is a
            # no-op; on any failure it removes the partial output.
            if os.path.exists(tmp_output_path):
                os.remove(tmp_output_path)
