"""Aggregate SMILES strings from external test set snapshots stored locally.

The original project collected several community benchmarks.  For the
anonymised submission we ship compact text files under ``data/external`` that
contain representative SMILES strings.  If reviewers have access to the full
benchmarks they can drop replacement files at the same locations and the script
will pick them up automatically.
"""

from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Iterable, List

import logging
import pickle


# Directory that contains this script; every default location below hangs off it.
SUBMISSION_ROOT = Path(__file__).resolve().parent
# Folder holding the shipped ``.smi`` snapshots (and, by default, the output pickle).
DATA_DIR = SUBMISSION_ROOT.joinpath("data", "external")
# Folder that receives the run log.
LOG_DIR = SUBMISSION_ROOT.joinpath("logs")
# Default destination of the pickled, de-duplicated SMILES list.
DEFAULT_SAVE_PATH = DATA_DIR.joinpath("external_test_set_molecules.pkl")


# Dataset name -> text file the SMILES strings for that dataset are read from.
DEFAULT_DATASET_FILES: Dict[str, Path] = {
    "llasmol": DATA_DIR.joinpath("llasmol_test_set.smi"),
    "chemIQ": DATA_DIR.joinpath("chemiq_test_set.smi"),
    "chemDFM": DATA_DIR.joinpath("chemdfm_test_set.smi"),
    "ether0": DATA_DIR.joinpath("ether0_test_set.smi"),
}


# Built-in stand-ins, keyed by the same dataset names as above.  They are used
# when a dataset's file is missing or yields no SMILES strings.
SAMPLE_DATASETS: Dict[str, List[str]] = {
    "llasmol": ["CCO", "CCN", "c1ccccc1"],
    "chemIQ": ["CC(=O)O", "CCOC", "CCCN"],
    "chemDFM": ["CCOCC", "c1ccncc1", "CC(C)O"],
    "ether0": ["CCOC", "CC(C)CO", "C1=CC=CC=C1"],
}


@dataclass
class ExternalTestSetConfig:
    """Settings for one run of the external test set collection.

    All fields are optional.  Path-like values given as plain strings are
    converted to ``Path`` objects after initialisation, and ``dataset_files``
    is always stored as a private copy so that editing one configuration never
    alters ``DEFAULT_DATASET_FILES`` or another configuration.
    """

    # Pickle file that receives the sorted, de-duplicated SMILES list.
    save_path: Path = DEFAULT_SAVE_PATH
    # File the run log is written to.
    log_file_path: Path = LOG_DIR / "collect_external_test_set_molecules.log"
    # Dataset name -> SMILES file.  ``None`` selects DEFAULT_DATASET_FILES.
    dataset_files: Dict[str, Path] | None = None

    def __post_init__(self) -> None:
        """Normalise path fields and resolve the dataset mapping."""
        self.save_path = Path(self.save_path)
        self.log_file_path = Path(self.log_file_path)

        source = DEFAULT_DATASET_FILES if self.dataset_files is None else self.dataset_files
        # Build a fresh dict: storing the module-level default directly would
        # let a mutation of this instance leak into every later configuration.
        self.dataset_files = {name: Path(file_path) for name, file_path in source.items()}


def _read_smiles_file(path: Path) -> List[str]:
    if not path.exists():
        return []

    with path.open("r", encoding="utf-8") as handle:
        lines = [line.strip() for line in handle]

    return [line for line in lines if line]


def _collect_dataset_smiles(dataset_files: Dict[str, Path]) -> Dict[str, List[str]]:
    """Load the SMILES strings of every dataset, falling back to built-in samples.

    Args:
        dataset_files: Mapping of dataset name to the file its SMILES strings
            are read from.

    Returns:
        A mapping with the same keys, in the same order, whose values are the
        SMILES strings read for that dataset.  When a file is missing or
        contains no SMILES strings, the entry is a copy of the matching
        ``SAMPLE_DATASETS`` list, or an empty list if the name has no sample.
    """
    # A named logger is used so that calling this helper before logging has
    # been configured does not implicitly configure the root logger.
    logger = logging.getLogger(__name__)

    collected: Dict[str, List[str]] = {}
    for name, file_path in dataset_files.items():
        smiles = _read_smiles_file(file_path)
        if not smiles:
            # Copy the sample list so callers cannot modify SAMPLE_DATASETS
            # through the returned mapping.
            smiles = list(SAMPLE_DATASETS.get(name, []))
            # Make the substitution visible: otherwise a missing benchmark
            # file is indistinguishable from real data in the output.
            logger.warning(
                "%s: no SMILES read from %s; using %d built-in sample molecules",
                name,
                file_path,
                len(smiles),
            )
        collected[name] = smiles
    return collected


def _setup_logging(log_path: Path) -> None:
    log_path.parent.mkdir(parents=True, exist_ok=True)
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
        handlers=[
            logging.FileHandler(log_path, mode="w", encoding="utf-8"),
            logging.StreamHandler(),
        ],
    )


def collect_external_test_set_molecules(cfg: ExternalTestSetConfig) -> List[str]:
    """Gather, de-duplicate and pickle the SMILES strings of all datasets.

    Logging is (re)configured to write to ``cfg.log_file_path``, the SMILES
    strings of every dataset in ``cfg.dataset_files`` are loaded, and the
    sorted list of distinct strings is pickled to ``cfg.save_path``.  The
    parent directory of ``cfg.save_path`` is created if it does not exist.

    De-duplication compares the strings as written; two different spellings
    of the same molecule are both kept.

    Args:
        cfg: Paths and dataset mapping for this run.

    Returns:
        The sorted list of distinct SMILES strings that was written to disk.
    """
    # Convert once and reuse, so a plain-string ``save_path`` works for both
    # the directory creation and the final write.
    save_path = Path(cfg.save_path)
    save_path.parent.mkdir(parents=True, exist_ok=True)
    _setup_logging(Path(cfg.log_file_path))

    logging.info("Collecting external test set molecules from local files.")

    datasets = _collect_dataset_smiles(cfg.dataset_files)

    distinct = set()
    for name, smiles in datasets.items():
        logging.info("%s: %d molecules", name, len(smiles))
        distinct.update(smiles)

    # Sorting makes the pickled output independent of set iteration order.
    unique_smiles = sorted(distinct)
    logging.info("Total unique molecules collected: %d", len(unique_smiles))

    with save_path.open("wb") as handle:
        pickle.dump(unique_smiles, handle)

    return unique_smiles


if __name__ == "__main__":
    # Script entry point: run the collection with every setting at its default.
    default_cfg = ExternalTestSetConfig()
    collect_external_test_set_molecules(default_cfg)
