from pathlib import Path
from typing import List
import pandas as pd
import os

# Fixed seed constant kept for deterministic behavior.
# NOTE(review): nothing in the visible code reads SEED — confirm whether it is
# consumed elsewhere or can be dropped.
SEED = 0


def _read_source_csv(path: Path) -> pd.DataFrame:
    if not path.exists():
        raise FileNotFoundError(f"Missing input file: {path}")
    # Read as strings; pandas handles embedded newlines in quoted fields
    df = pd.read_csv(path, dtype=str)
    # Normalize columns
    df.columns = [c.strip().lower() for c in df.columns]
    if not ("article" in df.columns and "abstract" in df.columns):
        raise AssertionError(
            f"Expected columns ['article','abstract'] in {path}, got {df.columns.tolist()}"
        )
    # Normalize content
    for c in ["article", "abstract"]:
        df[c] = df[c].fillna("").astype(str)
        df[c] = df[c].apply(lambda x: x.replace("\r", " ").replace("\n", " ").strip())
    # Drop rows that are entirely empty
    df = df[~((df["article"].str.len() == 0) & (df["abstract"].str.len() == 0))].copy()
    if len(df) == 0:
        raise AssertionError(f"No non-empty rows in {path}")
    return df


def _make_ids(prefix: str, n: int) -> List[str]:
    return [f"{prefix}{i:08d}" for i in range(n)]


def _ensure(condition, message: str = "") -> None:
    """Raise ``AssertionError(message)`` when ``condition`` is falsy.

    Used instead of bare ``assert`` statements so that the output checks in
    ``prepare`` still run when Python is started with ``-O``.
    """
    if not condition:
        raise AssertionError(message)


def prepare(raw: Path, public: Path, private: Path):
    """
    Complete data preparation process.

    Train and validation splits are merged into a single public training set,
    the test split is divided into a public part (articles) and a private part
    (reference abstracts), and every written file is re-read and validated.

    Inputs:
    - raw: directory containing source CSV files train.csv, validation.csv, test.csv
    - public: target directory for files visible to participants
    - private: target directory for hidden files (e.g., test_answer.csv)

    Outputs (exact locations):
    - public/train.csv           (id, article, abstract)
    - public/test.csv            (id, article)
    - public/sample_submission.csv (id, abstract)
    - public/description.txt     (copied from description.txt next to this script, if present)
    - private/test_answer.csv    (id, abstract)

    Raises:
    - FileNotFoundError / AssertionError from _read_source_csv for a missing or
      unusable source file.
    - AssertionError if a written file is missing or empty, has unexpected
      columns, has non-unique or inconsistent ids, or contains an empty
      article/abstract.
    """
    text_cols = ["article", "abstract"]

    public.mkdir(parents=True, exist_ok=True)
    private.mkdir(parents=True, exist_ok=True)

    # Read source
    train_df = _read_source_csv(raw / "train.csv")
    valid_df = _read_source_csv(raw / "validation.csv")
    test_df = _read_source_csv(raw / "test.csv")

    # Combine train + validation. Only the two text columns are carried over so
    # that a pre-existing "id" (or any other extra) column in the source cannot
    # clash with the ids inserted below. ignore_index=True already produces a
    # clean 0..n-1 index.
    comb = pd.concat([train_df[text_cols], valid_df[text_cols]], ignore_index=True)
    comb.insert(0, "id", _make_ids("TRN", len(comb)))

    test_df = test_df[text_cols].reset_index(drop=True)
    test_df.insert(0, "id", _make_ids("TST", len(test_df)))

    train_path = public / "train.csv"
    test_path = public / "test.csv"
    sample_path = public / "sample_submission.csv"
    answer_path = private / "test_answer.csv"

    # Write outputs
    comb[["id", "article", "abstract"]].to_csv(train_path, index=False)
    test_df[["id", "article"]].to_csv(test_path, index=False)
    test_df[["id", "abstract"]].to_csv(answer_path, index=False)

    # Sample submission with deterministic placeholder abstracts
    sample = test_df[["id"]].copy()
    sample["abstract"] = sample["id"].apply(lambda x: f"predicted summary for {x.lower()}")
    sample[["id", "abstract"]].to_csv(sample_path, index=False)

    # Copy description.txt to public if present next to this script
    repo_description = Path(__file__).resolve().parent / "description.txt"
    if repo_description.exists():
        (public / "description.txt").write_text(repo_description.read_text(encoding="utf-8"), encoding="utf-8")

    # Checks: every output exists and is non-empty
    for out_path in (train_path, test_path, sample_path, answer_path):
        _ensure(
            out_path.exists() and out_path.stat().st_size > 0,
            f"{out_path} is missing or empty",
        )

    # Validate schema. keep_default_na=False keeps empty cells as "" instead of
    # NaN; with NaN the non-empty comparisons below would always succeed
    # (NaN != "" is True) and an empty field could never be detected.
    tr = pd.read_csv(train_path, dtype=str, keep_default_na=False)
    te = pd.read_csv(test_path, dtype=str, keep_default_na=False)
    ans = pd.read_csv(answer_path, dtype=str, keep_default_na=False)
    sub = pd.read_csv(sample_path, dtype=str, keep_default_na=False)

    _ensure(tr.columns.tolist() == ["id", "article", "abstract"], "train.csv columns must be ['id','article','abstract']")
    _ensure(te.columns.tolist() == ["id", "article"], "test.csv columns must be ['id','article']")
    _ensure(ans.columns.tolist() == ["id", "abstract"], "test_answer.csv columns must be ['id','abstract']")
    _ensure(sub.columns.tolist() == ["id", "abstract"], "sample_submission.csv columns must be ['id','abstract']")

    # ID integrity
    _ensure(tr["id"].is_unique, "train.csv ids must be unique")
    _ensure(
        te["id"].is_unique and ans["id"].is_unique and sub["id"].is_unique,
        "test.csv, test_answer.csv and sample_submission.csv ids must be unique",
    )
    _ensure(
        set(te["id"]) == set(ans["id"]) == set(sub["id"]),
        "test.csv, test_answer.csv and sample_submission.csv must share the same ids",
    )
    _ensure(set(tr["id"]).isdisjoint(set(te["id"])), "train and test ids must not overlap")

    # Non-empty fields
    _ensure(
        (tr["article"].str.strip() != "").all() and (tr["abstract"].str.strip() != "").all(),
        "train.csv contains an empty article or abstract",
    )
    _ensure(
        (te["article"].str.strip() != "").all() and (ans["abstract"].str.strip() != "").all(),
        "test.csv contains an empty article or test_answer.csv an empty abstract",
    )

    # Optional: ensure description copied
    if repo_description.exists():
        _ensure((public / "description.txt").exists(), "public/description.txt should exist")
