"""
Build the processed CSV (data/jigsaw_cost_table.csv) from the raw Jigsaw train.csv.
Usage: python -m src.data.preprocess_jigsaw
"""
import pandas as pd
import numpy as np
import csv

# Input: raw training CSV read by main(). Relative path, resolved against the
# current working directory.
RAW = "data/jigsaw/train.csv"
# Output: processed per-comment table written by main(). Relative path as well.
OUT = "data/jigsaw_cost_table.csv"


def _add_vote_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with vote counts, signed margin and majority label.

    Expects the columns ``target`` and ``toxicity_annotator_count`` with no
    missing values. Adds:

    * ``n_yes`` -- ``round(target * toxicity_annotator_count)`` as int
    * ``n_no`` -- ``toxicity_annotator_count - n_yes`` as int
    * ``delta_signed`` -- ``log((n_yes + 1) / (n_no + 1))``
    * ``abs_delta`` -- ``|delta_signed|``
    * ``y_star`` -- 1 when ``delta_signed >= 0`` (ties count as 1), else 0
    """
    # Work on a copy so assigning columns never writes into a filtered slice
    # of the caller's frame (avoids SettingWithCopyWarning).
    out = df.copy()

    # The count column is read as float whenever the raw file had missing
    # counts; cast it so n_no is an integer like n_yes.
    count = out["toxicity_annotator_count"].astype(int)

    out["n_yes"] = np.round(out["target"] * count).astype(int)
    out["n_no"] = count - out["n_yes"]
    # +1 on both sides keeps the ratio finite when either count is zero.
    out["delta_signed"] = np.log((out["n_yes"] + 1) / (out["n_no"] + 1))
    out["abs_delta"] = out["delta_signed"].abs()
    out["y_star"] = (out["delta_signed"] >= 0).astype(int)
    return out


def main(raw_path=None, out_path=None) -> None:
    """Build the processed vote table from the raw CSV and print a summary.

    Reads ``comment_text``, ``target`` and ``toxicity_annotator_count`` from
    the raw file, drops rows where the target, the annotator count or the
    comment text is missing (or the text is whitespace-only), derives the
    vote columns and writes the result with every field quoted.

    Args:
        raw_path: Path of the raw CSV. Defaults to the module constant ``RAW``.
        out_path: Path of the CSV to write. Defaults to the module constant
            ``OUT``.
    """
    raw_path = RAW if raw_path is None else raw_path
    out_path = OUT if out_path is None else out_path

    print(f"Loading {raw_path}...")
    usecols = ["comment_text", "target", "toxicity_annotator_count"]
    # A missing target cannot be turned into vote counts (the int cast would
    # raise), so drop it together with missing annotator counts.
    df = pd.read_csv(raw_path, usecols=usecols).dropna(
        subset=["target", "toxicity_annotator_count"]
    )

    print(f"  Loaded {len(df):,} rows")

    # Filter empty text. read_csv turns empty fields into NaN, and NaN != ""
    # is True, so missing text has to be excluded explicitly.
    text = df["comment_text"]
    has_text = text.notna() & (text.astype(str).str.strip() != "")
    df = df[has_text]
    print(f"  After filtering empty text: {len(df):,} rows")

    # Compute aggregated votes
    df = _add_vote_columns(df)

    # Save
    df[["comment_text", "n_yes", "n_no", "delta_signed", "abs_delta", "y_star"]].to_csv(
        out_path, index=False, quoting=csv.QUOTE_ALL
    )

    print(f"\n✓ Wrote {out_path}")
    print(f"  Rows: {len(df):,}")
    print(f"  Mean annotations per example: {df['toxicity_annotator_count'].mean():.1f}")
    print(f"  Mean |Δ|: {df['abs_delta'].mean():.3f}")
    print(f"  Max |Δ|: {df['abs_delta'].max():.3f}")
    print(f"  Positive class (toxic): {df['y_star'].mean():.1%}")


# Run the preprocessing only when executed as a script
# (e.g. `python -m src.data.preprocess_jigsaw`), not when imported.
if __name__ == "__main__":
    main()

