"""
Build processed CSV (inaturalist_cost_table.csv) from Gemini labels.

Converts Gemini's 7-point scale to the cost-sensitive framework:
- delta_signed = 4 - gemini_score
  - Score 1 (natural) → delta = +3
  - Score 7 (controlled) → delta = -3
- y_star = 1 if delta_signed >= 0 else 0 (natural=1, controlled=0)
  - Ties at score=4 (delta=0) go to positive class (natural)
- abs_delta = |delta_signed|

Usage: python -m src.data.preprocess_inaturalist
"""
import json
import pandas as pd
from pathlib import Path

# Input: JSON file of Gemini labels loaded by main() (relative to the current
# working directory).
LABELS_FILE = Path("data/inaturalist/gemini_labels.json")
# Output: path of the processed cost-table CSV written by main().
OUT_CSV = "data/inaturalist_cost_table.csv"


# Column order of the generated CSV. Passing it explicitly to the DataFrame
# guarantees a stable header even when there are no label records at all.
_CSV_COLUMNS = (
    "observation_id",
    "image_path",
    "gemini_score",
    "delta_signed",
    "abs_delta",
    "y_star",
    "api_captive",
)


def _label_to_row(label):
    """Convert one Gemini label record into a cost-table row.

    Args:
        label: Mapping with the keys ``observation_id``, ``image_path`` and
            ``gemini_score``; ``api_captive`` is optional.

    Returns:
        A dict keyed by the names in ``_CSV_COLUMNS``.

    Raises:
        KeyError: If a required key is missing from ``label``.
    """
    gemini_score = label["gemini_score"]

    # Score 1 (natural) -> +3, score 7 (controlled) -> -3.
    delta_signed = 4 - gemini_score

    return {
        "observation_id": label["observation_id"],
        "image_path": label["image_path"],
        "gemini_score": gemini_score,
        "delta_signed": delta_signed,
        "abs_delta": abs(delta_signed),
        # Natural (delta >= 0) = 1, controlled (delta < 0) = 0.
        # Ties at score=4 (delta=0) go to the positive class (natural).
        "y_star": 1 if delta_signed >= 0 else 0,
        # Validation field (not used for training).
        "api_captive": label.get("api_captive"),
    }


def main(labels_file=None, out_csv=None):
    """Build the cost-table CSV from the Gemini label JSON file.

    Each label is turned into a row holding ``delta_signed = 4 - gemini_score``,
    its absolute value and the binary target ``y_star``; the rows are written
    to a CSV and a short summary is printed.

    Args:
        labels_file: Path of the JSON label file. Defaults to ``LABELS_FILE``.
        out_csv: Path of the CSV to write. Defaults to ``OUT_CSV``.

    Raises:
        FileNotFoundError: If the label file does not exist.
    """
    # Module constants are resolved lazily so a caller that passes both paths
    # does not depend on them.
    labels_path = Path(LABELS_FILE if labels_file is None else labels_file)
    out_target = OUT_CSV if out_csv is None else out_csv

    print(f"Loading {labels_path}...")

    if not labels_path.exists():
        raise FileNotFoundError(
            f"Missing {labels_path}\n"
            "Run: python scripts/label_inaturalist_gemini.py"
        )

    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open(labels_path, "r", encoding="utf-8") as f:
        labels = json.load(f)

    print(f"  Loaded {len(labels):,} labels")

    rows = [_label_to_row(label) for label in labels]

    # Explicit columns keep the schema (and the CSV header) intact for an
    # empty label list, which would otherwise yield a column-less frame.
    df = pd.DataFrame(rows, columns=list(_CSV_COLUMNS))

    # Save (creating the destination directory if it is missing).
    Path(out_target).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(out_target, index=False)

    print(f"\n✓ Wrote {out_target}")
    print(f"  Rows: {len(df):,}")

    if df.empty:
        # No rows: the summary statistics below are undefined.
        return

    # Count ties for reporting.
    ties_count = (df["gemini_score"] == 4).sum()
    if ties_count > 0:
        print(f"  Ties (score=4, assigned to natural): {ties_count}")
    print(f"  Mean |Δ|: {df['abs_delta'].mean():.2f}")
    print(f"  Max |Δ|: {df['abs_delta'].max():.0f}")
    print(f"  Positive class (natural): {df['y_star'].mean():.1%}")

# Run the conversion only when executed as a script (e.g. via `python -m`),
# not when the module is imported.
if __name__ == "__main__":
    main()
