#!/usr/bin/env python
import argparse
import pandas as pd


def process_one(path: str, score_col: str) -> None:
    """Print summary statistics for the judge scores in one CSV file.

    Reads ``path`` with pandas, coerces ``score_col`` to numeric, and prints
    the mean score plus the count and percentage of rows in three buckets:

    * Accurate: score >= 4
    * Uncertain: 3 <= score < 4
    * Hallucinated: score < 3

    Rows whose score is missing or cannot be parsed as a number are excluded
    from the buckets and from the percentage denominator, and are reported on
    a separate line, so they are not silently counted as "Uncertain".

    Args:
        path: Path of the CSV file to read.
        score_col: Name of the column holding the scores.

    Raises:
        ValueError: If ``score_col`` is not a column of the CSV.
    """
    df = pd.read_csv(path)

    if score_col not in df.columns:
        raise ValueError(f"[{path}] Column '{score_col}' not found. Available: {df.columns.tolist()}")

    total = len(df)
    print("=" * 80)
    print(f"File: {path}")
    print(f"Total examples: {total}")

    # Unparseable cells become NaN; drop them so that every statistic below
    # (mean, counts, rates) is computed over the same set of rows.
    scores = pd.to_numeric(df[score_col], errors="coerce").dropna()
    n_invalid = total - len(scores)
    if n_invalid:
        print(f"Rows with missing/non-numeric '{score_col}' (excluded): {n_invalid}")

    print("\nMean GPT-judge ratings (1-5):")
    print(f"{score_col}: {scores.mean():.4f}")

    acc_mask = scores >= 4
    hall_mask = scores < 3
    # With NaN removed, "neither accurate nor hallucinated" is exactly 3 <= score < 4.
    unc_mask = ~(acc_mask | hall_mask)

    # Mean of a boolean mask is the fraction of True values (NaN if no valid rows).
    acc_rate = acc_mask.mean() * 100.0
    hall_rate = hall_mask.mean() * 100.0
    unc_rate = unc_mask.mean() * 100.0

    print("\nScore-based rates:")
    print(f"Accurate: {int(acc_mask.sum())} ({acc_rate:.2f}%)")
    print(f"Uncertain: {int(unc_mask.sum())} ({unc_rate:.2f}%)")
    print(f"Hallucinated: {int(hall_mask.sum())} ({hall_rate:.2f}%)")


def main():
    """Parse command-line arguments and summarize each input CSV in turn.

    Requires ``--input_csv`` (one or more paths) and ``--score_col`` (the
    score column name); every path is passed to ``process_one`` in the order
    given on the command line.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input_csv",
        nargs="+",
        required=True,
        help="One or more CSVs with GPT-as-judge scores per row.",
    )
    parser.add_argument(
        "--score_col",
        required=True,
        help="Score column name in the CSV.",
    )
    cli = parser.parse_args()

    column = cli.score_col
    for csv_path in cli.input_csv:
        process_one(csv_path, column)


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
