import sys
import pandas as pd

# Columns that must be present in the input CSV.
REQUIRED_COLS = ("question_id", "aligned", "coherent")
# Columns holding the numeric scores that get averaged.
SCORE_COLS = ["aligned", "coherent"]


def clean_scores(df):
    """Return a copy of *df* containing only rows with numeric scores.

    Both score columns are converted with ``pd.to_numeric(errors="coerce")``,
    so any cell that is not a number becomes NaN. Rows where either score is
    NaN are then dropped. This is applied unconditionally, so both averages
    are always computed over the same set of rows regardless of whether the
    file happens to contain unparseable cells.

    Args:
        df: Frame with at least the columns listed in ``REQUIRED_COLS``.

    Returns:
        A new DataFrame with float ``aligned``/``coherent`` columns and no
        missing scores. The input frame is not modified.

    Raises:
        ValueError: If any required column is missing.
    """
    missing = [col for col in REQUIRED_COLS if col not in df.columns]
    if missing:
        raise ValueError(
            f"CSV must contain the columns: {list(REQUIRED_COLS)} "
            f"(missing: {missing})"
        )

    cleaned = df.copy()
    for col in SCORE_COLS:
        cleaned[col] = pd.to_numeric(cleaned[col], errors="coerce")
    # Drop before casting so the float conversion never sees missing values.
    cleaned = cleaned.dropna(subset=SCORE_COLS)
    return cleaned.astype({col: float for col in SCORE_COLS})


def summarize_scores(scores):
    """Compute micro, macro and per-question averages of the score columns.

    Args:
        scores: Cleaned frame as returned by ``clean_scores``.

    Returns:
        A tuple ``(micro, macro, per_qid)`` where ``micro`` is the mean over
        all rows, ``per_qid`` is the per-``question_id`` mean table, and
        ``macro`` is the mean of the ``per_qid`` rows. ``micro`` and ``macro``
        are Series indexed by score column name.

    Raises:
        ValueError: If *scores* has no rows.
    """
    if scores.empty:
        raise ValueError(
            "No rows with numeric 'aligned' and 'coherent' values were found"
        )

    per_qid = scores.groupby("question_id")[SCORE_COLS].mean()
    micro = scores[SCORE_COLS].mean()
    macro = per_qid.mean()
    return micro, macro, per_qid


def main(argv=None):
    """Run the command-line tool and return a process exit status.

    Args:
        argv: Argument vector including the program name; defaults to
            ``sys.argv``.

    Returns:
        0 on success, 1 on a usage error or unusable input file.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) != 2:
        print("Usage: python statistics_from_csv.py [CSV_FILE]", file=sys.stderr)
        return 1

    csv_file = argv[1]
    try:
        df = pd.read_csv(csv_file)
    except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
        print(f"Could not read {csv_file}: {exc}", file=sys.stderr)
        return 1

    try:
        scores = clean_scores(df)
        micro, macro, per_qid = summarize_scores(scores)
    except ValueError as exc:
        print(exc, file=sys.stderr)
        return 1

    # Make the filtering visible instead of silently discarding data.
    dropped = len(df) - len(scores)
    if dropped:
        print(
            f"Note: ignored {dropped} row(s) with missing or non-numeric scores",
            file=sys.stderr,
        )

    print("Micro average aligned: {:.4f}".format(micro["aligned"]))
    print("Micro average coherent: {:.4f}".format(micro["coherent"]))
    print("Macro average aligned: {:.4f}".format(macro["aligned"]))
    print("Macro average coherent: {:.4f}".format(macro["coherent"]))

    print("\nPer-question_id averages:")
    print(per_qid.to_string())

    # (Optional) Standard deviation per-question_id, uncomment below to print
    # per_qid_std = scores.groupby("question_id")[SCORE_COLS].std()
    # print("\nPer-question_id standard deviations:")
    # print(per_qid_std.to_string())

    # (Optional) Output to CSV
    # per_qid.to_csv("per_question_averages.csv")
    return 0


if __name__ == "__main__":
    sys.exit(main())