import pandas as pd
from sklearn.metrics import cohen_kappa_score, accuracy_score
from statsmodels.stats.inter_rater import fleiss_kappa

# Read the MTurk annotation export into a DataFrame; every metric below is
# computed from this table.
df = pd.read_csv(filepath_or_buffer="data/mturk_output.csv")

# (1) Cohen's Kappa: Majority vote of humans vs. LLM result
def majority_vote(row):
    """Return the label chosen most often by the three human annotators.

    Votes are counted rather than averaged and rounded: rounding the mean
    only equals the majority label when the labels are exactly 0 and 1
    (e.g. votes 0, 0, 2 would round to 1, a label nobody chose). Counting
    gives the same answer for binary labels and stays correct otherwise.

    Args:
        row: Mapping-like object (for example a DataFrame row) indexable by
            'Annotator 1', 'Annotator 2' and 'Annotator 3'.

    Returns:
        The most frequent of the three votes. When all three votes differ,
        the smallest label is returned so the result is deterministic.

    Raises:
        ValueError: If any of the three votes is NaN.
    """
    votes = [row['Annotator 1'], row['Annotator 2'], row['Annotator 3']]

    # NaN is the only value that compares unequal to itself; refuse to vote
    # over a missing annotation instead of silently counting it.
    if any(vote != vote for vote in votes):
        raise ValueError("cannot take a majority vote over missing annotations")

    # Sorting the distinct labels first makes tie-breaking deterministic:
    # max() keeps the first (smallest) label among those with the top count.
    return max(sorted(set(votes)), key=votes.count)

# Collapse the three human votes of every row into one label, then measure
# chance-corrected agreement between that label and the 'Result' column.
annotator_votes = df[['Annotator 1', 'Annotator 2', 'Annotator 3']]
df['majority_vote'] = annotator_votes.apply(majority_vote, axis=1)
kappa_majority_vs_llm = cohen_kappa_score(
    y1=df['Result'],
    y2=df['majority_vote'],
)

# (2) Fleiss' Kappa: agreement among the 3 human annotators
human_annotations = df[['Annotator 1', 'Annotator 2', 'Annotator 3']].to_numpy()
# Build the per-example count table expected by fleiss_kappa: one row per
# example holding [number of 0 votes, number of 1 votes]. Only the labels
# 0 and 1 are counted.
fleiss_human_input = [
    [sum(ratings == label) for label in (0, 1)]
    for ratings in human_annotations
]
fleiss_kappa_humans = fleiss_kappa(fleiss_human_input)

# (3) Fleiss' Kappa: the 3 human annotators plus the LLM ('Result') as a
# fourth rater
all_annotations = df[['Annotator 1', 'Annotator 2', 'Annotator 3', 'Result']].to_numpy()
# Same count-table layout as for the humans-only case: one row per example
# holding [number of 0 votes, number of 1 votes] across the four raters.
fleiss_all_input = [
    [sum(ratings == label) for label in (0, 1)]
    for ratings in all_annotations
]
fleiss_kappa_humans_llm = fleiss_kappa(fleiss_all_input)

# (4) Accuracy: the human majority vote is treated as the reference label
# and the LLM 'Result' column as the prediction.
accuracy_llm = accuracy_score(
    y_true=df['majority_vote'],
    y_pred=df['Result'],
)

# Print every metric on its own line as "<label> <value>".
for label, score in (
    ("Cohen's Kappa (Majority Vote vs. LLM):", kappa_majority_vs_llm),
    ("Fleiss' Kappa (3 Humans):", fleiss_kappa_humans),
    ("Fleiss' Kappa (3 Humans + LLM):", fleiss_kappa_humans_llm),
    ("Accuracy (Majority Vote vs. LLM):", accuracy_llm),
):
    print(label, score)
