import pandas as pd
import numpy as np
from scipy.stats import mannwhitneyu

# Read both experiment exports and stack them into one frame with a fresh index.
df = pd.read_csv('experiment_22.csv')
df2 = pd.read_csv('experiment_3.csv')
df = pd.concat([df, df2], ignore_index=True)

# Textual answers mapped onto the 1-7 scale (1 = A much more harmful,
# 4 = equal, 7 = B much more harmful). Each label needs its own point and the
# A/B sides must mirror each other around 4, otherwise the 8 - score reversal
# below is not symmetric: "A somewhat" is 3, the mirror of "B somewhat" (5).
text_to_score = {
    "Option A is much more harmful": 1,
    "Option A is moderately more harmful": 2,
    "Option A is somewhat more harmful": 3,
    "Equally harmful or unharmful": 4,
    "Option B is somewhat more harmful": 5,
    "Option B is moderately more harmful": 6,
    "Option B is much more harmful": 7,
}

# String answers go through the label lookup (unknown strings are kept as-is);
# non-string values pass through unchanged. Whatever is still not numeric
# afterwards is coerced to NaN.
df['scale_numeric'] = df['scale'].apply(
    lambda x: text_to_score.get(x, x) if isinstance(x, str) else x
)
df['scale_numeric'] = pd.to_numeric(df['scale_numeric'], errors='coerce')

# Apply the score transformation based on the option_swapped column:
# if option_swapped is False keep the original score, otherwise use 8 - score
# (reverses the 1-7 scale).
df['adjusted_score'] = np.where(
    df['option_swapped'].eq(False),
    df['scale_numeric'],
    8 - df['scale_numeric'],
)

# Mean and standard deviation of the adjusted score for every
# (participant_id, item_type) pair.
per_participant = df.groupby(['participant_id', 'item_type'])['adjusted_score']
participant_stats = per_participant.agg(avg_score='mean', std_score='std').reset_index()

# Lift the pandas display limits temporarily so the whole table is printed.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print("Average scores and standard deviations by participant and item type:")
    print(participant_stats)
    print("\n" + "=" * 50 + "\n")

# Mean adjusted score per item_type, returned as a two-column table
# (item_type, overall_avg_score).
overall_avg = (
    df.groupby('item_type')['adjusted_score']
    .mean()
    .reset_index(name='overall_avg_score')
)

print("Overall average scores by item type:")
print(overall_avg)
print("\n" + "=" * 50 + "\n")

# Mean adjusted score for every (item_type, harm_type) combination.
harm_groups = df.groupby(['item_type', 'harm_type'])['adjusted_score']
harm_type_avg = harm_groups.mean().reset_index(name='avg_score')

print("Average scores by item type and harm type:")
print(harm_type_avg)
print("\n" + "=" * 50 + "\n")

# Mean adjusted score for every (item_type, topic) combination.
topic_avg = (
    df.groupby(['item_type', 'topic'])['adjusted_score']
    .mean()
    .rename('avg_score')
    .reset_index()
)

print("Average scores by item type and topic:")
print(topic_avg)
print("\n" + "=" * 50 + "\n")


# Calculate average by item_type and model_type.
# Kept under its own name so the per-topic table computed earlier in the
# script is not overwritten by the model_type results.
model_type_avg = df.groupby(['item_type', 'model_type'])['adjusted_score'].mean().reset_index()
model_type_avg = model_type_avg.rename(columns={'adjusted_score': 'avg_score'})

print("Average scores by item type and model type:")
print(model_type_avg)
print("\n" + "="*50 + "\n")

# Mean adjusted score for every (item_type, benefit_reason) combination.
benefit_groups = df.groupby(['item_type', 'benefit_reason'])['adjusted_score']
benefit_reason_avg = benefit_groups.mean().reset_index(name='avg_score')

print("Average scores by item type and benefit reason:")
print(benefit_reason_avg)
print("\n" + "=" * 50 + "\n")


# Display overall statistics: mean, standard deviation and non-missing count
# of the adjusted score for each item_type, in order of first appearance.
print("Overall statistics:")
for item_type in df['item_type'].unique():
    subset = df[df['item_type'] == item_type]
    scores = subset['adjusted_score']
    avg_score = scores.mean()
    count = scores.count()  # count() ignores NaN scores
    # The standard deviation is rounded to two decimals, matching the
    # per-category breakdowns printed further down.
    print(f"{item_type}: Average = {avg_score:.3f}+-{scores.std():.2f}, Count = {count}")

print("\n" + "="*50 + "\n")

# Per-harm_type mean, standard deviation and non-missing count of the
# adjusted score, listed under each item_type.
# NOTE: a missing (NaN) harm_type never satisfies `==`, so such a category is
# listed with Count = 0.
print("Detailed statistics by harm_type within each item_type:")
for item_type in df['item_type'].unique():
    print(f"\n{item_type}:")
    rows = df.loc[df['item_type'] == item_type]
    for harm_type in rows['harm_type'].unique():
        scores = rows.loc[rows['harm_type'] == harm_type, 'adjusted_score']
        mean_score, spread, n_scores = scores.mean(), scores.std(), scores.count()
        print(f"  {harm_type}: Average = {mean_score:.3f}+-{spread:.2f}, Count = {n_scores}")

print("\n" + "=" * 50 + "\n")

# Per-topic mean, standard deviation and non-missing count of the adjusted
# score, listed under each item_type.
# NOTE: a missing (NaN) topic never satisfies `==`, so such a category is
# listed with Count = 0.
print("Detailed statistics by topic within each item_type:")
for item_type in df['item_type'].unique():
    print(f"\n{item_type}:")
    rows = df.loc[df['item_type'] == item_type]
    for topic in rows['topic'].unique():
        scores = rows.loc[rows['topic'] == topic, 'adjusted_score']
        mean_score, spread, n_scores = scores.mean(), scores.std(), scores.count()
        print(f"  {topic}: Average = {mean_score:.3f}+-{spread:.2f}, Count = {n_scores}")

print("\n" + "=" * 50 + "\n")


# Per-model_type mean, standard deviation and non-missing count of the
# adjusted score, listed under each item_type.
# NOTE: a missing (NaN) model_type never satisfies `==`, so such a category is
# listed with Count = 0.
print("Detailed statistics by model_type within each item_type:")
for item_type in df['item_type'].unique():
    print(f"\n{item_type}:")
    rows = df.loc[df['item_type'] == item_type]
    for model_type in rows['model_type'].unique():
        scores = rows.loc[rows['model_type'] == model_type, 'adjusted_score']
        mean_score, spread, n_scores = scores.mean(), scores.std(), scores.count()
        print(f"  {model_type}: Average = {mean_score:.3f}+-{spread:.2f}, Count = {n_scores}")

print("\n" + "=" * 50 + "\n")

# Per-benefit_reason mean, standard deviation and non-missing count of the
# adjusted score, listed under each item_type.
# NOTE: a missing (NaN) benefit_reason never satisfies `==`, so such a
# category is listed with Count = 0.
print("Detailed statistics by benefit_reason within each item_type:")
for item_type in df['item_type'].unique():
    print(f"\n{item_type}:")
    rows = df.loc[df['item_type'] == item_type]
    for benefit_reason in rows['benefit_reason'].unique():
        scores = rows.loc[rows['benefit_reason'] == benefit_reason, 'adjusted_score']
        mean_score, spread, n_scores = scores.mean(), scores.std(), scores.count()
        print(f"  {benefit_reason}: Average = {mean_score:.3f}+-{spread:.2f}, Count = {n_scores}")

print("\n" + "=" * 70 + "\n")

# Section header for the Mann-Whitney U comparisons that follow.
print("Mann-Whitney U Test Results:")
print("=" * 40)

# Quick sanity report on the data before any test is run: row count, number
# of missing adjusted scores, the item_type values present, and score range.
all_scores = df['adjusted_score']
n_missing = all_scores.isna().sum()
lowest, highest = all_scores.min(), all_scores.max()

print("Data Quality Check:")
print(f"Total rows: {len(df)}")
print(f"Missing values in adjusted_score: {n_missing}")
print(f"Unique item_types: {df['item_type'].unique()}")
print(f"Score range: {lowest:.3f} to {highest:.3f}")

# Test 1: Treatment vs Control
# Missing scores are dropped from both samples before testing.
is_treatment = df['item_type'] == 'treatment'
is_control = df['item_type'] == 'control'
treatment_scores = df.loc[is_treatment, 'adjusted_score'].dropna()
control_scores = df.loc[is_control, 'adjusted_score'].dropna()

# Debug summary for each group: size before/after dropping NaN, number of
# distinct values, and the first few scores.
for label, mask, scores in (
    ("Treatment", is_treatment, treatment_scores),
    ("Control", is_control, control_scores),
):
    print(f"\n{label} group debug:")
    print(f"  Raw count: {len(df[mask])}")
    print(f"  After dropna: {len(scores)}")
    print(f"  Unique values: {scores.nunique()}")
    print(f"  Sample values: {scores.head().tolist()}")

if treatment_scores.empty or control_scores.empty:
    print("Treatment vs Control: Insufficient data for comparison")
else:
    try:
        statistic, p_value = mannwhitneyu(treatment_scores, control_scores, alternative='two-sided')
        verdict = 'Yes' if p_value < 0.05 else 'No'
        print("\nTreatment vs Control:")
        print(f"  Treatment: n = {len(treatment_scores)}, median = {treatment_scores.median():.3f}, mean = {treatment_scores.mean():.3f}")
        print(f"  Control: n = {len(control_scores)}, median = {control_scores.median():.3f}, mean = {control_scores.mean():.3f}")
        print(f"  U-statistic = {statistic:.3f}")
        print(f"  p-value = {p_value:.6f}")
        print(f"  Significant at α = 0.05: {verdict}")
    except Exception as e:
        # Best-effort: report the failure and let the rest of the script run.
        print(f"\nError in Treatment vs Control test: {e}")
        print("This might indicate identical distributions or other data issues")

# Test 2: Realism Treatment vs Realism Control
# Missing scores are dropped first, as in Test 1. mannwhitneyu does not
# exclude NaN by default, so a single missing score would otherwise distort
# the result, and NaN rows would also inflate the reported n and let the
# guard below pass on samples that contain no usable data.
realism_treatment_scores = df[df['item_type'] == 'realism_treatment']['adjusted_score'].dropna()
realism_control_scores = df[df['item_type'] == 'realism_control']['adjusted_score'].dropna()

if len(realism_treatment_scores) > 0 and len(realism_control_scores) > 0:
    statistic, p_value = mannwhitneyu(realism_treatment_scores, realism_control_scores, alternative='two-sided')
    print("\nRealism Treatment vs Realism Control:")
    print(f"  Realism Treatment: n = {len(realism_treatment_scores)}, median = {realism_treatment_scores.median():.3f}, mean = {realism_treatment_scores.mean():.3f}")
    print(f"  Realism Control: n = {len(realism_control_scores)}, median = {realism_control_scores.median():.3f}, mean = {realism_control_scores.mean():.3f}")
    print(f"  U-statistic = {statistic:.3f}")
    print(f"  p-value = {p_value:.6f}")
    print(f"  Significant at α = 0.05: {'Yes' if p_value < 0.05 else 'No'}")
else:
    print("Realism Treatment vs Realism Control: Insufficient data for comparison")

# Test 3: All Treatment-type vs All Control-type (combining regular and realism)
# Missing scores are dropped first, as in Test 1. mannwhitneyu does not
# exclude NaN by default, so a single missing score would otherwise distort
# the result, and NaN rows would also inflate the reported n and let the
# guard below pass on samples that contain no usable data.
all_treatment_scores = df[df['item_type'].isin(['treatment', 'realism_treatment'])]['adjusted_score'].dropna()
all_control_scores = df[df['item_type'].isin(['control', 'realism_control'])]['adjusted_score'].dropna()

if len(all_treatment_scores) > 0 and len(all_control_scores) > 0:
    statistic, p_value = mannwhitneyu(all_treatment_scores, all_control_scores, alternative='two-sided')
    print("\nAll Treatment vs All Control (combined):")
    print(f"  All Treatment: n = {len(all_treatment_scores)}, median = {all_treatment_scores.median():.3f}, mean = {all_treatment_scores.mean():.3f}")
    print(f"  All Control: n = {len(all_control_scores)}, median = {all_control_scores.median():.3f}, mean = {all_control_scores.mean():.3f}")
    print(f"  U-statistic = {statistic:.3f}")
    print(f"  p-value = {p_value:.6f}")
    print(f"  Significant at α = 0.05: {'Yes' if p_value < 0.05 else 'No'}")
else:
    print("All Treatment vs All Control: Insufficient data for comparison")

# Harm types and topics excluded from the filtered treatment analysis below.
non_harm_harm_type = ["Loneliness or isolation", "Loss of job", "Identity theft or financial fraud"]
not_harm_topics = ["Manufacturing and Industry", "Healthcare", "Legal and Justice Systems", "Finance and Banking", "Artificial Intelligence (AI)"]

print("="*70)
print("FILTERED TREATMENT ANALYSIS")
print("="*70)

# Filter treatment data to exclude specified harm types and topics
treatment_data = df[df['item_type'] == 'treatment']
filtered_treatment = treatment_data[
    (~treatment_data['harm_type'].isin(non_harm_harm_type)) &
    (~treatment_data['topic'].isin(not_harm_topics))
]
filtered_treatment_scores = filtered_treatment['adjusted_score'].dropna()

# Descriptive statistics are only meaningful when something survived the filter.
if len(filtered_treatment_scores) > 0:
    filtered_avg = filtered_treatment_scores.mean()
    filtered_std = filtered_treatment_scores.std()
    filtered_median = filtered_treatment_scores.median()

    print("\nFiltered Treatment Statistics:")
    print(f"  Count: {len(filtered_treatment_scores)}")
    print(f"  Mean: {filtered_avg:.3f}")
    print(f"  Standard Deviation: {filtered_std:.3f}")
    print(f"  Median: {filtered_median:.3f}")
    print(f"  Min: {filtered_treatment_scores.min():.3f}")
    print(f"  Max: {filtered_treatment_scores.max():.3f}")

# Compare against the (NaN-free) control sample computed for Test 1.
# mannwhitneyu cannot be run on an empty sample, so the comparison is guarded
# the same way as the other tests instead of letting the script crash.
if len(filtered_treatment_scores) > 0 and len(control_scores) > 0:
    statistic, p_value = mannwhitneyu(filtered_treatment_scores, control_scores, alternative='two-sided')

    print("Filtered Treatment vs Control:")
    print(
        f"  Filtered Treatment: n = {len(filtered_treatment_scores)}, median = {filtered_treatment_scores.median():.3f}, mean = {filtered_treatment_scores.mean():.3f}")
    print(
        f"  Control: n = {len(control_scores)}, median = {control_scores.median():.3f}, mean = {control_scores.mean():.3f}")
    print(f"  U-statistic = {statistic:.3f}")
    print(f"  p-value = {p_value:.6f}")
    print(f"  Significant at α = 0.05: {'Yes' if p_value < 0.05 else 'No'}")
    print(f"  Significant at α = 0.01: {'Yes' if p_value < 0.01 else 'No'}")
else:
    print("Filtered Treatment vs Control: Insufficient data for comparison")
