import numpy as np
import pandas as pd
from itertools import combinations


def compute_cohens_d(scores_a, scores_b):
    """Return the standardized mean of the paired differences ``a - b``.

    The element-wise difference ``scores_a - scores_b`` is averaged and
    divided by its sample standard deviation (``ddof=1``).  The sign follows
    ``scores_a - scores_b``: positive when A is numerically larger on average.

    Args:
        scores_a: Per-item scores of the first system (array-like supporting
            element-wise subtraction, e.g. a pandas Series or NumPy array).
        scores_b: Per-item scores of the second system, paired with
            ``scores_a``.

    Returns:
        The effect size, or ``0.0`` when the standard deviation of the
        differences is below ``1e-10`` (differences are essentially constant).
    """
    paired_delta = scores_a - scores_b
    spread = paired_delta.std(ddof=1)
    # Keep the test as "spread < eps": a NaN spread must fall through to the
    # division (yielding NaN) instead of being reported as 0.0.
    return 0.0 if spread < 1e-10 else paired_delta.mean() / spread


def compute_win_rate(scores_a, scores_b):
    """Return the fractions of paired items on which A is above, below, or equal to B.

    Args:
        scores_a: Per-item scores of the first system (supports element-wise
            comparison, e.g. a pandas Series or NumPy array).
        scores_b: Per-item scores of the second system, paired with
            ``scores_a``.

    Returns:
        A 3-tuple ``(win, loss, tie)`` where ``win`` is the share of items
        with ``scores_a > scores_b``, ``loss`` the share with
        ``scores_a < scores_b`` and ``tie`` the share with equal scores.
        Every share uses ``len(scores_a)`` as its denominator.
    """
    total = len(scores_a)
    tallies = (
        (scores_a > scores_b).sum(),
        (scores_a < scores_b).sum(),
        (scores_a == scores_b).sum(),
    )
    win_share, loss_share, tie_share = (count / total for count in tallies)
    return win_share, loss_share, tie_share


def _suffix_nanmeans(values):
    """Return ``m`` where ``m[i]`` is the NaN-skipping mean of ``values[i:]``."""
    valid = ~np.isnan(values)
    # Reverse cumulative sums give the totals/counts of every suffix in one pass.
    tail_sums = np.cumsum(np.where(valid, values, 0.0)[::-1])[::-1]
    tail_counts = np.cumsum(valid[::-1])[::-1]
    with np.errstate(invalid='ignore', divide='ignore'):
        # A suffix with no valid value has count 0 and yields NaN.
        return tail_sums / tail_counts


def compute_breakdown_point(scores_a, scores_b, direction):
    """Return the share of items to drop before A's mean stops beating B's.

    Items are removed one at a time, starting with the item on which A has
    the largest advantage over B.  After each removal the means of the
    remaining items are compared; the function returns ``i / k`` for the
    first ``i`` removals after which A is no longer strictly ahead, where
    ``k`` is the number of paired items.

    Items are paired and removed *by position*, so repeated index labels no
    longer pull extra rows into the comparison.  When both inputs are pandas
    Series with differing indexes, ``scores_b`` is first aligned to the
    labels of ``scores_a``.

    Args:
        scores_a: Per-item scores of the system currently ahead (pandas
            Series or any 1-D array-like of numbers).
        scores_b: Per-item scores of the other system, paired with
            ``scores_a``.
        direction: ``'higher_better'`` if larger scores are better; any other
            value is treated as lower-is-better.

    Returns:
        A float in ``(0, 1]``.  ``1.0`` means A stays ahead until no items
        are left (also returned for empty input).
    """
    if isinstance(scores_a, pd.Series) and isinstance(scores_b, pd.Series):
        if not scores_a.index.equals(scores_b.index):
            # Keep label alignment for Series that list the same items in a
            # different order, then continue positionally.
            scores_b = scores_b.reindex(scores_a.index)

    a = np.asarray(scores_a, dtype=float)
    b = np.asarray(scores_b, dtype=float)
    n_items = len(a)
    if n_items == 0:
        return 1.0

    higher_better = direction == 'higher_better'
    advantage = a - b if higher_better else b - a

    # Largest advantage first; NaN advantages sort to the end, so they are
    # the last items to be removed.
    order = np.argsort(-advantage, kind='stable')

    # remaining_*[i] is the mean over what is left after removing i items.
    remaining_a = _suffix_nanmeans(a[order])
    remaining_b = _suffix_nanmeans(b[order])

    with np.errstate(invalid='ignore'):
        if higher_better:
            lead_lost = remaining_a <= remaining_b
        else:
            lead_lost = remaining_a >= remaining_b

    # Only 1..k-1 removals leave something to compare; removing all k items
    # falls through to the 1.0 below.
    flips = np.flatnonzero(lead_lost[1:])
    if flips.size:
        return float(flips[0] + 1) / n_items
    return 1.0


def analyze_benchmark(df, direction='higher_better', top_k=20,
                      tau_d=0.2, tau_w=0.6, tau_b=0.2):
    """Run pairwise fragility diagnostics over the top-ranked models.

    Columns of ``df`` are treated as models and rows as paired items.  The
    ``top_k`` models by column mean are kept, and for every pair the model
    with the better mean (the "winner") is compared against the other on
    three criteria:

    * magnitude   - ``cohens_d``  (violated when ``<= tau_d``)
    * consistency - ``win_rate``  (violated when ``<= tau_w``)
    * stability   - ``bp_ratio``  (violated when ``<= tau_b``)

    A pair is ``fragile`` when at least one criterion is violated.  Both
    ``cohens_d`` and ``win_rate`` are oriented so that larger values mean the
    winner did better, regardless of ``direction``.

    Args:
        df: DataFrame of scores, one column per model.
        direction: ``'higher_better'`` or ``'lower_better'``.
        top_k: Number of best models (by mean score) to compare.
        tau_d: Threshold for the effect size.
        tau_w: Threshold for the winner's per-item win rate.
        tau_b: Threshold for the breakdown-point ratio.

    Returns:
        DataFrame with one row per model pair and the columns ``winner``,
        ``loser``, ``cohens_d``, ``win_rate``, ``bp_ratio``,
        ``viol_magnitude``, ``viol_consistency``, ``viol_stability`` and
        ``fragile``.  It is empty (but still has these columns) when fewer
        than two models are available.

    Raises:
        ValueError: If ``direction`` is not one of the two supported values.

    Side effects:
        Prints a summary of the violation rates to stdout.
    """
    if direction not in ('higher_better', 'lower_better'):
        # A typo used to fall through to the lower-is-better branch and
        # silently invert the ranking.
        raise ValueError(
            "direction must be 'higher_better' or 'lower_better', "
            f"got {direction!r}"
        )
    higher_better = direction == 'higher_better'

    mean_scores = df.mean(axis=0)
    if higher_better:
        top_models = mean_scores.nlargest(top_k).index.tolist()
    else:
        top_models = mean_scores.nsmallest(top_k).index.tolist()

    df = df[top_models]

    # Each model takes part in many pairs; compute its mean only once.
    model_means = {model: df[model].mean() for model in df.columns}

    columns = [
        'winner', 'loser', 'cohens_d', 'win_rate', 'bp_ratio',
        'viol_magnitude', 'viol_consistency', 'viol_stability', 'fragile',
    ]

    results = []
    for model_a, model_b in combinations(df.columns, 2):
        mean_a = model_means[model_a]
        mean_b = model_means[model_b]

        a_leads = mean_a > mean_b if higher_better else mean_a < mean_b
        winner, loser = (model_a, model_b) if a_leads else (model_b, model_a)

        scores_w = df[winner]
        scores_l = df[loser]

        # compute_cohens_d / compute_win_rate measure "first minus second"
        # and "first > second".  When lower scores are better the winner has
        # the smaller values, so the arguments are swapped to keep a positive
        # effect size and a true win rate for the winner.
        if higher_better:
            upper, lower = scores_w, scores_l
        else:
            upper, lower = scores_l, scores_w

        cohens_d = compute_cohens_d(upper, lower)
        win_rate, _loss_rate, _tie_rate = compute_win_rate(upper, lower)
        bp_ratio = compute_breakdown_point(scores_w, scores_l, direction)

        viol_magnitude = cohens_d <= tau_d
        viol_consistency = win_rate <= tau_w
        viol_stability = bp_ratio <= tau_b
        fragile = viol_magnitude or viol_consistency or viol_stability

        results.append({
            'winner': winner,
            'loser': loser,
            'cohens_d': cohens_d,
            'win_rate': win_rate,
            'bp_ratio': bp_ratio,
            'viol_magnitude': viol_magnitude,
            'viol_consistency': viol_consistency,
            'viol_stability': viol_stability,
            'fragile': fragile
        })

    # Passing the column list keeps the schema stable even with zero pairs.
    results_df = pd.DataFrame(results, columns=columns)

    n_pairs = len(results_df)
    print(f"Models: {len(top_models)}, Pairs: {n_pairs}")
    if n_pairs:
        # Rates are undefined without pairs, so the lines are skipped then.
        print(f"Magnitude violations:   {results_df['viol_magnitude'].mean():.2%}")
        print(f"Consistency violations: {results_df['viol_consistency'].mean():.2%}")
        print(f"Stability violations:   {results_df['viol_stability'].mean():.2%}")
        print(f"Fragility rate:         {results_df['fragile'].mean():.2%}")

    return results_df


def diagnostics_output_path(csv_path):
    """Return the path of the diagnostics CSV written for ``csv_path``.

    Only the final extension of the input path is replaced by
    ``_diagnostics.csv``.  The result therefore always differs from the
    input, even when the input does not end in ``.csv``, and ``.csv``
    occurring inside directory names is left alone.
    """
    import os

    root, _ext = os.path.splitext(csv_path)
    return f"{root}_diagnostics.csv"


if __name__ == '__main__':
    import sys

    if len(sys.argv) < 2:
        print("Usage: python diagnostics.py <csv_file> [direction] [top_k]")
        print("  direction: higher_better (default) or lower_better")
        print("  top_k: number of top models (default 20)")
        sys.exit(1)

    csv_path = sys.argv[1]
    direction = sys.argv[2] if len(sys.argv) > 2 else 'higher_better'
    top_k = int(sys.argv[3]) if len(sys.argv) > 3 else 20

    # The first CSV column is used as the row index.
    df = pd.read_csv(csv_path, index_col=0)
    results = analyze_benchmark(df, direction=direction, top_k=top_k)

    # str.replace('.csv', ...) returned the input path unchanged for inputs
    # without a lowercase '.csv', which made to_csv overwrite the input file.
    output_path = diagnostics_output_path(csv_path)
    results.to_csv(output_path, index=False)
    print(f"\nSaved: {output_path}")
