




""""""



import argparse

import json

import os

import yaml

import pandas as pd

import numpy as np

from scipy import stats

from typing import Dict, List, Tuple, Any, Optional, Union





# Global pandas options, applied once when this module is imported
# (module-level side effect: they affect every DataFrame handled afterwards
# in the same process).
pd.set_option('', None)
pd.set_option('', 50)
pd.set_option('', 1000)
pd.set_option('', 100)













































# Names of the metric columns that are analysed. main() passes this list to
# analyze_model_template_pairs() for every comparison, and
# load_and_process_data() reports which of these names are present in the
# loaded table.
METRICS = ["", "", "", "", "",
           "", "", "", ""]













# Mapping of source column name -> analysis column name.
# load_and_process_data() copies each source column that exists in the loaded
# table into a column with the new name; the source column itself is kept.
RENAME_DICT = {
    "": "",
    "": "",

    "": "",
    "": "",
    "": "",

    "": "",
    "": "",
    "": "",
    "": ""
}









# Default ordering of model names. main() uses it unless a model-order JSON
# file is supplied; load_and_process_data() turns the model column into an
# ordered categorical with these categories and sorts the rows by it.
DEFAULT_MODEL_ORDER = [
    "",

    "",
    "",
    "",
    "",
    "",

    "",
    "",
    "",
    "",
    "",
    "",

    "",
    "",
    "",
    "",

    "",
    "",
    "",
    ""
]

















































































































































































































































































# ---------------------------------------------------------------------------
# Base-vs-instruct comparison lists.
#
# Every entry is ((model, template), (model, template)); this is the shape
# analyze_model_template_pairs() unpacks. The first element feeds group 1 and
# the second feeds group 2 of the paired test, so the order inside an entry
# matters.
# Presumably the first element is the base configuration and the second its
# instruct-tuned counterpart (per the constant names) -- confirm.
# ---------------------------------------------------------------------------
BASE_VS_INSTRUCT_SFT = [
    (("", ""), ("", "")),
    (("", ""), ("", "")),
    (("", ""), ("", "")),

    (("", ""), ("", "")),
    (("", ""), ("", "")),
    (("", ""), ("", "")),

    (("", ""), ("", "")),
    (("", ""), ("", "")),
    (("", ""), ("", "")),

    (("", ""), ("", "")),
    (("", ""), ("", "")),
    (("", ""), ("", "")),
]

BASE_VS_INSTRUCT_RLHF = [
    (("", ""), ("", "")),
    (("", ""), ("", "")),
    (("", ""), ("", "")),

    (("", ""), ("", "")),
    (("", ""), ("", "")),
    (("", ""), ("", "")),

    (("", ""), ("", "")),
    (("", ""), ("", "")),
    (("", ""), ("", "")),

    (("", ""), ("", "")),
    (("", ""), ("", "")),
    (("", ""), ("", "")),
]

BASE_VS_INSTRUCT_GRPO = [
    (("", ""), ("", "")),
    (("", ""), ("", "")),
    (("", ""), ("", "")),
]

BASE_VS_INSTRUCT_DPO = [
    (("", ""), ("", "")),
    (("", ""), ("", "")),
    (("", ""), ("", "")),

    (("", ""), ("", "")),
    (("", ""), ("", "")),
    (("", ""), ("", "")),

    (("", ""), ("", "")),
    (("", ""), ("", "")),
    (("", ""), ("", "")),

    (("", ""), ("", "")),
    (("", ""), ("", "")),
    (("", ""), ("", "")),

    (("", ""), ("", "")),
    (("", ""), ("", "")),
    (("", ""), ("", "")),

    (("", ""), ("", "")),
    (("", ""), ("", "")),
    (("", ""), ("", "")),
]

# Derived lists: plain concatenations of the lists above (new list objects).
BASE_VS_INSTRUCT_PREFERENCE = BASE_VS_INSTRUCT_RLHF + BASE_VS_INSTRUCT_DPO + BASE_VS_INSTRUCT_GRPO

BASE_VS_INSTRUCT_RL = BASE_VS_INSTRUCT_RLHF + BASE_VS_INSTRUCT_GRPO

# NOTE(review): unlike BASE_VS_INSTRUCT_PREFERENCE, this "ALL" list does not
# include BASE_VS_INSTRUCT_GRPO -- confirm whether that omission is intended.
BASE_VS_INSTRUCT_ALL = BASE_VS_INSTRUCT_SFT + BASE_VS_INSTRUCT_RLHF + BASE_VS_INSTRUCT_DPO





# ---------------------------------------------------------------------------
# Small-vs-large comparison lists.
#
# Every entry is ((model, template), (model, template)); the first element
# feeds group 1 and the second feeds group 2 of the paired test.
# Presumably the first element is the smaller model and the second the larger
# one (per the constant names) -- confirm.
# ---------------------------------------------------------------------------
SMALL_VS_LARGE_BASE = [
    (("", ""), ("", "")),
    (("", ""), ("", "")),
    (("", ""), ("", "")),

    (("", ""), ("", "")),
    (("", ""), ("", "")),
    (("", ""), ("", "")),
]

SMALL_VS_LARGE_INSTRUCT_RLHF = [
    (("", ""), ("", "")),
    (("", ""), ("", "")),
    (("", ""), ("", "")),

    (("", ""), ("", "")),
    (("", ""), ("", "")),
    (("", ""), ("", "")),
]

SMALL_VS_LARGE_INSTRUCT_SFT = [
    (("", ""), ("", "")),
    (("", ""), ("", "")),
    (("", ""), ("", "")),

    (("", ""), ("", "")),
    (("", ""), ("", "")),
    (("", ""), ("", "")),
]

SMALL_VS_LARGE_INSTRUCT_DPO = [
    (("", ""), ("", "")),
    (("", ""), ("", "")),
    (("", ""), ("", "")),

    (("", ""), ("", "")),
    (("", ""), ("", "")),
    (("", ""), ("", "")),

    (("", ""), ("", "")),
    (("", ""), ("", "")),
    (("", ""), ("", "")),
]

# Derived lists: plain concatenations of the lists above (new list objects).
SMALL_VS_LARGE_INSTRUCT = SMALL_VS_LARGE_INSTRUCT_RLHF + SMALL_VS_LARGE_INSTRUCT_SFT + SMALL_VS_LARGE_INSTRUCT_DPO

SMALL_VS_LARGE_PREFERENCE = SMALL_VS_LARGE_INSTRUCT_RLHF + SMALL_VS_LARGE_INSTRUCT_DPO

SMALL_VS_LARGE_ALL = SMALL_VS_LARGE_BASE + SMALL_VS_LARGE_INSTRUCT





# ---------------------------------------------------------------------------
# Zero-shot-vs-two-shot comparison lists.
#
# Every entry is ((model, template), (model, template)); the first element
# feeds group 1 and the second feeds group 2 of the paired test.
# Presumably both elements name the same model with a zero-shot template first
# and a two-shot template second (per the constant names) -- confirm.
# ---------------------------------------------------------------------------
ZERO_VS_TWO_SHOT_BASE = [
    (("", ""), ("", "")),
    (("", ""), ("", "")),

    (("", ""), ("", "")),
    (("", ""), ("", "")),
]

ZERO_VS_TWO_SHOT_INSTRUCT_RLHF = [
    (("", ""), ("", "")),
    (("", ""), ("", "")),
    (("", ""), ("", "")),

    (("", ""), ("", "")),
    (("", ""), ("", "")),
]

ZERO_VS_TWO_SHOT_INSTRUCT_SFT = [
    (("", ""), ("", "")),
    (("", ""), ("", "")),

    (("", ""), ("", "")),
    (("", ""), ("", "")),
]

ZERO_VS_TWO_SHOT_INSTRUCT_DPO = [
    (("", ""), ("", "")),
    (("", ""), ("", "")),

    (("", ""), ("", "")),
    (("", ""), ("", "")),

    (("", ""), ("", "")),
    (("", ""), ("", "")),
]

# Derived lists: plain concatenations of the lists above (new list objects).
ZERO_VS_TWO_SHOT_INSTRUCT = ZERO_VS_TWO_SHOT_INSTRUCT_RLHF + ZERO_VS_TWO_SHOT_INSTRUCT_SFT + ZERO_VS_TWO_SHOT_INSTRUCT_DPO

ZERO_VS_TWO_SHOT_ALL = ZERO_VS_TWO_SHOT_BASE + ZERO_VS_TWO_SHOT_INSTRUCT





# Two-shot vs two-shot-with-chain-of-thought comparison list (naming taken
# from the constant; presumably the CoT template is the second element of
# each entry -- confirm).
# Every entry is ((model, template), (model, template)); the first element
# feeds group 1 and the second feeds group 2 of the paired test.
TWO_SHOT_VS_TWO_SHOT_COT_ALL = [
    (("", ""), ("", "")),
    (("", ""), ("", "")),
    (("", ""), ("", "")),
    (("", ""), ("", "")),

    (("", ""), ("", "")),
    (("", ""), ("", "")),
    (("", ""), ("", "")),
    (("", ""), ("", "")),
    (("", ""), ("", "")),
    (("", ""), ("", "")),
    (("", ""), ("", "")),

    (("", ""), ("", "")),
    (("", ""), ("", "")),
    (("", ""), ("", "")),
    (("", ""), ("", "")),

    (("", ""), ("", "")),
    (("", ""), ("", "")),
    (("", ""), ("", "")),
    (("", ""), ("", "")),
]





# ---------------------------------------------------------------------------
# Training-method comparison lists (SFT / DPO / RLHF / GRPO, per the constant
# names).
#
# Every entry is ((model, template), (model, template)); the first element
# feeds group 1 and the second feeds group 2 of the paired test, so the order
# inside an entry matters.
# ---------------------------------------------------------------------------
SFT_VS_DPO = [
    (("", ""), ("", "")),
    (("", ""), ("", "")),
    (("", ""), ("", "")),

    (("", ""), ("", "")),
    (("", ""), ("", "")),
    (("", ""), ("", "")),

    (("", ""), ("", "")),
    (("", ""), ("", "")),
    (("", ""), ("", "")),

    (("", ""), ("", "")),
    (("", ""), ("", "")),
    (("", ""), ("", "")),
]

SFT_VS_RLHF = [
    (("", ""), ("", "")),
    (("", ""), ("", "")),
    (("", ""), ("", "")),

    (("", ""), ("", "")),
    (("", ""), ("", "")),
    (("", ""), ("", "")),
]

SFT_VS_GRPO = [
    (("", ""), ("", "")),
    (("", ""), ("", "")),
    (("", ""), ("", "")),
]

DPO_VS_GRPO = [
    (("", ""), ("", "")),
    (("", ""), ("", "")),
    (("", ""), ("", "")),
]

# Derived lists: plain concatenations (new list objects).
# SFT_VS_PREFERENCE used to be assigned a second time further down with the
# exact same expression; it is now defined once, here.
SFT_VS_PREFERENCE = SFT_VS_RLHF + SFT_VS_DPO + SFT_VS_GRPO

SFT_VS_RL = SFT_VS_RLHF + SFT_VS_GRPO

RLHF_VS_GRPO = [
    (("", ""), ("", "")),
    (("", ""), ("", "")),
    (("", ""), ("", "")),
]

DPO_VS_RLHF = [
    (("", ""), ("", "")),
    (("", ""), ("", "")),
    (("", ""), ("", "")),

    (("", ""), ("", "")),
    (("", ""), ("", "")),
    (("", ""), ("", "")),
]

DPO_VS_RL = DPO_VS_RLHF + DPO_VS_GRPO

























    















    















# Registry of the comparisons run by default: comparison name -> list of
# ((model, template), (model, template)) pairs.
# main() iterates over this mapping (unless a pairs JSON file replaces it),
# splits each key into the two group labels and analyses the pairs;
# check_for_missing_model_prompt_pairs() walks all values to find
# configurations that are absent from the loaded data.
MODEL_TEMPLATE_PAIRS = {
    # Base vs instruct
    "": BASE_VS_INSTRUCT_ALL,
    "": BASE_VS_INSTRUCT_SFT,
    "": BASE_VS_INSTRUCT_RLHF,
    "": BASE_VS_INSTRUCT_DPO,
    "": BASE_VS_INSTRUCT_PREFERENCE,
    "": BASE_VS_INSTRUCT_GRPO,
    "": BASE_VS_INSTRUCT_RL,

    # Small vs large
    "": SMALL_VS_LARGE_ALL,
    "": SMALL_VS_LARGE_BASE,
    "": SMALL_VS_LARGE_INSTRUCT,
    "": SMALL_VS_LARGE_INSTRUCT_RLHF,
    "": SMALL_VS_LARGE_INSTRUCT_SFT,
    "": SMALL_VS_LARGE_INSTRUCT_DPO,
    "": SMALL_VS_LARGE_PREFERENCE,

    # Zero-shot vs two-shot
    "": ZERO_VS_TWO_SHOT_ALL,
    "": ZERO_VS_TWO_SHOT_BASE,
    "": ZERO_VS_TWO_SHOT_INSTRUCT,
    "": ZERO_VS_TWO_SHOT_INSTRUCT_RLHF,
    "": ZERO_VS_TWO_SHOT_INSTRUCT_SFT,
    "": ZERO_VS_TWO_SHOT_INSTRUCT_DPO,

    # Two-shot vs two-shot chain-of-thought
    "": TWO_SHOT_VS_TWO_SHOT_COT_ALL,

    # Training-method comparisons
    "": SFT_VS_DPO,
    "": SFT_VS_RLHF,
    "": SFT_VS_PREFERENCE,
    "": SFT_VS_RL,
    "": DPO_VS_RLHF,
    "": SFT_VS_GRPO,
    "": DPO_VS_GRPO,
    "": RLHF_VS_GRPO,
    "": DPO_VS_RL,
}



def check_for_missing_model_prompt_pairs(data: pd.DataFrame) -> None:
    """Report configured (model, template) combinations absent from ``data``.

    Collects every combination referenced on either side of any pair in
    ``MODEL_TEMPLATE_PAIRS`` and prints those that do not occur as a row
    combination in ``data``. Nothing is printed when all are present.

    Args:
        data: Loaded result table holding the model and template columns.
    """
    present = set(data[["", ""]].apply(tuple, axis=1))

    # Both sides of each pair are added in their original order.
    wanted = {
        tuple(side)
        for pair_list in MODEL_TEMPLATE_PAIRS.values()
        for pair in pair_list
        for side in (pair[0], pair[1])
    }

    absent = wanted - present
    if not absent:
        return

    print("")
    for combo in absent:
        print(combo)





def load_and_process_data(tsv_paths: List[str], model_order: List[str]) -> pd.DataFrame:
    """Load, filter, merge and order the result tables.

    Each path is read with ``pd.read_csv``. Rows failing the two ``== 1.0``
    filters are dropped, the tables are concatenated, rows flagged by the
    substring filter are removed, every column listed in ``RENAME_DICT`` is
    copied to its new name, and rows without a model value are discarded.
    Diagnostics (row counts, NaN counts, missing columns, unknown models,
    missing configured pairs) are printed along the way.

    Args:
        tsv_paths: Paths of the input tables. Every path must exist.
        model_order: Model names in the desired sort order. When non-empty,
            the model column becomes an ordered categorical with exactly these
            categories (values outside the list become NaN) and the rows are
            sorted accordingly.

    Returns:
        The combined, filtered (and, if requested, sorted) DataFrame.

    Raises:
        SystemExit: With code 1 if any path does not exist.
        ValueError: If no table could be loaded, or a critical column is
            missing after processing.
    """
    frames = []
    # Kept parallel to ``frames`` so later diagnostics name the right file even
    # when an earlier file failed to load and was skipped.
    loaded_paths = []
    for path in tsv_paths:
        if not os.path.exists(path):
            print(f"Warning: File not found: {path}")
            # Same effect as the interactive ``exit(1)`` helper, but does not
            # depend on the ``site`` module having injected ``exit``.
            raise SystemExit(1)
        try:
            frame = pd.read_csv(path, sep='')
        except Exception as e:
            # Best effort: an unreadable file is reported and skipped.
            print(f"Error loading {path}: {str(e)}")
            continue
        print(f"Loaded {path}: {frame.shape[0]} rows, {frame.shape[1]} columns")
        frames.append(frame)
        loaded_paths.append(path)

    if not frames:
        raise ValueError("")

    total_rows = sum(len(frame) for frame in frames)

    # Keep only rows that pass both flag filters.
    filtered_frames = []
    for frame in frames:
        frame = frame[frame[""] == 1.0]
        frame = frame[frame[""] == 1.0]
        filtered_frames.append(frame)
    frames = filtered_frames

    print(f"Filtered {total_rows - sum(len(frame) for frame in frames)} rows")

    # Report files whose column set is smaller than the union over all files.
    all_columns = set()
    for frame in frames:
        all_columns.update(frame.columns)

    for i, (path, frame) in enumerate(zip(loaded_paths, frames)):
        missing = all_columns - set(frame.columns)
        if missing:
            print(f"File {i} ({path}) is missing columns: {missing}")

    data = pd.concat(frames, ignore_index=True)
    print(f"Combined data shape: {data.shape}")

    # NaN overview per column, largest count first.
    nan_check_before = {}
    for col in data.columns:
        nan_total = data[col].isna().sum()
        if nan_total > 0:
            nan_check_before[col] = nan_total
    if nan_check_before:
        print("")
        for col, count in sorted(nan_check_before.items(), key=lambda x: x[1], reverse=True):
            print(f"  {col}: {count} NaNs ({count/len(data)*100:.2f}%)")

    # Drop rows whose text column contains one of the marker substrings.
    if "" in data.columns:
        orig_len = len(data)
        # ``.copy()`` makes the result an independent frame so the column
        # assignments below do not trigger chained-assignment warnings.
        data = data[~data[""].apply(
            lambda row: any(word in str(row) for word in ["", ""])
        )].copy()
        filtered_len = len(data)
        if orig_len > filtered_len:
            print(f"Removed {orig_len - filtered_len} rows with errors in all_coherence")

    # Copy (not move) each known source column to its analysis name.
    for old_col, new_col in RENAME_DICT.items():
        if old_col in data.columns:
            data[new_col] = data[old_col]
            nan_count = data[new_col].isna().sum()
            if nan_count > 0:
                print(f"Warning: {new_col} has {nan_count} NaN values after renaming from {old_col}")

    orig_len = len(data)
    data = data.dropna(subset=[""]).copy()
    filtered_len = len(data)
    if orig_len > filtered_len:
        print(f"Removed {orig_len - filtered_len} rows with NaN model values")

    required_columns = ["", ""] + list(set(METRICS) & set(data.columns))

    missing_columns = [col for col in required_columns if col not in data.columns]
    if missing_columns:
        print(f"Warning: Missing columns: {missing_columns}")
        print("", list(data.columns))
        if "" in missing_columns or "" in missing_columns:
            raise ValueError(f"Critical columns missing: {missing_columns}")

    print(f"Data shape after processing: {data.shape}")
    print(f"Unique models: {data[''].nunique()}")
    print(f"Models with NaN values: {data[''].isna().sum()}")
    print(f"Available metrics: {[m for m in METRICS if m in data.columns]}")

    if model_order:
        models_not_ordered = set(data[""].unique()) - set(model_order)
        if models_not_ordered:
            print(f"Warning: Models not in model_order: {models_not_ordered}")

        data[""] = pd.Categorical(data[""], categories=model_order, ordered=True)
        data = data.sort_values(by=["", ""])

    check_for_missing_model_prompt_pairs(data)

    return data













def cohens_d(d1: np.ndarray, d2: np.ndarray) -> Tuple[float, float, float]:
    """Compute Cohen's d for two equally long samples using the pooled std.

    Positions where either sample is NaN are removed from both samples first.

    Args:
        d1: First sample.
        d2: Second sample, same length as ``d1``.

    Returns:
        ``(d, mean_difference, pooled_std)`` where ``mean_difference`` is
        ``mean(d1) - mean(d2)``. ``d`` and ``pooled_std`` are reported as
        ``0.0`` when either cleaned sample has at most one value or when the
        pooled standard deviation is not positive.
    """
    keep = ~np.isnan(d1) & ~np.isnan(d2)
    sample_a = d1[keep]
    sample_b = d2[keep]

    mean_a = sample_a.mean()
    mean_b = sample_b.mean()
    mean_gap = mean_a - mean_b

    size_a = len(sample_a)
    size_b = len(sample_b)
    dof = size_a + size_b - 2

    # Sample variances need at least two observations per group.
    if size_a <= 1 or size_b <= 1 or dof <= 0:
        return 0.0, mean_gap, 0.0

    var_a = np.sum((sample_a - mean_a) ** 2) / (size_a - 1)
    var_b = np.sum((sample_b - mean_b) ** 2) / (size_b - 1)

    spread = np.sqrt(((size_a - 1) * var_a + (size_b - 1) * var_b) / dof)

    # Identical values everywhere: no meaningful effect size.
    if spread <= 0:
        return 0.0, mean_gap, 0.0

    return mean_gap / spread, mean_gap, spread





def perform_statistical_test(group1: np.ndarray, group2: np.ndarray, 
                            group1_name: str, group2_name: str) -> Dict[str, Any]:
    """Run a paired Wilcoxon signed-rank test and summarise the differences.

    The two groups are paired by position. Pairs in which either value is NaN
    are discarded before anything is computed.

    Args:
        group1: Values of the first group.
        group2: Values of the second group, same shape as ``group1``.
        group1_name: Label of the first group, used in the direction string.
        group2_name: Label of the second group, used in the direction string.

    Returns:
        A dict reporting, in this order: number of valid pairs, Wilcoxon
        statistic, p-value, Cohen's d, mean and median of ``group2 - group1``,
        a direction label, a significance flag (``p < 0.05``), and the counts
        of positive and negative differences. When no valid pair remains, the
        numeric entries are ``0``/``None`` and the flag is ``False``.

    Raises:
        ValueError: If the two groups do not have the same shape.
    """
    group1 = np.array(group1, dtype=float)
    group2 = np.array(group2, dtype=float)

    # A paired test is meaningless for differently shaped inputs. Fail loudly
    # instead of dropping into an interactive debugger, which would hang or
    # crash any non-interactive run.
    if group1.shape != group2.shape:
        raise ValueError(
            f"Paired groups must have the same shape, got {group1.shape} and {group2.shape}"
        )

    mask = ~np.isnan(group1) & ~np.isnan(group2)
    g1, g2 = group1[mask], group2[mask]

    if len(g1) == 0 or len(g2) == 0:
        return {
            "": 0,
            "": None,
            "": None,
            "": None,
            "": None,
            "": None,
            "": "",
            "": False,
            "": 0,
            "": 0
        }

    # Differences are oriented as "group 2 minus group 1".
    differences = g2 - g1

    pos_diff = np.sum(differences > 0)
    neg_diff = np.sum(differences < 0)

    mean_diff = differences.mean()
    median_diff = np.median(differences)

    if mean_diff > 0:
        direction = f"{group2_name}_higher"
    elif mean_diff < 0:
        direction = f"{group1_name}_higher"
    else:
        direction = ""

    # NOTE(review): cohens_d() uses mean(g1) - mean(g2), so its sign is the
    # opposite of ``mean_diff`` above -- confirm this is intended.
    d_value, _, _ = cohens_d(g1, g2)

    try:
        if np.all(differences == 0):
            # The test is undefined when every difference is zero.
            wilcoxon_statistic, p_value = None, 1.0
        else:
            wilcoxon_statistic, p_value = stats.wilcoxon(g1, g2, alternative='')
    except ValueError:
        # scipy rejected the input; report the test as unavailable.
        wilcoxon_statistic, p_value = None, None

    return {
        "": len(g1),
        "": float(wilcoxon_statistic) if wilcoxon_statistic is not None else None,
        "": float(p_value) if p_value is not None else None,
        "": float(d_value),
        "": float(mean_diff),
        "": float(median_diff),
        "": direction,
        "": bool(p_value is not None and p_value < 0.05),
        "": int(pos_diff),
        "": int(neg_diff)
    }



def analyze_model_template_pairs(data: pd.DataFrame, 
                              pairs: List[Tuple[Tuple[str, str], Tuple[str, str]]], 
                              metrics: List[str],
                              group1_name: str,
                              group2_name: str) -> Dict[str, Dict[str, Any]]:
    """Compare two groups of (model, template) configurations per metric.

    For every metric, the value of each configured pair is looked up in
    ``data`` (the first matching row is used), the first elements form group 1
    and the second elements form group 2, and ``perform_statistical_test`` is
    run on the two groups. Pairs with missing rows, NaN values or values that
    cannot be read as floats are reported and skipped.

    Args:
        data: Table with the model column, template column and metric columns.
        pairs: ``((model1, template1), (model2, template2))`` entries.
        metrics: Metric column names to analyse.
        group1_name: Label for the first element of each pair.
        group2_name: Label for the second element of each pair.

    Returns:
        Mapping of metric name to the statistics dict, extended with a list of
        per-pair details (models, templates, both values and their difference
        ``group2 - group1``). Metrics without any valid pair map to a short
        placeholder dict instead.
    """
    results = {}

    for metric in metrics:
        group1_values = []
        group2_values = []
        pair_info = []

        for (model1, template1), (model2, template2) in pairs:
            model1_data = data[(data[""] == model1) & (data[""] == template1)]
            model2_data = data[(data[""] == model2) & (data[""] == template2)]

            if model1_data.empty or model2_data.empty:
                print(f"Warning: Missing data for metric {metric} pair ({model1}, {template1}) vs ({model2}, {template2})")
                continue

            try:
                value1 = float(model1_data[metric].values[0])
                value2 = float(model2_data[metric].values[0])
            except (IndexError, KeyError, TypeError, ValueError) as e:
                # TypeError/ValueError cover metric cells that are not
                # numeric; they are skipped like any other extraction error
                # instead of aborting the whole analysis.
                print(f"Error extracting values for ({model1}, {template1}) vs ({model2}, {template2}): {e}")
                continue

            if pd.isna(value1) or pd.isna(value2):
                print(f"Warning: NaN value for ({model1}, {template1}) vs ({model2}, {template2})")
                continue

            group1_values.append(value1)
            group2_values.append(value2)
            pair_info.append((model1, template1, model2, template2, value1, value2))

        if group1_values and group2_values:
            stat_results = perform_statistical_test(
                np.array(group1_values),
                np.array(group2_values),
                group1_name,
                group2_name
            )

            # Attach the per-pair breakdown next to the summary statistics.
            stat_results[""] = [
                {
                    f"{group1_name}_model": p[0],
                    f"{group1_name}_template": p[1],
                    f"{group2_name}_model": p[2],
                    f"{group2_name}_template": p[3],
                    f"{group1_name}_value": float(p[4]),
                    f"{group2_name}_value": float(p[5]),
                    "": float(p[5] - p[4])
                }
                for p in pair_info
            ]

            results[metric] = stat_results
        else:
            print(f"Warning: No valid pairs found for metric {metric}")
            results[metric] = {
                "": 0,
                "": ""
            }

    return results



def save_json(results: Dict[str, Any], output_file: str) -> None:
    """Write ``results`` to ``output_file`` as JSON indented by two spaces.

    Args:
        results: JSON-serialisable analysis results.
        output_file: Destination path.
    """
    with open(output_file, '') as handle:
        json.dump(results, handle, indent=2)

    print(f"Results saved to {output_file}")



def json_to_csv(results: Dict[str, Any], output_file: str) -> None:
    """Flatten the nested results into one CSV row per (comparison, metric).

    Each row starts with the comparison and metric identifiers, followed by
    every statistic of that metric except the entry excluded by the key
    filter below. Metric entries that are not dicts are skipped.

    Args:
        results: Mapping of comparison type to a mapping of metric name to
            its statistics dict.
        output_file: Destination path of the CSV file (written without index).
    """
    table = []

    for comparison_type, per_metric in results.items():
        for metric_name, metric_stats in per_metric.items():
            if not isinstance(metric_stats, dict):
                continue

            record = {
                "": comparison_type,
                "": metric_name
            }
            # Copy the scalar statistics, leaving out the filtered key.
            record.update(
                (stat_name, stat_value)
                for stat_name, stat_value in metric_stats.items()
                if stat_name != ""
            )
            table.append(record)

    pd.DataFrame(table).to_csv(output_file, index=False)
    print(f"CSV saved to {output_file}")



def save_yaml(results: Dict[str, Any], output_file: str) -> None:
    """Write ``results`` to ``output_file`` as block-style YAML.

    Keys are emitted in their existing order (no sorting).

    Args:
        results: Analysis results to serialise.
        output_file: Destination path.
    """
    with open(output_file, '') as stream:
        yaml.dump(results, stream, sort_keys=False, default_flow_style=False)

    print(f"YAML saved to {output_file}")

    

def json_to_yaml(results: Dict[str, Any], output_file: str) -> None:
    """Dump the results mapping to a YAML file.

    Uses block style and preserves the insertion order of the keys.

    Args:
        results: Analysis results to serialise.
        output_file: Destination path.
    """
    with open(output_file, '') as yaml_handle:
        yaml.dump(
            results,
            yaml_handle,
            default_flow_style=False,
            sort_keys=False,
        )

    print(f"YAML saved to {output_file}")





def main():
    """Command-line entry point: load data, run all comparisons, write results.

    Steps:
      1. Parse the command-line arguments.
      2. Resolve the model order (default list or JSON file) and load the data.
      3. Export the model/template/metric columns as ``<prefix>_raw_data.csv``.
      4. Resolve the comparison pairs (default registry or JSON file) and
         analyse every comparison for every metric.
      5. Write the results in the requested format(s); the CSV branch also
         writes two filtered sub-tables.

    Exits through ``parser.error`` when no input paths are given and no
    module-level ``PATHS`` default exists.
    """
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('', nargs='', required=False, 
                        help='')
    parser.add_argument('', default='',
                        help='')
    parser.add_argument('', choices=['', '', '', ''], 
                        default='', help='')
    parser.add_argument('', help='')
    parser.add_argument('', help='')
    args = parser.parse_args()

    if args.tsv_paths is None:
        # ``PATHS`` is not defined in the visible part of this module, so a
        # direct reference would raise NameError. Look it up defensively and
        # give a usable error message when it is absent.
        default_paths = globals().get("PATHS")
        if default_paths is None:
            parser.error("no input paths were given and no default PATHS list is defined")
        args.tsv_paths = default_paths

    # Model order: default list unless a readable JSON file overrides it.
    model_order = DEFAULT_MODEL_ORDER
    if args.model_order_file:
        try:
            with open(args.model_order_file, '') as f:
                model_order = json.load(f)
            print(f"Loaded custom model order from {args.model_order_file}")
        except Exception as e:
            # Best effort: fall back to the default order.
            print(f"Error loading model order, using default: {e}")

    data = load_and_process_data(args.tsv_paths, model_order)

    # Export only the metric columns that exist; the loader tolerates missing
    # metrics, so selecting all of METRICS here could raise KeyError.
    available_metrics = [m for m in METRICS if m in data.columns]
    data[["", ""] + available_metrics].to_csv(f"{args.output_prefix}_raw_data.csv")

    # Comparison pairs: default registry unless a readable JSON file overrides it.
    model_template_pairs = MODEL_TEMPLATE_PAIRS
    if args.pairs_file:
        try:
            with open(args.pairs_file, '') as f:
                model_template_pairs = json.load(f)
            print(f"Loaded custom model pairs from {args.pairs_file}")
        except Exception as e:
            # Best effort: fall back to the default pairs.
            print(f"Error loading model pairs, using default: {e}")

    print("")
    results = {}

    for comparison_type, pairs in model_template_pairs.items():
        # The comparison name encodes the two group labels.
        label1, label2 = comparison_type.split("")
        results[comparison_type] = analyze_model_template_pairs(
            data, pairs, METRICS, label1, label2
        )

    print("")
    if args.output_format in ['', '']:
        save_json(results, f"{args.output_prefix}.json")

    if args.output_format in ['', '']:
        json_to_csv(results, f"{args.output_prefix}.csv")

        # Re-read the flat CSV to derive the two filtered sub-tables.
        post_train_df = pd.read_csv(f"{args.output_prefix}.csv")

        comparison_types = ["", "", "", "", "", "", "", "", ""]
        post_train_filtered = post_train_df[post_train_df[""].isin(comparison_types)]
        post_train_sub = post_train_filtered[["", "", "", "", "", ""]]

        post_train_sub.to_csv(f"{args.output_prefix}_post-train.csv")

        small_vs_large_types = ["", "", "", "", "", "", ""]
        small_vs_large_filtered = post_train_df[post_train_df[""].isin(small_vs_large_types)]
        small_vs_large_sub = small_vs_large_filtered[["", "", "", "", "", ""]]

        small_vs_large_sub.to_csv(f"{args.output_prefix}_small_vs_large.csv")
    if args.output_format in ['', '']:
        json_to_yaml(results, f"{args.output_prefix}.yaml")

    print("")



# Script entry-point guard: main() is called only when the __name__ check
# below passes.
if __name__ == "":
    main()
