
"""
Pre-processes the SWE-V annotations for Experiment 3.
This mirrors the Experiment 1 processing (preprocess_reference1.py),
but uses the full set of annotated sample instances.
"""

import pandas as pd

print("\nPreprocessing for Experiment 3\n")

# Input/output locations, resolved relative to the current working directory.
input_sample = "inputs/samples/experiment3_sample.txt"  # all annotated sample instances
input_annotations = "inputs/swe-bench-annotation-results/samples_with_3_annotations_public.csv"  # raw annotations
output_annotations = "inputs/annotations/experiment3_annotations.csv"  # re-ensembled annotations


def _ensemble_score(scores: pd.Series) -> float:
    """Return the majority score for one instance, or the median when tied.

    ``Series.mode`` yields every value that shares the highest frequency
    (ignoring NaN), so a clear majority produces a single value while a tie
    produces several; taking the median of the modes covers both cases.
    A group without any valid score yields NaN.
    """
    return scores.mode().median()


print("Loading raw annotations")
cols = ["instance_id", "false_negative"]  # four-point test scores
df_raw = pd.read_csv(input_annotations, usecols=cols)

print("Aligning with sample instances")
# The sample file is parsed as CSV and must provide an `instance_id` column.
# The inner join keeps only annotation rows whose instance is in the sample.
df_sample = pd.read_csv(input_sample)
df_raw = pd.merge(df_raw, df_sample, how='inner', on='instance_id')

print("Ensembling using majority score (or median if no majority)")
# One pass per instance: this keeps the result as a numeric Series instead of
# round-tripping through Python lists and an object-dtype exploded column.
df_ensembled = df_raw.groupby("instance_id")['false_negative'].agg(_ensemble_score)

print("Converting ensembled scores to binary labels")
# Vectorized threshold: scores of 2 or more become True. NaN compares as
# False, so an instance with no valid score is labelled False.
df_ensembled = df_ensembled >= 2
df_ensembled.to_csv(output_annotations)
