
"""
Pre-processes the SWE-V annotations for Experiment 1

The original annotations include a raw dataset with three annotations per instance,
and an ensembled dataset that takes the highest-severity score for each instance.

This script aligns the instance IDs with the input sample, and re-ensembles the 
annotations by taking the majority vote (or the median if there is no majority).

The ensembled 4-point scores are then converted to binary labels
(True when the ensembled score is 2 or higher, False otherwise).

"""

# Progress banner for this preprocessing step; emitted before the
# third-party imports below are loaded.
print("\nPreprocessing for Experiment 1\n")

import os

import numpy as np
import pandas as pd

# Input/output locations. The paths are relative, so they resolve against the
# working directory the script is launched from.
input_sample = "inputs/samples/experiment1_sample.txt"  # experiment 1 sample instances
input_annotations = "inputs/swe-bench-annotation-results/samples_with_3_annotations_public.csv"  # raw annotations
output_annotations = "inputs/annotations/experiment1_annotations.csv"  # re-ensembled annotations

# Ensembled scores at or above this value are labelled True, the rest False.
binary_threshold = 2

print("Loading raw annotations")
cols = ["instance_id", "false_negative"]  # four-point test scores
df_raw = pd.read_csv(input_annotations, usecols=cols)

print("Aligning with sample instances")
# Inner join on instance_id: only annotation rows whose instance appears in
# the sample file are kept.
df_sample = pd.read_csv(input_sample)
df_raw = pd.merge(df_raw, df_sample, how="inner", on="instance_id")

print("Ensembling using majority score (or median if no majority)")
# Series.mode() returns every most-frequent score of an instance. When one
# score wins outright, the median of that single value is the score itself;
# when several scores tie, the median of the tied values is used. Doing both
# steps inside one aggregation keeps the column numeric instead of
# round-tripping through object-dtype lists and a second groupby.
df_ensembled = df_raw.groupby("instance_id")["false_negative"].agg(
    lambda scores: scores.mode().median()
)

print("Converting ensembled scores to binary labels")
# Vectorized comparison; a missing (NaN) ensembled score compares as False.
df_ensembled = df_ensembled >= binary_threshold

# DataFrame/Series.to_csv does not create missing parent directories, so make
# sure the destination folder exists before writing.
output_dir = os.path.dirname(output_annotations)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
df_ensembled.to_csv(output_annotations)
