
"""
Pre-processes the SWE-V annotations for Experiment 2

The original annotations include a raw dataset with three annotations per instance,
and an ensembled dataset that takes the highest-severity score for each instance.

This script aligns the instance IDs with the input sample,
and filters the raw annotations using the following criteria:

- Exclude individual annotations with low annotator confidence (less than 4)
- Exclude individual annotations with missing scores
  (as determined by the four-point 'underspecified' and 'false negative' scores)

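The remaining binary labels ('false_negative_problematic', renamed to
'false_negative') are written to the output CSV, one row per annotation.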

"""

import pandas as pd

# Experiment 2 preprocessing pipeline (runs top to bottom when executed):
#   1. load the raw annotations,
#   2. keep only rows whose instance_id appears in the Experiment 2 sample,
#   3. drop low-confidence annotations and annotations with missing scores,
#   4. write the binary label per (instance_id, user_id) to a new CSV.
print("\nPreprocessing for Experiment 2\n")

input_sample = "inputs/samples/experiment2_sample.txt"
input_annotations = "inputs/swe-bench-annotation-results/samples_with_3_annotations_public.csv"  # raw
output_annotations = "inputs/annotations/experiment2_annotations.csv"  # filtered

print("Loading raw annotations")
df_raw_annotations = pd.read_csv(input_annotations)

print("Aligning with sample instances")
df_sample = pd.read_csv(input_sample)  # experiment 2 sample instances
# The merge acts purely as a row filter, so join against the de-duplicated
# 'instance_id' column only. This keeps the result identical for a clean
# sample, while preventing (a) duplicated annotation rows if an ID is listed
# more than once and (b) '_x'/'_y' column suffixes if the sample file happens
# to carry columns that also exist in the annotations.
df_raw_annotations = pd.merge(
    df_raw_annotations,
    df_sample[["instance_id"]].drop_duplicates(),
    how="inner",
    on="instance_id",
)

print("Applying criteria")
# High-confidence annotations only (confidence of 4 or 5).
df_processed = df_raw_annotations[df_raw_annotations["annotator_confidence"].isin([4, 5])]
# No missing scores: both score columns must be present for the row to be kept.
df_processed = df_processed.dropna(subset=["underspecified", "false_negative"], how="any")

print("Writing processed annotations")
# Keep the binary label and rename it for consistency with other experiments.
# A chained, non-inplace rename avoids mutating a frame derived from a
# selection (which can raise SettingWithCopyWarning on some pandas versions).
df_processed = df_processed[["instance_id", "user_id", "false_negative_problematic"]].rename(
    columns={"false_negative_problematic": "false_negative"}
)
df_processed.to_csv(output_annotations, index=False)
