import pandas as pd
from verl.utils.reward_score.livecodebench.tss_utils import process_response
import argparse
import re

# Command-line interface: a single optional flag naming the pickled
# evaluation output that should be scored.
parser = argparse.ArgumentParser()
parser.add_argument("--output_path", type=str,
                    default="eval/1.5B_adversarial_200_220_test_output_t0.8.pkl")
args = parser.parse_args()

output_path = args.output_path
# NOTE(review): unpickling can execute arbitrary code embedded in the file;
# only point --output_path at pickles produced by a trusted evaluation run.
with open(output_path, 'rb') as f:
    dataset = pd.read_pickle(f)

# The pickle may contain a plain list instead of a DataFrame (presumably a
# list of per-example records -- confirm against the producer). Normalise it
# so the scoring code can index rows uniformly. isinstance() is used instead
# of an exact type comparison so list subclasses are converted as well.
if isinstance(dataset, list):
    dataset = pd.DataFrame(dataset)

_BOXED_OPEN = '\\boxed{'


def _extract_last_boxed(text):
    r"""Return the stripped content of the last balanced ``\boxed{...}``.

    Braces are matched by nesting depth, so ``\boxed{\frac{1}{2}}`` yields
    ``\frac{1}{2}`` rather than being cut off at the first ``}``.

    Occurrences are tried from the last one backwards; an occurrence whose
    braces never balance (e.g. a truncated response) is skipped in favour of
    an earlier, complete one.

    Args:
        text: The response string to search.

    Returns:
        The content between the braces with surrounding whitespace removed,
        or ``None`` if *text* contains no balanced ``\boxed{...}``.
    """
    search_end = len(text)
    while True:
        start = text.rfind(_BOXED_OPEN, 0, search_end)
        if start == -1:
            return None
        content_start = start + len(_BOXED_OPEN)
        depth = 1  # the opening brace of \boxed{ itself
        for pos in range(content_start, len(text)):
            char = text[pos]
            if char == '{':
                depth += 1
            elif char == '}':
                depth -= 1
                if depth == 0:
                    return text[content_start:pos].strip()
        # This occurrence is unbalanced; retry with the one before it.
        search_end = start


# scores: number of responses whose final boxed answer equals the reference.
# total: number of responses examined (responses without a box still count).
scores = 0
total = 0
for idx in range(len(dataset)):
    # Materialise the row once instead of re-indexing it for every response.
    row = dataset.iloc[idx]
    answer = row['answer']
    for response in row['responses']:
        # Use the content of the last \boxed{} as the model's final answer.
        extracted_answer = _extract_last_boxed(response)
        # Exact comparison against the reference answer; no normalisation
        # beyond stripping surrounding whitespace is applied.
        if extracted_answer is not None and extracted_answer == answer:
            scores += 1
        total += 1
print(scores, total)
# Report 0.0 instead of raising ZeroDivisionError when nothing was scored.
print(scores / total if total else 0.0)
