
import pandas as pd
from verl.utils.reward_score.livecodebench.tss_utils import process_response
import argparse

# Command-line interface: a single option naming the pickle file to score.
parser = argparse.ArgumentParser()
parser.add_argument("--output_path", type=str,
                    default="eval/1.5B_adversarial_200_220_test_output_t0.8.pkl")
args = parser.parse_args()

output_path = args.output_path
# NOTE(security): unpickling can execute arbitrary code, so --output_path
# should only ever point at a file from a trusted source.
with open(output_path, 'rb') as f:
    dataset = pd.read_pickle(f)

# The pickle may contain a plain list rather than a DataFrame; wrap it so the
# code below can rely on DataFrame indexing. isinstance() (unlike an exact
# type comparison) also accepts list subclasses.
if isinstance(dataset, list):
    dataset = pd.DataFrame(dataset)

def _extract_output_text(json_contents):
    """Return the stripped string form of the first ``'output'`` field.

    Two layouts are tried, in this order:

    1. ``json_contents[0][0]['output']``
    2. ``json_contents[0]['output']``

    Args:
        json_contents: The second value returned by ``process_response``.

    Returns:
        ``str(output).strip()`` for the first layout that can be indexed, or
        ``None`` when neither layout matches.
    """
    # Only lookup failures are treated as "layout does not match"; anything
    # else (including KeyboardInterrupt) is allowed to propagate.
    lookup_errors = (KeyError, IndexError, TypeError)
    try:
        return str(json_contents[0][0]['output']).strip()
    except lookup_errors:
        pass
    try:
        return str(json_contents[0]['output']).strip()
    except lookup_errors:
        return None


# scores: responses whose extracted output equals the row's answer.
# total:  every response seen, whether or not it could be parsed.
scores = 0
total = 0
# Iterate the two columns directly instead of rebuilding a row Series with
# dataset.iloc[idx] for every single response.
for answer, responses in zip(dataset['answer'], dataset['responses']):
    for response in responses:
        total += 1
        success, json_contents = process_response(response)
        if not success:
            continue
        output_text = _extract_output_text(json_contents)
        # A response with no extractable output never counts as correct,
        # even if the stored answer itself happens to be None.
        if output_text is not None and output_text == answer:
            scores += 1

print(scores, total)
# Accuracy is undefined when there were no responses; print nan rather than
# crashing with ZeroDivisionError.
print(scores / total if total else float("nan"))
