import os
import json

from tqdm import tqdm

from inference_rlhf.code.helpers.io import json_dump

# Task name; not referenced anywhere in the visible part of this script.
TASK = 'mbpp'
# Root directory that is searched recursively for evaluation result files.
DATA_DIR = './data/mbpp/gpt-4o-mini'

# Collect the full path of every file below DATA_DIR whose name ends with
# 'eval_results.json', in the order os.walk yields them.
json_files = []
for dirpath, _subdirs, filenames in os.walk(DATA_DIR):
    json_files.extend(
        os.path.join(dirpath, name)
        for name in filenames
        if name.endswith('eval_results.json')
    )

# Filename suffixes used to pair files: '<stem>.eval_results.json' is read
# together with '<stem>.json', and the merged output is written to
# '<stem>--CHECKED.json'.
EVAL_SUFFIX = '.eval_results.json'
RESPONSES_SUFFIX = '.json'
CHECKED_SUFFIX = '--CHECKED.json'

# For every evaluation file: load it, load the matching responses file, tag
# each response with a boolean 'strict_correct' flag and save the result.
for file in tqdm(json_files):
    # Swap only the trailing suffix. A plain str.replace would also rewrite an
    # earlier '.json' / '.eval_results.json' occurring in a directory name, and
    # would leave the path untouched for a file that merely ends with
    # 'eval_results.json' (no leading dot), making the script read the eval
    # file as if it were the responses file.
    if not file.endswith(EVAL_SUFFIX):
        raise ValueError(
            f"{file}: expected a file name ending in {EVAL_SUFFIX!r}"
        )
    stem = file[:-len(EVAL_SUFFIX)]
    responses_file = stem + RESPONSES_SUFFIX
    checked_file = stem + CHECKED_SUFFIX

    # open results file
    with open(file, 'r') as f:
        eval_payload = json.load(f)

    # The 'eval' mapping must hold exactly one task. Explicit exceptions are
    # raised instead of using `assert`, which is removed under `python -O`.
    per_task = eval_payload['eval']
    if len(per_task) != 1:
        raise ValueError(
            f"{file}: expected exactly one task under 'eval', "
            f"found {len(per_task)}"
        )
    task_id, eval_results = next(iter(per_task.items()))

    # open responses file
    with open(responses_file, 'r') as f:
        responses = json.load(f)

    # zip() below would silently truncate on a mismatch, so fail loudly.
    if len(eval_results) != len(responses):
        raise ValueError(
            f"{file}: task {task_id!r} has {len(eval_results)} eval results "
            f"but {responses_file} has {len(responses)} responses"
        )

    # Responses and eval results are paired by position. A response counts as
    # strictly correct only when its 'plus_status' equals 'pass'.
    # NOTE(review): 'plus_status' is presumably the extended-test verdict of
    # the evaluator -- confirm against the tool that writes these files.
    responses_with_correct = [
        {**response, 'strict_correct': eval_result['plus_status'] == 'pass'}
        for response, eval_result in zip(responses, eval_results)
    ]

    # save responses with correct
    json_dump(responses_with_correct, checked_file)
    print('Saved file in ', checked_file)
