import json, re
import pandas as pd


# Build a "reattempt" subset of a dataset's results file.
#
# Steps:
#   1. Read the INDEX_COLUMN column from the Excel sheet at EXCEL_ADDRESS.
#   2. Reduce each cell to the first run of digits it contains (as an int).
#   3. Use that int to index into the JSON loaded from JSON_ADDRESS and copy
#      the entry's "input" and "answer" fields.
#   4. Write the collected entries to
#      OUTPUT_DIR/<DATASET_NAME>_reattempt_<count>.json.

DATASET_NAME = "gsm8k"
EXCEL_ADDRESS = f"/Volumes/Academic/Projects/PRoMTd/evaluation/1_{DATASET_NAME}/data/excel/4_2_error_any.xlsx"
JSON_ADDRESS = f"/Volumes/Academic/Projects/PRoMTd/outputs/{DATASET_NAME}/gsm8k_final.json"
INDEX_COLUMN = "question_idx"
OUTPUT_DIR = "/Volumes/Academic/Projects/PRoMTd/reattempts"

# Compiled once; matches the first run of decimal digits in a cell.
_FIRST_DIGITS = re.compile(r"\d+")


def _parse_question_idx(raw):
    """Return the first run of digits found in *raw* as an int.

    *raw* is converted with ``str()`` first, so cells that pandas already
    parsed as numbers are handled the same way as text cells.

    Raises:
        ValueError: if *raw* contains no digits (e.g. an empty/NaN cell).
    """
    match = _FIRST_DIGITS.search(str(raw))
    if match is None:
        raise ValueError(
            f"No numeric index found in {INDEX_COLUMN!r} cell: {raw!r}"
        )
    return int(match.group())


invalid_question_indices = [
    _parse_question_idx(cell)
    for cell in pd.read_excel(EXCEL_ADDRESS)[INDEX_COLUMN].tolist()
]

# Context manager so the results file is closed as soon as it is parsed.
with open(JSON_ADDRESS, encoding="utf-8") as f:
    full_json = json.load(f)

# NOTE(review): full_json is indexed with an int, so it is presumably a list
# whose positions line up with the question indices - confirm against the
# producer of gsm8k_final.json.
reattempt_data = [
    {
        "input": full_json[idx]["input"],
        "answer": full_json[idx]["answer"],
        "question_idx": idx,
    }
    for idx in invalid_question_indices
]

with open(
    f"{OUTPUT_DIR}/{DATASET_NAME}_reattempt_{len(invalid_question_indices)}.json",
    "w",
    encoding="utf-8",
) as f:
    json.dump(reattempt_data, f, indent=4)