import sys
import json
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm


# Command line: argv[1] is the input file (one JSON object per line, each with
# a "ground_truth" value and a "model_answer" list); argv[2] is the verdict
# file, to which one "Yes"/"No" line is appended per graded item.
input_path = sys.argv[1]
output_path = sys.argv[2]

# [ground_truth, first_model_answer] pairs that were not an exact match and
# therefore still need to be judged by match_with_gpt().
all_item = []

count = 0

# The context manager guarantees the input handle is released once the
# exact-match pass is finished (the original left it open for the whole run).
with open(input_path, "r", encoding="utf-8") as data:
    # Opened in append mode, so verdicts from earlier runs on the same path
    # are preserved. It is left open on purpose: process_items_concurrently()
    # keeps writing to it and the __main__ section closes it.
    out = open(output_path, "a+", encoding="utf-8")

    for line in data:
        # Tolerate blank lines (e.g. a trailing newline at end of file)
        # instead of crashing inside json.loads.
        if not line.strip():
            continue

        item = json.loads(line)
        truth = item["ground_truth"]
        answer = item["model_answer"][0]

        if truth == answer:
            # Exact match: no need to spend an API call on it.
            out.write("Yes\n")
        else:
            all_item.append([truth, answer])

def match_with_gpt(item, *, timeout=60):
    """Ask the chat model whether two answer expressions are equivalent.

    The two answers are inserted into a few-shot prompt and sent as a single
    user message in a POST request to ``url``.

    NOTE(review): ``url`` and ``YOUR_API_KEY`` are read as module-level names
    but are not defined in this file as shown; they must be provided before
    this function is called.

    Args:
        item: Two-element sequence ``[expression_1, expression_2]``. Each
            element is converted with ``str()`` before being inserted into
            the prompt, so non-string answers (e.g. numbers) are accepted.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The content of the first choice's message with surrounding
        whitespace removed (the prompt asks for "Yes" or "No").

    Raises:
        requests.RequestException: On network failure, timeout, or a
            non-success HTTP status.
        KeyError, IndexError, ValueError: If the response body is not JSON
            of the expected ``choices[0].message.content`` shape.
    """
    prompt = r"""
            Look at the following two expressions (answers to a math problem) and judge whether they are equivalent. Only perform trivial simplifications

            Examples:

                Expression 1: $2x+3$
                Expression 2: $3+2x$

            Yes

                Expression 1: 3/2
                Expression 2: 1.5

            Yes

                Expression 1: $x^2+2x+1$
                Expression 2: $y^2+2y+1$

            No

                Expression 1: $x^2+2x+1$
                Expression 2: $(x+1)^2$

            Yes

                Expression 1: 3245/5
                Expression 2: 649

            No
            (these are actually equal, don't mark them equivalent if you need to do nontrivial simplifications)

                Expression 1: 2/(-3)
                Expression 2: -2/3

            Yes
            (trivial simplifications are allowed)

                Expression 1: 72 degrees
                Expression 2: 72

            Yes
            (give benefit of the doubt to units)

                Expression 1: 64
                Expression 2: 64 square feet

            Yes
            (give benefit of the doubt to units)

            ---

            YOUR TASK

            Respond with only "Yes" or "No" (without quotes). Do not include a rationale.

                Expression 1: our_answer_1
                Expression 2: our_answer_2
            """

    # Fill both placeholders in a single pass over the template. Chained
    # str.replace() calls would also rewrite placeholder-like text occurring
    # inside the first answer, and would raise TypeError on non-string
    # answers; the f-string below applies str() to each answer.
    head, _, rest = prompt.partition("our_answer_1")
    middle, _, tail = rest.partition("our_answer_2")
    prompt = f"{head}{item[0]}{middle}{item[1]}{tail}"

    payload = json.dumps({
        "model": "gpt-4o-mini",
        "messages": [
            {
                "role": "user",
                "content": prompt
            }
        ],
        # NOTE(review): the OpenAI chat completions API names this limit
        # "max_tokens"/"max_completion_tokens"; "max_new_tokens" is kept
        # unchanged because the endpoint behind `url` is not visible here.
        "max_new_tokens": 2
    })

    headers = {
        'Accept': 'application/json',
        'Authorization': f"Bearer {YOUR_API_KEY}",
        'Content-Type': 'application/json'
    }

    # Without a timeout a stalled connection would block its worker thread
    # forever and the whole run would never finish.
    response = requests.request(
        "POST", url, headers=headers, data=payload, timeout=timeout
    )
    # Surface HTTP errors (auth failure, rate limit, ...) explicitly instead
    # of failing later with an opaque KeyError on "choices".
    response.raise_for_status()

    res = json.loads(response.text)["choices"][0]["message"]["content"]

    # Strip so the caller writes exactly one clean verdict line per item.
    return res.strip()


def process_items_concurrently():
    """Judge every pair in ``all_item`` with ``match_with_gpt`` on a thread pool.

    Up to 32 requests run at once. As each one finishes, its verdict is
    appended as a line to ``out``, echoed to stdout, and collected. A pair
    whose processing raises is reported on stdout and otherwise skipped, so
    it contributes no line to ``out``.

    Returns:
        List of ``(expression_1, expression_2, verdict)`` tuples in
        completion order (not input order).
    """
    judged = []

    pool = ThreadPoolExecutor(max_workers=32)
    with pool:
        # Remember which pair each future belongs to so it can be recovered
        # when the future completes.
        pending = {}
        for pair in all_item:
            pending[pool.submit(match_with_gpt, pair)] = pair

        progress = tqdm(as_completed(pending), total=len(pending), desc="Processing")
        for finished in progress:
            pair = pending[finished]
            try:
                verdict = finished.result()
                out.write(verdict + "\n")
                print(pair[0], pair[1], verdict)
                judged.append((pair[0], pair[1], verdict))
            except Exception as err:
                # Best effort: one failing request must not abort the batch.
                print(f"Error processing item {pair}: {err}")

    return judged


def compute_accuracy(path):
    """Return the share of "Yes" verdicts among the "Yes"/"No" lines of a file.

    Each line is stripped of surrounding whitespace before comparison; lines
    that are neither "Yes" nor "No" are ignored.

    Args:
        path: Path of the verdict file to read (UTF-8 text).

    Returns:
        ``yes / (yes + no)`` as a float, or ``None`` when the file contains
        no "Yes"/"No" line at all (the ratio is undefined in that case).
    """
    # The context manager closes the handle; the original never closed it.
    with open(path, "r", encoding="utf-8") as res_data:
        all_res = [line.strip() for line in res_data]

    yes_count = all_res.count("Yes")
    total = yes_count + all_res.count("No")
    if total == 0:
        # Avoid ZeroDivisionError on an empty or verdict-free file.
        return None
    return yes_count / total


if __name__ == "__main__":
    try:
        results = process_items_concurrently()
    finally:
        # Flush and close the verdict file even if processing failed, so
        # everything written so far reaches disk.
        out.close()

    # The verdict file is opened in append mode, so this figure also covers
    # any verdicts already present in it from earlier runs.
    accuracy = compute_accuracy(output_path)
    if accuracy is None:
        print("Accuracy is undefined: no Yes/No verdicts found.")
    else:
        print(f"Accuracy is: {accuracy}")
    
    
