import openai
import base64
import matplotlib.pyplot as plt
from loguru import logger
import numpy as np
import json
import os
from tqdm import tqdm
import traceback
from typing import *
import io

from evaluation.evaluate_qa import evaluate_batch_qa

# CONFIG
# Experiment name: previously generated answers are read from the
# `exp/<EXP>` directory, and the name is also passed to `evaluate_batch_qa`.
EXP = 'thinktime_uts_reason'
# Path to the evaluation dataset. It is parsed as a single JSON document when
# the path ends with `.json`, otherwise as JSON Lines (one JSON value per line).
DATASET = './evaluation/dataset/uts_reason.jsonl'


def load_dataset(path: str) -> list:
    """Load the evaluation dataset from ``path``.

    A path ending in ``.json`` is parsed as one JSON document; any other path
    is treated as JSON Lines, with one JSON value per non-blank line.

    Args:
        path: Location of the dataset file.

    Returns:
        The parsed dataset (the caller takes ``len()`` of it and indexes the
        generated answers by position).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # The context manager guarantees the handle is closed even if parsing fails.
    with open(path, 'rt') as f:
        if path.endswith('.json'):
            return json.load(f)
        # Skip blank lines (e.g. a trailing newline at the end of the file),
        # which would otherwise make json.loads raise.
        return [json.loads(line) for line in f if line.strip()]


def load_generated_answers(exp_dir: str, num_samples: int) -> list:
    """Collect previously generated answers stored under ``exp_dir``.

    Every file whose name contains ``generated_answer`` and ends with
    ``.json`` is read as a list of answer dicts; each answer is stored at the
    position given by its ``'idx'`` entry.

    Args:
        exp_dir: Directory holding the answer files.
        num_samples: Number of slots to allocate (one per dataset item).

    Returns:
        A list of length ``num_samples``; slots with no answer hold ``{}``.

    Raises:
        FileNotFoundError: If ``exp_dir`` does not exist.
        KeyError: If an answer has no ``'idx'`` entry.
        IndexError: If an answer's ``'idx'`` is outside the allocated range.
    """
    answers = [{} for _ in range(num_samples)]
    # os.listdir returns entries in arbitrary order; sorting makes the result
    # reproducible when several files contain the same idx (last file wins).
    for file in sorted(os.listdir(exp_dir)):
        if 'generated_answer' in file and file.endswith('.json'):
            with open(os.path.join(exp_dir, file)) as f:
                cur_answer = json.load(f)
            for ans in cur_answer:
                answers[ans['idx']] = ans
    return answers


if __name__ == '__main__':
    print(EXP)
    dataset = load_dataset(DATASET)

    # Load from existing generation
    generated_answer = load_generated_answers(f"exp/{EXP}", len(dataset))

    # Evaluation
    evaluate_batch_qa(dataset, generated_answer, EXP, num_workers=16)

