from utils.EncycloBenchEvaluator import EncycloBenchEvaluator
import asyncio
import os
import json



def _average_seed_results(seeds, difficulties=("easy", "medium", "hard")):
    """Average one model's per-seed accuracy records.

    Args:
        seeds: Non-empty list of the per-seed dicts built in ``main``. Each
            holds ``<variant>_total_accuracy`` and
            ``<variant>_accuracy_by_difficulty`` for the ``init`` and
            ``random`` variants.
        difficulties: Difficulty keys to average. A key that is missing from
            a seed's per-difficulty mapping counts as 0 for that seed.

    Returns:
        Dict holding the four ``avg_*`` entries stored in the summary file.
    """
    n = len(seeds)
    averaged = {}
    for variant in ("init", "random"):
        total_key = f"{variant}_total_accuracy"
        by_diff_key = f"{variant}_accuracy_by_difficulty"
        averaged[f"avg_{total_key}"] = sum(s[total_key] for s in seeds) / n
        averaged[f"avg_{by_diff_key}"] = {
            d: sum(s[by_diff_key].get(d, 0) for s in seeds) / n
            for d in difficulties
        }
    return averaged


async def main():
    """Evaluate stored inference results and write averaged accuracies.

    For every model in ``infer_models`` and every seed index below
    ``seed_nums``, the ``_init.jsonl`` and ``_random.jsonl`` files under
    ``./results/<dataset>/<model>/<seed>/`` are passed to
    ``EncycloBenchEvaluator.evaluate``. The returned total and per-difficulty
    accuracies are averaged over seeds and dumped to
    ``./results/<dataset>/summary.json``.
    """
    base_url = "YOUR BASE URL"
    api_key = "YOUR API KEY"
    prompt_file = "./prompts/eval/eval_bool.txt"
    evaluator = EncycloBenchEvaluator(base_url, api_key, prompt_file)

    infer_dataset_name = "0502-version"
    infer_models = ["DeepSeek-V3"]
    seed_nums = 1
    # Settings forwarded unchanged to every evaluate() call.
    # NOTE(review): the model passed here is fixed and does not follow
    # `infer_model` -- confirm that this is intended.
    eval_kwargs = {"model": "DeepSeek-V3", "max_tokens": 1024, "temperature": 0.7}

    model_seed_results = {}
    for infer_model in infer_models:
        seed_results = []
        for seed_num in range(seed_nums):
            result_dir = f"./results/{infer_dataset_name}/{infer_model}/{seed_num}/"
            record = {"seed": seed_num}
            # Evaluate sequentially ("init" first, then "random"), exactly as
            # before; only the duplicated call site was folded into a loop.
            for variant in ("init", "random"):
                total_accuracy, accuracy_by_difficulty = await evaluator.evaluate(
                    f"{result_dir}_{variant}.jsonl",
                    **eval_kwargs,
                )
                record[f"{variant}_total_accuracy"] = total_accuracy
                record[f"{variant}_accuracy_by_difficulty"] = accuracy_by_difficulty
            seed_results.append(record)
        model_seed_results[infer_model] = seed_results

    # Created outside the model loop so it exists even when `infer_models`
    # is empty (previously that case raised NameError at json.dump).
    summary = {}
    for model, seeds in model_seed_results.items():
        if not seeds:
            # No seeds were evaluated (seed_nums == 0): skip the model rather
            # than dividing by zero.
            print(f"No seed results for {model}; skipping.")
            continue
        summary[model] = _average_seed_results(seeds)

    os.makedirs(f"./results/{infer_dataset_name}", exist_ok=True)
    summary_path = f"./results/{infer_dataset_name}/summary.json"
    with open(summary_path, "w", encoding="utf-8") as f:
        json.dump(summary, f, ensure_ascii=False, indent=4)

    print(f"Summary written to {summary_path}")

# Run the evaluation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    asyncio.run(main())