import json
import os
import glob
import time
import re
import numpy as np
from openai import OpenAI
from tqdm import tqdm 

def load_data(model_path, human_path):
    """Pair model predictions with the human annotations for the same image.

    Args:
        model_path: Path to a JSON file holding a list of prediction records.
            Each record is matched through its "image_path" value.
        human_path: Path to a JSON file holding a list of annotation records,
            each carrying an "image" key.

    Returns:
        A list of dicts with the keys "model_r", "model_a", "human_r" and
        "human_a", one per prediction whose image also appears in the
        annotations, in prediction order.

    Raises:
        KeyError: If a matched record lacks one of the expected fields, or an
            annotation record has no "image" key.
    """
    def _read_json(path):
        with open(path, "r", encoding="utf-8") as handle:
            return json.load(handle)

    predictions = _read_json(model_path)
    annotations = _read_json(human_path)

    # Index annotations by image; a later duplicate replaces an earlier one.
    annotation_by_image = {}
    for annotation in annotations:
        annotation_by_image[annotation["image"]] = annotation

    paired = []
    for prediction in predictions:
        image_key = prediction.get("image_path")
        # Predictions without a usable image key cannot be matched.
        annotation = annotation_by_image.get(image_key) if image_key else None
        if not annotation:
            continue
        paired.append(dict(
            model_r=prediction["model_reasoning_output"],
            model_a=prediction["model_answer_output"],
            human_r=annotation["reasoning"],
            human_a=annotation["answer"],
        ))
    return paired

# First signed decimal number in a reply, e.g. "0.8532", ".5", "1", "-0.3".
# The whole token is captured so that an out-of-range value such as "2.5" is
# clamped instead of being misread as its ".5" fragment.
_SCORE_PATTERN = re.compile(r"[-+]?(?:\d+(?:\.\d+)?|\.\d+)")


def get_deepseek_score(client, reference, candidate):
    """Ask the DeepSeek chat model how well `candidate` matches `reference`.

    The request is attempted up to three times; after a failed attempt the
    function sleeps 1s, then 2s, before trying again.

    Args:
        client: Object exposing `chat.completions.create(...)` in the style of
            the OpenAI client.
        reference: Reference text (the human annotation).
        candidate: Candidate text (the model output).

    Returns:
        A float clamped to [0.0, 1.0]. 0.0 is also returned when the reply
        contains no number or when every attempt raised; both cases are
        reported on stdout.
    """
    retries = 3
    last_error = None
    for attempt in range(retries):
        try:
            response = client.chat.completions.create(
                model="deepseek-chat",
                messages=[
                    {
                        "role": "system",
                        "content": "You are a professional evaluation assistant. Analyze the semantic consistency between reference and candidate answers.\n"
                        "Guidelines:\n"
                        "1. Score range: 0.0 (completely unrelated) to 1.0 (perfect match)\n"
                        "2. Consider both factual accuracy and contextual understanding\n"
                        "3. Return ONLY the numeric score with 4 decimal places\n"
                        "4. Avoid any additional text or explanation\n"
                    },
                    {
                        "role": "user",
                        "content": f"Reference: {reference}\nCandidate: {candidate}\nScore:"
                    }
                ],
                temperature=0.2,
                max_tokens=10,
                stream=False
            )
            content = response.choices[0].message.content.strip()
        except Exception as exc:
            # Boundary with a remote service: any failure is retried, but the
            # cause is kept so the final report can show it.
            last_error = exc
            if attempt < retries - 1:
                time.sleep(2 ** attempt)
            continue

        match = _SCORE_PATTERN.search(content)
        if match is None:
            print(f"\nCould not parse a score from response: {content!r}")
            return 0.0
        return max(0.0, min(1.0, float(match.group())))

    # str() guards the slice against non-string inputs such as None.
    print(
        f"\nScore request failed after retries ({last_error!r}) for "
        f"Reference: {str(reference)[:50]}... Candidate: {str(candidate)[:50]}..."
    )
    return 0.0

def evaluate_with_deepseek(dataset, client):
    """Score every aligned sample and summarise the results.

    Args:
        dataset: Sequence of dicts with the keys "human_r", "model_r",
            "human_a" and "model_a".
        client: Chat client forwarded unchanged to `get_deepseek_score`.

    Returns:
        A tuple `(mean reasoning score, mean answer score, harmonic mean of
        the two means)`, each rounded to 4 decimals. An empty dataset yields
        `(0.0, 0.0, 0.0)`.
    """
    if not dataset:
        return 0.0, 0.0, 0.0

    r_values, a_values = [], []
    progress = tqdm(dataset, desc="Evaluating samples", unit="sample")
    for sample in progress:
        # Reasoning is scored before the answer for each sample.
        r_values.append(get_deepseek_score(client, sample["human_r"], sample["model_r"]))
        a_values.append(get_deepseek_score(client, sample["human_a"], sample["model_a"]))

    mean_r = np.mean(r_values) if r_values else 0.0
    mean_a = np.mean(a_values) if a_values else 0.0

    # Harmonic mean of the two averages; defined as 0.0 when both are zero.
    if (mean_r + mean_a) > 0:
        combined = 2 * mean_r * mean_a / (mean_r + mean_a)
    else:
        combined = 0.0

    return round(mean_r, 4), round(mean_a, 4), round(combined, 4)

def main(predictions_folder, human_file, output_file, api_key):
    """Evaluate every prediction file in a folder and write a JSON report.

    Files named "*200predictions.json" inside `predictions_folder` are aligned
    with the human annotations, scored through the DeepSeek API, and
    summarised as one entry each in the report.

    Args:
        predictions_folder: Directory searched for prediction files.
        human_file: Path to the human annotation JSON file.
        output_file: Path the JSON report is written to.
        api_key: DeepSeek API key.

    A file whose evaluation raises is reported on stdout and skipped; the
    remaining files are still processed.
    """
    client = OpenAI(
        api_key=api_key,
        base_url="https://api.deepseek.com",
        timeout=60
    )
    # glob returns matches in arbitrary order; sort so the report order is
    # reproducible between runs and machines.
    prediction_files = sorted(
        glob.glob(os.path.join(predictions_folder, "*200predictions.json"))
    )
    if not prediction_files:
        print(f"No files matching '*200predictions.json' found in {predictions_folder!r}")

    results = []
    for file_path in prediction_files:
        file_name = os.path.basename(file_path)
        print(f"\nProcessing: {file_name}")
        try:
            dataset = load_data(file_path, human_file)
            if not dataset:
                print("  No aligned data found")
                continue
            start_time = time.time()
            r_score, a_score, h_score = evaluate_with_deepseek(dataset, client)
            eval_time = time.time() - start_time
            results.append({
                "model_file": file_name,
                "reasoning_score": r_score,
                "answer_score": a_score,
                "harmonic_score": h_score,
                "evaluation_time": round(eval_time, 2),
                "samples_evaluated": len(dataset)
            })
            print(f"\n  Results - Reasoning: {r_score}, Answer: {a_score}, Harmonic: {h_score}")
        except Exception as e:
            # Top-level boundary: one bad file must not abort the whole run.
            print(f"  Evaluation failed: {e}")

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    print(f"\nEvaluation complete. Results saved to {output_file}")

if __name__ == "__main__":
    # Directory searched for "*200predictions.json"; "" resolves to the
    # current working directory.
    PREDICTIONS_DIR = ""
    # Path to the human annotation JSON file.
    HUMAN_ANNOTATIONS = ""
    # Path the JSON report is written to.
    OUTPUT_FILE = ""
    # Taken from the environment so the secret is not stored in the source.
    DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY", "")

    # Fail fast: with any of these empty the run would only break later,
    # after scanning files or exhausting API retries.
    unset = [
        name
        for name, value in (
            ("HUMAN_ANNOTATIONS", HUMAN_ANNOTATIONS),
            ("OUTPUT_FILE", OUTPUT_FILE),
            ("DEEPSEEK_API_KEY", DEEPSEEK_API_KEY),
        )
        if not value
    ]
    if unset:
        raise SystemExit(f"Missing configuration: {', '.join(unset)}")

    main(PREDICTIONS_DIR, HUMAN_ANNOTATIONS, OUTPUT_FILE, DEEPSEEK_API_KEY)