import json
import os
import glob
import warnings
import nltk
import numpy as np
from sentence_transformers import SentenceTransformer, util

# Request the NLTK 'punkt' resource at import time, so this runs on every
# import or execution of the module.
# NOTE(review): no other use of nltk is visible in this file — confirm this
# download is still needed.
nltk.download('punkt')
# Install an 'ignore' filter with no category or module restriction, which
# silences warnings for the rest of the process.
warnings.filterwarnings('ignore')

def load_data(model_path, human_path):
    """Pair model predictions with human annotations for the same image.

    Both files are read as UTF-8 JSON and are expected to hold a list of
    dicts. Records are matched on the base name of the image path: the
    ``"image_path"`` field of a model record against the ``"image"`` field
    of a human record.

    Args:
        model_path: Path to the JSON file with model predictions.
        human_path: Path to the JSON file with human annotations.

    Returns:
        A list with one dict per matched model record, holding the keys
        ``"model_r"``, ``"model_a"``, ``"human_r"`` and ``"human_a"``.
        A missing field defaults to ``""``. Model records without a usable
        ``"image_path"`` or without a matching human record are dropped.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If either file is not valid JSON.
    """
    with open(model_path, "r", encoding="utf-8") as f:
        model_data = json.load(f)
    with open(human_path, "r", encoding="utf-8") as f:
        human_data = json.load(f)

    # Index human annotations by image base name. Records with a missing or
    # empty "image" are skipped rather than raising, mirroring how model
    # records without "image_path" are handled below. When several records
    # share a base name, the last one wins.
    human_dict = {}
    for item in human_data:
        image = item.get("image")
        if not image:
            continue
        human_dict[os.path.basename(image)] = item

    aligned_data = []
    for model_item in model_data:
        model_image_path = model_item.get("image_path")
        if not model_image_path:
            continue
        human_item = human_dict.get(os.path.basename(model_image_path))
        if human_item is None:
            continue
        aligned_data.append({
            "model_r": model_item.get("model_reasoning_output", ""),
            "model_a": model_item.get("answer", ""),
            "human_r": human_item.get("reasoning", ""),
            "human_a": human_item.get("answer", ""),
        })
    return aligned_data

def calculate_sentencebert_score(references, candidates, model):
    """Return the mean cosine similarity of paired candidate/reference texts.

    Args:
        references: Sequence of reference texts.
        candidates: Sequence of candidate texts, paired with ``references``
            by position.
        model: Object exposing ``encode(texts, convert_to_tensor=True)``;
            ``main`` in this file passes a ``SentenceTransformer``.

    Returns:
        The mean of the per-pair cosine similarities, or ``0`` when there
        are no pairs to score.
    """
    reference_embeddings = model.encode(references, convert_to_tensor=True)
    candidate_embeddings = model.encode(candidates, convert_to_tensor=True)

    # Score each candidate only against the reference at the same position;
    # zip stops at the shorter of the two embedding sequences.
    similarities = []
    for cand_vec, ref_vec in zip(candidate_embeddings, reference_embeddings):
        similarities.append(util.cos_sim(cand_vec, ref_vec).item())

    if not similarities:
        return 0
    return np.mean(similarities)

def evaluate_sentencebert(dataset, model):
    """Score reasoning and answer similarity for an aligned dataset.

    Args:
        dataset: Iterable of dicts with the keys ``"human_r"``,
            ``"model_r"``, ``"human_a"`` and ``"model_a"``.
        model: Encoder forwarded to ``calculate_sentencebert_score``.

    Returns:
        A tuple ``(sbert_r, sbert_a, harmonic)``: the reasoning score, the
        answer score, and their harmonic mean. The harmonic mean is ``0``
        when the two scores sum to zero.
    """
    def _column(key):
        # Pull one field out of every record, in dataset order.
        return [entry[key] for entry in dataset]

    human_reasoning = _column("human_r")
    model_reasoning = _column("model_r")
    human_answers = _column("human_a")
    model_answers = _column("model_a")

    sbert_r = calculate_sentencebert_score(human_reasoning, model_reasoning, model)
    sbert_a = calculate_sentencebert_score(human_answers, model_answers, model)

    # Guard the division: the harmonic mean is undefined for a zero sum.
    total = sbert_r + sbert_a
    if total != 0:
        harmonic = 2 * sbert_r * sbert_a / total
    else:
        harmonic = 0
    return sbert_r, sbert_a, harmonic

def main(predictions_folder, human_file, output_file, model_name="paraphrase-MiniLM-L6-v2"):
    """Score every prediction file in a folder and write the results as JSON.

    Each file in ``predictions_folder`` matching ``*_200predictions.json``
    is aligned with ``human_file`` and scored. One line per scored file is
    printed, and all results are written to ``output_file``.

    Args:
        predictions_folder: Directory searched for prediction files.
        human_file: Path to the human-annotation JSON file.
        output_file: Path the JSON list of results is written to.
        model_name: Name passed to ``SentenceTransformer``.

    Returns:
        None. Files that yield no aligned records are skipped silently; a
        file that raises during processing is reported on stdout and
        skipped, so the remaining files are still scored.
    """
    sbert_model = SentenceTransformer(model_name)

    # glob.glob returns matches in arbitrary order; sort them so the console
    # log and the output JSON are ordered the same way on every run.
    prediction_files = sorted(
        glob.glob(os.path.join(predictions_folder, "*_200predictions.json"))
    )

    results = []
    for pred_file in prediction_files:
        # Best-effort batch run: one bad file must not abort the rest.
        try:
            dataset = load_data(pred_file, human_file)
            if not dataset:
                continue
            sbert_r, sbert_a, harmonic = evaluate_sentencebert(dataset, sbert_model)
            results.append({
                "model_file": os.path.basename(pred_file),
                "SentenceBERTScore-R": sbert_r,
                "SentenceBERTScore-A": sbert_a,
                "Harmonic Score": harmonic
            })
            print(f"{os.path.basename(pred_file)}: R={sbert_r:.4f}, A={sbert_a:.4f}, H={harmonic:.4f}")
        except Exception as e:
            print(f"Error processing {pred_file}: {e}")

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=4, ensure_ascii=False)

if __name__ == "__main__":
    # Script entry point. Both input paths are left empty here; set them to
    # the predictions folder and the human-annotation file before running.
    main(
        predictions_folder="",
        human_file="",
        output_file="model_results.json",
        model_name="paraphrase-MiniLM-L6-v2",
    )
