import json
import os
import glob
import warnings
from bert_score import score

# Install a blanket "ignore" filter: with no category/module/message given,
# this suppresses every warning raised in the process from here on.
# NOTE(review): presumably meant to keep third-party library warnings out of
# the console output; it also hides warnings from this script itself.
warnings.filterwarnings('ignore')

def load_data(model_path, human_path):
    """Pair model predictions with human annotations for the same image.

    Both files are read as UTF-8 JSON and iterated as sequences of dicts.
    Records are matched on the base name of the image file: the prediction's
    ``"image_path"`` against the annotation's ``"image"``.

    Args:
        model_path: Path to the JSON file with model predictions.
        human_path: Path to the JSON file with human annotations.

    Returns:
        A list with one dict per matched prediction, in prediction order,
        holding ``"model_r"``, ``"model_a"`` (the prediction's
        ``"model_reasoning_output"`` / ``"model_answer_output"``) and
        ``"human_r"``, ``"human_a"`` (the annotation's ``"reasoning"`` /
        ``"answer"``).

    Raises:
        KeyError: If an annotation has no ``"image"`` key, or a matched
            record lacks one of the text fields listed above.
    """
    with open(model_path, "r", encoding="utf-8") as model_fh:
        predictions = json.load(model_fh)
    with open(human_path, "r", encoding="utf-8") as human_fh:
        annotations = json.load(human_fh)

    # Index annotations by bare file name. When two annotations share a
    # base name, the later one replaces the earlier one.
    annotation_by_name = {}
    for annotation in annotations:
        annotation_by_name[os.path.basename(annotation["image"])] = annotation

    pairs = []
    for prediction in predictions:
        image_path = prediction.get("image_path")
        if image_path:
            reference = annotation_by_name.get(os.path.basename(image_path))
            # Predictions without a (non-empty) annotation are dropped.
            if reference:
                pairs.append(dict(
                    model_r=prediction["model_reasoning_output"],
                    model_a=prediction["model_answer_output"],
                    human_r=reference["reasoning"],
                    human_a=reference["answer"],
                ))
    return pairs

def evaluate_bertscore(dataset, lang='en'):
    """Compute mean BERTScore F1 for reasoning and answer texts.

    Args:
        dataset: Sequence of dicts with the keys ``"model_r"``,
            ``"model_a"``, ``"human_r"`` and ``"human_a"``. Model texts are
            passed to ``score`` as candidates, human texts as references.
        lang: Language code forwarded to ``score``.

    Returns:
        A 3-tuple ``(reasoning_f1, answer_f1, harmonic)``: the mean F1 over
        the reasoning pairs, the mean F1 over the answer pairs, and the
        harmonic mean of those two averages (``0`` when they sum to zero).
        An empty ``dataset`` yields ``(0.0, 0.0, 0.0)`` without calling
        ``score``.
    """
    if not dataset:
        return 0.0, 0.0, 0.0

    # Collect all four text columns up front, before any scoring starts.
    reasoning_refs = [entry["human_r"] for entry in dataset]
    reasoning_cands = [entry["model_r"] for entry in dataset]
    answer_refs = [entry["human_a"] for entry in dataset]
    answer_cands = [entry["model_a"] for entry in dataset]

    # Only the third value returned by score (the F1 scores) is used.
    _p, _r, reasoning_f1 = score(reasoning_cands, reasoning_refs, lang=lang, verbose=False)
    _p, _r, answer_f1 = score(answer_cands, answer_refs, lang=lang, verbose=False)

    mean_reasoning = reasoning_f1.mean().item()
    mean_answer = answer_f1.mean().item()

    total = mean_reasoning + mean_answer
    if total != 0:
        harmonic = 2 * mean_reasoning * mean_answer / total
    else:
        # Guard against division by zero when both averages cancel out.
        harmonic = 0
    return mean_reasoning, mean_answer, harmonic

def main(predictions_folder, human_file, output_file, lang='en'):
    """Score every prediction file in a folder and write a JSON report.

    Each ``*.json`` file directly inside ``predictions_folder`` is aligned
    with the human annotations via ``load_data`` and scored via
    ``evaluate_bertscore``. Files are processed in sorted path order so the
    report and any logged errors come out in the same order on every run.

    Args:
        predictions_folder: Directory containing the prediction JSON files.
        human_file: Path to the human annotation JSON file.
        output_file: Path of the JSON report to write (overwritten).
        lang: Language code forwarded to ``evaluate_bertscore``.

    Returns:
        None. The report is a JSON list with one object per scored file,
        holding ``"model_file"``, ``"BERTScore-R"``, ``"BERTScore-A"`` and
        ``"Harmonic Score"``. Files that yield no aligned pairs are left out;
        files that raise are reported on stdout and left out.
    """
    # glob returns matches in arbitrary, filesystem-dependent order; sorting
    # makes the order of report entries reproducible across runs and machines.
    prediction_files = sorted(
        glob.glob(os.path.join(predictions_folder, "*.json"))
    )
    results = []

    for pred_file in prediction_files:
        # Best-effort batch: one broken prediction file must not abort the
        # whole evaluation, so failures are logged and the loop moves on.
        try:
            dataset = load_data(pred_file, human_file)
            if not dataset:
                continue
            bert_r, bert_a, harmonic = evaluate_bertscore(dataset, lang=lang)
            results.append({
                "model_file": os.path.basename(pred_file),
                "BERTScore-R": bert_r,
                "BERTScore-A": bert_a,
                "Harmonic Score": harmonic
            })
        except Exception as e:
            print(f"Error processing {pred_file}: {e}")

    # The report is written even when no file could be scored (empty list).
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=4, ensure_ascii=False)

if __name__ == "__main__":
    # Placeholder locations: the predictions directory, the human annotation
    # file and the report destination are all left blank here.
    predictions_folder, human_file, output_file = "", "", ""
    main(predictions_folder, human_file, output_file, lang='en')


