# eval.py
# Main script to compute NEAR scores and baseline comparisons

import argparse
import json
import os

import pandas as pd
from datasets import load_dataset
from scipy.stats import kendalltau, pearsonr
from sklearn.metrics import roc_auc_score
from transformers import AutoModelForCausalLM, AutoTokenizer

from shapley_near.attribution import compute_shapley_near_score
from shapley_near.model_utils import load_model_and_tokenizer


def evaluate_near_scores(near_scores, true_labels):
    """Score NEAR values against reference labels with three metrics.

    Args:
        near_scores: Sequence of per-example NEAR scores.
        true_labels: Sequence of reference labels, aligned with
            ``near_scores``.

    Returns:
        A ``(auroc, tau, pcc)`` tuple: the ROC AUC of the scores with
        respect to the labels, the Kendall tau statistic, and the Pearson
        correlation coefficient. The second element (p-value) of each
        correlation result is discarded.
    """
    discrimination = roc_auc_score(true_labels, near_scores)
    # Both correlation results are two-element (statistic, p-value) tuples;
    # only the statistic is reported.
    rank_correlation = kendalltau(near_scores, true_labels)[0]
    linear_correlation = pearsonr(near_scores, true_labels)[0]
    return discrimination, rank_correlation, linear_correlation


def _read_examples(file_path):
    """Yield ``(context, question, label)`` triples from a JSON Lines file.

    Blank lines are skipped so that a trailing newline or a spacer line in
    the dataset does not abort the run with a ``json.JSONDecodeError``.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            ex = json.loads(line)
            yield ex["context"], ex["question"], ex["label"]


def main(
    dataset_folder="datasets",
    model_names=("Qwen/Qwen1.5-1.8B", "facebook/opt-6.7b"),
    dataset_files=("coqa.jsonl", "quac.jsonl", "squad.jsonl", "triviaqa.jsonl"),
    m=20,
):
    """Compute NEAR scores for every model/dataset pair and print metrics.

    Args:
        dataset_folder: Directory that contains the dataset files.
        model_names: Identifiers passed to ``from_pretrained`` for both the
            tokenizer and the causal language model.
        dataset_files: JSON Lines file names inside ``dataset_folder``; each
            record must provide ``context``, ``question`` and ``label`` keys.
        m: Value forwarded as the ``M`` keyword argument of
            ``compute_shapley_near_score``.

    Returns:
        A dict mapping each model name to a dict that maps each evaluated
        dataset file name to its ``(auroc, tau, pcc)`` tuple. Dataset files
        that yield no examples are omitted.
    """
    results = {}
    for model_name in model_names:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
        model.eval()

        print(f"\nEvaluating model: {model_name}")
        model_results = results[model_name] = {}
        for file_name in dataset_files:
            file_path = os.path.join(dataset_folder, file_name)
            near_scores = []
            labels = []

            for context, question, label in _read_examples(file_path):
                score = compute_shapley_near_score(
                    model, tokenizer, context, question, M=m
                )
                near_scores.append(score)
                labels.append(label)

            # The metrics are undefined without data; report and move on
            # instead of crashing inside the metric computation.
            if not near_scores:
                print(f"{file_name}: no examples found, skipping")
                continue

            auroc, tau, pcc = evaluate_near_scores(near_scores, labels)
            model_results[file_name] = (auroc, tau, pcc)
            print(
                f"{file_name}: AUROC = {auroc:.3f}, "
                f"Kendall Tau = {tau:.3f}, Pearson = {pcc:.3f}"
            )

        # Drop the references before the next model is loaded so two sets of
        # weights are not kept alive at the same time.
        del model, tokenizer

    return results


if __name__ == "__main__":
    main()

