import pandas as pd
import json
import os
from openai import OpenAI

# Build the module-wide OpenAI client, reading the key from the environment.
# NOTE(review): when OPENAI_API_KEY is unset, the fallback below is the literal
# string "OPENAI_API_KEY" — presumably a placeholder rather than a usable key;
# confirm before relying on it.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY", "OPENAI_API_KEY"))


def run_rating(prompt, response):
    """Ask the chat model to score a summary against its original text.

    The model is prompted to rate the summary for coherence, accuracy and
    coverage on a 1-7 scale and to answer with a JSON object.

    Args:
        prompt: The original text that was summarized.
        response: The summary to evaluate.

    Returns:
        A ``(raw_output, scores)`` tuple. ``raw_output`` is the model's reply
        text, or ``None`` if the request failed before a reply was obtained.
        ``scores`` is the value parsed from the reply with ``json.loads``, or
        ``None`` if the request or the parsing failed.
    """
    evaluation_prompt = f"""
    Evaluate the given summary on the following criteria using a scale of 1 to 7:

    1. coherence: How coherent is the summary on its own?
    2. accuracy: Does the factual information in the summary accurately match the original text?
    3. coverage: How well does the summary cover the important information in the original text?

    Use the following rubrics for scoring:

    coherence:
    1: The summary is impossible to understand.
    4: The summary has mistakes or confusing phrasing that make it a bit hard to understand.
    7: The summary is perfectly clear.

    accuracy:
    1: The summary is completely wrong, made up, or exactly contradicts what is written in the original text.
    4: The summary says at least one substantial thing that is not mentioned in the original text, or that contradicts something in the original text.
    5: The summary says anything, no matter how small, that is not mentioned in the original text, or that contradicts something in the original text.
    7: The summary has no incorrect statements or misleading implications.

    coverage:
    1: The summary contains no information relevant to the original text.
    4: The summary is missing at least 1 important piece of information required to understand the situation.
    7: The summary covers all of the important information required to understand the situation.

    Return the answer as a JSON object with the following fields: coherence, accuracy, coverage.

    Text to evaluate:
    Original text: {prompt}
    Summary: {response}
    """

    # Bind before the try block: the except branch returns raw_output, and it
    # would otherwise be undefined whenever the API call itself raises.
    raw_output = None
    try:
        chat_completion = client.chat.completions.create(
            model="gpt-4",
            messages=[{"role": "user", "content": evaluation_prompt}],
            temperature=0,
        )

        raw_output = chat_completion.choices[0].message.content
        scores = json.loads(raw_output)
        return raw_output, scores
    except Exception as e:
        # Best-effort: report the failure and keep whatever raw text was
        # obtained so the caller can still record it.
        print(f"Error: {e}")
        return raw_output, None


# Load the data
response_data = pd.read_csv("data.csv")

# Rubric dimensions expected as keys in each parsed rating.
dimensions = ["coherence", "accuracy", "coverage"]
# Pairs are compared from the first model's point of view: a "win" means the
# first model scored strictly higher than the second on that dimension.
comparison_pairs = [
    ("base", "explainable"),
    ("base", "blackbox"),
    ("explainable", "blackbox"),
]

results = {
    pair: {dim: {"wins": 0, "ties": 0, "losses": 0} for dim in dimensions}
    for pair in comparison_pairs
}


def _has_all_dimensions(scores):
    """Return True if *scores* is a dict with an entry for every dimension."""
    return isinstance(scores, dict) and all(dim in scores for dim in dimensions)


for index, row in response_data.iterrows():
    prompt = row["prompt"]
    base = row["response (base)"]
    explainable = row["response (explainable)"]
    blackbox = row["response (blackbox)"]

    base_rating, base_scores = run_rating(prompt, base)
    explainable_rating, explainable_scores = run_rating(prompt, explainable)
    blackbox_rating, blackbox_scores = run_rating(prompt, blackbox)

    # Always record the raw model replies, even when parsing failed.
    response_data.loc[index, "base_rating"] = base_rating
    response_data.loc[index, "explainable_rating"] = explainable_rating
    response_data.loc[index, "blackbox_rating"] = blackbox_rating

    # Explicit name -> scores mapping; replaces looking variables up by name
    # through eval().
    scores_by_model = {
        "base": base_scores,
        "explainable": explainable_scores,
        "blackbox": blackbox_scores,
    }

    # Only rows where all three ratings parsed into complete score dicts are
    # scored and compared; a reply missing a dimension is skipped instead of
    # raising KeyError and aborting the run.
    if all(_has_all_dimensions(scores) for scores in scores_by_model.values()):
        for dim in dimensions:
            response_data.loc[index, f"base_{dim}"] = base_scores[dim]
            response_data.loc[index, f"explainable_{dim}"] = explainable_scores[dim]
            response_data.loc[index, f"blackbox_{dim}"] = blackbox_scores[dim]

        for pair in comparison_pairs:
            model1, model2 = pair
            scores1 = scores_by_model[model1]
            scores2 = scores_by_model[model2]

            for dim in dimensions:
                if scores1[dim] > scores2[dim]:
                    results[pair][dim]["wins"] += 1
                elif scores1[dim] < scores2[dim]:
                    results[pair][dim]["losses"] += 1
                else:
                    results[pair][dim]["ties"] += 1

    # The output file is rewritten after every row (presumably so partial
    # progress survives a failure on a later row).
    response_data.to_csv("saved_data.csv", index=False)

print("Evaluation complete. Results saved to 'saved_data.csv'.")

for pair in comparison_pairs:
    print(f"\nComparison: {pair[0]} vs {pair[1]}")
    for dim in dimensions:
        wins = results[pair][dim]["wins"]
        ties = results[pair][dim]["ties"]
        losses = results[pair][dim]["losses"]
        print(f"{dim.capitalize()}: Wins: {wins}, Ties: {ties}, Losses: {losses}")
