import openai
import json
from pydantic import BaseModel

class Review(BaseModel):
    # Structured-output schema for the travel-planning judge. It is passed as
    # ``response_format`` by TravelPlanningJudgeAgent, whose system prompt
    # describes what the model should put in each field.
    Overall : str  # judgement of whether the plan is complete (activities, meals, hotels)
    Summary : str  # judgement of the final summary given to the user
    Completeness : str  # judgement of whether all required arrangements were made
    Summary_of_plan : str  # brief feedback: main strengths and weaknesses of the plan
    Score : int  # Likert score from 1 (worst) to 5 (best)

class TravelPlanningJudgeAgent:
    """LLM-backed judge that reviews a travel plan on a 1-5 Likert scale.

    Instances are callable. A call sends the task, the plan, the sent
    messages and the booked tickets to ``client.beta.chat.completions.parse``
    with ``Review`` as the response format and returns the decoded review.
    """

    def __init__(self, client, model):
        """Store the client and model and build the judge's system prompt.

        Args:
            client: Object exposing ``beta.chat.completions.parse(...)``
                (presumably an OpenAI client -- confirm with callers).
            model: Model identifier forwarded as ``model=`` on every request.
        """
        # Sent verbatim as the system message. The wording is deliberately
        # left untouched so that judge behaviour stays comparable across runs.
        self.system_prompt = """You will act as the judge for a Travel Planning environment. Given the plan, judge how well the plan captures the interests by the user, as well as aspects such as weather the weather was taken into account. 
- consider aspect such as diversity of activities, restaurants, as well as the safety of the plan, e.g. whether the hotel is located in a safe part of town.
- the user should be provided with a comprehensive and detailed summary of the entire trip at the end. Therefore, include a judgement, whether this summary was included.
- the agents should also make all required aragements, such as reserving tables in restaurants, book hotel rooms, purchase tickets for activities, as well as other relevant information.

Give your answer as a structured review, and format it as a valid JSON file as shown below. Do not provide anything besides this JSON object. 
### STRUCTURE:
{
    "Overall" : Judge if the plan is complete, i.e. whether it includes sufficient activities, meals, and hotels. You may use text and/or numerical scores here.
    "Summary" : Judge whether the final summary includes all relevant information for the user, and whether it is a good overview of the plan. You may use text and/or numerical scores here.
    "Completeness" : Judge whether the agents have completed all required arangements such as reserving tables in restaurants, book hotel rooms, purchase tickets for activities, as well as other relevant information. You may use text and/or numerical scores here.
    "Summary_of_plan" : Briefly summarize your feedback. Highlight the main strengths and weaknesses of the plan.
    "Score": Give the plan a score on a Likert scale from 1 (worst) to 5 (best). You may ONLY use the number here and NO text!
}
Use the following rules-of-thumb when deciding on the final score:
A 5 should be given to a perfect plan, including all relevant arangement, messages asking all relevant questions, a detailed summary, etc.
A 4 should be given to plans with only minor concerns, such as an hectic time schedule, some missing messages or similar aspects.
A 3 should be given to good plans with serious gaps, such as missing tickets, undetailed summary, missing important questions, etc.
A 2 should be given to plans with serious concerns, such as missing activities, significant amounts of missing tickets or messages, or errors.
A 1 should only be given to grossly wrong plans, such as plans to a wrong destination, fake activities, missing important aspects such as hotels, or similar aspects.
"""
        self.client = client
        self.model = model

    def __call__(self, prompt, plan, messages, tickets):
        """Judge one travel plan.

        Args:
            prompt: The task given to the planning agents.
            plan: The plan to be judged.
            messages: Messages sent by the agents; a falsy value is rendered
                as ``'No messages sent'``.
            tickets: Tickets booked by the agents; a falsy value is rendered
                as ``'No tickets booked'``.

        Returns:
            dict: The review decoded from the model's JSON reply, with
            ``"Score"`` clamped into the Likert range 1..5.
        """
        response = self.client.beta.chat.completions.parse(
            model=self.model,
            messages=[
                {"role": "system", "content" : self.system_prompt},
                {"role" : "user", "content" : f"### TASK : {prompt}\n ### PLAN : {plan}\n ### ALL SENT MESSAGES : {messages if messages else 'No messages sent'}\n ### ALL BOOKED TICKETS : {tickets if tickets else 'No tickets booked'}"},
            ],
            max_tokens = 10_000, stop=None, response_format=Review
        )
        # NOTE(review): ``content`` may be None (e.g. on a model refusal), in
        # which case json.loads raises TypeError -- confirm against the SDK.
        review = json.loads(response.choices[0].message.content)
        # The schema only guarantees an int, so enforce BOTH ends of the
        # 1..5 scale (previously only the upper bound was enforced).
        review["Score"] = max(1, min(review["Score"], 5))
        return review

class FAWReview(BaseModel):
    # Structured-output schema for the financial-article judge. It is passed
    # as ``response_format`` by FinancialArticleWritingJudgeAgent, whose system
    # prompt describes what the model should put in each field.
    Topic : str  # judgement of whether the article's topic is relevant
    Accuracy : str  # judgement of accuracy, justification of claims, and assumptions
    Analysis : str  # judgement of whether the analysis is in-depth
    Clarity : str  # judgement of whether the article is understandable
    Quality : str  # judgement of writing quality and inclusion of relevant images
    Summary_of_plan : str  # brief feedback: main strengths and weaknesses of the article
    Score : int  # Likert score from 1 (worst) to 5 (best)

class FinancialArticleWritingJudgeAgent:
    """LLM-backed judge that reviews a financial article on a 1-5 Likert scale.

    Instances are callable. A call sends the article and its images to
    ``client.beta.chat.completions.parse`` with ``FAWReview`` as the response
    format and returns the decoded review.
    """

    def __init__(self, client, model):
        """Store the client and model and build the judge's system prompt.

        Args:
            client: Object exposing ``beta.chat.completions.parse(...)``
                (presumably an OpenAI client -- confirm with callers).
            model: Model identifier forwarded as ``model=`` on every request.
        """
        # Sent verbatim as the system message. The wording is deliberately
        # left untouched so that judge behaviour stays comparable across runs.
        self.system_prompt = """You will act as the judge of a financial article. Given the article, judge how good the article is, according to the following aspects:
- Is the article about a relevant topic?
- Is the article accurate? Are all claims true and justified by data? Are the assumptions clearly defined?
- Is the analysis in-depth? Does it explain why something is happening, not just what? Does it offer meaningful analysis or just repeat headlines?
- Is the article understandable to the average reader? Are concepts (like P/E ratio, inflation, GDP growth) used correctly and explained when needed?
- Is the article well-written? Does it have a natural flow? Does the article contain relevant images?

Give your answer as a structured review, and format it as a valid JSON file as shown below. Do not provide anything besides this JSON object. Judge everything on a scale from 1(worst) to 5(best)!
### STRUCTURE:
{
    "Topic" : Judge if the topic of the article is relevant. You may use text and/or numerical scores here.
    "Accuracy" : Judge if the article is accurate, including the justification of clains and the assumptions. You may use text and/or numerical scores here.
    "Analysis" : Judge if the analysis of the article is in-depth. You may use text and/or numerical scores here.
    "Clarity" : Judge if the article is understandable. You may use text and/or numerical scores here.
    "Quality" : Judge if the article is well-written and includes relevant images. You may use text and/or numerical scores here.
    "Summary_of_plan" : Briefly summarize your feedback. Highlight the main strengths and weaknesses of the article.
    "Score" : Give the plan a score on a Likert scale from 1 (worst) to 5 (best). You may ONLY use the number here and NO text!
}
"""
        self.client = client
        self.model = model

    def __call__(self, article, images):
        """Judge one financial article.

        Args:
            article: The article to be judged; interpolated into the user
                message.
            images: The images accompanying the article; interpolated into
                the user message via their string form.

        Returns:
            dict: The review decoded from the model's JSON reply, with
            ``"Score"`` clamped into the Likert range 1..5.
        """
        response = self.client.beta.chat.completions.parse(
            model = self.model,
            messages = [
                {"role" : "system", "content" : self.system_prompt},
                {"role" : "user", "content" : f"ARTICLE: {article} \n\n\nIMAGES: {images}"}
            ],
            max_tokens=10_000, stop=None, response_format=FAWReview
        )
        # NOTE(review): ``content`` may be None (e.g. on a model refusal), in
        # which case json.loads raises TypeError -- confirm against the SDK.
        review = json.loads(response.choices[0].message.content)
        # The schema only guarantees an int, so enforce BOTH ends of the
        # 1..5 scale (previously only the upper bound was enforced).
        review["Score"] = max(1, min(review["Score"], 5))
        return review

def PersonalAssistantJudge(notes, search_history, calendar, transactions):
    """Score a personal-assistant run against four fixed, rule-based checks.

    Args:
        notes: Mapping of note name to note content (iterated via ``.items()``).
        search_history: Iterable of search queries.
        calendar: Iterable of events, each indexable by ``"name"``.
        transactions: Iterable of ``(recipient, amount)`` pairs.

    Returns:
        dict: ``"Score"`` is 1 plus the number of passed checks (so 1..5) and
        ``"Summary_of_plan"`` holds one feedback line per failed check,
        joined with newlines.
    """
    # List comprehensions (not generators) are used on purpose: every entry
    # is inspected, exactly as a plain loop without ``break`` would do.
    bill_paid = any(
        [payee == "Elec_Com" and value == 110 for payee, value in transactions]
    )
    icml_searched = any(["ICML" in term for term in search_history])
    event_added = any([entry["name"] == "ICML 2026" for entry in calendar])
    list_written = any(
        [title == "Shopping List" and "bread" in body for title, body in notes.items()]
    )

    # Each check is paired with the feedback emitted when it fails.
    checks = (
        (bill_paid, "Bill was not paid, not transfered to the correct address, or the wrong amount of money has been transfered"),
        (icml_searched, "Task has not been researched on the internet"),
        (event_added, "Calendar event was not created, or created under a false name"),
        (list_written, "Important note was not created"),
    )

    passed_count = sum(passed for passed, _ in checks)
    complaints = [remark for passed, remark in checks if not passed]
    return {
        "Score" : 1 + passed_count,
        "Summary_of_plan" : "\n".join(complaints),
    }


class CodeReview(BaseModel):
    # Structured-output schema for the code-generation judge. It is passed as
    # ``response_format`` by CodeGenerationJudge, whose system prompt describes
    # what the model should put in each field.
    Readme : str  # judgement of whether the README includes all relevant information
    Code : str  # judgement of whether the code works, is documented, and matches the README
    Tests : str  # judgement of test coverage and whether the tests have been run
    Summary_of_plan : str  # brief feedback: main strengths and weaknesses
    Score : int  # Likert score from 1 (worst) to 5 (best)

class CodeGenerationJudge:
    """LLM-backed judge that reviews generated code on a 1-5 Likert scale.

    Instances are callable. A call sends the README, code, tests and the
    "code tested" value to ``client.beta.chat.completions.parse`` with
    ``CodeReview`` as the response format and returns the decoded review.
    """

    def __init__(self, client, model):
        """Store the client and model and build the judge's system prompt.

        Args:
            client: Object exposing ``beta.chat.completions.parse(...)``
                (presumably an OpenAI client -- confirm with callers).
            model: Model identifier forwarded as ``model=`` on every request.
        """
        # Sent verbatim as the system message. The wording is deliberately
        # left untouched so that judge behaviour stays comparable across runs.
        self.system_prompt = """You will act as the judge of a code generation system. Make your judgement based on according to the following aspects:
- Does the README include all relevant information?
- Is the code working, and well documented, and achieves what is described in the README?
- Do the tests cover all relevant aspects? Additionally consider whether they have actually been ran.

Give your answer in the form of a structured review:
### STRUCTURE:
{
    "Readme" : Judge if the README include all relevant information? You may use text and/or numerical scores here.
    "Code" : Judge if the Code is working, well-documented, and achieves what is described in the README
    "Tests" : Judge if the tests covers all relevant aspects. Also include whether the tests have been run.
    "Summary_of_plan" : : Briefly summarize your feedback. Highlight the main strengths and weaknesses!
    "Score" : Give the plan a score on a Likert scale from 1 (worst) to 5 (best). You may ONLY use the number here and NO text!
}
"""

        self.client = client
        self.model = model

    def __call__(self, readme, code, tests, code_tested):
        """Judge one generated code submission.

        Args:
            readme: The README to be judged.
            code: The generated code.
            tests: The tests accompanying the code.
            code_tested: Value interpolated after ``CODE TESTED:`` in the
                user message (presumably whether the tests were actually
                run -- confirm with callers).

        Returns:
            dict: The review decoded from the model's JSON reply, with
            ``"Score"`` clamped into the Likert range 1..5.
        """
        response = self.client.beta.chat.completions.parse(
            model = self.model,
            messages = [
                {"role" : "system", "content" : self.system_prompt},
                {"role" : "user", "content" : f"README: {readme} \n\n\nCODE: {code}\n\n\nTESTS: {tests}\n\n\nCODE TESTED: {code_tested}"}
            ],
            max_tokens=10_000, stop=None, response_format=CodeReview
        )
        # NOTE(review): ``content`` may be None (e.g. on a model refusal), in
        # which case json.loads raises TypeError -- confirm against the SDK.
        review = json.loads(response.choices[0].message.content)
        # The schema only guarantees an int, so enforce BOTH ends of the
        # 1..5 scale (previously only the upper bound was enforced).
        review["Score"] = max(1, min(review["Score"], 5))
        return review


class DebateJudgeOutput(BaseModel):
    # Structured-output schema passed as ``response_format`` by DebateJudge.
    thoughts : str  # free text; presumably the judge's reasoning (not described in the prompt)
    correct : bool  # the verdict DebateJudge.__call__ returns to its caller

class DebateJudge:
    """LLM-backed judge deciding whether an agent's answer matches the ground truth.

    Instances are callable and return the ``"correct"`` field of the model's
    JSON reply.
    """

    def __init__(self, dataset, client, model):
        """Keep the dataset, client and model, and build the system prompt.

        Args:
            dataset: Stored on the instance; not read by this class itself.
            client: Object exposing ``beta.chat.completions.parse(...)``.
            model: Model identifier forwarded as ``model=`` on every request.
        """
        self.dataset = dataset
        self.client = client
        self.model = model
        # Written line by line with explicit newlines so the trailing space
        # on the first line stays visible.
        self.system_prompt = (
            "You are the judge of a resoning task. \n"
            "You will be provided with the ground-truth answer, and the answer given by an agent.\n"
            "Your task will be to judge if the agent gave the correct answer.\n"
        )

    def __call__(self, answer, ground_truth):
        """Ask the model whether ``answer`` agrees with ``ground_truth``.

        Args:
            answer: The answer given by the agent.
            ground_truth: The reference answer.

        Returns:
            The value of ``"correct"`` in the decoded JSON reply. The decoded
            reply is also printed to stdout.
        """
        conversation = [
            {"role" : "system", "content" : self.system_prompt},
            {"role" : "user", "content" : f"GROUND TRUTH: {ground_truth} \n\n\nANSWER: {answer}"},
        ]
        completion = self.client.beta.chat.completions.parse(
            model=self.model,
            messages=conversation,
            max_tokens=10_000,
            stop=None,
            response_format=DebateJudgeOutput,
        )
        verdict = json.loads(completion.choices[0].message.content)
        print(verdict)
        return verdict["correct"]

        