from typing import Tuple, Optional
import re
import json


def build_criteria_prompt(query, response_1, response_2, criteria_n=10):
    """Build the prompt that asks a model to write pairwise evaluation criteria.

    Args:
        query: The user question; inserted verbatim after ``[User Question]:``.
        response_1: Assistant A's answer, inserted verbatim.
        response_2: Assistant B's answer, inserted verbatim.
        criteria_n: How many criteria to request. Must be one of the keys of
            ``number_words`` below (3, 5, 10, 15, ...).

    Returns:
        The formatted prompt string. The requested count is spelled out as an
        English word, and the output-format example is numbered up to
        ``criteria_n``.

    Raises:
        AssertionError: If ``criteria_n`` is not a supported count.
    """
    number_words = {
        3: "three",
        5: "five",
        10: "ten",
        15: "fifteen",
        20: "twenty",
        25: "twenty-five",
        30: "thirty",
        35: "thirty-five",
        45: "forty-five",
        50: "fifty",
        55: "fifty-five",
        60: "sixty",
    }
    if criteria_n not in number_words:
        # Raised explicitly instead of via ``assert`` so the check is not
        # stripped under ``python -O``. The exception type is unchanged so
        # existing callers keep working.
        raise AssertionError(
            f"criteria_n should be a number of {list(number_words.keys())}"
        )
    count_word = number_words[criteria_n]

    # Number the output-format example up to the requested count, so the
    # example never contradicts the "exactly N criteria" instruction.
    # For the default of 10 this yields the same text as before.
    total = int(criteria_n)
    item = "{}. Criterion name: Explanation"
    if total <= 4:
        example_lines = [item.format(i) for i in range(1, total + 1)]
    else:
        example_lines = [
            item.format(1),
            item.format(2),
            "...",
            item.format(total - 1),
            item.format(total),
        ]
    format_example = "\n".join(example_lines)

    template = """You are an expert in generating evaluation criteria. Given a user query and two assistant replies (which may contain text, images, audio, or video), your task is to create **exactly {criteria_n} evaluation criteria** that best distinguish the strengths and weaknesses of the two responses.

    ### Instructions:
    1. Carefully compare the two replies and identify meaningful differences in how they respond—this could be in reasoning, factual accuracy, structure, clarity, creativity, style, or multimodal use.
    2. Based on those differences, derive {criteria_n} **context-specific** criteria. Each should be able to help a third-party evaluator compare the two replies fairly and systematically.
    3. Each criterion should have a short, clear **name**, followed by a **brief explanation** of what it evaluates and why it matters in this context.

    ### Rules:
    - **Do not** evaluate or compare the two replies yourself.
    - **Do not** add any comments, summaries, or conclusions.
    - Only output the **numbered list** of {criteria_n} criteria with explanations.
    - Stay neutral, specific, and objective.
    
    ### Output format:\n{format_example}\n\n\n### Input:\n[User Question]:{query}\n[The Start of Assistant A's Answer]:{response_1}[The End of Assistant A's Answer]\n[The Start of Assistant B's Answer]:{response_2}[The End of Assistant B's Answer]"""
    return template.format(
        query=query,
        response_1=response_1,
        response_2=response_2,
        criteria_n=count_word,
        format_example=format_example,
    )


def build_judge_prompt(query, response_1, response_2, criteria):
    """Build the pairwise judging prompt with ``<Think>``/``<Judge X>`` output tags.

    Args:
        query: The user query, inserted under ``[User Query]``.
        response_1: Assistant A's answer.
        response_2: Assistant B's answer.
        criteria: Evaluation criteria text, inserted between the
            evaluation-criteria markers at the end of the prompt.

    Returns:
        The formatted prompt string.
    """
    fields = {
        "query": query,
        "response_1": response_1,
        "response_2": response_2,
        "criteria": criteria,
    }
    # The template is kept verbatim (including its mixed tab/space
    # indentation) because it is sent to the model as-is.
    template = """You are an expert evaluator. Your task is to assess how well each assistant response satisfies the user’s query, based strictly on the provided [Evaluation Criteria].

    ###Instructions:
	1.	Carefully read the user query, the two assistant responses, and the evaluation criteria.
	2.	For each response, analyze step-by-step how effectively it addresses the user’s query, as measured by the criteria. Support your analysis with clear, specific evidence from the content.
	3.	Assign an overall score to each response (integer from 1 to 10):
        + 1 = Does not satisfy the query at all based on the criteria
        + 6 = Partially satisfies the query
        + 10 = Fully satisfies the query with excellence across all criteria
    4. Your judgment must be strict, fair, and grounded in the criteria.
	5. Do not assume one response is better unless this is clearly justified by how well it satisfies the query under the criteria.
    
    ### Final Output Format:
    output your analysis and final decision:\n<Think>\n<Judge A>\n...</Judge A>\n<Judge B>\n...</Judge B>\n</Think>\nThe final verdict is `[[A]]` or `[[B]]` 
    
    Only output one of the above tags and nothing else in the final answer line.\n\n\n###Input\n[User Query]\n{query}\n\n[The Start of Assistant A's Answer]\n{response_1}\n[The End of Assistant A's Answer]\n\n[The Start of Assistant B's Answer]\n{response_2}\n[The End of Assistant B's Answer]\n####[The Begin of Evaluation Criteria]\n{criteria}\n[The End of Evaluation Criteria]\n"""
    return template.format(**fields)

def build_critique_prompt(query, response, criteria, judgment):
    """Build the prompt requesting issues and actionable revisions for one response.

    Args:
        query: The user query, inserted under ``[User Query]``.
        response: The assistant answer to be critiqued.
        criteria: Evaluation criteria text.
        judgment: Judgment text, inserted between the judgment markers.

    Returns:
        The formatted prompt string.
    """
    fields = {
        "query": query,
        "response": response,
        "criteria": criteria,
        "judgment": judgment,
    }
    # Template text is reproduced verbatim; only the four placeholders vary.
    template = """You are an expert evaluator and editor. Your task is to evaluate the assistant response(s) strictly against the provided [Evaluation Criteria] and [Judgment], and to produce concrete, actionable revision suggestions when the response is imperfect.

    ## Instructions:
        1.	Read the User Query, the assistant response, the [Evaluation Criteria], and the [Judgment].
        2.	For the provided assistant response, do the following:
            ### Issues: List problems based on [Evaluation Criteria] and [Judgment].
            ### Actionable Revisions:
                1. Provide some concrete fixes.
                2. Each fix must include:
                    + What to change (precise edit),
                    + Why (explicit link to the violated Criteria/Judgment),
                3. Do not provide a full rewritten response; only give revision suggestions.

    ⚠️ Note: Your evaluation must explicitly address both the Evaluation Criteria and the Judgment.
    ###Input\n[User Query]\n{query}\n\n[The Start of Assistant Answer]\n{response}\n[The End of Assistant Answer]\n\n\n####[The Begin of Evaluation Criteria]\n{criteria}\n[The End of Evaluation Criteria]\n\n\n####[The Begin of Judgment]\n{judgment}\n[The End of Judgment]\n"""
    return template.format(**fields)

def build_correct_prompt(prompt, response, critique):
    """Build the prompt asking a model to revise a response according to a critique.

    Args:
        prompt: The original user query, inserted under ``[User Query]``.
        response: The response text to be revised.
        critique: The critique text the revision must follow.

    Returns:
        The formatted prompt string.
    """
    # A separate local name is used for the template so the ``prompt``
    # argument is not rebound inside the function.
    template = """You are tasked with revising the assistant responses based on a user query and a provided critique.
    
    ### Instructions:
    - Make only objective and necessary edits that directly address the specific points raised in the critique.
    - Do **not** change any content that is not explicitly mentioned in the critique.
    - Do **not** introduce new ideas, rephrase unrelated sections, or make stylistic edits beyond what the critique specifies.
    - Your revisions must be accurate, minimal, and strictly aligned with the critique.

    ### Output Format:
    Return **only** the fully revised response. Do **not** include any explanations, comments, or metadata.

    ###Input\n[User Query]\n{instruction}\n\n[The Start of Responses]\n{response}\n[The End of Responses]\n\n[The Start of Critique]\n{critique}\n[The End of Critique]\n\n\nPlease return revised responses"""
    revised_request = template.format(
        instruction=prompt,
        response=response,
        critique=critique,
    )
    return revised_request

def build_judge_prompt_v2(query, response_1, response_2, criteria):
    """Build the v2 pairwise judging prompt with ``<judge A>``/``<judge B>`` sections.

    Args:
        query: The user query, inserted under ``[User Query]``.
        response_1: Assistant A's answer.
        response_2: Assistant B's answer.
        criteria: Evaluation criteria text, inserted between the
            evaluation-criteria markers at the end of the prompt.

    Returns:
        The formatted prompt string.
    """
    fields = {
        "query": query,
        "response_1": response_1,
        "response_2": response_2,
        "criteria": criteria,
    }
    # Template text is reproduced verbatim; only the four placeholders vary.
    template = """You are an expert evaluator. Your task is to assess how well each assistant response satisfies the user’s query, strictly based on the provided [Evaluation Criteria].

    ## Instructions:
        1.	Carefully read the user query, the two assistant responses, and the [Evaluation Criteria].
        2.	For each response, provide a step-by-step analysis of how effectively it addresses the user’s query in relation to the criteria. Support your evaluation with specific evidence from the response content.
        3.	Your judgment must be strict, fair, and explicitly grounded in the Evaluation Criteria.
        4.	Do not assume one response is better unless the evidence clearly shows it satisfies the query more effectively.
    
    ###  Final Output Format:
        1. Present your results in three sections using the following format:
            ### Final Output Format: <judge A>\n[Analysis of Response A: Evaluate against each Evaluation Criteria. Be explicit about strengths and weaknesses. Clearly state where improvements are needed and explain why. Assign an overall score from 1–10.]\n</judge A>\n<judge B>\n[Analysis of Response B: same requirements as above.]\n</judge B>\n# The Final Verdict is [[A]] or [[B]].
        2. Assign an overall score to each response (integer from 1 to 10):
            + 1 = Does not satisfy the query at all under the criteria
            + 6 = Partially satisfies the query
            + 10 = Fully satisfies the query with excellence across all criteria
        3. At the end, output your analysis and then give the final decision in this exact format: If Assistant A is better: explanation followed by [[A]]; If Assistant B is better: explanation followed by [[B]]. Only output one of the tags ([[A]] or [[B]]) on the final answer line, and nothing else.
    
    ###Input\n[User Query]\n{query}\n\n[The Start of Assistant A's Answer]\n{response_1}\n[The End of Assistant A's Answer]\n\n[The Start of Assistant B's Answer]\n{response_2}\n[The End of Assistant B's Answer]\n####[The Begin of Evaluation Criteria]\n{criteria}\n[The End of Evaluation Criteria]\n"""
    return template.format(**fields)

#### image
def build_criteria_omni_prompt_triple(prompt, criteria_n=10):
    """Build the criteria-generation prompt as three text segments.

    The segments are returned separately; the caller is expected to place
    Assistant A's answer between ``prefix`` and ``infix`` and Assistant B's
    answer between ``infix`` and ``suffix``.

    Args:
        prompt: The user question; inserted verbatim after ``[User Question]:``.
        criteria_n: How many criteria to request. Must be one of the keys of
            ``number_words`` below (3, 5, 10, 15, ...).

    Returns:
        A ``(prefix, infix, suffix)`` tuple of strings. ``prefix`` ends with
        the opening marker of Assistant A's answer, ``infix`` closes A and
        opens B, and ``suffix`` closes B.

    Raises:
        AssertionError: If ``criteria_n`` is not a supported count.
    """
    number_words = {
        3: "three",
        5: "five",
        10: "ten",
        15: "fifteen",
        20: "twenty",
        25: "twenty-five",
        30: "thirty",
        35: "thirty-five",
        45: "forty-five",
        50: "fifty",
        55: "fifty-five",
        60: "sixty",
    }
    if criteria_n not in number_words:
        # Raised explicitly instead of via ``assert`` so the check is not
        # stripped under ``python -O``. The exception type is unchanged so
        # existing callers keep working.
        raise AssertionError(
            f"criteria_n should be a number of {list(number_words.keys())}"
        )
    count_word = number_words[criteria_n]

    # Number the output-format example up to the requested count, so the
    # example never contradicts the "exactly N criteria" instruction.
    # For the default of 10 this yields the same text as before.
    total = int(criteria_n)
    item = "{}. Criterion name: Explanation"
    if total <= 4:
        example_lines = [item.format(i) for i in range(1, total + 1)]
    else:
        example_lines = [
            item.format(1),
            item.format(2),
            "...",
            item.format(total - 1),
            item.format(total),
        ]
    format_example = "\n".join(example_lines)

    prefix_template = """You are an expert in generating evaluation criteria. Given a user query and two assistant replies (which may contain text, images, audio, or video), your task is to create **exactly {criteria_n} evaluation criteria** that best distinguish the strengths and weaknesses of the two responses.

    ### Instructions:
    1. Carefully compare the two replies and identify meaningful differences in how they respond—this could be in reasoning, factual accuracy, structure, clarity, creativity, style, or multimodal use.
    2. Based on those differences, derive {criteria_n} **context-specific** criteria. Each should be able to help a third-party evaluator compare the two replies fairly and systematically.
    3. Each criterion should have a short, clear **name**, followed by a **brief explanation** of what it evaluates and why it matters in this context.

    ### Rules:
    - **Do not** evaluate or compare the two replies yourself.
    - **Do not** add any comments, summaries, or conclusions.
    - Only output the **numbered list** of {criteria_n} criteria with explanations.
    - Stay neutral, specific, and objective.
    
    ### Output format:\n{format_example}\n\n\n### Input:\n[User Question]:{instruction}\n[The Start of Assistant A's Answer]:"""
    prefix = prefix_template.format(
        instruction=prompt,
        criteria_n=count_word,
        format_example=format_example,
    )

    infix = """\n[The End of Assistant A's Answer]\n\n[The Start of Assistant B's Answer]\n"""

    suffix = """\n[The End of Assistant B's Answer]\n"""
    return prefix, infix, suffix
    
def build_criteria_omni_prompt_split(prompt, chosen, rejected, criteria_n=10):
    """Build the criteria-generation prompt as a fixed prefix and a filled-in suffix.

    Args:
        prompt: The user question; placed at the start of ``suffix``.
        chosen: Text inserted as Assistant A's answer.
        rejected: Text inserted as Assistant B's answer.
        criteria_n: How many criteria to request. Must be one of the keys of
            ``number_words`` below (3, 5, 10, 15, ...).

    Returns:
        A ``(prefix, suffix)`` tuple of strings. ``prefix`` holds the
        instructions and ends with ``[User Question]:``; ``suffix`` holds the
        question and both answers.

    Raises:
        AssertionError: If ``criteria_n`` is not a supported count.
    """
    number_words = {
        3: "three",
        5: "five",
        10: "ten",
        15: "fifteen",
        20: "twenty",
        25: "twenty-five",
        30: "thirty",
        35: "thirty-five",
        45: "forty-five",
        50: "fifty",
        55: "fifty-five",
        60: "sixty",
    }
    if criteria_n not in number_words:
        # Raised explicitly instead of via ``assert`` so the check is not
        # stripped under ``python -O``. The exception type is unchanged so
        # existing callers keep working.
        raise AssertionError(
            f"criteria_n should be a number of {list(number_words.keys())}"
        )
    # Use the spelled-out count, matching the other criteria prompt builders.
    # Previously the raw integer was formatted in and the looked-up word was
    # never used.
    count_word = number_words[criteria_n]

    # Number the output-format example up to the requested count, so the
    # example never contradicts the "exactly N criteria" instruction.
    # For the default of 10 this yields the same example text as before.
    total = int(criteria_n)
    item = "{}. Criterion name: Explanation"
    if total <= 4:
        example_lines = [item.format(i) for i in range(1, total + 1)]
    else:
        example_lines = [
            item.format(1),
            item.format(2),
            "...",
            item.format(total - 1),
            item.format(total),
        ]
    format_example = "\n".join(example_lines)

    prefix_template = """You are an expert in generating evaluation criteria. Given a user query and two assistant replies (which may contain text, images, audio, or video), your task is to create **exactly {criteria_n} evaluation criteria** that best distinguish the strengths and weaknesses of the two responses.

    ### Instructions:
    1. Carefully compare the two replies and identify meaningful differences in how they respond—this could be in reasoning, factual accuracy, structure, clarity, creativity, style, or multimodal use.
    2. Based on those differences, derive {criteria_n} **context-specific** criteria. Each should be able to help a third-party evaluator compare the two replies fairly and systematically.
    3. Each criterion should have a short, clear **name**, followed by a **brief explanation** of what it evaluates and why it matters in this context.

    ### Rules:
    - **Do not** evaluate or compare the two replies yourself.
    - **Do not** add any comments, summaries, or conclusions.
    - Only output the **numbered list** of {criteria_n} criteria with explanations.
    - Stay neutral, specific, and objective.
    
    ### Output format:\n{format_example}\n\n\n### Input:\n[User Question]:"""
    prefix = prefix_template.format(
        criteria_n=count_word,
        format_example=format_example,
    )

    suffix = """\n{instruction}\n\n[The Start of Assistant A's Answer]\n{response_1}\n[The End of Assistant A's Answer]\n\n[The Start of Assistant B's Answer]\n{response_2}\n[The End of Assistant B's Answer]\n""".format(
        instruction=prompt,
        response_1=chosen,
        response_2=rejected,
    )
    return prefix, suffix


def build_judge_omni_prompt_v2(query, response_1, response_2, criteria):
    """Build the v2 judging prompt as a static prefix and a filled-in suffix.

    Args:
        query: The user query; placed at the start of the suffix.
        response_1: Assistant A's answer.
        response_2: Assistant B's answer.
        criteria: Evaluation criteria text.

    Returns:
        A ``(prefix, suffix)`` tuple of strings. The prefix is the fixed
        instruction text ending with ``[User Query]``; the suffix carries the
        query, both answers, and the criteria.
    """
    # Fixed instruction text: contains no placeholders and is never formatted.
    instructions = """You are an expert evaluator. Your task is to assess how well each assistant response satisfies the user’s query, strictly based on the provided [Evaluation Criteria].

    ## Instructions:
        1.	Carefully read the user query, the two assistant responses, and the [Evaluation Criteria].
        2.	For each response, provide a step-by-step analysis of how effectively it addresses the user’s query in relation to the criteria. Support your evaluation with specific evidence from the response content.
        3.	Your judgment must be strict, fair, and explicitly grounded in the Evaluation Criteria.
        4.	Do not assume one response is better unless the evidence clearly shows it satisfies the query more effectively.
    
    ###  Final Output Format:
        1. Present your results in three sections using the following format:
            ### Final Output Format: <judge A>\n[Analysis of Response A: Evaluate against each Evaluation Criteria. Be explicit about strengths and weaknesses. Clearly state where improvements are needed and explain why. Assign an overall score from 1–10.]\n</judge A>\n<judge B>\n[Analysis of Response B: same requirements as above.]\n</judge B>\n# The Final Verdict is [[A]] or [[B]].
        2. Assign an overall score to each response (integer from 1 to 10):
            + 1 = Does not satisfy the query at all under the criteria
            + 6 = Partially satisfies the query
            + 10 = Fully satisfies the query with excellence across all criteria
        3. At the end, output your analysis and then give the final decision in this exact format: If Assistant A is better: explanation followed by [[A]]; If Assistant B is better: explanation followed by [[B]]. Only output one of the tags ([[A]] or [[B]]) on the final answer line, and nothing else.
    
    ###Input\n[User Query]\n"""

    fields = {
        "query": query,
        "response_1": response_1,
        "response_2": response_2,
        "criteria": criteria,
    }
    payload_template = """{query}\n\n[The Start of Assistant A's Answer]\n{response_1}\n[The End of Assistant A's Answer]\n\n[The Start of Assistant B's Answer]\n{response_2}\n[The End of Assistant B's Answer]\n####[The Begin of Evaluation Criteria]\n{criteria}\n[The End of Evaluation Criteria]\n"""
    payload = payload_template.format(**fields)
    return instructions, payload


def build_correct_omni_prompt(prompt, response, critique):
    """Build the revision prompt as a static prefix and a filled-in suffix.

    Args:
        prompt: The original user query; placed at the start of the suffix.
        response: The response text to be revised.
        critique: The critique text the revision must follow.

    Returns:
        A ``(prefix, suffix)`` tuple of strings. The prefix is the fixed
        instruction text ending with ``[User Query]``; the suffix carries the
        query, the response, and the critique.
    """
    # Fixed instruction text: contains no placeholders and is never formatted.
    instructions = """You are tasked with revising the assistant responses based on a user query and a provided critique.
    
    ### Instructions:
    - Make only objective and necessary edits that directly address the specific points raised in the critique.
    - Do **not** change any content that is not explicitly mentioned in the critique.
    - Do **not** introduce new ideas, rephrase unrelated sections, or make stylistic edits beyond what the critique specifies.
    - Your revisions must be accurate, minimal, and strictly aligned with the critique.

    ### Output Format:
    Return **only** the fully revised response. Do **not** include any explanations, comments, or metadata.

    ###Input\n[User Query]\n"""

    payload_template = """{instruction}\n\n[The Start of Responses]\n{response}\n[The End of Responses]\n\n[The Start of Critique]\n{critique}\n[The End of Critique]\n\n\nPlease return revised responses"""
    payload = payload_template.format(
        instruction=prompt,
        response=response,
        critique=critique,
    )
    return instructions, payload
