
from typing import Tuple, Optional
import textwrap
from utils import criteria2str

# ---------------------------
# Prompt builders
# ---------------------------

# def build_direct_judge_prompt(
#     instruction: str, response_a: str, response_b: str, modality="language", task="U", think: bool=True, xargs: dict={}
# ):

#     if modality == "language":
#         build_language_prompt(instruction, response_a, response_b, think)
#     elif modality == "image" or modality == "video" or modality == "audio":
#         build_omni_prompt(instruction, response_a, response_b, task, xargs, think)
#     else:
#         raise NotImplementedError

# The templates below are stored fully laid out and values are substituted
# afterwards.  Interpolating first and dedenting second (the previous approach)
# let multi-line user content reset the common margin to zero, which left every
# template line indented by the source-code indentation.
_LANGUAGE_JUDGE_INPUT_SECTION = (
    "### Input Format\n"
    "[User Question]\n"
    "{instruction}\n"
    "\n"
    "[The Start of Assistant A's Answer]\n"
    "{response_a}\n"
    "[The End of Assistant A's Answer]\n"
    "\n"
    "[The Start of Assistant B's Answer]\n"
    "{response_b}\n"
    "[The End of Assistant B's Answer]\n"
    "\n"
)

_LANGUAGE_JUDGE_THINK_TEMPLATE = (
    "You are a fair, professional, and neutral multimodal AI evaluator.\n"
    "You are tasked with evaluating two different multimodal responses (which may include Text, Image, Video, and Audio) generated for the same user query, and determining which one is better.\n"
    "Based on the overall analysis, clearly determine which response is superior.\n"
    "\n"
    "### Important Notes:\n"
    "1. Stay completely neutral: Do not be influenced by the order, length, writing style, or the assistant’s name.\n"
    "2. Do not favor responses simply because they are longer or use more elaborate language.\n"
    "\n"
    "### Final Output Format\n"
    "After completing the full analysis, you must output the final verdict in the following format:\n"
    "If you believe Assistant A’s response is better, output: [[A]]\n"
    "If you believe Assistant B’s response is better, output: [[B]].\n"
    "\n"
    + _LANGUAGE_JUDGE_INPUT_SECTION
    + "Please output your analysis and final verdict:"
)

_LANGUAGE_JUDGE_DIRECT_TEMPLATE = (
    "You are a fair, professional, and neutral multimodal AI evaluator.\n"
    "You are tasked with evaluating two different multimodal responses (which may include Text, Image, Video, and Audio) generated for the same user query, and determining which one is better.\n"
    "\n"
    "### Please directly output the final judgment in the following format:\n"
    "If you believe Assistant A’s response is better, output: The final verdict is [[A]]\n"
    # The trailing space after the period is part of the original prompt text.
    "If you believe Assistant B’s response is better, output: The final verdict is [[B]]. \n"
    "\n"
    + _LANGUAGE_JUDGE_INPUT_SECTION
    + "Please output your final verdict:"
)


def build_language_prompt(
    instruction: str, response_a: str, response_b: str, think: bool
) -> str:
    """Build the pairwise judging prompt for two candidate answers.

    Args:
        instruction: The user question both answers respond to.
        response_a: Text inserted as Assistant A's answer.
        response_b: Text inserted as Assistant B's answer.
        think: When truthy, the prompt asks for an analysis followed by the
            ``[[A]]``/``[[B]]`` verdict; otherwise it asks for the verdict only.

    Returns:
        The prompt text with no leading or trailing whitespace.  The inserted
        values are placed verbatim, including any newlines or indentation they
        contain.
    """
    template = (
        _LANGUAGE_JUDGE_THINK_TEMPLATE if think else _LANGUAGE_JUDGE_DIRECT_TEMPLATE
    )
    # str.format does not re-scan substituted values, so braces inside the
    # user-provided text are safe.
    return template.format(
        instruction=instruction,
        response_a=response_a,
        response_b=response_b,
    )


def build_stepwise_criteria_prompt(
    instruction: str,
    response_1: str,
    response_2: str,
    criteria_hisotry: Optional[str] = "",
) -> str:
    """Build the prompt that asks for exactly three new evaluation criteria.

    Args:
        instruction: The user question both answers respond to.
        response_1: Text inserted as Assistant A's answer.
        response_2: Text inserted as Assistant B's answer.
        criteria_hisotry: Previously proposed criteria that the new ones must
            not overlap with.  ``None`` and ``""`` both mean "no candidates".
            (The misspelled name is kept so keyword callers keep working.)

    Returns:
        The prompt text with surrounding whitespace stripped.
    """
    # Optional[str] permits None; without this it would be rendered as the
    # literal text "None" between the candidate-criteria markers.
    candidates = "" if criteria_hisotry is None else criteria_hisotry

    prompt = (
        "You are an expert in generating evaluation criteria. Given a user query and two assistant replies (which may include Text, Image, Video, and Audio), your task is to create **exactly three evaluation criteria** that best distinguish the strengths and weaknesses of the two responses.\n"
        "### Instructions:\n"
        "1. Carefully compare the two replies and identify meaningful differences in how they respond, these could involve reasoning, factual accuracy, structure, clarity, creativity, style, or multimodal use.\n"
        "2. From those differences, derive three context-specific criteria. Each criterion should help a third-party evaluator compare the two replies fairly and systematically.\n"
        "3. Ensure that none of your criteria overlap with the given candidate criteria. If no candidate criteria are provided, propose the three criteria you consider most important.\n"
        "4. For each criterion, provide a short, clear name, followed by a brief explanation of what it evaluates and why it matters in this context.\n"
        "\n"
        "### Output format:<criteria 1>xxx</criteria 1>\n"
        "<criteria 2>xxx</criteria 2>\n"
        "<criteria 3>xxx</criteria 3>\n"
        "\n"
        "### Input:[User Question]:\n"
        f"{instruction}\n"
        "\n"
        "[The Start of Assistant A's Answer]:\n"
        f"{response_1}\n"
        "[The End of Assistant A's Answer]\n"
        "\n"
        "[The Start of Assistant B's Answer]: \n"
        f"{response_2}\n"
        "[The End of Assistant B's Answer]\n"
        "\n"
        "[The Candidate Criteria Start]: \n"
        f"{candidates}\n"
        "[The Candidate Criteria End]"
    )
    # The first line is unindented, so dedent() has no common margin to remove;
    # the call is kept because it still blanks whitespace-only lines, which is
    # what this function has always emitted.
    return textwrap.dedent(prompt).strip()

def build_stepwise_judge_prompt(
    query: str, response_1: str, response_2: str, candidate_criteria: str
) -> str:
    """Build the criteria-grounded judging prompt for two candidate answers.

    The prompt asks for a per-criterion analysis of each answer inside
    ``<Think>`` tags, followed by a ``[[A]]``/``[[B]]`` verdict.

    Args:
        query: The user query both answers respond to.
        response_1: Text inserted as Assistant A's answer.
        response_2: Text inserted as Assistant B's answer.
        candidate_criteria: Text inserted between the evaluation-criteria markers.

    Returns:
        The prompt text with surrounding whitespace stripped.
    """
    rubric = (
        "You are a fair, professional, and neutral multimodal AI evaluator.\n"
        "You are tasked with evaluating two different multimodal responses (which may include Text, Image, Video, and Audio) generated for the same user query, and determining which one is better, based strictly on the provided [Evaluation Criteria].\n"
        "\n"
        "###Instructions:\n"
        "1. Carefully read the user query, the two assistant responses, and the evaluation criteria.\n"
        "2. For each response, analyze step-by-step how effectively it addresses the user’s query, as measured by the criteria. Support your analysis with clear, specific evidence from the content.\n"
        "3. Your judgment must be strict, fair, and grounded in the criteria.\n"
        "4. Do not assume one response is better unless this is clearly justified by how well it satisfies the query under the criteria.\n"
        "\n"
        "### Final Output Format:\n"
        # The eight leading spaces are part of the emitted prompt: dedent()
        # cannot remove them because the first line has no indentation.
        "        output your analysis and final decision:\n"
        "<Think>\n"
        "<Judge A>\n"
        "<criteria 1>xxx</criteria 1>\n"
        "...</Judge A>\n"
        "<Judge B>\n"
        "<criteria 1>xxx</criteria 1>\n"
        "...</Judge B>\n"
        "</Think>\n"
        "The final verdict is `[[A]]` or `[[B]]` \n"
        "\n"
    )
    sections = (
        "### Input\n"
        "[User Query]\n"
        "{query}\n"
        "\n"
        "[The Start of Assistant A's Answer]\n"
        "{response_1}\n"
        "[The End of Assistant A's Answer]\n"
        "\n"
        "[The Start of Assistant B's Answer]\n"
        "{response_2}\n"
        "[The End of Assistant B's Answer]\n"
        "\n"
        "[The Start of Evaluation Criteria]\n"
        "{candidate_criteria}\n"
        "[The End of Evaluation Criteria]\n"
        "\n"
        "Please output your analysis and final verdict:"
    ).format(
        query=query,
        response_1=response_1,
        response_2=response_2,
        candidate_criteria=candidate_criteria,
    )
    # Same post-processing as before: dedent the fully rendered text, then strip.
    rendered = textwrap.dedent(rubric + sections)
    return rendered.strip()

def build_correct_prompt(query: str, response: str, judge: str) -> str:
    """Build the prompt asking a model to revise a response per a critique.

    Args:
        query: The user query the response was written for.
        response: Text inserted between the response markers.
        judge: Critique text inserted between the critique markers.

    Returns:
        The prompt text with surrounding whitespace stripped.
    """
    guidelines = (
        "You are tasked with revising the assistant responses based on a user query and a provided critique.\n"
        "\n"
        " ### Instructions:\n"
        "1. Make only objective and necessary edits that directly address the specific points raised in the critique.\n"
        "2. Do **not** change any content that is not explicitly mentioned in the critique.\n"
        "3. Do **not** introduce new ideas, rephrase unrelated sections, or make stylistic edits beyond what the critique specifies.\n"
        "4. Your revisions must be accurate, minimal, and strictly aligned with the critique.\n"
        "\n"
        "### Output Format:\n"
        "Return **only** the fully revised response. Do **not** include any explanations, comments, or metadata.\n"
        "\n"
    )
    payload = (
        "###Input\n"
        "[User Query]\n"
        "{query}\n"
        "\n"
        "[The Start of Responses]\n"
        "{response}\n"
        "[The End of Responses]\n"
        "\n"
        "[The Start of Critique]\n"
        "{judge}\n"
        "[The End of Critique]\n"
        "\n"
        "\n"
        "Please return revised responses:"
    ).format(query=query, response=response, judge=judge)
    # Same post-processing as before: dedent the rendered text, then strip.
    rendered = textwrap.dedent(guidelines + payload)
    return rendered.strip()

def build_correct_split_prompt(query: str, response: str, judge: str, task="und", think=True) -> Tuple[str, str]:
    """Build the revision prompt as a ``(prefix, suffix)`` pair.

    The prefix is the fixed instruction text up to and including the
    ``[User Query]`` marker; the suffix carries the query, the response, the
    critique and the closing request.

    Args:
        query: The user query the response was written for.
        response: Text inserted between the response markers.
        judge: Critique text inserted between the critique markers.
        task: Only ``"und"`` is supported.
        think: Only a truthy value is supported.

    Returns:
        ``(prefix, suffix)``; concatenating them yields the full prompt.

    Raises:
        ValueError: If ``task`` is not ``"und"``, or if ``think`` is falsy.
    """
    if task != "und":
        raise ValueError(f"Unknown task: {task}")
    if not think:
        # Previously reported as "Unknown task", which pointed at the wrong
        # argument; the exception type is unchanged for existing callers.
        raise ValueError(f"think=False is not supported for task: {task}")

    prefix = (
        "You are tasked with revising the assistant responses based on a user query and a provided critique.\n"
        "\n"
        " ### Instructions:\n"
        "1. Make only objective and necessary edits that directly address the specific points raised in the critique.\n"
        "2. Do **not** change any content that is not explicitly mentioned in the critique.\n"
        "3. Do **not** introduce new ideas, rephrase unrelated sections, or make stylistic edits beyond what the critique specifies.\n"
        "4. Your revisions must be accurate, minimal, and strictly aligned with the critique.\n"
        "\n"
        "### Output Format:\n"
        "Return **only** the fully revised response. Do **not** include any explanations, comments, or metadata.\n"
        "\n"
        "###Input\n"
        "[User Query]\n"
    )
    suffix = (
        f"{query}\n"
        "\n"
        "[The Start of Responses]\n"
        f"{response}\n"
        "[The End of Responses]\n"
        "\n"
        "[The Start of Critique]\n"
        f"{judge}\n"
        "[The End of Critique]\n"
        "\n"
        "\n"
        "Please return revised responses:"
    )
    return prefix, suffix

def build_direct_judge_prompt(query: str, response_1: str, response_2: str, candidate_criteria: str, think=True) -> str:
    """Build the single-string judging prompt for two candidate answers.

    Args:
        query: The user question both answers respond to.
        response_1: Text inserted as Assistant A's answer.
        response_2: Text inserted as Assistant B's answer.
        candidate_criteria: Existing criteria the model must not duplicate.
            Only used when ``think`` is truthy.
        think: When truthy, the prompt asks the model to derive three criteria,
            judge both answers under them and then give the verdict; otherwise
            it asks for the verdict only.

    Returns:
        The prompt text.  The verdict-only variant ends with a newline and has
        no closing request line.
    """
    if think:
        # NOTE: "hele" (sic) and the lowercase "your task" are part of the
        # prompt text as it has always been emitted; they are left untouched.
        return (
            "You are a fair, professional, and neutral multimodal AI evaluator.\n"
            "You are tasked with evaluating two different multimodal responses (which may include Text, Image, Video, and Audio) generated for the same user query, and determining which one is better.\n"
            "your task is to create **exactly three evaluation criteria** that best distinguish the strengths and weaknesses of the two responses. And give your judgement based on the three criteria provided by you.\n"
            "\n"
            "### Instructions:\n"
            "1. Carefully compare the two replies and identify meaningful differences in how they respond, these could involve reasoning, factual accuracy, structure, clarity, creativity, style, or multimodal use.\n"
            "2. From those differences, derive three context-specific criteria. Each criterion should hele you compare the two replies fairly and systematically.\n"
            "3. Ensure that none of your criteria overlap with the given candidate criteria. If no candidate criteria are provided, propose three criteria you consider most important.\n"
            "4. For each criterion, provide a short, clear name, followed by a brief explanation of what it evaluates and why it matters in this context.\n"
            "\n"
            "### Output format:\n"
            "<think>\n"
            "<criteria 1>xxxx. <Judge A></Judge A><Judge B></Judge B></criteria 1>...</think>\n"
            "The final verdict is `[[A]]` or `[[B]]`\n"
            "\n"
            "### Input:[User Question]:\n"
            f"{query}\n"
            "\n"
            "[The Start of Assistant A's Answer]:\n"
            f"{response_1}\n"
            "[The End of Assistant A's Answer]\n"
            "\n"
            "[The Start of Assistant B's Answer]: \n"
            f"{response_2}\n"
            "[The End of Assistant B's Answer]\n"
            "\n"
            "[The Candidate Criteria Start]: \n"
            f"{candidate_criteria}\n"
            "[The Candidate Criteria End]\n"
            "Please output your analysis and final verdict:"
        )

    return (
        "You are a fair, professional, and neutral multimodal AI evaluator.\n"
        "You are tasked with evaluating two different multimodal responses (which may include Text, Image, Video, and Audio) generated for the same user query, and determining which one is better.\n"
        "\n"
        # The four leading spaces are part of the emitted prompt.
        "    ### Final Output Format:\n"
        "Please directly output the final verdict in the following format:\n"
        "The final verdict is `[[A]]` or `[[B]]`\n"
        "\n"
        "\n"
        "###Input Format\n"
        "[User Question]\n"
        f"{query}\n"
        "\n"
        "[The Start of Assistant A's Answer]\n"
        f"{response_1}\n"
        "[The End of Assistant A's Answer]\n"
        "\n"
        "[The Start of Assistant B's Answer]\n"
        f"{response_2}\n"
        "[The End of Assistant B's Answer]\n"
    )

def build_direct_judge_split_prompt(query: str, response_1: str, response_2: str, candidate_criteria: str, task="und", think=True) -> Tuple[str, ...]:
    """Build the judging prompt as text segments split around the inputs.

    Args:
        query: The user question both answers respond to.
        response_1: Text that opens the segment describing Assistant A's answer.
        response_2: Text that opens the segment describing Assistant B's answer.
        candidate_criteria: Existing criteria the model must not duplicate.
            Only used when ``think`` is truthy.
        task: ``"und"`` returns ``(prefix, suffix)`` split just before the
            query; ``"gen"`` returns ``(prefix, infix, suffix)`` split just
            before each response.
        think: Truthy selects the criteria-then-verdict prompt, falsy the
            verdict-only prompt.

    Returns:
        A 2-tuple for ``"und"`` or a 3-tuple for ``"gen"``.  For a given
        ``think`` value, joining the segments gives the same text for both tasks.

    Raises:
        ValueError: If ``task`` is neither ``"und"`` nor ``"gen"``.
    """
    if task not in ("und", "gen"):
        raise ValueError(f"Unknown task: {task}")

    if think:
        # NOTE: "hele" (sic) is part of the prompt text as it has always been
        # emitted; it is left untouched.
        head = (
            "You are a fair, professional, and neutral multimodal AI evaluator.\n"
            "You are tasked with evaluating two different multimodal responses (which may include Text, Image, Video, and Audio) generated for the same user query, and determining which one is better.\n"
            "your task is to create **exactly three evaluation criteria** that best distinguish the strengths and weaknesses of the two responses. And give your judgement based on the three criteria provided by you.\n"
            "\n"
            "### Instructions:\n"
            "1. Carefully compare the two replies and identify meaningful differences in how they respond, these could involve reasoning, factual accuracy, structure, clarity, creativity, style, or multimodal use.\n"
            "2. From those differences, derive three context-specific criteria. Each criterion should hele you compare the two replies fairly and systematically.\n"
            "3. Ensure that none of your criteria overlap with the given candidate criteria. If no candidate criteria are provided, propose three criteria you consider most important.\n"
            "4. For each criterion, provide a short, clear name, followed by a brief explanation of what it evaluates and why it matters in this context.\n"
            "\n"
            "### Output format:\n"
            "<think>\n"
            "<criteria 1>xxxx. <Judge A></Judge A><Judge B></Judge B></criteria 1>...</think>\n"
            "The final verdict is `[[A]]` or `[[B]]`\n"
            "\n"
            "### Input:[User Question]:"
        )
        query_part = f"\n{query}\n\n[The Start of Assistant A's Answer]:\n"
        answer_a_part = (
            f"{response_1}\n"
            "[The End of Assistant A's Answer]\n"
            "\n"
            "[The Start of Assistant B's Answer]: \n"
        )
        answer_b_part = (
            f"{response_2}\n"
            "[The End of Assistant B's Answer]\n"
            "\n"
            "[The Candidate Criteria Start]: \n"
            f"{candidate_criteria}\n"
            "[The Candidate Criteria End]\n"
            "Please output your analysis and final verdict:"
        )
    else:
        head = (
            "You are a fair, professional, and neutral multimodal AI evaluator.\n"
            "You are tasked with evaluating two different multimodal responses (which may include Text, Image, Video, and Audio) generated for the same user query, and determining which one is better.\n"
            "\n"
            # The four leading spaces are part of the emitted prompt.
            "    ### Final Output Format:\n"
            "Please directly output the final verdict in the following format:\n"
            "The final verdict is `[[A]]` or `[[B]]`\n"
            "\n"
            "\n"
            "###Input Format\n"
            "[User Question]\n"
        )
        query_part = f"{query}\n\n[The Start of Assistant A's Answer]\n"
        answer_a_part = (
            f"{response_1}\n"
            "[The End of Assistant A's Answer]\n"
            "\n"
            "[The Start of Assistant B's Answer]\n"
        )
        answer_b_part = f"{response_2}\n[The End of Assistant B's Answer]\n"

    if task == "und":
        # Single split point: everything from the query onwards is the suffix.
        return head, query_part + answer_a_part + answer_b_part
    # "gen": the query stays in the prefix; split before each response.
    return head + query_part, answer_a_part, answer_b_part



def build_direct_judge_without_candidate_prompt(query: str, response_1: str, response_2: str, think=True) -> str:
    """Build the single-string judging prompt that takes no candidate criteria.

    The prompt asks the model to derive three criteria itself, judge both
    answers under each of them and finish with a ``[[A]]``/``[[B]]`` verdict.

    Args:
        query: The user question both answers respond to.
        response_1: Text inserted as Assistant A's answer.
        response_2: Text inserted as Assistant B's answer.
        think: Must be truthy; the verdict-only variant is not implemented.

    Returns:
        The prompt text.

    Raises:
        NotImplementedError: If ``think`` is falsy.
    """
    if not think:
        raise NotImplementedError

    guidance = (
        "You are a fair, professional, and neutral multimodal AI evaluator.\n"
        "You are tasked with evaluating two different multimodal responses (Text, Image, Video, Audio) generated for the same user query, and determining which one is better.\n"
        "\n"
        "### Instructions\n"
        "1. Comparison Basis. Carefully compare the two responses and identify meaningful differences in how they address the user’s query. Differences could involve: reasoning, factual accuracy, clarity, structure, creativity, or use of multimodal elements.\n"
        "2. Criteria Creation. From these differences, derive exactly three evaluation criteria that best distinguish the strengths and weaknesses of the two responses. Each criterion should have a short name and a brief explanation of what it evaluates and why it matters. Ensure the criteria do not overlap with each other.\n"
        "3. Evaluation. For each criterion, analyze Response A and Response B step by step. Support your evaluation with clear, specific evidence from the responses. Maintain a strict, fair, and grounded judgment.\n"
        "4. Verdict. Your final decision must be `[[A]]` or `[[B]]` (no ties allowed). Base your verdict solely on which response performs better under the three criteria.\n"
        "\n"
        "### Output Format\n"
        "<think>\n"
        "<criteria 1>Name. Explanation. \n"
        "<Judge A>Analysis of A.</Judge A>\n"
        "<Judge B>Analysis of B.</Judge B>\n"
        "Based on the criteria, the verdict is [[A]] or [[B]]</criteria 1>\n"
        "...\n"
        "</think> The final verdict is [[A]] or [[B]]\n"
        "\n"
    )
    inputs = (
        "### Input:[User Question]:\n"
        "{query}\n"
        "\n"
        "[The Start of Assistant A's Answer]:\n"
        "{response_1}\n"
        "[The End of Assistant A's Answer]\n"
        "\n"
        "[The Start of Assistant B's Answer]: \n"
        "{response_2}\n"
        "[The End of Assistant B's Answer]\n"
        "Please output your analysis and final verdict:"
    ).format(query=query, response_1=response_1, response_2=response_2)
    return guidance + inputs

def build_direct_judge_split_without_candidate_prompt(query: str, response_1: str, response_2: str, task="und", think=True) -> Tuple[str, ...]:
    """Build the no-candidate judging prompt as segments split around the inputs.

    Args:
        query: The user question both answers respond to.
        response_1: Text that opens the segment describing Assistant A's answer.
        response_2: Text that opens the segment describing Assistant B's answer.
        task: ``"und"`` returns ``(prefix, suffix)`` split just before the
            query; ``"gen"`` returns ``(prefix, infix, suffix)`` split just
            before each response.
        think: Truthy selects the criteria-then-verdict prompt, falsy the
            verdict-only prompt.

    Returns:
        A 2-tuple for ``"und"`` or a 3-tuple for ``"gen"``.

    Raises:
        ValueError: If ``task`` is neither ``"und"`` nor ``"gen"``.
    """
    if task not in ("und", "gen"):
        raise ValueError(f"Unknown task: {task}")

    if think:
        head = (
            "You are a fair, professional, and neutral multimodal AI evaluator.\n"
            "You are tasked with evaluating two different multimodal responses (Text, Image, Video, Audio) generated for the same user query, and determining which one is better.\n"
            "\n"
            "### Instructions\n"
            "1. Comparison Basis. Carefully compare the two responses and identify meaningful differences in how they address the user’s query. Differences could involve: reasoning, factual accuracy, clarity, structure, creativity, or use of multimodal elements.\n"
            "2. Criteria Creation. From these differences, derive exactly three evaluation criteria that best distinguish the strengths and weaknesses of the two responses. Each criterion should have a short name and a brief explanation of what it evaluates and why it matters. Ensure the criteria do not overlap with each other.\n"
            "3. Evaluation. For each criterion, analyze Response A and Response B step by step. Support your evaluation with clear, specific evidence from the responses. Maintain a strict, fair, and grounded judgment.\n"
            "4. Verdict. Your final decision must be `[[A]]` or `[[B]]` (no ties allowed). Base your verdict solely on which response performs better under the three criteria.\n"
            "\n"
            "### Output Format\n"
            "<think>\n"
            "<criteria 1>Name. Explanation. \n"
            "<Judge A>Analysis of A.</Judge A>\n"
            "<Judge B>Analysis of B.</Judge B>\n"
            "Based on the criteria, the verdict is [[A]] or [[B]]</criteria 1>\n"
            "...\n"
            "</think> The final verdict is [[A]] or [[B]]\n"
            "\n"
            "### Input:[User Question]:"
        )
        query_part = f"\n{query}\n\n[The Start of Assistant A's Answer]:\n"
        answer_a_close = f"{response_1}\n[The End of Assistant A's Answer]\n\n"
        answer_b_part = (
            f"{response_2}\n"
            "[The End of Assistant B's Answer]\n"
            "Please output your analysis and final verdict:"
        )
        if task == "und":
            answer_b_open = "[The Start of Assistant B's Answer]: \n"
        else:
            # NOTE(review): the "gen" variant has always emitted this marker
            # without the trailing ": " used by "und" -- looks like a copy from
            # the verdict-only prompt.  Preserved byte-for-byte; confirm before
            # normalising, since consumers may depend on the exact text.
            answer_b_open = "[The Start of Assistant B's Answer]\n"
    else:
        head = (
            "You are a fair, professional, and neutral multimodal AI evaluator.\n"
            "You are tasked with evaluating two different multimodal responses (which may include Text, Image, Video, and Audio) generated for the same user query, and determining which one is better.\n"
            "\n"
            # The four leading spaces are part of the emitted prompt.
            "    ### Final Output Format:\n"
            "Please directly output the final verdict in the following format:\n"
            "The final verdict is `[[A]]` or `[[B]]`\n"
            "\n"
            "\n"
            "###Input Format\n"
            "[User Question]\n"
        )
        query_part = f"{query}\n\n[The Start of Assistant A's Answer]\n"
        answer_a_close = f"{response_1}\n[The End of Assistant A's Answer]\n\n"
        answer_b_open = "[The Start of Assistant B's Answer]\n"
        answer_b_part = f"{response_2}\n[The End of Assistant B's Answer]\n"

    answer_a_part = answer_a_close + answer_b_open
    if task == "und":
        # Single split point: everything from the query onwards is the suffix.
        return head, query_part + answer_a_part + answer_b_part
    # "gen": the query stays in the prefix; split before each response.
    return head + query_part, answer_a_part, answer_b_part

# def build_exploration_criteria_prompt(query: str, response_1: str, response_2: str, candidate_criteria: str, think=True) -> str:
#     assert isinstance(candidate_criteria, str)
#     if think:
#         if candidate_criteria == "":
#             prompt=f"""You are an expert in generating evaluation criteria. Given a user query and two assistant replies (which may include Text, Image, Video, and Audio), your task is to create **exactly 10 evaluation criteria** that best distinguish the strengths and weaknesses of the two responses.\n### Instructions:\n1. Carefully compare the two replies and identify meaningful differences in how they respond, these could involve reasoning, factual accuracy, structure, clarity, creativity, style, or multimodal use.\n2. From those differences, derive 10 context-specific criteria. Each criterion should help a third-party evaluator compare the two replies fairly and systematically.\n3. Ensure that none of your criteria overlap with the given candidate criteria. If no candidate criteria are provided, propose the 10 criteria you consider most important.\n4. For each criterion, provide a short, clear name, followed by a brief explanation of what it evaluates and why it matters in this context.\n\n### Output format:<criteria 1>Name. Explanation.</criteria 1>\n...\n<criteria 10>Name. Explanation.</criteria 10>\n\n### Input:[User Question]:\n{query}\n\n[The Start of Assistant A's Answer]:\n{response_1}\n[The End of Assistant A's Answer]\n\n[The Start of Assistant B's Answer]: \n{response_2}\n[The End of Assistant B's Answer]\n\n[The Candidate Criteria Start]: \n{candidate_criteria}\n[The Candidate Criteria End]"""
#         else:
#             prompt=f"""You are an expert in generating evaluation criteria. Given a user query and two assistant replies (which may include Text, Image, Video, and Audio), your task is to create **exactly 7 evaluation criteria** that best distinguish the strengths and weaknesses of the two responses.\n### Instructions:\n1. Carefully compare the two replies and identify meaningful differences in how they respond, these could involve reasoning, factual accuracy, structure, clarity, creativity, style, or multimodal use.\n2. From those differences, derive 7 context-specific criteria. Each criterion should help a third-party evaluator compare the two replies fairly and systematically.\n3. Ensure that none of your criteria overlap with the given candidate criteria. If no candidate criteria are provided, propose the 10 criteria you consider most important.\n4. For each criterion, provide a short, clear name, followed by a brief explanation of what it evaluates and why it matters in this context.\n\n### Output format:<criteria 1>Name. Explanation.</criteria 1>\n...\n<criteria 7>Name. Explanation.</criteria 7>\n\n### Input:[User Question]:\n{query}\n\n[The Start of Assistant A's Answer]:\n{response_1}\n[The End of Assistant A's Answer]\n\n[The Start of Assistant B's Answer]: \n{response_2}\n[The End of Assistant B's Answer]\n\n[The Candidate Criteria Start]: \n{candidate_criteria}\n[The Candidate Criteria End]"""

#     else:
#         raise NotImplementedError
#     return prompt
def build_exploration_criteria_prompt(query: str, response_1: str, response_2: str, candidate_criteria: str, think=True) -> str:
    """Build the prompt that asks for a larger batch of evaluation criteria.

    Ten criteria are requested when ``candidate_criteria`` is empty or only
    whitespace, seven otherwise.

    Args:
        query: The user question both answers respond to.
        response_1: Text inserted as Assistant A's answer.
        response_2: Text inserted as Assistant B's answer.
        candidate_criteria: Existing criteria, inserted verbatim between the
            candidate-criteria markers.  Must be a ``str``.
        think: Must be truthy; no other variant is implemented.

    Returns:
        The prompt text.

    Raises:
        AssertionError: If ``candidate_criteria`` is not a ``str``.
        NotImplementedError: If ``think`` is falsy.
    """
    assert isinstance(candidate_criteria, str)
    if not think:
        raise NotImplementedError

    has_no_candidates = candidate_criteria.strip() == ""
    count = 10 if has_no_candidates else 7

    task_and_rules = (
        f"You are an expert in generating evaluation criteria. Given a user query and two assistant replies (which may include Text, Image, Video, and Audio), your task is to create **exactly {count} evaluation criteria** that best distinguish the strengths and weaknesses of the two responses.\n"
        "### Instructions:\n"
        "1. Carefully compare the two replies and identify meaningful differences in how they respond, these could involve reasoning, factual accuracy, structure, clarity, creativity, style, or multimodal use.\n"
        f"2. From those differences, derive {count} context-specific criteria. Each criterion should help a third-party evaluator compare the two replies fairly and systematically.\n"
        # This sentence says "10" in both variants of the prompt.
        "3. Ensure that none of your criteria overlap with the given candidate criteria. If no candidate criteria are provided, propose the 10 criteria you consider most important.\n"
        "4. For each criterion, provide a short, clear name, followed by a brief explanation of what it evaluates and why it matters in this context.\n"
        "\n"
    )
    output_spec = (
        "### Output format:<criteria 1>**Name**: Explanation</criteria 1>\n"
        "...\n"
        f"<criteria {count}>**Name**: Explanation</criteria {count}>\n"
        "\n"
    )
    inputs = (
        "### Input:[User Question]:\n"
        f"{query}\n"
        "\n"
        "[The Start of Assistant A's Answer]:\n"
        f"{response_1}\n"
        "[The End of Assistant A's Answer]\n"
        "\n"
        "[The Start of Assistant B's Answer]: \n"
        f"{response_2}\n"
        "[The End of Assistant B's Answer]\n"
        "\n"
        "[The Candidate Criteria Start]: \n"
        f"{candidate_criteria}\n"
        "[The Candidate Criteria End]"
    )
    return task_and_rules + output_spec + inputs

def build_exploration_judge_prompt(query: str, response_1: str, response_2: str, based_criterion: str) -> str:
    """Build the pairwise judging prompt that is grounded in explicit criteria.

    The returned prompt asks the judge to analyse both answers against the
    supplied evaluation criteria, score each from 1 to 10, and finish with a
    ``[[A]]`` or ``[[B]]`` verdict tag.

    Args:
        query: The user query both assistants answered.
        response_1: Assistant A's answer.
        response_2: Assistant B's answer.
        based_criterion: Evaluation criteria text, inserted verbatim between
            the criteria begin/end markers of the prompt.

    Returns:
        The fully formatted prompt string.

    Raises:
        TypeError: If ``based_criterion`` is not a ``str``.
    """
    # Raise explicitly instead of using ``assert`` so the guard is still
    # enforced when Python runs with ``-O`` (which strips assert statements).
    if not isinstance(based_criterion, str):
        raise TypeError(
            f"based_criterion must be a str, got {type(based_criterion).__name__}"
        )

    # The template contains exactly four replacement fields; the ``\n``
    # sequences inside it are ordinary escapes and become real newlines.
    template = """You are an expert evaluator. Your task is to assess how well each assistant response satisfies the user’s query, strictly based on the provided [Evaluation Criteria].

    ## Instructions:
        1.	Carefully read the user query, the two assistant responses, and the [Evaluation Criteria].
        2.	For each response, provide a step-by-step analysis of how effectively it addresses the user’s query in relation to the criteria. Support your evaluation with specific evidence from the response content.
        3.	Your judgment must be strict, fair, and explicitly grounded in the Evaluation Criteria.
        4.	Do not assume one response is better unless the evidence clearly shows it satisfies the query more effectively.
    
    ###  Final Output Format:
        1. Present your results in three sections using the following format:
            ### Final Output Format: <Judge A>\n[Analysis of Response A: Evaluate against each Evaluation Criteria. Be explicit about strengths and weaknesses. Clearly state where improvements are needed and explain why. Assign an overall score from 1–10.]\n</Judge A>\n<Judge B>\n[Analysis of Response B: same requirements as above.]\n</Judge B>\n# The Final Verdict is [[A]] or [[B]].
        2. Assign an overall score to each response (integer from 1 to 10):
            + 1 = Does not satisfy the query at all under the criteria
            + 6 = Partially satisfies the query
            + 10 = Fully satisfies the query with excellence across all criteria
        3. At the end, output your analysis and then give the final decision in this exact format: If Assistant A is better: explanation followed by [[A]]; If Assistant B is better: explanation followed by [[B]]. Only output one of the tags ([[A]] or [[B]]) on the final answer line, and nothing else.
    
    ###Input\n[User Query]\n{query}\n\n[The Start of Assistant A's Answer]\n{response_1}\n[The End of Assistant A's Answer]\n\n[The Start of Assistant B's Answer]\n{response_2}\n[The End of Assistant B's Answer]\n####[The Begin of Evaluation Criteria]\n{criteria}\n[The End of Evaluation Criteria]\n"""
    return template.format(
        query=query,
        response_1=response_1,
        response_2=response_2,
        criteria=based_criterion,
    )
def build_exploration_response(criteria: list[str]) -> str:
    """Validate a list of criteria and render it with ``criteria2str``.

    Args:
        criteria: Criteria strings; at least 7 entries are required.
            NOTE(review): the minimum of 7 presumably mirrors the smaller
            "exactly 7 evaluation criteria" prompt variant - confirm.

    Returns:
        Whatever ``criteria2str(criteria=criteria)`` returns (the helper is
        imported from ``utils``; its exact output format is defined there).

    Raises:
        TypeError: If ``criteria`` is not a list, or its first element is not
            a ``str``.
        ValueError: If ``criteria`` has fewer than 7 entries (including empty).
    """
    if not isinstance(criteria, list):
        raise TypeError(f"criteria must be a list, got {type(criteria).__name__}")
    # Check the length before touching criteria[0]; otherwise an empty list
    # surfaces as a bare IndexError instead of a meaningful validation error.
    if len(criteria) < 7:
        raise ValueError(f"expected at least 7 criteria, got {len(criteria)}")
    # Only the first element is type-checked, as a cheap sanity check.
    if not isinstance(criteria[0], str):
        raise TypeError(
            f"criteria entries must be str, got {type(criteria[0]).__name__}"
        )
    return criteria2str(criteria=criteria)

def build_long_judge_response(criteria: list[dict], answer: int) -> str:
    """Render a per-criterion judgement trace followed by the final verdict.

    The output is a ``<think>`` block holding one ``<criteria i>`` section per
    entry (numbered from 1), each with the criterion text and the judgements
    for A and B, followed by ``The final verdict is [[A]]`` or ``[[B]]``.

    Args:
        criteria: Dicts that each provide the keys ``"criterion"``,
            ``"judge_A"`` and ``"judge_B"``.
        answer: ``0`` selects verdict ``A``; ``1`` selects verdict ``B``.

    Returns:
        The rendered response, with lines joined by ``"\\n"``.

    Raises:
        ValueError: If ``answer`` is neither 0 nor 1.
        KeyError: If a criteria entry lacks one of the required keys.
    """
    # Single explicit check: the previous assert made the fallback raise
    # unreachable, and would have been stripped entirely under ``python -O``.
    if answer == 0:
        verdict = "A"
    elif answer == 1:
        verdict = "B"
    else:
        raise ValueError(f"answer must be 0 or 1, got {answer!r}")

    lines = ["<think>"]
    for i, crit in enumerate(criteria, start=1):
        lines.extend(
            (
                # The trailing space after the criterion text is part of the
                # existing output format and is kept as-is.
                f"<criteria {i}>{crit['criterion']} ",
                f"<Judge A>{crit['judge_A']}</Judge A>",
                f"<Judge B>{crit['judge_B']}</Judge B>",
                f"</criteria {i}>",
            )
        )
    lines.append("</think>")
    lines.append(f"The final verdict is [[{verdict}]]")

    return "\n".join(lines)
def build_single_judge_response(judge_a: str, judge_b: str, answer: int) -> str:
    """Render the two judgements and the final verdict as a three-line response.

    Args:
        judge_a: Judgement text for Assistant A, wrapped in ``<Judge A>`` tags.
        judge_b: Judgement text for Assistant B, wrapped in ``<Judge B>`` tags.
        answer: ``0`` selects verdict ``A``; ``1`` selects verdict ``B``.

    Returns:
        ``<Judge A>...</Judge A>``, ``<Judge B>...</Judge B>`` and
        ``The final verdict is [[A|B]]`` joined by ``"\\n"``.

    Raises:
        TypeError: If ``judge_a`` or ``judge_b`` is not a ``str``.
        ValueError: If ``answer`` is neither 0 nor 1.
    """
    # Explicit raises instead of asserts so validation survives ``python -O``.
    if not isinstance(judge_a, str):
        raise TypeError(f"judge_a must be a str, got {type(judge_a).__name__}")
    if not isinstance(judge_b, str):
        raise TypeError(f"judge_b must be a str, got {type(judge_b).__name__}")

    # One check replaces the former assert + unreachable fallback raise.
    if answer == 0:
        verdict = "A"
    elif answer == 1:
        verdict = "B"
    else:
        raise ValueError(f"answer must be 0 or 1, got {answer!r}")

    return "\n".join(
        (
            f"<Judge A>{judge_a}</Judge A>",
            f"<Judge B>{judge_b}</Judge B>",
            f"The final verdict is [[{verdict}]]",
        )
    )
