# Prompt templates for MMRB2 Image Editing Evaluation
# Reuse prompts from editscore with minor adaptations

# ============ Without edit_region (original prompt) ============
# Role/response-format preamble for prompts that do not ask for region output.
# build_sc_prompt selects it when with_region is False, and build_pq_prompt
# always uses it. The requested response object has only "reasoning" and
# "score" fields. The text is emitted verbatim (it is never rescaled or
# formatted), so the braces below are literal.
CONTEXT_WITHOUT_REGION = """You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules.
All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials.

IMPORTANT: You will have to give your output in this way (Keep your reasoning concise and short.):
{
"reasoning" : "...",
"score" : [...]
}
"""

# Task description for single-input editing (original image + edited image)
# without region output. build_sc_prompt uses it as-is when
# num_input_images <= 1 and with_region is False.
TWO_IMAGE_EDIT_RULE = """RULES:

Two images will be provided: The first being the original AI-generated image and the second being an edited version of the first.
The objective is to evaluate how successfully the editing instruction has been executed in the second image.

Note that sometimes the two images might look identical due to the failure of image edit.
"""

# Task description for multi-input editing/fusion without region output.
# This is a str.format template: build_sc_prompt fills {num_input_images}
# when num_input_images > 1 and with_region is False.
MULTI_IMAGE_EDIT_RULE = """RULES:

Multiple input images and one edited output image will be provided. The first {num_input_images} images are the original input images used for editing, and the last image is the edited/fused result.
The objective is to evaluate how successfully the editing instruction has been executed in the edited image, considering all the input images.

Note that sometimes the edited image might not successfully integrate all input elements due to the failure of image editing/fusion.
"""


# Scoring instructions for the non-region prompts (single- and multi-image).
# build_sc_prompt substitutes every "10" with str(score_range) and then
# replaces the <instruction> marker with the editing instruction.
# NOTE: the literal contains significant whitespace (leading newline, a
# trailing space after "10:"); keep it byte-identical.
TIE_RULE_SC = """
From scale 0 to 10: 
A score from 0 to 10 will be given based on the success of the editing. (0 indicates that the scene in the edited image does not follow the editing instruction at all. 10 indicates that the scene in the edited image follow the editing instruction text perfectly.)
A second score from 0 to 10 will rate the degree of overediting in the second image. (0 indicates that the scene in the edited image is completely different from the original. 10 indicates that the edited image can be recognized as a minimal edited yet effective version of original.)
Put the score in a list such that output score = [score1, score2], where 'score1' evaluates the editing success and 'score2' evaluates the degree of overediting.

Editing instruction: <instruction>
"""

# Perceptual-quality rubric: asks for [naturalness, artifacts] scores for a
# single AI-generated image. build_pq_prompt substitutes every "10" with
# str(score_range) before joining it with CONTEXT_WITHOUT_REGION.
# NOTE: several lines of the literal end with a trailing space; keep the
# text byte-identical.
RULE_PQ = """RULES:

The image is an AI-generated image.
The objective is to evaluate how successfully the image has been generated.

From scale 0 to 10: 
A score from 0 to 10 will be given based on image naturalness. 
(
    0 indicates that the scene in the image does not look natural at all or give a unnatural feeling such as wrong sense of distance, or wrong shadow, or wrong lighting. 
    10 indicates that the image looks natural.
)
A second score from 0 to 10 will rate the image artifacts. 
(
    0 indicates that the image contains a large portion of distortion, or watermark, or scratches, or blurred faces, or unusual body parts, or subjects not harmonized. 
    10 indicates the image has no artifacts.
)
Put the score in a list such that output score = [naturalness, artifacts]
"""

# ============ With edit_region (new trained model) ============
# Role/response-format preamble for prompts that also request region output.
# build_sc_prompt selects it whenever with_region is True. Compared with
# CONTEXT_WITHOUT_REGION, the requested response object has an additional
# leading "edit_region" field. Emitted verbatim; the braces are literal.
CONTEXT_WITH_REGION = """You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules.
All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials.

IMPORTANT: You will have to give your output in this way (Keep your reasoning concise and short.):
{
"edit_region" : [...],
"reasoning" : "...",
"score" : [...]
}
"""

# Task description for single-input editing with region localization.
# build_sc_prompt uses it as-is (no str.format), so the single braces in the
# bounding-box example are literal and must stay single.
# The text ends with a "from scale 0 to 10:" lead-in that is meant to be
# followed directly by one of the TIE_RULE_SC_WITH_REGION* templates.
TWO_IMAGE_EDIT_RULE_WITH_REGION = """RULES:

Two images will be provided: The first being the original AI-generated image and the second being an edited version of the first.
The objective is to identify the editing region(s) and evaluate how successfully the editing instruction has been executed in the second image.

Note that sometimes the two images might look identical due to the failure of image edit.

First, identify where the editing occurred in the second image:
- If editing was successful, provide bounding boxes with labels: [{"id": 0~n, "label": "description of edited area", "bbox_2d": [x1, y1, x2, y2]}] (coordinates normalized to [0, 1000] range, where 0=left/top, 1000=right/bottom)
- If editing failed (images look identical), use empty list: []

Then, evaluate the editing quality from scale 0 to 10:
"""

# Task description for multi-input editing/fusion with region localization.
# This is a str.format template: build_sc_prompt fills {num_input_images}.
# Because it is formatted, the braces of the bounding-box example are
# doubled ({{ }}) so that they come out as single literal braces.
# The text ends with a "from scale 0 to 10:" lead-in that is followed by
# TIE_RULE_SC_MULTI_IMAGE in the assembled prompt.
MULTI_IMAGE_EDIT_RULE_WITH_REGION = """RULES:

Multiple input images and one edited output image will be provided. The first {num_input_images} images are the original input images used for editing, and the last image is the edited/fused result.
The objective is to identify which elements from the input images are successfully integrated and evaluate how successfully the editing instruction has been executed.

Note that sometimes the edited image might not successfully integrate all input elements due to the failure of image editing/fusion.

First, identify the successfully integrated elements in the edited image:
- For each element from input images that appears in the edited image, provide bounding boxes: [{{"id": 0~n, "label": "brief description", "bbox_2d": [x1, y1, x2, y2]}}] (coordinates normalized to [0, 1000] range, where 0=left/top, 1000=right/bottom)
- If fusion failed (no integration), use empty list: []

Then, evaluate the editing quality from scale 0 to 10:
"""

# Scoring instructions for single-image prompts with region output
# (non-interleaved reasoning). build_sc_prompt substitutes every "10" with
# str(score_range) and then replaces the <instruction> marker.
TIE_RULE_SC_WITH_REGION = """A score from 0 to 10 will be given based on the success of the editing. (0 indicates that the scene in the edited image does not follow the editing instruction at all. 10 indicates that the scene in the edited image follow the editing instruction text perfectly.)
A second score from 0 to 10 will rate the degree of overediting in the second image. (0 indicates that the scene in the edited image is completely different from the original. 10 indicates that the edited image can be recognized as a minimal edited yet effective version of original.)
Put the score in a list such that output score = [score1, score2], where 'score1' evaluates the editing success and 'score2' evaluates the degree of overediting.

Editing instruction: <instruction>
"""

# Scoring instructions for single-image prompts with region output and
# interleaved reasoning: same rubric as TIE_RULE_SC_WITH_REGION plus a
# section asking the model to prefix reasoning with <|bbox_{id}|> and
# <|global|> tokens. build_sc_prompt only applies str.replace to this text
# (never str.format), so "{id}" is emitted literally.
TIE_RULE_SC_WITH_REGION_INTERLEAVED = """A score from 0 to 10 will be given based on the success of the editing. (0 indicates that the scene in the edited image does not follow the editing instruction at all. 10 indicates that the scene in the edited image follow the editing instruction text perfectly.)
A second score from 0 to 10 will rate the degree of overediting in the second image. (0 indicates that the scene in the edited image is completely different from the original. 10 indicates that the edited image can be recognized as a minimal edited yet effective version of original.)
Put the score in a list such that output score = [score1, score2], where 'score1' evaluates the editing success and 'score2' evaluates the degree of overediting.

SPECIAL TOKENS for Reasoning:
In your reasoning, use special tokens to reference regions:
- <|bbox_{id}|> before describing each edited region(if exist)
- <|global|> before overall assessment

Editing instruction: <instruction>
"""

# Scoring instructions for multi-image fusion prompts with region output.
# build_sc_prompt substitutes every "10" with str(score_range) and then
# replaces the <instruction> marker. The template is never passed through
# str.format, so the bbox token placeholder must use single braces: doubled
# braces would reach the model literally as "<|bbox_{{id}}|>", unlike the
# "<|bbox_{id}|>" form used by TIE_RULE_SC_WITH_REGION_INTERLEAVED.
TIE_RULE_SC_MULTI_IMAGE = """A score from 0 to 10 will be given based on the success of the editing. (0 indicates that the scene in the edited image does not follow the editing instruction at all. 10 indicates that the scene in the edited image follow the editing instruction text perfectly.)
A second score from 0 to 10 will rate the degree of overediting in the edited image. (0 indicates that the scene in the edited image is completely different from the original inputs or has excessive artifacts. 10 indicates that the edited image is a well-balanced fusion of the input images.)
Put the score in a list such that output score = [score1, score2], where 'score1' evaluates the editing success and 'score2' evaluates the fusion quality.

SPECIAL TOKENS for Reasoning:
In your reasoning, use special token <|bbox_{id}|> before describing each edited region to reference it.

Editing instruction: <instruction>
"""


def build_sc_prompt(instruction: str, score_range: int = 10, with_region: bool = False, interleaved: bool = False, num_input_images: int = 1) -> str:
    """Build the Success & Consistency (SC) evaluation prompt.

    The prompt is assembled as ``context + rule + tie_rule`` joined by
    newlines, with the upper score bound and the editing instruction
    substituted into the templates.

    Args:
        instruction: Editing instruction text; replaces the ``<instruction>``
            marker in the assembled prompt.
        score_range: Upper bound of the score scale (default 10, but
            typically 25 in practice). The templates are written for 10 and
            are rescaled by text substitution.
        with_region: Whether to request region localization (bounding boxes)
            in addition to the scores.
        interleaved: Whether to use the interleaved reasoning format. Only
            honoured for single-image prompts with ``with_region=True``;
            ignored otherwise.
        num_input_images: Number of input images (1 for single-image editing,
            >1 for multi-image fusion).

    Returns:
        The complete prompt string.
    """
    upper_bound = str(score_range)
    context = CONTEXT_WITH_REGION if with_region else CONTEXT_WITHOUT_REGION

    if num_input_images > 1:
        # Multi-image fusion editing: the rule templates carry a
        # {num_input_images} placeholder and must go through str.format.
        if with_region:
            rule_template = MULTI_IMAGE_EDIT_RULE_WITH_REGION
            tie_template = TIE_RULE_SC_MULTI_IMAGE
        else:
            rule_template = MULTI_IMAGE_EDIT_RULE
            tie_template = TIE_RULE_SC
        rule = rule_template.format(num_input_images=num_input_images)
    else:
        # Single-image editing: rule templates are used verbatim (their
        # braces are literal, so they must NOT be formatted).
        if with_region:
            rule = TWO_IMAGE_EDIT_RULE_WITH_REGION
            # Interleaved reasoning is only available for single-image prompts.
            tie_template = (
                TIE_RULE_SC_WITH_REGION_INTERLEAVED
                if interleaved
                else TIE_RULE_SC_WITH_REGION
            )
        else:
            rule = TWO_IMAGE_EDIT_RULE
            tie_template = TIE_RULE_SC

    # The with-region rules end with a "from scale 0 to 10:" lead-in. Rescale
    # exactly that phrase so it agrees with the tie rule; a blanket '10'
    # replacement on the rule would corrupt the "[0, 1000]" coordinate range
    # (and a formatted image count). Rules without the phrase are unchanged.
    rule = rule.replace("from scale 0 to 10:", f"from scale 0 to {upper_bound}:")

    # Tie-rule templates only mention 10 as the scale's upper bound.
    tie_rule = tie_template.replace('10', upper_bound)

    prompt = "\n".join([context, rule, tie_rule])
    # Substitute the instruction last so its own text is never rescaled.
    return prompt.replace("<instruction>", instruction)


def build_pq_prompt(score_range: int = 10, with_region: bool = False) -> str:
    """Build the Perceptual Quality (PQ) evaluation prompt.

    Args:
        score_range: Upper bound of the score scale; every "10" in the PQ
            rubric is replaced with this value.
        with_region: Accepted for signature parity but not consulted. PQ
            rates image quality only, so the region-free context is always
            used.

    Returns:
        The region-free context and the rescaled PQ rubric joined by a
        newline.
    """
    rescaled_rubric = RULE_PQ.replace('10', str(score_range))
    return "\n".join((CONTEXT_WITHOUT_REGION, rescaled_rubric))

