# Prompt templates for EditScore evaluation

# ============ Without edit_region (original prompt) ============
# Preamble for the region-free output format: sets the evaluator role and asks
# for an object with "reasoning" and "score" keys. Used by build_sc_prompt when
# with_region is False and always by build_pq_prompt.
# NOTE: the text is sent verbatim to the model, so wording is kept as-is.
CONTEXT_WITHOUT_REGION = """You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules.
All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials.

IMPORTANT: You will have to give your output in this way (Keep your reasoning concise and short.):
{
"reasoning" : "...",
"score" : [...]
}
"""

# Task description for the two-image (original + edited) evaluation.
# build_sc_prompt places it between the context and the scoring rubric when
# with_region is False.
TWO_IMAGE_EDIT_RULE = """RULES:

Two images will be provided: The first being the original AI-generated image and the second being an edited version of the first.
The objective is to evaluate how successfully the editing instruction has been executed in the second image.

Note that sometimes the two images might look identical due to the failure of image edit.
"""

# Scoring rubric for the region-free SC prompt: two scores (editing success,
# degree of overediting). build_sc_prompt replaces every literal "10" in this
# text with the requested score range and fills in the <instruction>
# placeholder.
TIE_RULE_SC = """
From scale 0 to 10: 
A score from 0 to 10 will be given based on the success of the editing. (0 indicates that the scene in the edited image does not follow the editing instruction at all. 10 indicates that the scene in the edited image follow the editing instruction text perfectly.)
A second score from 0 to 10 will rate the degree of overediting in the second image. (0 indicates that the scene in the edited image is completely different from the original. 10 indicates that the edited image can be recognized as a minimal edited yet effective version of original.)
Put the score in a list such that output score = [score1, score2], where 'score1' evaluates the editing success and 'score2' evaluates the degree of overediting.

Editing instruction: <instruction>
"""

# Rubric for the perceptual-quality prompt: two scores (naturalness,
# artifacts) for a single image. build_pq_prompt replaces every literal "10"
# in this text with the requested score range.
RULE_PQ = """RULES:

The image is an AI-generated image.
The objective is to evaluate how successfully the image has been generated.

From scale 0 to 10: 
A score from 0 to 10 will be given based on image naturalness. 
(
    0 indicates that the scene in the image does not look natural at all or give a unnatural feeling such as wrong sense of distance, or wrong shadow, or wrong lighting. 
    10 indicates that the image looks natural.
)
A second score from 0 to 10 will rate the image artifacts. 
(
    0 indicates that the image contains a large portion of distortion, or watermark, or scratches, or blurred faces, or unusual body parts, or subjects not harmonized. 
    10 indicates the image has no artifacts.
)
Put the score in a list such that output score = [naturalness, artifacts]
"""

# ============ With edit_region (new trained model) ============
# Preamble for the region-aware output format: same role text as
# CONTEXT_WITHOUT_REGION, but the requested object additionally starts with an
# "edit_region" key. Used by build_sc_prompt when with_region is True.
CONTEXT_WITH_REGION = """You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules.
All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials.

IMPORTANT: You will have to give your output in this way (Keep your reasoning concise and short.):
{
"edit_region" : [...],
"reasoning" : "...",
"score" : [...]
}
"""

# Task description for the region-aware SC prompt. Besides the two-image
# setup it tells the model how to report edited regions (labelled boxes with
# coordinates in the [0, 1000] range, or an empty list) and ends with the
# scale header that one of the TIE_RULE_SC_WITH_REGION* rubrics is appended
# after by build_sc_prompt.
TWO_IMAGE_EDIT_RULE_WITH_REGION = """RULES:

Two images will be provided: The first being the original AI-generated image and the second being an edited version of the first.
The objective is to identify the editing region(s) and evaluate how successfully the editing instruction has been executed in the second image.

Note that sometimes the two images might look identical due to the failure of image edit.

First, identify where the editing occurred in the second image:
- If editing was successful, provide bounding boxes with labels: [{"id": 0~n, "label": "description of edited area", "bbox_2d": [x1, y1, x2, y2]}] (coordinates normalized to [0, 1000] range, where 0=left/top, 1000=right/bottom)
- If editing failed (images look identical), use empty list: []

Then, evaluate the editing quality from scale 0 to 10:
"""

# Scoring rubric for the region-aware SC prompt (non-interleaved reasoning).
# It has no scale header of its own; that header closes
# TWO_IMAGE_EDIT_RULE_WITH_REGION. build_sc_prompt replaces every literal "10"
# here with the requested score range and fills in <instruction>.
TIE_RULE_SC_WITH_REGION = """A score from 0 to 10 will be given based on the success of the editing. (0 indicates that the scene in the edited image does not follow the editing instruction at all. 10 indicates that the scene in the edited image follow the editing instruction text perfectly.)
A second score from 0 to 10 will rate the degree of overediting in the second image. (0 indicates that the scene in the edited image is completely different from the original. 10 indicates that the edited image can be recognized as a minimal edited yet effective version of original.)
Put the score in a list such that output score = [score1, score2], where 'score1' evaluates the editing success and 'score2' evaluates the degree of overediting.

Editing instruction: <instruction>
"""

# Variant of TIE_RULE_SC_WITH_REGION that additionally asks the model to
# prefix parts of its reasoning with <|bbox_{id}|> and <|global|> tokens.
# Selected by build_sc_prompt when both with_region and interleaved are True.
TIE_RULE_SC_WITH_REGION_INTERLEAVED = """A score from 0 to 10 will be given based on the success of the editing. (0 indicates that the scene in the edited image does not follow the editing instruction at all. 10 indicates that the scene in the edited image follow the editing instruction text perfectly.)
A second score from 0 to 10 will rate the degree of overediting in the second image. (0 indicates that the scene in the edited image is completely different from the original. 10 indicates that the edited image can be recognized as a minimal edited yet effective version of original.)
Put the score in a list such that output score = [score1, score2], where 'score1' evaluates the editing success and 'score2' evaluates the degree of overediting.

SPECIAL TOKENS for Reasoning:
In your reasoning, use special tokens to reference regions:
- <|bbox_{id}|> before describing each edited region(if exist)
- <|global|> before overall assessment

Editing instruction: <instruction>
"""

# TIE_RULE_SC_WITH_REGION_INTERLEAVED = """A score from 0 to 10 will be given based on the success of the editing. (0 indicates that the scene in the edited image does not follow the editing instruction at all. 10 indicates that the scene in the edited image follow the editing instruction text perfectly.)
# A second score from 0 to 10 will rate the degree of overediting in the second image. (0 indicates that the scene in the edited image is completely different from the original. 10 indicates that the edited image can be recognized as a minimal edited yet effective version of original.)
# Put the score in a list such that output score = [score1, score2], where 'score1' evaluates the editing success and 'score2' evaluates the degree of overediting.

# **SPECIAL TOKENS for Reasoning:**
# In your reasoning, use special tokens to reference regions:
# - <|bbox_{id}|> before describing each edited region (e.g., "<|bbox_0|>The background was replaced...")
#   * If edit_region is empty, do NOT use any <|bbox_id|> tokens
# - <|global|> before overall assessment (e.g., "<|global|>The editing maintains consistency...")
#   * Always use <|global|> once per response
# Use one <|bbox_id|> per region and one <|global|> per response.

# Editing instruction: <instruction>
# """

def build_sc_prompt(instruction: str, score_range: int = 10, with_region: bool = False, interleaved: bool = False) -> str:
    """Build the Success & Consistency (SC) evaluation prompt.

    The prompt is the context preamble, the task rule and the scoring rubric
    joined by newlines, with the scale's upper bound and the editing
    instruction substituted in.

    Args:
        instruction: Editing instruction inserted in place of the
            ``<instruction>`` placeholder.
        score_range: Upper bound of the scoring scale; replaces the literal
            ``10`` used in the templates.
        with_region: When True, use the templates that also ask the model
            for ``edit_region`` bounding boxes.
        interleaved: Only consulted when ``with_region`` is True; selects the
            rubric that asks for ``<|bbox_{id}|>`` / ``<|global|>`` tokens in
            the reasoning.

    Returns:
        The assembled prompt string.
    """
    upper = str(score_range)

    if with_region:
        context = CONTEXT_WITH_REGION
        # In region mode the "from scale 0 to 10:" header lives in the rule
        # template rather than in the rubric, so it must be rescaled here or
        # the prompt contradicts itself whenever score_range != 10. The
        # replacement is targeted on purpose: a blanket replace of "10" would
        # also rewrite the "[0, 1000]" bounding-box coordinate range.
        rule = TWO_IMAGE_EDIT_RULE_WITH_REGION.replace(
            "from scale 0 to 10:", f"from scale 0 to {upper}:"
        )
        rubric = TIE_RULE_SC_WITH_REGION_INTERLEAVED if interleaved else TIE_RULE_SC_WITH_REGION
    else:
        context = CONTEXT_WITHOUT_REGION
        rule = TWO_IMAGE_EDIT_RULE
        rubric = TIE_RULE_SC

    tie_rule = rubric.replace('10', upper)

    # The instruction is substituted last so the score rescaling above can
    # never touch digits that appear in the user's instruction text.
    prompt = "\n".join([context, rule, tie_rule])
    return prompt.replace("<instruction>", instruction)


def build_pq_prompt(score_range: int = 10, with_region: bool = False) -> str:
    """Build the Perceptual Quality (PQ) evaluation prompt.

    Args:
        score_range: Upper bound of the scoring scale; replaces the literal
            ``10`` in the PQ rubric.
        with_region: Accepted for signature parity but ignored. PQ rates
            image quality only, so the region-free context is always used.

    Returns:
        The region-free context and the rescaled PQ rubric, separated by a
        newline.
    """
    scaled_rubric = RULE_PQ.replace('10', str(score_range))
    return f"{CONTEXT_WITHOUT_REGION}\n{scaled_rubric}"

