TEXT_RECOGNITION_TEXTOCR = """The task type is {task} and the instruction for the task is as follows:

### Task
{question}

### Requirements
1. Present the results as a JSON-formatted object that follows the format: {{"answer": "The text of the answer goes here", "bbox": [x0, y0, x1, y1]}}.
2. Please maintain line breaks when providing answer texts that span multiple lines.
3. The position or bounding box of the text should be defined relative to the image's dimensions, scaled from 0 to 1000."""

TEXT_RECOGNITION_OCRVQA = """The task type is {task} and the instruction for the task is as follows:

### Task
{question}

### Requirements
1. For this task, please return the answer line by line.
2. Present the results as a JSON-formatted list of objects that follows the format: [{{"answer": "The text of the answer goes here", "bbox": [x0, y0, x1, y1]}}].
3. The position or bounding box of the text should be defined relative to the image's dimensions, scaled from 0 to 1000."""

TABLE_RECOGNITION = """The task type is {task} and the instruction for the task is as follows:

### Task
{question}

### Requirements
1. Present the results as a JSON-formatted 2D array where each element is a cell object representing either a header or a cell from the table. The array must preserve the original table structure, organized in rows and columns. Each cell object should follow this format: {{"answer": "text goes here", "bbox": [x0, y0, x1, y1]}}. The table should look like:\n[\n  [\n    {{"answer": "text goes here", "bbox": [x0, y0, x1, y1]}},\n    {{"answer": "text goes here", "bbox": [x0, y0, x1, y1]}}\n  ],\n  [\n    {{"answer": "text goes here", "bbox": [x0, y0, x1, y1]}},\n    {{"answer": "text goes here", "bbox": [x0, y0, x1, y1]}}\n  ],\n  ...\n].
2. Ensure that each row in the 2D array contains the same number of cell objects. If a row has fewer cells than others, fill the missing cells with empty text and bbox `{{"answer": "", "bbox": []}}` to maintain column alignment.
3. Please maintain line breaks when providing cell texts that span multiple lines.
4. The position or bounding box of the cell should be defined relative to the image's dimensions, scaled from 0 to 1000."""

TEXT_LOCALIZATION_TEXT2BBOX = """The task type is {task} and the instruction for the task is as follows:

### Task
{question}

### Requirements
1. Present the results as a JSON-formatted object that follows the format: {{"answer": [x0, y0, x1, y1]}}.
2. The answer should be defined relative to the image's dimensions, scaled from 0 to 1000."""

TEXT_LOCALIZATION_BBOX2TEXT = """The task type is {task} and the instruction for the task is as follows:

### Task
{question}

Note that the bounding box from the task $bbox$ is defined relative to the image's dimensions, scaled from 0 to 1000.

### Requirements
Present the results as a JSON-formatted object that follows the format: {{"answer": "The text of the answer goes here"}}."""

TABLE_CELL_LOCALIZATION = """The task type is {task} and the instruction for the task is as follows:

### Task
{question}

### Requirements
1. Please start counting from the header row.
2. Present the results as a JSON-formatted object that follows the format: {{"answer": "The text of the answer goes here", "bbox": [x0, y0, x1, y1]}}.
3. Please maintain line breaks when providing answer texts that span multiple lines.
4. The position or bounding box of the text should be defined relative to the image's dimensions, scaled from 0 to 1000."""

KEY_INFORMATION_EXTRACTION = """The task type is {task} and the instruction for the task is as follows:

### Task
{question}

### Requirements
1. Present the results as a JSON-formatted object that follows the format: {{"answer": "The text of the answer goes here", "bbox": [x0, y0, x1, y1]}}.
2. The position or bounding box of the text should be defined relative to the image's dimensions, scaled from 0 to 1000."""

DOCUMENT_FORGERY_DETECTION = """The task type is {task} and the instruction for the task is as follows:

### Task
{question}

### Requirements
1. Detect forged texts only on printed texts, excluding handwritten texts.
2. Identify forged texts by checking for visual inconsistencies such as font style, alignment, coloration, font weight, etc.
3. Present the results as a JSON-formatted list of objects that follows the format: [{{"answer": "forged_text goes here", "bbox": [x0, y0, x1, y1]}}].
4. The position or bounding box of the text should be defined relative to the image's dimensions, scaled from 0 to 1000."""


VISUAL_REASONING = """The task type is {task} and the instruction for the task is as follows:

### Task
Answer the question below and then based on your answer provide a json summary with required format:

{question}

### Requirements for Question Answering
1. Answer step by step. Identify and mark all `supporting_evidence`s (numbers, visuals, text snippets) in the image.
2. Then, apply analytical skills to draw conclusions from `supporting_evidence` and explain it.

### Requirements for JSON Summary
1. Collect `supporting_evidence`s with bounding boxes (relative to image dimensions, scaled 0-1000).
2. If the final answer you get exists in the image, provide its bounding box too (relative to image dimensions, scaled 0-1000).
3. Finally, format JSON summary as:\n{{\n  "answer": "conclusion",\n  "bbox": [x0, y0, x1, y1],\n  "supporting_evidence": [\n    {{\n      "text": "evidence",\n      "bbox": [x0, y0, x1, y1]\n    }}\n  ]\n}}.
Note: Use empty strings for visual elements without text, and empty lists for final answers not visible in the image."""


# Lookup from task-type name to prompt template. "Text Recognition" and
# "Text Localization" map to a nested dict keyed by a second-level name
# (presumably a sub-task or dataset identifier -- confirm with the caller);
# every other task type maps straight to a template string.
# Several task types share one template, so those keys are grouped with
# `dict.fromkeys`; the unpacking position keeps the insertion order the same
# as listing every key individually.
PROMPT = {
    "Text Recognition": {
        "TextOCR": TEXT_RECOGNITION_TEXTOCR,
        "OCR-VQA": TEXT_RECOGNITION_OCRVQA,
    },
    "Table Recognition": TABLE_RECOGNITION,
    "Text Localization": {
        "Text2Bbox": TEXT_LOCALIZATION_TEXT2BBOX,
        "Bbox2Text": TEXT_LOCALIZATION_BBOX2TEXT,
    },
    "Table Cell Localization": TABLE_CELL_LOCALIZATION,
    "Key Information Extraction": KEY_INFORMATION_EXTRACTION,
    "Document Forgery Detection": DOCUMENT_FORGERY_DETECTION,
    # Question-answering task types reuse the key-information template.
    **dict.fromkeys(
        (
            "Document Question Answering",
            "Chart Question Answering",
            "Infographic Question Answering",
        ),
        KEY_INFORMATION_EXTRACTION,
    ),
    # Reasoning-style task types all use the visual-reasoning template.
    **dict.fromkeys(
        (
            "Arithmetic Reasoning",
            "Logical Reasoning",
            "Spatial Reasoning",
            "Comparison",
            "Sorting",
            "Counting",
        ),
        VISUAL_REASONING,
    ),
}
