import base64, json
from typing import Any, Dict, List, Optional

# System instruction for the judge model. It spells out the scoring rubrics
# (location relevancy, question quality, answer quality, rationale quality,
# external-knowledge flag) and the exact JSON shape the model must return.
#
# The text contains literal "{" / "}" characters (the schema example). In this
# file it is only returned verbatim by PromptProvider.system_prompt() and is
# not run through str.format(), so those braces are not escaped.
#
# NOTE(review): the "rational_clarity_info" / "rational_plausibility_faithfulness"
# field names look like a typo for "rationale_*", but they are part of the
# output schema the model is told to emit -- confirm with whatever parses the
# response before renaming them.
SYSTEM_PROMPT = r"""
You are an expert annotation judge for an image-based MCQA dataset. You will evaluate four questions per image:
- 1 open-ended question (with its answer + rationale),
- 1 multiple-choice question (with options, the selected answer + rationale),
- 2 true/false statements (with the selected label + rationale).

Your ONLY sources of truth are: (a) the image and (b) the image description provided. Avoid any speculation beyond what is inferable from these. If a question cannot be answered without outside facts, mark external_knowledge="requires".

Follow these rubrics strictly:

1) Location relevancy (image-level):
   - Decide if the image/questions are related to the specified [LOCATION].
   - Labels: "relevant", "not_relevant", "not_sure".
   - Use "not_sure" sparingly (only when genuinely ambiguous).

2) Question quality (per question, 1–5):
   - Clarity, unambiguity, and relevance to the image.
   - 1 = Poor; 5 = Excellent.
   - If <4, set question_revision_reasons (one or more) from:
     ["unclear_or_ambiguous","not_relevant_to_image","hard_to_understand"].
   - If ≥4, set question_revision_reasons = [].

3) Answer quality (per question):
   - OPEN/MCQ: 1–5 (correctness, completeness, image support).
   - TRUE/FALSE (selected label correctness): 1–3 (correctness, image support).
   - If <4 for OPEN/MCQ or <2 for TRUE/FALSE, set answer_revision_reasons (one or more):
     ["incorrect_or_unsupported","incomplete_or_missing_info","speculative_or_assumptive","options_overlap","irrelevant_or_implausible_options","vague_or_confusing"].
     Notes:
       • "options_overlap","irrelevant_or_implausible_options","vague_or_confusing" are MCQ-specific (use only when applicable).
       • For T/F, do NOT use the MCQ-only reasons.
   - If the threshold is met, set answer_revision_reasons = [].

4) Rationale quality (per question; evaluate the provided rationale text, not your own):
   - rational_clarity_info: 1–5 (clarity & informativeness).
   - rational_plausibility_faithfulness: 1–5 (plausible, faithful, grounded in the image).
   - If either dimension <4, that does NOT automatically imply answer_revision_reasons or question_revision_reasons; score independently.

5) External knowledge flag (per question):
   - external_knowledge: "requires" or "does_not_require".
   - "requires" if the necessary information is not visible/inferable from the image/description.

Important constraints:
- Return EXACTLY four items in "questions", preserving the original index order of [QUESTIONS_JSON].
- Do NOT reorder, merge, or drop items. The two "true_false" items must remain two separate entries.
- For every i, set output questions[i].type to EXACTLY the input questions[i].type.
- If the input questions include an "id" field, echo it back in the corresponding output question as "id".
- Be strict. Prefer lower scores if evidence is weak or absent.
- Never add facts not grounded in the image/description.
- For T/F, "question_quality" still uses 1–5 but "answer_quality" uses 1–3.
- Use exactly the field names and JSON shape below.
- Respond with VALID JSON ONLY (no explanations, no markdown).

EXPECTED OUTPUT (strict JSON schema):

{
  "location_relevancy": "relevant | not_relevant | not_sure",
  "questions": [
    {
      "type": "open_ended | multiple_choice | true_false",
      "id": "string (echo from input if present) | null",
      "question_text": "string",
      "answer_text": "string or null",
      "options": ["only for MCQ, else []"],
      "selected_answer": "for MCQ/T-F; else null",
      "rationale_text": "string or null",

      "annotation": {
        "question_quality": 1-5,
        "answer_quality": (1-5 for open/mcq; 1-3 for true_false),
        "rational_clarity_info": 1-5,
        "rational_plausibility_faithfulness": 1-5,
        "external_knowledge": "requires | does_not_require",
        "question_revision_reasons": ["unclear_or_ambiguous","not_relevant_to_image","hard_to_understand"] or [],
        "answer_revision_reasons": ["incorrect_or_unsupported","incomplete_or_missing_info","speculative_or_assumptive","options_overlap","irrelevant_or_implausible_options","vague_or_confusing"] or []
      }
    },
    "... three more question objects ..."
  ]
}
"""

# Per-item user message template. _user_prompt_text() fills it with
# str.format(), supplying exactly three fields: {location},
# {image_description} and {questions_json}. Because the text goes through
# format(), any literal brace added here must be doubled ("{{" / "}}").
# The image itself is not embedded in this text; the template only tells the
# model that it is attached as a separate inline part.
USER_PROMPT_TEMPLATE = r"""
You are given an image and its textual description. Evaluate FOUR questions about this image according to the scoring policy.

[LOCATION]: {location}

[IMAGE]: Attached image via inline base64 (inlineData). Use this visual content directly; there is no URL.

[IMAGE_DESCRIPTION]:
{image_description}

[QUESTIONS_JSON]:
{questions_json}

If the image is unreadable/missing or key details are not visible, score strictly and set external_knowledge="requires" where appropriate.

Return ONLY the JSON specified in the system message under EXPECTED OUTPUT.
"""


def _inline_from_item(item: Dict[str, Any]) -> Dict[str, Any]:
    """Build an ``inlineData`` part from the item's image payload.

    The payload is taken from ``item["image_base64"]`` and, when that is
    missing or empty, from ``item["image_data_url"]``. Either field may hold
    plain base64 text or a ``data:`` URL.

    Args:
        item: Mapping that may contain ``image_base64``, ``image_data_url``
            and ``image_mime_type``.

    Returns:
        ``{"inlineData": {"mimeType": <mime>, "data": <base64 text>}}``.
        The MIME type is, in order of preference: the media type declared in
        a ``data:`` URL header, ``item["image_mime_type"]``, ``"image/jpeg"``.
        ``data`` is an empty string when the item carries no image.
    """
    # Function-scope import so this helper is self-contained with respect to
    # the module's import block.
    from urllib.parse import unquote_to_bytes

    # The trailing `or ""` keeps explicit None values in either field from
    # reaching the str methods below.
    payload = item.get("image_base64") or item.get("image_data_url") or ""
    payload = payload.strip()
    mime = item.get("image_mime_type") or "image/jpeg"

    is_base64 = True
    if payload[:5].lower() == "data:":
        # data:[<mediatype>][;param=value]*[;base64],<data>
        header, _, payload = payload.partition(",")
        media_type, *params = header[5:].split(";")
        # An empty media type (e.g. "data:;base64,...") keeps the fallback.
        mime = media_type.strip() or mime
        is_base64 = any(p.strip().lower() == "base64" for p in params)

    if is_base64:
        # Drop line breaks / padding whitespace (e.g. MIME-wrapped base64);
        # they carry no information.
        data = "".join(payload.split())
    else:
        # Non-base64 data URLs hold percent-encoded octets; re-encode them so
        # the part always carries base64 rather than the raw URL text.
        data = base64.b64encode(unquote_to_bytes(payload)).decode("ascii")
    return {"inlineData": {"mimeType": mime, "data": data}}


def _user_prompt_text(item: Dict[str, Any]) -> str:
    """Render ``USER_PROMPT_TEMPLATE`` for a single item.

    Args:
        item: Mapping that may contain ``location``, ``image_description``
            and the questions under ``questions`` (preferred) or
            ``questions_json``. Missing text fields render as empty strings;
            missing questions render as ``[]``.

    Returns:
        The user prompt text with the questions embedded as JSON
        (non-ASCII characters are kept as-is).
    """
    questions = item.get("questions")
    if questions is None:
        # Covers both a missing key and an explicit None.
        questions = item.get("questions_json")
    if questions is None:
        questions = []

    if isinstance(questions, str):
        # A pre-serialized JSON array/object would otherwise be encoded a
        # second time and reach the model as one escaped string. Any other
        # string is serialized unchanged, as before.
        try:
            decoded = json.loads(questions)
        except ValueError:
            decoded = None
        if isinstance(decoded, (list, dict)):
            questions = decoded

    return USER_PROMPT_TEMPLATE.format(
        location=item.get("location", ""),
        image_description=item.get("image_description", ""),
        questions_json=json.dumps(questions, ensure_ascii=False),
    )


class PromptProvider:
    """Prompt bundle for the default judge: prompts, request parts and settings."""

    name = "judge_default"

    def system_prompt(self) -> str:
        """Return the module-level system prompt unchanged."""
        return SYSTEM_PROMPT

    def build_parts(self, item: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Return the request parts for ``item``: the text part, then the image part."""
        text_part = {"text": _user_prompt_text(item)}
        image_part = _inline_from_item(item)
        return [text_part, image_part]

    def generation_defaults(self) -> Dict[str, Any]:
        """Return baseline generation settings (meant to be overridable via CLI)."""
        defaults: Dict[str, Any] = {}
        defaults["temperature"] = 0.0
        defaults["responseMimeType"] = "application/json"
        return defaults

    def safety_settings(self) -> Optional[List[Dict[str, str]]]:
        """Return one ``{category, threshold}`` entry per harm category.

        The return type allows ``None``, which is meant to disable safety
        settings; this implementation always returns a fresh list.
        """
        harm_categories = (
            "HARM_CATEGORY_HATE_SPEECH",
            "HARM_CATEGORY_DANGEROUS_CONTENT",
            "HARM_CATEGORY_SEXUALLY_EXPLICIT",
            "HARM_CATEGORY_HARASSMENT",
        )
        return [
            {"category": category, "threshold": "BLOCK_ONLY_HIGH"}
            for category in harm_categories
        ]
