import json
import os

from dotenv import load_dotenv
from openai import OpenAI
from pydantic import BaseModel
from tqdm import tqdm


# One side of an Evidence-Claim pair: where a statement comes from and what it
# says. The two fields mirror the "claim" / "evidence" objects described in
# analysis_system_prompt_default. Not referenced by any other code visible in
# this file.
# NOTE: documented with `#` comments rather than a docstring on purpose --
# pydantic copies a model's docstring into its JSON schema `description`.
class EvidenceClaim(BaseModel):
    source: str  # origin of the statement, e.g. "caption", "plot", "expectation"
    statement: str  # the statement itself


# One multiple-choice answer option expressed in the Target-Action format that
# analysis_system_prompt_edit asks the model to produce.
# NOTE: documented with `#` comments rather than a docstring on purpose --
# pydantic copies a model's docstring into its JSON schema `description`, and
# this schema is sent to the API as part of the response format.
class TargetActionOption(BaseModel):
    letter: str  # answer letter; the main loop uses it as the lookup key
    attribute: str  # core element at issue (e.g. legend, F1 scores)
    target: str  # where the edit is applied (e.g. caption, figure_4b, table_5)
    other_involved: str | None = None  # optional: other elements involved, comma-separated
    action: str  # the prompt lists modify / remove / add / reposition / replace; not enforced here
    edit_statement: str  # short description of the needed change
    reason: str  # short reason why the change is needed


# Top-level structured-output schema: passed as `response_format` in the main
# loop, which then indexes `answers` by each option's `letter`.
# NOTE: documented with `#` comments rather than a docstring on purpose --
# pydantic copies a model's docstring into its JSON schema `description`.
class TargetActionAnswers(BaseModel):
    answers: list[TargetActionOption]  # expected to hold one option per answer letter


# System prompt asking the model to rewrite each answer option as an
# Evidence-Claim JSON object (a claim plus the evidence contradicting it).
# Not used by the loop visible in this file, which sends
# analysis_system_prompt_edit instead.
# Fixes relative to the earlier text: the Pattern 1 example now closes its
# ```json fence, and the final instruction names the Evidence-Claim format
# (it previously said "Target-Action", the format of the other prompt).
analysis_system_prompt_default = """You are a system that converts multiple choice question answers into Evidence-Claim JSON format.

Evidence-Claim JSON format:
```json
{
  "letter": "A" | "B" | "C" | "D",
  "attribute": str,
  "claim": {
    "source": "expectation" | str,
    "statement": str
  },
  "evidence": {
    "source": str,
    "statement": str
  },
}
```

There are two patterns of answer options:

Pattern 1: One part of the answer makes a claim that is contradicted by evidence in another part

Example:
```json
{
  "letter": "C", // The letter of the answer option
  "attribute": "optimal trade-off", // The attribute in the center of the answer option (e.g. rank parameter, complexity, name, etc.)
  "claim": {
    "source": "caption", // The source the claim about the attribute is based on (e.g., caption, text, figure_1 etc.)
    "statement": "at 128 tokens" // A brief 2-3 words description
  },
  "evidence": {
    "source": "plot", // The source the evidence about the attribute contradicting the claim is based on (e.g., plot, table, equation_2 etc.)
    "statement": "not visible at 128 tokens" // A brief 2-3 words description
  },
}
```

Pattern 2: One part of the answer makes a claim that contradicts common expectations to scientific correctness

Example:
```json
{
  "letter": "A",
  "attribute": "legend",
  "claim": {
    "source": "expectation", // In that case, the source for claim is always "expectation"
    "statement": "shouldn't occlude plot" 
  },
  "evidence": {
    "source": "figure_8",
    "statement": "occludes plot"
  },
}
```

Given:
- The question
- The answer options with letters (A, B, C, D)
- The correct answer letter
- The visual elements relevant to the inconsistency

Convert each multiple choice question answer (A, B, C, D) into the Evidence-Claim JSON format. Ensure that the answer letters remain consistent with the input. Keep the JSON output concise. Do not use adjectives or any other descriptive language. The goal is to remove linguistic cues and focus on the core content of each answer option."""

# System prompt sent by the main loop: asks the model to rewrite each answer
# option as a Target-Action JSON object (what to edit, where, and why).
# Fix relative to the earlier text: the "other_involved" line of the format
# spec now ends its value with a comma like every sibling field.
# NOTE(review): in the example, "attribute" is "windows" while
# "edit_statement" says "align door position" -- the two look inconsistent;
# left untouched because the intended wording is unclear. Confirm with the
# prompt author.
analysis_system_prompt_edit = """You are a system that converts multiple choice question answers about inconsistencies in scientific papers into Target-Action JSON format. The goal is to identify what needs to be changed in the paper to resolve the inconsistency.

Target-Action JSON format:
```json
{
  "letter": "A" | "B" | "C" | "D",
  "attribute": str, // the core element at issue (e.g., legend, methods evaluated, F1 scores)
  "target": str, // where the edit is applied (e.g., caption, figure_4b, table_5, equation_2)
  "other_involved": str, // (optional) other elements involved in the inconsistency, comma-separated
  "action": "modify" | "remove" | "add" | "reposition" | "replace",
  "edit_statement": str, // short 2-3 words description of the needed change (exclude word from action)
  "reason": str // why the change is needed in 2-3 words
}
```

Example:
```json
{
  "letter": "C",
  "attribute": "windows",
  "target": "figure_1b",
  "other_involved": "figure_1a",
  "action": "modify",
  "edit_statement": "align door position",
  "reason": "different"
}
```

Given:
- The question
- The answer options with letters (A, B, C, D)
- The correct answer letter
- The visual elements relevant to the inconsistency

Convert each multiple choice question answer (A, B, C, D) into the Target-Action JSON format. Ensure that the answer letters remain consistent with the input. Keep the JSON output concise. Do not use adjectives or any other descriptive language. Most important is to remove linguistic cues and focus on the core content of each answer option.
"""

# User-message template, filled with str.format() in the main loop.
# Placeholders: {question}, one per option letter ({A}, {B}, {C}, {D}),
# {correct_letter} and {visual_elements}. Because the option placeholders are
# fixed, the letters supplied by the data must be exactly A-D.
# NOTE(review): the ")" after {correct_letter} presumably mirrors the "A)"
# option labels -- confirm it is intentional.
analysis_user_prompt = """{question}

A) {A}
B) {B}
C) {C}
D) {D}

Correct answer: {correct_letter})

Visual elements: {visual_elements}"""
# Run load_dotenv() before the key lookup below so that variables defined in a
# .env file (if one exists) are already present in the environment.
load_dotenv()

# OpenAI-compatible client pointed at OpenRouter. The key is read from the
# GEMINI_API_KEY environment variable; the lookup yields None when it is unset.
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",  # We used OpenRouter as a proxy to access Gemini via the OpenAI API
    api_key=os.environ.get("GEMINI_API_KEY"),
)

# Load the annotations to be processed. The main loop treats the result as a
# mapping from a key to a list of entries.
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's default locale encoding.
with open("annotations.json", "r", encoding="utf-8") as f:
    annotations = json.load(f)

# Selection of entries to process: maps an annotation key to the list of entry
# indices (positions within that key's entry list) that the main loop should
# convert. Every key/index pair not listed here is skipped.
to_test = {
    "r0JfDTXAWx": [0],
    "mb2rHLcKN5": [0],
    "KJkbmBcZRx": [0],
    "7vH8DO2oPk": [0],
    "09TI1yUo9K": [0],
    "Aqfwhna1D7": [0],
    "dL3h1lyUNd": [0],
    "dIK7GpOwNY": [0],
    "0zZEbHLTwf": [0],
    "zgM66fu0wv": [0],
    "MazxSMs6Hs": [0],
    "yJAk0n0NyU": [0],
    "mXh8LbXXpx": [1],
    "A2muypu61H": [0],
    "zvYJ1qG1Fy": [1],
    "OzwGZP8h2A": [0],
    "CGT0T9uUOY": [0],
    "IUzQfdkkoL": [1],
    "YryL3QIWWc": [1],
    "LieTse3fQB": [1],
    "tsfR7JCwTf": [0],
    "DXaUC7lBq1": [0],
    "p3NVJg6ywM": [0],
    "0Xc6o1HKXD": [0],
    "wq4AeBWQJ4": [1],
    "iiK1vNRo6I": [0],
    "8EaDOGMPUL": [0],
    "kA5egaJjya": [0, 1, 2],
    "4vm6Nn2DW9": [0],
    "sec09tLQUl": [0],
    "zZU69H8tcr": [0],
    "wrVZ771SZQ": [0],
    "i2ue8J6aqI": [0],
    "r6XqXoRT6N": [1],
    "w0MAu8vjwj": [0],
    "3Q7y9No9VF": [0],
    "bx0IbCcBvO": [0],
    "EispKqtw5B": [0],
    "EXaKfdsw04": [0],
    "erowpbZcPi": [0],
    "Jl0aEFrp11": [0],
    "zPaTnGjgpa": [0],
    "AuckJjoD99": [0],
    "jwGPmIqE99": [0],
    "CH7Ba4RFa2": [0],
    "ZT33ACedmn": [0],
    "zAogQOIphH": [0],
    "Y0P6cOZzNm": [0],
}

def _write_annotations(data: dict) -> None:
    """Serialize *data* to the output JSON file, replacing its previous content."""
    # NOTE(review): the file name says "llama" although the requests below go
    # to a Gemini model -- confirm the name is intended before relying on it.
    with open("json_expectation_edit_llama.json", "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)


# Convert every selected MCQ into its Target-Action ("edit_generic") variant.
# For each annotation key, only the entry indices listed in `to_test` are sent
# to the model; the result is stored in place under entry["mcq"]["edit_generic"].
for key, entries in tqdm(annotations.items()):
    selected = to_test.get(key)
    if not selected:
        # Key not selected at all: nothing to convert and nothing new to save.
        continue

    key_updated = False
    for idx, entry in enumerate(entries):
        if idx not in selected:
            continue
        try:
            mcq = entry["mcq"]["default_generic"]
            visual_elements = ", ".join(entry["visual_elements"])
            question = mcq["question"]
            # The correct answer goes first, so letters[0] is the correct letter.
            answers = [mcq["correct"]] + mcq["incorrect"]
            letters = mcq["letters"]
            letter_answer_map = dict(zip(letters, answers))

            # Format the analysis prompt
            analysis_text = analysis_user_prompt.format(
                question=question,
                correct_letter=letters[0],
                visual_elements=visual_elements,
                **letter_answer_map,
            )

            messages = [
                {"role": "system", "content": analysis_system_prompt_edit},
                {"role": "user", "content": analysis_text},
            ]

            response = client.chat.completions.parse(
                model="google/gemini-2.5-flash",
                messages=messages,
                response_format=TargetActionAnswers,
            )
            final_response = response.choices[0].message.parsed
            if final_response is None:
                # Without this check the failure shows up as an opaque
                # AttributeError on `.answers`.
                raise ValueError("model returned no parsed answers")

            # Update the entry with the refined answers
            new_letter_answer_map = {ans.letter: ans for ans in final_response.answers}
            missing = [
                letter for letter in letters if letter not in new_letter_answer_map
            ]
            if missing:
                raise ValueError(f"model response lacks answer letters {missing}")

            # Build the full record before assigning it, so a failure above
            # never leaves a half-written "edit_generic" on the entry.
            edit_mcq = {
                "question": "What action needs to be taken to resolve the inconsistency in these parts of a scientific paper?",
                "correct": new_letter_answer_map[letters[0]].model_dump_json(),
                "incorrect": [
                    new_letter_answer_map[letter].model_dump_json()
                    for letter in letters[1:]
                ],
                "letters": letters,
            }
        except Exception as e:
            # Best-effort batch run: report the failing entry and keep going.
            print(f"Failed for key {key}, idx {idx}: {e}")
            continue
        else:
            entry["mcq"]["edit_generic"] = edit_mcq
            key_updated = True

    # Checkpoint only after a key that actually changed; rewriting the whole
    # file after every skipped key would redo identical work each iteration.
    if key_updated:
        _write_annotations(annotations)

# Final write so the output file exists and is complete even when the last
# keys (or all keys) were skipped; this also writes it for empty input.
_write_annotations(annotations)
