#!/usr/bin/env python3
"""
LLMPlotter: use an LLM as a plotter to generate coordinates and plotting code.

New approach: the LLM directly generates Python code to compute coordinates,
avoiding DSL parsing issues.
"""

import json
import os
import re
import subprocess
import tempfile
import ast
import math
import time
from typing import Any, Dict, List, Optional

from openai import OpenAI


class LLMPlotter:
    """Use an LLM as a plotter to generate coordinates and plotting code."""
    
    def __init__(
        self,
        api_key: Optional[str] = None,
        model: Optional[str] = None,
        base_url: Optional[str] = None,
        max_retries: int = 3,
    ):
        """
        Initialize the plotter.
        
        Args:
            api_key: API key (if None, will be read from the environment).
            model: Model name (if None, a default is used).
            base_url: API base URL (if None, a default is used).
            max_retries: Maximum number of retries for API calls.
        """
        # Default configuration
        if model is None:
            model = os.getenv("OPENAI_MODEL") or "gpt-4o-mini"
        if base_url is None:
            base_url = os.getenv("OPENAI_BASE_URL") or "https://api.openai.com/v1"
        
        self.api_key = api_key or os.getenv("OPENAI_API_KEY")
        
        if not self.api_key:
            raise ValueError("API key must be provided via argument or environment variable.")
        
        self.model = model
        self.base_url = base_url
        self.max_retries = max_retries
        client_kwargs = {"api_key": self.api_key, "timeout": 600.0}
        if self.base_url:
            client_kwargs["base_url"] = self.base_url
        self.client = OpenAI(**client_kwargs)
        
        # Quick connectivity test to fail fast if configuration is invalid.
        self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": "test"}],
            max_tokens=19,
        )
    
    def format_coordinates_prompt(self, question: str) -> str:
        """Format the first LLM prompt: coordinate computation only (returns code)."""
        
        prompt = f"""
Your task is to generate 2D numeric coordinates for **all points** that appear in the problem
text so that the configuration satisfies the stated geometric relations and the diagram
is reasonable, clear, and stable.

=====================
[Problem text]
=====================
{question}
=====================

You must output a block of **directly runnable Python code** that strictly follows
the requirements below.

=====================================================
[Global code constraints]
=====================================================
1. Produce **complete, executable Python code** only; do not output explanations,
   comments, or any extra text.
2. All variables, functions, and constants used in the code must be explicitly
   defined or imported inside the code.
3. All coordinates must be **concrete numeric float or int values**. Do **not** use
   symbolic expressions, lambdas, undefined variables, or lazy expressions.
4. You may import standard libraries (such as `import math`, `import numpy`), but
   file I/O and network requests are strictly forbidden.

=====================================================
[Geometric construction rules]
=====================================================
1. **Point names**: only use point names that appear in the problem statement.
   Do **not** introduce any new points.
2. **Geometric relations**: coordinates must satisfy all relations explicitly mentioned
   in the text (e.g. perpendicular / parallel, collinear, midpoints, lying on a circle,
   ratios, equal-length segments, etc.).
   Do **not** add extra properties (such as isosceles, additional parallels, or right angles)
   that are not explicitly given.
3. **Non-degenerate configuration**:
   - Different points must not coincide.
   - Points that should form an angle/triangle must not be almost collinear.
   - Coordinates should be reasonably spread out and numerically stable.
4. Enforce geometric relations via explicit construction (e.g. slopes, vectors,
   distances to circle centers, etc.).

=====================================================
[Final output structure]
=====================================================
At the end of the code, construct and print a dictionary **containing only point coordinates**
with the exact structure:

result = {{
    "points": {{
        "A": (xA, yA),
        "B": (xB, yB),
        ...
    }}
}}

import json
print(json.dumps(result, ensure_ascii=False))

You must output **only Python code**, with no explanatory text or non-code content.
"""

        return prompt

    def format_annotations_prompt(self, question: str) -> str:
        """Format the second LLM prompt: extract annotations, segments, circles (JSON)."""
        
        prompt = f"""
Your task is to extract segment, circle, and annotation information **only** from the
explicit textual conditions in the problem statement, without adding any inferred
or derived information.

=====================
[Problem text]
=====================
{question}
=====================

You must return a single JSON object with the following structure:

{{
    "segments": [
        ["A", "B"],
        ["B", "C"],
        ...
    ],
    "circles": [
        ["C1", "O", 5],
        ["C2", "A", "B", "diameter"],
        ["C3", "A", "B", "C"],
        ...
    ],
    "annotations": {{
        "right_angles": [
            ["A", "B", "C"],
            ...
        ],
        "length_of_line": [
            [["A", "B"], "5"],
            [["C", "D"], "2*sqrt(3)"],
            ...
        ],
        "measure_of_angle": [
            [["A", "B", "C"], "30"],
            ...
        ]
    }}
}}

[Extraction rules]

1. segments:
   - If the text describes a polygon (e.g. triangle ABC, quadrilateral ABCD, etc.),
     you **must** include all edges of the polygon in `segments` (no missing sides).
   - If a side is implicitly present from the wording, still include it in `segments`.
   - If the text explicitly mentions drawing or connecting a segment (e.g. "connect AD",
     "draw BE"), that segment must also appear in `segments`, as long as it does not
     introduce new points.
   - Segment format: ["A", "B"] for segment AB.

2. circles:
   - Unless the problem text clearly mentions a "circle", "inscribed circle",
     "circumcircle", etc., `circles` must be an empty list [].
   - If circles are present, assign each circle a unique ID such as C1, C2, C3, ...
   - Circle formats:
        ["C1", "O", 5]                # circle C1, center O, radius 5
        ["C2", "O", "P"]              # circle C2, center O, OP is radius
        ["C3", "A", "B", "diameter"]  # circle C3, AB is diameter
        ["C4", "A", "B", "C"]         # circle C4 passing through A, B, C

3. annotations:
   Only annotate numerical or right-angle information **explicitly given** in the text.
   Do not annotate anything that must be inferred from geometric theorems.

   Allowed examples:
   - Angles and lengths must be given as numbers.
   - If the text says "∠ABC = 30°", record in `measure_of_angle`:
       [["A", "B", "C"], "30"]
   - If it says "AB = 5" or "AB = 2√3", record in `length_of_line`:
       [["A", "B"], "5"] or [["A", "B"], "2*sqrt(3)"]
   - If it says "∠ABC is a right angle" or "∠ABC = 90°", record in `right_angles`:
       ["A", "B", "C"]

   Forbidden:
   - Do **not** add information derived from geometry, for example:
     - inferring base angles are equal because a triangle is isosceles,
     - inferring equal segments from "midpoint" definitions,
     - inferring equal angles from parallel lines,
     - inferring four right angles just because a shape is a square.
   - If the text does not explicitly give a numeric value or right-angle condition,
     it is better to omit the annotation.

   Formatting requirements:
   - `right_angles`: each entry is ["A", "B", "C"] meaning ∠ABC is a right angle.
   - `length_of_line`: each entry is [ ["A", "B"], "expression_string" ].
        - Use `sqrt(x)` for square roots, e.g. "2*sqrt(3)" (not `math.sqrt(3)`).
        - Simple integers or fractions can be given as "5", "3/2", etc.
   - `measure_of_angle`: each entry is [ ["A", "B", "C"], "angle_in_degrees" ], e.g. "30".
   - If a category does not appear at all in the text, use an empty list [] for that field.

Return **only** JSON, with no additional text or explanation.
"""

        return prompt

    def format_quantities_prompt(self, question: str, problem_type: str = "computation") -> str:
        """Format the third LLM prompt: quantity DSL expressions (JSON)."""
        
        if problem_type.lower() == "proof":
            # Proof problem: turn the conclusion into numeric verification expressions.
            prompt = f"""
Your task is to convert the **statement to be proved** into DSL expressions in the
`quantities` list, which will be used for numeric verification.

=====================
[Problem text]
=====================
{question}
=====================

Return a single JSON object:

{{
    "quantities": [
        "length(A, B) - length(C, D)",
        "angle(A, B, C) - angle(D, E, F)",
        ...
    ]
}}

[Rules for generating quantities (proof problems)]

1. Each element of `quantities` must be a string containing a **DSL expression**
   that represents an equality (or relation) to be checked.
2. **Key idea**:
   - For proof problems, rewrite the conclusion as an equality-check expression
     of the form `left - right`, which is expected to evaluate to 0.
   - For all circle-related functions (central_angle / arc_length / sector_area /
     arc_inscribed_angle / circle_area / circle_perimeter / segment_area /
     radius / diameter), the **first argument must be a circle ID** such as C1, C2,
     not a point name.
"""

        else:
            # Computation problem: original logic, in English.
            prompt = f"""
Your task is to read what the problem **asks you to find**, and then generate
the corresponding DSL expressions in the `quantities` list (without computing
any numeric answers).
"""

        return prompt
    
    def format_plotter_prompt(self, question: str) -> str:
        """Format the combined LLM prompt that asks for full plotting Python code."""
        
        prompt = f"""
You now have **three tasks** that must be completed in **one single, complete,
executable Python script** (do not split into multiple code blocks):

Task 1 (geometric construction):
    Generate 2D numeric coordinates for all points that appear in the problem text,
    and provide the necessary segment / circle information so that the configuration
    is reasonable, stable, and satisfies the given geometric relations.

Task 2 (explicit annotations):
    Based only on conditions explicitly stated in words in the problem text, generate
    the `annotations` structure. Do **not** add any information that needs to be
    inferred from geometry.

Task 3 (target quantities in DSL):
    Based on what the problem explicitly asks to compute or prove, generate the
    DSL expressions in the `quantities` list (do not compute numeric answers).

=====================
[Problem text]
=====================
{question}
=====================

You must output **directly runnable Python code** that strictly follows the
constraints below.

[Global code constraints]
1. The code must be syntactically valid and executable in a Python interpreter.
2. All variables, constants, and functions must be defined or imported in the
   script; no undefined names are allowed.
3. You may use any legal numeric computation (including `math`, `fractions`,
   `numpy`, etc.), but:
   - All point coordinates and circle radii stored in `result` must be numeric
     (int or float), **not** symbolic expressions or lazy objects.
4. The code must **not** compute the final numeric answer required by the problem
   and must **not** print or output any numeric answer. `quantities` must contain
   only DSL expressions as strings.
5. You may import standard libraries (such as `import math`, `import json`), but
   file I/O and network requests are forbidden.


[Geometry construction rules (Task 1)]

1. Points (`points`)
   - Use only point names that appear in the problem statement (e.g., A, B, C, D, O). **Do not introduce any new points**.
   - Each point must have numeric coordinates `(x, y)` where x and y are int or float.
   - Distinct points must not coincide (no identical coordinates for different points).
   - You may choose coordinates freely, but they must satisfy the geometric relations stated in the text (perpendicular, parallel, collinear, on a circle, etc.).

2. Segments (`segments`)
   - The segment list describes the basic edge structure of the figure.
   - If the text describes a polygon (e.g., triangle ABC, quadrilateral ABCD), you must include **all** polygon edges in `segments` (no missing sides).
   - Store a segment as a 2-tuple ("A", "B") representing segment AB.
   - If the text explicitly mentions drawing/connecting a segment (e.g., "connect AD", "draw BE"), include it in `segments` as long as it does not violate the no-new-points rule.

3. Circles (`circles`)
   - Unless the text **explicitly** mentions a "circle", "incircle", "circumcircle", or similar, then:
     - `circles` must be an empty list `[]`.
   - If circles are involved, each circle must use a **unique circle ID**: C1, C2, C3, ...
   - Allowed circle formats (the first argument must be the circle ID string):
        ["C1", "O", 5]                # circle C1, center O, radius 5
        ["C2", "O", "P"]              # circle C2, center O, OP is the radius
        ["C3", "A", "B", "diameter"]  # circle C3, AB is a diameter
        ["C4", "A", "B", "C"]         # circle C4 passing through points A, B, C
   - If the text mentions an incircle or a circle determined by three points, prefer the three-point circle form or diameter form when appropriate.
   - Do not invent additional circles not mentioned in the text.


[Annotation rules (Task 2)]

Only annotate geometric quantities that are **explicitly stated** in the problem text. Do not add any derived/inferred information. This includes (but is not limited to):

1. Allowed annotation examples:
   - If the text states "∠ABC = 30°", record in `measure_of_angle`:
       [["A", "B", "C"], "30"]
   - If the text states "AB = 5" or "AB = 2√3", record in `length_of_line`:
       [["A", "B"], "5"] or [["A", "B"], "2*sqrt(3)"]
   - If the text states "∠ABC is a right angle" or "∠ABC = 90°", record in `right_angles`:
       ["A", "B", "C"]

2. Explicitly forbidden:
   - Do not annotate information that must be inferred from geometry, e.g.:
     - adding "base angles are equal" because a triangle is isosceles,
     - adding "two segments are equal" because of a midpoint definition,
     - adding "corresponding angles are equal" because of parallel lines, etc.
   - If the text does not explicitly provide a numeric value or a right-angle condition, omit the annotation (better less than wrong).

3. Formatting requirements:
   - `right_angles`: each entry is ["A", "B", "C"] meaning ∠ABC is a right angle.
   - `length_of_line`: each entry is [ ["A", "B"], "expression_string" ].
        - Use `sqrt(x)` for square roots, e.g. "2*sqrt(3)" (not `math.sqrt(3)`).
        - Simple integers or fractions can be strings like "5", "3/2", etc.
   - `measure_of_angle`: each entry is [ ["A", "B", "C"], "degrees" ], e.g. "30".
   - If a category does not appear in the text at all, use an empty list [] for that field.

4. Do not annotate decimal lengths/angles unless they are explicitly given in the text.


[Quantities rules (Task 3 -- very important)]

1. Each element in `quantities` must be a string containing a **DSL expression** that represents what the problem asks you to compute/prove. **Only write expressions, not numeric values**.
2. Only the following DSL forms are allowed. Do not invent new function names or syntax.

(1) Point-based quantities:
    length(A, B)               # length of segment AB
    angle(A, B, C)             # measure of ∠ABC (degrees by default)
    tan(A, B, C)               # tangent of ∠ABC
    sin(A, B, C)               # sine of ∠ABC
    cos(A, B, C)               # cosine of ∠ABC
    area(A, B, C, D, ...)      # area of polygon A-B-C-D-...
    perimeter(A, B, C, D, ...) # perimeter of polygon A-B-C-D-...

(2) Angle between two lines and its trig functions (new):
    angle_between_lines(A, B, C, D)  # angle between line AB and line CD (0°~90°)
    tan_between_lines(A, B, C, D)    # tangent of the angle between AB and CD
    sin_between_lines(A, B, C, D)    # sine of the angle between AB and CD
    cos_between_lines(A, B, C, D)    # cosine of the angle between AB and CD
    # AB is the first line and CD is the second line; A, B, C, D must appear in the problem text.

(3) Circle-related quantities (circle ID must be the first argument):
    central_angle(C1, A, B)        # central angle in circle C1 subtending arc AB
    arc_length(C1, A, B)           # arc length of arc AB on circle C1
    sector_area(C1, A, B)          # sector area corresponding to arc AB in circle C1
    arc_inscribed_angle(C1, A, B)  # inscribed angle corresponding to arc AB in circle C1
    circle_area(C1)                # area of circle C1
    circle_perimeter(C1)           # circumference of circle C1
    segment_area(C1, A, B)         # circular segment area cut by chord AB in circle C1
    radius(C1)                     # radius of circle C1
    diameter(C1)                   # diameter of circle C1

3. If the problem asks for only one quantity, include only one expression in `quantities`.
   If the problem asks for multiple quantities, include multiple expressions.
4. Do not compute the values of these expressions in code, and do not put numeric results into `quantities`. Use only string-form DSL expressions.
5. You may combine expressions with basic arithmetic in a single string if the text explicitly requires it, e.g.:
       "length(A, B) + length(B, C)"
   But the atomic units must be the DSL primitives above, and the form should be clear and easy to parse.


[Final output structure]

At the end of the code, construct and print the following dictionary object (field names and structure must match exactly):

result = {{
    "points": {{  # mapping of all point coordinates
        "A": (xA, yA),
        "B": (xB, yB),
        # ...
    }},
    "segments": [
        ("A", "B"),
        ("B", "C"),
        # ...
    ],
    "circles": [
        # each circle must use a Ci ID
        # ["C1", "O", 5],
        # ["C2", "A", "B", "diameter"],
        # ["C3", "A", "B", "C"],
    ],
    "quantities": [
        # only DSL-expression strings; do not include numeric answers
        # "length(A, B)",
        # "angle(A, B, C)",
        # "angle_between_lines(A, B, C, D)",
        # "central_angle(C1, A, B)",
        # ...
    ],
    "annotations": {{
        "right_angles": [
            ["A", "B", "C"],
            # ...
        ],
        "length_of_line": [
            [["A", "B"], "5"],
            [["C", "D"], "2*sqrt(3)"],
            # ...
        ],
        "measure_of_angle": [
            [["A", "B", "C"], "30"],
            # ...
        ]
    }},
}}

import json
print(json.dumps(result, ensure_ascii=False))
"""

        return prompt
    
    def call_llm(self, prompt: str) -> Dict[str, Any]:
        """
        Call the LLM API for plotting-related tasks.
        
        Args:
            prompt: Fully formatted prompt text.
        
        Returns:
            API response payload.
        """
        for attempt in range(self.max_retries):
            try:
                import time as time_module
                start_time = time_module.time()
                completion = self.client.chat.completions.create(
                    model=self.model,
                    messages=[
                        {
                            "role": "user",
                            "content": prompt,
                        },
                    ],
                    temperature=0.3,
                    max_tokens=32767 * 2,
                    reasoning_effort="high"
                )
                elapsed_time = time_module.time() - start_time
                
                if not completion or not completion.choices:
                    raise Exception("LLM API returned an invalid response: completion.choices is empty.")
                
                choice = completion.choices[0]
                if not choice or not choice.message:
                    raise Exception("LLM API returned an invalid response: message is empty.")
                
                response = choice.message
                finish_reason = choice.finish_reason
                
                if finish_reason == "length":
                    usage_info = ""
                    if completion.usage:
                        usage = completion.usage
                        prompt_tokens = getattr(usage, "prompt_tokens", 0) or 0
                        completion_tokens = getattr(usage, "completion_tokens", 0) or 0
                        total_tokens = getattr(usage, "total_tokens", 0) or 0
                        usage_info = (
                            f" (prompt_tokens={prompt_tokens}, "
                            f"completion_tokens={completion_tokens}, total_tokens={total_tokens})"
                        )
                    raise Exception(
                        f"Response was truncated due to max_tokens limit (current max_tokens=32000){usage_info}. "
                        "If completion_tokens is close to this limit, consider increasing max_tokens."
                    )
                
                content = response.content
                if not content or not content.strip():
                    raise Exception(
                        f"LLM returned empty content (finish_reason={finish_reason}, "
                        f"elapsed={elapsed_time:.2f}s)"
                    )
                
                return {
                    "content": content,
                    "role": response.role,
                    "finish_reason": finish_reason,
                    "elapsed_time": elapsed_time,
                    "usage": completion.usage,
                }
            
            except Exception as e:
                error_msg = str(e)
                if attempt < self.max_retries - 1:
                    time.sleep(1)
                else:
                    # Final attempt failed; raise detailed error.
                    raise Exception(f"LLM call failed after {self.max_retries} retries: {error_msg}")
        
        raise Exception("LLM call failed.")
    
    @staticmethod
    def _serialize_usage(usage_obj) -> Optional[Dict[str, int]]:
        """
        Convert a usage object into a JSON-serializable dict.
        
        Args:
            usage_obj: The usage object (may be a CompletionUsage object or a dict).
            
        Returns:
            A dict-form usage; returns None if conversion is not possible.
        """
        if usage_obj is None:
            return None
        
        # If it's already a dict, return it directly.
        if isinstance(usage_obj, dict):
            return {
                "prompt_tokens": usage_obj.get("prompt_tokens", 0) or 0,
                "completion_tokens": usage_obj.get("completion_tokens", 0) or 0,
                "total_tokens": usage_obj.get("total_tokens", 0) or 0,
            }
        
        # If it's an object, extract attributes.
        try:
            prompt_tokens = getattr(usage_obj, 'prompt_tokens', None)
            completion_tokens = getattr(usage_obj, 'completion_tokens', None)
            total_tokens = getattr(usage_obj, 'total_tokens', None)
            
            # If attributes exist, convert them into a dict.
            if prompt_tokens is not None or completion_tokens is not None or total_tokens is not None:
                return {
                    "prompt_tokens": int(prompt_tokens) if prompt_tokens is not None else 0,
                    "completion_tokens": int(completion_tokens) if completion_tokens is not None else 0,
                    "total_tokens": int(total_tokens) if total_tokens is not None else 0,
                }
        except Exception:
            pass
        
        return None
    
    def extract_code_from_response(self, response_text: str) -> str:
        """
        Extract Python code from an LLM response.
        
        Args:
            response_text: Raw LLM response text.
        
        Returns:
            Extracted Python code.
        """
        # Try to extract a fenced code block (```python or ```).
        code_match = re.search(r'```(?:python)?\s*(.*?)\s*```', response_text, re.DOTALL)
        if code_match:
            code = code_match.group(1).strip()
            # Remove potential explanatory prefixes.
            lines = code.split('\n')
            code_lines = []
            for line in lines:
                stripped = line.strip()
                # Skip obviously non-code/explanatory lines.
                if stripped and not stripped.startswith('#') and not stripped.startswith('"""'):
                    code_lines.append(line)
            if code_lines:
                return '\n'.join(code_lines)
        
        # If there's no fenced code block, try extracting from the whole response (assume it is code).
        # Remove potential explanatory text.
        lines = response_text.strip().split('\n')
        code_lines = []
        in_code = False
        
        for line in lines:
            stripped = line.strip()
            # Skip empty lines and obviously explanatory text.
            if not stripped:
                if in_code:
                    code_lines.append(line)  # Keep blank lines within code.
                continue
            
            # Detect code start.
            if any(stripped.startswith(kw) for kw in ['import ', 'from ', 'def ', 'class ', 'if __name__']):
                in_code = True
            
            # If we're already in code, or this line looks like code.
            if in_code or any(kw in stripped for kw in ['import ', '=', 'print(', 'result', 'math.', 'json.']):
                # Skip obvious explanatory comment lines (best-effort heuristic).
                low = stripped.lower()
                if not (stripped.startswith('#') and ('code' in low or 'example' in low)):
                    code_lines.append(line)
                    in_code = True
        
        if code_lines:
            return '\n'.join(code_lines)
        
        # If nothing can be extracted, return the whole response (fail loudly rather than silently).
        return response_text.strip()
    
    def extract_json_from_response(self, response_text: str) -> Optional[Dict[str, Any]]:
        """
        Extract a JSON object from an LLM response.
        
        Args:
            response_text: Raw LLM response text.
        
        Returns:
            Extracted JSON dict; returns None on failure.
        """
        # Try to extract a fenced JSON block (```json or ```).
        json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', response_text, re.DOTALL)
        if json_match:
            try:
                return json.loads(json_match.group(1))
            except json.JSONDecodeError:
                pass
        
        # Try to directly locate a JSON object.
        json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
        if json_match:
            try:
                return json.loads(json_match.group(0))
            except json.JSONDecodeError:
                pass
        
        # If nothing is found, return None.
        return None
    
    def _call_llm_for_coordinates(
        self, question: str, index: Optional[int] = None
    ) -> Dict[str, Any]:
        """
        First LLM call: coordinate construction only (returns code and executes it; computes coordinates only).
        Includes retries: if code execution fails or the format is invalid, it will retry.
        
        Args:
            question: Geometry problem text.
            index: Optional sample index (for logging).
        
        Returns:
            A dict containing the coordinate result:
            - success: bool
            - points: point coordinates
            - code: generated code
            - error: error message (if failed)
        """
        for attempt in range(self.max_retries):
            try:
                prompt = self.format_coordinates_prompt(question)
                
                api_response = self.call_llm(prompt)
                llm_response = api_response.get("content", "")
                usage = api_response.get("usage")
                finish_reason = api_response.get("finish_reason")
                
                if not llm_response:
                    if attempt < self.max_retries - 1:
                        if index is not None:
                            print(f"[Sample {index}] Coordinate attempt {attempt + 1}/{self.max_retries}: LLM returned empty response; retrying...")
                        time.sleep(2 ** attempt)
                        continue
                    return {
                        "success": False,
                        "error": "LLM returned an empty response",
                        "usage": usage,
                    }
                
                # Extract code.
                code = self.extract_code_from_response(llm_response)
                if not code or len(code.strip()) < 10:
                    if attempt < self.max_retries - 1:
                        if index is not None:
                            print(f"[Sample {index}] Coordinate attempt {attempt + 1}/{self.max_retries}: Failed to extract valid code; retrying...")
                        time.sleep(2 ** attempt)
                        continue
                    return {
                        "success": False,
                        "error": "Failed to extract valid code from the LLM response",
                        "llm_response": llm_response,
                        "usage": usage,
                    }
                
                # Execute code.
                try:
                    result = self.safe_execute_code(code)
                    points = result.get("points", {})
                    
                    # Validate format: only check whether points can be parsed (do not validate other fields here).
                    is_valid, error_msg = self._validate_points(points)
                    
                    if not is_valid:
                        if attempt < self.max_retries - 1:
                            if index is not None:
                                print(f"[Sample {index}] Coordinate attempt {attempt + 1}/{self.max_retries}: Format validation failed ({error_msg}); retrying...")
                            time.sleep(2 ** attempt)
                            continue
                        return {
                            "success": False,
                            "error": f"Format validation failed: {error_msg}",
                            "code": code,
                            "usage": usage,
                        }
                    
                    return {
                        "success": True,
                        "points": points,
                        "code": code,
                        "usage": usage,
                        "finish_reason": finish_reason,
                    }
                except Exception as e:
                    if attempt < self.max_retries - 1:
                        if index is not None:
                            print(f"[Sample {index}] Coordinate attempt {attempt + 1}/{self.max_retries}: Code execution failed ({str(e)}); retrying...")
                        time.sleep(2 ** attempt)
                        continue
                    return {
                        "success": False,
                        "error": f"Code execution failed: {str(e)}",
                        "code": code,
                        "usage": usage,
                    }
            
            except Exception as e:
                if attempt < self.max_retries - 1:
                    if index is not None:
                        print(f"[Sample {index}] Coordinate attempt {attempt + 1}/{self.max_retries}: LLM call failed ({str(e)}); retrying...")
                    time.sleep(2 ** attempt)
                    continue
                return {
                    "success": False,
                    "error": f"LLM call failed: {str(e)}",
                }
        
        return {
            "success": False,
            "error": f"Coordinate construction failed (retried {self.max_retries} times)",
        }
    
    def _call_llm_for_annotations(
        self, question: str, index: Optional[int] = None
    ) -> Dict[str, Any]:
        """
        Second LLM call: extract annotations, segments, and circles (returns JSON).
        Includes retries: if JSON extraction fails, it will retry.
        
        Args:
            question: Geometry problem text.
            index: Optional sample index (for logging).
        
        Returns:
            A dict containing annotation extraction results:
            - success: bool
            - segments: segment list
            - circles: circle list
            - annotations: annotations dict
            - error: error message (if failed)
        """
        for attempt in range(self.max_retries):
            try:
                prompt = self.format_annotations_prompt(question)
                
                api_response = self.call_llm(prompt)
                llm_response = api_response.get("content", "")
                usage = api_response.get("usage")
                finish_reason = api_response.get("finish_reason")
                
                if not llm_response:
                    if attempt < self.max_retries - 1:
                        if index is not None:
                            print(f"[Sample {index}] Annotation attempt {attempt + 1}/{self.max_retries}: LLM returned empty response; retrying...")
                        time.sleep(2 ** attempt)
                        continue
                    return {
                        "success": False,
                        "error": "LLM returned an empty response",
                        "usage": usage,
                    }
                
                # Extract JSON.
                result_json = self.extract_json_from_response(llm_response)
                if not result_json:
                    if attempt < self.max_retries - 1:
                        if index is not None:
                            print(f"[Sample {index}] Annotation attempt {attempt + 1}/{self.max_retries}: Failed to extract valid JSON; retrying...")
                        time.sleep(2 ** attempt)
                        continue
                    return {
                        "success": False,
                        "error": "Failed to extract valid JSON from the LLM response",
                        "llm_response": llm_response,
                        "usage": usage,
                    }
                
                return {
                    "success": True,
                    "segments": result_json.get("segments", []),
                    "circles": result_json.get("circles", []),
                    "annotations": result_json.get("annotations", {
                        "right_angles": [],
                        "length_of_line": [],
                        "measure_of_angle": [],
                    }),
                    "usage": usage,
                    "finish_reason": finish_reason,
                }
            
            except Exception as e:
                if attempt < self.max_retries - 1:
                    if index is not None:
                        print(f"[Sample {index}] Annotation attempt {attempt + 1}/{self.max_retries}: LLM call failed ({str(e)}); retrying...")
                    time.sleep(2 ** attempt)
                    continue
                return {
                    "success": False,
                    "error": f"LLM call failed: {str(e)}",
                }
        
        return {
            "success": False,
            "error": f"Annotation extraction failed (retried {self.max_retries} times)",
        }
    
    def _call_llm_for_quantities(
        self, question: str, index: Optional[int] = None, problem_type: str = "computation"
    ) -> Dict[str, Any]:
        """
        Third LLM call: extract target quantities (returns JSON).
        Includes retries: if JSON extraction fails, it will retry.
        
        Args:
            question: Geometry problem text.
            index: Optional sample index (for logging).
            problem_type: Problem type: "computation" or "proof".
        
        Returns:
            A dict containing quantities extraction results:
            - success: bool
            - quantities: list of DSL expressions
            - error: error message (if failed)
        """
        for attempt in range(self.max_retries):
            try:
                prompt = self.format_quantities_prompt(question, problem_type=problem_type)
                
                api_response = self.call_llm(prompt)
                llm_response = api_response.get("content", "")
                usage = api_response.get("usage")
                finish_reason = api_response.get("finish_reason")
                
                if not llm_response:
                    if attempt < self.max_retries - 1:
                        if index is not None:
                            print(f"[Sample {index}] Quantities attempt {attempt + 1}/{self.max_retries}: LLM returned empty response; retrying...")
                        time.sleep(2 ** attempt)
                        continue
                    return {
                        "success": False,
                        "error": "LLM returned an empty response",
                        "usage": usage,
                    }
                
                # Extract JSON.
                result_json = self.extract_json_from_response(llm_response)
                if not result_json:
                    if attempt < self.max_retries - 1:
                        if index is not None:
                            print(f"[Sample {index}] Quantities attempt {attempt + 1}/{self.max_retries}: Failed to extract valid JSON; retrying...")
                        time.sleep(2 ** attempt)
                        continue
                    return {
                        "success": False,
                        "error": "Failed to extract valid JSON from the LLM response",
                        "llm_response": llm_response,
                        "usage": usage,
                    }
                
                return {
                    "success": True,
                    "quantities": result_json.get("quantities", []),
                    "usage": usage,
                    "finish_reason": finish_reason,
                }
            
            except Exception as e:
                if attempt < self.max_retries - 1:
                    if index is not None:
                        print(f"[Sample {index}] Quantities attempt {attempt + 1}/{self.max_retries}: LLM call failed ({str(e)}); retrying...")
                    time.sleep(2 ** attempt)
                    continue
                return {
                    "success": False,
                    "error": f"LLM call failed: {str(e)}",
                }
        
        return {
            "success": False,
            "error": f"Quantities extraction failed (retried {self.max_retries} times)",
        }
    
    def _validate_points(self, points: Dict[str, Any]) -> tuple[bool, Optional[str]]:
        """
        Validate only the `points` field (used during coordinate construction).
        
        Args:
            points: Dict of point coordinates.
        
        Returns:
            (is_valid, error_message): validity flag and error message (if invalid)
        """
        if not points:
            return False, "`points` field is missing or empty"
        
        if not isinstance(points, dict):
            return False, "`points` must be a dict"
        
        if len(points) == 0:
            return False, "`points` is empty; at least one point is required"
        
        # Check coordinate format for each point.
        for point_name, coord in points.items():
            if not isinstance(coord, (tuple, list)) or len(coord) != 2:
                return False, f"Point {point_name} has invalid coordinate format; expected (x, y) or [x, y]"
            
            try:
                x, y = float(coord[0]), float(coord[1])
                if not (isinstance(x, (int, float)) and isinstance(y, (int, float))):
                    return False, f"Point {point_name} coordinates must be numeric"
            except (ValueError, TypeError):
                return False, f"Point {point_name} coordinates cannot be converted to numbers: {coord}"
        
        # Check for duplicate point coordinates (plotter cannot parse overlapping distinct points reliably).
        point_list = list(points.items())
        for i in range(len(point_list)):
            name1, coord1 = point_list[i]
            coord1_tuple = (float(coord1[0]), float(coord1[1]))
            for j in range(i + 1, len(point_list)):
                name2, coord2 = point_list[j]
                coord2_tuple = (float(coord2[0]), float(coord2[1]))
                if coord1_tuple == coord2_tuple:
                    return False, f"Points {name1} and {name2} share the same coordinates; plotter cannot parse this"
        
        return True, None
    
    def _validate_plotting_code(self, plotting_code: Dict[str, Any]) -> tuple[bool, Optional[str]]:
        """
        Validate whether the full plotting_code format can be parsed by the plotter.
        Used for final validation after merging.
        
        Args:
            plotting_code: Plotting code dict (structured representation).
        
        Returns:
            (is_valid, error_message): validity flag and error message (if invalid)
        """
        # First validate points (required).
        points = plotting_code.get("points")
        is_valid, error_msg = self._validate_points(points if points else {})
        if not is_valid:
            return False, error_msg
        
        # Validate segments format (if present).
        segments = plotting_code.get("segments")
        if segments is not None and not isinstance(segments, (list, tuple)):
            return False, "`segments` must be a list or tuple"
        
        # Validate circles format (if present).
        circles = plotting_code.get("circles")
        if circles is not None and not isinstance(circles, (list, tuple)):
            return False, "`circles` must be a list or tuple"
        
        # Validate annotations format (if present).
        annotations = plotting_code.get("annotations")
        if annotations is not None and not isinstance(annotations, dict):
            return False, "`annotations` must be a dict"
        
        return True, None
    
    def safe_execute_code(self, code: str, timeout: int = 10) -> Dict[str, Any]:
        """
        Safely execute Python code and extract `result`.
        
        Args:
            code: Python code.
            timeout: Timeout in seconds.
        
        Returns:
            The `result` dict.
        """
        # Ensure required imports exist.
        if 'import json' not in code:
            code = 'import json\n' + code
        if 'import math' not in code:
            code = 'import math\n' + code
        
        # Create a temporary file.
        with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False, encoding='utf-8') as f:
            f.write(code)
            temp_file = f.name
        
        try:
            # Execute code.
            result = subprocess.run(
                ['python3', temp_file],
                capture_output=True,
                text=True,
                timeout=timeout,
            )
            
            if result.returncode != 0:
                raise Exception(f"Code execution failed: {result.stderr}")
            
            # Extract JSON from stdout.
            output = result.stdout.strip()
            
            if not output:
                raise Exception("No output from executed code. Make sure the code ends with print(json.dumps(result)).")
            
            # Try parsing JSON.
            try:
                # Find a JSON object.
                json_match = re.search(r'\{.*\}', output, re.DOTALL)
                if json_match:
                    result_dict = json.loads(json_match.group(0))
                    return result_dict
            except json.JSONDecodeError as e:
                # If JSON parsing fails, include more context.
                raise Exception(f"JSON parsing failed: {e}. Output: {output[:500]}")
            
            # If stdout is not JSON, we could try eval (unsafe) as a fallback,
            # but we prefer requiring the LLM to output JSON.
            
            raise Exception(
                f"Failed to extract valid JSON from output. Output length: {len(output)}; first 500 chars: {output[:500]}"
            )
        
        finally:
            # Clean up temp file.
            try:
                os.unlink(temp_file)
            except:
                pass

    @staticmethod
    def _normalize_point_name(name: Any) -> str:
        return str(name).strip().lower()

    def _normalize_angle_triplets(self, triples: Any) -> List[List[str]]:
        normalized: List[List[str]] = []
        if not isinstance(triples, (list, tuple)):
            return normalized
        for triple in triples:
            if isinstance(triple, (list, tuple)) and len(triple) == 3:
                normalized.append(
                    [self._normalize_point_name(p) for p in triple]
                )
        return normalized

    def _normalize_equal_lines(self, groups: Any) -> List[List[List[str]]]:
        normalized: List[List[List[str]]] = []
        if not isinstance(groups, (list, tuple)):
            return normalized
        for group in groups:
            if not isinstance(group, (list, tuple)):
                continue
            norm_group: List[List[str]] = []
            for segment in group:
                norm_segment = self._normalize_segment(segment)
                if norm_segment:
                    norm_group.append(norm_segment)
            if norm_group:
                normalized.append(norm_group)
        return normalized

    def _normalize_length_annotations(self, entries: Any) -> List[List[Any]]:
        normalized: List[List[Any]] = []
        if not isinstance(entries, (list, tuple)):
            return normalized
        for entry in entries:
            if (
                isinstance(entry, (list, tuple))
                and len(entry) == 2
                and isinstance(entry[0], (list, tuple))
            ):
                segment = self._normalize_segment(entry[0])
                value = entry[1]
                if segment:
                    normalized.append([segment, str(value)])
        return normalized

    def _normalize_angle_measures(self, entries: Any) -> List[List[Any]]:
        normalized: List[List[Any]] = []
        if not isinstance(entries, (list, tuple)):
            return normalized
        for entry in entries:
            if (
                isinstance(entry, (list, tuple))
                and len(entry) == 2
                and isinstance(entry[0], (list, tuple))
                and len(entry[0]) == 3
            ):
                triple = [
                    self._normalize_point_name(p) for p in entry[0]
                ]
                value = entry[1]
                normalized.append([triple, str(value)])
        return normalized

    def _normalize_segment(self, segment: Any) -> Optional[List[str]]:
        if isinstance(segment, (list, tuple)) and len(segment) == 2:
            return [
                self._normalize_point_name(segment[0]),
                self._normalize_point_name(segment[1]),
            ]
        return None
    
    def plot(self, question: str, index: Optional[int] = None, problem_type: str = "computation") -> Dict[str, Any]:
        """
        Process a geometry problem and generate plotting code (structured representation).
        
        New implementation: split into three LLM calls:
        1) Coordinates (code)
        2) Annotations / segments / circles (JSON)
        3) Target quantities (JSON)
        Then merge into plotting_code.
        
        Args:
            question: Geometry problem text.
            index: Optional sample index (for logging).
            problem_type: Problem type: "computation" or "proof".
        
        Returns:
            Result dict containing:
            - success: bool
            - plotting_code: plotting code (structured representation, if successful)
            - llm_response: raw LLM response (merged from calls, for debugging)
            - code: generated code (coordinate code)
            - error: error message (if failed)
        """
        if index is not None:
            print(f"[Sample {index}] Plotting...")
        
        # First call: coordinate construction (includes retries and validation).
        coord_result = self._call_llm_for_coordinates(question, index)
        if not coord_result.get("success"):
            error_msg = coord_result.get("error", "Coordinate construction failed")
            return {
                "success": False,
                "error": f"Coordinate construction failed: {error_msg}",
                "llm_response": "",
                "usage": coord_result.get("usage"),
            }
        
        # Second call: annotations / segments / circles extraction (includes retries and validation).
        annot_result = self._call_llm_for_annotations(question, index)
        if not annot_result.get("success"):
            error_msg = annot_result.get("error", "Annotation extraction failed")
            return {
                "success": False,
                "error": f"Annotation extraction failed: {error_msg}",
                "llm_response": "",
                "usage": annot_result.get("usage"),
            }
        
        # Third call: target quantities extraction (includes retries and validation).
        quant_result = self._call_llm_for_quantities(question, index, problem_type=problem_type)
        if not quant_result.get("success"):
            error_msg = quant_result.get("error", "Quantities generation failed")
            return {
                "success": False,
                "error": f"Quantities generation failed: {error_msg}",
                "llm_response": "",
                "usage": quant_result.get("usage"),
            }
        
        # Merge results.
        # Points come only from coordinate construction.
        points = coord_result.get("points", {})
        
        # Segments come only from annotation extraction.
        segments = annot_result.get("segments", [])
        
        # Circles come only from annotation extraction.
        circles = annot_result.get("circles", [])
        
        # Annotations come only from annotation extraction.
        annotations = annot_result.get("annotations", {
            "right_angles": [],
            "length_of_line": [],
            "measure_of_angle": [],
        })
        
        # Quantities come only from the third call.
        quantities = quant_result.get("quantities", [])
        
        plotting_code = {
            "points": points,
            "segments": segments,
            "circles": circles,
            "quantities": quantities,
            "annotations": annotations,
        }
        
        # Final validation: ensure merged plotting_code can be parsed by the plotter.
        is_valid, error_msg = self._validate_plotting_code(plotting_code)
        if not is_valid:
            return {
                "success": False,
                "error": f"Final format validation failed: {error_msg}",
                "llm_response": "",
                "usage": coord_result.get("usage"),
            }
        
        # Merge usage info.
        all_usages = [
            coord_result.get("usage"),
            annot_result.get("usage"),
            quant_result.get("usage"),
        ]
        # Compute total usage (simple summation).
        total_usage = None
        if any(all_usages):
            total_prompt_tokens = sum(
                self._serialize_usage(u).get("prompt_tokens", 0) if u else 0
                for u in all_usages
            )
            total_completion_tokens = sum(
                self._serialize_usage(u).get("completion_tokens", 0) if u else 0
                for u in all_usages
            )
            total_usage = {
                "prompt_tokens": total_prompt_tokens,
                "completion_tokens": total_completion_tokens,
                "total_tokens": total_prompt_tokens + total_completion_tokens,
            }
        
        # Record call details.
        import time as time_module
        call_details = {
            "timestamp": time_module.time(),
            "elapsed_time": 0,  # Total elapsed time across the three calls (can be accumulated)
            "response_length": 0,  # Combined response length
            "finish_reason": "stop",  # If all succeed
            "usage": total_usage,
        }
        
        # Merge llm_response (for debugging).
        llm_responses = []
        if coord_result.get("code"):
            llm_responses.append(f"=== Coordinate code ===\n{coord_result.get('code', '')}")
        # Other response summaries can be appended here.
        
        return {
            "success": True,
            "plotting_code": plotting_code,
            "llm_response": "\n\n".join(llm_responses) if llm_responses else "",
            "code": coord_result.get("code", ""),  # coordinate code
            "result": plotting_code,  # for compatibility
            "usage": total_usage,
            "finish_reason": "stop",
            "call_details": call_details,
        }
        