from .agentic_world_framework import initialize_agent,agent2world_gen_cwmb_code
from .utils import WorldModelPromptBase
from typing import Any

class MctsPrompt(WorldModelPromptBase):
    def __init__(self, environemnt: str, task_describe: str, simulator: Any) -> None:
        """Store the task context and load the base prompt templates from disk.

        Args:
            environemnt: Environment name interpolated into the research
                prompt. The misspelt parameter/attribute name is kept as-is
                because other code may already refer to it by name.
            task_describe: Free-form task description interpolated into the
                research, generation and pytest prompts.
            simulator: Any object exposing ``i_prompt_path``, ``g_prompt_path``
                and ``f_prompt_path`` attributes, each a path accepted by
                ``open()``.

        Raises:
            AttributeError: If ``simulator`` lacks one of the three path
                attributes (this includes ``simulator is None``).
            OSError: If one of the prompt files cannot be opened.
        """
        self.environemnt = environemnt
        self.task_describe = task_describe
        # Starts empty; interpolated by the generate/improve prompts.
        # NOTE(review): presumably filled in by the pipeline after the
        # research stage - confirm against the caller.
        self.research_report = ""

        def _read_prompt(path) -> str:
            # Explicit UTF-8: prompt text contains non-ASCII characters, so
            # relying on the platform default encoding is not portable.
            with open(path, "r", encoding="utf-8") as f:
                return f.read()

        # Base prompt used by build_improve_code_prompt.
        self.i_base_prompt = _read_prompt(simulator.i_prompt_path)
        # Loaded for completeness; not referenced by the methods in this class.
        self.g_base_prompt = _read_prompt(simulator.g_prompt_path)
        # Base prompt used by build_fix_code_prompt.
        self.f_base_prompt = _read_prompt(simulator.f_prompt_path)

        # Output-format footer appended to every code-producing prompt.
        self.gen_code_common = """
        
        <Output Format>

        <final>
        <code_file_path>
        The entrypoint file path of the generated code.
        </code_file_path>
        <entrypoint_code>
        ```python
        # Your complete, runnable single-file implementation here.
        ```
        </entrypoint_code>
        </final>

        </Output Format>
        
        """
    def build_research_prompt(self) -> str:
        """Build the research-stage prompt.

        The prompt asks the model to turn the task description into a
        technical specification. Only the environment name and the task
        description are interpolated; the result has leading and trailing
        whitespace stripped.

        Returns:
            The fully rendered prompt text.
        """
        env_name = self.environemnt
        task_text = self.task_describe
        prompt = f"""
You are a world-class Systems Analyst and Technical Specification Writer, specializing in creating reinforcement learning environments. Your mission is to transform an ambiguous task description into a precise, actionable, and verifiable technical specification.

<Environment Name>
{env_name}
</Environment Name>

<TASK DESCRIPTION>
{task_text}
</TASK DESCRIPTION>

<Workflow>
Please strictly follow the following six-step process:
1. **Deconstruction and Analysis (Use Version Locking)**
- Identify all ambiguities, gaps, and conflicts in the task description.
- Lock the exact environment version and key library versions (record name, version, and source link).
- Categorize gaps by type: missing value/unit/range boundary/time-sensitive/ambiguous reference/unclosed list/conflict/no provenance.

2. **Planning and Investigation (Authoritative Search + Evidence Log)**
- For each high/mid-level gap, write 1-2 focused queries that include: synonyms/abbreviations, site filters to authoritative domains (e.g., site:numpy.org, site:docs.python.org), and recency windows (e.g., after:2024-01-01 or "last 2 years").
- Execute the query using browser_search and open ≥ 2 trusted results with browser_open.
- If the top sources disagree, open ≥1 additional authoritative sources and triangulate.
- Create an evidence log entry for each opened page: Title | Organization/Author | Version/Submission | URL (+ archived URL) | Publication Date | Access Date (Asia/Singapore) | 3 Key Facts | Confidence (High/Medium/Low).

3. **Synthesis and Citation (Conflict Resolution)**
- Integrate the findings into a concise evidence summary with citations.
- When sources conflict, explain the differences and justify the chosen resolution (related to version locking).

4. **Refinement and Improvement (Specification Patch)**
- Generate a structured "diff": action/observation space; rewards; termination/truncation; timing (dt/frame_skip); seeding and certainty; numerical tolerances; dependencies; interface flags.

5. **Formalization and Finalization (Ready-to-Use Specification)**
- Write the final specification according to the <Output Format>, including the public API, core logic, usage scenarios, and a verification plan aligned with metrics and statistical validation.

6. **Review and Self-Correction (Compliance Check)**
- Verify conformance to the <Output Constraints>, version consistency, SI units, ISO dates, and ensure no runnable code is included.
</Workflow>

<OUTPUT_CONSTRAINTS>
* Strictly adhere to the structure defined in <PLANNING_STRUCTURE>.
* Do NOT output runnable code definitions (classes, functions). Only may include short illustrative snippets or pseudo-code.
* All claims about industry standards or common practices MUST be supported by citations.
* Use ISO-8601 dates (e.g., 2025-09-02).
* Use SI units for physical and mathematical quantities.
* Data-leakage rule: Do not access, copy, quote, or derive from raw source code repositories. Prefer official documentation, standards, papers, or reputable secondary sources. If the only available evidence is a code repository, summarize behavior without copying code and mark it as an inference with risks.
</OUTPUT_CONSTRAINTS>

<PLANNING_STRUCTURE>
Your output must begin with this planning and analysis section.
1. **Ambiguity Analysis**
   - List each ambiguity/vagueness/conflict and mark Impact: High / Medium / Low.
   - Cover at least: missing numeric value, missing unit, missing boundary/range, time-sensitive items, unclear references, open lists ("etc."/"e.g."), conflicts, and missing citation.

2. **Investigation Plan**
   - For each High/Medium item, provide one atomic question.
   - For each question, provide 1–2 executable queries including: synonyms/abbreviations, a site filter to authoritative domains, and a time window (e.g., after:2024-01-01 or "past 2 years").
   - State the minimum evidence policy: High/Medium → ≥2 credible sources; if disagreement → add ≥1 more for triangulation.
</PLANNING_STRUCTURE>

<Formula requirements>
* For any formula, define all symbols, units, and applicability constraints.
* Cite the source of the formula immediately after its definition.
* Provide the complete formula rather than a descriptive explanation.
</Formula requirements>

<OUTPUT_FORMAT>
Please provide the final specification document structured as follows. This is the primary deliverable. Do NOT include code.

**Version & Provenance**
* Environment name and version lock (e.g., vX.Y); document/library versions; commit hash (if applicable).
* Source release date AND Accessed date (ISO-8601, Asia/Singapore).
* Scope note: this specification normalizes any legacy or paper-specific differences.

**Evidence Summary**
* For each ambiguity/issue, summarize findings with references.
* Citation format: [Source Name, YYYY-MM-DD] "Optional ≤20-word quote."
* If sources conflict, describe the conflict and justify the chosen resolution.

**Spec Patch**
* Structured key–value decisions that resolve ambiguities (the "diff").
* Cover: action/observation spaces; reward function; termination/truncation; timing (dt/frame_skip); seeding; dependencies; numeric tolerances; interface flags.
* Example entries: `reward.penalty.late = -0.5`, `termination.max_steps = 1000`, `obs.dtype = float32`.

**Theoretical Foundations**
* Formal MDP definition and core equations (e.g., discounted return, Bellman optimality).
* Domain formulas (physics/math) with full definitions, units, ranges, and assumptions.
* If reward shaping is used, specify potential-based shaping formula and cite invariance guarantees.
* Provide a Symbol Table listing every symbol (name, unit, allowable range).

**Final Specification**
* Standalone, unambiguous description integrating the Spec Patch.
* Public API Definition: class name; __init__ parameters; method signatures (reset, step, set_state, etc.) with type hints (no code).
* Core Logic Description: precise `reset`/`step` semantics; state update rules; reward calculation; termination and truncation checks; info fields.
* Usage Scenarios & Examples: at least two multi-step traces. For each step, list:
  - observation, action, reward, terminated (bool), truncated (bool), info (dict)
  - Use SI units and exact values/ranges; no vague terms ("about", "etc.").

**Assumptions & Risks**
* Assumptions made due to missing/conflicting evidence; risks and potential impacts.

**Library Usage**
* How official documentation informs interfaces, units, and signatures.
* Allowed libraries: Python standard library and NumPy only.
* Cite specific APIs (e.g., numpy.random.Generator seeding; np.clip semantics; math module functions).
* State explicitly: only standard library and NumPy are used; no external dependencies are introduced by this spec and no source code is copied; documentation is used only to clarify behavior.

</OUTPUT_FORMAT>

<CORE_PRIORITIES>
Correctness (evidence-backed) > Reproducibility (versioning & seeding) > Clarity (unambiguous) > Completeness (no open-ended lists).
</CORE_PRIORITIES>
"""
        return prompt.strip()

    def build_gen_code_prompt(
        self,
        feedback: str | dict | None = None,
        debug: bool = False
    ) -> str:
        """Pick and build the code prompt that matches the current stage.

        Args:
            feedback: Feedback from a previous round, if any.
            debug: When true, always build the fix prompt.

        Returns:
            The fix prompt if ``debug`` is set, otherwise the improve prompt
            when ``feedback`` is truthy, otherwise the initial generation
            prompt.
        """
        # Debug mode takes precedence, even when no feedback was supplied.
        if debug:
            return self.build_fix_code_prompt(feedback)
        # Truthiness check: None, "" and {} all mean "nothing to improve on".
        if not feedback:
            return self.build_generate_code_prompt()
        return self.build_improve_code_prompt(feedback)
    def build_generate_code_prompt(self) -> str:
        """Build the first-pass code generation prompt.

        The prompt is made of the task description, the current research
        report, a fixed workflow, the implementation requirements and the
        shared output-format footer. Leading and trailing whitespace of the
        rendered text is stripped.

        Returns:
            The fully rendered prompt text.
        """
        task_text = self.task_describe
        report = self.research_report
        output_format = self.gen_code_common
        prompt = f"""
{task_text}

<Research Report>
{report}
</Research Report>

<Workflow>
1. **Specification Analysis:** Thoroughly analyze the task description and research report to understand the environment's requirements, including state/action spaces, dynamics, reward function, and termination conditions.
2. **Mathematical Modeling:** Evaluate if the task requires mathematical/physical simulation. If so, design appropriate mathematical models using standard numerical methods.
3. **Architecture Design:** Plan the Environment class structure, including internal state variables, helper methods, and the public interface methods.
4. **Implementation:** Write the complete implementation of the Environment class, ensuring all mathematical models are implemented directly within the class using standard library and NumPy.
5. **Quality Assurance:** Verify that the generated code fully complies with the task description, research report, and all implementation requirements.
6. **Final Review:** Present the complete, reviewed, and runnable single-file code in the specified format.
</Workflow>

<ImplementationRequirements>
**Interface Requirements (Single File):**
Implement a complete, self-contained Python class `Environment` with:
  • `__init__(self, seed: int | None = None)` - Initialize with optional seeding
  • `reset(self, seed: int | None = None) -> ndarray` - Reset episode and return initial observation in canonical shape
  • `set_state(self, state)` - Accept ndarray OR list/tuple in canonical shape
  • `step(self, action) -> tuple[ndarray, float, bool]` - Return (observation, reward, done)

**Core Constraints:**
- **Single-file constraint**: All code, including mathematical model definitions, must be in one Python file
- **No external dependencies**: Only Python standard library and NumPy allowed
- **No framework dependencies**: No Gym inheritance or external RL frameworks
- **Self-contained**: No external file dependencies; embed all model definitions

**Robustness Requirements:**
- **Input flexibility**: Handle equivalent representations (int/numpy scalar/0-D array/1-D len-1 array)
- **Input validation**: Raise clear ValueError/TypeError for invalid inputs with informative messages
- **State consistency**: Maintain internal state consistency throughout episode
- **Reproducibility**: Provide seeding via constructor and/or seed() method

**Mathematical Simulation Requirements:**
- **Numerical methods**: Implement appropriate integration methods (Euler, RK4) using NumPy
- **Stable timestep**: Use consistent dt; document integration scheme choice
- **Physical constraints**: Implement proper bounds checking and constraint handling
- **Finite values**: Ensure all values remain finite (no NaN/Inf)

**Performance Requirements:**
- **Efficient computation**: Keep per-step computation lightweight and allocation-minimal
- **Memory efficiency**: Minimize memory allocations in step() method
- **Numerical stability**: Use stable numerical methods; clamp to safety bounds

**Space Definitions:**
- **Action space**: Explicitly define type, shape, range, and format
- **Observation space**: Clearly specify dimensions, bounds, and data types
- **State space**: Document internal state representation and constraints

**Code Quality:**
- Clean, readable code suitable for RL research and experimentation
- Clear docstrings and inline comments explaining key components
- Proper error handling with informative messages
- Type conversion and validation for robustness
</ImplementationRequirements>

{output_format}

"""
        return prompt.strip()
    
    def build_fix_code_prompt(self, feedback) -> str:
        """Build the prompt that asks for a targeted fix of failing code.

        Args:
            feedback: Error report to embed; it is rendered with normal
                f-string formatting, so any object with a string form works.

        Returns:
            The fix base prompt, a fixed workflow, the feedback and the
            shared output-format footer, stripped of surrounding whitespace.
        """
        base_prompt = self.f_base_prompt
        output_format = self.gen_code_common
        prompt = f"""
{base_prompt}

<Workflow>
1. **Error Analysis:** Carefully analyze the feedback to identify specific errors, exceptions, or failures in the existing code.
2. **Root Cause Investigation:** Trace back to the underlying causes - whether it's logic errors, type mismatches, model syntax issues, missing edge cases, incorrect numerical method usage, or incorrect implementations.
3. **Targeted Fix Planning:** Plan minimal, focused changes that directly address the identified issues without breaking existing functionality.
4. **Code Correction:** Implement the specific fixes, ensuring they resolve the reported problems while maintaining code integrity and single-file constraint. For numerical issues, consider standard library and NumPy best practices.
5. **Validation Review:** Verify that the fixes address all reported issues and don't introduce new problems.
</Workflow>

<feedback>
{feedback}
</feedback>
{output_format}

"""
        return prompt.strip()
    
    def build_improve_code_prompt(self, feedback) -> str:
        """Build the prompt that asks for an incremental improvement of code.

        Args:
            feedback: Evaluation feedback to embed; it is rendered with normal
                f-string formatting, so any object with a string form works.

        Returns:
            The improve base prompt, a fixed workflow, the current research
            report, the feedback and the shared output-format footer,
            stripped of surrounding whitespace.
        """
        base_prompt = self.i_base_prompt
        report = self.research_report
        output_format = self.gen_code_common
        prompt = f"""
{base_prompt}

<Workflow>
1. **Performance Gap Analysis:** Analyze the feedback to identify areas where the current implementation falls short of requirements or optimal performance.
2. **Enhancement Opportunity Assessment:** Review the existing code structure, model design, and the research report to identify specific improvement opportunities, particularly for numerical method implementation.
3. **Optimization Strategy Planning:** Plan targeted enhancements that improve functionality, performance, or compliance while preserving working components and maintaining single-file constraint. Consider standard library and NumPy optimizations.
4. **Incremental Code Enhancement:** Implement improvements systematically, building upon the existing codebase rather than rewriting from scratch.
5. **Quality Assurance Review:** Ensure enhancements meet the requirements and maintain backward compatibility where applicable.
</Workflow>

<research_report>
{report}
</research_report>
<feedback>
{feedback}
</feedback>
{output_format}
"""
        return prompt.strip()


    def build_play_env_prompt(self, code: str, code_file_path: str) -> str:
        """Build the prompt for the agent that runs and diagnoses the environment.

        Args:
            code: Source code of the generated environment, embedded in a
                fenced ``python`` block inside ``<CodeArtifact>``.
            code_file_path: Path of that code file; used both as the artifact
                path and as the target of the ``play_env`` tool.

        Returns:
            The rendered prompt, stripped of surrounding whitespace. It asks
            for exactly one ``<final>`` block holding a JSON object with the
            keys ``success``, ``analysis`` and ``suggest_fix``.
        """
        # The code fence is closed right after {code}. Without the closing
        # fence the rest of the prompt (policy, rubric, output format) reads
        # as part of the code block.
        # Doubled braces and "\\n" below are f-string/str escapes: they render
        # as literal "{", "}" and a backslash-n sequence in the JSON example.
        return f"""
Your task is to interact with the environment code and then analyze the feedback from the interaction and propose modifications

<CodeArtifact path="{code_file_path}">
```python
{code}
```
</CodeArtifact> <ExecutionPolicy> - Use the play_env tool exactly once on "{code_file_path}" - If the tool throws or cannot run, perform diagnosis from static review only; still produce output in the required format. </ExecutionPolicy> <Rubric> **Success Criteria (Hierarchical Evaluation):**
Primary (step-level signals present):

success = true iff the run finished without exceptions AND there is NO misclassified_transition with (valid == false OR state_matches == false).
If only observation deltas are available, use obs_matches instead of state_matches.
When numeric deltas are provided, treat matches = true if max_abs_error ≤ 1e-3 or rel_error ≤ 1e-3.
Secondary (no per-step signals):

If success_rate exists: success = true iff no exceptions AND success_rate ≥ 0.95.
Else: success = true iff no exceptions AND no invariant/contract violations you can substantiate from code and logs.
Specific Validation Checks:

Reward/Termination Consistency:

If reward_matches == true AND done_matches == true, explicitly state they match and DO NOT propose changes to reward or termination logic.
Action Space Validation:

If GT exposes Box(low, high, shape): align predicted bounds and expose them (e.g., env.action_space or getter). Never place clipping inside the integrator; clamp only at action ingestion or at observation output.
If GT exposes Discrete(n): actions must be integer indices in [0, n-1]; expose n as a class attribute or property; if indices map to continuous commands/torques, list the mapping table and align it with GT; never float-clip discrete actions.
If action-space info is missing, skip these checks (do not speculate).
State vs Observation Handling:

If clipping or angle normalization is found inside the integrator step (e.g., in _rk4_step), this likely causes trajectory drift; propose moving them to the observation path (e.g., _get_observation) unless GT specifies otherwise.
If latent state is unavailable but observations exist, compare observations instead and state this explicitly.
Integrator & Timestep:

Mismatches in integrator method (e.g., RK4 vs Euler) or dt can cause state divergence even when reward/done match; acknowledge and, if state mismatches persist, propose aligning method/dt to GT.
Batched/Multiple Transitions:

If multiple transitions are reported, aggregate sensibly (e.g., mean success_rate or fraction matched ≥ 0.95) before deciding success. </Rubric>
<Procedure> 1. **Static Review:** Scan for action bounds, clipping/normalization inside integrator, integrator/dt choice, and how observation is formed. 2. **Dynamic Execution:** Call play_env tool once. 3. **Signal Diagnosis:** Reconcile play_env results with code analysis; if reward/done matched, explicitly say so. If state mismatched, point to ONE OR TWO most likely root causes. 4. **Targeted Recommendations:** Suggest 1–3 minimal patches that directly address the identified root causes. </Procedure> <OutputFormat> Return exactly one <final> block containing a single JSON object that matches PlayReport: {{ "success": true|false, "analysis": "<2–4 sentences summarizing what happened and why; mention matches/mismatches explicitly>", "suggest_fix": "- bullet 1\\n- bullet 2\\n- bullet 3 (optional)" }} No extra text outside <final>. No additional code fences. </OutputFormat>
""".strip()


    def build_pytest_env_prompt(self, code: str, code_file_path: str) -> str:
        """Build the prompt for the agent that writes and runs pytest checks.

        Args:
            code: Source code of the generated environment, embedded in a
                fenced ``python`` block inside ``<CodeArtifact>``.
            code_file_path: Path of that code file; used as the artifact path
                and as the import target named in the execution policy.

        Returns:
            The rendered prompt, stripped of surrounding whitespace. It asks
            for exactly one ``<final>`` block holding a JSON object with the
            keys ``success``, ``analysis`` and ``suggest_fix``.
        """
        task_text = self.task_describe
        # Doubled braces below are f-string escapes for the literal JSON braces.
        prompt = f"""

PYTEST_EVAL

<Task Description> {task_text} </Task Description> <CodeArtifact path="{code_file_path}"> ```python {code} ``` </CodeArtifact> <ExecutionPolicy> - Do not modify the student's source file. - Create exactly one pytest file at "tests/test_env.py" using file_tool('save'). - Import the module from "{code_file_path}" via importlib (spec_from_file_location + module_from_spec). - Run tests with code_tool('run', 'pytest -q'); capture exit_code, duration, and stdout/stderr tail. </ExecutionPolicy> <TestPlan> **Comprehensive Test Coverage:**
Sanity Tests:

Environment class can be imported successfully
Environment can be instantiated with Environment(seed=0)
Basic method existence and callability
Contract Compliance Tests:

State Setting Flexibility: set_state accepts list/tuple/ndarray of the same logical content (convert to canonical).
Step Method Contract: step(action) returns a 3-tuple: (observation, reward, done) with expected types/shapes.
Determinism Verification: With the same seed and same initial state, the first step with the same action yields identical outputs.
Action Space Validation: Actions within bounds are accepted, out-of-bounds actions are handled gracefully.
Observation Space Validation: Observations match declared space bounds and shapes.
State Space Consistency: Internal state dimensions match expected environment specifications.
Robustness Tests:

Edge case handling (boundary values, extreme inputs)
Error handling with informative messages
Type conversion and validation
Numerical stability (no NaN/Inf values)
Integration Tests:

Multi-step episode execution
Reset functionality and state reinitialization
Seed consistency across multiple episodes
Acceptance Criteria:

success = true if pytest exit_code == 0 (all tests pass)
Partial success evaluation for diagnostic purposes if failures occur </TestPlan>
<ReportingGuidelines> - Summarize pytest results in 2–4 sentences; mention the first failing nodeid/assert if any. - Provide a brief contract coverage assessment and the most probable root cause for failures. - If failing, add 1–3 concise actionable fixes (no long logs). </ReportingGuidelines> <OutputFormat> Return exactly one <final> block containing a single JSON object that matches PytestReport: {{ "success": true|false, "analysis": "<2–4 sentence summary/diagnosis>", "suggest_fix": "<optional 1–3 bullets with minimal actionable changes>" }} No extra text outside <final>. No additional code fences. </OutputFormat>
"""
        return prompt.strip()
    


def generate_mcts_code(
    benchmark_type: str,
    environemnt: str,
    task_describe: str,
    results_base_dir: str = "./result",
    simulator = None,
    enable_research: bool = True,
    enable_player: bool = True,
    enable_pytest: bool = True
) -> str:
    """
    Run the agent pipeline for one environment and return its output.

    Steps, in order:
      1. Create the agents with ``initialize_agent``.
      2. Build a ``MctsPrompt`` for the environment and task.
      3. Hand both to ``agent2world_gen_cwmb_code`` and return its result.

    Args:
        benchmark_type: Forwarded to ``initialize_agent``.
        environemnt: Environment name given to ``MctsPrompt`` (spelling kept
            for compatibility with existing callers).
        task_describe: Task description given to ``MctsPrompt``.
        results_base_dir: Forwarded to ``initialize_agent``.
        simulator: Forwarded to ``initialize_agent`` and ``MctsPrompt``.
            ``MctsPrompt`` reads ``i_prompt_path``, ``g_prompt_path`` and
            ``f_prompt_path`` from it, so the ``None`` default fails there.
        enable_research: Stage switch forwarded to both pipeline calls.
        enable_player: Stage switch forwarded to both pipeline calls.
        enable_pytest: Stage switch forwarded to both pipeline calls.

    Returns:
        Whatever ``agent2world_gen_cwmb_code`` returns; annotated as ``str``
        (presumably the final environment code - confirm against that
        function).
    """
    # The same three switches go to both calls; collect them once.
    stage_switches = {
        "enable_research": enable_research,
        "enable_player": enable_player,
        "enable_pytest": enable_pytest,
    }

    agents = initialize_agent(
        benchmark_type,
        results_base_dir,
        simulator,
        **stage_switches,
    )
    prompt_builder = MctsPrompt(environemnt, task_describe, simulator)

    return agent2world_gen_cwmb_code(agents, prompt_builder, **stage_switches)
