import re
import json
import logging
from typing import Dict, Any, List
from .base import BaseAgent
from .common_prompts import TASK_CONTEXT

from json_repair import repair_json

# =============================================================================
# SUMMARIZE AGENT PROMPT COMPONENTS
# =============================================================================

# Role section of the summarize-agent prompt: tells the LLM what the agent is,
# its mission, its responsibilities and what a successful index must contain.
# This string is concatenated into SUMMARIZE_AGENT_COMPLETE_PROMPT, which is
# later passed through str.format(); any literal "{" or "}" added here must
# therefore be doubled.
SUMMARIZE_AGENT_BASE_PROMPT = """
<SUMMARIZE_AGENT_ROLE>
# Summarize Agent - Technical Documentation Indexing Specialist

## **Your Role in the System**
You are the **Summarize Agent** in this multi-agent collaborative system. As the first agent in the pipeline, you convert long technical document sections into indexed, concise summaries that enable efficient document retrieval by downstream agents.

## **Primary Mission**
Transform complex technical documentation into structured, searchable index entries that describe what technical content is covered in each section, enabling precise document allocation for subsequent agents.

## **Key Responsibilities**
* **Document Analysis**: Process every section of technical documents systematically
* **Index Creation**: Generate concise descriptions identifying modules, signals, and technical content
* **Retrieval Optimization**: Focus on WHAT information can be found, not WHAT it does
* **Complete Coverage**: Ensure ALL sections are processed without any truncation or omission

## **Critical Success Factors**
* **Module Identification**: MUST explicitly mention which module each section belongs to
* **Signal Documentation**: Clearly identify signal groups, interfaces, and connections
* **Technical Content Mapping**: Describe I/O definitions, clock logic, control signals, etc.
* **Downstream Agent Support**: Create indexes that enable Planner Agent to allocate appropriate documents
</SUMMARIZE_AGENT_ROLE>
"""

# Guidelines section of the prompt: lists what every per-section index
# description must identify and the quality standards for the wording.
# Part of SUMMARIZE_AGENT_COMPLETE_PROMPT, which goes through str.format();
# the placeholders below use square brackets, so no brace escaping is needed,
# but any literal "{" or "}" added here must be doubled.
SUMMARIZE_SECTION_PROCESSING = """
<SECTION_PROCESSING_GUIDELINES>
# Section Processing Guidelines

## **Index Description Requirements**
For each section, create index description identifying:
* **Module Identification**: MUST explicitly mention which module the interface/logic in this section belongs to
  - If describing a sub-module's interface: "This section covers [sub_module_name] module's [interface_type] signals..."
  - If describing the main module's interface: "This section covers [main_module_name] module's [interface_type] signals..."
* **Specific Modules**: What modules are described in this section, or which module is the interface of this section describes belongs to
* **Signal Groups**: What signal groups or individual signals are documented  
* **Technical Content**: What type of content is covered (I/O definitions, clock logic, control signals, etc.)
* **Functional Areas**: What design aspects or functional areas are addressed

## **Quality Standards**
* **Content Focus**: Focus on WHAT is documented, not WHAT it does
* **Explicit Module References**: MUST explicitly mention the module name in every summary
* **Indexing Language**: Use "describes", "documents", "covers", "defines"
* **Consistent Style**: Maintain consistent indexing style across sections
* **Complete Output**: Complete JSON output without truncation
* **Valid Format**: MUST return valid JSON format exactly as specified
</SECTION_PROCESSING_GUIDELINES>
"""

# Output-format section of the prompt: the JSON shape the LLM must return.
# Each "section_N" entry carries "index", "title", "high_level_summary" and
# "low_level_summary" -- the same four keys that
# SummarizeAgent._generate_summaries_with_context requires in the response.
# The braces in the JSON example are doubled ("{{" / "}}") because the
# assembled prompt is passed through str.format(task_name=...); they render
# as single braces in the text sent to the LLM.
SUMMARIZE_OUTPUT_FORMAT = """
<SUMMARIZE_OUTPUT_FORMAT>
# JSON Output Format

## **Required JSON Structure**
Return complete JSON with ALL sections processed. For each section, produce TWO complementary summaries:
- High-level summary: semantic overview of what the section documents
- Low-level summary: an exhaustive list of all module names, variable names, and signal names mentioned in the section (no omissions). If a category is not present, explicitly state "None".

```json
{{
  "section_0": {{
    "index": 0,
    "title": "Concise section title",
    "high_level_summary": "Semantic overview of documented content for this section.",
    "low_level_summary": "Modules: <m1,m2,... or None>; Signals: <s1,s2,... or None>; Variables: <v1,v2,... or None>"
  }},
  "section_1": {{
    "index": 1,
    "title": "Concise section title", 
    "high_level_summary": "Semantic overview of documented content for this section.",
    "low_level_summary": "Modules: <m1,m2,... or None>; Signals: <s1,s2,... or None>; Variables: <v1,v2,... or None>"
  }}
}}
```

## **Example Index Descriptions**
High-level examples:
* "This section describes cpu_core module's input/output signal definitions including clk, rst_n, and data_bus signals"
* "This section covers clk_ctrl module's clock generation logic and documents clk_sel, clk_div signals"  
Low-level examples (must be exhaustive):
* "Modules: cpu_core; Signals: clk, rst_n, data_bus; Variables: None"
* "Modules: clk_ctrl; Signals: clk_sel, clk_div; Variables: None"

## **Critical Module Reference Requirement**
The high-level summary MUST explicitly mention the module name in the first sentence:
- For sub-module interfaces: "This section covers [sub_module_name] module's [interface_type]..."
- For main module interfaces: "This section covers [main_module_name] module's [interface_type]..."
- The module name should be clearly identified in the first sentence of each high-level summary
</SUMMARIZE_OUTPUT_FORMAT>
"""

# Complete prompt template, assembled with the + operator in the order:
# shared task context, agent role, section-processing guidelines, output format.
# The result is a str.format() template: SummarizeAgent calls
# .format(task_name=...) on it before appending the sections to index, so
# every component must keep literal braces doubled.
# NOTE(review): TASK_CONTEXT is imported from .common_prompts and is presumably
# where the {task_name} placeholder lives -- confirm against that module.
SUMMARIZE_AGENT_COMPLETE_PROMPT = (
    TASK_CONTEXT +
    SUMMARIZE_AGENT_BASE_PROMPT +
    SUMMARIZE_SECTION_PROCESSING +
    SUMMARIZE_OUTPUT_FORMAT
)

class SummarizeAgent(BaseAgent):
    """Agent for preprocessing and indexing design documents"""
    
    def __init__(self, llm_client, logger=None):
        """Initialise the base agent with the fixed agent name "SummarizeAgent".

        Args:
            llm_client: Client object forwarded unchanged to BaseAgent.
            logger: Optional logger forwarded unchanged to BaseAgent; how a
                None value is handled is decided by BaseAgent.
        """
        agent_name = "SummarizeAgent"
        super().__init__(llm_client, agent_name, logger)
    
    def run(self, summarize_input: Dict[str, Any]) -> Dict[str, Any]:
        """
        Split a design document into sections and index every section.

        Args:
            summarize_input: Mapping with two required keys:
                "document":  the full design document text
                "task_name": value substituted into the prompt template

        Returns:
            {
                "section_dict": {"section_<i>": section content},
                "summary_dict": {section_id: summary entry parsed from the LLM response}
            }

        Raises:
            KeyError: If "document" or "task_name" is missing from the input.
            ValueError: Propagated from summary generation when the LLM
                response cannot be parsed or fails validation.
        """
        doc_text = summarize_input["document"]
        self.logger.info(f"Processing document of length: {len(doc_text)} characters")

        # Cut the document into titled sections.
        parts = self._split_document_advanced(doc_text)
        self.logger.info(f"Found {len(parts)} sections in document")

        # Truncated titles are enough to debug the split.
        for pos, part in enumerate(parts):
            self.logger.debug(f"Section {pos}: {part['title'][:50]}...")

        # One LLM call indexes all sections at once.
        indexed = self._generate_summaries_with_context(
            parts, doc_text, summarize_input["task_name"]
        )

        # Section IDs are positional, matching the IDs sent to the LLM.
        return {
            "section_dict": {
                f"section_{pos}": part["content"] for pos, part in enumerate(parts)
            },
            "summary_dict": indexed["sections"],
        }
    
    def _split_document_advanced(self, document: str) -> List[Dict[str, Any]]:
        """
        Minimal logic:
        1. Split by # headings
        2. If only 1 block, split by **xxx** headings
        3. Drop blocks that become empty after removing images
        4. Merge empty headings forward into the next non-empty block
        """
        lines = document.splitlines()

        # Generic slicer: input regex, return [(title, lines)].
        def _slice(pattern: str) -> List[tuple]:
            chunks, cur_title, cur = [], None, []
            for ln in lines:
                m = re.match(pattern, ln.strip())
                if m:  # Heading matched.
                    if cur_title is not None or cur:   # Save previous chunk.
                        chunks.append((cur_title, cur))
                    cur_title, cur = m.group(1).strip(), []
                else:
                    cur.append(ln)
            # Tail chunk.
            if cur_title is not None or cur:
                chunks.append((cur_title, cur))
            return chunks

        # Step 1: split by # headings.
        blocks = _slice(r'^#{1,3}\s+(.+)$')
        if len(blocks) <= 1:           # Step 2
            blocks = _slice(r'^\*\*(.+)\*\*\s*$')

        # Step 3: drop blocks empty after image removal.
        cleaned = []
        for title, body in blocks:
            no_img = [re.sub(r'!\[.*?\]\(.*?\)', '', ln, flags=re.DOTALL) for ln in body]
            visible = '\n'.join(no_img).strip()
            if visible.strip():                  # Keep only visible content.
                cleaned.append((title, visible.strip()))
            # Otherwise drop the entire block (title included).

        # Step 4: merge empty headings forward.
        merged, pending = [], ''
        for title, content in cleaned:
            if title is None:
                # First block without a title.
                merged.append({'title': pending if pending else 'Main Document', 'content': content})
                pending = ''
                continue
            if pending:                  # Merge pending heading into current.
                title = f"{pending}\n{title}"
            merged.append({'title': title, 'content': content})
            pending = ''                 # Consumed.
        # If pending remains at the end, keep as a separate empty block.
        if pending:
            merged.append({'title': pending, 'content': ''})

        # Filter out tiny fragments (<=10 chars) and add indices.
        sections = [{'title': s['title'], 'content': s['content'], 'index': i}
                    for i, s in enumerate(merged) if len(s['content']) > 10]

        # Fallback.
        if not sections:
            sections = [{'title': 'Main Document', 'content': document.strip(), 'index': 0}]

        self.logger.info(f"Final section count: {len(sections)}")
        return sections
    
    def _parse_json_response(self, response: str) -> Dict[str, Any]:
        """Parse JSON response with repair attempts, fail fast if cannot parse"""
        self.logger.debug(f"Parsing JSON response: {response[:200]}...")
        
        # Only try to extract JSON from markdown code blocks
        json_match = re.search(r'```json\s*(\{.*?\})\s*```', response, re.DOTALL)
        if not json_match:
            raise ValueError(f"No ```json ``` block found in LLM response")
        
        json_str = json_match.group(1)
        
        # Try to parse JSON directly
        try:
            return json.loads(json_str)
        except json.JSONDecodeError as e:
            self.logger.warning(f"Direct JSON parsing failed: {e}, attempting repair")
            
            # Try jsonrepair
            try:
                repaired_json = repair_json(json_str)
                return json.loads(repaired_json)
            except Exception as repair_error:
                raise ValueError(f"JSON repair failed: {repair_error}, original error: {e}")
    
    def _generate_summaries_with_context(self, sections: List[Dict[str, Any]], full_document: str, task_name: str) -> Dict[str, Any]:
        """Generate index descriptions for all sections with a single LLM call.

        Args:
            sections: Section dicts, each providing "title" and "content".
                Section IDs are positional: the i-th entry becomes "section_<i>".
            full_document: Full document text. Accepted for interface
                compatibility; it is not read by this method.
            task_name: Value substituted into the prompt template.

        Returns:
            {
                "sections": {section_id: summary entry},
                "total_sections": number of summary entries,
                "processing_notes": fixed descriptive string
            }

        Raises:
            ValueError: If the response cannot be parsed, is not a JSON
                object, has a different number of sections, has section IDs
                that differ from the ones requested, or has an entry that is
                missing a required field.
        """

        # Build indexed section JSON using section_X format as keys
        sections_json = {
            f"section_{idx}": {
                "index": idx,
                "title": sec["title"],
                "content": sec["content"]
            }
            for idx, sec in enumerate(sections)
        }

        prompt = SUMMARIZE_AGENT_COMPLETE_PROMPT.format(task_name=task_name) + f"""
        
Below is the **ALL sections** you need to create index descriptions for. They are split by the title of the section in a whole document.

Sections to index:
{json.dumps(sections_json, ensure_ascii=False, indent=2)}

CRITICAL REQUIREMENT: Your output JSON MUST contain EXACTLY the same section IDs as provided above. Do NOT create additional sections or skip any sections. The section IDs in your response must match exactly with the "Sections to index" above.

Now create **index descriptions for each section**

IMPORTANT: 
- You MUST include ALL sections from section_0 to section_{len(sections) - 1}
- Do NOT add extra sections beyond what is provided in "Sections to index"
- Do NOT skip any sections
- The section IDs must match exactly: {', '.join(sections_json.keys())}

"""

        response = self.llm_complete(prompt)
        self.logger.debug(f"Single-call raw response: {response[:500]}...")

        summaries = self._parse_json_response(response)
        if not isinstance(summaries, dict):
            raise ValueError("LLM did not return a JSON object")

        expected_ids = set(sections_json)
        returned_ids = set(summaries)

        # Validate length; diagnostics go to the logger rather than stdout.
        if len(summaries) != len(sections):
            self.logger.warning(
                f"Section count mismatch: expected IDs {sorted(expected_ids)}, "
                f"got IDs {sorted(returned_ids)}"
            )
            self.logger.debug(f"Sections sent: {sections}")
            self.logger.debug(f"Summaries received: {summaries}")
            raise ValueError(
                f"Expected {len(sections)} sections, got {len(summaries)}"
            )

        # Validate IDs: an equal count is not enough, because the caller
        # pairs summaries with section contents by section ID.
        if returned_ids != expected_ids:
            missing = sorted(expected_ids - returned_ids)
            unexpected = sorted(returned_ids - expected_ids)
            raise ValueError(
                f"Section IDs do not match: missing {missing}, unexpected {unexpected}"
            )

        # Validate fields; a non-dict entry is reported as missing keys
        # instead of surfacing as a TypeError from the subset check.
        required = {"index", "title", "high_level_summary", "low_level_summary"}
        for section_data in summaries.values():
            if not isinstance(section_data, dict) or not required.issubset(section_data):
                raise ValueError(f"Missing keys in summary: {section_data}")

        return {
            "sections": summaries,
            "total_sections": len(summaries),
            "processing_notes": "All sections indexed in a single LLM call"
        }