import os
import re
import json
import asyncio
from typing import Dict, Any, Union, Optional, List


try:
    from pypdf import PdfReader
    from pypdf.errors import PdfReadError
except ImportError:
    # pypdf is optional at import time: announce that it is missing, then define
    # stand-ins so that this module can still be imported and its names resolve.
    print("Warning: 'pypdf' library not found. Please install it with 'pip install pypdf' for PdfParser to function.")

    class PdfReadError(Exception):
        """Stand-in for pypdf.errors.PdfReadError, defined only when pypdf is missing."""

    class PdfReader:
        """Stand-in for pypdf.PdfReader; constructing it always raises ImportError."""

        def __init__(self, *_args, **_kwargs):
            raise ImportError("pypdf is not installed. Cannot use PdfParser.")




class PdfParser(Tool_node):
    """
    A tool for parsing PDF documents and searching for specific content within them.

    Text is extracted page by page and scanned line by line for a
    case-insensitive, literal occurrence of the query string. Every matching
    line produces one result holding the 1-indexed page number and a snippet.
    """
    # NOTE(review): Tool_node is neither defined nor imported in the visible part
    # of this file -- confirm it is brought into scope elsewhere.

    # Lines longer than this are reduced to a window around the match.
    _MAX_SNIPPET_LENGTH = 200
    # Characters of context kept on each side of the match in a reduced snippet.
    _SNIPPET_CONTEXT = 100

    def __init__(self):
        super().__init__(
            name="PdfParser",
            description="Parses a PDF document and searches for content. Requires 'file_path' and 'query' parameters, "
                        "e.g., 'file_path=\"/path/to/document.pdf\", query=\"keyword to find\"'."
        )

    @staticmethod
    def _extract_param(name: str, text: str) -> Optional[str]:
        """Return the quoted value of ``name=<quote>value<quote>`` in *text*, or None."""
        # The backreference requires the closing quote to be the same character
        # as the opening one, so the value may contain the *other* quote
        # character (e.g. query="don't"). The value must be non-empty.
        pattern = re.escape(name) + r"=(['\"])((?:(?!\1).)+)\1"
        found = re.search(pattern, text, re.DOTALL)
        return found.group(2) if found else None

    @classmethod
    def _build_snippet(cls, line: str, match_start: int, match_end: int) -> str:
        """Return *line* stripped, reduced to a window around the match if it is too long.

        ``match_start`` / ``match_end`` are offsets into the unstripped *line*.
        An ellipsis is added only on a side where text was actually cut off.
        """
        stripped = line.strip()
        if len(stripped) <= cls._MAX_SNIPPET_LENGTH:
            return stripped

        # Translate the match offsets from the raw line to the stripped line.
        leading = len(line) - len(line.lstrip())
        start = max(0, match_start - leading - cls._SNIPPET_CONTEXT)
        end = min(len(stripped), match_end - leading + cls._SNIPPET_CONTEXT)
        end = max(start, end)

        prefix = "..." if start > 0 else ""
        suffix = "..." if end < len(stripped) else ""
        return prefix + stripped[start:end].strip() + suffix

    def _search_pdf(self, file_path: str, query: str) -> List[Dict[str, Any]]:
        """Synchronously scan every page of the PDF and collect one result per matching line."""
        # The query is matched literally (escaped) and case-insensitively;
        # compiled once because it is reused for every line of every page.
        pattern = re.compile(re.escape(query), re.IGNORECASE)
        found_matches: List[Dict[str, Any]] = []

        reader = PdfReader(file_path)
        for page_number, page in enumerate(reader.pages, start=1):  # 1-indexed pages
            text = page.extract_text()
            if not text:
                continue
            for line in text.split('\n'):
                found = pattern.search(line)
                if found is None:
                    continue
                # One entry per matching line; scanning continues so that later
                # matching lines on the same page are reported as well.
                found_matches.append({
                    "page_number": page_number,
                    "snippet": self._build_snippet(line, found.start(), found.end()),
                    "matched_query": query,  # the exact query searched for
                })
        return found_matches

    async def _execute_tool(self, task: str) -> str:
        """
        Executes the PDF parsing and search operation.
        The `task` string must contain the `file_path` to the PDF and the `query` string to search for.

        Example task: "file_path='my_document.pdf', query='important section'"

        Returns:
            A JSON string representing a list of found matches (each with page number and snippet),
            or an error message string if the operation fails.
        """
        file_path = self._extract_param("file_path", task)
        query = self._extract_param("query", task)

        if not file_path:
            return "Error: 'file_path' parameter is missing. Please specify the path to the PDF document."
        if not query:
            return "Error: 'query' parameter is missing. Please specify the text to search for within the PDF."

        if not os.path.exists(file_path):
            return f"Error: PDF file not found at '{file_path}'."
        if not os.path.isfile(file_path):
            return f"Error: '{file_path}' is not a valid file."
        if not file_path.lower().endswith(".pdf"):
            return f"Error: '{file_path}' is not a PDF file."

        try:
            # PDF parsing is synchronous; run it in the default executor so the
            # event loop is not blocked while the document is processed.
            loop = asyncio.get_running_loop()
            found_matches = await loop.run_in_executor(None, self._search_pdf, file_path, query)
        except asyncio.CancelledError:
            # Never convert task cancellation into an error string.
            raise
        except PdfReadError as e:
            return f"Error reading PDF file '{file_path}': {e}"
        except Exception as e:
            return f"An unexpected error occurred while parsing PDF '{file_path}': {type(e).__name__}: {e}"

        return json.dumps(found_matches, indent=2)

    def _format_result_to_natural_language(self, raw_result: str, task_description: str) -> str:
        """
        Converts the raw JSON string output from _execute_tool into a human-readable string.

        If `raw_result` is not a JSON list (for example, it is one of the error
        message strings returned by _execute_tool), a failure message that
        embeds the raw output is returned instead.
        """
        failure_message = f"Failed to interpret PDF search results for '{task_description}'. Raw output: {raw_result}"
        try:
            parsed_results = json.loads(raw_result)
        except (json.JSONDecodeError, TypeError):
            # Not valid JSON (or not a string at all): most likely an error message.
            return failure_message
        if not isinstance(parsed_results, list):
            # Valid JSON, but not the list of matches this method formats.
            return failure_message

        if not parsed_results:
            # Try to extract the query from task_description for better messaging
            query_str = self._extract_param("query", task_description) or "your query"
            return f"No occurrences of '{query_str}' found in the specified PDF document for task: '{task_description}'."

        output_lines: List[str] = [f"Found {len(parsed_results)} occurrences in the PDF for task: '{task_description}':"]
        for index, match in enumerate(parsed_results, start=1):
            # Tolerate malformed entries by falling back to 'N/A' fields.
            entry = match if isinstance(match, dict) else {}
            output_lines.append(f"\n--- Occurrence {index} ---")
            output_lines.append(f"Page: {entry.get('page_number', 'N/A')}")
            output_lines.append(f"Snippet: \"{entry.get('snippet', 'N/A')}\"")

        return "\n".join(output_lines)