"""
Enhanced web fetching tool for Qwen Code Python implementation with better content processing and private network support.
"""
import ipaddress
import re
from typing import Any, Dict, List, Optional
from urllib.parse import urlparse

import requests

from .base_tool import BaseTool
from .tool_types import ToolKind


# Module-wide limits
URL_FETCH_TIMEOUT_MS = 10_000  # HTTP request timeout, expressed in milliseconds
MAX_CONTENT_LENGTH = 100_000  # upper bound on the number of text characters kept


def is_private_ip(url: str) -> bool:
    """Return True if the URL's host is localhost or a non-public IP literal.

    The check is purely syntactic: no DNS resolution is performed, so a
    regular domain name that happens to resolve to a private address is
    reported as not private.

    Args:
        url: The URL whose host component should be classified.

    Returns:
        True when the host is ``localhost`` (including the ``localhost.``
        FQDN form and ``*.localhost`` names) or an IP literal in a private,
        loopback or link-local range; False otherwise, including when the
        URL cannot be parsed or has no host.
    """
    try:
        hostname = urlparse(url).hostname
    except (ValueError, TypeError, AttributeError):
        # Malformed URL (e.g. unbalanced IPv6 brackets) or a non-string input.
        return False

    if not hostname or not isinstance(hostname, str):
        return False

    # A trailing dot marks a fully-qualified name ("localhost.") and must not
    # change the classification.
    host = hostname.rstrip('.').lower()
    if not host:
        return False

    if host == 'localhost' or host.endswith('.localhost'):
        return True

    try:
        ip = ipaddress.ip_address(host)
    except ValueError:
        # Not an IP literal, so it's a domain name.
        return False

    # Classify IPv4-mapped IPv6 addresses (::ffff:a.b.c.d) by the embedded
    # IPv4 address so the answer does not depend on the interpreter version.
    mapped = getattr(ip, 'ipv4_mapped', None)
    if mapped is not None:
        ip = mapped

    return ip.is_private or ip.is_loopback or ip.is_link_local


def convert_html_to_text(html: str) -> str:
    """Convert an HTML document to readable plain text.

    Uses the optional ``html2text`` package when it is installed (links and
    images ignored, no line wrapping). Otherwise falls back to a regex-based
    conversion that drops script/style blocks and comments, turns block-level
    tags into line breaks, strips the remaining tags and decodes HTML
    entities.

    Args:
        html: The HTML markup to convert.

    Returns:
        The text content of the document. In the fallback path each
        block-level element ends up on its own line, runs of whitespace
        within a line are collapsed to a single space, and empty lines are
        removed.
    """
    try:
        from html2text import HTML2Text
    except ImportError:
        HTML2Text = None

    if HTML2Text is not None:
        h = HTML2Text()
        h.ignore_links = True
        h.ignore_images = True
        h.body_width = 0  # Don't wrap lines
        return h.handle(html)

    # Imported by name because the ``html`` parameter shadows the module.
    from html import unescape

    # Remove script/style elements (including their content) and comments.
    text = re.sub(
        r'<(script|style)\b[^>]*>.*?</\1\s*>', '', html,
        flags=re.DOTALL | re.IGNORECASE,
    )
    text = re.sub(r'<!--.*?-->', '', text, flags=re.DOTALL)

    # Block-level tags (opening or closing) become line breaks; the word
    # boundary keeps e.g. <pre> or <param> from being matched as <p>.
    text = re.sub(
        r'</?(?:br|p|div|h[1-6]|li|tr)\b[^>]*>', '\n', text,
        flags=re.IGNORECASE,
    )
    text = re.sub(r'<[^>]+>', ' ', text)  # Remove all other tags

    # Decode entities only after the tags are gone, so an escaped "&lt;b&gt;"
    # is kept as literal text instead of being stripped as markup.
    text = unescape(text)

    # Collapse whitespace inside each line but keep the line structure.
    lines = (' '.join(line.split()) for line in text.split('\n'))
    return '\n'.join(line for line in lines if line)


class WebFetchTool(BaseTool):
    """Tool that downloads a URL and returns its text content for the model.

    HTML responses are converted to plain text with ``convert_html_to_text``;
    any other content type is passed through as-is. The text is truncated to
    ``MAX_CONTENT_LENGTH`` characters and wrapped, together with the user's
    prompt, into the ``llmContent`` of the result.
    """

    # Identifier passed to BaseTool as the tool's name.
    Name = "web_fetch"

    def __init__(self):
        """Register the tool's name, description, parameter schema and kind."""
        super().__init__(
            self.Name,
            """Fetches content from a specified URL and processes it using an AI model
- Takes a URL and a prompt as input
- Fetches the URL content, converts HTML to markdown
- Processes the content with the prompt using a small, fast model
- Returns the model's response about the content
- Use this tool when you need to retrieve and analyze web content

Usage notes:
  - IMPORTANT: If an MCP-provided web fetch tool is available, prefer using that tool instead of this one, as it may have fewer restrictions. All MCP-provided tools start with "mcp__".
  - The URL must be a fully-formed valid URL
  - The prompt should describe what information you want to extract from the page
  - This tool is read-only and does not modify any files
  - Results may be summarized if the content is very large
  - Supports both public and private/localhost URLs using direct fetch""",
            {
                "type": "object",
                "properties": {
                    "url": {
                        "type": "string",
                        "description": "The URL to fetch content from"
                    },
                    "prompt": {
                        "type": "string",
                        "description": "The prompt to run on the fetched content"
                    }
                },
                "required": ["url", "prompt"]
            },
            ToolKind.FETCH
        )

    @staticmethod
    def _error_result(message: str) -> Dict[str, Any]:
        """Build the result dict used for every failure path."""
        text = f"Error: {message}"
        return {"llmContent": text, "returnDisplay": text}

    @staticmethod
    def _to_raw_github_url(url: str) -> str:
        """Rewrite a github.com ``/<owner>/<repo>/blob/...`` URL to its raw form.

        Only URLs whose host is exactly ``github.com`` or ``www.github.com``
        and whose third path segment is ``blob`` are rewritten; every other
        URL is returned unchanged.
        """
        try:
            parsed = urlparse(url)
        except ValueError:
            return url

        host = (parsed.hostname or "").lower()
        if host not in ("github.com", "www.github.com"):
            return url

        # Expected shape: ['', owner, repo, 'blob', ref, ...path]
        segments = parsed.path.split("/")
        if len(segments) < 5 or segments[3] != "blob":
            return url

        del segments[3]
        return parsed._replace(
            netloc="raw.githubusercontent.com",
            path="/".join(segments),
        ).geturl()

    def validate_params(self, params: Dict[str, Any]) -> Optional[str]:
        """Validate the tool parameters.

        Args:
            params: Tool arguments; expected to carry string ``url`` and
                ``prompt`` entries.

        Returns:
            An error message describing the first problem found, or None
            when the parameters are acceptable.
        """
        error = super().validate_params(params)
        if error:
            return error

        url = params.get("url")
        prompt = params.get("prompt")

        # Non-string values are reported like empty ones instead of raising.
        if not isinstance(url, str) or not url.strip():
            return "The 'url' parameter cannot be empty."

        if not url.startswith(("http://", "https://")):
            return "The 'url' must be a valid URL starting with http:// or https://."

        if not isinstance(prompt, str) or not prompt.strip():
            return "The 'prompt' parameter cannot be empty."

        return None

    def get_description(self, params: Dict[str, Any]) -> str:
        """Get a description of the web fetch operation.

        The prompt is shortened to its first 100 characters followed by
        ``...`` when it is longer than that.
        """
        url = params.get("url", "")
        prompt = params.get("prompt", "")
        display_prompt = prompt[:100] + "..." if len(prompt) > 100 else prompt
        return f"Fetching content from {url} and processing with prompt: \"{display_prompt}\""

    def execute(self, params: Dict[str, Any]) -> Dict[str, Any]:
        """Fetch the URL and return its text content wrapped with the prompt.

        Args:
            params: Tool arguments with ``url`` and ``prompt``.

        Returns:
            A dict with ``llmContent`` and ``returnDisplay`` keys. On success
            ``llmContent`` holds the user's prompt followed by the fetched
            (and possibly truncated) text; on failure both keys hold the
            same ``Error: ...`` message.
        """
        error = self.validate_params(params)
        if error:
            return self._error_result(error)

        prompt = params["prompt"]
        # Convert GitHub blob URL to raw URL so the file itself is fetched
        # rather than the HTML page around it.
        url = self._to_raw_github_url(params["url"])

        try:
            # Public and private/localhost URLs are both fetched directly.
            response = requests.get(url, timeout=URL_FETCH_TIMEOUT_MS / 1000)
            response.raise_for_status()

            # Process content based on content type
            content_type = response.headers.get('content-type', '').lower()
            if 'text/html' in content_type:
                text_content = convert_html_to_text(response.text)
            else:
                text_content = response.text

            # Limit content length
            text_content = text_content[:MAX_CONTENT_LENGTH]

            # No secondary model call is made here; the fetched text is
            # handed back together with the prompt so the caller can use it.
            llm_content = f"""The user requested the following: "{prompt}".

I have fetched the content from {url}. Please use the following content to answer the user's request.

---
{text_content}
---"""

            return {
                "llmContent": llm_content,
                "returnDisplay": f"Content from {url} processed successfully."
            }

        except requests.exceptions.RequestException as e:
            return self._error_result(f"Error during fetch for {url}: {str(e)}")
        except Exception as e:
            # Tool boundary: report any other failure as a result, not a crash.
            return self._error_result(
                f"Unexpected error during fetch for {url}: {str(e)}"
            )