#!/usr/bin/env python3
"""
Enhanced Web Search Result Processor - Improves handling of empty results and invalid searches
"""

import re
import logging
from typing import Dict, List, Any, Optional
from dataclasses import dataclass

@dataclass
class SearchResultAnalysis:
    """Outcome of a heuristic quality/authenticity check on search results.

    Attributes:
        has_real_content: Whether the results were judged to contain usable
            content.
        is_simulation: Whether the results look simulated (placeholder text)
            rather than real.
        quality_score: Overall quality, from 0.0 (worst) to 1.0 (best).
        issues: Human-readable descriptions of each problem detected.
        suggested_actions: Human-readable follow-up steps for the caller.
    """
    has_real_content: bool
    is_simulation: bool
    quality_score: float  # 0-1 score
    issues: List[str]
    suggested_actions: List[str]

class EnhancedSearchProcessor:
    """Heuristic quality checks and retry strategies for web search results.

    Results are judged by regex-matching their ``str()`` form against two
    pattern lists created in ``__init__`` (``simulation_indicators`` and
    ``low_quality_indicators``). Both are plain instance attributes, so they
    can be extended after construction.
    """

    # Number of distinct simulation indicators that must match before the
    # results are classified as simulated.
    _SIMULATION_MATCH_THRESHOLD = 2
    # Deductions applied to a starting quality score of 1.0.
    _SIMULATION_PENALTY = 0.6
    _LOW_QUALITY_PENALTY = 0.3  # applied once per matched low-quality indicator
    # Highest retry tier; larger failure counts reuse it.
    _MAX_STRATEGY_TIER = 2

    def __init__(self):
        self.logger = logging.getLogger(__name__)

        # Regex patterns for boilerplate text that marks simulated results
        self.simulation_indicators = [
            r'Wikipedia article with comprehensive information',
            r'Authoritative encyclopedia entry with verified facts',
            r'Detailed biographical information including early life',
            r'Complete filmography, career information',
            r'Historical information, timeline, and contextual background',
            r'raw_content=None',
            r'simulation_confidence=None',
        ]

        # Regex patterns for error/empty markers that indicate low quality
        self.low_quality_indicators = [
            r'No results found',
            r'Search returned empty',
            r'Access denied',
            r'Page not found',
            r'Error 404',
            r'results=\[\]',
        ]

    def analyze_search_results(self, search_results: Any) -> SearchResultAnalysis:
        """Analyze quality and authenticity of search results.

        Args:
            search_results: Any object; it is inspected through ``str()``.
                A falsy value (``None``, ``""``, ``[]``, ...) is treated as
                an empty result set.

        Returns:
            A ``SearchResultAnalysis``. The score starts at 1.0, loses 0.6
            when at least two simulation indicators match and 0.3 per
            matched low-quality indicator, and is clamped at 0.0.
            ``has_real_content`` is true when the final score exceeds 0.5.
        """
        if not search_results:
            return SearchResultAnalysis(
                has_real_content=False,
                is_simulation=True,
                quality_score=0.0,
                issues=["Empty search results"],
                suggested_actions=["Retry with different search terms", "Use alternative search method"]
            )

        # All checks run against the textual representation of the results
        results_str = str(search_results)

        # A single match may be coincidental; several matches suggest simulation
        simulation_hits = sum(
            1
            for indicator in self.simulation_indicators
            if re.search(indicator, results_str, re.IGNORECASE)
        )
        is_simulation = simulation_hits >= self._SIMULATION_MATCH_THRESHOLD

        quality_issues = [
            f"Found low quality indicator: {indicator}"
            for indicator in self.low_quality_indicators
            if re.search(indicator, results_str, re.IGNORECASE)
        ]

        quality_score = 1.0
        if is_simulation:
            quality_score -= self._SIMULATION_PENALTY
        if quality_issues:
            quality_score -= self._LOW_QUALITY_PENALTY * len(quality_issues)
        quality_score = max(0.0, quality_score)

        suggested_actions = []
        if is_simulation:
            suggested_actions.extend([
                "Search results appear to be simulated",
                "Consider using alternative information sources",
                "Try more specific search terms"
            ])
        if quality_issues:
            suggested_actions.extend([
                "Search quality is low",
                "Refine search query",
                "Try different search strategy"
            ])

        return SearchResultAnalysis(
            has_real_content=quality_score > 0.5,
            is_simulation=is_simulation,
            quality_score=quality_score,
            issues=quality_issues + (["Likely simulated results"] if is_simulation else []),
            suggested_actions=suggested_actions
        )

    def enhance_search_strategy(self, original_query: str, failed_attempts: int = 0) -> Dict[str, Any]:
        """Pick a retry strategy based on the number of failed attempts.

        Args:
            original_query: The query that was searched.
            failed_attempts: How many attempts have failed so far. Values
                below 0 are treated as 0 and values above 2 as 2.

        Returns:
            A dict with the keys ``"query_modifications"``,
            ``"search_methods"`` and ``"filters"``, each a list of strings.
        """
        # Clamp on both sides so a negative count cannot fall outside the tiers
        tier = max(0, min(failed_attempts, self._MAX_STRATEGY_TIER))

        # Only the selected tier is built, so unused rewrites are never computed
        if tier == 0:
            return {
                "query_modifications": [
                    f'"{original_query}" exact match',
                    f"{original_query} -wikipedia -britannica",  # Exclude common simulation sources
                ],
                "search_methods": ["precise", "academic"],
                "filters": ["verified_sources", "recent_content"]
            }

        if tier == 1:
            without_question_words = re.sub(
                r'\b(who|what|when|where|how)\b', '', original_query, flags=re.IGNORECASE
            )
            return {
                "query_modifications": [
                    " ".join(original_query.split()[:5]),  # Use first 5 words
                    # split/join also collapses the gap a removed word leaves behind
                    " ".join(without_question_words.split()),
                ],
                "search_methods": ["broad", "news"],
                "filters": ["primary_sources", "official_sites"]
            }

        return {
            "query_modifications": [
                self._extract_key_entities(original_query),
                self._simplify_query(original_query)
            ],
            "search_methods": ["entity_based", "topic_search"],
            "filters": ["authoritative_only"]
        }

    def _extract_key_entities(self, query: str) -> str:
        """Extract up to three key entities from the query.

        Entities are found with simple regexes (capitalized word pairs,
        four-digit numbers, runs of capitalized words); this is keyword
        matching, not real NER.

        Args:
            query: The query text to scan.

        Returns:
            The first three distinct matches joined by spaces, or the first
            three whitespace-separated words of ``query`` when nothing
            matches. Always a string.
        """
        key_patterns = [
            r'\b[A-Z][a-z]+ [A-Z][a-z]+\b',  # Person names
            r'\b\d{4}\b',  # Years
            r'\b[A-Z][a-z]*(?:\s+[A-Z][a-z]*)*\b',  # Proper nouns
        ]

        matches = []
        for pattern in key_patterns:
            matches.extend(re.findall(pattern, query))

        # The name and proper-noun patterns overlap; drop repeats, keep first-seen order
        entities = list(dict.fromkeys(matches))

        if entities:
            return " ".join(entities[:3])
        # Join the fallback words so the return type is a string here too
        return " ".join(query.split()[:3])

    def _simplify_query(self, query: str) -> str:
        """Simplify a query by stripping punctuation and connecting words.

        Args:
            query: The query text to simplify.

        Returns:
            ``query`` without quotes, parentheses, braces and square
            brackets, without common connecting words (matched
            case-insensitively as whole words), and with whitespace
            collapsed to single spaces.
        """
        # Remove quotes and bracket characters
        simplified = re.sub(r'["\'(){}\[\]]', '', query)
        # Replace connecting words with a space so neighbouring words stay separated
        simplified = re.sub(r'\b(and|or|but|the|a|an|in|on|at|for|with|by)\b', ' ', simplified, flags=re.IGNORECASE)
        # Clean up extra spaces
        return re.sub(r'\s+', ' ', simplified).strip()

    def create_fallback_response(self, query: str) -> Dict[str, Any]:
        """Create the fallback response used when search fails.

        Args:
            query: The query that could not be answered; it is echoed in
                the response message.

        Returns:
            A dict with ``"status"`` set to ``"fallback"``, a ``"message"``
            naming the query, and fixed ``"suggestions"`` and
            ``"alternative_approaches"`` lists.
        """
        return {
            "status": "fallback",
            "message": f"Unable to find reliable search results for: {query}",
            "suggestions": [
                "The query may be too specific or complex",
                "Try breaking it into smaller, simpler questions",
                "Consider alternative information sources",
                "Verify if all entities in the query are correct"
            ],
            "alternative_approaches": [
                "Use domain-specific databases if available",
                "Consult primary sources directly",
                "Break complex queries into multiple simple searches"
            ]
        }

# Global search processor instance, constructed once when this module is imported
search_processor = EnhancedSearchProcessor()