from typing import List, Dict, Any, Optional, Tuple
import os
import requests
import json
import time
from utils import openai_service

def read_page(url: str) -> str:
    """Read and return the content of a webpage using Jina reader."""
    jina_url = f'https://r.jina.ai/{url}'
    headers = {
        'Authorization': f'Bearer {os.getenv("JINA_API_KEY")}',
        'X-Engine': 'browser',
        'X-Return-Format': 'markdown',
        "X-Remove-Selector": "header, .class, #id",
        "X-Retain-Images": "none",
        'X-Timeout': '10',
        'X-Token-Budget': '200000',
    }

    try:
        response = requests.get(jina_url, headers=headers, timeout=15)
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        return f"Error reading page: {str(e)}"

def web_search_google_serper(
    query: str, 
    filter_year: Optional[int] = None, 
    serp_num: int = 3, 
    max_retries: int = 3
) -> Tuple[List[Dict[str, Any]], str]:
    """Perform web search using Google Serper API."""
    if not query.strip():
        return [], "Query is empty. Please provide a valid search query."
    
    url = "https://google.serper.dev/search"
    payload = json.dumps({
        "q": query,
        "location": "United States",
        "num": serp_num
    })
    headers = {
        'X-API-KEY': os.getenv("SERPER_API_KEY"),
        'Content-Type': 'application/json'
    }

    for attempt in range(max_retries):
        try:
            response = requests.post(url, headers=headers, data=payload, timeout=10)
            response.raise_for_status()
            results = response.json()

            if "organic" not in results or not results["organic"]:
                year_filter_msg = f" with year filter={filter_year}" if filter_year else ""
                return [], f"No results found for '{query}'{year_filter_msg}. Try a more general query."
            
            search_results = []
            for idx, page in enumerate(results["organic"], 1):
                search_results.append({
                    "idx": idx,
                    "title": page.get("title", "No title"),
                    "date": f"\nDate published: {page['date']}" if "date" in page else "",
                    "snippet": f"\n{page.get('snippet', 'No snippet')}",
                    "source": f"\nSource: {page.get('source', 'Unknown source')}",
                    "link": page.get('link', '#')
                })
            
            return search_results, ""
        
        except (requests.RequestException, json.JSONDecodeError) as e:
            if attempt == max_retries - 1:
                return [], f"Search failed after {max_retries} attempts: {str(e)}"
            time.sleep(1)
    
    return [], "Unexpected error in web search"

def search_tool(query: str) -> str:
    """Execute web search and return formatted results."""
    search_results, error_msg = web_search_google_serper(query, serp_num=5)
    
    if error_msg:
        return error_msg
    
    formatted_results = []
    for result in search_results:
        formatted_results.append(
            f"{result['idx']}. [{result['title']}]({result['link']})"
            f"{result['date']}{result['source']}\n"
            f"   {result['snippet'].strip()}"
        )
    
    return "\n\n".join(formatted_results) if formatted_results else "No search results found"


def truncate_text(text: str, max_length: int = 60000) -> str:
    return text if len(text) <= max_length else text[:max_length] + "...(truncated)"

def get_summary_prompt(query: str, url: str, content: str) -> str:
    return (
        f"Task: Extract all content from the web page that matches the search query.\n"
        f"Search Query: {query}\n\n"
        f"Web Page Content [url:{url}]:\n{content}\n\n"
        "Instructions:\n"
        "- Summarize all relevant content for the query (text, tables, lists) into concise points\n"
        "- If no relevant information exists, please straightly output 'No relevant information'\n"
        "- Keep the summary under 500 words"
    )

def retry_predict(prompt: str, max_retries: int = 3) -> str:
    messages = [{"role": "user", "content": prompt}]
    for attempt in range(max_retries):
        try:
            response = openai_service(messages, os.getenv("OPENAI_API_KEY"), os.getenv("OPENAI_BASE_URL"), os.getenv("SUMMARY_MODEL"))
            if hasattr(response, 'content'):
                content = response.content
                return content.strip() if isinstance(content, str) else str(content)
            return str(response)
        except Exception as e:
            if attempt == max_retries - 1:
                return f"Content extraction failed: {str(e)}"
            wait_time = 2 ** attempt
            time.sleep(wait_time)
    return "Content extraction failed after multiple attempts"

def crawl_tool(url: str, query: str) -> str:

    if not url.startswith(('http://', 'https://')):
        return "Invalid URL format. Must start with http:// or https://"
    
    page_content = read_page(url)
    if page_content.startswith("Error"):
        return page_content
    
    truncated_content = truncate_text(page_content)
    prompt = get_summary_prompt(query, url, truncated_content)
    
    return retry_predict(prompt)
