import os
import re
import json
from typing import List, Optional

import requests

from .base import BaseTool, ToolResult, ToolFailure


def _clean_text(text: str) -> str:
    text = re.sub(r"\s+", " ", text or " ").strip()
    return text


class WebSearchSummarize(BaseTool):
    """Combined tool: web search + page-text fetch + LLM summary in one call.

    ``execute`` queries the configured search providers in order until one
    returns results, downloads the text of the first few hits, and asks the
    LLM to write a Chinese-language summary of the collected material.
    """

    name: str = "web_search_summarize"
    description: str = (
        "执行多源搜索并抓取前若干结果正文，并对内容进行中文总结与来源列举；适用于快速获取主题相关知识、最佳实践、概念解释。"
    )
    input: str = "query: 查询; max_results: 搜索结果条数(1-8); fetch_top: 抓取正文条数(1-5); summary_words: 目标总结字数"
    output: str = "{summary, sources: [{title,url,snippet}]}"
    parameters: dict = {
        "type": "object",
        "properties": {
            "query": {"type": "string", "description": "搜索主题或问题"},
            "max_results": {"type": "integer", "minimum": 1, "maximum": 8, "description": "搜索结果数量，默认 5"},
            "fetch_top": {"type": "integer", "minimum": 1, "maximum": 5, "description": "抓取正文的前N条，默认 3"},
            "summary_words": {"type": "integer", "minimum": 100, "maximum": 1200, "description": "目标总结字数上限，默认 400"},
        },
        "required": ["query"],
    }
    notices: list[str] = [
        "若配置 TAVILY_API_KEY / SERPAPI_API_KEY / BING_API_KEY / BAIDU_API_KEY 将提升结果质量。",
        "正文抓取会截断至约 4000 字，以控制上下文长度。",
        "summary_words 为软目标，LLM 可略微增减。",
    ]
    examples: list[str] = [
        "'pandas groupby agg 多指标用法 整理'",
        "'LLM RAG 检索增强 最新实践 总结'",
    ]

    def _normalize_items(
        self,
        items,
        max_results: int,
        title_key: str = "title",
        url_key: str = "url",
        snippet_keys: tuple = ("snippet",),
    ) -> List[dict]:
        """Map provider-specific result dicts onto ``{title, url, snippet}``.

        Args:
            items: Raw list of result dicts from a provider (``None`` is
                treated as an empty list).
            max_results: Maximum number of entries to keep.
            title_key: Key holding the result title in the provider payload.
            url_key: Key holding the result URL in the provider payload.
            snippet_keys: Candidate keys for the snippet text; the first one
                with a non-empty value wins.

        Returns:
            A list of at most ``max_results`` normalised dicts. Missing fields
            become empty strings instead of raising.
        """
        normalized = []
        for item in (items or [])[:max_results]:
            snippet = ""
            for candidate in snippet_keys:
                if item.get(candidate):
                    snippet = item.get(candidate)
                    break
            normalized.append(
                {
                    "title": item.get(title_key) or "",
                    "url": item.get(url_key) or "",
                    "snippet": _clean_text(snippet),
                }
            )
        return normalized

    def _search_tavily(self, query: str, max_results: int) -> Optional[List[dict]]:
        """Query Tavily; return ``None`` when no key is set or the call fails."""
        key = os.getenv("TAVILY_API_KEY")
        if not key:
            return None
        try:
            resp = requests.post(
                "https://api.tavily.com/search",
                json={"api_key": key, "query": query, "max_results": max_results},
                timeout=12,
            )
            data = resp.json()
            return self._normalize_items(
                data.get("results"), max_results, snippet_keys=("content",)
            )
        except Exception:
            # Best-effort provider: any failure lets the caller try the next one.
            return None

    def _search_serpapi(self, query: str, max_results: int) -> Optional[List[dict]]:
        """Query SerpAPI (Google engine); return ``None`` without a key or on failure."""
        key = os.getenv("SERPAPI_API_KEY")
        if not key:
            return None
        try:
            resp = requests.get(
                "https://serpapi.com/search.json",
                params={"engine": "google", "q": query, "api_key": key},
                timeout=12,
            )
            data = resp.json()
            return self._normalize_items(
                data.get("organic_results"), max_results, url_key="link"
            )
        except Exception:
            # Best-effort provider: any failure lets the caller try the next one.
            return None

    def _search_baidu(self, query: str, max_results: int) -> List[dict]:
        """Query the Baidu Qianfan web-search endpoint.

        Returns an empty list when ``BAIDU_API_KEY`` is not set, when the
        request fails or times out, or when the response cannot be parsed, so
        that the caller can fall through to the next provider.
        """
        key = os.getenv("BAIDU_API_KEY", "")
        if not key:
            # Without a key the request would only carry an empty bearer
            # token, so skip the network round trip entirely.
            return []
        url = "https://qianfan.baidubce.com/v2/ai_search/web_search"
        payload = json.dumps({
            "messages": [
                {
                    "role": "user",
                    "content": query
                }
            ],
            "edition": "standard",
            "search_source": "baidu_search_v2",
            "search_recency_filter": "year",
            "resource_type_filter": [
                {
                    "type": "web",
                    "top_k": max_results
                }
            ],
        }, ensure_ascii=False)
        headers = {
            'Content-Type': 'application/json',
            'Authorization': 'Bearer ' + key
        }
        try:
            # The timeout keeps a stalled connection from blocking the tool;
            # the request sits inside the try so network errors are contained.
            response = requests.request(
                "POST", url, headers=headers, data=payload.encode("utf-8"), timeout=12
            )
            response.encoding = "utf-8"
            data = response.json()
            # NOTE(review): the reference text may arrive under `content`
            # rather than `snippet` — confirm against the Qianfan API docs.
            # The fallback is a no-op if that field is absent.
            return self._normalize_items(
                data.get("references"), max_results, snippet_keys=("snippet", "content")
            )
        except Exception:
            # Best-effort provider: any failure lets the caller try the next one.
            return []

    def _search_bing(self, query: str, max_results: int) -> List[dict]:
        """Query the Bing Web Search v7 endpoint.

        Returns an empty list when ``BING_API_KEY`` is not set or when the
        request / response handling fails.
        """
        key = os.getenv("BING_API_KEY")
        if not key:
            return []
        endpoint = "https://api.bing.microsoft.com/v7.0/search"
        headers = {"Ocp-Apim-Subscription-Key": key}
        params = {"q": query, "count": max_results}
        try:
            resp = requests.get(endpoint, headers=headers, params=params, timeout=10)
            data = resp.json()
            # `webPages` can be missing or null (e.g. in an error payload).
            pages = (data.get("webPages") or {}).get("value")
            return self._normalize_items(pages, max_results, title_key="name")
        except Exception:
            # Best-effort provider: any failure yields "no results".
            return []

    def _search_duckduckgo(self, query: str, max_results: int) -> List[dict]:
        """Scrape the DuckDuckGo HTML endpoint (needs no API key).

        Tries BeautifulSoup first and falls back to a simple regex when bs4 is
        unavailable or finds nothing. On a request or parse failure a single
        placeholder entry describing the error is returned instead of raising.

        This method is not part of the provider chain used by ``execute``.
        """
        # The HTML version of DuckDuckGo is simple to parse.
        headers = {"User-Agent": "Mozilla/5.0 (compatible; AutoDS/1.0)"}
        url = "https://html.duckduckgo.com/html/"
        try:
            resp = requests.get(url, params={"q": query}, headers=headers, timeout=12)
            resp.raise_for_status()
            html = resp.text
        except Exception as e:
            return [{"title": "", "url": "", "snippet": f"duckduckgo error: {e}"}]

        try:
            try:
                from bs4 import BeautifulSoup
                soup = BeautifulSoup(html, "html.parser")
                results = []
                for res in soup.select(".result"):
                    a = res.select_one("a.result__a") or res.select_one("a.result__url")
                    if not a:
                        continue
                    href = a.get("href", "")
                    title = _clean_text(a.get_text())
                    snippet_el = res.select_one(".result__snippet")
                    snippet = _clean_text(snippet_el.get_text()) if snippet_el else ""
                    results.append({"title": title, "url": href, "snippet": snippet})
                    if len(results) >= max_results:
                        break
                if results:
                    return results
            except Exception:
                # bs4 missing or parsing failed: fall through to the regex path.
                pass

            # Simple regex fallback.
            links = re.findall(r'<a[^>]+class="result__a"[^>]+href="([^"]+)"[^>]*>(.*?)</a>', html, re.S)
            out = []
            for href, t in links[:max_results]:
                out.append({"title": _clean_text(re.sub("<.*?>", "", t)), "url": href, "snippet": ""})
            return out
        except Exception as e:
            return [{"title": "", "url": "", "snippet": f"parse error: {e}"}]

    def _run_search(self, query: str, max_results: int) -> List[dict]:
        """Return results from the first provider that yields any.

        Providers are tried in the order Baidu, Tavily, SerpAPI, Bing. Each
        one returns an empty/``None`` result when its key is missing or the
        call fails. If none succeeds a single "No results" placeholder entry
        is returned.
        """
        providers = (
            self._search_baidu,
            self._search_tavily,
            self._search_serpapi,
            self._search_bing,
        )
        for provider in providers:
            found = provider(query, max_results)
            if found:
                return found
        return [{"title": "", "url": "", "snippet": "No results"}]

    def _fetch_bodies(self, results: List[dict], fetch_top: int) -> List[str]:
        """Fetch page text for the first ``fetch_top`` results that have a URL.

        Returns one string per attempted URL: either the page text prefixed
        with its URL, or a failure marker carrying the fetch error.
        """
        bodies = []
        for item in results[:fetch_top]:
            url = item.get("url")
            if not url:
                continue
            page = FetchPageText().execute(url=url, max_chars=4000)
            if page.error:
                bodies.append(f"[抓取失败]{url}: {page.error}")
            else:
                # NOTE(review): pages are fetched with max_chars=4000 but only
                # the first 1000 characters are forwarded to the LLM, while the
                # notices advertise ~4000 — confirm which limit is intended.
                bodies.append(f"URL: {url}\nTEXT: {page.output.get('text','')[:1000]}")
        return bodies

    def execute(self, query: str, max_results: int = 5, fetch_top: int = 3, summary_words: int = 400) -> ToolResult:
        """Search, fetch page text and summarise the findings with the LLM.

        Args:
            query: Search topic or question.
            max_results: Number of search results to keep; clamped to 1-8.
            fetch_top: Number of leading results whose pages are fetched;
                clamped to 1-5.
            summary_words: Soft upper bound on the summary length; clamped to
                the 100-1200 range declared in ``parameters``.

        Returns:
            A ``ToolResult`` whose output holds ``summary`` (the LLM text, or
            an error message if the LLM call failed), ``sources`` (the
            normalised search results) and ``fetched_count`` (number of page
            entries handed to the LLM, including failed-fetch markers).
        """
        from llm import LLM  # Imported lazily to avoid a circular import.
        max_results = max(1, min(int(max_results or 5), 8))
        fetch_top = max(1, min(int(fetch_top or 3), 5))
        # Coerce and clamp like the sibling parameters so the prompt always
        # carries a sane integer limit.
        summary_words = max(100, min(int(summary_words or 400), 1200))

        # (1) Search stage.
        results = self._run_search(query, max_results)

        # (2) Fetch page text for the top N results.
        bodies = self._fetch_bodies(results, fetch_top)

        # (3) Build the summary prompt and call the LLM directly (default
        # model configuration) rather than going through another tool loop.
        llm = LLM()
        system_prompt = (
            "你是一个信息整合助手。下面给出若干网页的正文片段与简要搜索摘要，请用中文归纳：\n"
            "1) 主题核心概念/定义\n2) 关键方法或实践要点（列表形式）\n3) 常见坑或注意事项（如有）\n4) 若涉及库/API，给出简要用法示例（无需冗长代码）\n"
            f"限制：总结总字数不超过 {summary_words} 字。保持精炼，不要虚构未出现的指标或结论。\n"
        )
        user_content = json.dumps({
            "query": query,
            "search_results": results[:max_results],
            "pages": bodies,
        }, ensure_ascii=False)
        try:
            resp = llm.ask([
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_content},
            ])
            # OpenAI ChatCompletion-style response. `content` may be None, so
            # fall back to an empty string before stripping below.
            summary = getattr(resp.choices[0].message, "content", "") or ""
        except Exception as e:
            summary = f"总结阶段调用 LLM 失败: {e}"

        return ToolResult(output={
            "summary": summary.strip(),
            "sources": results[:max_results],
            "fetched_count": len(bodies),
        })


class FetchPageText(BaseTool):
    """Download a URL and return its whitespace-normalised visible text.

    Retained for internal use by the combined search tool; no longer exposed
    to the model directly.
    """

    name: str = "fetch_page_text_internal"
    description: str = "内部使用：抓取指定 URL 的页面文本内容并清洗。"
    input: str = "url: 目标链接; max_chars: 最大返回字符数"
    output: str = "{url, text, length}"
    parameters: dict = {
        "type": "object",
        "properties": {
            "url": {"type": "string"},
            "max_chars": {"type": "integer"},
        },
        "required": ["url"],
    }

    def execute(self, url: str, max_chars: Optional[int] = 4000) -> ToolResult:
        """Fetch ``url`` and extract its text content.

        Args:
            url: Page to download.
            max_chars: Maximum number of characters to return. ``None``, ``0``
                and negative values fall back to 4000.

        Returns:
            A ``ToolResult`` whose output is ``{url, text, length}``, or a
            ``ToolFailure`` describing the request error.
        """
        headers = {"User-Agent": "Mozilla/5.0 (compatible; AutoDS/1.0)"}
        try:
            resp = requests.get(url, headers=headers, timeout=12)
            resp.raise_for_status()
            # When a text/* response declares no charset, requests assumes
            # ISO-8859-1, which garbles UTF-8/GBK pages. In that case use the
            # encoding detected from the body instead.
            declared = resp.headers.get("Content-Type", "")
            if "charset" not in declared.lower():
                resp.encoding = resp.apparent_encoding or resp.encoding
            html = resp.text
        except Exception as e:
            return ToolFailure(error=f"request error: {e}")
        text = ""
        try:
            from bs4 import BeautifulSoup
            soup = BeautifulSoup(html, "html.parser")
            # Drop elements whose content is never visible page text.
            for tag in soup(["script", "style", "noscript"]):
                tag.extract()
            text = _clean_text(soup.get_text(" "))
        except Exception:
            # bs4 missing or parsing failed: strip tags with a plain regex.
            text = _clean_text(re.sub(r"<[^>]+>", " ", html))
        limit = int(max_chars or 4000)
        if limit <= 0:
            # A negative limit would otherwise trim characters off the tail.
            limit = 4000
        text = text[:limit]
        return ToolResult(output={"url": url, "text": text, "length": len(text)})
