from meta_researcher.tool.tools.search_engine.log import logger
import os

import httpx
from asyncache import cached as acached
from cachetools import TTLCache
from playwright.async_api import async_playwright

from meta_researcher.tool.tools.search_engine.base_search import SearchConfig
from meta_researcher.tool.tools.search_engine.website_crawler.base_crawler import BaseContentFetch
from meta_researcher.tool.tools.search_engine.website_crawler.content_process import (
    CONTENT_PROCESS_MAP,
    DYNAMIC_WEBSITES,
    extract_main_content,
)
from meta_researcher.tool.tools.search_engine.website_crawler.crawl4ai_crawler import Craw4aiContentFetch

# Debug switch, read once at import time from the LLM_SEARCH_DEBUG environment
# variable. Defaults to 0 when unset; the value must parse as an integer.
LLM_SEARCH_DEBUG = int(os.getenv("LLM_SEARCH_DEBUG", "0"))


class SimpleContentFetch(BaseContentFetch):
    """Fetch a web page and reduce it to its main content.

    URLs that contain one of the ``DYNAMIC_WEBSITES`` markers are rendered
    with a headless Chromium browser (Playwright); every other URL is
    downloaded with a plain ``httpx`` GET request. The downloaded page is then
    passed through ``extract_main_content``.
    """

    def __init__(self, args: SearchConfig):
        """Store the configuration and the default request headers.

        Args:
            args: Search configuration. This class reads ``args.timeout``
                (static fetch) and ``args.max_retry`` (retry loop in
                :meth:`fetch`).
        """
        self.args = args
        self.request_headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0",
        }
        # Optional retriever used by ``fetch`` to condense long pages. It is
        # never assigned inside this class - presumably injected by the
        # caller; TODO confirm.
        self.rag = None

    async def _fetch_dynamic_website(
        self, url: str, query: str | None = None
    ) -> tuple[str, str]:
        """Render ``url`` in headless Chromium and return its HTML.

        Args:
            url: Page to open.
            query: Unused here; kept for signature parity with
                :meth:`_fetch_static_website`.

        Returns:
            ``(content, final_url)`` where ``content`` is the rendered HTML
            (``""`` if the page body could not be read) and ``final_url`` is
            the page URL after navigation.

        Raises:
            Exception: Errors from launching the browser or from
                ``page.goto`` propagate to the caller.
        """
        async with async_playwright() as p:
            proxy = None
            if "all_proxy" in os.environ:
                # A plain dict is the documented shape of Playwright's proxy
                # settings, so no private ``playwright._impl`` import is needed.
                proxy = {"server": os.environ["all_proxy"]}
            browser = await p.chromium.launch(proxy=proxy)
            try:
                page = await browser.new_page()
                await page.goto(url)
                try:
                    await page.wait_for_selector("body", state="attached")
                    content = await page.content()
                except Exception as e:
                    # Best effort: an unreadable body yields empty content.
                    # ``Exception`` (not a bare except) so that task
                    # cancellation and KeyboardInterrupt still propagate.
                    logger.warning(f"failed to read page content of {url}: {e!r}")
                    content = ""
                # Read the final URL while the page is still open.
                final_url = page.url
            finally:
                # Closing the browser also closes its pages; doing it in
                # ``finally`` avoids leaking the browser when navigation fails.
                await browser.close()
        return content, final_url

    async def _fetch_static_website(
        self, url: str, query: str | None = None
    ) -> tuple[bytes, str]:
        """Download ``url`` with a single HTTP GET, following redirects.

        Args:
            url: Page to download.
            query: Unused here; kept for signature parity with
                :meth:`_fetch_dynamic_website`.

        Returns:
            ``(content, final_url)`` - the raw response body and the URL of
            the final response as a string. The HTTP status code is not
            checked.
        """
        async with httpx.AsyncClient(
            timeout=self.args.timeout, follow_redirects=True
        ) as client:
            response = await client.get(url, headers=self.request_headers)
            return response.content, str(response.url)

    def _check_is_dynamic_website(self, url) -> bool:
        """Return True if ``url`` contains any ``DYNAMIC_WEBSITES`` marker."""
        return any(marker in url for marker in DYNAMIC_WEBSITES)

    @acached(cache=TTLCache(maxsize=100, ttl=600))
    async def fetch(self, url: str, query: str | None = None) -> tuple[str, dict]:
        """Fetch ``url`` and return its main content plus metadata.

        The fetch is attempted up to ``self.args.max_retry`` times; every
        attempt starts again from the URL that was passed in. The extracted
        content is truncated to its first 8192 items. When the page has no
        special-case handler, the content is longer than 2000, ``self.rag``
        is set and a ``query`` is given, the content is replaced by the top-3
        passages returned by ``self.rag.async_similarity_search``.

        Results are memoised by the ``acached`` decorator in a ``TTLCache``
        (``maxsize=100``, ``ttl=600``).
        NOTE(review): the failure result ``("", meta)`` is returned normally,
        so it is presumably cached as well, and the cached ``meta`` dict is
        shared between callers - confirm this is intended.

        Args:
            url: Page to fetch.
            query: Optional search query, used only for the RAG step.

        Returns:
            ``(main_content, meta)``. ``meta`` always holds ``original_url``
            and ``query``; after a successful attempt it also holds
            ``dynamic_website``, ``special_url``, ``new_url`` and ``rag``
            (plus ``soup`` and ``raw_content`` when ``LLM_SEARCH_DEBUG`` is
            truthy). If every attempt fails, ``main_content`` is ``""``.
        """
        meta = {
            "original_url": url,
            "query": query,
        }
        for _ in range(self.args.max_retry):
            try:
                # ``url`` is deliberately not overwritten with the redirected
                # URL: a retry must start from the requested URL, and the log
                # messages below must name it.
                if self._check_is_dynamic_website(url):
                    logger.info(f"use dynamic fetch for url {url} ...")
                    meta["dynamic_website"] = True
                    content, final_url = await self._fetch_dynamic_website(
                        url, query
                    )
                else:
                    logger.info(f"use static fetch for url {url} ...")
                    meta["dynamic_website"] = False
                    content, final_url = await self._fetch_static_website(
                        url, query
                    )

                main_content, special_url, soup = extract_main_content(
                    content, [meta["original_url"], final_url], CONTENT_PROCESS_MAP
                )
                main_content = main_content[:8192]
                meta["special_url"] = special_url
                meta["new_url"] = final_url
                meta["rag"] = False
                if LLM_SEARCH_DEBUG:
                    meta["soup"] = soup
                    meta["raw_content"] = content
                if (
                    (not special_url)
                    and len(main_content) > 2000
                    and (self.rag is not None)
                    and (query is not None)
                ):
                    rag_doc = await self.rag.async_similarity_search(
                        query=query,
                        document=main_content,
                        k=3,
                    )
                    if isinstance(rag_doc, str):
                        # A plain-string result replaces the content but is
                        # not flagged as RAG output (matches prior behavior).
                        main_content = rag_doc
                    else:
                        main_content = "\n\n".join(
                            doc.page_content.removeprefix("passage: ")
                            for doc in rag_doc
                        )
                        meta["rag"] = True
                return main_content, meta
            except Exception as e:
                # Include the error so failed attempts can be diagnosed.
                logger.warning(f"error of fetch {url}, retry ... ({e!r})")
        logger.warning(
            f"Error of fetch content for {url} after try {self.args.max_retry} times"
        )
        return "", meta
