from __future__ import annotations

import json
import logging
import os
import time
from typing import Any, Dict, List, Optional
from urllib.parse import quote_plus

import requests
from bs4 import BeautifulSoup

from ..utils.config import PipelineConfig
from ..utils.io import write_json
from ..utils.logging_utils import get_logger


class GoogleNewsFetcher:
    """Fetch news results via Google Custom Search API with caching.

    Requires a Google API key and Search Engine ID provided at initialization or via PipelineConfig.

    Two dictionaries are kept in memory and, when a file path was supplied,
    rewritten to disk after every update:

    * ``cache`` maps ``"<query>_<start_date>_<end_date>"`` to the result dict
      returned by :meth:`fetch_news`.
    * ``link_cache`` maps an article URL to the summary string produced by
      :meth:`summarize_link`.
    """

    # Total number of attempts for a single result page while the API keeps
    # answering HTTP 429, and the pause between those attempts.
    _MAX_RATE_LIMIT_ATTEMPTS = 3
    _RATE_LIMIT_SLEEP_SECONDS = 10

    def __init__(
        self,
        api_key: Optional[str],
        search_engine_id: Optional[str],
        cache_file: Optional[str] = None,
        link_cache_file: Optional[str] = None,
        base_url: str = "https://www.googleapis.com/customsearch/v1",
        logger: Optional[logging.Logger] = None,
    ) -> None:
        """Store credentials and load both caches from disk when paths are given.

        Args:
            api_key: Google API key; checked lazily by :meth:`fetch_news`.
            search_engine_id: Custom Search Engine ID (sent as ``cx``).
            cache_file: Optional JSON file backing the search-result cache.
            link_cache_file: Optional JSON file backing the link-summary cache.
            base_url: Endpoint the search requests are sent to.
            logger: Logger to use; defaults to one named after the class.
        """
        self.api_key = api_key
        self.search_engine_id = search_engine_id
        self.cache_file = cache_file
        self.link_cache_file = link_cache_file
        self.base_url = base_url
        self.logger = logger or get_logger(self.__class__.__name__)
        self.cache: Dict[str, Any] = self._load_json(cache_file) if cache_file else {}
        self.link_cache: Dict[str, str] = self._load_json(link_cache_file) if link_cache_file else {}

    @staticmethod
    def _load_json(path: Optional[str]) -> Dict[str, Any]:
        """Return the JSON object stored at *path*, or ``{}`` when unusable.

        A missing path, a missing file, malformed JSON, and JSON whose top
        level is not an object all yield an empty dict, so the caches always
        start out as real dictionaries.
        """
        if not path or not os.path.exists(path):
            return {}
        try:
            with open(path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except json.JSONDecodeError:
            return {}
        # The caches are used with ``in`` and item assignment; a list or a
        # scalar in the file would break them later, so treat it as empty.
        return data if isinstance(data, dict) else {}

    def _save_cache(self) -> None:
        """Write the search-result cache to ``cache_file`` if one is configured."""
        if self.cache_file:
            write_json(self.cache_file, self.cache, indent=2)

    def _save_link_cache(self) -> None:
        """Write the link-summary cache to ``link_cache_file`` if one is configured."""
        if self.link_cache_file:
            write_json(self.link_cache_file, self.link_cache, indent=2)

    @staticmethod
    def _key(query: str, start_date: str, end_date: str) -> str:
        """Build the search-result cache key for a query and date pair."""
        return f"{query}_{start_date}_{end_date}"

    def _fetch_page(self, params: Dict[str, Any]) -> Optional[List[Dict[str, Any]]]:
        """Request one result page and return its items.

        Returns:
            The page's ``items`` list (empty when the response has none), or
            ``None`` when the page could not be obtained: a non-200 status,
            an undecodable body, or rate limiting that outlasted the retries.
        """
        for attempt in range(1, self._MAX_RATE_LIMIT_ATTEMPTS + 1):
            resp = requests.get(self.base_url, params=params, timeout=30)
            if resp.status_code == 429:
                # Retry the SAME page; moving on would silently drop it.
                if attempt < self._MAX_RATE_LIMIT_ATTEMPTS:
                    self.logger.warning(
                        "Rate limited by Google API; sleeping %ss",
                        self._RATE_LIMIT_SLEEP_SECONDS,
                    )
                    time.sleep(self._RATE_LIMIT_SLEEP_SECONDS)
                continue
            if resp.status_code != 200:
                self.logger.error("Google API error %s: %s", resp.status_code, resp.text)
                return None
            try:
                data = resp.json()
            except ValueError:
                # Every JSON decode error requests can raise is a ValueError
                # subclass, regardless of requests version or JSON backend.
                self.logger.error("Error decoding Google API JSON response")
                return None
            items = data.get("items") if isinstance(data, dict) else None
            return items or []
        self.logger.error(
            "Google API still rate limiting after %d attempts; giving up",
            self._MAX_RATE_LIMIT_ATTEMPTS,
        )
        return None

    def fetch_news(self, query: str, start_date: str, end_date: str) -> Dict[str, Any]:
        """Return up to 100 search results for *query*, using the cache when possible.

        The date restriction sent to the API is
        ``date:r:<end_date>:<start_date>`` (dashes removed), i.e. ``end_date``
        is placed in the first slot and ``start_date`` in the second. The
        cache key is built from the arguments in the order given.

        Args:
            query: Search terms, passed unencoded; ``requests`` URL-encodes
                query parameters itself.
            start_date: ``YYYY-MM-DD`` date placed second in the sort range.
            end_date: ``YYYY-MM-DD`` date placed first in the sort range.

        Returns:
            A dict with keys ``"query"``, ``"sort"`` and ``"results"`` (the
            concatenated ``items`` of every fetched page). If a page fails,
            the results gathered so far are returned but not cached, so a
            later call retries instead of reusing incomplete data.

        Raises:
            AssertionError: If the API key or search engine ID is missing.

        Exceptions raised by ``requests.get`` (timeouts, connection errors)
        are not caught here and propagate to the caller.
        """
        # Explicit raise rather than ``assert`` so the check survives ``python -O``.
        if not (self.api_key and self.search_engine_id):
            raise AssertionError("Google API key and Search Engine ID are required")
        cache_key = self._key(query, start_date, end_date)
        if cache_key in self.cache:
            self.logger.info("Using cached news for '%s' (%s - %s)", query, start_date, end_date)
            return self.cache[cache_key]

        sort_param = f"date:r:{end_date.replace('-', '')}:{start_date.replace('-', '')}"

        results: Dict[str, Any] = {"query": query, "sort": sort_param, "results": []}
        complete = True
        for start in range(1, 100, 10):  # up to top 100
            params = {
                # Raw query: pre-encoding it here would be encoded a second
                # time by requests and sent to the API as literal "+"/"%xx".
                "q": query,
                "cx": self.search_engine_id,
                "key": self.api_key,
                "sort": sort_param,
                "num": 10,
                "start": start,
            }
            items = self._fetch_page(params)
            if items is None:
                complete = False
                break
            if not items:
                # An empty page means there is nothing further to page through.
                break
            results["results"].extend(items)

        if complete:
            self.cache[cache_key] = results
            self._save_cache()
        return results

    def summarize_link(self, link: str, fallback_summary: str) -> str:
        """Return ``"<page title> - <meta description>"`` for *link*.

        The summary is served from the link cache when present. Otherwise the
        page is downloaded and parsed; the meta description falls back to
        *fallback_summary* when the page has none. Successful summaries are
        cached and persisted.

        Args:
            link: URL of the article to summarize.
            fallback_summary: Text returned when the page cannot be fetched or
                parsed, and used in place of a missing meta description.

        Returns:
            The cached or freshly built summary, or *fallback_summary* when
            *link* is empty, the server answers with an error status, or any
            exception occurs while fetching or parsing.
        """
        if link in self.link_cache:
            return self.link_cache[link]
        if not link:
            # Nothing to download; skip the request that would only fail.
            return fallback_summary
        try:
            resp = requests.get(link, timeout=5)
            # Do not summarize (and permanently cache) 4xx/5xx error pages.
            resp.raise_for_status()
            soup = BeautifulSoup(resp.text, "html.parser")
            # ``title.string`` is None for an empty or nested <title>; without
            # the ``or`` the summary would start with the text "None".
            full_title = (soup.title.string or "") if soup.title else ""
            meta_desc = soup.find("meta", attrs={"name": "description"})
            full_snippet = meta_desc.get("content", fallback_summary) if meta_desc else fallback_summary
            summary = f"{full_title} - {full_snippet}".strip()
            self.link_cache[link] = summary
            self._save_link_cache()
            return summary
        except Exception as e:  # broad catch to be resilient to parsing/network errors
            self.logger.warning("Failed to summarize link %s: %s", link, e)
            return fallback_summary


def attach_news_to_entries(
    entries: List[Dict[str, Any]],
    fetcher: GoogleNewsFetcher,
) -> List[Dict[str, Any]]:
    """Add a ``"news"`` list to history items and keep the entries that got one.

    For every history item that has a ``comment_text`` key and a non-empty
    ``start_time``, news is searched for the entry's ``title`` over the seven
    days ending at ``start_time``. Each search hit is turned into a summary
    via ``fetcher.summarize_link``; a non-empty summary list is stored on the
    history item under ``"news"`` (the item is modified in place).

    Note that the history date is handed to ``fetch_news`` as ``start_date``
    and the date seven days earlier as ``end_date``.

    Returns:
        The entries, in input order, that have at least one history item
        with news attached.
    """
    kept: List[Dict[str, Any]] = []
    for entry in entries:
        search_terms = entry.get("title", "")
        annotated_items = 0
        history = entry.get("history", []) or []
        for record in history:
            if "comment_text" not in record:
                continue
            window_end = record.get("start_time")
            if not window_end:
                continue
            # Look back one week from the history item's date.
            window_begin = _days_back(window_end, days=7)

            fetched = fetcher.fetch_news(
                search_terms,
                start_date=window_end,
                end_date=window_begin,
            )
            summaries = []
            for hit in fetched.get("results", []):
                fallback = f"{hit.get('title', '')} - {hit.get('snippet', '')}"
                summaries.append(fetcher.summarize_link(hit.get("link", ""), fallback))
            if not summaries:
                continue
            record["news"] = summaries
            annotated_items += 1
        if annotated_items:
            kept.append(entry)
    return kept


def _days_back(date_str: str, days: int) -> str:
    from datetime import datetime, timedelta

    dt = datetime.strptime(date_str, "%Y-%m-%d")
    return (dt - timedelta(days=days)).strftime("%Y-%m-%d")
