# Copyright 2025 ZTE Corporation.
# All Rights Reserved.
#
#    Licensed under the Apache License, Version 2.0 (the "License"); you may
#    not use this file except in compliance with the License. You may obtain
#    a copy of the License at
#
#         http://www.apache.org/licenses/LICENSE-2.0
#
#    Unless required by applicable law or agreed to in writing, software
#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
#    License for the specific language governing permissions and limitations
#    under the License.

import asyncio
import re
from typing import Tuple, Dict, List
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup

class CSFetchWebsiteContent:
    """
    A tool to fetch and parse the textual content of a given website URL.

    ``call_tool`` is the entry point: it validates the ``url`` argument,
    downloads the page with ``requests`` and returns the visible text with
    scripts, styles and redundant whitespace removed.
    """

    # URL path suffixes that identify document files this tool refuses to read.
    _DOCUMENT_EXTENSIONS = ('.pdf', '.doc', '.docx')
    # Only these URL schemes are fetched; anything else is reported as invalid.
    _ALLOWED_SCHEMES = ('http', 'https')

    def __init__(self, timeout: float = 15):
        """
        Initializes the tool and sets up its JSON schema definition.

        Args:
            timeout: Seconds passed as the ``timeout`` of the HTTP request.
        """
        self.timeout = timeout
        self.tool_json_schema = {
            "name": 'fetch_website_content',
            "description": 'Extracts and returns the main text content from a webpage URL. This tool cannot read document files like PDFs or DOCs.',
            "parameters": {
                "type": "object",
                "properties": {
                    "url": {
                        "type": "string",
                        "description": "The full URL of the webpage to read."
                    }
                },
                "required": ["url"]
            }
        }
        # Browser-like headers sent with every request.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Language": "en-US,en;q=0.9",
            "Referer": "https://www.google.com/",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
        }

    def _is_document_url(self, url: str) -> bool:
        """Checks if a URL points to a document file."""
        # Only the path is inspected, so query strings and fragments are
        # ignored; the comparison is case-insensitive.
        path = urlparse(url).path.lower()
        return path.endswith(self._DOCUMENT_EXTENSIONS)

    @staticmethod
    def _extract_text(html: str) -> str:
        """Strips markup from an HTML string and normalizes its whitespace."""
        soup = BeautifulSoup(html, "html.parser")

        # Script and style bodies are not readable page content.
        for tag in soup(["script", "style"]):
            tag.decompose()

        text = soup.get_text(" ")
        # Collapse runs of spaces/tabs, then reduce every whitespace run that
        # contains a line break to a single newline.
        text = re.sub(r"[ \t]+", " ", text)
        text = re.sub(r"\s*\n\s*", "\n", text)
        return text.strip()

    def _fetch_text(self, url: str) -> str:
        """Downloads ``url`` (blocking) and returns its cleaned text content."""
        page = requests.get(
            url,
            timeout=self.timeout,
            headers=self.headers,
        )
        page.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)

        # Decode with the encoding detected from the body rather than the one
        # taken from the response headers.
        page.encoding = page.apparent_encoding
        return self._extract_text(page.text)

    async def call_tool(self, arguments: dict, **kwargs) -> Tuple[str, bool]:
        """
        Fetches the content of a website and returns it as cleaned text.

        Args:
            arguments (dict): A dictionary containing the tool's parameters.
                - url (str): The absolute http(s) URL of the page to read.
            **kwargs: Accepted for interface compatibility; not used.

        Returns:
            Tuple[str, bool]: ``(text, True)`` with the cleaned page text on
            success, or ``(error_message, False)`` when the URL is missing,
            invalid, points to a document file, or the request fails.
        """
        website_url = arguments.get("url") if isinstance(arguments, dict) else None
        if not isinstance(website_url, str) or not website_url.strip():
            return "Missing required parameter 'url': expected a non-empty URL string.", False
        website_url = website_url.strip()

        # urlparse raises ValueError for some malformed URLs (for example an
        # unbalanced IPv6 bracket); report that instead of crashing.
        try:
            url_parts = urlparse(website_url)
        except ValueError as exc:
            return f"Invalid URL '{website_url}': {exc}", False

        # 1. Check if the URL is for a document file
        if self._is_document_url(website_url):
            return f"This tool cannot read document files. Please use a file download or reader tool for {website_url}", False

        # 2. Reject anything that is not an absolute http(s) URL
        if url_parts.scheme not in self._ALLOWED_SCHEMES or not url_parts.netloc:
            return f"Invalid URL '{website_url}': only absolute http(s) URLs are supported.", False

        # 3. Fetch and clean the page. requests is blocking, so the work runs
        #    in the default executor to keep the event loop responsive.
        loop = asyncio.get_running_loop()
        try:
            text = await loop.run_in_executor(None, self._fetch_text, website_url)
        except requests.RequestException as exc:
            return f"Failed to fetch {website_url}: {exc}", False

        return text, True
