import os
import sys
import json
import logging
from typing import Dict, List, Any
from typing import Optional, Dict, Any
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

# Configure logging once at import time: INFO threshold and
# "timestamp - level - message" records.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
# Logger named after this module; used by the code below.
logger = logging.getLogger(__name__)

from meta_researcher.tool.base import BaseTool
from meta_researcher.tool.tools.doc.simple_doc_parser import doc_parser


class FileParseTool(BaseTool):
    """Tool that parses remote files and merges their text into one report.

    Every URL is handed to ``doc_parser`` on a worker thread. The per-file
    results are then rendered as a numbered list, in the same order as the
    ``urls`` argument, and returned as a single string.
    """

    name = "fileparser"
    description = '''
    File parsing tool for multiple formats. Supports common document, spreadsheet, presentation, web, and plain text formats, with optional image extraction. You should invoke this tool when encountering the following needs:
    1. Reading and aggregating textual content from documents such as PDF, Word (docx), or PowerPoint (pptx).
    2. Parsing plain text (txt) or web (html) files to extract text and optionally images.
    3. Processing CSV, TSV, or Excel (xlsx/xls) spreadsheet files and converting cell contents into a unified structure.
    4. Extracting embedded images from documents (e.g., charts in PowerPoint, images in Word, or <img> tags in HTML).
    5. Handling unsupported legacy or rare formats (e.g., doc) by logging the event and returning empty placeholders.

    Supported file types: pdf / docx / doc / pptx / txt / html / csv / tsv / xlsx / xls
    "examples": [{"urls": ["https://arxiv.org/pdf/2505.07473.pdf", "https://example.com/report.xlsx"], "files_type":["pdf", "xlsx"], "parallel_workers": 5}]
    '''
    parameters = {
        "type": "object",
        "properties": {
            "urls": {
                "type": "array",
                "items": {"type": "string"},
                "description": "The network link addresses (URLs) of the files to be read, which can be HTTP or HTTPS URLs."
            },
            # Read by execute() and shown in the description example, so it is
            # declared here as well. It stays optional.
            "files_type": {
                "type": "array",
                "items": {"type": "string"},
                "description": "Optional file types (e.g. 'pdf', 'xlsx') for the entries of 'urls', given in the same order."
            },
            "parallel_workers": {
                "type": "integer",
                "description": "Number of parallel threads, default is 4."
            }
        },
        "required": ["urls"]
    }

    # Thread count used when 'parallel_workers' is missing or unusable.
    _DEFAULT_WORKERS = 4
    # Value forwarded to doc_parser as ``max_ref_token``.
    _MAX_REF_TOKEN = 30000

    def __init__(self):
        super().__init__()

    def _parse_single(self, url: str, file_type: Optional[str] = None) -> Dict[str, Any]:
        """Parse one URL with ``doc_parser`` and never raise.

        Args:
            url: Location of the file to parse.
            file_type: Optional file type hint for ``url``.

        Returns:
            Whatever ``doc_parser`` returns on success; on failure a dict
            ``{"url": url, "error": <message>}``.
        """
        try:
            logger.info("Parsing URL: %s", url)
            # NOTE(review): the hint is stringified before being forwarded, so
            # a missing hint reaches doc_parser as the literal "None". Kept
            # as-is because doc_parser's handling of a real None is not visible
            # here -- confirm against its implementation.
            return doc_parser(
                url=url,
                max_ref_token=self._MAX_REF_TOKEN,
                file_type=str(file_type),
            )
        except Exception as e:
            # Tool boundary: one bad file must not abort the whole batch.
            logger.error("Failed to parse %s: %s", url, e)
            return {"url": url, "error": str(e)}

    @classmethod
    def _resolve_workers(cls, requested: Any, job_count: int) -> int:
        """Turn the raw 'parallel_workers' argument into a valid thread count.

        Values that are missing, non-numeric or below 1 fall back to the
        default; the result is never larger than the number of jobs.
        """
        try:
            workers = int(requested)
        except (TypeError, ValueError):
            workers = cls._DEFAULT_WORKERS
        if workers < 1:
            # ThreadPoolExecutor raises ValueError for max_workers <= 0.
            workers = cls._DEFAULT_WORKERS
        return max(1, min(workers, job_count))

    @staticmethod
    def _align_file_types(raw_types: Any, url_count: int) -> List[Optional[str]]:
        """Return exactly one file type hint (or None) per URL.

        A missing value yields all-None, a single non-list value is applied to
        every URL, a short list is padded with None and a long one is cut.
        """
        if raw_types is None:
            hints: List[Optional[str]] = []
        elif isinstance(raw_types, (list, tuple)):
            hints = list(raw_types[:url_count])
        else:
            hints = [raw_types] * url_count
        hints.extend([None] * (url_count - len(hints)))
        return hints

    @staticmethod
    def _format_result(position: int, requested_url: str, result: Dict[str, Any]) -> str:
        """Render one parse result as a numbered text segment."""
        # Fall back to the requested URL when the parser did not echo one.
        url = result.get("url") or requested_url
        title = result.get("title", "")
        content = result.get("content", "")
        error = result.get("error")
        if not content and error:
            # Surface the failure instead of emitting an empty entry.
            content = f"Failed to parse file: {error}"
        return f"{position}. Link: {url}\n  File_content: \n(1). title: {title} \n(2). content: {content}"

    def execute(self, args: Dict[str, Any]) -> Dict[str, Any]:
        """Parse every requested URL in parallel and merge the results.

        Args:
            args: Tool arguments. ``urls`` (required) is a non-empty list of
                URLs; ``files_type`` (optional) lists file type hints in the
                same order as ``urls``; ``parallel_workers`` (optional) is the
                thread count, default 4.

        Returns:
            ``{"content": <text>, "success": <bool>}``. On success ``content``
            holds one numbered segment per URL, in input order. ``success`` is
            False when ``urls`` is invalid or the results cannot be formatted.
        """
        urls: List[str] = args.get("urls", [])
        if not urls or not isinstance(urls, list):
            return {"content": "The parameter 'urls' must be a non-empty list.", "success": False}

        raw_types = args.get("files_type")
        logger.debug("files_type: %s", raw_types)
        if isinstance(raw_types, (list, tuple)) and raw_types and len(raw_types) != len(urls):
            logger.warning(
                "Got %d file type(s) for %d URL(s); unmatched URLs get no type hint.",
                len(raw_types), len(urls),
            )
        # Pairing with zip() alone would silently drop every URL that has no
        # matching hint (all of them when 'files_type' is omitted).
        file_types = self._align_file_types(raw_types, len(urls))
        workers = self._resolve_workers(args.get("parallel_workers"), len(urls))

        # Slots are filled by input index so the output order is deterministic
        # even though futures complete in arbitrary order.
        results: List[Optional[Dict[str, Any]]] = [None] * len(urls)
        with ThreadPoolExecutor(max_workers=workers) as executor:
            future_to_index = {
                executor.submit(self._parse_single, url, file_type): index
                for index, (url, file_type) in enumerate(zip(urls, file_types))
            }
            progress = tqdm(
                as_completed(future_to_index),
                total=len(future_to_index),
                desc="get url content",
            )
            for future in progress:
                results[future_to_index[future]] = future.result()

        try:
            segments = [
                self._format_result(position, url, result)
                for position, (url, result) in enumerate(zip(urls, results), start=1)
            ]
            return {"content": "\n\n".join(segments), "success": True}
        except Exception as e:
            # E.g. doc_parser returned something that is not dict-like.
            logger.error("Failed to format parse results: %s", e)
            return {"content": str(e), "success": False}
