import os, sys
os.chdir(sys.path[0])
import requests
import time
import random
import asyncio
import json
import json5
import copy
from typing import List, Union
from concurrent.futures import ThreadPoolExecutor, as_completed
from requests.exceptions import RequestException, Timeout
from qwen_agent.tools.base import BaseTool, register_tool

try:
    from pdf_parser import download_pdf, parse_pdf
except:
    from .pdf_parser import download_pdf, parse_pdf

# API key sent as a Bearer token to the Jina reader endpoint (see Visit.jina_readpage).
# Importing this module fails with an AssertionError when the variable is unset.
JINA_API_KEY = os.getenv("JINA_API_KEY", None)
assert JINA_API_KEY is not None, "JINA_API_KEY is not set."
# Upper bound, in tokenizer tokens, on the page content passed to the summarizer.
MAX_WEBPAGE=24000

# Settings of the chat-completions endpoint used for summarization (see Visit.llm):
# base URL, value of the Authorization header, and model name. Each is None when
# its environment variable is unset; no validation happens here.
URL = os.getenv('EVAL_LLM_URL')
KEY = os.getenv('EVAL_LLM_KEY')
LLM_NAME = os.getenv('EVAL_LLM_NAME')


from transformers import AutoTokenizer

# Tokenizer used to truncate page content to MAX_WEBPAGE tokens before summarization.
# NOTE(review): "PATH/Qwen3-8B" looks like a placeholder for a real model path — confirm.
tokenizer = AutoTokenizer.from_pretrained("PATH/Qwen3-8B")

import re


# Prompt template for the summarizer LLM. It is filled with str.format using the
# `webpage_content` and `goal` placeholders. The reply is expected to be JSON; the
# "evidence" and "summary" keys are the ones read back by Visit.readpage.
extractor_prompt = """Please process the following webpage content and user goal to extract relevant information:

## **Webpage Content** 
{webpage_content}

## **User Goal**
{goal}

## **Task Guidelines**
1. **Content Scanning for Rational**: Locate the **specific sections/data** directly related to the user's goal within the webpage content
2. **Key Extraction for Evidence**: Identify and extract the **most relevant information** from the content, you never miss any important information, output the **full original context** of the content as far as possible, it can be more than three paragraphs.
3. **Summary Output for Summary**: Organize into a concise paragraph with logical flow, prioritizing clarity and judge the contribution of the information to the goal.

**Final Output Format using JSON format has "rational", "evidence", "summary" feilds**
"""

def parse_json_output(raw):
    """Parse a JSON5 document out of an LLM reply.

    When the reply contains a ```json fenced block, only the text inside the
    first such block is parsed; otherwise the whole reply is parsed.

    Args:
        raw: The reply text.

    Returns:
        The parsed value, or None when parsing fails (the error is printed).
    """
    fenced = re.search(r'```json\s*\n(.*?)\n```', raw, re.DOTALL)

    # Pick the text to parse together with the matching error-message prefix.
    if fenced:
        candidate, failure_label = fenced.group(1), "Error parsing JSON5"
    else:
        candidate, failure_label = raw, "Error parsing raw string as JSON5"

    try:
        parsed = json5.loads(candidate)
    except Exception as err:
        print(f"{failure_label}: {err}")
        return None
    return parsed

class Visit(BaseTool):
    """Tool that reads web pages or PDF documents and summarizes them for a goal.

    Page text is fetched through the Jina reader endpoint or through the
    ``download_pdf``/``parse_pdf`` helpers, truncated to ``MAX_WEBPAGE`` tokens
    and then condensed by the chat-completions endpoint configured through the
    module-level ``URL``, ``KEY`` and ``LLM_NAME`` settings.
    """

    name = 'visit'
    description = 'Visit webpage(s) or paper(s) and return the summary of the content.'
    parameters = {
        "type": "object",
        "properties": {
            "url": {
                "type": "array",
                "items": {
                    "type": "string"
                },
                "minItems": 1,
                "description": "The URL(s) of the webpage(s) or paper(s) to visit. Can be a single URL or an array of URLs."
            },
            "goal": {
                "type": "string",
                "description": "The goal of the visit for webpage(s) or paper(s)."
            },
            "parse_type": {
                "type": "string",
                "enum": ["html", "pdf"],
                "default": "html",
                "description": "Specify whether to visit a HTML webpage or a PDF paper. Must be either 'html' or 'pdf'."
            }
        },
        "required": ["url", "goal"]
    }

    # Text returned when the tool arguments cannot be parsed or are malformed.
    _INVALID_REQUEST = """[Visit] Invalid request format. Required format:
{
    "url": "array of string(s)",  // Required
    "goal": "string",                     // Required
    "parse_type": "html" or "pdf"    // Optional, defaults to "html"
}"""

    def call(self, params: Union[str, dict], **kwargs) -> tuple:
        """Visit one or several URLs and summarize each of them for the goal.

        Args:
            params: Tool arguments (JSON string or dict) holding ``url``,
                ``goal`` and optionally ``parse_type``.

        Returns:
            A ``(response, summary_messages, ok)`` tuple. ``response`` is the
            summary text (several summaries are joined with a ``=======``
            separator line), ``summary_messages`` holds the summarizer
            conversations and ``ok`` is True only when every URL was read
            successfully.
        """
        try:
            params = self._verify_json_format_args(params)
            url = params["url"]
            goal = params["goal"]
            parse_type = params.get("parse_type", 'html')
        except Exception:
            return self._INVALID_REQUEST, [], False

        if isinstance(url, str):
            # NOTE: on this path the second element is the single conversation
            # returned by readpage (or None on failure), not a list of
            # conversations; kept as-is for existing callers.
            response, summary_message_list, tool_status = self.readpage(url, goal, parse_type)
            return response.strip(), summary_message_list, all([tool_status])

        if not isinstance(url, (list, tuple)):
            # Explicit validation instead of an assert, which is stripped under -O.
            return self._INVALID_REQUEST, [], False

        responses, summary_message_list, tool_status_list = [], [], []
        with ThreadPoolExecutor(max_workers=3) as executor:
            futures = {executor.submit(self.readpage, u, goal, parse_type): u for u in url}
            # Results are collected in completion order, not input order.
            for future in as_completed(futures):
                try:
                    tool_result, summary_message, tool_status = future.result()
                except Exception as e:
                    responses.append(f"Error fetching {futures[future]}: {str(e)}")
                    # A crashed worker is a failed visit; without this entry the
                    # overall status could be reported as success.
                    tool_status_list.append(False)
                    continue
                tool_status_list.append(tool_status)
                responses.append(tool_result)
                if summary_message is not None:
                    summary_message_list.append(summary_message)

        if not tool_status_list:
            # No URL was processed at all (empty list): report failure.
            tool_status_list = [False]
        response = "\n=======\n".join(responses)
        return response.strip(), summary_message_list, all(tool_status_list)

    @staticmethod
    def _fetch_failed(content) -> bool:
        """Return True when a reader result is an error marker, not page text."""
        # The return type of parse_pdf is not visible here, so anything that is
        # not a string is treated as a failed fetch.
        return not isinstance(content, str) or content.startswith("[visit] Failed")

    @classmethod
    def _is_usable_content(cls, content) -> bool:
        """Return True when ``content`` is real page text worth summarizing."""
        if cls._fetch_failed(content) or not content:
            return False
        return content != "[visit] Empty content." and not content.startswith("[document_parser]")

    def readpage(self, url: str, goal: str, parse_type: str = 'html') -> tuple:
        """
        Read a page and summarize it for the goal, retrying on fetch failures.

        For ``parse_type == 'html'`` the Jina reader is tried first and the PDF
        reader is the fallback; for any other value the order is reversed.

        Args:
            url: The URL to read
            goal: The goal/purpose of reading the page
            parse_type: html or pdf

        Returns:
            tuple: ``(text, summary_message, ok)``. On success ``text`` is the
            extracted information, ``summary_message`` the summarizer
            conversation and ``ok`` is True. After all attempts failed,
            ``text`` is an error message, ``summary_message`` is None and
            ``ok`` is False.
        """
        if parse_type == 'html':
            primary, fallback = self.jina_readpage, self.idp_readpage
        else:
            primary, fallback = self.idp_readpage, self.jina_readpage

        max_attempts = 8
        for _ in range(max_attempts):
            content = primary(url)
            if self._fetch_failed(content):
                content = fallback(url)

            # Every failure marker is rejected here, including the
            # "[visit] Failed to download ..." text from idp_readpage, so an
            # error message is never summarized as if it were page content.
            if self._is_usable_content(content):
                return self._summarize(url, goal, content)

        return f"[visit] Failed to read page (url: {url}, goal: {goal})", None, False

    def _summarize(self, url: str, goal: str, content: str) -> tuple:
        """Summarize fetched page text with the LLM and format the tool answer."""
        # Keep at most MAX_WEBPAGE tokens of the page.
        content = tokenizer.decode(tokenizer.encode(content)[:MAX_WEBPAGE])
        messages = [{"role":"user","content": extractor_prompt.format(webpage_content=content, goal=goal)}]
        payload = {
            "model": "YOUR_SUMMARY_MODEL",
            "messages": messages,
        }
        raw = self.llm(payload)
        useful_information = "The useful information in '{url}' for user goal '{goal}' as follows: \n\n".format(url=url, goal=goal)

        if raw is None:
            useful_information += "Evidence in page: \n" + "The provided webpage content could not be accessed. Please check the URL or file format." + "\n\n"
            useful_information += "Summary: \n" + "The webpage content could not be processed, and therefore, no information is available." + "\n\n"
        else:
            raw_json = parse_json_output(raw)
            # Only a JSON object can carry the expected keys; a scalar or list
            # reply would otherwise raise on the membership test or indexing.
            if isinstance(raw_json, dict) and 'evidence' in raw_json and 'summary' in raw_json:
                useful_information += "Evidence in page: \n" + str(raw_json["evidence"]) + "\n\n"
                useful_information += "Summary: \n" + str(raw_json["summary"]) + "\n\n"
            else:
                if raw_json is None:
                    print('parse json failed')
                # Fall back to the unparsed model reply.
                useful_information += raw

        summary_message = copy.deepcopy(messages)
        summary_message.append({"role": "assistant", "content": raw})

        return useful_information, summary_message, True

    def llm(self, payload: dict):
        """Send the messages to the chat-completions endpoint and return the reply.

        Args:
            payload: Dict whose ``"messages"`` entry is forwarded. Its
                ``"model"`` entry is ignored; ``LLM_NAME`` is always used.

        Returns:
            The content of the first choice, or None when the provider reports
            "Provider returned error" or all three attempts fail.
        """
        url_llm = f"{URL}/chat/completions"
        headers = {
            'Content-Type': 'application/json',
            'Authorization': KEY
        }

        body = json.dumps({
            "model": LLM_NAME,
            "messages": payload["messages"],
            "temperature": 0.7,
            "top_p": 0.8,
            "max_tokens": 16000,
            "presence_penalty": 1.5,
            "chat_template_kwargs": {
                "enable_thinking": False
            }
        })

        for _ in range(3):
            # Reset per attempt so a failure is never reported with the
            # response of an earlier attempt.
            response = None
            try:
                response = requests.request("POST", url_llm, headers=headers, data=body, timeout=600)
                data = response.json()
                if "error" in data and data["error"]["message"] == "Provider returned error":
                    print("error in message")
                    return None
                return data['choices'][0]['message']['content']
            except Exception as e:
                # The request itself may have raised (timeout, connection
                # error); in that case there is no response body to print.
                print(response.text if response is not None else repr(e))
        return None

    def jina_readpage(self, url: str) -> str:
        """
        Read webpage content using Jina service.

        Args:
            url: The URL to read

        Returns:
            str: The webpage content, or "[visit] Failed to read page." after
            three unsuccessful attempts
        """
        headers = {
            "Authorization": f"Bearer {JINA_API_KEY}",
        }
        max_retries = 3
        timeout = 40

        for attempt in range(max_retries):
            try:
                response = requests.get(
                    f"https://r.jina.ai/{url}",
                    headers=headers,
                    timeout=timeout
                )
                if response.status_code == 200:
                    return response.text
                # Non-200 answers are logged and retried.
                print(response.text)
            except Exception as e:
                print(f"[visit] jina request for {url} failed: {e}")
            if attempt < max_retries - 1:
                # Short pause between attempts; pointless after the last one.
                time.sleep(0.1)

        return "[visit] Failed to read page."

    def idp_readpage(self, url: str) -> str:
        """
        Read document content by downloading the URL and parsing it as a PDF.

        Args:
            url: The URL to parse

        Returns:
            str: The parsed content, "[visit] Failed to download <url>." when
            the download yields no file, or "[visit] Failed to read page."
            after three attempts that raised
        """
        max_retries = 3

        for _ in range(max_retries):
            try:
                path = download_pdf(url)
                if path is None:
                    # A missing download is reported immediately, without retrying.
                    print(f"[visit] Failed to download {url}.")
                    return f"[visit] Failed to download {url}."
                return parse_pdf(path)
            except Exception as e:
                print(f"[visit] pdf parsing for {url} failed: {e}")

        return "[visit] Failed to read page."
                
