#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Wikipedia API工具
"""

import logging
import time
from typing import Dict, Any, Optional
from urllib.parse import quote, unquote

import requests

class WikipediaAPI:
    """Client for the English Wikipedia REST API page-summary endpoint.

    Every request is a plain ``requests.get`` against
    ``{base_url}/page/summary/{title}`` using the configured headers and
    timeout. Failed attempts are retried up to ``max_retries`` times with
    exponential backoff (``retry_delay * 2 ** attempt`` seconds).
    """

    # Extra seconds added on top of the exponential backoff after an HTTP 429.
    _RATE_LIMIT_EXTRA_DELAY = 10.0

    def __init__(self, timeout: int = 10, max_retries: int = 5, retry_delay: float = 1.0):
        """Store the request configuration.

        Args:
            timeout: Per-request timeout in seconds, passed to ``requests.get``.
            max_retries: Total number of attempts per call. A value ``<= 0``
                means no request is made at all and the call reports failure.
            retry_delay: Base delay in seconds for the exponential backoff.
        """
        self.base_url = "https://en.wikipedia.org/api/rest_v1"
        # A descriptive User-Agent is sent with every request.
        self.headers = {
            'User-Agent': 'LLM_KG_Pipeline/1.0 (https://example.com/contact) requests/2.28.0'
        }
        self.timeout = timeout
        self.max_retries = max_retries
        self.retry_delay = retry_delay
        self.logger = logging.getLogger(__name__)

    def _summary_url(self, title: str) -> str:
        """Return the summary URL for *title* with the title safely encoded.

        Spaces become underscores and every reserved character (including
        ``/``, ``?`` and ``#``) is percent-encoded so the title always stays a
        single path segment.
        """
        # Decode first so that callers who already percent-encoded the title
        # are not double-encoded. MediaWiki rejects titles containing literal
        # %XX sequences, so this is a no-op for valid raw titles.
        normalized = unquote(title).strip().replace(" ", "_")
        return f"{self.base_url}/page/summary/{quote(normalized, safe='')}"

    def _backoff_delay(self, attempt: int, extra: float = 0.0) -> float:
        """Return the wait in seconds after the zero-based *attempt* failed."""
        return self.retry_delay * (2 ** attempt) + extra

    def _wait_before_retry(self, attempt: int, extra: float = 0.0) -> None:
        """Sleep for the backoff delay unless *attempt* was the last one."""
        if attempt >= self.max_retries - 1:
            # No further attempt follows, so sleeping would only add latency.
            return
        wait_time = self._backoff_delay(attempt, extra)
        self.logger.debug(
            "等待 %.1f 秒后重试 (尝试 %d/%d)", wait_time, attempt + 1, self.max_retries
        )
        time.sleep(wait_time)

    def get_page_summary(self, title: str) -> Optional[str]:
        """Fetch the plain-text summary (``extract``) of a page.

        Args:
            title: Page title; raw or already percent-encoded.

        Returns:
            The ``extract`` field of the JSON response (``''`` when the field
            is missing), or ``None`` when the title is empty, the page does
            not exist (HTTP 404), or every attempt failed.
        """
        if not title or not title.strip():
            self.logger.debug("Wikipedia 标题为空，跳过请求")
            return None

        url = self._summary_url(title)
        for attempt in range(self.max_retries):
            extra_delay = 0.0
            try:
                response = requests.get(url, headers=self.headers, timeout=self.timeout)
                status = response.status_code

                if status == 200:
                    return response.json().get('extract', '')
                if status == 404:
                    # A missing page is a definitive answer; retrying is pointless.
                    self.logger.debug("Wikipedia 页面不存在: %s", title)
                    return None
                if status == 429:
                    # Rate limited: back off longer than for ordinary failures.
                    extra_delay = self._RATE_LIMIT_EXTRA_DELAY
                    self.logger.warning(
                        "Wikipedia API 限流: %s (尝试 %d/%d)",
                        title, attempt + 1, self.max_retries,
                    )
                else:
                    self.logger.warning("Wikipedia API HTTP错误 %d: %s", status, title)
            except requests.exceptions.Timeout:
                self.logger.warning(
                    "Wikipedia API 超时: %s (尝试 %d/%d)",
                    title, attempt + 1, self.max_retries,
                )
            except requests.exceptions.ConnectionError:
                self.logger.warning(
                    "Wikipedia API 连接错误: %s (尝试 %d/%d)",
                    title, attempt + 1, self.max_retries,
                )
            except Exception as e:
                # Deliberately broad: this lookup is best-effort and must not
                # propagate errors (e.g. a malformed JSON body) to the caller.
                self.logger.error(
                    "Wikipedia API 未知错误: %s, 错误: %s (尝试 %d/%d)",
                    title, e, attempt + 1, self.max_retries,
                )

            self._wait_before_retry(attempt, extra_delay)

        # Every attempt failed.
        self.logger.error(
            "Wikipedia API 获取摘要失败，已重试 %d 次: %s", self.max_retries, title
        )
        return None

    def get_wikipedia_summary(self, title: str) -> Optional[str]:
        """Alias of :meth:`get_page_summary` kept under a second method name."""
        return self.get_page_summary(title)

    def page_exists(self, title: str) -> bool:
        """Report whether the summary endpoint answers HTTP 200 for *title*.

        Transient failures (timeouts, other exceptions, HTTP 429 and 5xx) are
        retried with backoff. Any other status, including 404, is treated as
        "does not exist" immediately.

        Returns:
            ``True`` on HTTP 200; ``False`` for an empty title, a non-retryable
            status, or when every attempt failed.
        """
        if not title or not title.strip():
            return False

        url = self._summary_url(title)
        for attempt in range(self.max_retries):
            extra_delay = 0.0
            try:
                response = requests.get(url, headers=self.headers, timeout=self.timeout)
                status = response.status_code

                if status == 200:
                    return True
                if status != 429 and status < 500:
                    # 404 and other non-transient statuses: definitive answer.
                    return False

                # 429 / 5xx say nothing about existence, so try again.
                if status == 429:
                    extra_delay = self._RATE_LIMIT_EXTRA_DELAY
                self.logger.warning(
                    "检查页面存在性HTTP错误 %d: %s (尝试 %d/%d)",
                    status, title, attempt + 1, self.max_retries,
                )
            except requests.exceptions.Timeout:
                self.logger.warning(
                    "检查页面存在性超时: %s (尝试 %d/%d)",
                    title, attempt + 1, self.max_retries,
                )
            except Exception as e:
                # Best-effort check: never raise to the caller.
                self.logger.warning(
                    "检查页面存在性错误: %s, 错误: %s (尝试 %d/%d)",
                    title, e, attempt + 1, self.max_retries,
                )

            self._wait_before_retry(attempt, extra_delay)

        # All attempts failed; fall back to "does not exist".
        return False
