#!/usr/bin/env python3
"""
PDF Tools MCP Server
PDF processing tools implemented with the standard MCP protocol.
"""

import os
import io
import tempfile
import json
import asyncio
from typing import Any, Dict, List, Optional, Sequence
from pathlib import Path

# Suppress RuntimeWarning messages attributed to the "pydub" module.
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning, module="pydub")

# Set environment variables intended to disable ONNX Runtime CPU thread
# affinity and to limit OpenMP to a single thread. They are assigned before
# the third-party imports below -- presumably so those libraries see the
# values when they load (TODO confirm which import actually needs them).
os.environ['ORT_DISABLE_CPU_AFFINITY'] = '1'
os.environ['OMP_NUM_THREADS'] = '1'

import PyPDF2
from markitdown import MarkItDown
from mcp.server import Server
from mcp.server.stdio import stdio_server
from mcp.types import Tool, TextContent

# Create the MCP server instance; the handlers below register themselves on
# it through the ``server.list_tools()`` / ``server.call_tool()`` decorators.
server = Server("pdf-tools-mcp")

# Module-level MarkItDown converter, used by ``pdf_to_markdown_impl``.
md = MarkItDown()

@server.list_tools()
async def list_tools() -> List[Tool]:
    """Describe the PDF tools offered by this server.

    Returns:
        One ``Tool`` per operation, each carrying its name, its description
        and the JSON schema of the arguments it accepts.
    """

    def string_arg(description: str) -> Dict[str, Any]:
        # Schema fragment for one string-valued argument.
        return {"type": "string", "description": description}

    def array_arg(item_type: str, description: str) -> Dict[str, Any]:
        # Schema fragment for an array whose items all have ``item_type``.
        return {"type": "array", "items": {"type": item_type}, "description": description}

    def object_schema(properties: Dict[str, Any], required: List[str]) -> Dict[str, Any]:
        # Top-level argument schema; a new dict is built on every call so no
        # two tools end up sharing one.
        return {"type": "object", "properties": properties, "required": required}

    def single_pdf_schema() -> Dict[str, Any]:
        # Schema for the tools whose only argument is the path of one PDF.
        return object_schema({"file_path": string_arg("PDFFilePath")}, ["file_path"])

    merge_schema = object_schema(
        {
            "pdf_files": array_arg("string", "PDFFilePathList"),
            "output_path": string_arg("输出FilePath"),
        },
        ["pdf_files", "output_path"],
    )
    split_schema = object_schema(
        {
            "file_path": string_arg("输入PDFFilePath"),
            "output_dir": string_arg("输出目录"),
            "pages_per_file": {"type": "integer", "description": "每个File的页数", "default": 1},
        },
        ["file_path", "output_dir"],
    )
    extract_schema = object_schema(
        {
            "file_path": string_arg("输入PDFFilePath"),
            "page_numbers": array_arg("integer", "要提取的页码List (从1开始)"),
            "output_path": string_arg("输出FilePath"),
        },
        ["file_path", "page_numbers", "output_path"],
    )

    # (name, description, argument schema) in the order the tools are listed.
    catalog = [
        ("extract_text_from_pdf", "从PDFFile中提取文本内容", single_pdf_schema()),
        ("pdf_to_markdown", "将PDFFile转换为Markdown格式", single_pdf_schema()),
        ("get_pdf_info", "Get/FetchPDFFileInfo/Information", single_pdf_schema()),
        ("merge_pdfs", "合并多个PDFFile", merge_schema),
        ("split_pdf", "拆分PDFFile", split_schema),
        ("extract_pages", "提取PDF的特定页面", extract_schema),
    ]
    return [
        Tool(name=tool_name, description=summary, inputSchema=schema)
        for tool_name, summary, schema in catalog
    ]

@server.call_tool()
async def call_tool(name: str, arguments: Dict[str, Any]) -> Sequence[TextContent]:
    """Run the tool called ``name`` and return its result as JSON text.

    Args:
        name: Tool name, one of those advertised by ``list_tools``.
        arguments: Tool arguments keyed by the names used in the tool's
            input schema. ``None`` is treated as an empty mapping.

    Returns:
        A list with a single ``TextContent`` whose text is the JSON-encoded
        result dict. An unknown tool, a missing required argument, or any
        exception is reported as ``{"status": "error", "message": ...}``
        instead of being raised.
    """

    def as_content(payload: Dict[str, Any]) -> List[TextContent]:
        # Every reply is one text item holding pretty-printed JSON.
        return [TextContent(type="text", text=json.dumps(payload, indent=2, ensure_ascii=False))]

    try:
        # Tolerate a caller that passes no arguments object at all.
        args = arguments or {}

        # Each entry creates its coroutine lazily, so only the selected
        # tool's arguments are looked up.
        handlers = {
            "extract_text_from_pdf": lambda: extract_text_from_pdf_impl(args["file_path"]),
            "pdf_to_markdown": lambda: pdf_to_markdown_impl(args["file_path"]),
            "get_pdf_info": lambda: get_pdf_info_impl(args["file_path"]),
            "merge_pdfs": lambda: merge_pdfs_impl(args["pdf_files"], args["output_path"]),
            "split_pdf": lambda: split_pdf_impl(
                args["file_path"],
                args["output_dir"],
                args.get("pages_per_file", 1),
            ),
            "extract_pages": lambda: extract_pages_impl(
                args["file_path"],
                args["page_numbers"],
                args["output_path"],
            ),
        }

        handler = handlers.get(name)
        if handler is None:
            return as_content({"status": "error", "message": f"未知工具: {name}"})

        try:
            pending = handler()
        except KeyError as missing:
            # str(KeyError) is only the quoted key name, which makes a poor
            # error message; say explicitly that an argument is missing.
            return as_content(
                {"status": "error", "message": f"缺少必需参数: {missing.args[0]}"}
            )

        return as_content(await pending)

    except Exception as e:
        # Last-resort boundary: report the failure to the client as data.
        return as_content({"status": "error", "message": str(e)})

async def extract_text_from_pdf_impl(file_path: str) -> Dict[str, Any]:
    """Extract the text of every page of a PDF.

    Each page's text is preceded by a ``--- 第 N 页 ---`` separator line,
    where ``N`` is the 1-based page number.

    Args:
        file_path: Path of the PDF to read.

    Returns:
        On success, a dict with ``status`` ``"success"``, the combined
        ``text`` (leading/trailing whitespace stripped), ``page_count`` and
        ``file_path``. On failure, a dict with ``status`` ``"error"`` and a
        ``message``; exceptions are not propagated.
    """
    try:
        if not os.path.exists(file_path):
            return {"status": "error", "message": f"File不存在: {file_path}"}

        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            pages = pdf_reader.pages

            # Collect the pieces and join once instead of growing a string
            # with += for every page.
            parts = []
            for page_number, page in enumerate(pages, start=1):
                parts.append(f"\n--- 第 {page_number} 页 ---\n")
                # Defensive: treat a None extraction result as empty text so
                # one page cannot make the whole call fail.
                parts.append(page.extract_text() or "")

            return {
                "status": "success",
                "text": "".join(parts).strip(),
                "page_count": len(pages),
                "file_path": file_path
            }
    except Exception as e:
        return {"status": "error", "message": str(e)}

async def pdf_to_markdown_impl(file_path: str) -> Dict[str, Any]:
    """Convert a PDF to Markdown using the module-level MarkItDown converter.

    Args:
        file_path: Path of the PDF to convert.

    Returns:
        On success, a dict with ``status`` ``"success"``, the ``markdown``
        text, a ``title`` (``"Untitled"`` when the converter reports none)
        and ``file_path``. On failure, a dict with ``status`` ``"error"``
        and a ``message``; exceptions are not propagated.
    """
    try:
        if os.path.exists(file_path):
            converted = md.convert(file_path)

            payload: Dict[str, Any] = {"status": "success"}
            payload["markdown"] = converted.text_content
            payload["title"] = converted.title or "Untitled"
            payload["file_path"] = file_path
            return payload

        return {"status": "error", "message": f"File不存在: {file_path}"}
    except Exception as failure:
        return {"status": "error", "message": str(failure)}

async def get_pdf_info_impl(file_path: str) -> Dict[str, Any]:
    """Report page count, file size and document metadata of a PDF.

    Args:
        file_path: Path of the PDF to inspect.

    Returns:
        On success, a dict with ``status`` ``"success"``, ``file_path``,
        ``page_count``, ``file_size`` (bytes, from ``os.path.getsize``) and
        ``metadata``. ``metadata`` is empty when the reader exposes none;
        otherwise it has the string fields ``title``, ``author``,
        ``subject``, ``creator``, ``producer``, ``creation_date`` and
        ``modification_date`` (``""`` for absent entries). On failure, a
        dict with ``status`` ``"error"`` and a ``message``; exceptions are
        not propagated.
    """

    def as_text(value: Any) -> str:
        # The result is JSON-encoded by the caller, so every field must be a
        # plain string. Metadata values are not guaranteed to be ``str``
        # (NOTE(review): the exact object types depend on the PyPDF2
        # version and on the file -- confirm), so coerce all of them the
        # same way rather than only the two date fields.
        if value is None:
            return ""
        if isinstance(value, bytes):
            return value.decode("utf-8", errors="replace")
        return str(value)

    # Output field name -> key in the PDF document information dictionary.
    metadata_keys = {
        "title": "/Title",
        "author": "/Author",
        "subject": "/Subject",
        "creator": "/Creator",
        "producer": "/Producer",
        "creation_date": "/CreationDate",
        "modification_date": "/ModDate",
    }

    try:
        if not os.path.exists(file_path):
            return {"status": "error", "message": f"File不存在: {file_path}"}

        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)

            info = {
                "status": "success",
                "file_path": file_path,
                "page_count": len(pdf_reader.pages),
                "file_size": os.path.getsize(file_path),
                "metadata": {}
            }

            metadata = pdf_reader.metadata
            if metadata:
                info["metadata"] = {
                    field: as_text(metadata.get(pdf_key, ""))
                    for field, pdf_key in metadata_keys.items()
                }

            return info
    except Exception as e:
        return {"status": "error", "message": str(e)}

async def merge_pdfs_impl(pdf_files: List[str], output_path: str) -> Dict[str, Any]:
    """Concatenate several PDFs, in the given order, into one output file.

    Args:
        pdf_files: Paths of the PDFs to merge; must not be empty.
        output_path: Path the merged PDF is written to.

    Returns:
        On success, a dict with ``status`` ``"success"``, a ``message``,
        ``output_path`` and the ``input_files`` list. On failure (empty
        input list, a missing input file, or any exception), a dict with
        ``status`` ``"error"`` and a ``message``; exceptions are not
        propagated.
    """
    try:
        # An empty list would otherwise write a page-less PDF and report
        # success.
        if not pdf_files:
            return {"status": "error", "message": "未提供要合并的PDFFile"}

        # Check every input before creating anything.
        missing = next((path for path in pdf_files if not os.path.exists(path)), None)
        if missing is not None:
            return {"status": "error", "message": f"File不存在: {missing}"}

        pdf_writer = PyPDF2.PdfWriter()

        for file_path in pdf_files:
            # Read each source fully into memory so its data is still
            # available when the writer serialises the pages below.
            # NOTE(review): depending on the PyPDF2 version, pages added to
            # a writer may keep reading from the reader's stream until
            # write() runs, and the file handle is closed by then.
            with open(file_path, 'rb') as file:
                buffered = io.BytesIO(file.read())
            for page in PyPDF2.PdfReader(buffered).pages:
                pdf_writer.add_page(page)

        # Write the merged document.
        with open(output_path, 'wb') as output_file:
            pdf_writer.write(output_file)

        return {
            "status": "success",
            "message": f"Success合并 {len(pdf_files)} 个PDFFile",
            "output_path": output_path,
            "input_files": pdf_files
        }
    except Exception as e:
        return {"status": "error", "message": str(e)}

async def split_pdf_impl(file_path: str, output_dir: str, pages_per_file: int = 1) -> Dict[str, Any]:
    """Split a PDF into consecutive parts of ``pages_per_file`` pages each.

    Parts are written to ``output_dir`` as ``<input stem>_part_<k>.pdf``
    with ``k`` starting at 1; the last part holds whatever pages remain.

    Args:
        file_path: Path of the PDF to split.
        output_dir: Directory for the parts; created if it does not exist.
        pages_per_file: Number of pages per part; must be an integer >= 1.

    Returns:
        On success, a dict with ``status`` ``"success"``, a ``message``,
        the ``output_files`` paths and ``total_pages``. On failure (missing
        input, invalid ``pages_per_file``, or any exception), a dict with
        ``status`` ``"error"`` and a ``message``; exceptions are not
        propagated.
    """
    try:
        if not os.path.exists(file_path):
            return {"status": "error", "message": f"File不存在: {file_path}"}

        # Reject a non-positive chunk size up front. Otherwise 0 fails inside
        # range() with an unrelated-looking message and a negative value
        # yields an empty range, i.e. "success" with no output files. Doing
        # this before makedirs also avoids creating a directory for a
        # request that cannot succeed.
        if not isinstance(pages_per_file, int) or pages_per_file < 1:
            return {
                "status": "error",
                "message": f"pages_per_file 必须是正整数: {pages_per_file!r}"
            }

        # Create the output directory.
        os.makedirs(output_dir, exist_ok=True)

        # The stem is the same for every part, so compute it once.
        base_name = Path(file_path).stem

        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            total_pages = len(pdf_reader.pages)

            output_files = []

            chunk_starts = range(0, total_pages, pages_per_file)
            for part_number, start in enumerate(chunk_starts, start=1):
                pdf_writer = PyPDF2.PdfWriter()

                # Copy this part's pages; the final part may be shorter.
                stop = min(start + pages_per_file, total_pages)
                for page_index in range(start, stop):
                    pdf_writer.add_page(pdf_reader.pages[page_index])

                output_file = os.path.join(output_dir, f"{base_name}_part_{part_number}.pdf")

                # Write while the source file is still open.
                with open(output_file, 'wb') as out_file:
                    pdf_writer.write(out_file)

                output_files.append(output_file)

        return {
            "status": "success",
            "message": f"Success拆分PDF为 {len(output_files)} 个File",
            "output_files": output_files,
            "total_pages": total_pages
        }
    except Exception as e:
        return {"status": "error", "message": str(e)}

async def extract_pages_impl(file_path: str, page_numbers: List[int], output_path: str) -> Dict[str, Any]:
    """Copy selected pages of a PDF into a new PDF.

    Pages are taken in the order listed in ``page_numbers``. If any page
    number is out of range the call returns an error before the output file
    is opened.

    Args:
        file_path: Path of the source PDF.
        page_numbers: 1-based page numbers to extract.
        output_path: Path the new PDF is written to.

    Returns:
        On success, a dict with ``status`` ``"success"``, a ``message``,
        ``output_path`` and ``extracted_pages``. On failure, a dict with
        ``status`` ``"error"`` and a ``message``; exceptions are not
        propagated.
    """
    try:
        source_exists = os.path.exists(file_path)
        if not source_exists:
            return {"status": "error", "message": f"File不存在: {file_path}"}

        with open(file_path, 'rb') as source:
            reader = PyPDF2.PdfReader(source)
            last_page = len(reader.pages)
            writer = PyPDF2.PdfWriter()

            for wanted in page_numbers:
                # Bail out on the first page number outside 1..last_page.
                if not (1 <= wanted <= last_page):
                    return {"status": "error", "message": f"页码 {wanted} 超出范围 (1-{last_page})"}
                # Callers count from 1; the reader's page list counts from 0.
                writer.add_page(reader.pages[wanted - 1])

            # Write while the source file is still open.
            with open(output_path, 'wb') as target:
                writer.write(target)

        summary: Dict[str, Any] = {"status": "success"}
        summary["message"] = f"Success提取 {len(page_numbers)} 页"
        summary["output_path"] = output_path
        summary["extracted_pages"] = page_numbers
        return summary
    except Exception as failure:
        return {"status": "error", "message": str(failure)}

async def main():
    """Run the MCP server over the streams provided by ``stdio_server``."""
    async with stdio_server() as streams:
        incoming, outgoing = streams
        init_options = server.create_initialization_options()
        await server.run(incoming, outgoing, init_options)

# Start the server only when this file is executed as a script.
if __name__ == "__main__":
    asyncio.run(main()) 