#!/usr/bin/env python3
"""
Check PDF file health status, identify corrupted files
"""
import os
import sys
from pathlib import Path
import pdfplumber
import PyPDF2

def check_pdf_with_pdfplumber(pdf_path: Path) -> tuple[bool, str]:
    """Probe a PDF by opening it with pdfplumber.

    The file is opened, its page count is read and text extraction is
    attempted on the first page only; later pages are not touched.

    Args:
        pdf_path: Location of the PDF to probe.

    Returns:
        A ``(ok, message)`` pair. ``ok`` is True when the file opened, had at
        least one page and the first-page extraction did not raise. Otherwise
        ``ok`` is False and ``message`` describes the problem.
    """
    try:
        with pdfplumber.open(pdf_path) as document:
            page_count = len(document.pages)
            if page_count == 0:
                return False, "PDF has no pages"
            # The extracted text is irrelevant here; the call only has to
            # complete without raising.
            document.pages[0].extract_text()
            return True, f"Normal, {page_count} pages total"
    except Exception as exc:
        # Any failure, whatever its type, is reported as an unhealthy file.
        return False, "pdfplumber error: " + str(exc)

def check_pdf_with_pypdf2(pdf_path: Path) -> tuple[bool, str]:
    """Probe a PDF by parsing it with PyPDF2.

    The file is opened in binary mode, handed to ``PyPDF2.PdfReader``, its
    page count is read and text extraction is attempted on the first page
    only.

    Args:
        pdf_path: Location of the PDF to probe.

    Returns:
        A ``(ok, message)`` pair. ``ok`` is True when the file parsed, had at
        least one page and the first-page extraction did not raise. Otherwise
        ``ok`` is False and ``message`` describes the problem.
    """
    try:
        with open(pdf_path, 'rb') as stream:
            parsed = PyPDF2.PdfReader(stream)
            page_count = len(parsed.pages)
            if page_count == 0:
                return False, "PDF has no pages"
            # The extracted text is irrelevant here; the call only has to
            # complete without raising.
            parsed.pages[0].extract_text()
            return True, f"Normal, {page_count} pages total"
    except Exception as exc:
        # Covers I/O errors from open() as well as parser failures.
        return False, "PyPDF2 error: " + str(exc)

def check_pdf_file(pdf_path: Path, min_size: int = 1024) -> dict:
    """Run every available health check against one PDF file.

    Args:
        pdf_path: Location of the PDF to check.
        min_size: Files smaller than this many bytes are reported as broken
            without being parsed. Defaults to 1024 (1 KB).

    Returns:
        A dict with these keys:

        * ``"path"``: ``str(pdf_path)``.
        * ``"size"``: file size in bytes (0 when the size cannot be read).
        * ``"pdfplumber_ok"``: True when the pdfplumber probe succeeded.
        * ``"pypdf2_ok"``: True when the PyPDF2 probe succeeded.
        * ``"error_msg"``: ``"Normal"`` when both probes succeeded, otherwise
          a description of what failed.
    """
    result = {
        "path": str(pdf_path),
        "size": 0,
        "pdfplumber_ok": False,
        "pypdf2_ok": False,
        "error_msg": "",
    }

    # A file can vanish or be unreadable (broken symlink, permissions) between
    # being listed and being checked; report that as a broken file instead of
    # letting the exception abort the caller's whole scan.
    try:
        result["size"] = pdf_path.stat().st_size
    except OSError as exc:
        result["error_msg"] = f"Cannot stat file: {exc}"
        return result

    # Very small files are treated as broken without parsing them.
    if result["size"] < min_size:
        result["error_msg"] = f"File too small ({result['size']} bytes)"
        return result

    # Run both probes unconditionally so each flag reflects its own library.
    pdfplumber_ok, pdfplumber_msg = check_pdf_with_pdfplumber(pdf_path)
    result["pdfplumber_ok"] = pdfplumber_ok

    pypdf2_ok, pypdf2_msg = check_pdf_with_pypdf2(pdf_path)
    result["pypdf2_ok"] = pypdf2_ok

    if not pdfplumber_ok and not pypdf2_ok:
        result["error_msg"] = f"Both methods failed: {pdfplumber_msg}; {pypdf2_msg}"
    elif not pdfplumber_ok:
        result["error_msg"] = pdfplumber_msg
    elif not pypdf2_ok:
        result["error_msg"] = pypdf2_msg
    else:
        result["error_msg"] = "Normal"

    return result

def _unique_destination(directory: Path, filename: str) -> Path:
    """Return a path for *filename* inside *directory* that does not exist yet."""
    candidate = directory / filename
    if not candidate.exists():
        return candidate
    original = Path(filename)
    counter = 1
    while True:
        candidate = directory / f"{original.stem}_{counter}{original.suffix}"
        if not candidate.exists():
            return candidate
        counter += 1

def main():
    """Scan a directory tree for PDFs, report broken ones and offer to move them.

    Every ``*.pdf`` file under ``pdf_dir`` is checked with ``check_pdf_file``.
    A file counts as healthy when at least one of the two probes succeeded.
    After printing a summary, the user is asked whether the broken files
    should be moved into a ``broken_pdfs`` directory next to ``pdf_dir``.

    Exits with status 1 when ``pdf_dir`` does not exist.
    """
    pdf_dir = Path("")

    if not pdf_dir.exists():
        print(f"Directory does not exist: {pdf_dir}")
        sys.exit(1)

    broken_dir = pdf_dir.parent / "broken_pdfs"

    # When pdf_dir is "." or a filesystem root, pdf_dir.parent is pdf_dir
    # itself, so broken_dir lies inside the scanned tree; skip anything
    # already quarantined there. Directories that happen to be named "*.pdf"
    # are skipped as well so they are never moved as if they were files.
    pdf_files = [
        path
        for path in pdf_dir.rglob("*.pdf")
        if path.is_file() and broken_dir not in path.parents
    ]
    print(f"Found {len(pdf_files)} PDF files")

    healthy_files = []
    broken_files = []

    for i, pdf_path in enumerate(pdf_files, 1):
        # Flush so the file name is visible while the (slow) check runs.
        print(f"[{i}/{len(pdf_files)}] Checking {pdf_path.name}...", end=" ", flush=True)

        result = check_pdf_file(pdf_path)

        if result["pdfplumber_ok"] or result["pypdf2_ok"]:
            healthy_files.append(result)
            print("✓")
        else:
            broken_files.append(result)
            print(f"✗ {result['error_msg']}")

    print("\n=== Summary ===")
    print(f"Healthy files: {len(healthy_files)}")
    print(f"Broken files: {len(broken_files)}")

    if not broken_files:
        return

    print("\n=== Broken files list ===")
    for broken in broken_files:
        print(f"{Path(broken['path']).name}: {broken['error_msg']}")

    # Optional: move broken files to a separate directory.
    try:
        choice = input(f"\nMove {len(broken_files)} broken files to {broken_dir}? (y/N): ")
    except EOFError:
        # No interactive stdin: fall back to the default answer (no).
        choice = ""
    if choice.strip().lower() != 'y':
        return

    broken_dir.mkdir(exist_ok=True)
    for broken in broken_files:
        src = Path(broken['path'])
        # Files from different subdirectories can share a name; a plain
        # rename would silently overwrite the earlier one on POSIX.
        dst = _unique_destination(broken_dir, src.name)
        try:
            src.rename(dst)
            print(f"Moved: {src.name} -> {dst}")
        except OSError as e:
            print(f"Failed to move {src.name}: {e}")

# Run the scan only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
