import tiktoken
from pathlib import Path
from typing import Tuple

def count_tokens(text: str) -> int:
    """Return the number of tokens in ``text``.

    The text is tokenized with the tiktoken encoding registered for the
    "gpt-4" model. If obtaining the encoding or encoding the text raises,
    the count degrades to the number of whitespace-separated words.
    """
    try:
        gpt_encoder = tiktoken.encoding_for_model("gpt-4")
        token_total = len(gpt_encoder.encode(text))
    except Exception:
        # Best-effort fallback: approximate the token count by word count.
        token_total = len(text.split())
    return token_total

def try_decode_file(file_path: Path) -> Tuple[str, str]:
    """
    Read a text file, trying a sequence of encodings until one decodes.

    Encodings are tried from strictest to most permissive so that every
    candidate is actually reachable:

    1. utf-8. If the decoded text starts with a byte-order mark, the mark
       is stripped and the encoding is reported as 'utf-8-sig'.
    2. cp1252, which rejects the few byte values it leaves undefined.
    3. latin-1, which maps every byte to a character and so never fails.

    Args:
        file_path: Path of the file to read.

    Returns:
        (content, encoding_used), where encoding_used is 'utf-8',
        'utf-8-sig', 'cp1252', 'latin-1' or, as a last resort,
        'binary_fallback'.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # latin-1 must stay last: it accepts any byte sequence, so anything
    # listed after it would never be tried.
    encodings = ['utf-8', 'cp1252', 'latin-1']

    for encoding in encodings:
        try:
            with open(file_path, 'r', encoding=encoding) as f:
                content = f.read()
        except UnicodeDecodeError:
            continue
        if encoding == 'utf-8' and content.startswith('\ufeff'):
            # Plain utf-8 decodes a BOM as U+FEFF; drop it so callers do not
            # see an invisible first character.
            return content[1:], 'utf-8-sig'
        return content, encoding

    # Safety net only: latin-1 above is expected to always succeed.
    with open(file_path, 'rb') as f:
        raw_content = f.read()
    return raw_content.decode('utf-8', errors='replace'), 'binary_fallback'

def detect_file_type(file_path: Path) -> str:
    """Classify a file as 'text', 'pdf' or 'unsupported_binary'.

    The extension (case-insensitive) decides first: '.pdf' is 'pdf', known
    text extensions and files without an extension are 'text', and known
    binary extensions are 'unsupported_binary'. For any other extension the
    first 1024 bytes of the file are sampled.

    Args:
        file_path: Path of the file to classify. The file is only opened
            when its extension is not recognised.

    Returns:
        'pdf', 'text' or 'unsupported_binary'. A file that cannot be opened
        or read for content sniffing is reported as 'unsupported_binary'.
    """
    suffix = file_path.suffix.lower()

    # PDF files (special handling)
    if suffix == '.pdf':
        return 'pdf'

    # Text files
    text_extensions = {'.txt', '.py', '.js', '.html', '.css', '.md', '.yml', '.yaml',
                       '.json', '.xml', '.csv', '.tsv', '.log', '.ini', '.cfg', '.conf'}

    # Other binary files that are not supported
    binary_extensions = {'.docx', '.xlsx', '.zip', '.tar', '.gz', '.jpg',
                         '.png', '.gif', '.mp4', '.avi', '.mp3', '.wav'}

    if suffix in text_extensions or suffix == '':
        return 'text'
    if suffix in binary_extensions:
        return 'unsupported_binary'

    # Unknown extension: decide from a sample of the content.
    try:
        with open(file_path, 'rb') as f:
            sample = f.read(1024)
    except (OSError, ValueError):
        # Unreadable file or invalid path. Only I/O-related errors are
        # caught so that KeyboardInterrupt/SystemExit still propagate.
        return 'unsupported_binary'

    if not sample:
        return 'text'  # Empty file, treat as text

    # Simple heuristic: if mostly printable ASCII (plus tab, LF, CR), treat as text
    printable_count = sum(1 for b in sample if 32 <= b <= 126 or b in (9, 10, 13))
    return 'text' if printable_count / len(sample) > 0.8 else 'unsupported_binary'

def extract_pdf_text(file_path: Path) -> str:
    """Extract text from a PDF using pdfplumber.

    Each page that yields text contributes a section headed
    "--- Page N ---" (1-based). A page whose extraction raises contributes
    an error placeholder under the same heading instead of aborting the
    whole document. Pages without text are skipped.

    Args:
        file_path: Path of the PDF file.

    Returns:
        The page sections joined by blank lines, or
        "[No text content found]" if no page produced a section.

    Raises:
        Exception: If pdfplumber is not installed, or if the PDF cannot be
            opened or processed. The underlying error is attached as the
            exception's cause.
    """
    # Import separately so that only a genuinely missing library is reported
    # as "not found", not an ImportError raised during extraction.
    try:
        import pdfplumber
    except ImportError as e:
        raise Exception("pdfplumber library not found. Please install it: pip install pdfplumber") from e

    try:
        text_content = []
        with pdfplumber.open(file_path) as pdf:
            for page_num, page in enumerate(pdf.pages, start=1):
                try:
                    page_text = page.extract_text()
                except Exception as e:
                    # Keep going: one bad page should not lose the others.
                    text_content.append(f"--- Page {page_num} ---\n[Error extracting page: {e}]")
                    continue
                if page_text:
                    text_content.append(f"--- Page {page_num} ---\n{page_text}")
    except Exception as e:
        raise Exception(f"Failed to extract PDF text: {e}") from e

    return "\n\n".join(text_content) if text_content else "[No text content found]"

def truncate_content_by_tokens(content: str, token_limit: int) -> str:
    """Truncate content to fit within token limit.

    Content already within ``token_limit`` is returned unchanged. Otherwise
    whole lines are kept from the start for as long as they fit, optionally
    followed by the leading words of the first line that does not fit, and
    a truncation marker line is appended. The marker's own tokens are
    reserved out of the limit so that appending it does not push the
    result past the limit.

    Token usage is tallied per line and per word with ``count_tokens``, so
    the running total is an estimate of the joined result's token count.

    Args:
        content: Text to truncate.
        token_limit: Maximum number of tokens for the result.

    Returns:
        ``content`` itself if it fits; otherwise the kept prefix followed
        by the marker line. If ``token_limit`` is too small to hold even
        the marker, only the marker is returned.
    """
    if count_tokens(content) <= token_limit:
        return content

    marker = "... [truncated due to token limit]"
    # Budget for real content: the limit minus the marker line that will be
    # appended (including the newline that joins it to the kept text).
    budget = token_limit - count_tokens('\n' + marker)

    tokens_so_far = 0
    truncated_lines = []

    for line in content.split('\n'):
        line_tokens = count_tokens(line + '\n')
        if tokens_so_far + line_tokens > budget:
            # Try to fit a partial line, but only if reasonable space is left
            if budget - tokens_so_far > 10:
                partial_words = []
                for word in line.split():
                    word_tokens = count_tokens(word + " ")
                    if tokens_so_far + word_tokens > budget:
                        break
                    partial_words.append(word)
                    tokens_so_far += word_tokens
                if partial_words:
                    truncated_lines.append(" ".join(partial_words))
            truncated_lines.append(marker)
            break
        truncated_lines.append(line)
        tokens_so_far += line_tokens

    return '\n'.join(truncated_lines)
