"""
Filter Python code files to keep only English-language files.
Reads from sample_files/ and writes English files to sample_files_filtered/.
"""

from pathlib import Path
import re
import shutil
from tqdm import tqdm

# langdetect is a hard requirement: without it no file can be classified,
# so abort right away with an installation hint.
try:
    from langdetect import DetectorFactory, detect, LangDetectException
    HAS_LANGDETECT = True
    # langdetect's algorithm is randomized; pin the seed so a given file is
    # classified the same way on every run.
    DetectorFactory.seed = 0
except ImportError:
    HAS_LANGDETECT = False
    print("ERROR: langdetect not installed.")
    print("Install with: pip install langdetect")
    # The builtin `exit` is injected by the `site` module and is missing under
    # `python -S` or in frozen builds; raising SystemExit works everywhere.
    raise SystemExit(1)

# Configuration
# Directory scanned by main() for candidate files (entries matching "file_*").
INPUT_DIR = Path("sample_files")
# Directory that receives copies of the files classified as English.
OUTPUT_DIR = Path("sample_files_filtered")


# Patterns used to pull natural-language text out of source code.
_STRING_RE = re.compile(r'["\']([^"\']{10,})["\']')
_COMMENT_RE = re.compile(r'#\s*(.+)')
_DOCSTRING_RE = re.compile(r'"""(.+?)"""', re.DOTALL)

# Below this many characters of extracted text, langdetect is not consulted.
_MIN_TEXT_LENGTH = 100
# Non-ASCII ratio limits for each decision path.
_SHORT_TEXT_NON_ASCII_LIMIT = 0.1   # extracted text too short for detection
_LONG_TEXT_NON_ASCII_LIMIT = 0.15   # extracted text, checked after detection
_FALLBACK_NON_ASCII_LIMIT = 0.05    # whole file, used when detection fails


def _non_ascii_ratio(text: str) -> float:
    """Return the fraction of characters in *text* outside the ASCII range."""
    if not text:
        return 0.0
    return sum(1 for ch in text if ord(ch) > 127) / len(text)


def is_english_code(file_path: Path) -> bool:
    """
    Detect if code file is primarily English.

    String literals, ``#`` comments and triple-double-quoted docstrings are
    extracted with regular expressions and joined into one text sample
    (code keywords would bias detection toward English).

    Decision rules:
      * fewer than 100 characters of extracted text: English unless more
        than 10% of that text is non-ASCII;
      * otherwise langdetect must report ``'en'`` and at most 15% of the
        extracted text may be non-ASCII;
      * if langdetect fails: English unless more than 5% of the whole file
        is non-ASCII;
      * if the file cannot be opened or read at all: assumed English.

    Args:
        file_path: Path of the file to classify.

    Returns:
        True if the file is considered English, False otherwise.
    """
    try:
        # errors='replace' turns undecodable bytes into U+FFFD, which counts
        # as non-ASCII below. With strict decoding a non-UTF-8 file raised
        # and ended up being accepted as English without any inspection.
        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
            content = f.read()
    except OSError:
        # Unreadable file: keep the permissive default.
        return True

    pieces = (
        _STRING_RE.findall(content)
        + _COMMENT_RE.findall(content)
        + _DOCSTRING_RE.findall(content)
    )
    text_content = ' '.join(pieces)

    # Too little text for reliable detection: rely on the non-ASCII ratio.
    if len(text_content) < _MIN_TEXT_LENGTH:
        return _non_ascii_ratio(text_content) <= _SHORT_TEXT_NON_ASCII_LIMIT

    try:
        lang = detect(text_content)
    except Exception:
        # Best effort (covers LangDetectException, e.g. text without usable
        # features): fall back to the non-ASCII ratio of the whole file.
        return _non_ascii_ratio(content) <= _FALLBACK_NON_ASCII_LIMIT

    # Heavy non-ASCII content is rejected even if detection says English.
    if _non_ascii_ratio(text_content) > _LONG_TEXT_NON_ASCII_LIMIT:
        return False

    return lang == 'en'


def _copy_english_files(files, output_dir, desc):
    """Classify *files* and copy the English ones into *output_dir*.

    Args:
        files: Paths of the files to classify.
        output_dir: Existing directory that receives the English files.
        desc: Label shown on the progress bar.

    Returns:
        A ``(english, non_english)`` tuple of path lists.
    """
    english = []
    non_english = []
    for file_path in tqdm(files, desc=desc, unit="file"):
        if is_english_code(file_path):
            english.append(file_path)
            shutil.copy2(file_path, output_dir / file_path.name)
        else:
            non_english.append(file_path)
    return english, non_english


def main():
    """Filter INPUT_DIR into OUTPUT_DIR, then do the same for test_files/.

    Files named ``file_*`` that are classified as English are copied to the
    output directory; a summary and up to ten excluded file names are
    printed. If a ``test_files`` directory exists, its ``file_*`` entries are
    filtered the same way into ``test_files_filtered``. Returns early, after
    printing an error, when INPUT_DIR is missing or holds no matching files.
    """
    separator = '=' * 60

    # Check input directory
    if not INPUT_DIR.exists():
        print(f"ERROR: Input directory not found: {INPUT_DIR}")
        print("Run fetch_sample_files_s3.py first")
        return

    # glob() also yields directories named file_*; those cannot be read or
    # copied as regular files, so keep regular files only.
    input_files = [p for p in INPUT_DIR.glob("file_*") if p.is_file()]
    if not input_files:
        print(f"ERROR: No files found in {INPUT_DIR}")
        return

    total = len(input_files)
    print(f"Found {total:,} files in {INPUT_DIR}")

    # Create output directory
    OUTPUT_DIR.mkdir(exist_ok=True)
    print(f"Output directory: {OUTPUT_DIR}\n")

    # Filter and copy files
    print("Filtering for English-only files...")
    english_files, non_english_files = _copy_english_files(
        input_files, OUTPUT_DIR, "Detecting language"
    )

    # Print summary (total is non-zero here thanks to the early return above)
    print(f"\n{separator}")
    print("Summary:")
    print(separator)
    print(f"Total files processed: {total:,}")
    print(f"English files (copied): {len(english_files):,} ({len(english_files)/total*100:.1f}%)")
    print(f"Non-English files (excluded): {len(non_english_files):,} ({len(non_english_files)/total*100:.1f}%)")
    print(f"\nOutput: {OUTPUT_DIR.absolute()}/")

    # Show some excluded files
    if non_english_files:
        print("\nSample excluded files (first 10):")
        for excluded in non_english_files[:10]:
            print(f"  - {excluded.name}")

    # Also process test_files if it exists
    test_input_dir = Path("test_files")
    test_output_dir = Path("test_files_filtered")

    if test_input_dir.exists():
        test_files = [p for p in test_input_dir.glob("file_*") if p.is_file()]
        if test_files:
            print(f"\n{separator}")
            print("Also filtering test files...")
            print(separator)

            test_output_dir.mkdir(exist_ok=True)
            english_test, non_english_test = _copy_english_files(
                test_files, test_output_dir, "Detecting language (test)"
            )

            print(f"\nTest files processed: {len(test_files):,}")
            print(f"English test files (copied): {len(english_test):,}")
            print(f"Non-English test files (excluded): {len(non_english_test):,}")
            print(f"Output: {test_output_dir.absolute()}/")

    print(f"\n{separator}")
    print("Done! Update train_tokenizer.py to use:")
    print("  INPUT_DIR = Path('sample_files_filtered')")
    print("  FILTER_NON_ENGLISH = False  # Already filtered")

# Run the filter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
