#!/usr/bin/env python3
"""
Mojibake Pattern Scanner
Scans CSV files (cell by cell) and other text files (line by line) for UTF-8
mojibake patterns and generates detailed reports.
"""

import argparse
import csv
import os
import sys
from collections import defaultdict
from pathlib import Path


class MojibakeScanner:
    """Scan text files for common UTF-8 mojibake sequences and report them.

    The built-in patterns are the character sequences left behind when UTF-8
    encoded punctuation (curly quotes, dashes, ellipsis) is decoded as
    Windows-1252, e.g. a right single quote turning into "a-circumflex, euro
    sign, trademark sign".

    Attributes:
        mojibake_patterns: dict mapping each mojibake string to a description.
        scan_results: dict mapping a file path to a list of hit records. Each
            record is a dict with the keys ``pattern``, ``description``,
            ``row``, ``col``, ``contexts`` and ``count`` and describes one
            pattern found in one CSV cell or one text line.
        pattern_counts: dict mapping ``(file_path, pattern)`` to the number
            of occurrences found.
        scanned_files: paths that ``scan_file`` read successfully, in order.
    """

    # Characters of surrounding text kept on each side of a reported match.
    CONTEXT_CHARS = 30
    # Maximum number of context snippets listed per pattern in the report.
    MAX_CONTEXTS_SHOWN = 5

    def __init__(self):
        # Patterns are spelled as code points so this source file stays pure
        # ASCII and cannot itself be damaged by an encoding mix-up.
        patterns_data = [
            ([226, 8364, 8482], "right single quote mojibake"),
            ([226, 8364, 339], "left double quote mojibake"),
            # Two-character prefix of every other 226/8364 pattern; it is only
            # reported where none of the longer patterns matches.
            ([226, 8364], "right double quote mojibake"),
            ([226, 8364, 8220], "en dash mojibake"),
            ([226, 8364, 8221], "em dash mojibake"),
            ([226, 8364, 166], "ellipsis mojibake"),
            ([194], "remaining capital A with circumflex patterns"),
        ]
        self.mojibake_patterns = {
            "".join(chr(code) for code in char_codes): description
            for char_codes, description in patterns_data
        }

        self.scan_results = defaultdict(list)
        self.pattern_counts = defaultdict(int)
        self.scanned_files = []

    def scan_file(self, file_path, encoding="utf-8") -> bool:
        """Scan a single file for mojibake patterns.

        Files whose name ends in ``.csv`` (case-insensitive) are parsed with
        ``csv.reader`` and scanned cell by cell; any other file is scanned
        line by line with the column reported as 0.

        Args:
            file_path: path of the file to scan (``str``, ``bytes`` or
                ``os.PathLike``). Results are keyed by its string form.
            encoding: text encoding used to decode the file.

        Returns:
            True if the whole file was read, False if opening, decoding or
            CSV parsing failed (the error is printed). Hits recorded before a
            mid-file failure are kept in ``scan_results``.
        """
        path = os.fsdecode(file_path)
        print(f"Scanning {path}...")

        try:
            with open(path, "r", encoding=encoding, newline="") as f:
                if path.lower().endswith(".csv"):
                    for row_num, row in enumerate(csv.reader(f), 1):
                        for col_num, field in enumerate(row):
                            self._scan_text(field, path, row_num, col_num)
                else:
                    for line_num, line in enumerate(f, 1):
                        # Drop the line terminator so it never ends up inside
                        # a context snippet and breaks the report layout.
                        self._scan_text(line.rstrip("\r\n"), path, line_num, 0)
        except (OSError, ValueError, LookupError, csv.Error) as e:
            # OSError: unreadable file; ValueError: includes decode errors;
            # LookupError: unknown encoding name; csv.Error: malformed CSV.
            print(f"Error scanning {path}: {e}")
            return False

        if path not in self.scanned_files:
            self.scanned_files.append(path)
        return True

    def _scan_text(self, text, file_path, row_num, col_num):
        """Record every mojibake occurrence found in one cell or line.

        For each pattern present in ``text`` one record is appended to
        ``scan_results[file_path]`` and ``pattern_counts`` is incremented once
        per occurrence. An occurrence of a short pattern is ignored when a
        longer pattern that starts with it matches at the same position, so
        no stretch of text is counted under two patterns.

        Args:
            text: the cell or line content; empty or None is ignored.
            file_path: key under which results are stored.
            row_num: 1-based CSV row or text line number.
            col_num: 0-based CSV column (0 for plain text lines).
        """
        if not text:
            return

        for pattern, description in self.mojibake_patterns.items():
            if pattern not in text:
                continue

            # Longer patterns that extend this one take precedence over it.
            longer_patterns = [
                other
                for other in self.mojibake_patterns
                if len(other) > len(pattern) and other.startswith(pattern)
            ]

            contexts = []
            search_from = 0
            while True:
                pos = text.find(pattern, search_from)
                if pos == -1:
                    break
                search_from = pos + 1

                if any(text.startswith(other, pos) for other in longer_patterns):
                    continue

                match_end = pos + len(pattern)
                context_start = max(0, pos - self.CONTEXT_CHARS)
                context_end = min(len(text), match_end + self.CONTEXT_CHARS)
                # Bracket only the occurrence this snippet was recorded for.
                contexts.append(
                    f"{text[context_start:pos]}[{pattern}]{text[match_end:context_end]}"
                )
                self.pattern_counts[(file_path, pattern)] += 1

            if contexts:
                self.scan_results[file_path].append(
                    {
                        "pattern": pattern,
                        "description": description,
                        "row": row_num,
                        "col": col_num,
                        "contexts": contexts,
                        "count": len(contexts),
                    }
                )

    def scan_multiple_files(self, file_patterns, encoding="utf-8") -> list:
        """Scan every file named by, or matching, the given patterns.

        Arguments containing ``*``, ``?`` or ``[`` are expanded with
        ``glob`` (matches are processed in sorted order); if such an argument
        matches nothing but exists literally it is used as a plain path.
        Other arguments are used as-is when they exist. Non-files are skipped
        and a file reached through several arguments is scanned only once.

        Args:
            file_patterns: iterable of paths or glob patterns.
            encoding: text encoding passed to ``scan_file``.

        Returns:
            The paths that were scanned successfully, in scan order.
        """
        from glob import glob

        scanned_files = []
        seen = set()

        for raw_pattern in file_patterns:
            pattern = os.fsdecode(raw_pattern)
            if any(ch in pattern for ch in "*?["):
                candidates = sorted(glob(pattern))
                if not candidates and os.path.exists(pattern):
                    candidates = [pattern]
            else:
                candidates = [pattern] if os.path.exists(pattern) else []

            for file_path in candidates:
                if not os.path.isfile(file_path):
                    continue
                # Normalised key so "a.csv" and "./a.csv" count as one file.
                identity = os.path.normcase(os.path.abspath(file_path))
                if identity in seen:
                    continue
                seen.add(identity)
                if self.scan_file(file_path, encoding):
                    scanned_files.append(file_path)

        return scanned_files

    @staticmethod
    def _hex_repr(pattern) -> str:
        """Return the pattern's code points as space-separated hex numbers."""
        return " ".join(f"{ord(c):02x}" for c in pattern)

    def generate_report(self, output_file=None) -> str:
        """Build the scan report, then write it to a file or print it.

        The report has a summary, a per-pattern frequency table (most frequent
        first) and, per file with hits, up to ``MAX_CONTEXTS_SHOWN`` context
        snippets for each pattern.

        Args:
            output_file: path to write the report to (UTF-8). When omitted or
                empty the report is printed to standard output instead.

        Returns:
            The full report text.
        """
        files_with_issues = sorted(
            path for path, hits in self.scan_results.items() if hits
        )
        # Clean files never appear in scan_results, so the scanned count has
        # to come from scanned_files; the union also covers hits recorded by
        # calling _scan_text directly.
        files_scanned = set(self.scanned_files).union(files_with_issues)

        pattern_totals = defaultdict(int)
        for (_, pattern), count in self.pattern_counts.items():
            if count:
                pattern_totals[pattern] += count
        total_occurrences = sum(pattern_totals.values())

        report_lines = [
            "=" * 80,
            "MOJIBAKE PATTERN SCAN REPORT",
            "=" * 80,
            "",
            "SUMMARY:",
            f"  Files scanned: {len(files_scanned)}",
            f"  Files with issues: {len(files_with_issues)}",
            f"  Total pattern types found: {len(pattern_totals)}",
            f"  Total pattern occurrences: {total_occurrences}",
            "",
        ]

        # Pattern frequency summary
        if pattern_totals:
            report_lines.append("PATTERN FREQUENCY SUMMARY:")
            for pattern, total_count in sorted(
                pattern_totals.items(), key=lambda x: x[1], reverse=True
            ):
                description = self.mojibake_patterns.get(pattern, "unknown pattern")
                hex_repr = self._hex_repr(pattern)
                report_lines.append(
                    f"  Pattern 0x{hex_repr} ({description}): {total_count} occurrences"
                )
            report_lines.append("")

        # Detailed results by file
        if files_with_issues:
            report_lines.append("DETAILED RESULTS BY FILE:")
            report_lines.append("-" * 40)

            for file_path in files_with_issues:
                report_lines.append(f"\nFile: {file_path}")

                # Merge the per-cell records into one context list per pattern.
                pattern_groups = defaultdict(list)
                for result in self.scan_results[file_path]:
                    pattern_groups[result["pattern"]].extend(result["contexts"])

                for pattern, contexts in pattern_groups.items():
                    description = self.mojibake_patterns.get(pattern, "unknown")
                    hex_repr = self._hex_repr(pattern)
                    report_lines.append(
                        f"  Pattern 0x{hex_repr} ({description}) - {len(contexts)} occurrences"
                    )

                    shown = contexts[: self.MAX_CONTEXTS_SHOWN]
                    for i, context in enumerate(shown, 1):
                        report_lines.append(f"    Context {i}: ...{context}...")

                    hidden = len(contexts) - len(shown)
                    if hidden > 0:
                        report_lines.append(
                            f"    ... and {hidden} more occurrences"
                        )
                    report_lines.append("")
        else:
            report_lines.append("No mojibake patterns found in any scanned files!")

        report_lines.append("=" * 80)

        report_content = "\n".join(report_lines)

        if output_file:
            with open(output_file, "w", encoding="utf-8") as f:
                f.write(report_content)
            print(f"Report saved to: {output_file}")
        else:
            print(report_content)

        return report_content


def main(argv=None):
    """Command-line entry point: scan the given files and print a report.

    Args:
        argv: argument list to parse instead of ``sys.argv[1:]``; the default
            of None makes argparse read the real command line.
    """
    parser = argparse.ArgumentParser(description="Scan CSV files for mojibake patterns")
    parser.add_argument("files", nargs="+", help="Files or file patterns to scan")
    parser.add_argument(
        "--encoding", default="utf-8", help="File encoding (default: utf-8)"
    )
    parser.add_argument("--output", "-o", help="Output report file")
    parser.add_argument(
        "--summary-only", action="store_true", help="Show only summary statistics"
    )

    args = parser.parse_args(argv)

    scanner = MojibakeScanner()
    scanned_files = scanner.scan_multiple_files(args.files, args.encoding)

    print(f"\nScanned {len(scanned_files)} files:")
    for file_path in scanned_files:
        print(f"  - {file_path}")
    print()

    if args.summary_only:
        # Summary mode prints to stdout only; --output is not used here.
        files_with_issues = sum(1 for hits in scanner.scan_results.values() if hits)
        total_occurrences = sum(scanner.pattern_counts.values())

        print("SUMMARY:")
        print(f"  Files with issues: {files_with_issues}")
        print(f"  Total pattern occurrences: {total_occurrences}")
    else:
        scanner.generate_report(args.output)


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
