#!/usr/bin/env python3
"""
Windows-1252 to UTF-8 CSV Converter
Converts CSV files from Windows-1252 encoding to UTF-8 with smart character replacements.
"""

import argparse
import csv
import logging
import os
import shutil
import sys
from datetime import datetime
from pathlib import Path


class EncodingConverter:
    """Clean up text that was decoded as Windows-1252 and keep statistics.

    The pipeline (see ``convert_text``) has two stages:

    1. ``convert_windows1252_to_utf8`` repairs known mojibake sequences, i.e.
       UTF-8 byte sequences that were read as Windows-1252 characters.
    2. ``apply_smart_replacements`` maps typographic characters (curly quotes,
       dashes, ellipsis, non-breaking space) to ASCII equivalents.

    ``conversion_stats`` accumulates over every call made on the instance.
    Quote-like and invisible characters are written as ``\\u`` escapes so the
    tables cannot be silently damaged by editors or encoding round-trips.
    """

    # (garbled sequence, replacement) pairs, applied in order.  Longer
    # sequences must come before any sequence that is a prefix of them,
    # otherwise the prefix is rewritten first and the longer one never matches.
    _MOJIBAKE_FIXES = (
        # U+00C2 (from a stray 0xC2 lead byte) followed by a Windows-1252 char.
        ("\u00c2\u2019", "'"),  # + right single quotation mark
        ("\u00c2\u2018", "'"),  # + left single quotation mark
        ("\u00c2\u201d", '"'),  # + right double quotation mark
        ("\u00c2\u201c", '"'),  # + left double quotation mark
        ("\u00c2\u2026", "..."),  # + horizontal ellipsis
        ("\u00c2\u2013", "-"),  # + en dash
        ("\u00c2\u2014", "-"),  # + em dash
        ("\u00c2\u2122", "\u2122"),  # + trade mark sign (kept as trade mark)
        ("\u00c2\u00a0", " "),  # + non-breaking space
        # UTF-8 bytes E2 80 xx read as three Windows-1252 characters.
        ("\u00e2\u20ac\u2122", "'"),  # E2 80 99: right single quotation mark
        ("\u00e2\u20ac\u02dc", "'"),  # E2 80 98: left single quotation mark
        ("\u00e2\u20ac\u0153", '"'),  # E2 80 9C: left double quotation mark
        ("\u00e2\u20ac\u201c", "-"),  # E2 80 93: en dash
        ("\u00e2\u20ac\u201d", "-"),  # E2 80 94: em dash
        ("\u00e2\u20ac\u00a6", "..."),  # E2 80 A6: horizontal ellipsis
        # Bare two-character prefix, treated as a double quote.  It is a
        # prefix of every E2 80 xx entry above, so it has to stay last.
        ("\u00e2\u20ac", '"'),
    )

    def __init__(self):
        # Running totals, updated by convert_text() and the two stages.
        self.conversion_stats = {
            "total_chars": 0,
            "converted_chars": 0,
            "replaced_chars": 0,
            "conversions": {},
        }

        # Windows-1252 to Unicode mapping for problematic characters
        self.windows1252_unicode_map = {
            0x80: "€",  # Euro sign
            0x82: "‚",  # Single low-9 quotation mark
            0x83: "ƒ",  # Latin small letter f with hook
            0x84: "„",  # Double low-9 quotation mark
            0x85: "…",  # Horizontal ellipsis
            0x86: "†",  # Dagger
            0x87: "‡",  # Double dagger
            0x88: "ˆ",  # Modifier letter circumflex accent
            0x89: "‰",  # Per mille sign
            0x8A: "Š",  # Latin capital letter S with caron
            0x8B: "‹",  # Single left-pointing angle quotation mark
            0x8C: "Œ",  # Latin capital ligature OE
            0x8E: "Ž",  # Latin capital letter Z with caron
            0x91: "\u2018",  # Left single quotation mark
            0x92: "\u2019",  # Right single quotation mark
            0x93: "\u201c",  # Left double quotation mark
            0x94: "\u201d",  # Right double quotation mark
            0x95: "•",  # Bullet
            0x96: "–",  # En dash
            0x97: "—",  # Em dash
            0x98: "˜",  # Small tilde
            0x99: "™",  # Trade mark sign
            0x9A: "š",  # Latin small letter s with caron
            0x9B: "›",  # Single right-pointing angle quotation mark
            0x9C: "œ",  # Latin small ligature oe
            0x9E: "ž",  # Latin small letter z with caron
            0x9F: "Ÿ",  # Latin capital letter Y with diaeresis
            0xA0: "\u00a0",  # Non-breaking space
        }

        # Smart replacements for typographic characters to ASCII equivalents
        self.smart_replacements = {
            # Smart quotes to standard quotes
            "\u2018": "'",  # Left single quotation mark
            "\u2019": "'",  # Right single quotation mark
            "\u201c": '"',  # Left double quotation mark
            "\u201d": '"',  # Right double quotation mark
            "\u201a": "'",  # Single low-9 quotation mark
            "\u201e": '"',  # Double low-9 quotation mark
            # Dashes to hyphens
            "\u2013": "-",  # En dash
            "\u2014": "-",  # Em dash
            # Ellipsis to three dots
            "\u2026": "...",  # Horizontal ellipsis
            # Non-breaking space to regular space
            "\u00a0": " ",  # Non-breaking space
        }

    def convert_windows1252_to_utf8(self, text):
        """Repair known mojibake sequences in already-decoded *text*.

        Every occurrence of a sequence listed in ``_MOJIBAKE_FIXES`` is
        replaced; each occurrence adds one to ``converted_chars`` and is
        tallied under ``conversions``.  Empty or ``None`` input is returned
        unchanged.
        """
        if not text:
            return text

        result = text
        for bad_seq, good_char in self._MOJIBAKE_FIXES:
            count = result.count(bad_seq)
            if count:
                result = result.replace(bad_seq, good_char)
                self.conversion_stats["converted_chars"] += count
                self._record_conversion(bad_seq, good_char, count)

        return result

    def apply_smart_replacements(self, text):
        """Replace typographic characters in *text* with ASCII equivalents.

        Uses ``self.smart_replacements``; each replaced occurrence adds one to
        ``replaced_chars`` and is tallied under ``conversions``.  Empty or
        ``None`` input is returned unchanged.
        """
        if not text:
            return text

        result = text
        for unicode_char, ascii_replacement in self.smart_replacements.items():
            count = result.count(unicode_char)
            if count:
                result = result.replace(unicode_char, ascii_replacement)
                self.conversion_stats["replaced_chars"] += count
                self._record_conversion(unicode_char, ascii_replacement, count)

        return result

    def _record_conversion(self, from_char, to_char, count=1):
        """Add *count* to the tally kept for the ``from_char → to_char`` pair."""
        conversions = self.conversion_stats["conversions"]
        conversion_key = f"{from_char} → {to_char}"
        conversions[conversion_key] = conversions.get(conversion_key, 0) + count

    def convert_text(self, text):
        """Run the full pipeline on *text* and return the cleaned string.

        Adds ``len(text)`` to ``total_chars``, repairs mojibake, then applies
        the ASCII replacements.  Empty or ``None`` input is returned unchanged
        and is not counted.
        """
        if not text:
            return text

        self.conversion_stats["total_chars"] += len(text)

        # Stage 1: repair mojibake first so the characters it may expose or
        # produce are seen by stage 2.
        utf8_text = self.convert_windows1252_to_utf8(text)

        # Stage 2: typographic characters to ASCII.
        return self.apply_smart_replacements(utf8_text)


def setup_logging(log_file=None):
    """Configure the root logger to emit INFO and above.

    Records always go to stdout; when *log_file* is given they are also
    written to that file.  The file is opened as UTF-8 because the messages
    this script logs contain non-ASCII characters (for example the arrow in
    the conversion report) that a legacy locale encoding such as cp1252
    cannot represent.

    Note: ``logging.basicConfig`` does nothing if the root logger already has
    handlers configured.
    """
    handlers = [logging.StreamHandler(sys.stdout)]
    if log_file:
        # Keep the file handler first, ahead of the console handler.
        handlers.insert(0, logging.FileHandler(log_file, encoding="utf-8"))

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
        handlers=handlers,
    )


def create_backup(file_path):
    """Copy *file_path* to ``<file_path>.bak`` and return the backup's path.

    The copy is made with ``shutil.copy2``.  A failure is logged and the
    original exception is re-raised to the caller.
    """
    backup_path = f"{file_path}.bak"

    try:
        shutil.copy2(file_path, backup_path)
    except Exception as e:
        logging.error(f"Failed to create backup: {e}")
        raise
    else:
        logging.info(f"Backup created: {backup_path}")

    return backup_path


class _ExcelWithEscape(csv.excel):
    """Excel dialect with a backslash escape character (sniffing fallback)."""

    # Defined on a private subclass so the shared ``csv.excel`` class is
    # never modified.
    escapechar = "\\"


def convert_csv_file(input_file, output_file, converter):
    """Convert a CSV file from Windows-1252 to UTF-8.

    The input is decoded as Windows-1252 and its dialect is sniffed from the
    first 1024 characters, falling back to the Excel dialect when sniffing
    fails.  Every field is passed through ``converter.convert_text`` and the
    rows are written to *output_file* as UTF-8 with minimal quoting.

    Args:
        input_file: Path of the CSV file to read.
        output_file: Path of the CSV file to write; must not be the input.
        converter: Object providing ``convert_text(str) -> str``.

    Returns:
        The number of rows written.

    Raises:
        ValueError: If *input_file* and *output_file* are the same file.
        Exception: Any other error raised while reading or writing is logged
            and re-raised unchanged.
    """
    rows_processed = 0

    try:
        # Opening the output truncates it, so writing onto the input would
        # destroy the data before it is read.  samefile() raises OSError when
        # a path does not exist yet, which means they cannot be the same file.
        try:
            same_file = os.path.samefile(input_file, output_file)
        except OSError:
            same_file = False
        if same_file:
            raise ValueError(
                "Input and output paths refer to the same file; "
                "refusing to overwrite the input."
            )

        # Read with Windows-1252 encoding
        with open(input_file, "r", encoding="windows-1252", newline="") as infile:
            sample = infile.read(1024)
            infile.seek(0)

            try:
                dialect = csv.Sniffer().sniff(sample)
            except csv.Error:
                # Fallback to the Excel dialect if sniffing fails
                dialect = _ExcelWithEscape
            else:
                # The sniffed dialect is a fresh class, so setting the
                # attribute here does not leak into other readers.
                if getattr(dialect, "escapechar", None) is None:
                    dialect.escapechar = "\\"

            # NOTE(review): with an escapechar set, the reader treats a
            # backslash in the input as an escape and drops it.  Confirm the
            # source files never contain literal backslashes (e.g. paths).
            reader = csv.reader(infile, dialect=dialect)

            # Write with UTF-8 encoding using the default (Excel) dialect
            with open(output_file, "w", encoding="utf-8", newline="") as outfile:
                writer = csv.writer(outfile, quoting=csv.QUOTE_MINIMAL, escapechar="\\")

                for row in reader:
                    writer.writerow([converter.convert_text(field) for field in row])
                    rows_processed += 1

                    if rows_processed % 1000 == 0:
                        logging.info(f"Processed {rows_processed} rows...")

    except Exception as e:
        logging.error(f"Error processing CSV file: {e}")
        raise

    return rows_processed


def generate_report(
    converter, input_file, output_file, rows_processed, processing_time
):
    """Build the plain-text summary of a finished conversion.

    The report lists the file paths, elapsed time, row count, the character
    totals from ``converter.conversion_stats`` and one line per recorded
    conversion (sorted by key), or a placeholder line when there were none.
    The text is returned, not written anywhere.
    """
    stats = converter.conversion_stats

    header = f"""
CONVERSION REPORT
================
Input file: {input_file}
Output file: {output_file}
Processing time: {processing_time:.2f} seconds
Rows processed: {rows_processed:,}

CHARACTER STATISTICS:
Total characters processed: {stats['total_chars']:,}
Characters converted from Windows-1252: {stats['converted_chars']:,}
Characters replaced with ASCII equivalents: {stats['replaced_chars']:,}

CONVERSION DETAILS:
"""

    tallies = stats["conversions"]
    if not tallies:
        return header + "  No character conversions needed.\n"

    detail_lines = [
        f"  {conversion}: {count:,} times\n"
        for conversion, count in sorted(tallies.items())
    ]
    return header + "".join(detail_lines)


def main():
    """Parse the command line, run the conversion and write the log and report.

    Steps: validate the input path, choose the output and log paths, set up
    logging, back up the input (unless ``--no-backup``), convert the file,
    then log the report and save it next to the output as
    ``<output stem>_report.txt``.

    Exits with status 1 when the input file does not exist or when any step
    of the conversion raises.
    """
    parser = argparse.ArgumentParser(
        description="Convert CSV files from Windows-1252 to UTF-8 encoding with smart character replacements"
    )
    parser.add_argument("input_file", help="Input CSV file path")
    parser.add_argument(
        "-o", "--output", help="Output file path (default: input_file_converted.csv)"
    )
    # The help text names the real default, which is timestamped (see below).
    parser.add_argument(
        "-l",
        "--log",
        help="Log file path (default: conversion_YYYYMMDD_HHMMSS.log)",
    )
    parser.add_argument(
        "--no-backup", action="store_true", help="Skip creating backup file"
    )

    args = parser.parse_args()

    # Validate input file
    input_path = Path(args.input_file)
    if not input_path.exists():
        print(f"Error: Input file '{args.input_file}' does not exist.")
        sys.exit(1)

    if input_path.suffix.lower() != ".csv":
        print(f"Warning: Input file '{args.input_file}' does not have .csv extension.")

    # Set up output file path
    if args.output:
        output_path = Path(args.output)
    else:
        output_path = input_path.parent / f"{input_path.stem}_converted.csv"

    # Set up logging; without --log each run gets its own timestamped file
    # in the current working directory.
    log_file = (
        args.log or f"conversion_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
    )
    setup_logging(log_file)

    logging.info(f"Starting conversion: {input_path} → {output_path}")

    try:
        # Create backup
        if not args.no_backup:
            create_backup(input_path)

        # Initialize converter
        converter = EncodingConverter()

        # Process file
        start_time = datetime.now()
        rows_processed = convert_csv_file(input_path, output_path, converter)
        end_time = datetime.now()
        processing_time = (end_time - start_time).total_seconds()

        # Generate and display report
        report = generate_report(
            converter, input_path, output_path, rows_processed, processing_time
        )
        logging.info(report)

        # Write report to file
        report_file = output_path.parent / f"{output_path.stem}_report.txt"
        with open(report_file, "w", encoding="utf-8") as f:
            f.write(report)

        logging.info("Conversion completed successfully!")
        logging.info(f"Output file: {output_path}")
        logging.info(f"Report file: {report_file}")

    except Exception as e:
        # Top-level boundary: report the failure and signal it via exit code.
        logging.error(f"Conversion failed: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()
