#!/usr/bin/env python3
"""
Date Format Handler
Detects and handles various date/time formats in the database.
"""

import json
import os
import sqlite3
import re
from collections import Counter

def _quote_identifier(name):
    """Return *name* as a double-quoted SQLite identifier (embedded quotes doubled)."""
    return '"' + str(name).replace('"', '""') + '"'


def _dominant_format(samples, format_patterns):
    """Return ``(format_name, match_count)`` for the most frequent format, or None.

    Each sample is stringified, stripped, and credited to the FIRST pattern
    (in dict order) that matches it; samples matching no pattern are ignored.
    """
    format_counts = Counter()
    for sample in samples:
        sample_str = str(sample).strip()
        for format_name, pattern in format_patterns.items():
            if pattern.match(sample_str):
                format_counts[format_name] += 1
                break
    if not format_counts:
        return None
    return format_counts.most_common(1)[0]


def analyze_date_formats(db_path):
    """Analyze all date/time formats in the database.

    Every table listed in ``sqlite_master`` is scanned. A column is treated
    as a date/time candidate when its name contains one of a fixed set of
    keywords (case-insensitive) or its declared type is DATE, DATETIME or
    TIMESTAMP (case-insensitive). Up to 20 distinct non-NULL values are
    sampled per candidate column and matched against known format regexes.

    Args:
        db_path: Path of the SQLite database file to open.

    Returns:
        A dict with the keys ``detected_formats`` (per ``"table.column"``:
        dominant format, up to three sample values, declared column type and
        the fraction of samples matching the dominant format),
        ``conversion_patterns``, ``comparison_rules``, ``parsing_functions``,
        ``common_errors``, ``special_cases`` and ``interpretation_rules``.
        Everything except ``detected_formats`` and ``special_cases`` is
        static advisory data.

    Raises:
        sqlite3.Error: Propagated if the database cannot be opened or queried.
    """
    date_formats = {
        'detected_formats': {},
        'conversion_patterns': {},
        'comparison_rules': [],
        'parsing_functions': {},
        'common_errors': []
    }

    # Date format patterns to check.
    # NOTE: matching stops at the first hit in dict order, and 'dd/mm/yyyy' /
    # 'dd-mm-yyyy' use the same regex as their 'mm' counterparts, so such
    # values are always reported under the 'mm...' name.
    format_patterns = {
        'yyyy-mm-dd': re.compile(r'^\d{4}-\d{2}-\d{2}$'),
        'yyyy/mm/dd': re.compile(r'^\d{4}/\d{2}/\d{2}$'),
        'mm/dd/yyyy': re.compile(r'^\d{1,2}/\d{1,2}/\d{4}$'),
        'dd/mm/yyyy': re.compile(r'^\d{1,2}/\d{1,2}/\d{4}$'),
        'mm-dd-yyyy': re.compile(r'^\d{2}-\d{2}-\d{4}$'),
        'dd-mm-yyyy': re.compile(r'^\d{2}-\d{2}-\d{4}$'),
        'yyyy-mm-dd hh:mm:ss': re.compile(r'^\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2}'),
        'mm/dd/yyyy hh:mm': re.compile(r'^\d{1,2}/\d{1,2}/\d{4}\s+\d{1,2}:\d{2}'),
        'yyyy': re.compile(r'^\d{4}$'),
        'unix_timestamp': re.compile(r'^\d{10,13}$')
    }

    name_keywords = ('date', 'time', 'year', 'month', 'day',
                     'created', 'updated', 'modified')
    date_types = {'DATE', 'DATETIME', 'TIMESTAMP'}

    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()

        # Get all tables
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
        tables = [row[0] for row in cursor.fetchall()]

        # Check each table for date columns
        for table in tables:
            # Quote identifiers so names with spaces or reserved words work.
            quoted_table = _quote_identifier(table)
            cursor.execute(f"PRAGMA table_info({quoted_table})")
            columns = cursor.fetchall()

            for col in columns:
                col_name = col[1]
                col_type = col[2]

                # SQLite keeps the declared type's spelling, so compare
                # case-insensitively ("datetime" must count as DATETIME).
                lowered_name = col_name.lower()
                is_candidate = (
                    any(keyword in lowered_name for keyword in name_keywords)
                    or (col_type or '').upper() in date_types
                )
                if not is_candidate:
                    continue

                # Sample values to detect format
                quoted_col = _quote_identifier(col_name)
                cursor.execute(
                    f"SELECT DISTINCT {quoted_col} FROM {quoted_table} "
                    f"WHERE {quoted_col} IS NOT NULL LIMIT 20"
                )
                samples = [row[0] for row in cursor.fetchall()]
                if not samples:
                    continue

                dominant = _dominant_format(samples, format_patterns)
                if dominant is None:
                    continue

                dominant_format, match_count = dominant
                date_formats['detected_formats'][f"{table}.{col_name}"] = {
                    'format': dominant_format,
                    'samples': samples[:3],
                    'column_type': col_type,
                    'confidence': match_count / len(samples)
                }
    finally:
        # Release the handle even when a query above raises.
        conn.close()

    # Generate conversion patterns for each format
    date_formats['conversion_patterns'] = {
        'yyyy-mm-dd': {
            'description': 'Standard SQL date format',
            'comparison': "Direct string comparison works",
            'extraction': {
                'year': "SUBSTR(date_col, 1, 4)",
                'month': "SUBSTR(date_col, 6, 2)",
                'day': "SUBSTR(date_col, 9, 2)"
            },
            'like_patterns': {
                'specific_date': "'2023-08-15'",
                'month': "'2023-08-%'",
                'year': "'2023-%'"
            }
        },
        'mm/dd/yyyy': {
            'description': 'US date format',
            'comparison': "String comparison unreliable",
            'extraction': {
                'year': "SUBSTR(date_col, -4)",
                'month': "SUBSTR(date_col, 1, INSTR(date_col, '/') - 1)",
                'day': "SUBSTR(date_col, INSTR(date_col, '/') + 1, 2)"
            },
            'like_patterns': {
                'specific_date': "'8/15/2023'",
                'month': "'8/%/2023'",
                'year': "'%/2023'"
            },
            'conversion_to_standard': "SUBSTR(date_col, -4) || '-' || PRINTF('%02d', SUBSTR(date_col, 1, INSTR(date_col, '/') - 1)) || '-' || PRINTF('%02d', SUBSTR(date_col, INSTR(date_col, '/') + 1, 2))"
        },
        'mm/dd/yyyy hh:mm': {
            'description': 'US datetime format',
            'comparison': "Extract date part for date-only comparisons",
            'extraction': {
                'date_part': "SUBSTR(date_col, 1, INSTR(date_col, ' ') - 1)",
                'time_part': "SUBSTR(date_col, INSTR(date_col, ' ') + 1)",
                'year': "SUBSTR(date_col, INSTR(date_col, '/') + INSTR(SUBSTR(date_col, INSTR(date_col, '/') + 1), '/') + 1, 4)"
            },
            'like_patterns': {
                'specific_date': "'8/15/2023 %'",
                'specific_datetime': "'8/15/2023 14:30'",
                'month': "'8/%/2023 %'"
            }
        }
    }

    # Comparison rules based on formats
    date_formats['comparison_rules'] = [
        {
            'format': 'yyyy-mm-dd',
            'rule': 'Direct string comparison works',
            'example': "WHERE date >= '2023-01-01' AND date < '2024-01-01'"
        },
        {
            'format': 'mm/dd/yyyy',
            'rule': 'Use LIKE patterns or parse components',
            'example': "WHERE date LIKE '8/%/2023' for August 2023"
        },
        {
            'format': 'Mixed formats',
            'rule': 'Convert to common format before comparison',
            'example': 'Use DATE() function or string manipulation'
        },
        {
            'format': 'With timestamp',
            'rule': 'Use LIKE for date-only comparisons',
            'example': "WHERE datetime LIKE '2023-08-15%'"
        }
    ]

    # Parsing functions for different requirements
    date_formats['parsing_functions'] = {
        'after_month_year': {
            'description': 'For "after August 2013" type queries',
            'mm/dd/yyyy': "(SUBSTR(date_col, -4) > '2013' OR (SUBSTR(date_col, -4) = '2013' AND CAST(SUBSTR(date_col, 1, INSTR(date_col, '/') - 1) AS INTEGER) > 8))",
            'yyyy-mm-dd': "date_col > '2013-08-31'"
        },
        'in_month_year': {
            'description': 'For "in August 2013" type queries',
            'mm/dd/yyyy': "date_col LIKE '8/%/2013'",
            'yyyy-mm-dd': "date_col LIKE '2013-08-%'"
        },
        'year_only': {
            'description': 'For year-based filtering',
            'mm/dd/yyyy': "SUBSTR(date_col, -4) = '2013'",
            'yyyy-mm-dd': "date_col LIKE '2013-%'"
        },
        'between_dates': {
            'description': 'For date range queries',
            'note': 'Convert both dates to same format as column'
        }
    }

    # Common date-related errors
    date_formats['common_errors'] = [
        {
            'error': 'Wrong date format assumption',
            'example': 'Assuming YYYY-MM-DD when actual is MM/DD/YYYY',
            'fix': 'Check actual format in data samples'
        },
        {
            'error': 'Incorrect "after" interpretation',
            'example': '"after August 2013" treated as "in August 2013"',
            'fix': '"after" means September onwards, not including August'
        },
        {
            'error': 'Timestamp ignored in comparison',
            'example': 'date = "2023-08-15" when column has timestamp',
            'fix': 'Use LIKE "2023-08-15%" for date with timestamp'
        },
        {
            'error': 'String comparison on non-standard format',
            'example': '> comparison on MM/DD/YYYY format',
            'fix': 'Parse year/month/day or convert format'
        },
        {
            'error': 'Wrong date component extraction',
            'example': 'Using wrong SUBSTR positions',
            'fix': 'Account for variable-length months/days (8 vs 08)'
        }
    ]

    # Special handling for specific databases
    date_formats['special_cases'] = []

    # Check for specific date columns that need special handling
    for table_col, format_info in date_formats['detected_formats'].items():
        if format_info['format'] == 'mm/dd/yyyy':
            date_formats['special_cases'].append({
                'column': table_col,
                'warning': 'US date format - careful with comparisons',
                'recommendation': 'Use LIKE patterns or component extraction'
            })
        elif format_info['format'] == 'unix_timestamp':
            date_formats['special_cases'].append({
                'column': table_col,
                'warning': 'Unix timestamp format',
                'recommendation': 'Use datetime() function for conversions'
            })

    # Date interpretation rules
    date_formats['interpretation_rules'] = {
        'after': 'Means AFTER the specified time, not including it',
        'before': 'Means BEFORE the specified time, not including it',
        'in': 'Means DURING the specified period',
        'since': 'Means FROM that point onwards, including it',
        'until': 'Means UP TO that point, including it',
        'between': 'Usually inclusive of both endpoints'
    }

    return date_formats

def main():
    """Run the date-format analysis and persist the report as JSON.

    Reads ``./database.sqlite``, writes the analysis to
    ``./tool_output/date_formats.json`` (creating the directory when it is
    missing), then prints a short summary and up to three special-case
    warnings to stdout.
    """
    database_file = "./database.sqlite"
    report_dir = "./tool_output"
    os.makedirs(report_dir, exist_ok=True)

    # Analyze date formats
    report = analyze_date_formats(database_file)

    # Save results
    report_file = os.path.join(report_dir, "date_formats.json")
    with open(report_file, 'w') as handle:
        json.dump(report, handle, indent=2)

    summary_lines = [
        "Date format analysis complete",
        f"Detected {len(report['detected_formats'])} date/time columns",
        f"Generated {len(report['conversion_patterns'])} conversion patterns",
        f"Identified {len(report['common_errors'])} common errors",
        f"Results saved to {report_file}",
    ]
    for line in summary_lines:
        print(line)

    # Report special cases (only the first three, to keep the output short)
    flagged = report['special_cases']
    if not flagged:
        return
    print("\nWARNING: Special date formats detected:")
    for entry in flagged[:3]:
        print(f"  - {entry['column']}: {entry['warning']}")

# Run the analysis only when this file is executed as a script, so importing
# the module does not trigger any database or file I/O.
if __name__ == "__main__":
    main()