#!/usr/bin/env python3
"""
Temporal Analyzer
Specialized tool for detecting and documenting date/time formats and patterns.
Addresses specific issues with date/time handling in queries.
"""

import sqlite3
import json
import os
import re
from datetime import datetime
from collections import Counter, defaultdict

def analyze_temporal_data(db_path):
    """Scan a SQLite database for date/time columns and summarize them.

    Args:
        db_path: Path of the SQLite database file to open.

    Returns:
        A dict with the keys ``date_columns``, ``date_formats``,
        ``time_patterns``, ``query_templates``, ``common_filters``,
        ``conversion_rules`` and ``join_patterns``. ``common_filters`` is
        created empty here and is not filled in by this function.

    Raises:
        sqlite3.Error: If the database cannot be opened or the table list
            cannot be read.
    """
    analysis = {
        'date_columns': {},
        'date_formats': {},
        'time_patterns': {},
        'query_templates': {},
        'common_filters': [],
        'conversion_rules': {},
        'join_patterns': []
    }

    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()

        # Get all tables
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
        tables = [row[0] for row in cursor.fetchall()]

        # Analyze each table
        for table in tables:
            analyze_table_dates(cursor, table, analysis)
    finally:
        # Release the connection even when a query or a per-table scan fails.
        conn.close()

    # The remaining steps only post-process the collected metadata and do not
    # need the database connection.
    generate_date_templates(analysis)
    generate_conversion_rules(analysis)
    identify_join_patterns(analysis, tables)

    return analysis

def _quote_identifier(name):
    """Return *name* as a double-quoted SQL identifier, doubling embedded quotes."""
    return '"' + str(name).replace('"', '""') + '"'


def analyze_table_dates(cursor, table, analysis):
    """Record format information for every temporal-looking column of a table.

    Args:
        cursor: Open sqlite3 cursor used to run the metadata and sample queries.
        table: Name of the table to inspect.
        analysis: Result dict updated in place; entries are added to its
            ``date_columns``, ``date_formats`` and ``time_patterns`` mappings,
            keyed by ``"<table>.<column>"``.

    Returns:
        None. Columns whose sample query fails, or that hold no non-NULL
        values, are skipped.
    """
    # Quote identifiers so that names containing spaces, punctuation or
    # reserved words do not break the generated SQL.
    quoted_table = _quote_identifier(table)
    cursor.execute(f"PRAGMA table_info({quoted_table})")
    columns = cursor.fetchall()

    for col in columns:
        col_name = col[1]
        col_type = col[2]

        # Check if this is a date/time column
        if not is_temporal_column(col_name, col_type):
            continue

        quoted_col = _quote_identifier(col_name)
        try:
            # At most 100 distinct non-NULL values are sampled per column.
            cursor.execute(
                f"SELECT DISTINCT {quoted_col} FROM {quoted_table} "
                f"WHERE {quoted_col} IS NOT NULL LIMIT 100"
            )
            samples = [row[0] for row in cursor.fetchall()]
        except sqlite3.Error:
            # Best effort: a column that cannot be queried directly is skipped
            # instead of aborting the whole analysis.
            continue

        if not samples:
            continue

        col_path = f"{table}.{col_name}"
        format_info = detect_date_format(samples)

        analysis['date_columns'][col_path] = {
            'type': col_type,
            'format': format_info['format'],
            'has_time': format_info['has_time'],
            'has_seconds': format_info['has_seconds'],
            'sample': samples[0],
            # Bounded by the LIMIT above, so this is a sample count rather
            # than the true number of distinct values.
            'distinct_dates': len(samples)
        }

        # Group columns by the format they were detected with.
        analysis['date_formats'].setdefault(format_info['format'], []).append(col_path)

        # Time-of-day statistics only make sense when a time part was seen.
        if format_info['has_time']:
            time_patterns = analyze_time_patterns(samples)
            if time_patterns:
                analysis['time_patterns'][col_path] = time_patterns

def is_temporal_column(col_name, col_type):
    """Guess whether a column holds date/time data from its name and type.

    The test is a case-insensitive substring match: the column counts as
    temporal when its name contains one of the name hints or its declared
    type contains one of the type hints. Because matching is by substring,
    a name such as "vendor" also qualifies (it contains "end").

    Args:
        col_name: Column name.
        col_type: Declared column type; may be empty or None.

    Returns:
        True when either the name or the type matches, otherwise False.
    """
    lowered_name = col_name.lower()
    declared_type = col_type.upper() if col_type else ""

    name_hints = (
        'date', 'time', 'year', 'month', 'day', 'hour',
        'minute', 'second', 'timestamp', 'datetime', 'created',
        'updated', 'modified', 'start', 'end', 'birth', 'dob',
    )
    type_hints = ('DATE', 'TIME', 'DATETIME', 'TIMESTAMP')

    # The name is consulted first; the declared type is only a fallback.
    for hint in name_hints:
        if hint in lowered_name:
            return True

    return any(hint in declared_type for hint in type_hints)

def detect_date_format(samples):
    """Vote on the textual date format used by a list of sample values.

    Only the first 20 samples are inspected and falsy samples are ignored.
    Each remaining sample is converted with ``str()`` and credited to the
    first recognizer whose pattern matches, or to 'CUSTOM' when none does.

    Args:
        samples: Sliceable sequence of raw column values.

    Returns:
        A dict with:
          * 'format'      - label with the most votes, or 'UNKNOWN' when no
                            sample was counted;
          * 'has_time'    - True if any counted sample matched a format that
                            carries a time part;
          * 'has_seconds' - True if any counted sample matched a format that
                            carries seconds.
        The two flags reflect every counted sample, not only the winning
        format.
    """
    # (pattern, label, carries a time part, carries seconds) - tried in order.
    recognizers = (
        (r'^\d{1,2}/\d{1,2}/\d{4}$', 'M/D/YYYY', False, False),
        (r'^\d{1,2}/\d{1,2}/\d{4} \d{1,2}:\d{2}$', 'M/D/YYYY H:MM', True, False),
        (r'^\d{1,2}/\d{1,2}/\d{4} \d{1,2}:\d{2}:\d{2}$', 'M/D/YYYY H:MM:SS', True, True),
        (r'^\d{4}-\d{2}-\d{2}$', 'YYYY-MM-DD', False, False),
        (r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$', 'YYYY-MM-DD HH:MM:SS', True, True),
        (r'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}', 'ISO-8601', True, True),
        (r'^\d{1,2}-[A-Za-z]{3}-\d{4}$', 'DD-MON-YYYY', False, False),
    )

    tally = Counter()
    saw_time = False
    saw_seconds = False

    for raw in samples[:20]:
        if not raw:
            continue

        text = str(raw)
        for pattern, label, with_time, with_seconds in recognizers:
            if re.match(pattern, text):
                break
        else:
            # Nothing matched: count the sample under the catch-all label.
            label, with_time, with_seconds = 'CUSTOM', False, False

        tally[label] += 1
        saw_time = saw_time or with_time
        saw_seconds = saw_seconds or with_seconds

    winner = tally.most_common(1)[0][0] if tally else 'UNKNOWN'

    return {
        'format': winner,
        'has_time': saw_time,
        'has_seconds': saw_seconds
    }

def analyze_time_patterns(samples):
    """Summarize the time-of-day components found in datetime samples.

    The first 50 samples are searched for the first ``H:MM`` or ``H:MM:SS``
    occurrence; falsy samples are ignored.

    Args:
        samples: Sliceable sequence of raw column values.

    Returns:
        A dict with:
          * 'has_midnight'   - True if some time had hour 0 and minute 0;
          * 'has_seconds'    - True if some time included a seconds field;
          * 'time_precision' - 'seconds', 'minutes' or 'day';
          * 'common_times'   - up to five most frequent times, formatted
                               as 'HH:MM:SS'.
    """
    time_regex = re.compile(r'(\d{1,2}):(\d{2})(?::(\d{2}))?')
    tally = Counter()
    saw_midnight = False
    saw_seconds = False

    for raw in samples[:50]:
        if not raw:
            continue

        found = time_regex.search(str(raw))
        if found is None:
            continue

        hh, mm, ss = found.groups()
        hour = int(hh)
        minute = int(mm)
        second = int(ss) if ss else 0  # a missing seconds field counts as :00

        tally[f"{hour:02d}:{minute:02d}:{second:02d}"] += 1

        # Midnight is judged on hour and minute only; seconds are ignored.
        if hour == 0 and minute == 0:
            saw_midnight = True
        if ss:
            saw_seconds = True

    if saw_seconds:
        precision = 'seconds'
    elif any(':' in str(value) for value in samples):
        # This fallback looks at every sample, not just the first 50.
        precision = 'minutes'
    else:
        precision = 'day'

    return {
        'has_midnight': saw_midnight,
        'has_seconds': saw_seconds,
        'time_precision': precision,
        'common_times': [stamp for stamp, _ in tally.most_common(5)]
    }

def generate_date_templates(analysis):
    """Build example SQL snippets for every detected date column.

    Reads ``analysis['date_columns']`` and stores the result under
    ``analysis['query_templates']`` as a dict with four sections, each
    keyed by ``"<table>.<column>"``:

      * 'exact_match'     - only for 'M/D/YYYY H:MM' and 'YYYY-MM-DD' columns;
      * 'date_extraction' - for every column;
      * 'date_range'      - for every column;
      * 'time_extraction' - only for columns flagged with ``has_time``.

    NOTE(review): SQLite's date functions only parse ISO-8601-style text, so
    the DATE()/TIME()/strftime() snippets presumably evaluate to NULL for
    slash-formatted columns - confirm before relying on them.

    Args:
        analysis: Result dict, updated in place.

    Returns:
        None.
    """
    # Exact-match examples exist for just these two formats.
    exact_builders = {
        'M/D/YYYY H:MM': lambda col: [
            f"WHERE {col} = '8/29/2013 12:45'",
            f"WHERE {col} LIKE '8/29/2013%'  -- For date-only match"
        ],
        'YYYY-MM-DD': lambda col: [
            f"WHERE {col} = '2013-08-29'",
            f"WHERE DATE({col}) = '2013-08-29'"
        ],
    }

    exact_match = {}
    date_extraction = {}
    date_range = {}
    time_extraction = {}

    for col_path, col_info in analysis['date_columns'].items():
        build_exact = exact_builders.get(col_info['format'])
        if build_exact is not None:
            exact_match[col_path] = build_exact(col_path)

        date_extraction[col_path] = [
            f"DATE({col_path}) -- Extract date portion",
            f"strftime('%Y', {col_path}) -- Extract year",
            f"strftime('%m', {col_path}) -- Extract month",
            f"strftime('%d', {col_path}) -- Extract day"
        ]

        if col_info['has_time']:
            time_extraction[col_path] = [
                f"TIME({col_path}) -- Extract time portion",
                f"strftime('%H', {col_path}) -- Extract hour",
                f"strftime('%M', {col_path}) -- Extract minute"
            ]

        date_range[col_path] = [
            f"WHERE {col_path} BETWEEN 'start_date' AND 'end_date'",
            f"WHERE DATE({col_path}) >= '2013-01-01' AND DATE({col_path}) <= '2013-12-31'"
        ]

    analysis['query_templates'] = {
        'exact_match': exact_match,
        'date_extraction': date_extraction,
        'date_range': date_range,
        'time_extraction': time_extraction
    }

def generate_conversion_rules(analysis):
    """Attach textual conversion/comparison advice for each detected format.

    Reads the keys of ``analysis['date_formats']`` and stores the result
    under ``analysis['conversion_rules']``. Only four formats have advice;
    any other detected format is left out of the result.

    NOTE(review): the advice for 'M/D/YYYY H:MM' recommends DATE()/TIME(),
    which SQLite presumably cannot apply to slash-formatted text - confirm.

    Args:
        analysis: Result dict, updated in place.

    Returns:
        None.
    """
    advice_by_format = {
        'M/D/YYYY': {
            'to_iso': "Use DATE() or convert manually",
            'comparison': "Use LIKE for string dates or convert to ISO",
            'example': "WHERE date_col LIKE '8/29/2013%'"
        },
        'M/D/YYYY H:MM': {
            'to_date': "DATE(column)",
            'to_time': "TIME(column)",
            'comparison': "Direct string comparison or DATE() extraction",
            'example': "WHERE DATE(start_date) = '2013-08-29'"
        },
        'YYYY-MM-DD': {
            'comparison': "Direct comparison works",
            'example': "WHERE date_col = '2013-08-29'"
        },
        'YYYY-MM-DD HH:MM:SS': {
            'to_date': "DATE(column)",
            'to_time': "TIME(column)",
            'comparison': "Use DATE() for date-only comparison",
            'example': "WHERE DATE(timestamp_col) = '2013-08-29'"
        },
    }

    # Keep the order in which the formats were detected.
    analysis['conversion_rules'] = {
        detected: dict(advice_by_format[detected])
        for detected in analysis['date_formats']
        if detected in advice_by_format
    }

def identify_join_patterns(analysis, tables):
    """Suggest date-based join conditions between the detected columns.

    Two kinds of suggestion are written to ``analysis['join_patterns']``:

      * 'weather-trip'   - every column whose path contains "weather" paired
                           with every column whose path contains "trip" or
                           "ride"; a side is wrapped in DATE() when that
                           column was flagged with ``has_time``.
      * 'temporal-range' - a path containing "start" paired with a path
                           containing "end" from a different table, where
                           the first table name sorts before the second.

    Matching is done on the full ``"<table>.<column>"`` path, so a keyword
    in the table name counts as well.

    Args:
        analysis: Result dict; reads 'date_columns', updated in place.
        tables: Not used by this function.

    Returns:
        None.
    """
    date_columns = analysis['date_columns']
    found = []

    # Split the column paths into the two sides of a weather/trip join.
    weather_cols = []
    trip_cols = []
    for path in date_columns:
        lowered = path.lower()
        if 'weather' in lowered:
            weather_cols.append(path)
        if 'trip' in lowered or 'ride' in lowered:
            trip_cols.append(path)

    for w_col in weather_cols:
        w_info = date_columns[w_col]
        for t_col in trip_cols:
            t_info = date_columns[t_col]

            # Reduce a side to its date part only when it carries a time.
            left = f"DATE({w_col})" if w_info['has_time'] else w_col
            right = f"DATE({t_col})" if t_info['has_time'] else t_col

            found.append({
                'type': 'weather-trip',
                'weather_column': w_col,
                'trip_column': t_col,
                'condition': f"{left} = {right}",
                'note': 'Also join on location (zip_code)'
            })

    # Group column paths by the table part of "<table>.<column>".
    by_table = defaultdict(list)
    for path in date_columns:
        by_table[path.split('.')[0]].append(path)

    for first_table, first_cols in by_table.items():
        for second_table, second_cols in by_table.items():
            # Visit each unordered pair of distinct tables once.
            if not first_table < second_table:
                continue

            found.extend(
                {
                    'type': 'temporal-range',
                    'column1': c1,
                    'column2': c2,
                    'condition': f"{c1} <= {c2}",
                    'use_case': 'Temporal range queries'
                }
                for c1 in first_cols if 'start' in c1.lower()
                for c2 in second_cols if 'end' in c2.lower()
            )

    analysis['join_patterns'] = found

def main():
    """Analyze ./database.sqlite and write the report as JSON.

    Prints an error and returns early when the database file is missing.
    Otherwise creates the ``tool_output`` directory, runs the temporal
    analysis, dumps it to ``tool_output/temporal_analysis.json`` and prints
    a short summary, including up to three of the detected date formats.
    """
    db_path = "./database.sqlite"
    output_dir = "tool_output"
    output_path = "tool_output/temporal_analysis.json"

    if not os.path.exists(db_path):
        print(f"Error: Database not found at {db_path}")
        return

    # The output directory is prepared before the analysis is run.
    os.makedirs(output_dir, exist_ok=True)
    analysis = analyze_temporal_data(db_path)

    with open(output_path, 'w') as report_file:
        json.dump(analysis, report_file, indent=2)

    summary_lines = (
        f"✅ Temporal analysis complete",
        f"📅 Date columns found: {len(analysis['date_columns'])}",
        f"⏰ Time patterns detected: {len(analysis['time_patterns'])}",
        f"🔄 Conversion rules: {len(analysis['conversion_rules'])}",
        f"💾 Results saved to: {output_path}",
    )
    for line in summary_lines:
        print(line)

    # Key insights: only the first three detected formats are listed.
    detected_formats = analysis['date_formats']
    if detected_formats:
        print("\n📊 Date formats detected:")
        for label, column_paths in list(detected_formats.items())[:3]:
            print(f"  • {label}: {len(column_paths)} columns")

# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()