#!/usr/bin/env python3
"""
Pattern Detector Tool
Identifies common patterns, data formats, and potential SQL generation issues.
Helps prevent common errors identified in the error analysis.
"""

import sqlite3
import json
import os
import re
from datetime import datetime

# Names that collide with SQL keywords/functions and therefore need quoting.
_RESERVED_WORDS = frozenset({
    'SELECT', 'FROM', 'WHERE', 'JOIN', 'ON', 'GROUP', 'BY', 'ORDER',
    'LIMIT', 'AS', 'AND', 'OR', 'NOT', 'IN', 'EXISTS', 'BETWEEN',
    'LIKE', 'IS', 'NULL', 'COUNT', 'SUM', 'AVG', 'MAX', 'MIN',
    'DATE', 'TIME', 'YEAR', 'MONTH', 'DAY', 'HOUR', 'MINUTE', 'SECOND',
})

# Substrings of a numeric column name that suggest it holds a metric.
_METRIC_KEYWORDS = ('amount', 'total', 'count', 'sum', 'quantity',
                    'price', 'cost', 'value', 'score', 'rating')

# Declared base types (length/precision stripped) that trigger each analysis.
_NUMERIC_TYPES = frozenset({'INTEGER', 'REAL', 'NUMERIC'})
_DATE_CANDIDATE_TYPES = frozenset({'TEXT', 'VARCHAR', 'DATE', 'DATETIME', 'TIMESTAMP'})
_TEXT_TYPES = frozenset({'TEXT', 'VARCHAR', 'CHAR'})

# Column names that are typically used as categorical filters.
_FILTER_COLUMN_NAMES = frozenset({'status', 'type', 'category', 'state', 'active',
                                  'deleted', 'enabled', 'visible'})

# (compiled pattern, human-readable format name); the first match wins.
_DATE_FORMATS = (
    (re.compile(r'^\d{4}-\d{2}-\d{2}$'), 'YYYY-MM-DD'),
    (re.compile(r'^\d{2}/\d{2}/\d{4}$'), 'MM/DD/YYYY'),
    (re.compile(r'^\d{2}/\d{2}/\d{2}$'), 'MM/DD/YY'),
    (re.compile(r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$'), 'YYYY-MM-DD HH:MM:SS'),
    (re.compile(r'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}'), 'ISO 8601'),
)


def _quote_identifier(name):
    """Return *name* as a double-quoted SQL identifier with embedded quotes doubled."""
    return '"' + name.replace('"', '""') + '"'


def _base_type(declared_type):
    """Return the upper-cased declared type without a length suffix, e.g. VARCHAR(20) -> VARCHAR."""
    return (declared_type or '').upper().partition('(')[0].strip()


def _json_safe(value):
    """Return *value* unchanged unless it is bytes, which json cannot serialize."""
    if isinstance(value, bytes):
        return value.decode('utf-8', errors='replace')
    return value


def _detect_date_format(cursor, table, col_name):
    """Sample up to 10 distinct non-NULL values and return a date-format entry, or None."""
    quoted_col = _quote_identifier(col_name)
    try:
        cursor.execute(
            f"SELECT DISTINCT {quoted_col} FROM {_quote_identifier(table)} "
            f"WHERE {quoted_col} IS NOT NULL LIMIT 10"
        )
        samples = [row[0] for row in cursor.fetchall()]
    except sqlite3.Error:
        # Best effort: a column that cannot be sampled is simply not reported.
        return None

    detected_formats = []
    for sample in samples[:5]:
        if not isinstance(sample, str):
            continue
        for pattern, format_name in _DATE_FORMATS:
            if pattern.match(sample):
                detected_formats.append(format_name)
                break

    if not detected_formats:
        return None

    # dict.fromkeys keeps first-seen order, so ties resolve deterministically
    # (iterating a set of strings would vary with hash randomization).
    most_common = max(dict.fromkeys(detected_formats), key=detected_formats.count)
    return {
        "format": most_common,
        "samples": [_json_safe(sample) for sample in samples[:3]],
        "extraction_hint": "Use strftime() for date operations",
    }


def _find_case_variations(cursor, table, col_name):
    """Return values of one column that differ only by letter case, or None.

    Only the first 50 distinct values returned by SQLite are inspected, so
    this is a sample-based check rather than an exhaustive one.
    """
    quoted_col = _quote_identifier(col_name)
    try:
        cursor.execute(
            f"SELECT {quoted_col} FROM {_quote_identifier(table)} "
            f"WHERE {quoted_col} IS NOT NULL GROUP BY {quoted_col} LIMIT 50"
        )
        rows = cursor.fetchall()
    except sqlite3.Error:
        # Best effort: a column that cannot be sampled is simply not reported.
        return None

    # Bucket the distinct values by their lower-cased form.
    case_groups = {}
    for (value,) in rows:
        if isinstance(value, str):
            case_groups.setdefault(value.lower(), []).append(value)

    # Report only the first bucket holding more than one spelling.
    for variations in case_groups.values():
        if len(variations) > 1:
            return variations
    return None


def detect_patterns(db_path="database.sqlite", output_dir="tool_output"):
    """Detect patterns and potential issues for SQL generation.

    Inspects every user table of the SQLite database at *db_path*, writes the
    findings as JSON to ``<output_dir>/patterns.json`` and prints a short
    summary to stdout.

    Args:
        db_path: Path of the SQLite database file to analyze.
        output_dir: Directory for ``patterns.json``; created if missing.

    Returns:
        The findings dictionary that was written to disk. If the analysis
        fails part-way, the dictionary additionally has an ``"error"`` key
        holding the exception message, plus whatever was collected so far.
    """
    os.makedirs(output_dir, exist_ok=True)

    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()

    patterns = {
        "case_sensitivity_issues": [],
        "date_format_patterns": {},
        "aggregation_candidates": [],
        "nullable_columns": [],
        "columns_with_spaces": [],
        "reserved_keywords": [],
        "common_filter_columns": [],
        "potential_issues": []
    }

    try:
        # Get all user tables (SQLite's internal tables are skipped).
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name")
        tables = [row[0] for row in cursor.fetchall() if not row[0].startswith("sqlite_")]

        # Column metadata is cached so the join heuristic below does not
        # have to re-run the PRAGMA for every table.
        columns_by_table = {}

        for table in tables:
            # Check for table names that need quoting
            if ' ' in table or table.upper() in _RESERVED_WORDS:
                patterns["potential_issues"].append({
                    "type": "table_name",
                    "table": table,
                    "issue": "Requires quoting in SQL",
                    "solution": f"Use `{table}` with backticks"
                })

            # PRAGMA table_info rows: (cid, name, type, notnull, dflt_value, pk)
            cursor.execute(f"PRAGMA table_info({_quote_identifier(table)})")
            columns = cursor.fetchall()
            columns_by_table[table] = columns

            for col in columns:
                col_name = col[1]
                col_type = (col[2] or '').upper()
                # Strip any "(n)" suffix so VARCHAR(20) is treated as VARCHAR.
                base_type = _base_type(col_type)
                nullable = not col[3]
                qualified_name = f"{table}.{col_name}"

                # Check for columns with spaces
                if ' ' in col_name:
                    patterns["columns_with_spaces"].append(qualified_name)
                    patterns["potential_issues"].append({
                        "type": "column_name",
                        "table": table,
                        "column": col_name,
                        "issue": "Contains spaces - requires quoting",
                        "solution": f"Use `{col_name}` with backticks"
                    })

                # Check for reserved keywords
                if col_name.upper() in _RESERVED_WORDS:
                    patterns["reserved_keywords"].append(qualified_name)

                # Track nullable columns
                if nullable:
                    patterns["nullable_columns"].append(qualified_name)

                # Numeric columns whose name looks like a metric
                if base_type in _NUMERIC_TYPES:
                    lowered_name = col_name.lower()
                    if any(keyword in lowered_name for keyword in _METRIC_KEYWORDS):
                        patterns["aggregation_candidates"].append({
                            "table": table,
                            "column": col_name,
                            "type": col_type,
                            "suggested_operations": ["SUM", "AVG", "MAX", "MIN", "COUNT"]
                        })

                # Analyze date columns
                if base_type in _DATE_CANDIDATE_TYPES:
                    date_entry = _detect_date_format(cursor, table, col_name)
                    if date_entry is not None:
                        patterns["date_format_patterns"][qualified_name] = date_entry

                # Check for case sensitivity issues
                if base_type in _TEXT_TYPES:
                    variations = _find_case_variations(cursor, table, col_name)
                    if variations is not None:
                        patterns["case_sensitivity_issues"].append({
                            "table": table,
                            "column": col_name,
                            "example": variations,
                            "warning": "Case-sensitive values found - exact matching required"
                        })

                # Identify common filter columns
                if col_name.lower() in _FILTER_COLUMN_NAMES:
                    patterns["common_filter_columns"].append({
                        "table": table,
                        "column": col_name,
                        "type": "categorical",
                        "note": "Commonly used in WHERE clauses"
                    })

        # Tables with many id-like columns probably need several joins.
        # NOTE(review): the suffix test is name-based only, so it also counts
        # the table's own "id" column and words such as "paid".
        for table, columns in columns_by_table.items():
            fk_count = sum(1 for col in columns if col[1].lower().endswith('id'))
            if fk_count >= 3:
                patterns["potential_issues"].append({
                    "type": "complex_joins",
                    "table": table,
                    "issue": f"Table has {fk_count} potential foreign keys",
                    "solution": "May require multiple JOINs - verify relationships"
                })

    except Exception as e:
        # Top-level boundary: record the failure in the report instead of
        # crashing, so partial results are still written out.
        patterns["error"] = str(e)
    finally:
        conn.close()

    # Write to output file
    output_path = f"{output_dir}/patterns.json"
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(patterns, f, indent=2)

    print(f"Pattern detection complete - results in {output_path}")

    if patterns["case_sensitivity_issues"]:
        print(f"WARNING: Found {len(patterns['case_sensitivity_issues'])} columns with case-sensitive values")

    if patterns["columns_with_spaces"]:
        print(f"Found {len(patterns['columns_with_spaces'])} columns with spaces requiring quoting")

    if patterns["potential_issues"]:
        print(f"Identified {len(patterns['potential_issues'])} potential SQL generation issues")

    return patterns

# Script entry point: when executed directly (not imported), analyze the
# default database path ("database.sqlite", resolved against the current
# working directory) using detect_patterns' default arguments.
if __name__ == "__main__":
    detect_patterns()