#!/usr/bin/env python3
"""
Pattern Detector Tool - Enhanced with domain-specific patterns
Detects common patterns and special cases in the database.
"""

import sqlite3
import json
import os
from collections import defaultdict

def _quote_identifier(name):
    """Return *name* as a double-quoted SQL identifier, doubling embedded quotes."""
    return '"' + name.replace('"', '""') + '"'


def detect_patterns(db_path="database.sqlite", output_dir="tool_output"):
    """Detect patterns and special cases in the database.

    Inspects the table and column names of the SQLite database at *db_path*,
    derives query-writing hints from them, and writes the result as JSON to
    ``<output_dir>/patterns.json``. A short summary is printed to stdout.

    Args:
        db_path: Path of the SQLite database file to inspect.
        output_dir: Directory that receives ``patterns.json``; it is created
            when missing.

    Returns:
        The dictionary that was written to ``patterns.json``. When the scan
        fails part-way, the dictionary holds whatever was collected so far
        plus an ``"error"`` key with the exception message.
    """

    os.makedirs(output_dir, exist_ok=True)

    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()

    patterns = {
        "domain_patterns": {},
        "aggregation_hints": {},
        "common_pitfalls": [],
        "exact_match_requirements": [],
        "conditional_aggregation_patterns": []
    }

    try:
        # Get all tables
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name")
        tables = cursor.fetchall()

        # Detect domain from table names
        table_names_str = ' '.join(t[0].lower() for t in tables)

        # Medical domain patterns
        if 'patient' in table_names_str or 'careplan' in table_names_str:
            patterns["domain_patterns"]["medical"] = {
                "age_calculation": "(julianday(date1) - julianday(date2)) / 365.25",
                "year_extraction": "strftime('%Y', date_column)",
                "common_confusions": [
                    "race vs ethnicity - these are different columns",
                    "medical codes must be exact - check evidence",
                    "descriptions must match exactly (e.g., 'grass pollen' not 'grass')"
                ]
            }

        # Weather/sales domain patterns
        if 'weather' in table_names_str or 'station' in table_names_str:
            patterns["domain_patterns"]["weather"] = {
                "date_matching": "Join sales and weather on BOTH date AND location",
                "time_comparisons": "Use time('HH:MM:SS') for sunrise/sunset",
                "conditional_sums": "Use SUM(CASE WHEN condition THEN value ELSE 0 END)",
                "common_issues": [
                    "Weather data needs date matching",
                    "Station-store relationships through relation table",
                    "Time values need time() function"
                ]
            }

        # Recipe/food domain patterns
        if 'recipe' in table_names_str or 'ingredient' in table_names_str:
            patterns["domain_patterns"]["food"] = {
                "wildcard_patterns": "Use %term% not %term for LIKE",
                "name_matching": "Ingredient names must be exact (e.g., 'sea bass steak')",
                "nutritional_thresholds": "Use < not BETWEEN for thresholds",
                "common_issues": [
                    "Check for max_qty = min_qty conditions",
                    "Ingredient names have variations",
                    "Categories use wildcard matching"
                ]
            }

        # Analyze each table for patterns
        for (table_name,) in tables:
            if table_name.startswith("sqlite_"):
                continue

            # Names come from the inspected database, so quote them properly
            # instead of trusting them to be free of quote characters.
            quoted_table = _quote_identifier(table_name)

            cursor.execute(f"PRAGMA table_info({quoted_table})")
            columns = cursor.fetchall()

            # Empty tables give no samples to reason about, so skip them.
            cursor.execute(f"SELECT COUNT(*) FROM {quoted_table}")
            row_count = cursor.fetchone()[0]

            if row_count == 0:
                continue

            for col in columns:
                col_name = col[1]  # PRAGMA table_info row: (cid, name, type, ...)
                col_lower = col_name.lower()
                qualified_name = f"{table_name}.{col_name}"

                # Columns needing exact matching
                if any(word in col_lower for word in ['code', 'type', 'status', 'description', 'name']):
                    patterns["exact_match_requirements"].append({
                        "column": qualified_name,
                        "reason": "Values must match exactly from database"
                    })

                # Columns often needing DISTINCT in COUNT. The bare 'id'
                # suffix also covers '_id' and camelCase names such as
                # 'PatientID'; it is a heuristic and can match words that
                # merely end in "id".
                if col_lower.endswith('id') and col_lower != 'id':
                    patterns["aggregation_hints"][qualified_name] = {
                        "hint": f"Use COUNT(DISTINCT {col_name}) for unique counts",
                        "reason": "Foreign key - likely has duplicates"
                    }

                # Date columns needing special handling
                if 'date' not in col_lower:
                    continue

                # Sampling is best-effort: a column that cannot be queried is
                # skipped, but only database errors are tolerated here.
                try:
                    cursor.execute(
                        f"SELECT DISTINCT {_quote_identifier(col_name)} "
                        f"FROM {quoted_table} LIMIT 5"
                    )
                    samples = [row[0] for row in cursor.fetchall() if row[0]]
                except sqlite3.Error:
                    continue

                if not samples:
                    continue

                # Check format using the first non-empty sample only.
                sample = str(samples[0])
                if '-' in sample and len(sample) >= 10:
                    # NOTE: this replaces a COUNT(DISTINCT) hint recorded
                    # above for the same column, as it always has.
                    patterns["aggregation_hints"][qualified_name] = {
                        "hint": "Use strftime for date extraction",
                        "format": "YYYY-MM-DD detected"
                    }

        # Common pitfalls based on domain
        patterns["common_pitfalls"] = [
            {
                "issue": "Returning extra columns",
                "solution": "Return ONLY what's requested - no helpful extras"
            },
            {
                "issue": "Missing DISTINCT in COUNT",
                "solution": "Use COUNT(DISTINCT) for foreign keys and unique counts"
            },
            {
                "issue": "Wrong date functions",
                "solution": "Use strftime for dates, time() for times"
            },
            {
                "issue": "Inexact value matching",
                "solution": "Use exact values from database samples"
            }
        ]

        # Conditional aggregation patterns
        patterns["conditional_aggregation_patterns"] = [
            {
                "pattern": "Count with condition",
                "template": "SUM(CASE WHEN condition THEN 1 ELSE 0 END)"
            },
            {
                "pattern": "Sum with condition",
                "template": "SUM(CASE WHEN condition THEN value ELSE 0 END)"
            },
            {
                "pattern": "Percentage calculation",
                "template": "CAST(SUM(CASE WHEN condition THEN 1 ELSE 0 END) AS REAL) * 100 / COUNT(*)"
            }
        ]

    except Exception as e:
        # Top-level boundary: record the failure in the report rather than
        # crash, so the partial results are still written out.
        patterns["error"] = str(e)
    finally:
        conn.close()

    # Write to output file
    output_path = f"{output_dir}/patterns.json"
    with open(output_path, 'w', encoding="utf-8") as f:
        json.dump(patterns, f, indent=2)

    print(f"Pattern detection complete - results in {output_path}")

    if patterns.get("domain_patterns"):
        print(f"Domain-specific patterns detected: {', '.join(patterns['domain_patterns'].keys())}")

    if patterns.get("exact_match_requirements"):
        print(f"Found {len(patterns['exact_match_requirements'])} columns requiring exact matching")

    return patterns

# Script entry point: when executed directly (not imported), run the detector
# with its default arguments.
if __name__ == "__main__":
    detect_patterns()