#!/usr/bin/env python3
"""
Pattern profiler - analyzes data patterns, formats, and distributions.
Combines detailed profiling from tool_runner_3a with pattern focus from column_strict.
"""

import sqlite3
import json
import re
from pathlib import Path
from datetime import datetime

def _quote_identifier(name):
    """Return ``name`` wrapped in double quotes for use as an SQLite identifier.

    Embedded double quotes are doubled so that table or column names that
    contain spaces, punctuation, or reserved words can be interpolated into
    SQL text without breaking the statement.
    """
    return '"' + str(name).replace('"', '""') + '"'

def _profile_date_column(cursor, q_table, q_col):
    """Return format/range details for a date-like column, or None if it has no values."""
    cursor.execute(
        f"SELECT DISTINCT {q_col} FROM {q_table} WHERE {q_col} IS NOT NULL LIMIT 10"
    )
    samples = [row[0] for row in cursor.fetchall()]
    if not samples:
        return None

    cursor.execute(
        f"SELECT MIN({q_col}), MAX({q_col}) FROM {q_table} WHERE {q_col} IS NOT NULL"
    )
    min_val, max_val = cursor.fetchone()

    return {
        # The format is inferred from the first sample only;
        # detect_date_format() returns "Unknown" for falsy values.
        "format": detect_date_format(samples[0]),
        "min": min_val,
        "max": max_val,
        "samples": samples[:3]
    }

def _profile_numeric_column(cursor, q_table, q_col, col_name):
    """Return ``(range_info, special_values)`` for a numeric column.

    ``range_info`` is None when the column has no non-NULL values;
    ``special_values`` is a (possibly empty) list of sentinel-value counts.
    """
    cursor.execute(f"""
        SELECT
            MIN({q_col}),
            MAX({q_col}),
            AVG({q_col}),
            COUNT(DISTINCT {q_col})
        FROM {q_table}
        WHERE {q_col} IS NOT NULL
    """)
    min_val, max_val, avg_val, distinct_count = cursor.fetchone()

    range_info = None
    if min_val is not None:
        lowered = col_name.lower()
        range_info = {
            "min": min_val,
            "max": max_val,
            "avg": avg_val,
            "distinct_values": distinct_count,
            # Name-based hints only; they say nothing about the stored data.
            "likely_id": col_name.endswith(('_id', 'ID')),
            "likely_count": 'count' in lowered or 'total' in lowered,
            "likely_amount": 'amount' in lowered or 'price' in lowered
        }

    # Count occurrences of values that are commonly used as placeholders.
    cursor.execute(f"""
        SELECT {q_col}, COUNT(*) as cnt
        FROM {q_table}
        WHERE {q_col} IN (0, -1, 999, 9999, -999)
        GROUP BY {q_col}
    """)
    special_values = [
        {"value": val, "count": cnt} for val, cnt in cursor.fetchall()
    ]
    return range_info, special_values

def _profile_text_column(cursor, q_table, q_col, row_count):
    """Return ``(categorical_values, string_info)`` for a text column.

    Either element is None when the corresponding entry should not be
    recorded (no values, not categorical, or no non-empty samples).
    """
    cursor.execute(f"""
        SELECT
            {q_col},
            COUNT(*) as cnt
        FROM {q_table}
        WHERE {q_col} IS NOT NULL
        GROUP BY {q_col}
        ORDER BY COUNT(*) DESC
        LIMIT 20
    """)
    value_counts = cursor.fetchall()
    if not value_counts:
        return None, None

    # The query above is capped at 20 rows, so its length is not the column's
    # cardinality; ask the database for the real distinct count.
    cursor.execute(f"SELECT COUNT(DISTINCT {q_col}) FROM {q_table}")
    distinct_count = cursor.fetchone()[0]
    is_categorical = distinct_count <= 100

    categorical_values = None
    if is_categorical and row_count > 10:
        categorical_values = [val for val, _ in value_counts[:10]]

    string_info = None
    samples = [val for val, _ in value_counts[:5] if val]
    if samples:
        common_prefix = find_common_prefix(samples)
        common_suffix = find_common_suffix(samples)
        string_info = {
            "case_pattern": detect_case_pattern(samples),
            # Single-character overlaps are treated as noise.
            "common_prefix": common_prefix if len(common_prefix) > 1 else None,
            "common_suffix": common_suffix if len(common_suffix) > 1 else None,
            "distinct_values": distinct_count,
            "is_categorical": is_categorical,
            "top_values": [val for val, _ in value_counts[:5]]
        }
    return categorical_values, string_info

def profile_patterns(conn):
    """Profile data patterns across all tables.

    Every table listed in ``sqlite_master`` that has at least one row is
    inspected column by column. The declared column type selects the
    analysis (date/time, numeric, or text); NULL statistics and common
    filter values are collected regardless of type.

    Table and column names are quoted before being placed into SQL, so
    identifiers containing spaces or reserved words are handled.

    Args:
        conn: An open ``sqlite3`` connection.

    Returns:
        ``{"patterns": {...}}`` where the inner dict has the keys
        ``date_formats``, ``string_patterns``, ``numeric_ranges``,
        ``null_patterns``, ``special_values``, ``categorical_values`` and
        ``common_filters``; each maps ``"table.column"`` to its findings.
    """
    cursor = conn.cursor()

    # Get all tables
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name")
    tables = [row[0] for row in cursor.fetchall()]

    patterns = {
        "date_formats": {},
        "string_patterns": {},
        "numeric_ranges": {},
        "null_patterns": {},
        "special_values": {},
        "categorical_values": {},
        "common_filters": {}
    }

    # Column names that are typically used in WHERE clauses.
    filter_columns = {
        'status', 'type', 'category', 'state', 'country',
        'year', 'month', 'active', 'deleted'
    }

    for table in tables:
        q_table = _quote_identifier(table)

        cursor.execute(f"PRAGMA table_info({q_table})")
        columns = cursor.fetchall()

        cursor.execute(f"SELECT COUNT(*) FROM {q_table}")
        row_count = cursor.fetchone()[0]

        # Nothing to profile in an empty table.
        if row_count == 0:
            continue

        for col in columns:
            col_name = col[1]
            col_type = (col[2] or "").upper()
            q_col = _quote_identifier(col_name)
            key = f"{table}.{col_name}"

            # Analyze based on declared column type
            if 'DATE' in col_type or 'TIME' in col_type:
                date_info = _profile_date_column(cursor, q_table, q_col)
                if date_info is not None:
                    patterns["date_formats"][key] = date_info

            elif 'INT' in col_type or 'REAL' in col_type or 'NUMERIC' in col_type:
                range_info, special_values = _profile_numeric_column(
                    cursor, q_table, q_col, col_name
                )
                if range_info is not None:
                    patterns["numeric_ranges"][key] = range_info
                if special_values:
                    patterns["special_values"][key] = special_values

            elif 'TEXT' in col_type or 'CHAR' in col_type:
                categorical_values, string_info = _profile_text_column(
                    cursor, q_table, q_col, row_count
                )
                if categorical_values is not None:
                    patterns["categorical_values"][key] = categorical_values
                if string_info is not None:
                    patterns["string_patterns"][key] = string_info

            # NULL analysis: COUNT(col) skips NULLs, COUNT(*) does not.
            cursor.execute(f"""
                SELECT
                    COUNT(*) - COUNT({q_col}) as null_count,
                    COUNT(*) as total_count
                FROM {q_table}
            """)
            null_count, total_count = cursor.fetchone()

            if null_count > 0:
                patterns["null_patterns"][key] = {
                    "null_count": null_count,
                    "null_percentage": (null_count / total_count * 100) if total_count > 0 else 0,
                    "nullable": not col[3]  # "notnull" flag from PRAGMA table_info
                }

            # Identify common filter columns
            if col_name.lower() in filter_columns:
                cursor.execute(
                    f"SELECT DISTINCT {q_col} FROM {q_table} WHERE {q_col} IS NOT NULL LIMIT 20"
                )
                filter_values = [row[0] for row in cursor.fetchall()]
                if filter_values:
                    patterns["common_filters"][key] = filter_values

    return {"patterns": patterns}

def detect_date_format(date_str):
    """Classify a date value by the textual layout it is written in.

    Falsy input yields "Unknown". Otherwise the value is converted to a
    string and compared against a fixed list of layouts in order; the label
    of the first layout that matches is returned, or "Custom" when none do.
    """
    if not date_str:
        return "Unknown"

    text = str(date_str)

    # (regex, label) pairs, checked top to bottom.
    known_layouts = (
        (r'^\d{4}-\d{2}-\d{2}$', "YYYY-MM-DD"),
        (r'^\d{2}/\d{2}/\d{4}$', "MM/DD/YYYY"),
        (r'^\d{2}-\d{2}-\d{4}$', "DD-MM-YYYY"),
        (r'^\d{4}$', "YYYY"),
        (r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$', "YYYY-MM-DD HH:MM:SS"),
        (r'^\d{8}$', "YYYYMMDD"),
    )

    matching_labels = (
        label for regex, label in known_layouts if re.match(regex, text)
    )
    return next(matching_labels, "Custom")

def detect_case_pattern(strings):
    """Return the most frequent casing label among the given strings.

    Each non-empty ``str`` is labelled "UPPER", "lower", "CamelCase"
    (starts with an uppercase letter and contains no space or underscore),
    or "Mixed" (everything else). Other values are ignored. Ties go to the
    label listed first in UPPER, lower, Mixed, CamelCase order. Returns
    "Unknown" when nothing could be labelled.
    """
    # Insertion order matters: max() keeps the first of several equal counts.
    tally = dict.fromkeys(("UPPER", "lower", "Mixed", "CamelCase"), 0)

    for text in strings:
        if not (text and isinstance(text, str)):
            continue

        if text.isupper():
            label = "UPPER"
        elif text.islower():
            label = "lower"
        elif text[0].isupper() and ' ' not in text and '_' not in text:
            label = "CamelCase"
        else:
            label = "Mixed"
        tally[label] += 1

    if not any(tally.values()):
        return "Unknown"
    return max(tally, key=tally.get)

def find_common_prefix(strings):
    """Return the longest leading substring shared by all truthy items.

    Falsy items are dropped and the rest are converted with ``str``. An
    empty string is returned when nothing remains or nothing is shared.
    """
    texts = [str(item) for item in strings if item] if strings else []
    if not texts:
        return ""

    # Walk the strings column by column; zip stops at the shortest one.
    shared_len = 0
    for column in zip(*texts):
        if len(set(column)) != 1:
            break
        shared_len += 1

    return texts[0][:shared_len]

def find_common_suffix(strings):
    """Return the longest trailing substring shared by all truthy items.

    Falsy items are dropped and the rest are converted with ``str``. An
    empty string is returned when nothing remains or nothing is shared.
    """
    texts = [str(item) for item in strings if item] if strings else []
    if not texts:
        return ""

    # Compare characters from the end of every string, moving backwards.
    shared_len = 0
    for column in zip(*(reversed(text) for text in texts)):
        if len(set(column)) != 1:
            break
        shared_len += 1

    first = texts[0]
    # Slice from an explicit start index; first[-0:] would be the whole string.
    return first[len(first) - shared_len:]

def main():
    """Profile ``./database.sqlite`` and print the result as JSON on stdout.

    If the database file is missing, or any error occurs while profiling or
    serializing, a JSON object of the form ``{"error": "<message>"}`` is
    printed instead. The function always returns None.
    """
    db_path = Path("./database.sqlite")

    if not db_path.exists():
        print(json.dumps({"error": "Database file not found"}))
        return

    try:
        conn = sqlite3.connect(str(db_path))
        try:
            results = profile_patterns(conn)
        finally:
            # Release the connection even when profiling raises.
            conn.close()

        # Output as JSON
        print(json.dumps(results, indent=2))

    except Exception as e:
        # Top-level boundary: report any failure as JSON rather than a traceback.
        print(json.dumps({"error": str(e)}))

# Run the profiler only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()