#!/usr/bin/env python3
"""
Profile data to understand patterns, formats, and characteristics.
"""

import sqlite3
import json
import sys
from pathlib import Path
from collections import Counter

def profile_database(db_path="database.sqlite", sample_size=10):
    """Profile data patterns in every user table of a SQLite database.

    Args:
        db_path: Path handed to ``sqlite3.connect``.
        sample_size: Maximum number of sample rows collected per table.

    Returns:
        ``{"tables": {table_name: table_profile}}`` on success, where each
        ``table_profile`` is whatever ``profile_table`` returns. On failure
        the function does not raise; it returns ``{"error": message}``.
    """
    profile = {"tables": {}}
    conn = None

    try:
        conn = sqlite3.connect(db_path)
        cursor = conn.cursor()

        # '_' is a single-character wildcard in LIKE, so it is escaped here;
        # otherwise user tables such as "sqlitex" would be filtered out
        # together with SQLite's internal "sqlite_*" tables.
        cursor.execute("""
            SELECT name FROM sqlite_master
            WHERE type='table'
            AND name NOT LIKE 'sqlite\\_%' ESCAPE '\\'
            ORDER BY name
        """)
        # Materialize the names first: the same cursor is reused by
        # profile_table, which would discard a pending result set.
        tables = cursor.fetchall()

        for (table_name,) in tables:
            profile["tables"][table_name] = profile_table(
                cursor, table_name, sample_size
            )

    except sqlite3.Error as e:
        return {"error": f"Database error: {e}"}
    except Exception as e:
        # Top-level boundary: report any other failure in the result instead
        # of crashing the caller.
        return {"error": f"Unexpected error: {e}"}
    finally:
        # Close on every path, including the error returns above.
        if conn is not None:
            conn.close()

    return profile

def profile_table(cursor, table_name, sample_size=10):
    """Profile a single table.

    Args:
        cursor: sqlite3 cursor on the database that contains the table.
        table_name: Unquoted table name; quoting is handled here.
        sample_size: Maximum number of rows copied into ``sample_rows``.

    Returns:
        A dict with ``row_count``, ``sample_rows`` (one column-name -> value
        dict per row) and ``columns`` (column name -> column profile).
        For an empty table only ``row_count`` is filled in; ``columns`` and
        ``sample_rows`` stay empty.
    """
    table_profile = {
        "columns": {},
        "row_count": 0,
        "sample_rows": []
    }

    def quote(identifier):
        # Double-quote the identifier and double any embedded quote so that
        # names containing quotes or spaces stay valid SQL.
        return '"' + identifier.replace('"', '""') + '"'

    table_sql = quote(table_name)

    # Get row count
    cursor.execute(f"SELECT COUNT(*) FROM {table_sql}")
    row_count = cursor.fetchone()[0]
    table_profile["row_count"] = row_count

    if row_count == 0:
        return table_profile

    # Get column info; rows are (cid, name, type, notnull, dflt_value, pk)
    cursor.execute(f"PRAGMA table_info({table_sql})")
    columns = cursor.fetchall()
    column_names = [col[1] for col in columns]

    # Get sample rows
    cursor.execute(f"SELECT * FROM {table_sql} LIMIT ?", (sample_size,))
    # Convert to list of dicts for better JSON serialization
    table_profile["sample_rows"] = [
        dict(zip(column_names, row)) for row in cursor.fetchall()
    ]

    # Profile each column
    for col in columns:
        col_name = col[1]
        col_type = col[2].upper()
        column_sql = quote(col_name)

        col_profile = {
            "type": col_type,
            # Reflects the declared NOT NULL constraint, not the data.
            "nullable": not col[3],
            "unique_count": 0,
            "null_count": 0,
            "sample_values": []
        }

        # Get null count
        cursor.execute(
            f"SELECT COUNT(*) FROM {table_sql} WHERE {column_sql} IS NULL"
        )
        col_profile["null_count"] = cursor.fetchone()[0]
        # row_count is non-zero here because empty tables returned early.
        col_profile["null_percentage"] = (
            col_profile["null_count"] / row_count * 100
        )

        # Get unique count (COUNT(DISTINCT ...) ignores NULLs)
        cursor.execute(
            f"SELECT COUNT(DISTINCT {column_sql}) FROM {table_sql}"
        )
        col_profile["unique_count"] = cursor.fetchone()[0]

        # Determine if likely a categorical column: few distinct values,
        # and fewer than half as many as there are rows.
        col_profile["is_categorical"] = (
            col_profile["unique_count"] < 100 and
            col_profile["unique_count"] < row_count * 0.5
        )

        # Get sample values
        cursor.execute(f"""
            SELECT DISTINCT {column_sql} FROM {table_sql}
            WHERE {column_sql} IS NOT NULL
            LIMIT 20
        """)
        sample_values = [row[0] for row in cursor.fetchall()]
        col_profile["sample_values"] = sample_values

        # Analyze patterns based on the declared type. Substring matching
        # loosely follows SQLite's affinity rules, so CLOB counts as text
        # and FLOAT / DOUBLE / DECIMAL count as numeric.
        if any(tag in col_type for tag in ("TEXT", "CHAR", "CLOB")):
            col_profile.update(analyze_text_column(
                cursor, table_name, col_name, sample_values
            ))
        elif any(tag in col_type
                 for tag in ("INT", "REAL", "NUM", "FLOA", "DOUB", "DEC")):
            col_profile.update(analyze_numeric_column(
                cursor, table_name, col_name
            ))
        elif "DATE" in col_type or "TIME" in col_type:
            col_profile.update(analyze_date_column(
                cursor, table_name, col_name, sample_values
            ))

        table_profile["columns"][col_name] = col_profile

    return table_profile

def analyze_text_column(cursor, table_name, col_name, sample_values):
    """Analyze text column patterns.

    Case, space and pattern heuristics look only at ``sample_values``;
    length statistics are computed over every non-NULL value in the table.

    Args:
        cursor: sqlite3 cursor on the database that contains the table.
        table_name: Unquoted table name; quoting is handled here.
        col_name: Unquoted column name; quoting is handled here.
        sample_values: Sample of the column's values. Non-string entries
            are ignored by the heuristics.

    Returns:
        A dict with ``mixed_case``, ``all_uppercase``, ``all_lowercase``,
        ``contains_spaces``, ``avg_length``, ``max_length``, ``min_length``
        and ``common_patterns`` (any of "email", "phone", "id_code").
        With no samples the defaults are returned and no query is run.
    """
    analysis = {
        "mixed_case": False,
        "all_uppercase": False,
        "all_lowercase": False,
        "contains_spaces": False,
        "avg_length": 0,
        "max_length": 0,
        "min_length": 0,
        "common_patterns": []
    }

    if not sample_values:
        return analysis

    str_values = [s for s in sample_values if isinstance(s, str)]

    # Check case patterns across the whole sample.
    has_upper = any(c.isupper() for s in str_values for c in s)
    has_lower = any(c.islower() for s in str_values for c in s)

    # mixed_case means at least one single value mixes both cases.
    analysis["mixed_case"] = any(
        any(c.isupper() for c in s) and any(c.islower() for c in s)
        for s in str_values
    )
    analysis["all_uppercase"] = has_upper and not has_lower
    analysis["all_lowercase"] = has_lower and not has_upper

    # Check for spaces
    analysis["contains_spaces"] = any(' ' in s for s in str_values)

    # Double-quote identifiers and double any embedded quote so names
    # containing quotes or spaces stay valid SQL.
    table_sql = '"' + table_name.replace('"', '""') + '"'
    column_sql = '"' + col_name.replace('"', '""') + '"'

    # Get length statistics
    cursor.execute(f"""
        SELECT
            AVG(LENGTH({column_sql})) as avg_len,
            MAX(LENGTH({column_sql})) as max_len,
            MIN(LENGTH({column_sql})) as min_len
        FROM {table_sql}
        WHERE {column_sql} IS NOT NULL
    """)
    stats = cursor.fetchone()
    if stats:
        # Aggregates over zero rows come back as NULL; report those as 0.
        analysis["avg_length"] = round(stats[0], 2) if stats[0] else 0
        analysis["max_length"] = stats[1] if stats[1] else 0
        analysis["min_length"] = stats[2] if stats[2] else 0

    # Detect common patterns (e.g., email, phone, ID formats). These are
    # loose heuristics and a value can match more than one of them.
    if str_values:
        # Check for email pattern
        if any('@' in s and '.' in s for s in str_values):
            analysis["common_patterns"].append("email")

        # Check for phone pattern
        if any(any(c.isdigit() for c in s) and
               (len(s) >= 10 or '-' in s or '(' in s)
               for s in str_values):
            analysis["common_patterns"].append("phone")

        # Check for ID pattern (alphanumeric codes) on the first 5 samples
        if all(s.replace('-', '').replace('_', '').isalnum()
               for s in str_values[:5]):
            analysis["common_patterns"].append("id_code")

    return analysis

def analyze_numeric_column(cursor, table_name, col_name):
    """Analyze numeric column patterns.

    Args:
        cursor: sqlite3 cursor on the database that contains the table.
        table_name: Unquoted table name; quoting is handled here.
        col_name: Unquoted column name; quoting is handled here.

    Returns:
        A dict with ``min_value``, ``max_value``, ``avg_value`` (rounded to
        2 places), ``is_integer``, ``is_percentage`` (all values within
        [0, 1]), ``is_currency`` (non-integer column where more than 90% of
        the values have at most 2 decimal places) and ``common_values``
        (up to 5 ``{"value", "count"}`` dicts, most frequent first).
        The statistics keep their defaults when the column has no non-NULL
        values.
    """
    analysis = {
        "min_value": None,
        "max_value": None,
        "avg_value": None,
        "is_integer": True,
        "is_percentage": False,
        "is_currency": False,
        "common_values": []
    }

    # Double-quote identifiers and double any embedded quote so names
    # containing quotes or spaces stay valid SQL.
    table_sql = '"' + table_name.replace('"', '""') + '"'
    column_sql = '"' + col_name.replace('"', '""') + '"'

    # Get statistics
    cursor.execute(f"""
        SELECT
            MIN({column_sql}) as min_val,
            MAX({column_sql}) as max_val,
            AVG({column_sql}) as avg_val
        FROM {table_sql}
        WHERE {column_sql} IS NOT NULL
    """)
    stats = cursor.fetchone()

    if stats and stats[0] is not None:
        min_val, max_val, avg_val = stats
        analysis["min_value"] = min_val
        analysis["max_value"] = max_val
        # Compare against None explicitly: an average of exactly 0 is a
        # valid result and must not be reported as missing.
        analysis["avg_value"] = (
            round(avg_val, 2) if avg_val is not None else None
        )

        # Check if values are between 0 and 1 (potential percentage).
        # A loosely typed column can hold text, in which case MIN/MAX may
        # return a str; skip the range check rather than raise TypeError.
        if (isinstance(min_val, (int, float))
                and isinstance(max_val, (int, float))
                and 0 <= min_val <= 1 and 0 <= max_val <= 1):
            analysis["is_percentage"] = True

        # Check if all values are integers
        cursor.execute(f"""
            SELECT COUNT(*) FROM {table_sql}
            WHERE {column_sql} != CAST({column_sql} AS INTEGER)
            AND {column_sql} IS NOT NULL
        """)
        non_int_count = cursor.fetchone()[0]
        analysis["is_integer"] = non_int_count == 0

        # Check for currency (2 decimal places)
        if not analysis["is_integer"]:
            cursor.execute(f"""
                SELECT COUNT(*) FROM {table_sql}
                WHERE ROUND({column_sql}, 2) = {column_sql}
                AND {column_sql} IS NOT NULL
            """)
            decimal_2_count = cursor.fetchone()[0]
            cursor.execute(f"""
                SELECT COUNT(*) FROM {table_sql}
                WHERE {column_sql} IS NOT NULL
            """)
            total_count = cursor.fetchone()[0]

            if total_count > 0 and decimal_2_count / total_count > 0.9:
                analysis["is_currency"] = True

    # Get most common values
    cursor.execute(f"""
        SELECT {column_sql}, COUNT(*) as cnt
        FROM {table_sql}
        WHERE {column_sql} IS NOT NULL
        GROUP BY {column_sql}
        ORDER BY cnt DESC
        LIMIT 5
    """)
    analysis["common_values"] = [
        {"value": val, "count": cnt} for val, cnt in cursor.fetchall()
    ]

    return analysis

def analyze_date_column(cursor, table_name, col_name, sample_values):
    """Analyze date/time column patterns.

    Args:
        cursor: sqlite3 cursor on the database that contains the table.
        table_name: Unquoted table name; quoting is handled here.
        col_name: Unquoted column name; quoting is handled here.
        sample_values: Sample of the column's values. Only non-empty
            strings take part in format detection.

    Returns:
        A dict with ``format`` (a coarse label, "unknown" when nothing
        matches), ``min_date`` and ``max_date`` (raw MIN/MAX of the stored
        values) and ``common_formats`` (always empty; this function never
        fills it).
    """
    analysis = {
        "format": "unknown",
        "min_date": None,
        "max_date": None,
        "common_formats": []
    }

    # Double-quote identifiers and double any embedded quote so names
    # containing quotes or spaces stay valid SQL.
    table_sql = '"' + table_name.replace('"', '""') + '"'
    column_sql = '"' + col_name.replace('"', '""') + '"'

    # Get min/max
    cursor.execute(f"""
        SELECT MIN({column_sql}), MAX({column_sql})
        FROM {table_sql}
        WHERE {column_sql} IS NOT NULL
    """)
    min_max = cursor.fetchone()
    if min_max:
        analysis["min_date"] = min_max[0]
        analysis["max_date"] = min_max[1]

    # Detect format from the first few string samples
    probe = [s for s in sample_values if isinstance(s, str) and s][:5]
    if probe:
        # The datetime test must run before the date tests: a value such
        # as "2024-01-02 10:00:00" also contains '-', so checking '-'
        # first would report it as a plain date.
        if all(' ' in s and ':' in s for s in probe):
            analysis["format"] = "DATETIME"
        elif all('-' in s for s in probe):
            if all(len(s.split('-')[0]) == 4 for s in probe):
                analysis["format"] = "YYYY-MM-DD"
            else:
                analysis["format"] = "DD-MM-YYYY or MM-DD-YYYY"
        elif all('/' in s for s in probe):
            analysis["format"] = "MM/DD/YYYY or DD/MM/YYYY"
        elif all(':' in s and '-' not in s and '/' not in s for s in probe):
            analysis["format"] = "TIME"

    return analysis

def main():
    """Profile the default database, echo the JSON report, and save a copy.

    The report is printed to stdout and written to
    ``tool_output/data_profile.json``; the directory is created if absent.
    """
    # Serialize once; values json cannot encode natively fall back to str().
    report = json.dumps(profile_database(), indent=2, default=str)
    print(report)

    target_dir = Path("tool_output")
    target_dir.mkdir(exist_ok=True)

    with open(target_dir / "data_profile.json", 'w') as handle:
        handle.write(report)

# Run the profiler only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()