#!/usr/bin/env python3
"""
Data Profiler - Analyzes data patterns, distributions, and quality issues
"""
import sqlite3
import json
import os
from datetime import datetime

# Upper-cased text values that mark a column as a boolean stored as text.
_BOOLEAN_TEXT_VALUES = frozenset(
    {'TRUE', 'FALSE', 'T', 'F', '0', '1', 'YES', 'NO', 'Y', 'N'}
)


def _quote_identifier(name):
    """Return *name* as a double-quoted SQL identifier with embedded quotes escaped."""
    return '"' + str(name).replace('"', '""') + '"'


def _guess_date_format(sample):
    """Guess a date format label from the separators and length of one sample value."""
    text = str(sample)
    if '-' in text and len(text) >= 8:
        # Exactly ten characters is taken to be a bare date; anything else
        # containing dashes is assumed to carry a time part as well.
        return "YYYY-MM-DD" if len(text) == 10 else "YYYY-MM-DD HH:MM:SS"
    if '/' in text:
        return "MM/DD/YYYY or DD/MM/YYYY"
    return "Unknown"


def _profile_column(cursor, table_name, col_info, row_count, profile):
    """Build the profile dict for one column of a non-empty table.

    Args:
        cursor: Open sqlite3 cursor used to run the profiling queries.
        table_name: Unquoted name of the table that owns the column.
        col_info: One row of ``PRAGMA table_info`` output; index 1 is the
            column name, 2 the declared type and 3 the NOT NULL flag.
        row_count: Total number of rows in the table (must be > 0).
        profile: Top-level profile dict; its ``special_values``,
            ``date_formats`` and ``text_patterns`` entries are updated in place.

    Returns:
        The column profile dict. If a query fails, the values gathered so far
        are kept and the message is stored under the ``"error"`` key.
    """
    col_name = col_info[1]
    col_type = col_info[2].upper()
    table_sql = _quote_identifier(table_name)
    col_sql = _quote_identifier(col_name)
    qualified_name = f"{table_name}.{col_name}"

    column_profile = {
        "type": col_type,
        "nullable": not col_info[3],
        "null_count": 0,
        "distinct_count": 0,
        "sample_values": [],
        "patterns": {},
    }

    try:
        # NULL statistics
        cursor.execute(f"SELECT COUNT(*) FROM {table_sql} WHERE {col_sql} IS NULL")
        column_profile["null_count"] = cursor.fetchone()[0]
        column_profile["null_percentage"] = (
            column_profile["null_count"] / row_count * 100
        )

        # Cardinality
        cursor.execute(f"SELECT COUNT(DISTINCT {col_sql}) FROM {table_sql}")
        column_profile["distinct_count"] = cursor.fetchone()[0]

        # Up to ten distinct non-NULL sample values
        cursor.execute(
            f"SELECT DISTINCT {col_sql} FROM {table_sql} "
            f"WHERE {col_sql} IS NOT NULL LIMIT 10"
        )
        column_profile["sample_values"] = [row[0] for row in cursor.fetchall()]

        if any(tag in col_type for tag in ('INT', 'REAL', 'FLOAT', 'NUMERIC')):
            # Numeric analysis. The aggregates always yield exactly one row;
            # min/max/avg are NULL (None) when the column has no non-NULL value.
            cursor.execute(f"""
                SELECT
                    MIN(CAST({col_sql} AS REAL)),
                    MAX(CAST({col_sql} AS REAL)),
                    AVG(CAST({col_sql} AS REAL)),
                    COUNT(CASE WHEN {col_sql} = 0 THEN 1 END),
                    COUNT(CASE WHEN {col_sql} < 0 THEN 1 END)
                FROM {table_sql}
                WHERE {col_sql} IS NOT NULL
            """)
            min_val, max_val, avg_val, zero_count, negative_count = cursor.fetchone()
            column_profile["min"] = min_val
            column_profile["max"] = max_val
            column_profile["avg"] = avg_val
            column_profile["zero_count"] = zero_count
            column_profile["negative_count"] = negative_count

            # Zeros in an ID-like column are reported as a possible NULL stand-in.
            if zero_count > 0 and 'ID' in col_name.upper():
                profile["special_values"][qualified_name] = {
                    "pattern": "Zero as NULL indicator",
                    "count": zero_count,
                }

        elif 'DATE' in col_type or 'TIME' in col_type:
            # Date/time analysis: the first sample decides the format label.
            # The samples fetched above are reused instead of re-querying.
            date_samples = column_profile["sample_values"][:5]
            if date_samples:
                profile["date_formats"].append({
                    "table": table_name,
                    "column": col_name,
                    "format": _guess_date_format(date_samples[0]),
                    "samples": date_samples[:3],
                })

        elif 'TEXT' in col_type or 'CHAR' in col_type:  # 'CHAR' also covers VARCHAR
            # Text analysis. Note that GLOB '[0-9]*' matches any value whose
            # first character is a digit, not only all-digit strings.
            cursor.execute(f"""
                SELECT
                    MAX(LENGTH({col_sql})),
                    MIN(LENGTH({col_sql})),
                    AVG(LENGTH({col_sql})),
                    COUNT(CASE WHEN {col_sql} = '' THEN 1 END),
                    COUNT(CASE WHEN {col_sql} LIKE '%@%' THEN 1 END),
                    COUNT(CASE WHEN {col_sql} LIKE '%http%' THEN 1 END),
                    COUNT(CASE WHEN {col_sql} GLOB '[0-9]*' THEN 1 END)
                FROM {table_sql}
                WHERE {col_sql} IS NOT NULL
            """)
            (max_length, min_length, avg_length, empty_count,
             email_like, url_like, numeric_string) = cursor.fetchone()
            column_profile["max_length"] = max_length
            column_profile["min_length"] = min_length
            column_profile["avg_length"] = avg_length
            column_profile["empty_count"] = empty_count

            # Pattern thresholds are fractions of the table's total row count.
            if email_like > row_count * 0.5:
                column_profile["patterns"]["likely_email"] = True
            if url_like > row_count * 0.3:
                column_profile["patterns"]["likely_url"] = True
            if numeric_string > row_count * 0.8:
                column_profile["patterns"]["numeric_string"] = True

            # Boolean-as-text check. One row more than the number of accepted
            # values is enough to decide, so the query is capped accordingly.
            cursor.execute(
                f"SELECT DISTINCT UPPER({col_sql}) FROM {table_sql} "
                f"WHERE {col_sql} IS NOT NULL "
                f"LIMIT {len(_BOOLEAN_TEXT_VALUES) + 1}"
            )
            distinct_upper = {row[0] for row in cursor.fetchall()}
            # An empty set (all-NULL column) is a subset of anything, so it
            # must be excluded explicitly to avoid a false positive.
            if distinct_upper and distinct_upper <= _BOOLEAN_TEXT_VALUES:
                column_profile["patterns"]["likely_boolean"] = True
                profile["text_patterns"][qualified_name] = {
                    "type": "boolean_text",
                    "values": column_profile["sample_values"],
                }

    except sqlite3.Error as e:
        # Best effort: keep what was gathered and record why the rest is missing.
        column_profile["error"] = str(e)

    return column_profile


def _find_quality_issues(tables):
    """Return the list of data quality issues derived from the table profiles."""
    issues = []
    for table_name, table_data in tables.items():
        for col_name, col_data in table_data.get("columns", {}).items():
            # High NULL rate
            if col_data.get("null_percentage", 0) > 50:
                issues.append({
                    "table": table_name,
                    "column": col_name,
                    "issue": "High NULL rate",
                    "null_percentage": col_data["null_percentage"],
                })

            # Suspicious numeric ranges. "max" is None for numeric columns
            # without any non-NULL value, which must not be compared to an int.
            max_val = col_data.get("max")
            if (
                max_val is not None
                and "min" in col_data
                and max_val > 1000000
                and "year" in col_name.lower()
            ):
                issues.append({
                    "table": table_name,
                    "column": col_name,
                    "issue": "Suspicious year values",
                    "range": f"{col_data['min']} to {col_data['max']}",
                })
    return issues


def _print_summary(profile):
    """Print a short human-readable summary of the profile to stdout."""
    print("=" * 60)
    print("DATA PROFILING COMPLETE")
    print("=" * 60)
    print(f"Tables profiled: {len(profile['tables'])}")
    print(f"Data quality issues found: {len(profile['data_quality_issues'])}")

    if profile["data_quality_issues"]:
        print("\nDATA QUALITY ISSUES:")
        for issue in profile["data_quality_issues"][:5]:
            print(f"  - {issue['table']}.{issue['column']}: {issue['issue']}")

    if profile["special_values"]:
        print(f"\nSPECIAL VALUES DETECTED: {len(profile['special_values'])}")
        for col, info in list(profile["special_values"].items())[:5]:
            print(f"  - {col}: {info['pattern']}")

    if profile["date_formats"]:
        print("\nDATE FORMATS DETECTED:")
        for entry in profile["date_formats"][:5]:
            print(f"  - {entry['table']}.{entry['column']}: {entry['format']}")

    print("\nResults saved to tool_output/data_profile.json")


def profile_data(db_path):
    """Profile data patterns and distributions in the database.

    Every table listed in ``sqlite_master`` is inspected. For each column of a
    non-empty table the NULL count, distinct count and sample values are
    collected, followed by numeric, date/time or text analysis chosen from the
    declared column type. The result is written to
    ``tool_output/data_profile.json`` (relative to the current working
    directory) and a summary is printed.

    Args:
        db_path: Path of the SQLite database file to profile.

    Raises:
        sqlite3.Error: If connecting or a table-level query fails. Failures of
            per-column queries are recorded in that column's ``"error"`` entry
            instead of being raised.
    """
    profile = {
        "tables": {},
        "data_quality_issues": [],
        "special_values": {},
        "date_formats": [],
        "text_patterns": {},
    }

    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()

        # Get all tables
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
        tables = [row[0] for row in cursor.fetchall()]

        for table_name in tables:
            table_sql = _quote_identifier(table_name)

            # Column metadata and row count
            cursor.execute(f"PRAGMA table_info({table_sql})")
            columns = cursor.fetchall()
            cursor.execute(f"SELECT COUNT(*) FROM {table_sql}")
            row_count = cursor.fetchone()[0]

            table_profile = {"columns": {}, "row_count": row_count}
            profile["tables"][table_name] = table_profile

            # Empty tables are recorded without any column details.
            if row_count == 0:
                continue

            for col_info in columns:
                table_profile["columns"][col_info[1]] = _profile_column(
                    cursor, table_name, col_info, row_count, profile
                )
    finally:
        # Release the connection even when a table-level query raises.
        conn.close()

    # Identify data quality issues
    profile["data_quality_issues"].extend(_find_quality_issues(profile["tables"]))

    # Create output directory and save results. default=str stringifies values
    # json cannot encode natively (for example BLOB samples returned as bytes).
    os.makedirs('tool_output', exist_ok=True)
    with open('tool_output/data_profile.json', 'w', encoding='utf-8') as f:
        json.dump(profile, f, indent=2, default=str)

    _print_summary(profile)

if __name__ == "__main__":
    # Script entry point: profile the hard-coded "database.sqlite" file.
    profile_data("database.sqlite")