#!/usr/bin/env python3
"""
Generate comprehensive statistics about the database.
"""

import sqlite3
import json
import sys
from pathlib import Path
from collections import Counter

def generate_statistics(db_path="database.sqlite"):
    """Generate comprehensive database statistics.

    Opens the SQLite database at ``db_path``, enumerates its user tables and
    delegates to the analysis helpers in this module.

    Args:
        db_path: Path to the SQLite database file.

    Returns:
        On success, a dict with the keys ``"database_size"``,
        ``"table_statistics"`` (one entry per table), ``"column_statistics"``,
        ``"data_quality"`` and ``"performance_hints"``.
        On failure, a dict with the single key ``"error"`` holding a message
        prefixed with ``"Database error: "`` (for ``sqlite3.Error``) or
        ``"Unexpected error: "`` (for anything else).
    """

    stats = {
        "database_size": {},
        "table_statistics": {},
        "column_statistics": {},
        "data_quality": {},
        "performance_hints": []
    }

    conn = None
    try:
        conn = sqlite3.connect(db_path)
        cursor = conn.cursor()

        # Get database size info
        stats["database_size"] = get_database_size(db_path)

        # Get all tables
        cursor.execute("""
            SELECT name FROM sqlite_master
            WHERE type='table'
            ORDER BY name
        """)
        # Internal tables are filtered here rather than with
        # "NOT LIKE 'sqlite_%'": in LIKE the underscore is a single-character
        # wildcard, so that pattern would also drop user tables such as
        # "sqlitecache".
        tables = [
            name for (name,) in cursor.fetchall()
            if not name.lower().startswith("sqlite_")
        ]

        # Generate statistics for each table
        for table in tables:
            stats["table_statistics"][table] = analyze_table(cursor, table)

        # Generate column-level statistics
        stats["column_statistics"] = analyze_columns_globally(cursor, tables)

        # Analyze data quality
        stats["data_quality"] = analyze_data_quality(cursor, tables)

        # Generate performance hints
        stats["performance_hints"] = generate_performance_hints(
            cursor, tables, stats["table_statistics"]
        )

    except sqlite3.Error as e:
        return {"error": f"Database error: {str(e)}"}
    except Exception as e:
        return {"error": f"Unexpected error: {str(e)}"}
    finally:
        # Close on every path (including the error returns above) so a
        # failing helper does not leak the connection.
        if conn is not None:
            conn.close()

    return stats

def get_database_size(db_path):
    """Report the on-disk size of the database file.

    Args:
        db_path: Path to the database file.

    Returns:
        An empty dict when the file does not exist. Otherwise a dict with
        ``"bytes"``, ``"kb"`` and ``"mb"`` (the latter two rounded to two
        decimals) plus a ``"category"`` of ``"small"`` (< 1 MB),
        ``"medium"`` (< 10 MB), ``"large"`` (< 100 MB) or ``"very_large"``.
        If inspecting the file raises, the message is stored under
        ``"error"`` instead of propagating.
    """
    kib = 1024
    mib = kib * kib
    # (exclusive upper bound in bytes, label), checked in ascending order.
    buckets = (
        (mib, "small"),
        (10 * mib, "medium"),
        (100 * mib, "large"),
    )

    report = {}
    try:
        target = Path(db_path)
        if not target.exists():
            return report

        n_bytes = target.stat().st_size
        report["bytes"] = n_bytes
        report["kb"] = round(n_bytes / kib, 2)
        report["mb"] = round(n_bytes / mib, 2)
        # First bucket whose bound exceeds the size wins; anything at or
        # above the last bound is "very_large".
        report["category"] = next(
            (label for bound, label in buckets if n_bytes < bound),
            "very_large",
        )
    except Exception as exc:
        report["error"] = str(exc)

    return report

def analyze_table(cursor, table_name):
    """Analyze statistics for a single table.

    Args:
        cursor: Open ``sqlite3`` cursor on the database being inspected.
        table_name: Unquoted name of the table to analyze.

    Returns:
        A dict with:
          * ``"row_count"``: number of rows.
          * ``"column_count"``: number of columns reported by
            ``PRAGMA table_info``.
          * ``"null_percentages"``: ``{column: percent NULL}`` rounded to two
            decimals.
          * ``"cardinality"``: ``{column: {"distinct_values", "selectivity"}}``
            where selectivity is distinct non-NULL values / row count,
            rounded to four decimals.
          * ``"storage_estimate"``: ``{"bytes", "kb", "mb"}``, a rough
            estimate derived from declared column types.
        The per-column maps stay empty and the estimate is zero for an empty
        table.

    Raises:
        sqlite3.Error: If any of the underlying queries fails.
    """

    def quote_ident(name):
        # Render as a double-quoted SQL identifier with embedded quotes
        # doubled, so names containing quotes or spaces cannot break (or
        # alter) the statement.
        return '"' + str(name).replace('"', '""') + '"'

    table_sql = quote_ident(table_name)

    table_stats = {
        "row_count": 0,
        "column_count": 0,
        "null_percentages": {},
        "cardinality": {},
        "storage_estimate": 0
    }

    # Get row count
    cursor.execute(f"SELECT COUNT(*) FROM {table_sql}")
    row_count = cursor.fetchone()[0]
    table_stats["row_count"] = row_count

    # Get column info
    cursor.execute(f"PRAGMA table_info({table_sql})")
    columns = cursor.fetchall()
    table_stats["column_count"] = len(columns)

    storage_bytes = 0
    if row_count > 0:
        for col in columns:
            col_name = col[1]
            col_type = (col[2] or "").upper()
            col_sql = quote_ident(col_name)

            # One scan per column: COUNT(col) skips NULLs, so the difference
            # from COUNT(*) is the NULL count.
            cursor.execute(
                f"SELECT COUNT(*) - COUNT({col_sql}), "
                f"COUNT(DISTINCT {col_sql}) FROM {table_sql}"
            )
            null_count, distinct_count = cursor.fetchone()

            table_stats["null_percentages"][col_name] = round(
                (null_count / row_count) * 100, 2
            )
            table_stats["cardinality"][col_name] = {
                "distinct_values": distinct_count,
                "selectivity": round(distinct_count / row_count, 4)
            }

            # Estimate storage per column from the declared type
            if "INT" in col_type:
                col_bytes = 4
            elif "REAL" in col_type or "FLOAT" in col_type:
                col_bytes = 8
            elif "TEXT" in col_type or "CHAR" in col_type:
                # Average length over every non-NULL value (an aggregate
                # yields a single row, so there is nothing to LIMIT).
                cursor.execute(
                    f"SELECT AVG(LENGTH({col_sql})) FROM {table_sql} "
                    f"WHERE {col_sql} IS NOT NULL"
                )
                avg_len = cursor.fetchone()[0]
                # Fall back to 10 when the column is all NULL / all empty.
                col_bytes = avg_len if avg_len else 10
            else:
                col_bytes = 8  # Default

            storage_bytes += col_bytes * row_count

    # Convert storage estimate to readable format
    table_stats["storage_estimate"] = {
        "bytes": storage_bytes,
        "kb": round(storage_bytes / 1024, 2),
        "mb": round(storage_bytes / (1024 * 1024), 2)
    }

    return table_stats

def analyze_columns_globally(cursor, tables):
    """Analyze columns across all tables.

    Args:
        cursor: Open ``sqlite3`` cursor on the database being inspected.
        tables: Iterable of unquoted table names.

    Returns:
        A JSON-serializable dict with:
          * ``"total_columns"``: number of columns over all tables.
          * ``"type_distribution"``: count per coarse type bucket
            (``INTEGER``, ``TEXT``, ``REAL``, ``DATETIME``, ``BLOB``,
            ``OTHER``), decided by substring match on the declared type.
          * ``"nullable_columns"``: columns whose ``notnull`` flag is 0.
          * ``"primary_keys"``: ``"table.column"`` for each primary-key column.
          * ``"unique_columns"``: ``"table.column"`` for every named column
            that is a member of a unique index (each member of a
            multi-column index is listed).
          * ``"common_column_names"``: the ten most frequent column names
            with their counts.

    Raises:
        sqlite3.Error: If any of the underlying PRAGMA statements fails.
    """

    def quote_ident(name):
        # Double-quoted identifier with embedded quotes doubled; needed for
        # table names and for auto-index names derived from them.
        return '"' + str(name).replace('"', '""') + '"'

    global_stats = {
        "total_columns": 0,
        "type_distribution": Counter(),
        "nullable_columns": 0,
        "primary_keys": [],
        "unique_columns": [],
        "common_column_names": Counter()
    }

    for table in tables:
        table_sql = quote_ident(table)

        cursor.execute(f"PRAGMA table_info({table_sql})")
        columns = cursor.fetchall()

        for col in columns:
            col_name = col[1]
            col_type = (col[2] or "").upper()
            nullable = col[3] == 0
            is_pk = col[5] > 0

            global_stats["total_columns"] += 1

            # Type distribution (first matching bucket wins)
            if "INT" in col_type:
                global_stats["type_distribution"]["INTEGER"] += 1
            elif "TEXT" in col_type or "CHAR" in col_type:
                global_stats["type_distribution"]["TEXT"] += 1
            elif "REAL" in col_type or "FLOAT" in col_type or "DOUBLE" in col_type:
                global_stats["type_distribution"]["REAL"] += 1
            elif "DATE" in col_type or "TIME" in col_type:
                global_stats["type_distribution"]["DATETIME"] += 1
            elif "BLOB" in col_type:
                global_stats["type_distribution"]["BLOB"] += 1
            else:
                global_stats["type_distribution"]["OTHER"] += 1

            # Nullable columns
            if nullable:
                global_stats["nullable_columns"] += 1

            # Primary keys
            if is_pk:
                global_stats["primary_keys"].append(f"{table}.{col_name}")

            # Common column names
            global_stats["common_column_names"][col_name] += 1

        # Check for unique constraints
        cursor.execute(f"PRAGMA index_list({table_sql})")
        indexes = cursor.fetchall()

        for idx in indexes:
            if idx[2] != 1:  # Not a unique index
                continue
            cursor.execute(f"PRAGMA index_info({quote_ident(idx[1])})")
            for idx_col in cursor.fetchall():
                # Expression / rowid members have no column name; skip them
                # instead of emitting "table.None".
                if idx_col[2] is not None:
                    global_stats["unique_columns"].append(
                        f"{table}.{idx_col[2]}"
                    )

    # Convert Counters to dict for JSON serialization
    global_stats["type_distribution"] = dict(global_stats["type_distribution"])

    # Get top common column names
    top_common = dict(global_stats["common_column_names"].most_common(10))
    global_stats["common_column_names"] = top_common

    return global_stats

def analyze_data_quality(cursor, tables):
    """Analyze data quality issues.

    Args:
        cursor: Open ``sqlite3`` cursor on the database being inspected.
        tables: Iterable of unquoted table names.

    Returns:
        A dict with:
          * ``"empty_tables"``: names of tables with zero rows.
          * ``"high_null_columns"``: ``{"column", "null_percentage"}`` for
            columns that are more than 50% NULL.
          * ``"low_cardinality_columns"``: ``{"column", "distinct_values",
            "total_rows"}`` for columns with exactly one distinct non-NULL
            value in tables of more than 10 rows.
          * ``"potential_issues"``: human-readable summary strings derived
            from the three lists above.

    Raises:
        sqlite3.Error: If any of the underlying queries fails.
    """

    def quote_ident(name):
        # Double-quoted identifier with embedded quotes doubled, so unusual
        # table/column names cannot break the generated SQL.
        return '"' + str(name).replace('"', '""') + '"'

    quality = {
        "empty_tables": [],
        "high_null_columns": [],
        "low_cardinality_columns": [],
        "potential_issues": []
    }

    for table in tables:
        table_sql = quote_ident(table)

        # Check for empty tables
        cursor.execute(f"SELECT COUNT(*) FROM {table_sql}")
        row_count = cursor.fetchone()[0]

        if row_count == 0:
            quality["empty_tables"].append(table)
            continue

        # Check columns
        cursor.execute(f"PRAGMA table_info({table_sql})")
        columns = cursor.fetchall()

        for col in columns:
            col_name = col[1]
            col_sql = quote_ident(col_name)

            # One scan per column: COUNT(col) skips NULLs, so the difference
            # from COUNT(*) is the NULL count.
            cursor.execute(
                f"SELECT COUNT(*) - COUNT({col_sql}), "
                f"COUNT(DISTINCT {col_sql}) FROM {table_sql}"
            )
            null_count, distinct_count = cursor.fetchone()

            # Check for high null percentage
            null_percentage = (null_count / row_count) * 100
            if null_percentage > 50:
                quality["high_null_columns"].append({
                    "column": f"{table}.{col_name}",
                    "null_percentage": round(null_percentage, 2)
                })

            # Check for low cardinality
            if distinct_count == 1 and row_count > 10:
                quality["low_cardinality_columns"].append({
                    "column": f"{table}.{col_name}",
                    "distinct_values": distinct_count,
                    "total_rows": row_count
                })

    # Identify potential issues
    if quality["empty_tables"]:
        quality["potential_issues"].append(
            f"Found {len(quality['empty_tables'])} empty tables"
        )

    if len(quality["high_null_columns"]) > 5:
        quality["potential_issues"].append(
            f"Many columns ({len(quality['high_null_columns'])}) have >50% null values"
        )

    if quality["low_cardinality_columns"]:
        quality["potential_issues"].append(
            f"Found {len(quality['low_cardinality_columns'])} columns with only one distinct value"
        )

    return quality

def generate_performance_hints(cursor, tables, table_stats):
    """Generate performance optimization hints.

    Args:
        cursor: Open ``sqlite3`` cursor on the database being inspected.
        tables: Iterable of unquoted table names.
        table_stats: Mapping of table name to a stats dict containing at
            least ``"row_count"`` and optionally ``"cardinality"``
            (``{column: {"selectivity": float, ...}}``). Tables missing from
            this mapping are skipped.

    Returns:
        A list of hint dicts, each with ``"type"``, ``"suggestion"`` and
        ``"priority"``, in this order:
          * per table: ``"missing_index"`` (more than 10000 rows and no
            entries in ``PRAGMA index_list``), then one ``"index_candidate"``
            per column with selectivity > 0.8 that is neither part of an
            index nor of the primary key (tables with more than 1000 rows);
          * one ``"aggregation_candidate"`` listing tables with more than
            50000 rows, if any;
          * two static ``"sqlite_tip"`` entries.

    Raises:
        sqlite3.Error: If any of the underlying PRAGMA statements fails.
    """

    def quote_ident(name):
        # Double-quoted identifier with embedded quotes doubled, so unusual
        # table/index names cannot break the generated SQL.
        return '"' + str(name).replace('"', '""') + '"'

    def indexed_columns(table):
        # Lower-cased names of columns that are part of the primary key or
        # of any index. Reading the schema through PRAGMAs (instead of
        # pattern-matching the CREATE INDEX text) also sees the automatic
        # indexes behind PRIMARY KEY / UNIQUE constraints, whose stored SQL
        # is NULL, and avoids substring matches such as "id" in "user_id".
        table_sql = quote_ident(table)
        covered = set()

        cursor.execute(f"PRAGMA table_info({table_sql})")
        for col in cursor.fetchall():
            if col[5] > 0:  # member of the primary key
                covered.add(col[1].lower())

        cursor.execute(f"PRAGMA index_list({table_sql})")
        for idx in cursor.fetchall():
            cursor.execute(f"PRAGMA index_info({quote_ident(idx[1])})")
            for idx_col in cursor.fetchall():
                # Expression / rowid members carry no column name.
                if idx_col[2] is not None:
                    covered.add(idx_col[2].lower())

        return covered

    hints = []

    # Check for missing indexes on large tables
    for table in tables:
        if table not in table_stats:
            continue

        row_count = table_stats[table]["row_count"]

        if row_count > 10000:
            # Check if table has indexes
            cursor.execute(f"PRAGMA index_list({quote_ident(table)})")
            if not cursor.fetchall():
                hints.append({
                    "type": "missing_index",
                    "table": table,
                    "suggestion": f"Large table ({row_count} rows) has no indexes",
                    "priority": "high"
                })

        # Check for high cardinality columns without indexes
        if row_count > 1000:
            covered = indexed_columns(table)
            cardinality = table_stats[table].get("cardinality", {})
            for col_name, card_info in cardinality.items():
                if card_info["selectivity"] <= 0.8:
                    continue
                if col_name.lower() in covered:
                    continue
                hints.append({
                    "type": "index_candidate",
                    "table": table,
                    "column": col_name,
                    "suggestion": f"High selectivity column ({card_info['selectivity']}) could benefit from index",
                    "priority": "medium"
                })

    # Check for tables that might benefit from aggregation
    large_tables = [
        t for t in tables
        if t in table_stats and table_stats[t]["row_count"] > 50000
    ]

    if large_tables:
        hints.append({
            "type": "aggregation_candidate",
            "tables": large_tables,
            "suggestion": "Consider using GROUP BY and aggregation functions for large tables",
            "priority": "medium"
        })

    # SQLite-specific hints
    hints.append({
        "type": "sqlite_tip",
        "suggestion": "Use EXPLAIN QUERY PLAN to analyze complex queries",
        "priority": "low"
    })

    hints.append({
        "type": "sqlite_tip",
        "suggestion": "PRAGMA case_sensitive_like = false (default) makes LIKE case-insensitive",
        "priority": "low"
    })

    return hints

def main():
    """Generate statistics for the default database, print and persist them.

    The report is rendered as indented JSON (values that JSON cannot encode
    are stringified), written to stdout, and then saved to
    ``tool_output/statistics.json``, creating the directory if needed.
    """
    report = generate_statistics()

    # Render once; the same text goes to stdout and to the file.
    rendered = json.dumps(report, indent=2, default=str)
    print(rendered)

    target_dir = Path("tool_output")
    target_dir.mkdir(exist_ok=True)

    with open(target_dir / "statistics.json", 'w') as out:
        out.write(rendered)

if __name__ == "__main__":
    main()