#!/usr/bin/env python3
"""
Calculate statistics about the database - row counts, cardinality, distributions.
This tool is available to subagents for database analysis.
"""

import sqlite3
import json
import sys
from pathlib import Path

def _quote_identifier(name: str) -> str:
    """Return *name* as a double-quoted SQL identifier, doubling embedded quotes."""
    return '"' + name.replace('"', '""') + '"'


def _column_statistics(cursor: sqlite3.Cursor, quoted_table: str,
                       row_count: int, col: tuple) -> dict:
    """Collect the statistics for one column of a non-empty table.

    Args:
        cursor: Cursor on the open database connection.
        quoted_table: Table name already passed through ``_quote_identifier``.
        row_count: Number of rows in the table; must be greater than zero
            because it is used as a divisor.
        col: One row of ``PRAGMA table_info`` output. Index 1 is read as the
            column name, 2 as the declared type, 3 as the NOT NULL flag and
            5 as the primary-key flag.

    Returns:
        Dict with ``type``, ``nullable``, ``primary_key``, ``unique_count``,
        ``null_count``, ``null_percentage``, ``cardinality``,
        ``is_categorical`` and ``is_likely_id``; plus ``value_distribution``
        for columns with 1-20 distinct values and ``min``/``max``/``avg``
        for columns whose declared type looks numeric.
    """
    col_name = col[1]
    col_type = col[2]
    quoted_col = _quote_identifier(col_name)

    col_stats = {
        'type': col_type,
        'nullable': not col[3],
        'primary_key': bool(col[5]),
    }

    # COUNT(col) skips NULLs, so COUNT(*) - COUNT(col) is the NULL count.
    cursor.execute(f"""
        SELECT
            COUNT(DISTINCT {quoted_col}) as unique_count,
            COUNT(*) - COUNT({quoted_col}) as null_count
        FROM {quoted_table}
    """)
    unique_count, null_count = cursor.fetchone()

    col_stats['unique_count'] = unique_count
    col_stats['null_count'] = null_count
    col_stats['null_percentage'] = null_count / row_count * 100
    col_stats['cardinality'] = unique_count / row_count

    # For columns with low cardinality, get the value distribution.
    if 0 < unique_count <= 20:
        cursor.execute(f"""
            SELECT {quoted_col}, COUNT(*) as cnt
            FROM {quoted_table}
            WHERE {quoted_col} IS NOT NULL
            GROUP BY {quoted_col}
            ORDER BY cnt DESC
            LIMIT 20
        """)
        col_stats['value_distribution'] = [
            {'value': str(value), 'count': count}
            for value, count in cursor.fetchall()
        ]
        col_stats['is_categorical'] = True
    else:
        col_stats['is_categorical'] = False

    # Heuristic: name contains "id", or primary key, or nearly unique with no NULLs.
    col_stats['is_likely_id'] = (
        'id' in col_name.lower() or
        col_stats['primary_key'] or
        (col_stats['cardinality'] > 0.95 and null_count == 0)
    )

    # Numeric detection is a substring match on the declared column type.
    declared_type = col_type.upper()
    if any(tag in declared_type for tag in ('INT', 'REAL', 'NUMERIC')):
        try:
            cursor.execute(f"""
                SELECT MIN({quoted_col}), MAX({quoted_col}), AVG({quoted_col})
                FROM {quoted_table}
                WHERE {quoted_col} IS NOT NULL
            """)
            min_val, max_val, avg_val = cursor.fetchone()
        except sqlite3.Error:
            # Best effort: the column might not be truly numeric, so the
            # aggregates are simply omitted when the database rejects them.
            pass
        else:
            if min_val is not None:
                col_stats['min'] = min_val
                col_stats['max'] = max_val
                col_stats['avg'] = avg_val

    return col_stats


def generate_statistics(db_path: str) -> dict:
    """Generate comprehensive statistics about the database.

    Opens the SQLite file at *db_path* read-only and, for every table whose
    name does not start with the ``sqlite_`` prefix, records the row count
    and (for non-empty tables) per-column statistics.

    Args:
        db_path: Filesystem path of the SQLite database file.

    Returns:
        ``{'database_size': <bytes>, 'tables': {<table>: {'row_count': int,
        'columns': {<column>: {...}}}}}``. Empty tables get an empty
        ``columns`` dict.

    Raises:
        sqlite3.Error: If the file cannot be opened or queried as a database.
        OSError: If the file size cannot be read.
    """
    # Build the URI from the path object so characters such as '#', '?' and
    # '%' are percent-encoded instead of being parsed as URI syntax.
    db_uri = Path(db_path).absolute().as_uri() + '?mode=ro'
    conn = sqlite3.connect(db_uri, uri=True)
    try:
        cursor = conn.cursor()

        stats = {
            'database_size': Path(db_path).stat().st_size,
            'tables': {},
        }

        # '_' is a LIKE wildcard; escape it so only names that really start
        # with "sqlite_" are skipped.
        cursor.execute(
            r"SELECT name FROM sqlite_master "
            r"WHERE type='table' AND name NOT LIKE 'sqlite\_%' ESCAPE '\'"
        )
        tables = [row[0] for row in cursor.fetchall()]

        for table in tables:
            quoted_table = _quote_identifier(table)

            cursor.execute(f'SELECT COUNT(*) FROM {quoted_table}')
            row_count = cursor.fetchone()[0]

            table_stats = {'row_count': row_count, 'columns': {}}
            stats['tables'][table] = table_stats

            # Column statistics divide by row_count, so empty tables are skipped.
            if row_count > 0:
                cursor.execute(f'PRAGMA table_info({quoted_table})')
                for col in cursor.fetchall():
                    table_stats['columns'][col[1]] = _column_statistics(
                        cursor, quoted_table, row_count, col
                    )

        return stats
    finally:
        # Release the connection even when a query or stat() call fails.
        conn.close()

def main():
    """Command-line entry point: print database statistics as JSON.

    Expects exactly one argument, the path to a SQLite database file. On
    success the statistics are printed as indented JSON, with values JSON
    cannot encode natively rendered through ``str``. On a usage error, a
    missing file, or a file that cannot be read as a database, a message is
    printed and the process exits with status 1.
    """
    if len(sys.argv) != 2:
        print("Usage: python generate_statistics.py <database.sqlite>")
        sys.exit(1)

    db_path = sys.argv[1]
    if not Path(db_path).exists():
        print(f"Error: Database not found: {db_path}")
        sys.exit(1)

    try:
        stats = generate_statistics(db_path)
    except (sqlite3.Error, OSError) as exc:
        # The path exists but is not a readable SQLite database (corrupt
        # file, directory, permission problem): report it in the same
        # format as the other errors instead of dumping a traceback.
        print(f"Error: Could not read database {db_path}: {exc}")
        sys.exit(1)

    print(json.dumps(stats, indent=2, default=str))

# Run the command-line entry point only when this file is executed as a
# script, so importing the module has no side effects.
if __name__ == "__main__":
    main()