#!/usr/bin/env python3
"""
Unified Analyzer - Combines schema validation and consistency checking
Streamlined version focusing on critical information
"""

import sqlite3
import json
import os
from typing import Dict, List, Any, Set

def connect_db(db_path: str) -> sqlite3.Connection:
    """Open a SQLite connection.

    Args:
        db_path: Path (or any target accepted by ``sqlite3.connect``) of the
            database to open.

    Returns:
        The connection object produced by ``sqlite3.connect``.
    """
    connection = sqlite3.connect(db_path)
    return connection

def _quote_identifier(name: str) -> str:
    """Return *name* as a double-quoted SQL identifier, doubling embedded quotes."""
    return '"' + name.replace('"', '""') + '"'


def _looks_like_identifier(col_name: str) -> bool:
    """Return True for column names following the ``id`` / ``*_id`` / ``*ID`` convention."""
    return col_name == 'id' or col_name.endswith(('_id', 'ID'))


def _strip_id_suffix(col_name: str) -> str:
    """Remove a single trailing ``_id`` or ``ID`` from *col_name*, if present."""
    for suffix in ('_id', 'ID'):
        if col_name.endswith(suffix):
            return col_name[:-len(suffix)]
    return col_name


def analyze_schema(conn: sqlite3.Connection) -> Dict[str, Any]:
    """Profile every table listed in ``sqlite_master`` and flag schema patterns.

    For each table the function reads ``PRAGMA table_info`` and
    ``PRAGMA foreign_key_list``, counts rows, and profiles every column
    (distinct count, NULL count, up to five sample values, name-based
    heuristics).  Table and column names are quoted before being placed in
    SQL so that reserved words and names containing spaces are handled.

    Args:
        conn: An open SQLite connection.

    Returns:
        A dict with four keys:

        * ``schema`` - table name -> table info (``columns``,
          ``foreign_keys``, ``row_count``, ``primary_key``,
          ``likely_lookup_table``, ``likely_junction_table``).
        * ``column_mappings`` - column name -> list of tables containing it.
        * ``potential_issues`` - list of ``common_join_column`` and
          ``missing_foreign_key`` findings.
        * ``summary`` - table count, junction/lookup table names and the
          total number of declared foreign keys.

    Raises:
        sqlite3.Error: If listing tables, reading table metadata, or counting
            rows fails.  Failures while profiling an individual column are
            tolerated and leave that column's statistics at their defaults.
    """
    cursor = conn.cursor()

    # Get all tables
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
    tables = [row[0] for row in cursor.fetchall()]

    schema = {}
    column_mappings = {}
    potential_issues = []

    for table in tables:
        quoted_table = _quote_identifier(table)

        # Get detailed column info
        cursor.execute(f"PRAGMA table_info({quoted_table})")
        columns = cursor.fetchall()

        # Get foreign keys
        cursor.execute(f"PRAGMA foreign_key_list({quoted_table})")
        foreign_keys = cursor.fetchall()

        # Get row count for cardinality estimation
        cursor.execute(f"SELECT COUNT(*) FROM {quoted_table}")
        row_count = cursor.fetchone()[0]

        table_info = {
            'columns': [],
            'foreign_keys': [],
            'row_count': row_count,
            'primary_key': None,
            'likely_lookup_table': False,
            'likely_junction_table': False
        }

        # Process columns; each row is read positionally:
        # [1] name, [2] declared type, [3] NOT NULL flag, [5] primary-key flag.
        for col in columns:
            col_name = col[1]
            col_type = col[2]
            is_nullable = not col[3]
            is_primary = bool(col[5])
            quoted_col = _quote_identifier(col_name)
            lowered_name = col_name.lower()

            # Defaults are kept when profiling fails for this column.
            sample_values = []
            distinct_count = 0
            null_count = 0

            try:
                # Get distinct count and nulls
                cursor.execute(
                    f"SELECT COUNT(DISTINCT {quoted_col}), "
                    f"COUNT(*) - COUNT({quoted_col}) FROM {quoted_table}"
                )
                result = cursor.fetchone()
                if result:
                    distinct_count, null_count = result[0], result[1]

                # Get sample values
                cursor.execute(
                    f"SELECT DISTINCT {quoted_col} FROM {quoted_table} "
                    f"WHERE {quoted_col} IS NOT NULL LIMIT 5"
                )
                sample_values = [row[0] for row in cursor.fetchall()]
            except sqlite3.Error:
                # Column profiling is best effort: a column that cannot be
                # queried must not abort the analysis of the whole database.
                pass

            column_info = {
                'name': col_name,
                'type': col_type,
                'nullable': is_nullable,
                'primary_key': is_primary,
                'distinct_count': distinct_count,
                'null_count': null_count,
                # max(..., 1) avoids division by zero on empty tables
                'null_percentage': (null_count / max(row_count, 1)) * 100,
                'sample_values': sample_values,
                'likely_identifier': _looks_like_identifier(col_name),
                'likely_name_column': 'name' in lowered_name or 'title' in lowered_name,
                'likely_date_column': 'date' in lowered_name or 'time' in lowered_name or col_type == 'DATE',
                'likely_amount_column': any(x in lowered_name for x in ['amount', 'price', 'cost', 'total', 'sum'])
            }

            table_info['columns'].append(column_info)

            # When several columns are flagged as primary key, the last one
            # seen wins (the report stores a single column name).
            if is_primary:
                table_info['primary_key'] = col_name

            # Track column name mappings across tables
            column_mappings.setdefault(col_name, []).append(table)

        # Process foreign keys; each row is read positionally:
        # [2] referenced table, [3] local column, [4] referenced column.
        for fk in foreign_keys:
            table_info['foreign_keys'].append({
                'column': fk[3],
                'references_table': fk[2],
                'references_column': fk[4]
            })

        # Determine table type: a junction table is mostly foreign keys
        # (at most two extra columns); a lookup table is small and has a
        # name/title-like column.
        if len(foreign_keys) >= 2 and len(columns) <= len(foreign_keys) + 2:
            table_info['likely_junction_table'] = True
        elif row_count < 1000 and any(c['likely_name_column'] for c in table_info['columns']):
            table_info['likely_lookup_table'] = True

        schema[table] = table_info

    # Identify potential issues and patterns

    # Find identifier-like columns that appear in multiple tables (potential joins)
    for col_name, tables_list in column_mappings.items():
        if len(tables_list) > 1 and _looks_like_identifier(col_name):
            potential_issues.append({
                'type': 'common_join_column',
                'column': col_name,
                'tables': tables_list,
                'recommendation': f"Potential join column - verify relationships between {', '.join(tables_list)}"
            })

    # Check for missing foreign key definitions
    for table, info in schema.items():
        declared_fk_columns = {fk['column'] for fk in info['foreign_keys']}
        for col in info['columns']:
            if not col['likely_identifier'] or col['primary_key']:
                continue
            if col['name'] == 'id' or col['name'] in declared_fk_columns:
                continue

            # Try to find a table named like the column minus its id suffix.
            # Only the suffix is removed, so "grid_idx_id" maps to "grid_idx".
            potential_ref = _strip_id_suffix(col['name'])
            if potential_ref in schema:
                potential_issues.append({
                    'type': 'missing_foreign_key',
                    'table': table,
                    'column': col['name'],
                    'likely_references': potential_ref,
                    'recommendation': f"Column {col['name']} likely references {potential_ref} table"
                })

    return {
        'schema': schema,
        'column_mappings': column_mappings,
        'potential_issues': potential_issues,
        'summary': {
            'total_tables': len(schema),
            'junction_tables': [t for t, info in schema.items() if info['likely_junction_table']],
            'lookup_tables': [t for t, info in schema.items() if info['likely_lookup_table']],
            'total_foreign_keys': sum(len(info['foreign_keys']) for info in schema.values())
        }
    }

def generate_column_reference(schema_analysis: Dict) -> List[Dict]:
    """Build a flat, name-sorted lookup list of every column in the schema.

    Args:
        schema_analysis: Analysis dict whose ``'schema'`` entry maps table
            names to table info containing a ``'columns'`` list.

    Returns:
        One entry per column with its table, type, nullability, distinct
        count, NULL percentage, up to three sample values and a list of
        pattern labels, ordered by column name.
    """
    # Column flag -> label, in the order labels are emitted.
    flag_labels = (
        ('likely_identifier', 'IDENTIFIER'),
        ('likely_name_column', 'NAME'),
        ('likely_date_column', 'DATE'),
        ('likely_amount_column', 'AMOUNT'),
        ('primary_key', 'PRIMARY_KEY'),
    )

    entries = []
    for table_name, table_info in schema_analysis['schema'].items():
        for column in table_info['columns']:
            samples = column['sample_values']
            entries.append({
                'column': column['name'],
                'table': table_name,
                'type': column['type'],
                'nullable': column['nullable'],
                'distinct_values': column['distinct_count'],
                'null_percentage': column['null_percentage'],
                'samples': samples[:3] if samples else [],
                'patterns': [label for flag, label in flag_labels if column[flag]],
            })

    # Stable sort by column name for easy lookup
    return sorted(entries, key=lambda item: item['column'])

def generate_validation_rules(schema_analysis: Dict) -> Dict:
    """Derive join, column and aggregation rules from a schema analysis.

    Args:
        schema_analysis: Analysis dict whose ``'schema'`` entry maps table
            names to table info with ``columns``, ``foreign_keys``,
            ``row_count`` and ``likely_junction_table``.

    Returns:
        A dict with three lists:

        * ``join_validations`` - one rule per declared foreign key.
        * ``column_validations`` - warnings for columns that are more than
          50% NULL and hints for low-cardinality columns (fewer than 20
          distinct values in tables with more than 100 rows).
        * ``aggregation_hints`` - per table, the numeric columns and, for
          non-junction tables, the identifier columns.
    """
    rules = {
        'join_validations': [],
        'column_validations': [],
        'aggregation_hints': []
    }

    schema = schema_analysis['schema']

    # Substrings of a declared type that mark a column as numeric.  The
    # declared type is upper-cased before matching because SQLite type names
    # are case-insensitive ("integer" and "INTEGER" are the same type).
    numeric_markers = ('INT', 'REAL', 'FLOA', 'DOUB', 'NUMERIC', 'DECIMAL')

    # Join validation rules
    for table, info in schema.items():
        for fk in info['foreign_keys']:
            ref_table = fk['references_table']
            ref_column = fk['references_column']
            if ref_column is None:
                # No explicit parent column recorded for this foreign key:
                # fall back to the referenced table's primary key when the
                # analysis knows it.
                fallback = schema.get(ref_table, {}).get('primary_key')
                if fallback is not None:
                    ref_column = fallback
            rules['join_validations'].append({
                'rule': f"JOIN_{table}_{ref_table}",
                'from': table,
                'to': ref_table,
                'condition': f"{table}.{fk['column']} = {ref_table}.{ref_column}",
                'confidence': 'HIGH'
            })

    # Column validation rules
    for table, info in schema.items():
        for col in info['columns']:
            # High null columns need special handling
            if col['null_percentage'] > 50:
                rules['column_validations'].append({
                    'table': table,
                    'column': col['name'],
                    'warning': f"High NULL rate ({col['null_percentage']:.1f}%) - consider using COALESCE or IS NULL checks"
                })

            # Low cardinality columns might be categories
            if 0 < col['distinct_count'] < 20 and info['row_count'] > 100:
                rules['column_validations'].append({
                    'table': table,
                    'column': col['name'],
                    'hint': f"Low cardinality ({col['distinct_count']} values) - good for GROUP BY or filtering",
                    'values': col['sample_values']
                })

    # Aggregation hints
    for table, info in schema.items():
        numeric_cols = [
            c for c in info['columns']
            if any(marker in (c['type'] or '').upper() for marker in numeric_markers)
        ]
        id_cols = [c for c in info['columns'] if c['likely_identifier']]

        if numeric_cols:
            rules['aggregation_hints'].append({
                'table': table,
                'numeric_columns': [c['name'] for c in numeric_cols],
                'aggregatable': True
            })

        # Junction tables are skipped: their id columns are link keys.
        if id_cols and not info['likely_junction_table']:
            rules['aggregation_hints'].append({
                'table': table,
                'count_distinct_columns': [c['name'] for c in id_cols],
                'hint': 'Use COUNT(DISTINCT col) for counting unique entities'
            })

    return rules

def main(db_path: str = "./database.sqlite", output_dir: str = "./tool_output") -> None:
    """Analyze a SQLite database and write a unified JSON report.

    Runs the schema analysis, builds the column reference and validation
    rules, writes everything to ``unified_analysis.json`` inside
    ``output_dir`` and prints a short summary.

    Args:
        db_path: Path of the SQLite database to analyze.
        output_dir: Directory for the report; created if it does not exist.

    NOTE(review): ``sqlite3.connect`` is expected to create an empty database
    file when ``db_path`` does not exist, in which case the report would
    describe zero tables - confirm whether a missing file should be an error.
    """
    os.makedirs(output_dir, exist_ok=True)

    conn = connect_db(db_path)
    try:
        # Perform comprehensive analysis
        schema_analysis = analyze_schema(conn)
    finally:
        # Release the connection even when the analysis raises.
        conn.close()

    # These steps work on the in-memory analysis only, so the connection
    # is no longer needed.
    column_reference = generate_column_reference(schema_analysis)
    validation_rules = generate_validation_rules(schema_analysis)

    summary = schema_analysis['summary']

    # Create unified report
    unified_report = {
        'database_overview': {
            'total_tables': summary['total_tables'],
            'junction_tables': summary['junction_tables'],
            'lookup_tables': summary['lookup_tables'],
            'total_foreign_keys': summary['total_foreign_keys']
        },
        'schema_details': schema_analysis['schema'],
        'column_reference': column_reference,
        'validation_rules': validation_rules,
        'potential_issues': schema_analysis['potential_issues'],
        'column_mappings': schema_analysis['column_mappings']
    }

    # Write output; default=str stringifies values json cannot encode
    # natively (for example bytes sampled from BLOB columns).
    report_path = os.path.join(output_dir, "unified_analysis.json")
    with open(report_path, 'w', encoding='utf-8') as f:
        json.dump(unified_report, f, indent=2, default=str)

    print("Unified analysis complete")
    print(f"Analyzed {summary['total_tables']} tables")
    print(f"Found {len(schema_analysis['potential_issues'])} potential issues")
    print(f"Generated {len(validation_rules['join_validations'])} join validation rules")

# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()