#!/usr/bin/env python3
"""
Comprehensive Database Analyzer
Performs automated analysis of database structure, values, and patterns.
Outputs structured JSON for the agent to use.
"""

import sqlite3
import json
import re
import os
from collections import defaultdict, Counter
from datetime import datetime

def connect_db(db_path='./database.sqlite'):
    """Open a connection to the SQLite database.

    Args:
        db_path: Path handed to ``sqlite3.connect``. Defaults to
            ``'./database.sqlite'`` so existing zero-argument callers keep
            their behavior.

    Returns:
        An open ``sqlite3.Connection``; the caller is responsible for
        closing it.
    """
    return sqlite3.connect(db_path)

def get_table_info(conn):
    """Collect row counts, column metadata and foreign keys for every table.

    Args:
        conn: An open ``sqlite3`` connection.

    Returns:
        A dict keyed by table name (in ``ORDER BY name`` order). Each value
        is a dict with:
            ``row_count``: result of ``SELECT COUNT(*)`` on the table.
            ``columns``: one dict per ``PRAGMA table_info`` row with the keys
                ``name``, ``type``, ``nullable``, ``default`` and
                ``primary_key``.
            ``foreign_keys``: one dict per ``PRAGMA foreign_key_list`` row
                with the keys ``column``, ``ref_table`` and ``ref_column``.
    """
    cursor = conn.cursor()
    try:
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name")
        tables = [row[0] for row in cursor.fetchall()]

        table_info = {}
        for table in tables:
            # Identifiers cannot be bound as SQL parameters, so the name is
            # quoted by hand. Doubling an embedded backtick keeps a name such
            # as "we`ird" from terminating the quoted identifier early.
            quoted = "`" + table.replace("`", "``") + "`"

            # Get row count
            cursor.execute(f"SELECT COUNT(*) FROM {quoted}")
            row_count = cursor.fetchone()[0]

            # Get column info; rows are (cid, name, type, notnull, dflt_value, pk)
            cursor.execute(f"PRAGMA table_info({quoted})")
            columns = cursor.fetchall()

            # Get foreign keys; rows are (id, seq, table, from, to, ...)
            cursor.execute(f"PRAGMA foreign_key_list({quoted})")
            foreign_keys = cursor.fetchall()

            table_info[table] = {
                'row_count': row_count,
                'columns': [
                    {
                        'name': col[1],
                        'type': col[2],
                        'nullable': not col[3],
                        'default': col[4],
                        # pk is the 1-based position within the primary key,
                        # or 0 when the column is not part of it.
                        'primary_key': bool(col[5]),
                    } for col in columns
                ],
                'foreign_keys': [
                    {
                        'column': fk[3],
                        'ref_table': fk[2],
                        'ref_column': fk[4],
                    } for fk in foreign_keys
                ],
            }
    finally:
        cursor.close()

    return table_info

def analyze_column_values(conn, table, column, limit=100):
    """Summarize the values stored in one column.

    Args:
        conn: An open ``sqlite3`` connection.
        table: Name of the table to query.
        column: Name of the column to analyze.
        limit: Maximum number of distinct non-NULL values to sample for
            pattern detection.

    Returns:
        A dict with:
            ``unique_count``: ``COUNT(DISTINCT column)`` over the table.
            ``null_count``: number of rows where the column IS NULL.
            ``sample_values``: the first 10 sampled distinct values.
            ``patterns``: result of ``detect_patterns`` on the full sample.
    """
    cursor = conn.cursor()
    try:
        # Identifiers cannot be bound as parameters; double any embedded
        # backtick so unusual names cannot break out of the quoting.
        tbl = "`" + table.replace("`", "``") + "`"
        col = "`" + column.replace("`", "``") + "`"

        # Get sample values (the limit is a value, so it is bound normally)
        cursor.execute(
            f"SELECT DISTINCT {col} FROM {tbl} WHERE {col} IS NOT NULL LIMIT ?",
            (int(limit),),
        )
        sample_values = [row[0] for row in cursor.fetchall()]

        # Get unique count
        cursor.execute(f"SELECT COUNT(DISTINCT {col}) FROM {tbl}")
        unique_count = cursor.fetchone()[0]

        # Get null count
        cursor.execute(f"SELECT COUNT(*) FROM {tbl} WHERE {col} IS NULL")
        null_count = cursor.fetchone()[0]
    finally:
        cursor.close()

    return {
        'unique_count': unique_count,
        'null_count': null_count,
        'sample_values': sample_values[:10],  # Limit samples in output
        'patterns': detect_patterns(sample_values)
    }

def detect_patterns(values):
    """Detect common formatting patterns in a list of sampled values.

    Every non-None value is converted with ``str`` first. The date, currency,
    ID/code and percentage checks look only at the first 10 converted values;
    the categorical check looks at all of them.

    Args:
        values: Iterable-like list of raw column values (any types).

    Returns:
        A dict that may contain:
            ``date_format``: label of the first date regex matched by any
                inspected value.
            ``currency_format``: label of the first currency regex matched by
                any inspected value.
            ``type``: ``'ID/Code'`` when every inspected value consists of
                upper-case letters, digits, ``_`` or ``-``; overwritten with
                ``'Percentage (decimal)'`` when the numeric-looking inspected
                values all lie in [0, 1].
            ``categorical``: the sorted distinct values when there are at
                most three of them.
        An empty dict is returned when there is nothing to inspect.
    """
    if not values:
        return {}

    patterns = {}

    # Convert all to strings for pattern detection
    str_values = [str(v) for v in values if v is not None]

    if not str_values:
        return patterns

    head = str_values[:10]

    # Date patterns (first matching entry wins)
    date_patterns = [
        (r'^\d{4}-\d{2}-\d{2}$', 'YYYY-MM-DD'),
        (r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', 'YYYY-MM-DD HH:MM:SS'),
        (r'^\d{2}/\d{2}/\d{4}$', 'MM/DD/YYYY'),
        (r'^\d{1,2}/\d{1,2}/\d{2,4}$', 'M/D/YY or MM/DD/YYYY')
    ]

    for pattern, name in date_patterns:
        if any(re.match(pattern, v) for v in head):
            patterns['date_format'] = name
            break

    # Currency patterns (first matching entry wins)
    currency_patterns = [
        (r'^[A-Z]{2,3}\$[\d,]+\.?\d*$', 'Currency with symbol prefix'),
        (r'^\$[\d,]+\.?\d*$', 'Dollar amount'),
        (r'^[\d,]+\.\d{2}$', 'Decimal currency')
    ]

    for pattern, name in currency_patterns:
        if any(re.match(pattern, v) for v in head):
            patterns['currency_format'] = name
            break

    # Check if it's an ID/code column
    if all(re.match(r'^[A-Z0-9_-]+$', v) for v in head):
        patterns['type'] = 'ID/Code'

    # Check for percentage (decimal between 0 and 1)
    try:
        numeric_values = [
            float(v) for v in head
            if v.replace('.', '').replace('-', '').isdigit()
        ]
    except ValueError:
        # The digit filter lets through strings such as '2020-01-01' or
        # '1.2.3' that float() rejects; in that case the percentage check is
        # skipped as a whole. Only ValueError is caught so that unrelated
        # failures (and KeyboardInterrupt) are no longer silently swallowed.
        numeric_values = []
    if numeric_values and all(0 <= v <= 1 for v in numeric_values):
        patterns['type'] = 'Percentage (decimal)'

    # Check for boolean-like values
    unique_values = set(str_values)
    if len(unique_values) <= 3:
        # Sorted so the output does not depend on set iteration order, which
        # changes between interpreter runs under string hash randomization.
        patterns['categorical'] = sorted(unique_values)

    return patterns

def determine_ownership(table_info):
    """Split each table's columns into owned and referencing columns.

    A column listed as the source of one of the table's foreign keys counts
    as a reference. Every other column is owned by the table, except a
    primary-key column that is literally named ``id``, which is reported in
    neither list.

    Args:
        table_info: Mapping of table name to a dict holding ``columns``
            (dicts with ``name`` and ``primary_key``) and ``foreign_keys``
            (dicts with ``column``).

    Returns:
        Mapping of table name to ``{'owns': [...], 'references': [...]}``,
        with column names in their original order.
    """
    def _classify(details):
        # Names of the columns that act as the source side of a foreign key.
        fk_names = set()
        for fk in details['foreign_keys']:
            fk_names.add(fk['column'])

        owns, refs = [], []
        for column in details['columns']:
            name = column['name']
            if name in fk_names:
                refs.append(name)
                continue
            if column['primary_key'] and name == 'id':
                # Plain surrogate key: neither owned data nor a reference.
                continue
            owns.append(name)

        return {'owns': owns, 'references': refs}

    return {name: _classify(details) for name, details in table_info.items()}

def identify_relationships(table_info):
    """Build one relationship record per foreign key in the schema.

    A table is treated as a junction table when it has at least two foreign
    keys and at most two columns beyond them; all relationships leaving such
    a table are labelled ``'many-to-many (junction)'``. Every other
    relationship is labelled ``'many-to-one'``.

    Args:
        table_info: Mapping of table name to a dict holding ``columns`` and
            ``foreign_keys`` (dicts with ``column``, ``ref_table`` and
            ``ref_column``).

    Returns:
        List of dicts with the keys ``from_table``, ``from_column``,
        ``to_table``, ``to_column`` and ``type``, ordered by table and then
        by foreign key.
    """
    # Decide up front which tables look like junction tables.
    junction_tables = set()
    for name, details in table_info.items():
        n_fks = len(details['foreign_keys'])
        n_cols = len(details['columns'])
        if n_fks >= 2 and n_fks >= n_cols - 2:
            junction_tables.add(name)

    links = []
    for name, details in table_info.items():
        kind = 'many-to-many (junction)' if name in junction_tables else 'many-to-one'
        links.extend(
            {
                'from_table': name,
                'from_column': fk['column'],
                'to_table': fk['ref_table'],
                'to_column': fk['ref_column'],
                'type': kind,
            }
            for fk in details['foreign_keys']
        )

    return links

def main():
    """Run the full analysis and write the results to ``./tool_output``.

    Steps: open the database via ``connect_db()``, gather table metadata,
    enrich every column of each non-empty table with value statistics,
    derive the ownership map and relationships, dump everything to
    ``./tool_output/comprehensive_analysis.json`` and finally write the
    text summary through ``create_summary``.
    """
    conn = connect_db()
    try:
        # Get basic table information
        print("Analyzing database structure...")
        table_info = get_table_info(conn)

        # Analyze sample values for each column
        print("Analyzing column values and patterns...")
        for table, info in table_info.items():
            if info['row_count'] > 0:
                for col in info['columns']:
                    col_analysis = analyze_column_values(conn, table, col['name'])
                    col.update(col_analysis)
    finally:
        # Everything below works on in-memory data only, so release the
        # connection here; the finally also covers a failing query.
        conn.close()

    # Determine ownership
    ownership = determine_ownership(table_info)

    # Identify relationships
    relationships = identify_relationships(table_info)

    # Create output directory
    os.makedirs('./tool_output', exist_ok=True)

    # Generate comprehensive analysis
    analysis = {
        'database_overview': {
            'table_count': len(table_info),
            'total_rows': sum(t['row_count'] for t in table_info.values()),
            'tables': list(table_info.keys())
        },
        'table_details': table_info,
        'ownership_map': ownership,
        'relationships': relationships,
        'analysis_timestamp': datetime.now().isoformat()
    }

    # Save to file; default=str stringifies values json cannot encode
    # natively (e.g. bytes sampled from BLOB columns).
    output_path = './tool_output/comprehensive_analysis.json'
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(analysis, f, indent=2, default=str)

    print(f"Analysis complete! Results saved to {output_path}")

    # Also create a human-readable summary
    create_summary(analysis)

def create_summary(analysis):
    """Write a human-readable summary of the analysis to a text file.

    The summary is saved as ``./tool_output/analysis_summary.txt`` (UTF-8);
    the directory is created when it does not exist yet.

    Args:
        analysis: Dict with the keys ``database_overview`` (providing
            ``table_count`` and ``total_rows``), ``ownership_map``,
            ``relationships`` and ``table_details``.

    Returns:
        None.
    """
    summary_lines = []

    summary_lines.append("=== DATABASE ANALYSIS SUMMARY ===\n")
    summary_lines.append(f"Total Tables: {analysis['database_overview']['table_count']}")
    summary_lines.append(f"Total Rows: {analysis['database_overview']['total_rows']}\n")

    summary_lines.append("=== OWNERSHIP MAP ===")
    for table, ownership in analysis['ownership_map'].items():
        # Tables that own no columns are left out of this section entirely.
        if ownership['owns']:
            summary_lines.append(f"\n{table}:")
            summary_lines.append(f"  OWNS: {', '.join(ownership['owns'])}")
            if ownership['references']:
                summary_lines.append(f"  REFERENCES: {', '.join(ownership['references'])}")

    summary_lines.append("\n=== KEY RELATIONSHIPS ===")
    for rel in analysis['relationships'][:10]:  # Limit to first 10
        summary_lines.append(f"{rel['from_table']}.{rel['from_column']} -> {rel['to_table']}.{rel['to_column']} ({rel['type']})")

    summary_lines.append("\n=== PATTERN DETECTION ===")
    for table, info in analysis['table_details'].items():
        patterns_found = False
        for col in info['columns']:
            # Columns of empty tables carry no 'patterns' key at all.
            if col.get('patterns'):
                if not patterns_found:
                    # Emit the table heading once, before its first pattern.
                    summary_lines.append(f"\n{table}:")
                    patterns_found = True
                summary_lines.append(f"  {col['name']}: {json.dumps(col['patterns'])}")

    # Save summary. The directory is ensured here so the function also works
    # when called on its own, and the encoding is pinned because table and
    # column names may contain characters the platform default cannot encode.
    os.makedirs('./tool_output', exist_ok=True)
    with open('./tool_output/analysis_summary.txt', 'w', encoding='utf-8') as f:
        f.write('\n'.join(summary_lines))

    print("Summary saved to ./tool_output/analysis_summary.txt")

# Run the analysis only when this file is executed as a script, so importing
# the module for its helper functions has no side effects.
if __name__ == "__main__":
    main()