#!/usr/bin/env python3
"""
Column Separator Analyzer
Detects when columns should be returned separately vs concatenated.
Focuses on name patterns, multi-part fields, and evidence interpretation.
"""

import sqlite3
import os
import json
import re

def ensure_output_dir():
    """Create the ``tool_output`` directory unless it is already present."""
    output_dir = 'tool_output'
    # exist_ok=True makes repeated calls a no-op instead of an error.
    os.makedirs(output_dir, exist_ok=True)

def analyze_column_separation(db_path):
    """Analyze every table of a SQLite database for column separation patterns.

    For each table listed in ``sqlite_master`` the column metadata is read
    with ``PRAGMA table_info`` and passed to the name, concatenation and
    time pattern checks. The static rule/template/evidence sections are then
    added and the report files are written.

    Args:
        db_path: Path handed to ``sqlite3.connect``.

    Returns:
        The ``analysis`` dict that was also written to the report files.
    """
    analysis = {
        'multi_part_columns': {},
        'name_patterns': [],
        'concatenation_guidance': [],
        'separation_rules': [],
        'common_evidence_patterns': [],
        'sqlite_functions': []
    }

    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()

        # Get all tables
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
        tables = [row[0] for row in cursor.fetchall()]

        for table in tables:
            # Double any embedded backtick so the quoted identifier stays
            # well-formed for unusual table names.
            quoted_table = table.replace('`', '``')
            cursor.execute(f"PRAGMA table_info(`{quoted_table}`)")
            columns = cursor.fetchall()

            # Check for multi-part name patterns
            check_name_patterns(table, columns, analysis)

            # Check for columns that often get concatenated
            check_concatenation_patterns(table, columns, cursor, analysis)

            # Check for time/date columns needing parsing
            check_time_patterns(table, columns, cursor, analysis)
    finally:
        # Release the connection even when a query or a check raises.
        conn.close()

    # The remaining sections are static and do not need the database.
    generate_separation_rules(analysis)
    add_sqlite_templates(analysis)
    generate_evidence_patterns(analysis)

    # Create report
    create_separation_report(analysis)

    return analysis

def check_name_patterns(table, columns, analysis):
    """Record multi-part person-name columns (first / middle / last) of a table.

    A column counts as a "first" or "last" part when its lower-cased name
    contains that word and does NOT contain ``name`` (so ``first_name`` is
    skipped); any column containing ``middle`` counts as a middle part.
    NOTE(review): confirm the ``name`` exclusion is intended.

    A pattern is recorded for every (first, last) column pair; the middle
    part is optional and is ``None`` in ``parts`` when the table has none.

    Args:
        table: Table name the columns belong to.
        columns: Rows as returned by ``PRAGMA table_info`` (index 1 is the
            column name).
        analysis: Result dict; ``multi_part_columns`` and ``name_patterns``
            are updated in place.
    """
    first_cols = []
    middle_cols = []
    last_cols = []

    for col in columns:
        original_name = col[1]
        lowered = original_name.lower()

        if 'first' in lowered and 'name' not in lowered:
            first_cols.append(original_name)
        elif 'middle' in lowered:
            middle_cols.append(original_name)
        elif 'last' in lowered and 'name' not in lowered:
            last_cols.append(original_name)

    # Both a first and a last part are required; the middle part is optional.
    if not (first_cols and last_cols):
        return

    middle_col = middle_cols[0] if middle_cols else None
    table_patterns = analysis['multi_part_columns'].setdefault(table, [])

    for first_col in first_cols:
        for last_col in last_cols:
            # Only mention parts that actually exist, so the guidance never
            # contains the literal text "None".
            present_parts = [
                part for part in (first_col, middle_col, last_col)
                if part is not None
            ]
            pattern = {
                'type': 'name',
                'parts': {
                    'first': first_col,
                    'middle': middle_col,
                    'last': last_col
                },
                'guidance': f"Return {', '.join(present_parts)} SEPARATELY unless evidence shows concatenation operator"
            }

            table_patterns.append(pattern)
            analysis['name_patterns'].append({
                'table': table,
                'pattern': pattern
            })

def check_concatenation_patterns(table, columns, cursor, analysis):
    """Flag groups of related columns that tend to be concatenated by mistake.

    Two keyword families are checked against the lower-cased column names:
    address-like parts and code/number-like parts. A guidance entry is
    appended to ``analysis['concatenation_guidance']`` for each family that
    matches at least two columns of the table. ``cursor`` is accepted for
    signature parity with the other checks and is not used here.
    """
    keyword_groups = (
        (
            ('address', 'street', 'city', 'state', 'zip', 'country'),
            'Address parts should be returned separately unless evidence specifies concatenation',
        ),
        (
            ('code', 'number', 'prefix', 'suffix'),
            'Code/number parts: check evidence for concatenation vs separation',
        ),
    )

    for keywords, guidance_text in keyword_groups:
        # Substring match, keeping the original spelling of each column name.
        matched = [
            column[1]
            for column in columns
            if any(keyword in column[1].lower() for keyword in keywords)
        ]
        if len(matched) < 2:
            continue
        analysis['concatenation_guidance'].append({
            'table': table,
            'columns': matched,
            'guidance': guidance_text
        })

def check_time_patterns(table, columns, cursor, analysis):
    """Detect colon-separated time strings in time/duration-like columns.

    For every column whose name contains ``time`` or ``duration`` up to five
    distinct non-NULL values are sampled. The first string sample containing
    ``:`` is recorded under ``analysis['time_patterns']`` (the key is created
    on first use) together with ``SUBSTR`` extraction templates. The
    templates use fixed character positions, i.e. they assume zero-padded
    two-digit fields.

    Args:
        table: Name of the table being inspected.
        columns: Rows as returned by ``PRAGMA table_info`` (index 1 is the
            column name).
        cursor: Open SQLite cursor used to sample values.
        analysis: Result dict, updated in place.
    """
    # Double embedded backticks so quoted identifiers stay well-formed.
    quoted_table = table.replace('`', '``')

    for col in columns:
        col_name = col[1]
        lowered = col_name.lower()
        if 'time' not in lowered and 'duration' not in lowered:
            continue

        quoted_col = col_name.replace('`', '``')
        try:
            cursor.execute(
                f"SELECT DISTINCT `{quoted_col}` FROM `{quoted_table}` "
                f"WHERE `{quoted_col}` IS NOT NULL LIMIT 5"
            )
            samples = [row[0] for row in cursor.fetchall()]
        except sqlite3.Error:
            # Best effort: a column that cannot be sampled is skipped, but
            # unrelated errors (and KeyboardInterrupt) are no longer hidden.
            continue

        for sample in samples:
            if not (isinstance(sample, str) and ':' in sample):
                continue

            # Two colons are treated as HH:MM:SS, anything else as HH:MM.
            has_seconds = sample.count(':') == 2
            analysis.setdefault('time_patterns', []).append({
                'table': table,
                'column': col_name,
                'format': 'HH:MM:SS' if has_seconds else 'HH:MM',
                'sample': sample,
                'extraction': {
                    'hours': "CAST(SUBSTR(col, 1, 2) AS INTEGER)",
                    'minutes': "CAST(SUBSTR(col, 4, 2) AS INTEGER)",
                    'seconds': "CAST(SUBSTR(col, 7, 2) AS INTEGER)" if has_seconds else None
                }
            })
            # One representative sample per column is enough.
            break

def generate_separation_rules(analysis):
    """Store the fixed separate-vs-concatenate rule set in ``analysis``.

    Each rule is a dict with the keys ``rule``, ``pattern``, ``action``,
    ``not`` and ``unless`` (in that order); the list replaces whatever was
    stored under ``analysis['separation_rules']``.
    """
    field_names = ('rule', 'pattern', 'action', 'not', 'unless')

    # One tuple per rule, values in the same order as ``field_names``.
    rule_rows = (
        (
            'Multi-part names',
            'full name refers to first middle last',
            'SELECT first, middle, last',
            "SELECT first || ' ' || middle || ' ' || last",
            'Evidence explicitly shows concatenation operator ||',
        ),
        (
            'List of attributes',
            'X refers to Y Z W',
            'SELECT Y, Z, W',
            'SELECT Y || Z || W',
            'Evidence shows operators between columns',
        ),
        (
            'Address components',
            'address refers to line1 line2 city state',
            'SELECT line1, line2, city, state',
            'Concatenation',
            'Evidence explicitly requests single string',
        ),
        (
            'Date/time components',
            'datetime refers to date time',
            'SELECT date, time',
            "SELECT date || ' ' || time",
            'Evidence requests combined format',
        ),
    )

    analysis['separation_rules'] = [
        dict(zip(field_names, row)) for row in rule_rows
    ]

def add_sqlite_templates(analysis):
    """Store the fixed list of SQLite expression templates in ``analysis``.

    Every entry has ``operation``, ``template`` and ``example``; an entry
    additionally carries a ``not`` key (placed before ``example``) when a
    discouraged alternative is defined for it. The list replaces whatever
    was stored under ``analysis['sqlite_functions']``.
    """
    # (operation, template, example, discouraged alternative or None)
    specs = (
        (
            'Extract year from date',
            "STRFTIME('%Y', date_column)",
            "STRFTIME('%Y', ship_date) = '2016'",
            "CAST(date_column AS DATE)",
        ),
        (
            'Extract month',
            "STRFTIME('%m', date_column)",
            "STRFTIME('%m', order_date) = '03'",
            None,
        ),
        (
            'Extract day',
            "STRFTIME('%d', date_column)",
            "STRFTIME('%d', birth_date) = '15'",
            None,
        ),
        (
            'Extract time component',
            "CAST(SUBSTR(time_col, position, length) AS INTEGER)",
            "CAST(SUBSTR(duration, 4, 2) AS INTEGER) -- minutes",
            None,
        ),
        (
            'Date comparison',
            "date_column LIKE 'YYYY%' OR STRFTIME",
            "date_column LIKE '2016%'",
            None,
        ),
    )

    function_templates = []
    for operation, template, example, discouraged in specs:
        entry = {'operation': operation, 'template': template}
        if discouraged is not None:
            entry['not'] = discouraged
        entry['example'] = example
        function_templates.append(entry)

    analysis['sqlite_functions'] = function_templates

def generate_evidence_patterns(analysis):
    """Store the fixed evidence-interpretation examples in ``analysis``.

    Each example is a dict with the keys ``evidence``, ``interpretation``,
    ``sql`` and ``warning`` (in that order); the list replaces whatever was
    stored under ``analysis['common_evidence_patterns']``.
    """
    field_names = ('evidence', 'interpretation', 'sql', 'warning')

    # One tuple per example, values in the same order as ``field_names``.
    example_rows = (
        (
            'full name refers to first middle last',
            'Return 3 separate columns',
            'SELECT first, middle, last',
            'Do NOT concatenate unless || shown',
        ),
        (
            "full name refers to first || ' ' || middle || ' ' || last",
            'Concatenate into single column',
            "SELECT first || ' ' || middle || ' ' || last",
            'Only concatenate when operators shown',
        ),
        (
            'detailed issue refers to Sub-issue',
            'Use exact column name Sub-issue',
            'SELECT `Sub-issue`',
            'Not Issue column - use exact name',
        ),
        (
            'non player/builder',
            'Could mean NOT IN or could be category',
            'Check context - may need IN not NOT IN',
            'Ambiguous negation - verify logic',
        ),
        (
            'division refers to Region',
            'Use Region column',
            "WHERE Region = 'value'",
            'Not division column - follow evidence',
        ),
    )

    analysis['common_evidence_patterns'] = [
        dict(zip(field_names, row)) for row in example_rows
    ]

def create_separation_report(analysis):
    """Write the analysis as a Markdown-style text report and a JSON dump.

    The text report goes to ``tool_output/column_separation.txt`` and the
    raw ``analysis`` dict to ``tool_output/column_separation.json``. Both
    files are written as UTF-8 because the report headings contain emoji,
    which a non-UTF-8 locale default encoding cannot represent.

    Args:
        analysis: Dict with the keys ``separation_rules``,
            ``multi_part_columns``, ``sqlite_functions`` and
            ``common_evidence_patterns``.
    """
    report = ["# Column Separation Analysis", ""]

    # Separation rules
    report.append("## 📦 Column Separation Rules")
    report.append("")
    for rule in analysis['separation_rules']:
        report.extend([
            f"### {rule['rule']}",
            f"- Pattern: `{rule['pattern']}`",
            f"- ✅ Action: `{rule['action']}`",
            f"- ❌ NOT: `{rule['not']}`",
            f"- Unless: {rule['unless']}",
            "",
        ])

    # Multi-part columns
    if analysis['multi_part_columns']:
        report.append("## 🔄 Multi-Part Columns Found")
        report.append("")
        for table, patterns in analysis['multi_part_columns'].items():
            report.append(f"### Table: {table}")
            for pattern in patterns:
                if pattern['type'] != 'name':
                    continue
                parts = pattern['parts']
                # The 'middle' key is always present but may hold None, so
                # a .get() default alone would never produce 'N/A'.
                middle = parts.get('middle') or 'N/A'
                report.append(f"- Name parts: {parts['first']}, {middle}, {parts['last']}")
                report.append(f"- {pattern['guidance']}")
            report.append("")

    # SQLite function templates
    if analysis['sqlite_functions']:
        report.append("## 🔧 SQLite Function Templates")
        report.append("")
        for func in analysis['sqlite_functions']:
            report.append(f"### {func['operation']}")
            report.append(f"- Template: `{func['template']}`")
            if 'not' in func:
                report.append(f"- NOT: `{func['not']}`")
            report.append(f"- Example: `{func['example']}`")
            report.append("")

    # Evidence patterns
    if analysis['common_evidence_patterns']:
        report.append("## 🔍 Evidence Interpretation Patterns")
        report.append("")
        for pattern in analysis['common_evidence_patterns']:
            report.extend([
                f"### Evidence: `{pattern['evidence']}`",
                f"- Interpretation: {pattern['interpretation']}",
                f"- SQL: `{pattern['sql']}`",
                f"- ⚠️ {pattern['warning']}",
                "",
            ])

    # Save report
    ensure_output_dir()
    with open('tool_output/column_separation.txt', 'w', encoding='utf-8') as f:
        f.write('\n'.join(report))

    # Save JSON (default=str stringifies any value json cannot encode)
    with open('tool_output/column_separation.json', 'w', encoding='utf-8') as f:
        json.dump(analysis, f, indent=2, default=str)

    print("Column separation analysis complete - results in tool_output/")

if __name__ == "__main__":
    import sys

    # An optional first command-line argument overrides the database path;
    # without it the script keeps analyzing "database.sqlite" as before.
    analyze_column_separation(sys.argv[1] if len(sys.argv) > 1 else "database.sqlite")