#!/usr/bin/env python3
"""
Value Precision Extractor
Extracts exact categorical values and creates lookup references for precision matching.
Addresses issues with case sensitivity and exact value requirements.
"""

import sqlite3
import json
import os
from collections import defaultdict, Counter
import re

def extract_values(db_path):
    """Profile the categorical text columns of a SQLite database.

    Every table listed in ``sqlite_master`` is scanned, then the collected
    values are post-processed into pattern warnings and lookup tables.

    Args:
        db_path: Path of the SQLite database file to open.

    Returns:
        dict with the keys ``categorical_values``, ``common_patterns``,
        ``case_sensitivity``, ``value_lookups``, ``format_warnings`` and
        ``null_columns``, filled in by the helper functions.

    Raises:
        sqlite3.Error: If the database cannot be opened or the table list
            cannot be read.
    """
    extraction = {
        'categorical_values': {},
        'common_patterns': {},
        'case_sensitivity': {},
        'value_lookups': {},
        'format_warnings': [],
        'null_columns': []
    }

    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()

        # Get all tables
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
        tables = [row[0] for row in cursor.fetchall()]

        # Extract values from each table
        for table in tables:
            extract_table_values(cursor, table, extraction)
    finally:
        # Release the connection even when a query or helper raises.
        conn.close()

    # The remaining steps work purely on the collected data, so they run
    # after the connection has been closed.
    analyze_value_patterns(extraction)
    create_value_lookups(extraction)
    detect_format_issues(extraction)

    return extraction

def _quote_identifier(name):
    """Return name as a double-quoted SQL identifier (embedded quotes doubled)."""
    return '"' + str(name).replace('"', '""') + '"'

def extract_table_values(cursor, table, extraction):
    """Collect distinct values and NULL statistics for one table.

    Only columns accepted by ``is_categorical_candidate`` are examined. For
    each of them up to 100 distinct non-NULL values are read:

    * 50 or fewer distinct values: stored under
      ``extraction['categorical_values']`` and checked for case variations.
    * more than 50: only a pattern summary of the first 50 is stored under
      ``extraction['common_patterns']``.

    Columns whose NULL share exceeds 50% are listed in
    ``extraction['null_columns']``, including columns that contain nothing
    but NULLs.

    Args:
        cursor: Open sqlite3 cursor on the database being profiled.
        table: Name of the table to inspect.
        extraction: Result dict that is updated in place.

    Returns:
        None. A column whose queries fail with ``sqlite3.Error`` is skipped
        and recorded under ``extraction['query_errors']``.
    """
    # Identifiers are quoted so names with spaces, punctuation or reserved
    # words do not break (or alter) the generated SQL.
    quoted_table = _quote_identifier(table)
    cursor.execute(f"PRAGMA table_info({quoted_table})")
    columns = cursor.fetchall()

    # The row count is identical for every column; fetch it once, lazily.
    total_count = None

    for col in columns:
        col_name = col[1]
        col_type = col[2]

        # Focus on text columns that might be categorical
        if not is_categorical_candidate(col_name, col_type):
            continue

        col_path = f"{table}.{col_name}"
        quoted_col = _quote_identifier(col_name)

        try:
            if total_count is None:
                cursor.execute(f"SELECT COUNT(*) FROM {quoted_table}")
                total_count = cursor.fetchone()[0]

            # Get distinct values
            cursor.execute(
                f"SELECT DISTINCT {quoted_col} FROM {quoted_table} "
                f"WHERE {quoted_col} IS NOT NULL ORDER BY {quoted_col} LIMIT 100"
            )
            values = [row[0] for row in cursor.fetchall()]

            # Count NULL values
            cursor.execute(
                f"SELECT COUNT(*) FROM {quoted_table} WHERE {quoted_col} IS NULL"
            )
            null_count = cursor.fetchone()[0]
        except sqlite3.Error as exc:
            # Best effort: skip columns that cannot be queried, but keep a
            # trace of the failure instead of discarding it silently.
            extraction.setdefault('query_errors', []).append({
                'column': col_path,
                'error': str(exc)
            })
            continue

        null_ratio = null_count / total_count if total_count > 0 else 0

        if values:
            # Determine if truly categorical (limited distinct values)
            if len(values) <= 50:  # Likely categorical
                extraction['categorical_values'][col_path] = {
                    'values': values,
                    'count': len(values),
                    'null_count': null_count,
                    'null_percentage': null_ratio * 100,
                    'is_categorical': True
                }

                # Check case sensitivity
                analyze_case_sensitivity(col_path, values, extraction)

            else:  # Too many values, extract patterns instead
                patterns = extract_patterns(values[:50])
                if patterns:
                    extraction['common_patterns'][col_path] = patterns

        # Check for high NULL percentage. Done outside the ``if values``
        # branch so a column holding only NULLs is reported as well.
        if null_ratio > 0.5:
            extraction['null_columns'].append({
                'column': col_path,
                'null_percentage': null_ratio * 100
            })

def is_categorical_candidate(col_name, col_type):
    """Decide whether a column looks like a categorical text column.

    Args:
        col_name: Column name; matched case-insensitively.
        col_type: Declared column type, may be empty or None.

    Returns:
        True when the declared type mentions TEXT, VARCHAR or CHAR and the
        name either contains one of the known keywords or ends in ``_name``;
        False otherwise.
    """
    lowered_name = col_name.lower()
    declared_type = col_type.upper() if col_type else ""

    # Reject anything that is not declared as a text-like type.
    text_markers = ('TEXT', 'VARCHAR', 'CHAR')
    if not any(marker in declared_type for marker in text_markers):
        return False

    # Substrings that hint at a categorical column.
    name_keywords = (
        'type', 'status', 'category', 'state', 'city', 'country',
        'style', 'class', 'group', 'level', 'role', 'position',
        'subscription', 'gender', 'color', 'size', 'notes',
    )
    has_keyword = any(keyword in lowered_name for keyword in name_keywords)

    return has_keyword or lowered_name.endswith('_name')

def analyze_case_sensitivity(col_path, values, extraction):
    """Record how letter case is used in a column's values.

    Updates ``extraction['case_sensitivity'][col_path]`` in place, creating
    the entry on first use. Non-string values are ignored.

    Args:
        col_path: ``table.column`` key of the column.
        values: Distinct values of the column.
        extraction: Result dict that is updated in place.

    The entry receives:
        has_mixed_case: set to True when any string is neither fully upper
            nor fully lower case.
        is_case_sensitive: set to True when two or more strings differ only
            by case.
        examples: one item per case-variant group, followed by up to three
            mixed-case values.
    """
    registry = extraction['case_sensitivity']
    if col_path not in registry:
        registry[col_path] = {
            'is_case_sensitive': False,
            'has_mixed_case': False,
            'examples': []
        }
    entry = registry[col_path]

    text_values = [item for item in values if isinstance(item, str)]

    # Strings that mix upper and lower case letters.
    mixed = [
        text for text in text_values
        if text.upper() != text and text.lower() != text
    ]
    if mixed:
        entry['has_mixed_case'] = True

    # Bucket the strings by their lower-cased form to spot case variants.
    variants_by_lower = {}
    for text in text_values:
        variants_by_lower.setdefault(text.lower(), []).append(text)

    for variants in variants_by_lower.values():
        if len(variants) <= 1:
            continue
        entry['is_case_sensitive'] = True
        entry['examples'].append({
            'variations': variants,
            'note': 'Multiple case variations found'
        })

    # Keep a few mixed-case samples after the variant groups.
    if mixed:
        for sample in mixed[:3]:
            entry['examples'].append({'value': sample, 'note': 'Contains mixed case'})

def extract_patterns(values):
    """Summarize recurring textual patterns in a sample of values.

    Non-string values are ignored.

    Args:
        values: Iterable of raw column values.

    Returns:
        dict with the keys:
            prefixes: the five most common leading three-character slices
                mapped to their counts (strings shorter than 3 are skipped).
            suffixes: same for the trailing three characters.
            formats: labels of the recognized formats, in first-seen order
                (``STATE_CODE``, ``ZIP_CODE``, ``CODE_PATTERN``).
            special_chars: sorted list of characters that are neither ASCII
                letters, digits nor whitespace.
    """
    # fullmatch is used so a trailing newline does not slip through the way
    # it does with a ``$`` anchor.
    format_rules = (
        ('STATE_CODE', re.compile(r'[A-Z]{2}')),         # State codes
        ('ZIP_CODE', re.compile(r'\d{5}')),              # ZIP codes
        ('CODE_PATTERN', re.compile(r'[A-Z]{3}-\d{3}')),  # Pattern like ABC-123
    )
    special_char_re = re.compile(r'[^a-zA-Z0-9\s]')

    prefixes = Counter()
    suffixes = Counter()
    formats = []
    special_chars = set()

    for val in values:
        if not isinstance(val, str):
            continue

        # Common prefixes / suffixes (first and last 3 chars)
        if len(val) >= 3:
            prefixes[val[:3]] += 1
            suffixes[val[-3:]] += 1

        special_chars.update(special_char_re.findall(val))

        # The formats are mutually exclusive, so stop at the first match.
        for label, pattern in format_rules:
            if pattern.fullmatch(val):
                if label not in formats:
                    formats.append(label)
                break

    return {
        # Keep only common patterns
        'prefixes': dict(prefixes.most_common(5)),
        'suffixes': dict(suffixes.most_common(5)),
        'formats': formats,
        # Sorted so the output does not depend on set iteration order.
        'special_chars': sorted(special_chars)
    }

def analyze_value_patterns(extraction):
    """Flag categorical columns that appear to share the same value set.

    Columns are grouped by a signature built from the sorted first ten
    stored values. Each group with more than one column adds an
    ``identical_values`` entry to ``extraction['format_warnings']``. Since
    only the first ten values are compared, columns with longer value lists
    may still differ beyond that prefix.

    Args:
        extraction: Result dict; ``categorical_values`` is read and
            ``format_warnings`` is appended to.
    """
    # Find columns with similar values (potential join candidates)
    value_signatures = defaultdict(list)

    for col_path, col_data in extraction['categorical_values'].items():
        if not col_data['is_categorical']:
            continue

        # Sort by (type name, value): values of different types (e.g. str
        # and bytes in the same column) cannot be compared directly and
        # would raise TypeError with a plain sorted().
        signature = tuple(sorted(
            col_data['values'][:10],  # Use first 10 for signature
            key=lambda value: (type(value).__name__, value)
        ))
        if signature:
            value_signatures[signature].append(col_path)

    # Store columns with identical values
    for columns in value_signatures.values():
        if len(columns) > 1:
            extraction['format_warnings'].append({
                'type': 'identical_values',
                'columns': columns,
                'note': 'These columns have identical categorical values'
            })

def create_value_lookups(extraction):
    """Build quick lookup tables grouped by the kind of value a column holds.

    The result is stored in ``extraction['value_lookups']`` and may contain:

    * ``state_codes``: columns whose first ten non-empty values are all
      two-letter upper-case strings.
    * ``status_types``: columns whose name contains ``status`` or ``type``.
    * ``boolean_like``: columns whose values are all yes/no, y/n, true/false
      or 1/0 (case-insensitive).

    Each group maps ``table.column`` to that column's value list. A column
    can appear in more than one group.

    Args:
        extraction: Result dict; ``categorical_values`` is read and
            ``value_lookups`` is replaced.
    """
    lookups = {}
    boolean_tokens = {'yes', 'no', 'y', 'n', 'true', 'false', '1', '0'}

    # Group by common value types
    for col_path, col_data in extraction['categorical_values'].items():
        if not col_data['is_categorical']:
            continue

        # Use the text after the last dot as the column name, so a dot
        # inside the table name does not break the unpacking.
        column = col_path.rpartition('.')[2].lower()
        values = col_data['values']

        # State codes. Require at least one non-empty sample: all() over an
        # empty sequence is True and would misclassify e.g. [''].
        state_sample = [v for v in values[:10] if v]
        if state_sample and all(
            isinstance(v, str) and re.fullmatch(r'[A-Z]{2}', v)
            for v in state_sample
        ):
            lookups.setdefault('state_codes', {})[col_path] = values

        # Status/Type values
        if 'status' in column or 'type' in column:
            lookups.setdefault('status_types', {})[col_path] = values

        # Yes/No or Boolean-like (an empty value list does not qualify)
        if values and {str(v).lower() for v in values} <= boolean_tokens:
            lookups.setdefault('boolean_like', {})[col_path] = values

    extraction['value_lookups'] = lookups

def detect_format_issues(extraction):
    """Append warnings about values that are easy to mismatch in queries.

    For every categorical column, up to three kinds of entries are added to
    ``extraction['format_warnings']``, in this order:

    * ``whitespace``: string values with leading or trailing whitespace
      (at most three examples).
    * ``case_variation``: for each string value, the first later value that
      differs from it only by letter case.
    * ``null_strings``: strings such as 'null', 'none', 'n/a', 'na' or ''.

    Args:
        extraction: Result dict; ``categorical_values`` is read and
            ``format_warnings`` is appended to.
    """
    null_tokens = ['null', 'none', 'n/a', 'na', '']

    for col_path, col_data in extraction['categorical_values'].items():
        if not col_data['is_categorical']:
            continue

        values = col_data['values']

        # Leading/trailing whitespace
        padded = [
            item for item in values
            if isinstance(item, str) and item.strip() != item
        ]
        if padded:
            extraction['format_warnings'].append({
                'type': 'whitespace',
                'column': col_path,
                'examples': padded[:3],
                'note': 'Values contain leading/trailing whitespace'
            })

        # Values that differ only by case: pair each string with the first
        # later string that matches case-insensitively.
        for position, current in enumerate(values):
            if not isinstance(current, str):
                continue
            folded = current.lower()
            twin = next(
                (
                    later for later in values[position + 1:]
                    if isinstance(later, str)
                    and later.lower() == folded
                    and later != current
                ),
                None,
            )
            if twin is None:
                continue
            extraction['format_warnings'].append({
                'type': 'case_variation',
                'column': col_path,
                'values': [current, twin],
                'note': 'Same value with different case'
            })

        # Strings that look like a missing value
        null_like = [
            item for item in values
            if isinstance(item, str) and item.lower() in null_tokens
        ]
        if null_like:
            extraction['format_warnings'].append({
                'type': 'null_strings',
                'column': col_path,
                'examples': null_like,
                'note': 'String values that might represent NULL'
            })

def _json_fallback(value):
    """Convert values json cannot encode natively (bytes) into text."""
    if isinstance(value, bytes):
        # sqlite3 returns BLOB content as bytes; keep it readable in the report.
        return value.decode('utf-8', errors='replace')
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")

def main(db_path="./database.sqlite", output_dir="tool_output"):
    """Run the extraction and write the report as JSON.

    Args:
        db_path: SQLite database to profile. Defaults to ``./database.sqlite``.
        output_dir: Directory that receives ``value_extraction.json``; it is
            created when missing. Defaults to ``tool_output``.

    Returns:
        None. When the database file does not exist an error is printed and
        nothing is written.
    """
    if not os.path.exists(db_path):
        print(f"Error: Database not found at {db_path}")
        return

    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    # Extract values
    extraction = extract_values(db_path)

    # Serialize completely before opening the file, so a serialization
    # error cannot leave a truncated report behind.
    output_path = f"{output_dir}/value_extraction.json"
    payload = json.dumps(extraction, indent=2, default=_json_fallback)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(payload)

    print("✅ Value extraction complete")
    print(f"🏷️ Categorical columns: {len(extraction['categorical_values'])}")
    print(f"🔍 Pattern detections: {len(extraction['common_patterns'])}")
    print(f"⚠️ Format warnings: {len(extraction['format_warnings'])}")
    print(f"💾 Results saved to: {output_path}")

    # Print key insights
    if extraction['value_lookups']:
        print("\n🎯 Value lookup categories:")
        for category, columns in extraction['value_lookups'].items():
            print(f"  • {category}: {len(columns)} columns")

    if extraction['format_warnings']:
        print("\n⚠️ Top format warnings:")
        for warning in extraction['format_warnings'][:3]:
            print(f"  • {warning['type']}: {warning['note']}")

# Run the extraction only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()