#!/usr/bin/env python3
"""
Value Matcher Tool - Inspired by CHESS Information Retriever
Performs fuzzy matching and semantic similarity for database values
"""

import sqlite3
import json
import re
from difflib import SequenceMatcher
from collections import defaultdict
import os

def fuzzy_match(s1, s2, threshold=0.8):
    """Return a case-insensitive similarity score in [0.0, 1.0] for two values.

    Scoring rules, applied in order:

    * either value is ``None``                     -> 0.0
    * equal after ``str()`` + lowercasing          -> 1.0
    * either string is empty (and they differ)     -> 0.0
    * one string contains the other                -> 0.9
    * otherwise                                    -> ``SequenceMatcher`` ratio

    Args:
        s1: First value; converted with ``str()`` before comparison.
        s2: Second value; converted with ``str()`` before comparison.
        threshold: Accepted for backward compatibility. It is not used by
            the scoring; callers apply their own cut-off to the result.

    Returns:
        float: The similarity score.
    """
    # Handle None values
    if s1 is None or s2 is None:
        return 0.0

    # Convert to strings and lowercase for comparison
    s1, s2 = str(s1).lower(), str(s2).lower()

    # Exact match (this also covers two empty strings)
    if s1 == s2:
        return 1.0

    # The empty string is a substring of every string, so without this guard
    # the containment check below would score "" vs. anything as 0.9.
    if not s1 or not s2:
        return 0.0

    # Check if one contains the other
    if s1 in s2 or s2 in s1:
        return 0.9

    # Use sequence matcher for fuzzy matching
    return SequenceMatcher(None, s1, s2).ratio()

def analyze_value_patterns(cursor, table, column, limit=100):
    """Summarise formatting patterns among a column's distinct non-NULL values.

    Args:
        cursor: An open ``sqlite3`` cursor.
        table: Name of the table to sample.
        column: Name of the column to sample.
        limit: Maximum number of distinct values to inspect.

    Returns:
        dict: Boolean flags ``has_spaces``, ``has_special_chars``,
        ``mixed_case``, ``numeric_strings`` and ``date_patterns`` (each true
        if at least one sampled value matches), plus ``sample_values`` with
        up to the first five sampled values.
    """
    # Double any embedded quote so unusual identifiers (e.g. we"ird) still
    # form a valid quoted name instead of breaking the statement.
    quoted_table = '"' + str(table).replace('"', '""') + '"'
    quoted_column = '"' + str(column).replace('"', '""') + '"'

    # Identifiers cannot be bound as parameters, but the LIMIT value can.
    query = (
        f'SELECT DISTINCT {quoted_column} FROM {quoted_table} '
        f'WHERE {quoted_column} IS NOT NULL LIMIT ?'
    )
    cursor.execute(query, (int(limit),))
    values = [row[0] for row in cursor.fetchall()]

    patterns = {
        'has_spaces': any(' ' in str(v) for v in values),
        # Anything other than ASCII letters, digits and whitespace.
        'has_special_chars': any(re.search(r'[^a-zA-Z0-9\s]', str(v)) for v in values),
        # Only real strings can be mixed case; other types never count.
        'mixed_case': any(
            isinstance(v, str) and v != v.upper() and v != v.lower()
            for v in values
        ),
        # Text values made up entirely of digits.
        'numeric_strings': any(v.isdigit() for v in values if isinstance(v, str)),
        # YYYY-MM-DD or YYYY/MM/DD style fragments anywhere in the value.
        'date_patterns': any(re.search(r'\d{4}[-/]\d{1,2}[-/]\d{1,2}', str(v)) for v in values),
        'sample_values': values[:5]
    }

    return patterns

def find_value_variations(cursor, tables_columns):
    """Group ``table.column`` names under the concepts their names hint at.

    A column belongs to a concept when its lowercased name contains any of
    that concept's keywords as a substring. A column may appear under more
    than one concept. Concepts with no matching column are omitted.

    Args:
        cursor: Not used by this function.
        tables_columns: Mapping of table name to a list of column names.

    Returns:
        dict: Concept name mapped to a list of ``"table.column"`` strings.
    """
    # Keyword table: concept -> substrings that suggest it.
    keyword_groups = (
        ('location', ('city', 'state', 'country', 'location', 'address', 'region')),
        ('time', ('date', 'time', 'datetime', 'timestamp', 'created', 'updated')),
        ('identifier', ('id', 'code', 'number', 'key', 'uid', 'uuid')),
        ('name', ('name', 'title', 'description', 'label')),
        ('amount', ('amount', 'total', 'sum', 'cost', 'price', 'fee')),
        ('status', ('status', 'state', 'flag', 'active', 'enabled', 'cancelled')),
    )

    grouped = {}
    for concept, keywords in keyword_groups:
        hits = [
            f"{table}.{column}"
            for table, columns in tables_columns.items()
            for column in columns
            if any(keyword in column.lower() for keyword in keywords)
        ]
        # Only record concepts that matched at least one column.
        if hits:
            grouped[concept] = hits

    return grouped

def _quote_identifier(name):
    """Return *name* as a double-quoted SQLite identifier, doubling embedded quotes."""
    return '"' + str(name).replace('"', '""') + '"'


def main():
    """Analyse ./database.sqlite and write a value-matching report as JSON.

    Steps:
      1. Read every table and its columns from the database.
      2. Record value patterns for the first 10 columns of the first 5 tables.
      3. Group columns into concepts by name.
      4. Sample up to 20 distinct values from the first 5 columns of the
         first 3 tables and index pairs that are similar but not identical.
      5. Write everything to ./tool_output/value_matching_analysis.json and
         print a short summary.

    The database connection is closed even if a step fails.
    """
    # Connect to database
    conn = sqlite3.connect('./database.sqlite')
    try:
        cursor = conn.cursor()

        # Create output directory
        os.makedirs('./tool_output', exist_ok=True)

        # Get all tables and columns
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
        tables = [row[0] for row in cursor.fetchall()]

        tables_columns = {}
        for table in tables:
            cursor.execute(f'PRAGMA table_info({_quote_identifier(table)})')
            # Field 1 of each table_info row is the column name.
            tables_columns[table] = [row[1] for row in cursor.fetchall()]

        results = {
            'value_patterns': {},
            'concept_variations': {},
            'fuzzy_match_index': {}
        }

        # Analyze value patterns for each table
        print("Analyzing value patterns...")
        for table in tables[:5]:  # Limit to first 5 tables for performance
            results['value_patterns'][table] = {}
            for column in tables_columns[table][:10]:  # Limit columns
                try:
                    patterns = analyze_value_patterns(cursor, table, column)
                    results['value_patterns'][table][column] = patterns
                except Exception as e:
                    # Best effort: record the failure in the report and move on.
                    results['value_patterns'][table][column] = {'error': str(e)}

        # Find concept variations
        print("Finding concept variations...")
        results['concept_variations'] = find_value_variations(cursor, tables_columns)

        # Create fuzzy match index for common values
        print("Creating fuzzy match index...")
        common_values = []
        for table in tables[:3]:  # Sample from first 3 tables
            for column in tables_columns[table][:5]:
                try:
                    cursor.execute(
                        f'SELECT DISTINCT {_quote_identifier(column)} '
                        f'FROM {_quote_identifier(table)} LIMIT 20'
                    )
                except sqlite3.Error as exc:
                    # Sampling is best effort, but say what was skipped
                    # instead of silently swallowing every exception.
                    print(f"  Skipping {table}.{column}: {exc}")
                    continue
                common_values.extend(
                    str(row[0]) for row in cursor.fetchall() if row[0] is not None
                )

        # The same text can come from several columns. Keep the first
        # occurrence only, otherwise a repeated value would add the same
        # matches to the index more than once.
        unique_values = list(dict.fromkeys(common_values))

        # Find similar values
        fuzzy_matches = defaultdict(list)
        for i, v1 in enumerate(unique_values):
            for v2 in unique_values[i + 1:]:
                score = fuzzy_match(v1, v2)
                if 0.7 < score < 1.0:  # Similar but not identical
                    fuzzy_matches[v1].append({'value': v2, 'score': score})

        results['fuzzy_match_index'] = dict(fuzzy_matches)

        # Save results
        with open('./tool_output/value_matching_analysis.json', 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2, default=str)

        # Print summary
        print("\n=== VALUE MATCHING ANALYSIS COMPLETE ===")
        print(f"Analyzed {len(tables)} tables")
        print(f"Found {len(results['concept_variations'])} concept groups")
        print(f"Created fuzzy match index with {len(results['fuzzy_match_index'])} entries")
        print("\nKey findings:")
        for concept, locations in results['concept_variations'].items():
            if locations:
                print(f"  {concept}: found in {len(locations)} locations")

        print("\nResults saved to ./tool_output/value_matching_analysis.json")
    finally:
        # Release the connection on success and on failure.
        conn.close()

# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()