#!/usr/bin/env python3
"""
Ambiguity Detector
Identifies potentially ambiguous patterns in evidence and column names.
Suggests clarifications and warns about common misinterpretations.
"""

import sqlite3
import os
import json
import difflib

def ensure_output_dir():
    """Create the ``tool_output`` directory in the current working directory.

    Safe to call repeatedly: an already existing directory is left untouched.
    """
    output_dir = 'tool_output'
    os.makedirs(output_dir, exist_ok=True)

def detect_ambiguities(db_path):
    """Detect ambiguous patterns in a SQLite schema and write a report.

    Reads the table and column names of the database at ``db_path``, runs
    every ambiguity check in this module against them, and writes the
    results through ``create_ambiguity_report``.

    Args:
        db_path: Path of the SQLite database file to inspect.

    Returns:
        dict: The analysis, keyed by ``similar_columns``,
        ``ambiguous_patterns``, ``implicit_filters``, ``negation_warnings``,
        ``column_name_mismatches`` and ``context_patterns``.

    Raises:
        sqlite3.Error: If the database cannot be opened or queried.
    """
    analysis = {
        'similar_columns': [],
        'ambiguous_patterns': [],
        'implicit_filters': [],
        'negation_warnings': [],
        'column_name_mismatches': [],
        'context_patterns': []
    }

    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()

        # Get all tables and columns
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
        tables = [row[0] for row in cursor.fetchall()]

        all_columns = {}
        for table in tables:
            # PRAGMA arguments cannot be bound as parameters, so the name is
            # quoted inline; doubling embedded backticks keeps names that
            # contain one from terminating the quoted identifier early.
            quoted = table.replace('`', '``')
            cursor.execute(f"PRAGMA table_info(`{quoted}`)")
            # Index 1 of every table_info row is the column name.
            all_columns[table] = [col[1] for col in cursor.fetchall()]

        # Check for similar column names
        check_similar_columns(all_columns, analysis)

        # Check for columns requiring implicit filters
        check_implicit_filters(all_columns, cursor, analysis)

        # Generate ambiguous pattern warnings
        generate_ambiguous_patterns(analysis)

        # Generate negation warnings
        generate_negation_warnings(analysis)

        # Check for common column name mismatches
        check_column_mismatches(all_columns, analysis)

        # Generate context patterns
        generate_context_patterns(all_columns, analysis)
    finally:
        # Release the database handle even when a query or check fails.
        conn.close()

    # Create report
    create_ambiguity_report(analysis)

    return analysis

def check_similar_columns(all_columns, analysis):
    """Record pairs of column names that look alike but are not identical.

    Every column is compared case-insensitively with every later column
    (across all tables). Pairs whose ``difflib`` similarity ratio lies
    strictly between 0.7 and 1.0 are appended to
    ``analysis['similar_columns']``; the list is then replaced by its ten
    most similar entries, highest ratio first.

    Args:
        all_columns: Mapping of table name to a list of column names.
        analysis: Result dict; its ``'similar_columns'`` list is extended
            and then rebound to the truncated, sorted list.
    """
    # One flat list of (table, column, lower-cased column) triples.
    entries = [
        (tbl, name, name.lower())
        for tbl, names in all_columns.items()
        for name in names
    ]

    for next_pos, (left_tbl, left_col, left_key) in enumerate(entries, start=1):
        for right_tbl, right_col, right_key in entries[next_pos:]:
            # Identical names (ignoring case) are not "confusable".
            if left_key == right_key:
                continue

            score = difflib.SequenceMatcher(None, left_key, right_key).ratio()
            if not (0.7 < score < 1.0):
                continue

            analysis['similar_columns'].append({
                'column1': f"{left_tbl}.{left_col}",
                'column2': f"{right_tbl}.{right_col}",
                'similarity': round(score, 2),
                'warning': f"Easy to confuse {left_col} with {right_col}"
            })

    # Keep only the ten closest matches, best first.
    ranked = sorted(
        analysis['similar_columns'],
        key=lambda entry: entry['similarity'],
        reverse=True,
    )
    analysis['similar_columns'] = ranked[:10]

def check_implicit_filters(all_columns, cursor, analysis):
    """Flag columns whose names hint at a commonly required filter.

    A column matches a rule when the rule's keyword occurs anywhere in the
    lower-cased column name. For each match an entry is appended to
    ``analysis['implicit_filters']`` with a suggested filter in which the
    keyword is replaced by the real column name.

    Args:
        all_columns: Mapping of table name to a list of column names.
        cursor: Database cursor; accepted for interface compatibility but
            not used by this function.
        analysis: Result dict whose ``'implicit_filters'`` list is extended.
    """
    # (keyword, filter template, context) for each known rule.
    rules = (
        ('result', "result = 'Winner'", 'award queries'),
        ('status', "status = 'Active'", 'current records'),
        ('deleted', "deleted = 0", 'active records'),
        ('archived', "archived = 0", 'current data'),
        ('winner', "winner = 1", 'winning records'),
        ('rank', "rank = 1", 'top records'),
    )

    hits = analysis['implicit_filters']
    for table, columns in all_columns.items():
        for col in columns:
            lowered = col.lower()
            hits.extend(
                {
                    'table': table,
                    'column': col,
                    'suggested_filter': template.replace(keyword, col),
                    'context': context,
                    'warning': f"May need implicit filter for {context}"
                }
                for keyword, template, context in rules
                if keyword in lowered
            )

def generate_ambiguous_patterns(analysis):
    """Store the fixed catalogue of ambiguous phrasings in ``analysis``.

    The catalogue does not depend on the database; every entry has the keys
    ``pattern``, ``example``, ``ambiguity`` and ``suggestion``.

    Args:
        analysis: Result dict; ``analysis['ambiguous_patterns']`` is
            replaced by the catalogue.
    """
    fields = ('pattern', 'example', 'ambiguity', 'suggestion')
    # One row per phrasing, in the same order as ``fields``.
    rows = (
        (
            'non-X',
            'non player/builder',
            'Could mean NOT IN (player, builder) or IN (player, builder)',
            'Check ground truth logic - negation may be inverted',
        ),
        (
            'between X and Y',
            'between 2011 and 2015',
            'Could be inclusive (2011-2015) or exclusive (2012-2014)',
            'SQLite BETWEEN is inclusive of both endpoints',
        ),
        (
            'years of X',
            'years of playing',
            'Could be COUNT(year) or MAX(year)-MIN(year)+1',
            "Check evidence formula - don't assume",
        ),
        (
            'average X time',
            'average server time',
            'Time might be stored as string needing parsing',
            'Check if time needs SUBSTR extraction',
        ),
        (
            'full X',
            'full name, full address',
            'Could mean concatenated or separate columns',
            'Check evidence - separate unless || shown',
        ),
        (
            'most recent',
            'most recent movie',
            'Single result or all from most recent date?',
            'Usually ORDER BY date DESC LIMIT 1',
        ),
    )

    analysis['ambiguous_patterns'] = [dict(zip(fields, row)) for row in rows]

def generate_negation_warnings(analysis):
    """Store the fixed list of negation-related warnings in ``analysis``.

    The list does not depend on the database; every entry has the keys
    ``pattern``, ``warning`` and ``example``.

    Args:
        analysis: Result dict; ``analysis['negation_warnings']`` is replaced
            by the list.
    """
    fields = ('pattern', 'warning', 'example')
    # One row per warning, in the same order as ``fields``.
    rows = (
        (
            'NOT IN with categories',
            'Check if logic is inverted - ground truth may use IN instead',
            "NOT IN ('Player', 'Builder') might actually need IN",
        ),
        (
            'Negative conditions',
            'Verify the negative logic matches evidence exactly',
            '!= vs = confusion',
        ),
        (
            'Exclusion patterns',
            "Double-check what's being excluded",
            'All except X vs Only X',
        ),
    )

    analysis['negation_warnings'] = [dict(zip(fields, row)) for row in rows]

def check_column_mismatches(all_columns, analysis):
    """Warn about pairs of commonly confused terms present in the schema.

    For each known term pair, columns whose name contains the term
    (case-insensitively) are collected. A warning is appended to
    ``analysis['column_name_mismatches']`` only when both terms are matched
    and the matches cover at least two distinct column names. Without that
    second condition a single column such as ``FirstName`` would satisfy
    both "Name" and "FirstName" and produce a warning about confusing a
    column with itself.

    Args:
        all_columns: Mapping of table name to a list of column names.
        analysis: Result dict whose ``'column_name_mismatches'`` list is
            extended. Each entry holds the ``pair`` label, up to two example
            columns per term under ``found``, and a ``warning`` text.
    """
    common_mismatches = [
        ('Issue', 'Sub-issue'),
        ('Region', 'Division'),
        ('Name', 'FirstName'),
        ('Name', 'LastName'),
        ('Date', 'DateTime'),
        ('Type', 'Category'),
        ('Status', 'Result'),
        ('Code', 'Number')
    ]

    # Qualify and lower-case every column once instead of once per pair.
    columns = [
        (f"{table}.{col}", col.lower())
        for table, cols in all_columns.items()
        for col in cols
    ]

    for mismatch1, mismatch2 in common_mismatches:
        term1 = mismatch1.lower()
        term2 = mismatch2.lower()

        found1 = [qualified for qualified, lowered in columns if term1 in lowered]
        found2 = [qualified for qualified, lowered in columns if term2 in lowered]
        if not (found1 and found2):
            continue

        # Two columns can only be confused if at least two different names
        # are involved; one name matching both terms is not a confusion.
        distinct_names = {
            lowered for _, lowered in columns
            if term1 in lowered or term2 in lowered
        }
        if len(distinct_names) < 2:
            continue

        analysis['column_name_mismatches'].append({
            'pair': f"{mismatch1} vs {mismatch2}",
            'found': {'type1': found1[:2], 'type2': found2[:2]},
            'warning': f"Don't confuse {mismatch1} with {mismatch2} - check evidence"
        })

def generate_context_patterns(all_columns, analysis):
    """Record context-dependent guidance for the tables it applies to.

    A table matches a pattern when any of its column names contains
    (case-insensitively) one of the pattern's ``check_for`` keywords.
    Patterns with at least one matching table are appended to
    ``analysis['context_patterns']`` with a ``tables`` key listing up to five
    matching tables in the iteration order of ``all_columns``.

    Args:
        all_columns: Mapping of table name to a list of column names.
        analysis: Result dict whose ``'context_patterns'`` list is extended.
    """
    patterns = [
        {
            'context': 'Award queries',
            'pattern': 'Tables with award/prize/winner columns',
            'guidance': 'Often need result=\'Winner\' or similar filter',
            'check_for': ['award', 'prize', 'winner', 'result']
        },
        {
            'context': 'Time queries',
            'pattern': 'Tables with time/duration columns',
            'guidance': 'May need SUBSTR parsing for HH:MM:SS format',
            'check_for': ['time', 'duration', 'hours', 'minutes']
        },
        {
            'context': 'Name queries',
            'pattern': 'Tables with first/middle/last columns',
            'guidance': 'Return separately unless concatenation shown',
            'check_for': ['first', 'middle', 'last', 'surname']
        },
        {
            'context': 'Geographic queries',
            'pattern': 'Tables with region/division/area columns',
            'guidance': 'Check exact column name - similar terms differ',
            'check_for': ['region', 'division', 'area', 'zone', 'district']
        }
    ]

    for pattern in patterns:
        keywords = pattern['check_for']
        # Each table is a unique dict key and contributes at most once, so
        # the list is already duplicate-free. Routing it through set() would
        # only make the order, and the choice of five tables, vary per run.
        tables_found = [
            table
            for table, cols in all_columns.items()
            if any(keyword in col.lower() for col in cols for keyword in keywords)
        ]

        if tables_found:
            pattern['tables'] = tables_found[:5]
            analysis['context_patterns'].append(pattern)

def create_ambiguity_report(analysis):
    """Write the analysis as a text report and as JSON under ``tool_output``.

    Produces ``tool_output/ambiguity_report.txt`` (Markdown-style headings;
    sections with no findings are omitted) and
    ``tool_output/ambiguities.json`` (the full ``analysis`` dict), then
    prints a completion message.

    Only the first five similar-column pairs and the first five implicit
    filters appear in the text report; the JSON file holds everything.

    Args:
        analysis: Result dict with the keys ``similar_columns``,
            ``ambiguous_patterns``, ``implicit_filters``,
            ``negation_warnings``, ``column_name_mismatches`` and
            ``context_patterns``.

    Raises:
        KeyError: If ``analysis`` lacks one of the expected keys.
        OSError: If the output directory or files cannot be written.
    """
    report = []
    report.append("# Ambiguity Detection Report")
    report.append("")

    # Similar columns
    if analysis['similar_columns']:
        report.append("## 🔄 Similar Columns (Easy to Confuse)")
        report.append("")
        for sim in analysis['similar_columns'][:5]:
            report.append(f"### {sim['column1']} ≈ {sim['column2']}")
            report.append(f"- Similarity: {sim['similarity']}")
            report.append(f"- ⚠️ {sim['warning']}")
            report.append("")

    # Ambiguous patterns
    if analysis['ambiguous_patterns']:
        report.append("## 🔍 Ambiguous Patterns")
        report.append("")
        for pattern in analysis['ambiguous_patterns']:
            report.append(f"### Pattern: {pattern['pattern']}")
            report.append(f"- Example: {pattern['example']}")
            report.append(f"- Ambiguity: {pattern['ambiguity']}")
            report.append(f"- 💡 {pattern['suggestion']}")
            report.append("")

    # Implicit filters
    if analysis['implicit_filters']:
        report.append("## 🎯 Implicit Filters Often Needed")
        report.append("")
        for filter_info in analysis['implicit_filters'][:5]:
            report.append(f"### {filter_info['table']}.{filter_info['column']}")
            report.append(f"- Context: {filter_info['context']}")
            report.append(f"- Suggested: `{filter_info['suggested_filter']}`")
            report.append(f"- {filter_info['warning']}")
            report.append("")

    # Negation warnings
    if analysis['negation_warnings']:
        report.append("## ⚠️ Negation Warnings")
        report.append("")
        for warning in analysis['negation_warnings']:
            report.append(f"### {warning['pattern']}")
            report.append(f"- {warning['warning']}")
            report.append(f"- Example: {warning['example']}")
            report.append("")

    # Column mismatches
    if analysis['column_name_mismatches']:
        report.append("## 🆚 Common Column Confusions")
        report.append("")
        for mismatch in analysis['column_name_mismatches']:
            report.append(f"### {mismatch['pair']}")
            report.append(f"- {mismatch['warning']}")
            report.append("")

    # Context patterns
    if analysis['context_patterns']:
        report.append("## 🎯 Context-Specific Patterns")
        report.append("")
        for context in analysis['context_patterns']:
            report.append(f"### {context['context']}")
            report.append(f"- Pattern: {context['pattern']}")
            report.append(f"- Guidance: {context['guidance']}")
            if 'tables' in context:
                report.append(f"- Found in: {', '.join(context['tables'])}")
            report.append("")

    # Save report
    ensure_output_dir()
    # The report contains emoji and other non-ASCII characters. Relying on
    # the platform default encoding raises UnicodeEncodeError where that
    # default is not UTF-8 (for example cp1252 on Windows).
    with open('tool_output/ambiguity_report.txt', 'w', encoding='utf-8') as f:
        f.write('\n'.join(report))

    # Save JSON
    with open('tool_output/ambiguities.json', 'w', encoding='utf-8') as f:
        json.dump(analysis, f, indent=2, default=str)

    print("Ambiguity detection complete - results in tool_output/")

if __name__ == "__main__":
    # An optional first command-line argument selects the database to
    # inspect; without it the script analyses "database.sqlite" in the
    # current working directory, as before.
    import sys

    detect_ambiguities(sys.argv[1] if len(sys.argv) > 1 else "database.sqlite")