#!/usr/bin/env python3
"""
Column Disambiguator - Maps column ownership and identifies ambiguous columns
"""
import sqlite3
import json
import os
from collections import defaultdict

def _quote_identifier(name):
    """Return ``name`` as a double-quoted SQL identifier.

    PRAGMA statements do not accept bound parameters, so table names have to
    be interpolated; doubling embedded quotes keeps unusual names from
    breaking the statement.
    """
    return '"' + name.replace('"', '""') + '"'


def _implicit_fk_target(parent_rows, seq):
    """Resolve the column targeted by a foreign key declared without one.

    Args:
        parent_rows: ``PRAGMA table_info`` rows of the referenced table.
        seq: position of the column inside the foreign key (``seq`` field of
            ``PRAGMA foreign_key_list``).

    Returns:
        The name of the parent's primary-key column at position ``seq``, or
        None when the parent has no such primary-key column.
    """
    # row[5] is the 1-based position inside the primary key (0 = not a PK).
    pk_columns = [
        name for _, name in sorted((row[5], row[1]) for row in parent_rows if row[5])
    ]
    return pk_columns[seq] if seq < len(pk_columns) else None


def _print_summary(disambiguation, output_path):
    """Print a short human-readable report of the analysis to stdout."""
    print("=" * 60)
    print("COLUMN DISAMBIGUATION COMPLETE")
    print("=" * 60)
    print(f"Total columns analyzed: {len(disambiguation['column_ownership'])}")
    print(f"Ambiguous columns found: {len(disambiguation['ambiguous_columns'])}")
    print(f"Unique columns: {len(disambiguation['unique_columns'])}")
    print(f"Composite keys: {len(disambiguation['composite_keys'])}")

    if disambiguation["ambiguous_columns"]:
        print("\nAMBIGUOUS COLUMNS (require table qualification):")
        # Only the first ten are listed to keep the report short.
        for col_name, info in list(disambiguation["ambiguous_columns"].items())[:10]:
            print(f"  - '{col_name}' appears in: {', '.join(info['appears_in'])}")

    if disambiguation["composite_keys"]:
        print("\nCOMPOSITE PRIMARY KEYS:")
        for ck in disambiguation["composite_keys"]:
            print(f"  - {ck['table']}: ({', '.join(ck['columns'])})")

    if disambiguation["calculated_vs_stored"]:
        print(f"\nPOTENTIALLY CALCULATED COLUMNS: {len(disambiguation['calculated_vs_stored'])}")
        for col, info in list(disambiguation["calculated_vs_stored"].items())[:5]:
            print(f"  - {col}: {info['pattern']} pattern detected")

    print(f"\nResults saved to {output_path}")


def disambiguate_columns(db_path, output_dir="tool_output"):
    """Identify column ownership and potential ambiguities in a SQLite schema.

    Reads every table listed in ``sqlite_master``, classifies each column
    (primary key / foreign key / attribute), groups columns that share a
    name (case-insensitively) across tables, applies name-based heuristics
    for "calculated" columns and column purposes, and derives textual
    disambiguation rules. The result is written as JSON to
    ``<output_dir>/column_disambiguation.json`` and summarised on stdout.

    Args:
        db_path: path of the SQLite database file to inspect.
        output_dir: directory that receives ``column_disambiguation.json``;
            created when missing.

    Returns:
        The dictionary that was written to disk, with the keys
        ``column_ownership``, ``ambiguous_columns``, ``unique_columns``,
        ``composite_keys``, ``calculated_vs_stored``, ``column_purposes`` and
        ``disambiguation_rules``.
    """
    # Read the whole schema up front so the connection can be released
    # before any analysis or file output happens.
    table_columns = {}       # table -> PRAGMA table_info rows
    table_foreign_keys = {}  # table -> PRAGMA foreign_key_list rows
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
        tables = [row[0] for row in cursor.fetchall()]
        for table_name in tables:
            quoted = _quote_identifier(table_name)
            cursor.execute(f"PRAGMA table_info({quoted})")
            table_columns[table_name] = cursor.fetchall()
            cursor.execute(f"PRAGMA foreign_key_list({quoted})")
            table_foreign_keys[table_name] = cursor.fetchall()
    finally:
        conn.close()

    # SQLite identifiers are case-insensitive, so parent tables are looked up
    # by lower-cased name when resolving implicit foreign-key targets.
    columns_by_lower_table = {name.lower(): rows for name, rows in table_columns.items()}

    disambiguation = {
        "column_ownership": {},
        "ambiguous_columns": {},
        "unique_columns": {},
        "composite_keys": [],
        "calculated_vs_stored": {},
        "column_purposes": {}
    }

    # lower-cased column name -> one entry per table that defines it
    all_columns = defaultdict(list)
    # (table, column) -> (referenced table, referenced column). Tuples are
    # used instead of "table.column" strings so names containing dots survive.
    foreign_key_targets = {}

    for table_name in tables:
        # Map each referencing column (lower-cased) to its target.
        fk_map = {}
        for fk in table_foreign_keys[table_name]:
            seq, ref_table, from_col, ref_col = fk[1], fk[2], fk[3], fk[4]
            if ref_col is None:
                # "REFERENCES parent" without a column list: the target is the
                # parent's primary key, which the PRAGMA reports as NULL.
                parent_rows = columns_by_lower_table.get(ref_table.lower(), [])
                ref_col = _implicit_fk_target(parent_rows, seq)
            fk_map[from_col.lower()] = {
                "references_table": ref_table,
                "references_column": ref_col
            }
            foreign_key_targets[(table_name, from_col)] = (ref_table, ref_col)

        primary_keys = []
        for col_info in table_columns[table_name]:
            col_name = col_info[1]
            col_type = col_info[2]
            is_pk = bool(col_info[5])
            references = fk_map.get(col_name.lower())
            is_fk = references is not None

            if is_pk:
                primary_keys.append(col_name)

            all_columns[col_name.lower()].append({
                "table": table_name,
                "original_name": col_name,
                "type": col_type,
                "is_primary_key": is_pk,
                "is_foreign_key": is_fk,
                "references": references
            })

            # A column that is both PK and FK is reported as a primary key.
            disambiguation["column_ownership"][f"{table_name}.{col_name}"] = {
                "table": table_name,
                "column": col_name,
                "type": col_type,
                "role": "primary_key" if is_pk else "foreign_key" if is_fk else "attribute",
                "references": references
            }

        if len(primary_keys) > 1:
            disambiguation["composite_keys"].append({
                "table": table_name,
                "columns": primary_keys
            })

    # A name is ambiguous when more than one table defines it.
    for col_name, occurrences in all_columns.items():
        if len(occurrences) > 1:
            disambiguation["ambiguous_columns"][col_name] = {
                "appears_in": [occ["table"] for occ in occurrences],
                "details": occurrences
            }
        else:
            disambiguation["unique_columns"][col_name] = {
                "table": occurrences[0]["table"],
                "original_name": occurrences[0]["original_name"]
            }

    # Heuristic: a column whose name contains the pattern word is flagged as
    # potentially calculated when any component word occurs in the table's
    # column names. Only the first matching column per pattern is recorded.
    calculated_patterns = [
        ("total", ["quantity", "price", "amount"]),
        ("count", ["id", "number"]),
        ("average", ["value", "score", "rating"]),
        ("percentage", ["total", "count"]),
        ("difference", ["start", "end", "from", "to"]),
        ("duration", ["start_date", "end_date", "start_time", "end_time"])
    ]

    for table_name in tables:
        col_names = [col[1].lower() for col in table_columns[table_name]]
        joined_names = ' '.join(col_names)

        for pattern_name, required_cols in calculated_patterns:
            matching_col = next((col for col in col_names if pattern_name in col), None)
            if matching_col is None:
                continue
            if any(req in joined_names for req in required_cols):
                # NOTE: the key uses the lower-cased column name, unlike the
                # keys of "column_ownership", which keep the original case.
                disambiguation["calculated_vs_stored"][f"{table_name}.{matching_col}"] = {
                    "type": "potentially_calculated",
                    "pattern": pattern_name,
                    "has_component_columns": True
                }

    # Substring-based purpose detection; a column may get several purposes.
    purpose_patterns = {
        "identifier": ["id", "code", "number", "key"],
        "name": ["name", "title", "label"],
        "description": ["description", "desc", "comment", "note", "text"],
        "temporal": ["date", "time", "created", "updated", "modified", "timestamp"],
        "status": ["status", "state", "active", "enabled", "flag"],
        "quantity": ["count", "quantity", "amount", "number", "total"],
        "financial": ["price", "cost", "fee", "amount", "payment", "balance"],
        "location": ["address", "city", "state", "country", "zip", "postal", "latitude", "longitude"],
        "contact": ["email", "phone", "fax", "mobile", "contact"],
        "classification": ["type", "category", "class", "group", "kind"],
        "measurement": ["size", "weight", "height", "width", "length", "volume"],
        "percentage": ["percent", "rate", "ratio"],
        "boolean": ["is_", "has_", "can_", "should_", "active", "enabled", "flag"]
    }

    for ownership_key, ownership_data in disambiguation["column_ownership"].items():
        col_name_lower = ownership_data["column"].lower()
        detected_purposes = [
            purpose
            for purpose, patterns in purpose_patterns.items()
            if any(pattern in col_name_lower for pattern in patterns)
        ]
        if detected_purposes:
            disambiguation["column_purposes"][ownership_key] = detected_purposes

    rules = []

    # Rule 1: every ambiguous column needs a table alias.
    for col_name, info in disambiguation["ambiguous_columns"].items():
        tables_list = info["appears_in"]
        rules.append({
            "column": col_name,
            "rule": f"Always specify table alias when using '{col_name}' - appears in: {', '.join(tables_list)}",
            "sql_example": f"SELECT t1.{col_name} FROM {tables_list[0]} t1"
        })

    # Rule 2: every foreign key gets a join hint.
    for (table, col), (ref_table, ref_col) in foreign_key_targets.items():
        if ref_col is None:
            # The target column could not be resolved, so no meaningful
            # JOIN condition can be stated.
            continue
        rules.append({
            "column": col,
            "rule": f"{table}.{col} references {ref_table}.{ref_col}",
            "sql_example": f"JOIN {ref_table} ON {table}.{col} = {ref_table}.{ref_col}"
        })

    disambiguation["disambiguation_rules"] = rules

    os.makedirs(output_dir, exist_ok=True)
    # A literal "/" keeps the default message identical on every platform.
    output_path = f"{output_dir}/column_disambiguation.json"
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(disambiguation, f, indent=2)

    _print_summary(disambiguation, output_path)

    return disambiguation

# Script entry point: analyse "database.sqlite" from the current working directory.
if __name__ == "__main__":
    default_database = "database.sqlite"
    disambiguate_columns(default_database)