#!/usr/bin/env python3
"""
Table Disambiguator
Identifies similar table names and provides guidance on when to use each.
Helps prevent confusion between Sales/Purchase, Order/Invoice, etc.
"""

import sqlite3
import json
import os
from collections import defaultdict
import re

def disambiguate_tables(db_path):
    """Build the complete table-disambiguation report for a SQLite database.

    Connects to ``db_path``, reads every table name from ``sqlite_master``
    and runs the analysis passes in a fixed order. Each pass stores its
    findings under its own key of one shared result dictionary, and the
    later passes read what the earlier ones stored.

    Args:
        db_path: Database path handed to ``sqlite3.connect``.

    Returns:
        dict: Report with the keys ``similar_tables`` (list),
        ``table_purposes`` (dict), ``selection_rules`` (list),
        ``common_confusions`` (list) and ``decision_trees`` (dict).
    """
    # Placeholders use the same container type each pass later stores,
    # so the shape of the report never depends on how far the run got.
    disambiguation = {
        'similar_tables': [],
        'table_purposes': {},
        'selection_rules': [],
        'common_confusions': [],
        'decision_trees': {}
    }

    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()

        # Get all tables
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
        tables = [row[0] for row in cursor.fetchall()]

        # Analyze tables; order matters because the rule, confusion and
        # decision-tree passes read 'similar_tables' / 'table_purposes'.
        identify_similar_tables(tables, disambiguation)
        analyze_table_purposes(cursor, tables, disambiguation)
        generate_selection_rules(disambiguation)
        identify_common_confusions(disambiguation)
        create_decision_trees(disambiguation)
    finally:
        # Release the connection even when one of the passes raises.
        conn.close()

    return disambiguation

def identify_similar_tables(tables, disambiguation):
    """Group table names that are easy to confuse with one another.

    Every table name is matched, case-insensitively and by substring,
    against a fixed list of keywords. Groups of potentially confusing
    tables are then derived from the keyword buckets and stored as a list
    under ``disambiguation['similar_tables']``.

    A table whose name contains several keywords of the same group (for
    example ``EmployeePerson``) is listed once, and a group is only
    reported when it holds at least two distinct tables; a single table
    cannot be confused with itself.

    Args:
        tables: Iterable of table names.
        disambiguation: Result dictionary; its ``'similar_tables'`` entry
            is replaced with the list of group dictionaries.
    """
    # Keyword -> tables whose lower-cased name contains the keyword.
    # Not every bucket is consumed below (e.g. 'invoice', 'payment').
    patterns = {
        'order': [],
        'sales': [],
        'purchase': [],
        'customer': [],
        'product': [],
        'person': [],
        'employee': [],
        'store': [],
        'vendor': [],
        'invoice': [],
        'payment': [],
        'transaction': []
    }

    for table in tables:
        table_lower = table.lower()
        for keyword, matches in patterns.items():
            if keyword in table_lower:
                matches.append(table)

    def distinct(*keywords):
        """Merge the buckets of ``keywords``, dropping repeated tables."""
        merged = (name for keyword in keywords for name in patterns[keyword])
        # dict.fromkeys keeps first-seen order while removing duplicates.
        return list(dict.fromkeys(merged))

    similar_groups = []

    # Sales vs Purchase
    if patterns['sales'] and patterns['purchase']:
        sales_purchase = distinct('sales', 'purchase')
        if len(sales_purchase) > 1:
            similar_groups.append({
                'group': 'sales_vs_purchase',
                'tables': sales_purchase,
                'confusion_risk': 'HIGH',
                'key_difference': 'Sales = customer transactions, Purchase = vendor/supplier transactions'
            })

    # Order variations: classify each order table by the first qualifier
    # found in its name; only report when more than one kind exists.
    if patterns['order']:
        order_types = defaultdict(list)
        for table in patterns['order']:
            table_lower = table.lower()
            if 'sales' in table_lower:
                order_types['sales_orders'].append(table)
            elif 'purchase' in table_lower:
                order_types['purchase_orders'].append(table)
            elif 'work' in table_lower:
                order_types['work_orders'].append(table)
            else:
                order_types['generic_orders'].append(table)

        if len(order_types) > 1:
            similar_groups.append({
                'group': 'order_variations',
                'tables': patterns['order'],
                'confusion_risk': 'HIGH',
                'types': dict(order_types)
            })

    # Person vs Employee vs Customer
    person_related = distinct('person', 'employee', 'customer')
    if len(person_related) > 1:
        similar_groups.append({
            'group': 'person_entities',
            'tables': person_related,
            'confusion_risk': 'MEDIUM',
            'key_difference': 'Person = general people data, Employee = staff, Customer = buyers'
        })

    # Store vs Vendor
    if patterns['store'] and patterns['vendor']:
        store_vendor = distinct('store', 'vendor')
        if len(store_vendor) > 1:
            similar_groups.append({
                'group': 'business_entities',
                'tables': store_vendor,
                'confusion_risk': 'MEDIUM',
                'key_difference': 'Store = selling locations, Vendor = suppliers'
            })

    disambiguation['similar_tables'] = similar_groups

def analyze_table_purposes(cursor, tables, disambiguation):
    """Infer a likely purpose for each table from its column names.

    For every table the columns are read with ``PRAGMA table_info`` and
    their lower-cased names compared against an ordered list of column-name
    signatures. The first signature with any column present decides
    ``likely_purpose`` and ``data_type``; both stay ``'unknown'`` when no
    signature matches. Declared foreign keys are read with
    ``PRAGMA foreign_key_list``.

    Table names are double-quoted (embedded quotes doubled) before being
    placed into the PRAGMA text, so names that contain spaces, punctuation
    or SQL keywords are analysed instead of raising a syntax error.

    Args:
        cursor: Open ``sqlite3`` cursor on the database holding ``tables``.
        tables: Iterable of table names to inspect.
        disambiguation: Result dictionary; its ``'table_purposes'`` entry
            is replaced with a mapping of table name to a dictionary with
            the keys ``likely_purpose``, ``key_columns`` (at most five),
            ``relationships`` and ``data_type``.
    """
    # (columns, likely_purpose, data_type) in priority order: the first
    # entry with at least one column present in the table wins.
    purpose_signatures = (
        (('customerid', 'customer_id', 'salespersonid', 'sales_person_id'),
         'sales_transactions', 'transactional'),
        (('vendorid', 'vendor_id', 'purchaseorderid', 'purchase_order_id', 'supplierid'),
         'purchase_transactions', 'transactional'),
        (('firstname', 'lastname', 'first_name', 'last_name', 'birthdate', 'email'),
         'person_information', 'master_data'),
        (('productid', 'product_id', 'productname', 'product_name', 'price', 'cost'),
         'product_catalog', 'master_data'),
        (('storeid', 'store_id', 'storename', 'store_name', 'location', 'address'),
         'store_information', 'master_data'),
        (('territoryid', 'territory_id', 'region', 'country', 'state'),
         'geographic_data', 'reference_data'),
        (('amount', 'total', 'subtotal', 'tax', 'discount'),
         'financial_records', 'transactional'),
    )

    # Substrings that mark a column as "key"; plain substring matching, so
    # e.g. a column named 'paid' also qualifies because it contains 'id'.
    key_markers = ('id', 'key', 'name', 'date', 'amount', 'total', 'status', 'type')

    table_purposes = {}

    for table in tables:
        # PRAGMA arguments cannot be bound as parameters; quote the name
        # as an SQL identifier instead of interpolating it raw.
        quoted = '"' + table.replace('"', '""') + '"'

        cursor.execute(f"PRAGMA table_info({quoted})")
        col_names = [row[1].lower() for row in cursor.fetchall()]
        present = set(col_names)

        likely_purpose = 'unknown'
        data_type = 'unknown'
        for signature, candidate_purpose, candidate_type in purpose_signatures:
            if not present.isdisjoint(signature):
                likely_purpose = candidate_purpose
                data_type = candidate_type
                break

        key_columns = [
            name for name in col_names
            if any(marker in name for marker in key_markers)
        ]

        # foreign_key_list rows: index 2 is the referenced table,
        # index 3 the referencing column of this table.
        cursor.execute(f"PRAGMA foreign_key_list({quoted})")
        relationships = [
            {'to_table': fk[2], 'column': fk[3]}
            for fk in cursor.fetchall()
        ]

        table_purposes[table] = {
            'likely_purpose': likely_purpose,
            'key_columns': key_columns[:5],  # Limit to top 5
            'relationships': relationships,
            'data_type': data_type
        }

    disambiguation['table_purposes'] = table_purposes

def generate_selection_rules(disambiguation):
    """Derive table-selection rules from the earlier analysis results.

    One rule is emitted for each recognised group found in
    ``disambiguation['similar_tables']`` (in the order the groups appear),
    followed by a ``transaction_type`` rule when the table purposes contain
    both sales and purchase transactions. The resulting list replaces
    ``disambiguation['selection_rules']``.
    """

    def rule_for_group(group_name):
        """Return a fresh rule for ``group_name``, or None if it has none."""
        if group_name == 'sales_vs_purchase':
            return {
                'pattern': 'order_type_selection',
                'rule': 'Use "Sales" tables for customer-related queries, "Purchase" tables for vendor/supplier queries',
                'examples': [
                    "Customer order → SalesOrderHeader",
                    "Vendor purchase → PurchaseOrderHeader",
                    "Product bought by customer → SalesOrderDetail",
                    "Product bought from supplier → PurchaseOrderDetail"
                ]
            }
        if group_name == 'order_variations':
            return {
                'pattern': 'order_disambiguation',
                'rule': 'Match order type to business context',
                'examples': [
                    "Customer purchases → SalesOrder*",
                    "Company purchases → PurchaseOrder*",
                    "Internal work → WorkOrder*"
                ]
            }
        if group_name == 'person_entities':
            return {
                'pattern': 'person_selection',
                'rule': 'Choose person table based on role',
                'examples': [
                    "Customer names → Customer + Person",
                    "Employee details → Employee + Person",
                    "General people → Person"
                ]
            }
        return None

    selection_rules = []

    # Group-driven rules, one per recognised similar-table group.
    for entry in disambiguation['similar_tables']:
        group_rule = rule_for_group(entry['group'])
        if group_rule is not None:
            selection_rules.append(group_rule)

    # Purpose-driven rule: only relevant when both directions exist.
    seen_purposes = [
        info['likely_purpose']
        for info in disambiguation['table_purposes'].values()
    ]
    if 'sales_transactions' in seen_purposes and 'purchase_transactions' in seen_purposes:
        selection_rules.append({
            'pattern': 'transaction_type',
            'rule': 'Identify transaction direction from context',
            'decision_logic': [
                "If 'customer' mentioned → Sales tables",
                "If 'vendor'/'supplier' mentioned → Purchase tables",
                "If 'tax amount' with 'purchase' → PurchaseOrderHeader",
                "If 'tax amount' with 'sales' → SalesOrderHeader"
            ]
        })

    disambiguation['selection_rules'] = selection_rules

def identify_common_confusions(disambiguation):
    """Record typical table-selection mistakes for the tables present.

    Works on the table names that are keys of
    ``disambiguation['table_purposes']`` and checks three pairings:
    sales/purchase, store/customer and person/employee. Each pairing whose
    two sides are both present adds one entry; the list of entries replaces
    ``disambiguation['common_confusions']``.
    """
    table_names = list(disambiguation['table_purposes'])

    def names_containing(fragment):
        """Tables whose lower-cased name contains ``fragment``."""
        return [name for name in table_names if fragment in name.lower()]

    found = []

    # Sales/Purchase: 'sale' also covers every name containing 'sales'.
    selling = names_containing('sale')
    buying = names_containing('purchase')
    if selling and buying:
        found.append({
            'type': 'sales_purchase_confusion',
            'tables_involved': selling + buying,
            'common_mistake': 'Using SalesOrderHeader when PurchaseOrderHeader is needed',
            'how_to_avoid': 'Check if dealing with customer (sales) or vendor (purchase) transactions',
            'keywords_to_watch': ['purchase order', 'vendor', 'supplier', 'procurement']
        })

    # Store navigation
    stores = names_containing('store')
    customers = names_containing('customer')
    if stores and customers:
        found.append({
            'type': 'store_navigation',
            'tables_involved': stores + customers,
            'common_mistake': 'Joining Store through Address instead of Customer',
            'how_to_avoid': 'For territory-store relationships, go through Customer table',
            'correct_path': 'SalesTerritory → Customer → Store'
        })

    # Person/Employee: the person side must be named exactly 'person'
    # (case-insensitive), not merely contain the word.
    persons = [name for name in table_names if name.lower() == 'person']
    employees = names_containing('employee')
    if persons and employees:
        found.append({
            'type': 'person_entity_confusion',
            'tables_involved': persons + employees,
            'common_mistake': 'Using Employee when Person is needed for names',
            'how_to_avoid': 'Person table typically has FirstName/LastName',
            'join_pattern': 'Entity → Person for name details'
        })

    disambiguation['common_confusions'] = found

def create_decision_trees(disambiguation):
    """Attach decision trees that guide the harder table choices.

    Looks at the table names that are keys of
    ``disambiguation['table_purposes']``. An ``order_selection`` tree is
    added when more than one name contains 'order', and a
    ``territory_navigation`` tree when any name contains 'territory'
    (both case-insensitive). The resulting mapping replaces
    ``disambiguation['decision_trees']``.
    """
    lowered_names = [name.lower() for name in disambiguation['table_purposes']]
    order_count = sum(1 for name in lowered_names if 'order' in name)
    has_territory = any('territory' in name for name in lowered_names)

    decision_trees = {}

    # Only worth a tree when there is more than one order table to pick from.
    if order_count > 1:
        header_or_detail = 'Check for Header vs Detail'
        decision_trees['order_selection'] = {
            'question': 'What type of order?',
            'branches': {
                'customer_order': {
                    'indicators': ['customer', 'sold', 'revenue'],
                    'table': 'SalesOrderHeader or similar',
                    'next': header_or_detail
                },
                'vendor_order': {
                    'indicators': ['purchase', 'vendor', 'supplier', 'procurement'],
                    'table': 'PurchaseOrderHeader or similar',
                    'next': header_or_detail
                },
                'internal_order': {
                    'indicators': ['work', 'production', 'manufacturing'],
                    'table': 'WorkOrder or similar',
                    'next': None
                }
            }
        }

    if has_territory:
        decision_trees['territory_navigation'] = {
            'question': 'How to reach territory-related data?',
            'branches': {
                'stores_in_territory': {
                    'path': 'SalesTerritory → Customer → Store',
                    'not': 'Store → Address → StateProvince',
                    'reason': 'Customer links territories to stores'
                },
                'sales_in_territory': {
                    'path': 'SalesTerritory → SalesOrderHeader',
                    'alternative': 'SalesTerritory → Customer → SalesOrderHeader',
                    'reason': 'Direct or through Customer'
                }
            }
        }

    disambiguation['decision_trees'] = decision_trees

def main():
    """Run the disambiguation on ./database.sqlite and write a JSON report.

    Prints an error and returns early when the database file is missing.
    Otherwise creates the ``tool_output`` directory, saves the report to
    ``tool_output/table_disambiguation.json`` and prints a summary plus up
    to two of the detected confusions.
    """
    db_path = "./database.sqlite"

    # Guard clause: nothing to analyse without the database file.
    if not os.path.exists(db_path):
        print(f"Error: Database not found at {db_path}")
        return

    os.makedirs("tool_output", exist_ok=True)

    report = disambiguate_tables(db_path)

    output_path = "tool_output/table_disambiguation.json"
    with open(output_path, 'w') as out_file:
        json.dump(report, out_file, indent=2)

    # Summary counts, one line per report section.
    group_count = len(report['similar_tables'])
    purpose_count = len(report['table_purposes'])
    rule_count = len(report['selection_rules'])
    confusion_count = len(report['common_confusions'])

    print("✅ Table disambiguation complete")
    print(f"🔍 Similar table groups: {group_count}")
    print(f"📋 Table purposes identified: {purpose_count}")
    print(f"📏 Selection rules: {rule_count}")
    print(f"⚠️ Common confusions: {confusion_count}")
    print(f"💾 Results saved to: {output_path}")

    # Key insights: show at most the first two confusions.
    top_confusions = report['common_confusions'][:2]
    if top_confusions:
        print("\n⚠️ Watch out for these confusions:")
        for item in top_confusions:
            print(f"  • {item['type']}: {item['common_mistake']}")
            print(f"    Fix: {item['how_to_avoid']}")

# Script entry point: run the analysis only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()