#!/usr/bin/env python3
"""
Attribution Mapper - Identifies which tables own which types of data
Critical for preventing wrong-table errors
"""

import sqlite3
import os
import re

def _quote_identifier(name):
    """Return *name* as a double-quoted SQL identifier, escaping embedded quotes."""
    return '"' + name.replace('"', '""') + '"'


def map_attribution(db_path='./database.sqlite'):
    """Map data ownership patterns to prevent attribution errors.

    Inspects every table of the SQLite database at ``db_path`` and prints a
    report to stdout covering:

    * columns whose normalised name appears more than once within the same
      attribute category (potential attribution conflicts),
    * tables whose name contains a known entity keyword, with row counts,
    * tables that look like summary or detail tables (by name, or by
      aggregate-looking column names),
    * recommended attribution rules derived from table-name patterns.

    The conflicts and rules are also written to
    ``./tool_output/attribution_map.txt`` (UTF-8), relative to the current
    working directory.

    All matching is plain substring matching on lower-cased names, so the
    results are heuristics, not guarantees.

    Args:
        db_path: Path to the SQLite database file.

    Returns:
        None. If ``db_path`` does not exist a message is printed and nothing
        else happens.
    """
    if not os.path.exists(db_path):
        print(f"Database not found at {db_path}")
        return

    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()

        # Get all tables and their columns
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name")
        tables = [row[0] for row in cursor.fetchall()]

        table_columns = {}
        for table in tables:
            # Identifiers cannot be bound as parameters, so quote them instead.
            cursor.execute(f"PRAGMA table_info({_quote_identifier(table)})")
            table_columns[table] = [(row[1], row[2]) for row in cursor.fetchall()]  # name, type

        print("=" * 60)
        print("ATTRIBUTION MAPPING - WHO OWNS WHAT")
        print("=" * 60)

        # Identify common attribute patterns
        attribute_patterns = {
            'biographical': ['birth', 'death', 'age', 'gender', 'nationality'],
            'temporal': ['date', 'time', 'year', 'month', 'day', 'created', 'updated'],
            'financial': ['price', 'cost', 'amount', 'total', 'salary', 'wage', 'revenue'],
            'performance': ['score', 'points', 'wins', 'losses', 'rating', 'rank'],
            'identification': ['id', 'code', 'number', 'key'],
            'descriptive': ['name', 'description', 'title', 'label'],
            'location': ['address', 'city', 'state', 'country', 'zip', 'postal', 'region'],
            'contact': ['email', 'phone', 'fax', 'mobile'],
            'status': ['status', 'active', 'enabled', 'completed', 'approved'],
            'quantity': ['count', 'quantity', 'amount', 'total', 'sum'],
        }

        # Map attributes to tables: a column is recorded once per category,
        # no matter how many of that category's patterns it contains.
        attribute_ownership = {}
        for attr_type, patterns in attribute_patterns.items():
            attribute_ownership[attr_type] = [
                (table, col_name)
                for table, columns in table_columns.items()
                for col_name, _ in columns
                if any(pattern in col_name.lower() for pattern in patterns)
            ]

        # Identify potential conflicts (same type of data in multiple places)
        print("\nPOTENTIAL ATTRIBUTION CONFLICTS:")
        print("-" * 40)

        # attr_type -> [(table, column), ...] restricted to columns that
        # actually collide; reused below so the saved file matches the report.
        conflicts = {}
        for attr_type, locations in attribute_ownership.items():
            # Group by column name with underscores/whitespace removed
            col_groups = {}
            for table, col in locations:
                col_base = re.sub(r'[_\s]+', '', col.lower())
                col_groups.setdefault(col_base, []).append((table, col))

            for col_base, group in col_groups.items():
                if len(group) < 2:
                    continue
                conflicts.setdefault(attr_type, []).extend(group)
                print(f"\n{attr_type.upper()} Data - '{col_base}':")
                for table, col in group:
                    cursor.execute(
                        f"SELECT COUNT(DISTINCT {_quote_identifier(col)}) "
                        f"FROM {_quote_identifier(table)}"
                    )
                    distinct_count = cursor.fetchone()[0]
                    # Columns holding no non-NULL values are not listed.
                    if distinct_count > 0:
                        print(f"  → {table}.{col} ({distinct_count} distinct values)")

        if not conflicts:
            print("No significant conflicts detected")

        # Identify tables that likely own specific entity types
        print("\n\nENTITY OWNERSHIP PATTERNS:")
        print("-" * 40)

        entity_keywords = {
            'customer': ['customer', 'client', 'buyer'],
            'product': ['product', 'item', 'article', 'good'],
            'order': ['order', 'purchase', 'transaction'],
            'employee': ['employee', 'staff', 'worker', 'personnel'],
            'supplier': ['supplier', 'vendor', 'provider'],
            'location': ['location', 'store', 'branch', 'office'],
            'category': ['category', 'type', 'class', 'group'],
        }

        entity_ownership = {}
        for entity_type, keywords in entity_keywords.items():
            matching_tables = [
                table for table in tables
                if any(keyword in table.lower() for keyword in keywords)
            ]
            if matching_tables:
                entity_ownership[entity_type] = matching_tables

        for entity_type, tables_list in entity_ownership.items():
            print(f"\n{entity_type.upper()} data likely in:")
            for table in tables_list:
                cursor.execute(f"SELECT COUNT(*) FROM {_quote_identifier(table)}")
                count = cursor.fetchone()[0]
                print(f"  → {table} ({count} rows)")

        # Check for aggregated vs detail tables
        print("\n\nAGGREGATION LEVEL ANALYSIS:")
        print("-" * 40)

        summary_name_words = ('summary', 'total', 'aggregate', 'stats', 'report')
        detail_name_words = ('detail', 'line', 'item', 'transaction')
        aggregate_column_words = ('sum', 'total', 'count', 'avg', 'average', 'max', 'min')

        potential_summary_tables = []
        potential_detail_tables = []
        for table in tables:
            table_lower = table.lower()
            if any(word in table_lower for word in summary_name_words):
                potential_summary_tables.append(table)
            elif any(word in table_lower for word in detail_name_words):
                potential_detail_tables.append(table)
            # Fall back to column names that look like aggregate results
            elif any(
                word in col_name.lower()
                for col_name, _ in table_columns[table]
                for word in aggregate_column_words
            ):
                potential_summary_tables.append(table)

        # Each table is appended at most once, so the lists are already unique
        # and keep the sorted table order for reproducible output.
        if potential_summary_tables:
            print("\nSummary/Aggregated tables:")
            for table in potential_summary_tables:
                print(f"  → {table} (contains aggregated data)")

        if potential_detail_tables:
            print("\nDetail/Transaction tables:")
            for table in potential_detail_tables:
                print(f"  → {table} (contains detail records)")

        # Generate attribution rules
        print("\n\nRECOMMENDED ATTRIBUTION RULES:")
        print("-" * 40)

        rules = []

        # Rule 1: Check for 'players' vs 'players_teams' pattern
        if 'players' in tables and any('players' in t and t != 'players' for t in tables):
            rules.append("Player biographical data → players table (NOT players_teams or similar)")
            rules.append("Player statistics → players_[context] tables (season/team specific)")

        # Rule 2: Check for regular vs post season
        post_tables = [t for t in tables if 'post' in t.lower() or 'playoff' in t.lower()]
        # Only meaningful when both kinds of table exist side by side.
        if post_tables and len(post_tables) < len(tables):
            rules.append("Regular season data → tables WITHOUT 'post'/'playoff' suffix")
            rules.append("Playoff/postseason data → tables WITH 'post'/'playoff' suffix")

        # Rule 3: Check for current vs historical
        if any('history' in t.lower() or 'archive' in t.lower() for t in tables):
            rules.append("Current data → tables WITHOUT 'history'/'archive'")
            rules.append("Historical data → tables WITH 'history'/'archive'")

        for rule in rules:
            print(f"  • {rule}")

        # Save output. Explicit UTF-8 because the report contains non-ASCII
        # characters that a platform default encoding may not support.
        os.makedirs('./tool_output', exist_ok=True)
        with open('./tool_output/attribution_map.txt', 'w', encoding='utf-8') as f:
            f.write("ATTRIBUTION MAP\n")
            f.write("=" * 40 + "\n\n")

            if conflicts:
                f.write("CONFLICTS REQUIRING CLARIFICATION:\n")
                for attr_type, entries in conflicts.items():
                    f.write(f"\n{attr_type}:\n")
                    for table, col in entries:
                        f.write(f"  - {table}.{col}\n")

            f.write("\n\nATTRIBUTION RULES:\n")
            for rule in rules:
                f.write(f"  • {rule}\n")
    finally:
        # Release the database handle even if a query or the file write fails.
        conn.close()

    print("\n\nAttribution map saved to ./tool_output/attribution_map.txt")

# Script entry point: when this file is executed directly (rather than
# imported), run the analysis with map_attribution's default arguments.
if __name__ == "__main__":
    map_attribution()