#!/usr/bin/env python3
"""
Enhanced Column Selector Tool
Maps question patterns to specific columns with focus on human-readable identifiers.
"""

import json
import os
import sqlite3
import re

def _quote_identifier(name):
    """Return *name* as a double-quoted SQL identifier, doubling embedded quotes."""
    return '"' + name.replace('"', '""') + '"'


def _strip_id_suffix(column_name):
    """Return *column_name* without its trailing ``id`` marker.

    The marker is matched case-insensitively and only at the end of the name,
    together with any underscore separator in front of it, e.g.
    ``CustomerID`` -> ``Customer``, ``customer_id`` -> ``customer``,
    ``customerid`` -> ``customer`` and ``ID`` -> ``""``. Names that do not end
    in ``id`` are returned unchanged.
    """
    if not column_name.lower().endswith('id'):
        return column_name
    return column_name[:-2].rstrip('_')


def _read_schema(cursor):
    """Return ``(tables, table_columns)`` for the user tables visible to *cursor*.

    ``tables`` is the list of table names ordered by name, without the
    internal ``sqlite_*`` tables. ``table_columns`` maps each table name to a
    ``{column_name: declared_type}`` dict in declaration order.
    """
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name")
    tables = [row[0] for row in cursor.fetchall() if not row[0].startswith("sqlite_")]

    table_columns = {}
    for table in tables:
        cursor.execute(f"PRAGMA table_info({_quote_identifier(table)})")
        # Each PRAGMA table_info row is (cid, name, type, notnull, dflt_value, pk).
        table_columns[table] = {col[1]: col[2] for col in cursor.fetchall()}
    return tables, table_columns


def generate_column_selection_rules(db_path="database.sqlite", output_dir="tool_output",
                                    fk_table_limit=5):
    """Generate column selection rules based on database schema and patterns.

    The schema of the SQLite database at ``db_path`` is inspected (no table
    data is read) and a set of heuristic rules is derived from table and
    column names: person-name columns, ID-to-name mappings, per-table
    identifier columns, foreign-key style ID columns, plus a fixed list of
    general guidelines. The rules are written as JSON to
    ``<output_dir>/column_selection.json`` and a short summary is printed.

    Args:
        db_path: Path of the SQLite database file to inspect.
        output_dir: Directory that receives ``column_selection.json``; it is
            created if missing.
        fk_table_limit: Number of tables (in name order) scanned for
            foreign-key style ID columns; ``None`` scans every table.

    Returns:
        The rules dictionary that was written to disk. If the analysis raised
        an exception, the dictionary holds whatever was collected up to that
        point plus an ``"error"`` key with the exception message.
    """
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, "column_selection.json")

    selection_rules = {
        "person_patterns": {},
        "entity_patterns": {},
        "multi_column_patterns": [],
        "id_vs_name_mappings": {},
        "special_cases": [],
        "general_rules": []
    }

    conn = sqlite3.connect(db_path)
    try:
        tables, table_columns = _read_schema(conn.cursor())

        # Person-related columns: prefer a first/last name pair, otherwise a
        # single full-name column.
        for table, columns in table_columns.items():
            firstname_col = next((col for col in columns if 'firstname' in col.lower()), None)
            lastname_col = next((col for col in columns if 'lastname' in col.lower()), None)
            name_col = next(
                (col for col in columns if col.lower() in ('name', 'fullname', 'full_name')),
                None,
            )

            if firstname_col is not None and lastname_col is not None:
                selection_rules["person_patterns"][table] = {
                    "pattern": "Who/Which person",
                    "columns": [firstname_col, lastname_col],
                    "sql": f"SELECT {firstname_col}, {lastname_col}",
                    "note": "Return both FirstName and LastName for person identification"
                }

                # Multi-column pattern for full name
                selection_rules["multi_column_patterns"].append({
                    "question_pattern": f"name of {table.lower()}",
                    "columns": [firstname_col, lastname_col],
                    "order": "FirstName, LastName",
                    "reason": "Full name requires both components"
                })

            elif name_col is not None:
                selection_rules["person_patterns"][table] = {
                    "pattern": "Who/Which person",
                    "columns": [name_col],
                    "sql": f"SELECT {name_col}",
                    "note": "Single name column for person identification"
                }

        # Entity patterns (non-person): pair ID columns with readable columns.
        for table, columns in table_columns.items():
            # NOTE: the "ends with id" test is a name heuristic; it also
            # accepts ordinary words such as "paid" or "valid".
            id_cols = [col for col in columns if col.lower().endswith('id')]
            name_cols = [
                col for col in columns
                if 'name' in col.lower() or col.lower() in ('title', 'label')
            ]

            for id_col in id_cols:
                subject = _strip_id_suffix(id_col)
                # A bare "id" column leaves an empty subject, which is a
                # substring of every name, so it pairs with the first
                # readable column of the table.
                matching_name = next(
                    (
                        name_col for name_col in name_cols
                        if subject.lower() in name_col.lower() or name_col.lower() == 'name'
                    ),
                    None,
                )
                if matching_name is None:
                    continue

                # Describe a bare "id" column by its table so the rule text
                # never contains an empty subject.
                label = subject or table
                selection_rules["id_vs_name_mappings"][f"{table}.{id_col}"] = {
                    "readable_column": matching_name,
                    "rule": f"When asked for {label}, return {matching_name} NOT {id_col}",
                    "example": f"'What {label.lower()}' → SELECT {matching_name}"
                }

            # Entity selection patterns
            entity_name = table.replace('_', ' ').title()
            if name_cols:
                primary_name = name_cols[0]  # Use first name column as primary
                selection_rules["entity_patterns"][table] = {
                    "entity": entity_name,
                    "identifier_column": primary_name,
                    "rule": f"'What/Which {entity_name.lower()}' → SELECT {primary_name}",
                    "not_id": f"Never return {id_cols[0] if id_cols else 'ID'} unless explicitly asked"
                }

        # Special cases
        # Capital city pattern
        if 'Country' in table_columns and 'City' in table_columns:
            if 'Capital' in table_columns['Country'] and 'Name' in table_columns['City']:
                selection_rules["special_cases"].append({
                    "pattern": "capital city",
                    "explanation": "Capital is an ID, need JOIN to get city name",
                    "correct_sql": "SELECT City.Name FROM Country JOIN City ON Country.Capital = City.ID",
                    "wrong_sql": "SELECT Capital FROM Country",
                    "columns_to_return": ["City.Name"]
                })

        # Multi-column request patterns
        selection_rules["multi_column_patterns"].extend([
            {
                "question_pattern": "X and Y",
                "rule": "Return both X and Y in that order",
                "example": "'name and age' → SELECT name, age"
            },
            {
                "question_pattern": "X, Y and Z",
                "rule": "Return all three in listed order",
                "example": "'name, capital and language' → SELECT name, capital, language"
            },
            {
                "question_pattern": "List the X and Y",
                "rule": "Return both columns",
                "example": "'List the cities and countries' → SELECT city, country"
            }
        ])

        # General selection rules
        selection_rules["general_rules"] = [
            {
                "rule": "Human-Readable Priority",
                "description": "Always return human-readable columns (names) over IDs",
                "examples": [
                    "ProductID → ProductName",
                    "CustomerID → CustomerName or FirstName, LastName",
                    "CountryCode → CountryName"
                ]
            },
            {
                "rule": "Multi-Column Evidence",
                "description": "When evidence says 'refers to X, Y', return BOTH columns",
                "example": "'name refers to FirstName, LastName' → SELECT FirstName, LastName"
            },
            {
                "rule": "Question Order Preservation",
                "description": "Return columns in the order they appear in the question",
                "example": "'age and name' → SELECT age, name (NOT name, age)"
            },
            {
                "rule": "Complete Information",
                "description": "Return ALL requested information, not just primary keys",
                "example": "'customer details' might need name, address, phone"
            }
        ]

        # Foreign-key style columns: "<table>id" columns whose prefix names an
        # existing table. Only the first `fk_table_limit` tables are scanned.
        tables_by_lower_name = {name.lower(): name for name in tables}
        for table in tables[:fk_table_limit]:
            for col_name in table_columns[table]:
                lowered = col_name.lower()
                if not lowered.endswith('id') or lowered == 'id':
                    continue
                ref_table = tables_by_lower_name.get(_strip_id_suffix(col_name).lower())
                if ref_table is None:
                    continue
                selection_rules["special_cases"].append({
                    "column": f"{table}.{col_name}",
                    "type": "foreign_key_id",
                    "instruction": f"JOIN to {ref_table} table to get readable value",
                    "avoid": f"Don't return raw {col_name} unless specifically asked for ID"
                })

    except Exception as e:
        # Best effort: keep whatever was collected and record the failure in
        # the output instead of aborting without writing a file.
        selection_rules["error"] = str(e)

    finally:
        conn.close()

    # Write output
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(selection_rules, f, indent=2)

    print("Column selection analysis complete")
    print(f"Found {len(selection_rules['person_patterns'])} person patterns")
    print(f"Found {len(selection_rules['entity_patterns'])} entity patterns")
    print(f"Found {len(selection_rules['id_vs_name_mappings'])} ID-to-name mappings")
    print(f"Results saved to {output_path}")

    return selection_rules

# Script entry point: run the analysis with the function's defaults, i.e. read
# "database.sqlite" and write tool_output/column_selection.json, both relative
# to the current working directory.
if __name__ == "__main__":
    generate_column_selection_rules()