#!/usr/bin/env python3
"""
Enhanced Value Extractor Tool
Samples values from each column to identify patterns, case sensitivity, and column types.
Enhanced to detect human-readable identifier columns.
"""

import sqlite3
import json
import os
import re
from collections import Counter

# Matches an ISO-style ``YYYY-MM-DD`` prefix at the start of a text value.
_DATE_PREFIX_RE = re.compile(r'\d{4}-\d{2}-\d{2}')

# Matches a trailing ``id`` / ``_id`` suffix in any letter case.
_ID_SUFFIX_RE = re.compile(r'_?id\Z', re.IGNORECASE)

# Preferred generic identifier column names, in priority order.
_GENERIC_NAME_CANDIDATES = ('Name', 'name', 'Title', 'Label')


def _quote_identifier(name):
    """Return *name* as a backtick-quoted SQL identifier with backticks doubled."""
    return "`" + name.replace("`", "``") + "`"


def _json_safe(value):
    """Return *value* unchanged unless it is binary, which JSON cannot encode."""
    if isinstance(value, (bytes, bytearray, memoryview)):
        return f"<BLOB {len(value)} bytes>"
    return value


def _analyze_column(cursor, table_name, col_name, col_type, limit):
    """Sample one column and return its characteristics dict.

    Raises sqlite3.Error if either query fails; the caller records the error.
    """
    table_sql = _quote_identifier(table_name)
    col_sql = _quote_identifier(col_name)

    cursor.execute(
        f"SELECT DISTINCT {col_sql} FROM {table_sql} "
        f"WHERE {col_sql} IS NOT NULL LIMIT ?",
        (limit,),
    )
    values = [row[0] for row in cursor.fetchall()]

    cursor.execute(f"SELECT COUNT(*) FROM {table_sql} WHERE {col_sql} IS NULL")
    null_count = cursor.fetchone()[0]

    col_info = {
        "type": col_type,
        "sample_values": [_json_safe(v) for v in values[:10]],
        # Distinct non-NULL values seen in the sample, so capped by the limit.
        "distinct_count": len(values),
        "has_nulls": null_count > 0,
        "case_variations": False,
        "is_numeric": False,
        "is_date": False,
        "is_identifier_candidate": False,
        "null_count": null_count,
    }

    if not values:
        return col_info

    if all(isinstance(v, (int, float)) for v in values):
        col_info["is_numeric"] = True
    elif any(isinstance(v, str) and _DATE_PREFIX_RE.match(v) for v in values):
        col_info["is_date"] = True
    elif all(isinstance(v, str) for v in values):
        # The sampled values are already distinct, so the lowercased set only
        # shrinks when two values differ purely by letter case. Every value is
        # lowercased (including '') so both sides count the same items.
        if len({v.lower() for v in values}) < len(set(values)):
            col_info["case_variations"] = True

    return col_info


def _pick_likely_identifier(name_columns):
    """Choose the most plausible human-readable identifier among *name_columns*."""
    if not name_columns:
        return None
    if len(name_columns) == 1:
        return name_columns[0]
    if 'FirstName' in name_columns and 'LastName' in name_columns:
        return ['FirstName', 'LastName']
    for candidate in _GENERIC_NAME_CANDIDATES:
        if candidate in name_columns:
            return candidate
    return name_columns[0]


def _map_ids_to_names(table_name, id_columns, name_columns):
    """Pair each ID column with the first name column containing its base name."""
    mappings = {}
    for id_col in id_columns:
        # Strip only the trailing id/_id suffix (ProductID -> Product); a bare
        # "id" column leaves an empty base and pairs with the first name column.
        base_name = _ID_SUFFIX_RE.sub('', id_col).lower()
        for name_col in name_columns:
            if base_name in name_col.lower():
                mappings[f"{table_name}.{id_col}"] = f"{table_name}.{name_col}"
                break
    return mappings


def extract_values(db_path="database.sqlite", sample_size=100):
    """Extract sample values and identify human-readable columns.

    For every non-empty user table, samples up to ``sample_size`` distinct
    non-NULL values per column, classifies each column (numeric, date-like,
    case variations, identifier candidate) and writes the result to
    ``tool_output/value_samples.json``. A summary is printed to stdout.

    Args:
        db_path: Path of the SQLite database file to inspect.
        sample_size: Maximum number of distinct values sampled per column.

    Per-table and per-column SQLite errors are recorded under an ``"error"``
    key in the output instead of aborting the run; any other failure is
    recorded under the top-level ``"error"`` key.
    """
    limit = int(sample_size)

    os.makedirs("tool_output", exist_ok=True)

    conn = sqlite3.connect(db_path)

    value_info = {
        "tables": {},
        "case_sensitive_columns": [],
        "date_columns": [],
        "numeric_columns": [],
        "identifier_columns": {},  # Maps tables to their human-readable identifier columns
        "id_to_name_mappings": {}  # Maps ID columns to corresponding name columns
    }

    try:
        cursor = conn.cursor()
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name")
        tables = cursor.fetchall()

        for (table_name,) in tables:
            # Skip SQLite's internal bookkeeping tables.
            if table_name.startswith("sqlite_"):
                continue

            table_info = {
                "columns": {},
                "row_count": 0,
                "likely_identifier": None  # The human-readable identifier for this table
            }
            value_info["tables"][table_name] = table_info
            table_sql = _quote_identifier(table_name)

            try:
                cursor.execute(f"SELECT COUNT(*) FROM {table_sql}")
                row_count = cursor.fetchone()[0]
            except sqlite3.Error as exc:
                # Best effort: note why the table was skipped and move on.
                table_info["error"] = str(exc)
                continue
            table_info["row_count"] = row_count

            if row_count == 0:
                continue

            cursor.execute(f"PRAGMA table_info({table_sql})")
            columns = cursor.fetchall()

            name_columns = []
            id_columns = []

            for col in columns:
                col_name = col[1]
                col_type = col[2].upper()
                qualified = f"{table_name}.{col_name}"

                try:
                    col_info = _analyze_column(cursor, table_name, col_name, col_type, limit)
                except sqlite3.Error as exc:
                    table_info["columns"][col_name] = {
                        "error": str(exc),
                        "type": col_type
                    }
                    continue

                if col_info["is_numeric"]:
                    value_info["numeric_columns"].append(qualified)
                elif col_info["is_date"]:
                    value_info["date_columns"].append(qualified)
                elif col_info["case_variations"]:
                    value_info["case_sensitive_columns"].append(qualified)

                # Classify by column name; the order matters, e.g. "TitleID"
                # counts as an ID column rather than a title column.
                col_name_lower = col_name.lower()
                if 'name' in col_name_lower:
                    name_columns.append(col_name)
                    col_info["is_identifier_candidate"] = True
                elif col_name_lower.endswith('id'):
                    id_columns.append(col_name)
                elif 'title' in col_name_lower or 'label' in col_name_lower:
                    name_columns.append(col_name)
                    col_info["is_identifier_candidate"] = True

                table_info["columns"][col_name] = col_info

            table_info["likely_identifier"] = _pick_likely_identifier(name_columns)
            value_info["identifier_columns"][table_name] = table_info["likely_identifier"]

            value_info["id_to_name_mappings"].update(
                _map_ids_to_names(table_name, id_columns, name_columns)
            )

    except Exception as e:
        # Top-level boundary: keep whatever was collected and report the failure.
        value_info["error"] = str(e)

    finally:
        conn.close()

    with open("tool_output/value_samples.json", "w", encoding="utf-8") as f:
        json.dump(value_info, f, indent=2)

    print(f"Value extraction complete: {len(value_info['tables'])} tables analyzed")
    print(f"Found {len(value_info['case_sensitive_columns'])} case-sensitive columns")
    print(f"Identified human-readable columns for {len(value_info['identifier_columns'])} tables")
    print("Results saved to tool_output/value_samples.json")

# Script entry point: analyze ./database.sqlite with the default sample size
# when this file is executed directly (not when it is imported).
if __name__ == "__main__":
    extract_values()