#!/usr/bin/env python3
"""
Enhanced Value Extractor Tool
Samples values with increased precision and pattern detection.
"""

import sqlite3
import json
import os
import re
from collections import Counter

# Keyword fragments SQLite searches for (in this order of precedence) inside a
# declared column type when it assigns a type affinity.
_TEXT_TYPE_MARKERS = ("CHAR", "CLOB", "TEXT")
_REAL_TYPE_MARKERS = ("REAL", "FLOA", "DOUB")
_NUMERIC_AFFINITIES = ("INTEGER", "REAL", "NUMERIC")

# (compiled pattern, label) pairs tried in order; the first format matched by
# more than half of the inspected samples is reported.
_DATE_PATTERNS = (
    (re.compile(r'^\d{4}-\d{2}-\d{2}$'), "YYYY-MM-DD"),
    (re.compile(r'^\d{2}/\d{2}/\d{4}$'), "MM/DD/YYYY"),
    (re.compile(r'^\d{4}$'), "YYYY"),
    (re.compile(r'^\d{2}-\d{2}-\d{4}$'), "DD-MM-YYYY"),
)

# Value vocabularies (lower-cased) that mark a text column as boolean-like.
_BOOLEAN_SETS = (
    {'true', 'false'},
    {'yes', 'no'},
    {'y', 'n'},
    {'1', '0'},
    {'t', 'f'},
    {'pos', 'neg'},
    {'active', 'inactive'},
)


def _quote_identifier(name):
    """Return *name* as a double-quoted SQL identifier, doubling embedded quotes."""
    return '"' + name.replace('"', '""') + '"'


def _declared_affinity(declared_type):
    """Map a declared column type to the affinity SQLite would give it."""
    upper = declared_type.upper()
    if "INT" in upper:
        return "INTEGER"
    if any(marker in upper for marker in _TEXT_TYPE_MARKERS):
        return "TEXT"
    if not upper or "BLOB" in upper:
        return "BLOB"
    if any(marker in upper for marker in _REAL_TYPE_MARKERS):
        return "REAL"
    return "NUMERIC"


def _json_default(value):
    """Serialize BLOB values (bytes) as hex text; reject every other type."""
    if isinstance(value, bytes):
        return value.hex()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


def _analyze_text(samples, column_data):
    """Fill case/date/boolean findings into *column_data*.

    Returns the names of the ``value_info`` category lists the column
    belongs to (each at most once).
    """
    categories = []
    total = len(samples)

    case_patterns = {
        "all_uppercase": 0,
        "all_lowercase": 0,
        "mixed_case": 0,
        "title_case": 0
    }
    spellings = {}
    for val in samples:
        if not isinstance(val, str):
            continue
        if val.isupper():
            case_patterns["all_uppercase"] += 1
        elif val.islower():
            case_patterns["all_lowercase"] += 1
        elif val.istitle():
            case_patterns["title_case"] += 1
        else:
            case_patterns["mixed_case"] += 1
        spellings.setdefault(val.lower(), set()).add(val)

    # Values that differ only by letter case; sorted so the report is stable.
    for lower_val, variations in spellings.items():
        if len(variations) > 1:
            column_data["case_variations"][lower_val] = sorted(variations)
    if column_data["case_variations"]:
        # Register the column once, however many variation groups it has.
        categories.append("case_sensitive_columns")

    if case_patterns["all_uppercase"] > total * 0.8:
        column_data["value_patterns"].append("UPPERCASE")
    elif case_patterns["all_lowercase"] > total * 0.8:
        column_data["value_patterns"].append("lowercase")
    elif case_patterns["title_case"] > total * 0.8:
        column_data["value_patterns"].append("Title Case")

    # Date detection looks at the first 100 samples only.
    head = samples[:100]
    for pattern, format_name in _DATE_PATTERNS:
        # fullmatch: a bare `$` would also accept a trailing newline.
        matches = sum(1 for val in head if isinstance(val, str) and pattern.fullmatch(val))
        if matches > len(head) * 0.5:
            column_data["value_patterns"].append(f"DATE:{format_name}")
            categories.append("date_columns")
            break

    unique_vals = {str(v).lower() for v in samples if v is not None}
    for bool_set in _BOOLEAN_SETS:
        if unique_vals.issubset(bool_set):
            column_data["is_boolean"] = True
            column_data["value_patterns"].append(f"BOOLEAN:{'/'.join(sorted(unique_vals))}")
            categories.append("boolean_columns")
            break

    return categories


def _analyze_numeric(samples, column_data):
    """Fill min/max/avg and percentage findings into *column_data*.

    Returns the names of the ``value_info`` category lists the column
    belongs to.
    """
    # SQLite is dynamically typed: a numeric column may still hold text or
    # blobs, which cannot be compared or summed with numbers.
    numeric_vals = [v for v in samples if isinstance(v, (int, float))]
    if not numeric_vals:
        return []

    column_data["numeric_stats"] = {
        "min": min(numeric_vals),
        "max": max(numeric_vals),
        "avg": sum(numeric_vals) / len(numeric_vals)
    }

    categories = []
    # Heuristic: every sampled value inside 0-1 or 0-100 counts as a percentage.
    if all(0 <= v <= 1 for v in numeric_vals):
        column_data["is_percentage"] = True
        column_data["value_patterns"].append("PERCENTAGE:0-1")
        categories.append("percentage_columns")
    elif all(0 <= v <= 100 for v in numeric_vals):
        column_data["is_percentage"] = True
        column_data["value_patterns"].append("PERCENTAGE:0-100")
        categories.append("percentage_columns")
    categories.append("numeric_columns")
    return categories


def _profile_column(cursor, table_name, col_name, col_type, row_count, sample_size):
    """Query and analyze one column; return ``(column_data, categories)``."""
    column_data = {
        "type": col_type,
        "unique_values": 0,
        "null_count": 0,
        "sample_values": [],
        "most_common": [],
        "case_variations": {},
        "value_patterns": [],
        "numeric_stats": {},
        "is_percentage": False,
        "is_boolean": False
    }
    categories = []
    table_sql = _quote_identifier(table_name)
    col_sql = _quote_identifier(col_name)

    try:
        cursor.execute(f"SELECT COUNT(DISTINCT {col_sql}) FROM {table_sql}")
        column_data["unique_values"] = cursor.fetchone()[0]

        cursor.execute(f"SELECT COUNT(*) FROM {table_sql} WHERE {col_sql} IS NULL")
        column_data["null_count"] = cursor.fetchone()[0]

        # Random sample of distinct non-NULL values.
        cursor.execute(f"""
            SELECT DISTINCT {col_sql}
            FROM {table_sql}
            WHERE {col_sql} IS NOT NULL
            ORDER BY RANDOM()
            LIMIT ?
        """, (sample_size,))
        samples = [row[0] for row in cursor.fetchall()]
        column_data["sample_values"] = samples[:50]  # Store first 50 for reference

        # Top 20 values with their share of all rows in the table.
        cursor.execute(f"""
            SELECT {col_sql}, COUNT(*) as cnt,
                   CAST(COUNT(*) AS REAL) * 100.0 / ? as percentage
            FROM {table_sql}
            WHERE {col_sql} IS NOT NULL
            GROUP BY {col_sql}
            ORDER BY cnt DESC
            LIMIT 20
        """, (row_count,))
        column_data["most_common"] = [
            {"value": row[0], "count": row[1], "percentage": round(row[2], 2)}
            for row in cursor.fetchall()
        ]

        # Classify by affinity so declarations such as VARCHAR(20), BIGINT or
        # DECIMAL(5,2) are analyzed too, not just the bare type names.
        affinity = _declared_affinity(col_type)
        if affinity == "TEXT":
            if samples:
                categories = _analyze_text(samples, column_data)
        elif affinity in _NUMERIC_AFFINITIES:
            categories = _analyze_numeric(samples, column_data)
    except Exception as e:
        # Deliberate best-effort boundary: one bad column must not abort the
        # whole run, so the failure is recorded in the report instead.
        column_data["error"] = str(e)

    return column_data, categories


def extract_values(db_path="database.sqlite", sample_size=500):
    """Extract comprehensive value samples with pattern analysis.

    Profiles every non-internal table of the SQLite database at *db_path*
    and writes the findings to ``tool_output/value_samples.json`` (the
    directory is created if needed). Tables whose row count cannot be read
    and empty tables are listed with no column details.

    Args:
        db_path: Path of the SQLite database file to inspect.
        sample_size: Maximum number of distinct non-NULL values sampled per
            column for pattern analysis.

    Raises:
        Exception: Any failure outside the per-table / per-column best-effort
            handling is reported on stdout and re-raised.
    """

    os.makedirs("tool_output", exist_ok=True)

    conn = sqlite3.connect(db_path)

    value_info = {
        "tables": {},
        "case_sensitive_columns": [],
        "date_columns": [],
        "numeric_columns": [],
        "percentage_columns": [],
        "boolean_columns": []
    }

    try:
        cursor = conn.cursor()

        # Get all tables
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name")
        table_names = [
            name for (name,) in cursor.fetchall()
            if not name.startswith("sqlite_")
        ]

        for table_name in table_names:
            table_entry = {
                "columns": {},
                "row_count": 0
            }
            value_info["tables"][table_name] = table_entry
            table_sql = _quote_identifier(table_name)

            try:
                cursor.execute(f"SELECT COUNT(*) FROM {table_sql}")
                row_count = cursor.fetchone()[0]
            except sqlite3.Error:
                # Unreadable table: keep the empty entry and move on.
                continue
            table_entry["row_count"] = row_count

            if row_count == 0:
                continue

            cursor.execute(f"PRAGMA table_info({table_sql})")
            for col in cursor.fetchall():
                col_name = col[1]
                col_type = (col[2] or "").upper()

                column_data, categories = _profile_column(
                    cursor, table_name, col_name, col_type, row_count, sample_size
                )
                table_entry["columns"][col_name] = column_data

                qualified_name = f"{table_name}.{col_name}"
                for category in categories:
                    value_info[category].append(qualified_name)

        # Serialize before opening the file so a serialization error cannot
        # leave a truncated report behind.
        payload = json.dumps(value_info, indent=2, default=_json_default)
        with open("tool_output/value_samples.json", "w", encoding="utf-8") as f:
            f.write(payload)

        print(f"✓ Value extraction complete: Analyzed {len(value_info['tables'])} tables")
        print(f"✓ Found {len(value_info['case_sensitive_columns'])} case-sensitive columns")
        print(f"✓ Identified {len(value_info['percentage_columns'])} percentage columns")

    except Exception as e:
        print(f"✗ Value extraction failed: {str(e)}")
        raise
    finally:
        conn.close()

# Script entry point: when this file is executed directly (not imported),
# run the extraction with the function's default arguments.
if __name__ == "__main__":
    extract_values()