#!/usr/bin/env python3
"""
Value Extractor Tool
Samples values from each column to identify patterns and case sensitivity.
Inspired by OpenSearch-SQL's value retrieval system.
"""

import sqlite3
import json
import os
from collections import Counter

def _quote_identifier(name):
    """Return *name* as a double-quoted SQLite identifier.

    Embedded double quotes are doubled, so table/column names containing
    backticks, spaces or quotes can be interpolated into SQL safely.
    """
    return '"' + name.replace('"', '""') + '"'


def _type_family(declared_type):
    """Classify a declared column type as ``"text"``, ``"numeric"`` or ``None``.

    Matching is by substring, in the same order SQLite uses to derive column
    affinity, so ``VARCHAR(20)``, ``NVARCHAR``, ``BIGINT`` or ``DOUBLE
    PRECISION`` are recognised rather than only the bare type names.
    Declarations that are neither clearly textual nor clearly numeric
    (``BLOB``, ``DATE``, an empty declaration, ...) yield ``None``.
    """
    declared = declared_type.upper()
    if "INT" in declared:
        return "numeric"
    if any(tag in declared for tag in ("CHAR", "CLOB", "TEXT")):
        return "text"
    if any(tag in declared for tag in ("REAL", "FLOA", "DOUB")):
        return "numeric"
    if declared.startswith(("NUMERIC", "DECIMAL")):
        return "numeric"
    return None


def _looks_like_date(value):
    """Return True if *value* is a string shaped like a date or ISO datetime."""
    if not isinstance(value, str):
        return False
    if len(value) == 10 and value[4] == '-' and value[7] == '-':  # YYYY-MM-DD
        return True
    if len(value) == 10 and value[2] == '/' and value[5] == '/':  # MM/DD/YYYY
        return True
    return 'T' in value and ':' in value  # ISO datetime


def _json_default(value):
    """Encode BLOB values, which ``json`` cannot serialise, as a hex preview."""
    if isinstance(value, (bytes, bytearray, memoryview)):
        raw = bytes(value)
        preview_bytes = 32  # keep the report readable for large blobs
        suffix = "..." if len(raw) > preview_bytes else ""
        return f"0x{raw[:preview_bytes].hex()}{suffix}"
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


def _profile_column(cursor, table_name, col_name, col_type, sample_size, value_info):
    """Collect statistics for one column and return them as a dict.

    Also appends ``"table.column"`` to the matching summary lists in
    *value_info* (``case_sensitive_columns``, ``date_columns``,
    ``numeric_columns``), each at most once per column. Any exception raised
    while profiling is recorded under the ``"error"`` key of the returned
    dict so a single bad column does not abort the whole run.
    """
    qualified_name = f"{table_name}.{col_name}"
    table_sql = _quote_identifier(table_name)
    col_sql = _quote_identifier(col_name)
    family = _type_family(col_type)

    column_data = {
        "type": col_type,
        "unique_values": 0,
        "null_count": 0,
        "sample_values": [],
        "most_common": [],
        "case_variations": {},
        "min_length": None,
        "max_length": None,
        "patterns": []
    }

    try:
        # COUNT(col) skips NULLs, so one scan yields both figures.
        cursor.execute(
            f"SELECT COUNT(DISTINCT {col_sql}), COUNT(*) - COUNT({col_sql}) "
            f"FROM {table_sql}"
        )
        column_data["unique_values"], column_data["null_count"] = cursor.fetchone()

        # Random sample of distinct non-NULL values.
        cursor.execute(f"""
            SELECT DISTINCT {col_sql}
            FROM {table_sql}
            WHERE {col_sql} IS NOT NULL
            ORDER BY RANDOM()
            LIMIT ?
        """, (sample_size,))
        samples = [row[0] for row in cursor.fetchall()]
        column_data["sample_values"] = samples[:20]  # Limit for readability

        # Most common values.
        cursor.execute(f"""
            SELECT {col_sql}, COUNT(*) as cnt
            FROM {table_sql}
            WHERE {col_sql} IS NOT NULL
            GROUP BY {col_sql}
            ORDER BY cnt DESC
            LIMIT 10
        """)
        column_data["most_common"] = [
            {"value": row[0], "count": row[1]}
            for row in cursor.fetchall()
        ]

        if family == "text" and samples:
            # SQLite is dynamically typed: a text column may still hold
            # non-string values, so only strings take part in the analysis.
            str_samples = [s for s in samples if isinstance(s, str)]

            # Group values that differ only by letter case.
            case_groups = {}
            for val in str_samples:
                case_groups.setdefault(val.lower(), set()).add(val)

            for lower_val, variations in case_groups.items():
                if len(variations) > 1:
                    # sorted() keeps the report stable between runs.
                    column_data["case_variations"][lower_val] = sorted(variations)

            # Record the column once, however many variant groups it has.
            if column_data["case_variations"]:
                value_info["case_sensitive_columns"].append(qualified_name)

            if str_samples:
                column_data["min_length"] = min(len(s) for s in str_samples)
                column_data["max_length"] = max(len(s) for s in str_samples)

            # Date detection only inspects the first ten sampled values.
            if any(_looks_like_date(s) for s in samples[:10] if s):
                value_info["date_columns"].append(qualified_name)
                column_data["patterns"].append("date/datetime")

        if family == "numeric":
            value_info["numeric_columns"].append(qualified_name)
            # Ignore stray text/blob values so min()/max() never compare
            # incompatible types. The range describes the sample only.
            numeric_samples = [s for s in samples if isinstance(s, (int, float))]
            if numeric_samples:
                column_data["min_value"] = min(numeric_samples)
                column_data["max_value"] = max(numeric_samples)

    except Exception as e:
        # Best-effort: keep what was gathered and report the failure.
        column_data["error"] = str(e)

    return column_data


def extract_values(db_path="database.sqlite", sample_size=100):
    """Extract sample values from all columns to understand data patterns.

    For every user table in the SQLite database at *db_path* this records the
    row count and, per column: distinct and NULL counts, up to 20 sampled
    values, the ten most common values, case variations and string lengths
    (text columns), date-like patterns (text columns) and the sampled
    min/max (numeric columns).

    Args:
        db_path: Path of the SQLite database file to inspect.
        sample_size: Maximum number of distinct non-NULL values sampled per
            column for the pattern analysis.

    Returns:
        None. The report is written to ``tool_output/value_samples.json``
        (the directory is created if needed) and a summary is printed.
        Failures are recorded in the report under ``"error"`` keys instead of
        being raised.
    """

    os.makedirs("tool_output", exist_ok=True)

    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()

    value_info = {
        "tables": {},
        "case_sensitive_columns": [],
        "date_columns": [],
        "numeric_columns": []
    }

    try:
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name")
        table_names = [
            name for (name,) in cursor.fetchall()
            if not name.startswith("sqlite_")  # skip SQLite's internal tables
        ]

        for table_name in table_names:
            table_info = {"columns": {}, "row_count": 0}
            value_info["tables"][table_name] = table_info
            table_sql = _quote_identifier(table_name)

            try:
                cursor.execute(f"SELECT COUNT(*) FROM {table_sql}")
                row_count = cursor.fetchone()[0]
            except sqlite3.Error as e:
                # Skip the unreadable table but say why in the report.
                table_info["error"] = str(e)
                continue

            table_info["row_count"] = row_count
            if row_count == 0:
                continue

            # fetchall() materialises the rows, so the cursor can be reused
            # by the per-column queries below.
            cursor.execute(f"PRAGMA table_info({table_sql})")
            for col in cursor.fetchall():
                col_name = col[1]
                col_type = (col[2] or "").upper()
                table_info["columns"][col_name] = _profile_column(
                    cursor, table_name, col_name, col_type, sample_size, value_info
                )

    except Exception as e:
        value_info["error"] = str(e)
    finally:
        conn.close()

    # Write to output file
    output_path = "tool_output/value_samples.json"
    with open(output_path, 'w', encoding="utf-8") as f:
        json.dump(value_info, f, indent=2, default=_json_default)

    print(f"Value extraction complete - results in {output_path}")
    if value_info["case_sensitive_columns"]:
        print(f"Found {len(value_info['case_sensitive_columns'])} columns with case-sensitive values")
    if value_info["date_columns"]:
        print(f"Detected {len(value_info['date_columns'])} date/datetime columns")

# Script entry point: when run directly (not imported), profile the database
# using extract_values' default arguments.
if __name__ == "__main__":
    extract_values()