#!/usr/bin/env python3
"""
Value Extractor Tool - Enhanced with comprehensive sampling
Samples values from database columns to understand formats and case sensitivity.
"""

import sqlite3
import json
import os

def _quote_identifier(name):
    """Return *name* as a backtick-quoted SQLite identifier.

    Embedded backticks are doubled so unusual table/column names cannot
    terminate the quoted identifier early and break the statement.
    """
    return "`" + name.replace("`", "``") + "`"


def _has_text_affinity(declared_type):
    """Return True if a declared column type is textual.

    Uses a case-insensitive substring match on CHAR / CLOB / TEXT so that
    declarations such as ``VARCHAR(50)``, ``text`` or ``NVARCHAR`` are
    recognised, not only the bare upper-case names.
    """
    upper = (declared_type or "").upper()
    return any(marker in upper for marker in ("CHAR", "CLOB", "TEXT"))


def _classify_case(text):
    """Label the letter-case style of a single string sample."""
    if text.isupper():
        return "UPPERCASE"
    if text.islower():
        return "lowercase"
    # Slicing keeps the empty string safe: "".isupper() is False -> "mixed".
    if text[:1].isupper():
        return "TitleCase"
    return "mixed"


def _json_default(value):
    """Make BLOB values (bytes) JSON-serialisable as a short hex preview."""
    if isinstance(value, (bytes, bytearray, memoryview)):
        raw = bytes(value)
        suffix = "..." if len(raw) > 16 else ""
        return f"<BLOB {len(raw)} bytes: 0x{raw[:16].hex()}{suffix}>"
    raise TypeError(
        f"Object of type {type(value).__name__} is not JSON serializable"
    )


def _record_datetime_formats(sample_str, value_info):
    """Append date/time format descriptions detected in *sample_str*."""
    # Date format detection: exactly two '-' separators.
    if sample_str.count('-') == 2:
        if len(sample_str.split('-')[0]) == 4:
            format_desc = "YYYY-MM-DD format"
        else:
            format_desc = "Date with - separator"
        if format_desc not in value_info["date_formats"]:
            value_info["date_formats"].append(format_desc)

    # Time format detection: at least two ':' separators.
    if sample_str.count(':') >= 2:
        format_desc = "HH:MM:SS format"
        if format_desc not in value_info["time_formats"]:
            value_info["time_formats"].append(format_desc)


def _profile_column(cursor, table_name, col_name, col_type, table_info, value_info):
    """Collect samples and statistics for one column into the result dicts."""
    table_sql = _quote_identifier(table_name)
    col_sql = _quote_identifier(col_name)

    # Up to 50 distinct non-NULL sample values.
    cursor.execute(
        f"SELECT DISTINCT {col_sql} FROM {table_sql} "
        f"WHERE {col_sql} IS NOT NULL LIMIT 50"
    )
    samples = [row[0] for row in cursor.fetchall()]
    table_info["sample_values"][col_name] = samples

    cursor.execute(f"SELECT COUNT(DISTINCT {col_sql}) FROM {table_sql}")
    unique_count = cursor.fetchone()[0]
    table_info["unique_counts"][col_name] = unique_count

    cursor.execute(f"SELECT COUNT(*) FROM {table_sql} WHERE {col_sql} IS NULL")
    table_info["null_counts"][col_name] = cursor.fetchone()[0]

    # Most common values, only for low-cardinality (categorical) columns.
    if 0 < unique_count <= 100:
        cursor.execute(
            f"SELECT {col_sql}, COUNT(*) as cnt FROM {table_sql} "
            f"WHERE {col_sql} IS NOT NULL "
            f"GROUP BY {col_sql} ORDER BY cnt DESC LIMIT 10"
        )
        table_info["common_values"][col_name] = [
            (row[0], row[1]) for row in cursor.fetchall()
        ]

    # Case patterns for text columns, based on the first 20 samples.
    if _has_text_affinity(col_type) and samples:
        case_patterns = {
            _classify_case(sample)
            for sample in samples[:20]
            if isinstance(sample, str)
        }
        if case_patterns:
            # Sorted so the JSON output is stable between runs.
            table_info["case_info"][col_name] = sorted(case_patterns)

    col_lower = col_name.lower()

    # Date/time formats are inferred from the first sample only.
    if ('date' in col_lower or 'time' in col_lower) and samples:
        _record_datetime_formats(str(samples[0]), value_info)

    # Store critical values for exact matching.
    critical_words = ('code', 'type', 'status', 'category', 'name', 'description')
    if any(word in col_lower for word in critical_words):
        if unique_count <= 1000 and samples:
            value_info["critical_exact_values"][f"{table_name}.{col_name}"] = samples[:30]


def extract_values(db_path="database.sqlite"):
    """Extract sample values and patterns from a SQLite database.

    For every non-``sqlite_`` table with at least one row, each column is
    profiled: up to 50 distinct non-NULL samples, distinct and NULL counts,
    the 10 most common values (columns with at most 100 distinct values),
    letter-case patterns for textual columns, simple date/time format hints
    for columns whose name contains "date" or "time", and up to 30 exact
    values for columns whose name suggests a code/type/status/category/
    name/description.

    Args:
        db_path: Path of the SQLite database file to inspect.

    Returns:
        None. Results are written to ``tool_output/value_samples.json``
        (the directory is created if needed) and a short summary is printed.

    Errors while profiling a single column are recorded as an
    ``"Error: ..."`` string in that column's ``sample_values`` entry; any
    other error during extraction is stored under the top-level ``"error"``
    key. BLOB values are written as a short hex preview string because raw
    bytes cannot be serialised to JSON.
    """
    os.makedirs("tool_output", exist_ok=True)

    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()

    value_info = {
        "tables": {},
        "case_patterns": {},
        "date_formats": [],
        "time_formats": [],
        "critical_exact_values": {}
    }

    try:
        # Get all tables
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name")
        table_names = [row[0] for row in cursor.fetchall()]

        for table_name in table_names:
            if table_name.startswith("sqlite_"):
                continue

            table_info = {
                "sample_values": {},
                "unique_counts": {},
                "null_counts": {},
                "case_info": {},
                "common_values": {}
            }
            value_info["tables"][table_name] = table_info

            table_sql = _quote_identifier(table_name)

            cursor.execute(f"PRAGMA table_info({table_sql})")
            columns = cursor.fetchall()

            cursor.execute(f"SELECT COUNT(*) FROM {table_sql}")
            if cursor.fetchone()[0] == 0:
                # Empty tables keep their (empty) entry but are not profiled.
                continue

            for col in columns:
                col_name, col_type = col[1], col[2]
                try:
                    _profile_column(
                        cursor, table_name, col_name, col_type, table_info, value_info
                    )
                except Exception as e:
                    # Best effort: one bad column must not abort the whole run.
                    table_info["sample_values"][col_name] = f"Error: {str(e)}"

    except Exception as e:
        value_info["error"] = str(e)
    finally:
        conn.close()

    # Write to output file
    output_path = "tool_output/value_samples.json"
    with open(output_path, 'w', encoding="utf-8") as f:
        json.dump(value_info, f, indent=2, default=_json_default)

    print(f"Value extraction complete - results in {output_path}")

    if value_info.get("critical_exact_values"):
        print(f"Found {len(value_info['critical_exact_values'])} columns with critical exact values")

    if value_info.get("date_formats"):
        print(f"Date formats detected: {', '.join(value_info['date_formats'])}")

if __name__ == "__main__":
    # When run as a script, profile the default "database.sqlite" in the
    # current working directory.
    extract_values()