#!/usr/bin/env python3
"""
Value Validator - Samples actual database values with exact case and format.
Critical for avoiding case sensitivity issues and format mismatches.
"""

import sqlite3
import json
import os

# Sampling limits and categorical heuristics used by validate_values().
_SAMPLE_ROW_LIMIT = 5            # rows sampled per table
_CATEGORICAL_ALWAYS_MAX = 20     # <= this many distinct values: always categorical
_CATEGORICAL_RATIO_MAX = 0.1     # otherwise: distinct/total must be below this ...
_CATEGORICAL_RATIO_CAP = 100     # ... and distinct count must not exceed this
_DISTINCT_VALUE_LIMIT = 20       # distinct values stored per categorical column
_REFERENCE_ROW_LIMIT = 3         # sample rows shown in the text reference
_REFERENCE_VALUE_LIMIT = 5       # categorical values shown in the text reference


def _quote_identifier(name):
    """Return *name* as a double-quoted SQLite identifier.

    Embedded double quotes are doubled, so any table/column name round-trips
    safely (the ``[name]`` bracket form has no way to escape ``]``).
    """
    return '"' + str(name).replace('"', '""') + '"'


def _json_fallback(value):
    """``json.dump`` hook: render BLOB values as SQLite blob literals (X'..')."""
    if isinstance(value, (bytes, bytearray, memoryview)):
        return "X'" + bytes(value).hex().upper() + "'"
    raise TypeError(
        f"Object of type {type(value).__name__} is not JSON serializable"
    )


def _collect_categorical_values(cursor, table_sql, columns):
    """Return ``{column: [distinct values]}`` for columns that look categorical."""
    categorical = {}
    for col in columns:
        col_sql = _quote_identifier(col)
        # COUNT(x) and COUNT(DISTINCT x) already ignore NULLs.
        cursor.execute(
            f"SELECT COUNT(DISTINCT {col_sql}), COUNT({col_sql}) FROM {table_sql}"
        )
        distinct_count, total_count = cursor.fetchone()
        if not distinct_count or not total_count:
            continue  # column holds only NULLs

        few_values = distinct_count <= _CATEGORICAL_ALWAYS_MAX
        low_ratio = (
            distinct_count / total_count < _CATEGORICAL_RATIO_MAX
            and distinct_count <= _CATEGORICAL_RATIO_CAP
        )
        if not (few_values or low_ratio):
            continue

        cursor.execute(
            f"SELECT DISTINCT {col_sql} FROM {table_sql} "
            f"WHERE {col_sql} IS NOT NULL "
            f"ORDER BY {col_sql} LIMIT {_DISTINCT_VALUE_LIMIT}"
        )
        distinct_vals = [row[0] for row in cursor.fetchall()]
        if distinct_vals:
            categorical[col] = distinct_vals
    return categorical


def _collect_null_empty_patterns(cursor, table_sql, columns):
    """Return per-column NULL / empty-string counts for columns that have any."""
    patterns = {}
    for col in columns:
        col_sql = _quote_identifier(col)
        cursor.execute(
            f"SELECT SUM(CASE WHEN {col_sql} IS NULL THEN 1 ELSE 0 END), "
            f"SUM(CASE WHEN {col_sql} = '' THEN 1 ELSE 0 END) "
            f"FROM {table_sql}"
        )
        null_count, empty_count = cursor.fetchone()
        # SUM() yields NULL when the table has no rows; treat that as zero.
        null_count = null_count or 0
        empty_count = empty_count or 0
        if null_count > 0 or empty_count > 0:
            patterns[col] = {
                "has_nulls": null_count > 0,
                "has_empty_strings": empty_count > 0,
                "null_count": null_count,
                "empty_count": empty_count,
            }
    return patterns


def _sample_table(cursor, table):
    """Return the sample record for *table*, or ``None`` if it has no rows."""
    table_sql = _quote_identifier(table)
    cursor.execute(f"SELECT * FROM {table_sql} LIMIT {_SAMPLE_ROW_LIMIT}")
    # Take column names from the query itself so they line up with the row
    # tuples by construction; PRAGMA table_info may omit columns (e.g.
    # generated columns) that SELECT * still returns.
    columns = [desc[0] for desc in cursor.description]
    sample_rows = cursor.fetchall()
    if not sample_rows:
        return None

    info = {
        "columns": columns,
        # Values are stored untouched (None, '', bytes, exact case).
        "sample_rows": [dict(zip(columns, row)) for row in sample_rows],
        "row_count": len(sample_rows),
    }

    categorical = _collect_categorical_values(cursor, table_sql, columns)
    if categorical:
        info["categorical_values"] = categorical

    null_empty = _collect_null_empty_patterns(cursor, table_sql, columns)
    if null_empty:
        info["null_empty_patterns"] = null_empty

    return info


def _write_reference(path, value_samples):
    """Write the human-readable quick-reference text file to *path*."""
    lines = [
        "VALUE SAMPLES - EXACT CASE REFERENCE\n",
        "=" * 60 + "\n\n",
    ]
    for table, data in value_samples.items():
        lines.append(f"TABLE: {table}\n")
        lines.append("-" * 40 + "\n")

        sample_rows = data.get("sample_rows")
        if sample_rows:
            lines.append("Sample Values (USE EXACT CASE):\n")
            for row in sample_rows[:_REFERENCE_ROW_LIMIT]:
                lines.extend(
                    f"  {col}: {val!r}\n"
                    for col, val in row.items()
                    if val is not None and val != ""
                )
            lines.append("\n")

        if "categorical_values" in data:
            lines.append("Common Values (EXACT CASE):\n")
            for col, vals in data["categorical_values"].items():
                if vals:
                    shown = ", ".join(repr(v) for v in vals[:_REFERENCE_VALUE_LIMIT])
                    lines.append(f"  {col}: {shown}\n")
            lines.append("\n")

        if "null_empty_patterns" in data:
            lines.append("NULL vs Empty String:\n")
            for col, pattern in data["null_empty_patterns"].items():
                if pattern["has_nulls"] and pattern["has_empty_strings"]:
                    lines.append(f"  {col}: Has both NULL and empty strings\n")
                elif pattern["has_nulls"]:
                    lines.append(f"  {col}: Uses NULL (not empty string)\n")
                elif pattern["has_empty_strings"]:
                    lines.append(f"  {col}: Uses empty string (not NULL)\n")
            lines.append("\n")

        lines.append("\n")

    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(lines)


def validate_values(db_path="database.sqlite", output_dir="tool_output"):
    """Sample values from all tables with exact case preservation.

    For every table listed in ``sqlite_master`` that has at least one row,
    records up to 5 sample rows, the distinct values of columns that look
    categorical, and per-column NULL / empty-string counts. Tables without
    rows are omitted from the result.

    Two files are written into *output_dir* (created if missing):
    ``value_samples.json`` (full data; BLOBs rendered as ``X'..'`` literals)
    and ``value_reference.txt`` (condensed human-readable summary).

    Args:
        db_path: Path to the SQLite database file.
        output_dir: Directory that receives the two output files.

    Returns:
        dict mapping table name to a dict with keys ``columns``,
        ``sample_rows``, ``row_count`` (number of *sampled* rows, at most 5,
        not the table's total) and, when non-empty, ``categorical_values``
        and ``null_empty_patterns``. Values are returned exactly as sqlite3
        produced them (BLOBs stay ``bytes``).

    Raises:
        sqlite3.Error: If the database cannot be opened or queried.
        OSError: If the output files cannot be written.
    """
    value_samples = {}

    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
        tables = [row[0] for row in cursor.fetchall()]

        for table in tables:
            table_info = _sample_table(cursor, table)
            if table_info is not None:
                value_samples[table] = table_info
    finally:
        conn.close()

    # Save value samples
    os.makedirs(output_dir, exist_ok=True)
    samples_path = os.path.join(output_dir, 'value_samples.json')
    with open(samples_path, 'w', encoding='utf-8') as f:
        # default= keeps BLOB columns from aborting the dump with TypeError.
        json.dump(value_samples, f, indent=2, ensure_ascii=False,
                  default=_json_fallback)

    # Create a quick reference for common values
    _write_reference(os.path.join(output_dir, 'value_reference.txt'),
                     value_samples)

    print(f"Value validation complete: {len(value_samples)} tables sampled")
    return value_samples

# Script entry point: when executed directly (not imported), sample the
# database at the function's default path.
if __name__ == "__main__":
    validate_values()