#!/usr/bin/env python3
"""
Value Sampler - Provides exact sample values with correct case to prevent mismatches.
Shows the eval model exactly how values should be formatted.
"""

import sqlite3
import json
import os
from collections import Counter

def _quote_identifier(name):
    """Return *name* as a double-quoted SQLite identifier.

    Embedded double quotes are doubled, so table/column names containing
    spaces, punctuation or reserved words (e.g. ``order``) stay valid SQL.
    """
    return '"' + str(name).replace('"', '""') + '"'


def _blob_literal(value):
    """Return bytes *value* as an SQLite blob literal such as ``X'0AFF'``."""
    return f"X'{bytes(value).hex().upper()}'"


def sample_values() -> None:
    """Sample actual values from the database to show exact case and format.

    Reads ``database.sqlite`` from the current working directory and, for every
    non-empty table, records per column:

    * the number of distinct non-NULL values,
    * every value (with its frequency) when there are at most 20 distinct
      values, otherwise the 10 most frequent values,
    * the MIN/MAX range for columns with a numeric declared type,
    * the number of NULLs, when there are any,

    followed by the first three rows of the table.

    Side effects:
        Creates ``tool_output/`` if needed and writes
        ``tool_output/value_samples.txt`` (human-readable report) and
        ``tool_output/value_samples.json`` (machine-readable samples), then
        prints a confirmation line.

    Raises:
        sqlite3.Error: if the database cannot be read.
        OSError: if the output directory or files cannot be written.
    """
    # Create the output directory first so a failure here cannot leak an
    # already-open database connection.
    os.makedirs("tool_output", exist_ok=True)

    conn = sqlite3.connect("database.sqlite")

    samples = {}
    output_lines = []

    try:
        cursor = conn.cursor()

        # Get all tables
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
        tables = [row[0] for row in cursor.fetchall()]

        output_lines.append("# VALUE SAMPLES WITH EXACT CASE")
        output_lines.append("# Use these exact values in your queries to avoid case mismatches")
        output_lines.append("")

        for table in tables:
            # Identifiers are always quoted: unquoted names break on spaces
            # and on reserved words such as "order" or "group".
            q_table = _quote_identifier(table)

            cursor.execute(f"PRAGMA table_info({q_table})")
            columns = cursor.fetchall()

            cursor.execute(f"SELECT COUNT(*) FROM {q_table}")
            row_count = cursor.fetchone()[0]

            if row_count == 0:
                continue

            samples[table] = {
                "row_count": row_count,
                "columns": {}
            }

            output_lines.append(f"## Table: {table} ({row_count} rows)")
            output_lines.append("")

            # Sample each column
            for col in columns:
                col_name = col[1]
                col_type = col[2]
                q_col = _quote_identifier(col_name)
                type_upper = (col_type or "").upper()

                # COUNT(DISTINCT ...) ignores NULLs.
                cursor.execute(f"SELECT COUNT(DISTINCT {q_col}) FROM {q_table}")
                distinct_count = cursor.fetchone()[0]
                low_cardinality = distinct_count <= 20

                # Low-cardinality columns list every value; others only the
                # 10 most frequent. Ordering by COUNT(*) rather than an alias
                # avoids ambiguity when the column itself is named like the
                # alias.
                limit_clause = "" if low_cardinality else " LIMIT 10"
                cursor.execute(
                    f"SELECT {q_col}, COUNT(*) FROM {q_table} "
                    f"WHERE {q_col} IS NOT NULL "
                    f"GROUP BY {q_col} "
                    f"ORDER BY COUNT(*) DESC{limit_clause}"
                )
                values = cursor.fetchall()

                if not values:
                    continue

                column_samples = []
                samples[table]["columns"][col_name] = {
                    "type": col_type,
                    "distinct_count": distinct_count,
                    "samples": column_samples
                }

                output_lines.append(f"### {col_name} ({col_type}) - {distinct_count} distinct values")

                # Classify by declared type; string-like wins over numeric,
                # anything else is shown bare and without frequencies.
                is_text = any(t in type_upper for t in ('CHAR', 'TEXT', 'BLOB'))
                is_numeric = not is_text and any(
                    t in type_upper for t in ('INT', 'REAL', 'NUMERIC')
                )
                show_freq = low_cardinality and (is_text or is_numeric)

                output_lines.append("```")
                for val, freq in values:
                    if isinstance(val, bytes):
                        # bytes are not JSON-serialisable; use the SQL blob
                        # literal in both the report and the JSON samples.
                        shown = _blob_literal(val)
                        column_samples.append(shown)
                    else:
                        shown = f"'{val}'" if is_text else f"{val}"
                        column_samples.append(val)

                    if show_freq:
                        output_lines.append(f"{shown} (appears {freq} times)")
                    else:
                        output_lines.append(shown)
                output_lines.append("```")

                if is_numeric:
                    # Show range for numeric columns
                    cursor.execute(
                        f"SELECT MIN({q_col}), MAX({q_col}) FROM {q_table} "
                        f"WHERE {q_col} IS NOT NULL"
                    )
                    min_val, max_val = cursor.fetchone()
                    if min_val is not None and max_val is not None:
                        output_lines.append(f"Range: {min_val} to {max_val}")

                # Check for NULLs
                cursor.execute(f"SELECT COUNT(*) FROM {q_table} WHERE {q_col} IS NULL")
                null_count = cursor.fetchone()[0]
                if null_count > 0:
                    output_lines.append(f"**Note: {null_count} NULL values**")

                output_lines.append("")

            # Show sample rows
            output_lines.append("### Sample Rows (first 3)")
            cursor.execute(f"SELECT * FROM {q_table} LIMIT 3")
            sample_rows = cursor.fetchall()
            col_names = [col[1] for col in columns]

            for i, row in enumerate(sample_rows, 1):
                output_lines.append(f"Row {i}:")
                for col_name, value in zip(col_names, row):
                    if value is None:
                        output_lines.append(f"  {col_name}: NULL")
                    elif isinstance(value, str):
                        output_lines.append(f"  {col_name}: '{value}'")
                    elif isinstance(value, bytes):
                        output_lines.append(f"  {col_name}: {_blob_literal(value)}")
                    else:
                        output_lines.append(f"  {col_name}: {value}")
                output_lines.append("")

        # Add case sensitivity warnings
        output_lines.append("## IMPORTANT: Case Sensitivity Notes")
        output_lines.append("")
        output_lines.append("SQLite string comparisons are case-sensitive by default:")
        output_lines.append("- `WHERE column = 'Value'` will NOT match 'value'")
        output_lines.append("- Use exact case from samples above")
        output_lines.append("- For case-insensitive: `WHERE LOWER(column) = LOWER('value')`")
        # SQLite's built-in LIKE only folds case for ASCII characters.
        output_lines.append("- LIKE is case-insensitive for ASCII characters: `WHERE column LIKE '%value%'`")

        # Write outputs; explicit UTF-8 so non-ASCII sample values do not
        # depend on the platform's default encoding.
        with open("tool_output/value_samples.txt", "w", encoding="utf-8") as f:
            f.write("\n".join(output_lines))

        with open("tool_output/value_samples.json", "w", encoding="utf-8") as f:
            json.dump(samples, f, indent=2)

        print("Value samples generated - see tool_output/value_samples.txt")

    finally:
        conn.close()

# Run the sampler only when this file is executed as a script, not on import.
if __name__ == "__main__":
    sample_values()