#!/usr/bin/env python3
"""
Value Extractor Tool
Samples values from each column to identify patterns and case sensitivity.
Enhanced to catch value mismatches.
"""

import json
import os
import re
import sqlite3
from collections import Counter

# Date/time layouts recognised in sampled values, tried in this order.
# Compiled once here instead of being rebuilt for every column.
_DATE_PATTERNS = (
    (re.compile(r'^\d{4}-\d{2}-\d{2}$'), 'YYYY-MM-DD'),
    (re.compile(r'^\d{2}/\d{2}/\d{4}$'), 'MM/DD/YYYY'),
    (re.compile(r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$'), 'YYYY-MM-DD HH:MM:SS'),
    (re.compile(r'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}'), 'ISO 8601'),
)


def _quote_identifier(name):
    """Return *name* as a double-quoted SQL identifier, doubling embedded quotes."""
    return '"' + name.replace('"', '""') + '"'


def _is_text_type(col_type):
    """Return True when the declared (upper-cased) type is text-like.

    Substring matching is used so that declarations such as ``VARCHAR(20)``
    or ``NVARCHAR`` are recognised; ``INT`` takes precedence, as in SQLite's
    own affinity rules.
    """
    if "INT" in col_type:
        return False
    return any(token in col_type for token in ("CHAR", "CLOB", "TEXT"))


def _is_numeric_type(col_type):
    """Return True when the declared (upper-cased) type is an integer/real/decimal type."""
    if "INT" in col_type:
        return True
    if _is_text_type(col_type):
        return False
    return any(
        token in col_type
        for token in ("REAL", "FLOA", "DOUB", "NUMERIC", "DECIMAL")
    )


def _is_date_candidate(col_type):
    """Return True when values of this declared type should be checked for date layouts."""
    return _is_text_type(col_type) or "DATE" in col_type or "TIME" in col_type


def _json_default(obj):
    """Encode BLOB values, which ``json`` cannot serialize, as a placeholder string."""
    if isinstance(obj, (bytes, bytearray)):
        return f"<BLOB {len(obj)} bytes>"
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")


def _find_similar_values(str_samples):
    """Return pairs of sampled strings that look like variants of one another.

    Each result is ``{"values": [a, b], "type": kind}`` where *kind* is
    ``"singular/plural"`` (the lowercase forms differ by a trailing ``s``) or
    ``"spelling_variation"`` (they differ only by gray/grey). Pairs that
    differ only by letter case are skipped: those are case variations, not
    spelling differences.
    """
    lowered = [s.lower() for s in str_samples]
    similar = []
    for i, (val1, low1) in enumerate(zip(str_samples, lowered)):
        for val2, low2 in zip(str_samples[i + 1:], lowered[i + 1:]):
            if low1 == low2:
                continue
            if low1 + 's' == low2 or low1 == low2 + 's':
                kind = "singular/plural"
            elif low1.replace('grey', 'gray') == low2.replace('grey', 'gray'):
                kind = "spelling_variation"
            else:
                continue
            similar.append({"values": [val1, val2], "type": kind})
    return similar


def _analyze_column(cursor, table_name, col_name, col_type, sample_size, value_info):
    """Collect statistics and samples for one column.

    Returns the per-column dict. As a side effect, appends to the summary
    lists in *value_info* (``case_sensitive_columns``, ``date_columns``,
    ``numeric_columns``, ``value_mismatches``). Any failure is recorded under
    an ``"error"`` key of the returned dict instead of being raised, so one
    bad column does not stop the rest of the scan.
    """
    column_data = {
        "type": col_type,
        "unique_values": 0,
        "null_count": 0,
        "sample_values": [],
        "most_common": [],
        "case_variations": {},
        "min_length": None,
        "max_length": None,
        "patterns": [],
        "similar_values": []  # Track similar values that might be confused
    }
    qualified_name = f"{table_name}.{col_name}"
    tbl = _quote_identifier(table_name)
    col = _quote_identifier(col_name)

    try:
        # Get unique value count
        cursor.execute(f"SELECT COUNT(DISTINCT {col}) FROM {tbl}")
        column_data["unique_values"] = cursor.fetchone()[0]

        # Get null count
        cursor.execute(f"SELECT COUNT(*) FROM {tbl} WHERE {col} IS NULL")
        column_data["null_count"] = cursor.fetchone()[0]

        # Get a random sample of distinct non-null values
        cursor.execute(f"""
            SELECT DISTINCT {col}
            FROM {tbl}
            WHERE {col} IS NOT NULL
            ORDER BY RANDOM()
            LIMIT ?
        """, (sample_size,))
        samples = [row[0] for row in cursor.fetchall()]
        column_data["sample_values"] = samples[:20]

        # Get most common values
        cursor.execute(f"""
            SELECT {col}, COUNT(*) as cnt
            FROM {tbl}
            WHERE {col} IS NOT NULL
            GROUP BY {col}
            ORDER BY cnt DESC
            LIMIT 10
        """)
        column_data["most_common"] = [
            {"value": row[0], "count": row[1]}
            for row in cursor.fetchall()
        ]

        # Analyze text columns for case sensitivity and similar values
        if samples and _is_text_type(col_type):
            # SQLite is dynamically typed, so a text column may hold non-strings.
            str_samples = [s for s in samples if isinstance(s, str)]

            case_groups = {}
            for val in str_samples:
                case_groups.setdefault(val.lower(), set()).add(val)
            case_variations = {
                lower_val: sorted(variants)
                for lower_val, variants in case_groups.items()
                if len(variants) > 1
            }
            if case_variations:
                column_data["case_variations"] = case_variations
                # Record the column once, however many groups vary.
                value_info["case_sensitive_columns"].append(qualified_name)

            similar = _find_similar_values(str_samples)
            column_data["similar_values"] = similar
            value_info["value_mismatches"].extend(
                {
                    "table": table_name,
                    "column": col_name,
                    "values": list(entry["values"]),
                    "type": entry["type"],
                }
                for entry in similar
            )

            # String length analysis
            if str_samples:
                column_data["min_length"] = min(len(s) for s in str_samples)
                column_data["max_length"] = max(len(s) for s in str_samples)

        # Detect date patterns (first matching layout wins)
        if samples and _is_date_candidate(col_type):
            for regex, format_name in _DATE_PATTERNS:
                if any(regex.match(str(s)) for s in samples[:10] if s):
                    value_info["date_columns"].append(qualified_name)
                    column_data["patterns"].append(f"date/{format_name}")
                    break

        # Identify numeric columns
        if _is_numeric_type(col_type):
            value_info["numeric_columns"].append(qualified_name)
            # Only real numbers are comparable; stray strings/blobs would
            # make min()/max() raise TypeError.
            numeric_samples = [s for s in samples if isinstance(s, (int, float))]
            if numeric_samples:
                column_data["min_value"] = min(numeric_samples)
                column_data["max_value"] = max(numeric_samples)

    except Exception as e:  # deliberate best-effort: keep scanning other columns
        column_data["error"] = str(e)

    return column_data


def extract_values(db_path="database.sqlite", sample_size=100):
    """Extract sample values from all columns to understand data patterns.

    For every non-internal table with at least one row, each column is
    profiled: distinct/null counts, up to 20 random distinct sample values,
    the 10 most common values, case variations and look-alike values for
    text columns, date layouts, and min/max for numeric columns. The result
    is written as JSON to ``tool_output/value_samples.json`` (relative to the
    current working directory) and a short summary is printed.

    Args:
        db_path: Path of the SQLite database file to inspect.
        sample_size: Maximum number of distinct values sampled per column
            for the pattern analysis.

    Returns:
        None. Failures while scanning are recorded under ``"error"`` keys in
        the JSON output rather than raised.
    """
    os.makedirs("tool_output", exist_ok=True)

    value_info = {
        "tables": {},
        "case_sensitive_columns": [],
        "date_columns": [],
        "numeric_columns": [],
        "value_mismatches": []  # Track potential value confusions
    }

    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()

        # Get all tables
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name")
        table_names = [row[0] for row in cursor.fetchall()]

        for table_name in table_names:
            if table_name.startswith("sqlite_"):
                continue

            table_entry = {"columns": {}, "row_count": 0}
            value_info["tables"][table_name] = table_entry
            quoted_table = _quote_identifier(table_name)

            # Get row count; an unreadable table is skipped but the reason is kept
            try:
                cursor.execute(f"SELECT COUNT(*) FROM {quoted_table}")
                row_count = cursor.fetchone()[0]
            except sqlite3.Error as e:
                table_entry["error"] = str(e)
                continue
            table_entry["row_count"] = row_count

            if row_count == 0:
                continue

            # Get column names and declared types
            cursor.execute(f"PRAGMA table_info({quoted_table})")
            for col in cursor.fetchall():
                col_name = col[1]
                col_type = (col[2] or "").upper()
                table_entry["columns"][col_name] = _analyze_column(
                    cursor, table_name, col_name, col_type, sample_size, value_info
                )

    except Exception as e:  # top-level boundary: still write what was collected
        value_info["error"] = str(e)
    finally:
        conn.close()

    # Write to output file
    output_path = "tool_output/value_samples.json"
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(value_info, f, indent=2, default=_json_default)

    print(f"Value extraction complete - results in {output_path}")
    if value_info["case_sensitive_columns"]:
        print(f"Found {len(value_info['case_sensitive_columns'])} columns with case-sensitive values")
    if value_info["value_mismatches"]:
        print(f"⚠️  Found {len(value_info['value_mismatches'])} potential value confusions")

# Script entry point: run the extraction with the default database path
# ("database.sqlite") and sample size when executed directly.
if __name__ == "__main__":
    extract_values()