#!/usr/bin/env python3
"""
Extracts representative data samples showing actual values with exact case and format.
Critical for understanding what data actually looks like.
"""

import sqlite3
import json
import os
from collections import Counter

def _quote_identifier(name):
    """Return *name* as a double-quoted SQLite identifier.

    Table and column names cannot be bound as query parameters, so they are
    quoted instead. Embedded double quotes are doubled, which is SQLite's
    escaping rule inside a quoted identifier.
    """
    return '"' + str(name).replace('"', '""') + '"'


def _write_samples_json(samples):
    """Write *samples* to tool_output/data_samples.json, creating the directory first."""
    os.makedirs("tool_output", exist_ok=True)
    with open("tool_output/data_samples.json", "w", encoding="utf-8") as f:
        json.dump(samples, f, indent=2, default=str)


def sample_data(db_path="database.sqlite"):
    """Sample every table of a SQLite database and write the results to tool_output/.

    For each table listed in ``sqlite_master`` this collects:

    * up to five sample rows keyed by column name, plus the row count,
    * for columns whose declared type contains TEXT or CHAR, the most frequent
      non-NULL values and the format hints from ``analyze_value_format``,
    * NULL counts and percentages for columns that contain NULLs.

    The collected data is written to ``tool_output/data_samples.json`` and a
    readable report is produced via ``generate_sample_report``.

    Args:
        db_path: Path of the SQLite database file to inspect.

    Returns:
        None. Any exception raised while sampling is printed and recorded under
        the ``"error"`` key of the JSON output instead of being propagated.
    """
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()

    samples = {
        "table_samples": {},
        "value_formats": {},
        "common_values": {},
        "edge_cases": {},
    }

    try:
        # Get all tables
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
        tables = [row[0] for row in cursor.fetchall()]

        for table in tables:
            # Quote once per table so names with spaces, dashes or reserved
            # words (e.g. "order") do not break the generated SQL.
            quoted_table = _quote_identifier(table)

            # Get row count
            cursor.execute(f"SELECT COUNT(*) FROM {quoted_table}")
            row_count = cursor.fetchone()[0]

            if row_count == 0:
                samples["table_samples"][table] = {
                    "sample_rows": [],
                    "row_count": 0,
                }
                continue

            # A single PRAGMA call supplies both the column names and their
            # declared types (row layout: cid, name, type, ...).
            cursor.execute(f"PRAGMA table_info({quoted_table})")
            column_info = cursor.fetchall()
            columns = [col[1] for col in column_info]
            column_types = {col[1]: col[2] for col in column_info}

            # Get sample rows
            sample_size = min(5, row_count)
            cursor.execute(
                f"SELECT * FROM {quoted_table} LIMIT ?", (sample_size,)
            )
            sample_rows = cursor.fetchall()

            samples["table_samples"][table] = {
                "columns": columns,
                "sample_rows": [dict(zip(columns, row)) for row in sample_rows],
                "row_count": row_count,
            }

            # Analyze value formats for text columns
            samples["value_formats"][table] = {}
            samples["common_values"][table] = {}

            for col in columns:
                quoted_col = _quote_identifier(col)
                col_type = column_types.get(col)

                if col_type and ('TEXT' in col_type.upper() or 'CHAR' in col_type.upper()):
                    # Get distinct values for categorical analysis
                    cursor.execute(f"""
                        SELECT DISTINCT {quoted_col}, COUNT(*) as freq
                        FROM {quoted_table}
                        WHERE {quoted_col} IS NOT NULL
                        GROUP BY {quoted_col}
                        ORDER BY freq DESC
                        LIMIT 20
                    """)
                    distinct_values = cursor.fetchall()

                    if distinct_values:
                        # Store common values with exact case
                        samples["common_values"][table][col] = [
                            {"value": v[0], "frequency": v[1]}
                            for v in distinct_values[:10]
                        ]

                        # Analyze format patterns
                        value_list = [v[0] for v in distinct_values if v[0] is not None]
                        format_info = analyze_value_format(value_list)
                        if format_info:
                            samples["value_formats"][table][col] = format_info

                # Get NULL count
                cursor.execute(
                    f"SELECT COUNT(*) FROM {quoted_table} WHERE {quoted_col} IS NULL"
                )
                null_count = cursor.fetchone()[0]

                if null_count > 0:
                    samples["edge_cases"].setdefault(table, {})[col] = {
                        "has_nulls": True,
                        "null_count": null_count,
                        "null_percentage": round((null_count / row_count) * 100, 2),
                    }

        # Save samples
        _write_samples_json(samples)

        # Generate human-readable sample report
        generate_sample_report(samples)

        print("Data sampling complete - results in tool_output/")

    except Exception as e:
        # Top-level boundary: report the failure and keep whatever was
        # collected. The helper creates tool_output/ itself, so a failure that
        # happens before the normal save step can still be recorded.
        print(f"Error sampling data: {e}")
        samples["error"] = str(e)
        _write_samples_json(samples)

    finally:
        conn.close()

def analyze_value_format(values):
    """Analyze the format of values to detect patterns.

    Args:
        values: Iterable of sampled column values. ``None`` entries are
            ignored; everything else is converted with ``str``.

    Returns:
        A dict containing whichever of these keys apply, or ``None`` when
        nothing could be detected:

        * ``case``: "UPPERCASE", "lowercase", "Title Case" or "Mixed Case".
          Decided only from values that contain at least one cased letter, so
          purely numeric or symbolic data gets no case label.
        * ``fixed_length``: shared length when every value has the same length.
        * ``common_prefix``: prefix of at least two characters shared by all
          values (only checked when there are more than three values).
        * ``special_characters``: sorted list of characters that are neither
          alphanumeric nor a space.
        * ``contains_dash`` / ``contains_underscore``: ``True`` when each of
          the first five values contains that character (only checked when
          there are more than two values).
    """
    if not values:
        return None

    str_values = [str(v) for v in values if v is not None]
    if not str_values:
        return None

    format_info = {}

    # Only values with at least one cased character can say anything about
    # case. Without this filter all() over an empty selection is vacuously
    # True and e.g. ["123", "456"] would be labelled UPPERCASE.
    cased_values = [
        v for v in str_values
        if any(c.isupper() or c.islower() for c in v)
    ]
    if cased_values:
        if all(v.isupper() for v in cased_values):
            format_info["case"] = "UPPERCASE"
        elif all(v.islower() for v in cased_values):
            format_info["case"] = "lowercase"
        elif all(v.istitle() for v in cased_values):
            format_info["case"] = "Title Case"
        else:
            # Not uniformly upper and not uniformly lower means both upper-
            # and lowercase letters occur somewhere in the data.
            format_info["case"] = "Mixed Case"

    # Check for common patterns
    first_length = len(str_values[0])
    if all(len(v) == first_length for v in str_values):
        format_info["fixed_length"] = first_length

    # Check for prefixes (commonprefix compares character by character, so it
    # works on arbitrary strings, not just paths)
    if len(str_values) > 3:
        common_prefix = os.path.commonprefix(str_values)
        if len(common_prefix) >= 2:
            format_info["common_prefix"] = common_prefix

    # Check for special characters
    special_chars = {
        char
        for v in str_values
        for char in v
        if not char.isalnum() and char != ' '
    }
    if special_chars:
        # Sorted so the output is stable between runs (set order is not).
        format_info["special_characters"] = sorted(special_chars)

    # Check for patterns like XX-## or similar
    if len(str_values) > 2:
        leading = str_values[:5]
        if all('-' in v for v in leading):
            format_info["contains_dash"] = True
        if all('_' in v for v in leading):
            format_info["contains_underscore"] = True

    return format_info if format_info else None

def _format_report_cell(val):
    """Render one sampled value as the text of a Markdown table cell."""
    if val is None:
        return "NULL"
    if not isinstance(val, str):
        return str(val)
    # Truncate the raw value first so an escape sequence is never cut in half.
    text = val if len(val) <= 50 else val[:47] + "..."
    # A raw line break would end the table row early, so show it escaped.
    text = text.replace("\r", "\\r").replace("\n", "\\n")
    # Escape pipes so they are not read as cell separators.
    return text.replace("|", "\\|")


def generate_sample_report(samples):
    """Generate a human-readable sample report.

    Writes ``tool_output/data_sample_report.txt`` (UTF-8), creating the
    directory when it does not exist yet.

    Args:
        samples: Dict with the keys ``table_samples``, ``common_values``,
            ``value_formats`` and ``edge_cases``, each mapping table names to
            the per-table data read below.

    Returns:
        None.
    """
    report = []
    report.append("# DATA SAMPLE REPORT\n\n")

    # Table samples
    for table, data in samples["table_samples"].items():
        report.append(f"## {table} (Total rows: {data['row_count']:,})\n\n")

        if data["sample_rows"]:
            report.append("### Sample Data (First 5 rows):\n\n")

            # Format as table
            if data["columns"]:
                # Header (pipes in column names are escaped like cell values)
                header_cells = [str(col).replace("|", "\\|") for col in data["columns"]]
                report.append("| " + " | ".join(header_cells) + " |\n")
                report.append("| " + " | ".join(["---"] * len(data["columns"])) + " |\n")

                # Rows
                for row in data["sample_rows"]:
                    row_values = [_format_report_cell(row.get(col)) for col in data["columns"]]
                    report.append("| " + " | ".join(row_values) + " |\n")

            report.append("\n")

        # Common values for important columns
        if table in samples["common_values"] and samples["common_values"][table]:
            report.append("### Common Values:\n\n")
            for col, values in samples["common_values"][table].items():
                if values:
                    report.append(f"**{col}**:\n")
                    for v in values[:5]:
                        if v["value"] is not None:
                            report.append(f"  - `{v['value']}` (appears {v['frequency']} times)\n")
                    report.append("\n")

        # Value formats
        if table in samples["value_formats"] and samples["value_formats"][table]:
            report.append("### Value Formats:\n\n")
            for col, format_info in samples["value_formats"][table].items():
                report.append(f"**{col}**:\n")
                for key, value in format_info.items():
                    if key == "case":
                        report.append(f"  - Case: {value}\n")
                    elif key == "fixed_length":
                        report.append(f"  - Fixed length: {value} characters\n")
                    elif key == "common_prefix":
                        report.append(f"  - Common prefix: `{value}`\n")
                    elif key == "special_characters":
                        report.append(f"  - Contains: {', '.join(value)}\n")
                report.append("\n")

        # Edge cases
        if table in samples["edge_cases"]:
            report.append("### NULL Values:\n\n")
            for col, info in samples["edge_cases"][table].items():
                if info.get("has_nulls"):
                    report.append(f"- {col}: {info['null_percentage']}% NULL ({info['null_count']} rows)\n")
            report.append("\n")

        report.append("---\n\n")

    # Create the output directory here so the function also works when called
    # on its own, and write UTF-8 so sampled text does not depend on the
    # platform's default encoding.
    os.makedirs("tool_output", exist_ok=True)
    with open("tool_output/data_sample_report.txt", "w", encoding="utf-8") as f:
        f.writelines(report)

# Script entry point: when run directly, sample the database at
# sample_data's default path.
if __name__ == "__main__":
    sample_data()