#!/usr/bin/env python3
"""
Value Sampler - Enhanced from iter20
Comprehensive value sampling with exact case and format detection
"""

import sqlite3
import os

def _quote_identifier(name):
    """Return *name* as a double-quoted SQL identifier.

    Quoting unconditionally (and doubling any embedded double quote) keeps
    reserved words such as ``order`` and names with unusual characters
    usable in the generated SQL.
    """
    return '"' + str(name).replace('"', '""') + '"'


def _sample_low_cardinality(cursor, table_sql, col_sql, col_name, distinct_count, out):
    """Append every distinct value of a low-cardinality column to *out*.

    Returns the list of issue tags found for the column
    (``"case_sensitive"`` and/or ``"has_empty_string"``).
    """
    cursor.execute(f"""
        SELECT {col_sql}, COUNT(*) as cnt
        FROM {table_sql}
        WHERE {col_sql} IS NOT NULL
        GROUP BY {col_sql}
        ORDER BY cnt DESC, {col_sql}
    """)
    value_counts = cursor.fetchall()

    out.append(f"**{col_name}** ({distinct_count} distinct values):")

    values = [row[0] for row in value_counts]
    if len(values) <= 10:
        for val, cnt in value_counts:
            out.append(f"  '{val}' ({cnt} rows)")
    else:
        out.append(f"  All values: {values}")

    issues = []

    # The values are already distinct, so a collision after lower-casing
    # means two values differ only by case.
    str_values = [v for v in values if isinstance(v, str)]
    if len({v.lower() for v in str_values}) < len(str_values):
        out.append("  ⚠ CASE SENSITIVE! Use exact case")
        issues.append("case_sensitive")

    if '' in values:
        out.append("  ⚠ Contains empty string ''")
        issues.append("has_empty_string")

    return issues


def _sample_high_cardinality(cursor, table_sql, col_sql, col_name, col_type, distinct_count, out):
    """Append random samples and a type-specific summary of a column to *out*."""
    out.append(f"**{col_name}** ({distinct_count} distinct values):")

    cursor.execute(f"""
        SELECT DISTINCT {col_sql}
        FROM {table_sql}
        WHERE {col_sql} IS NOT NULL
        ORDER BY RANDOM()
        LIMIT 10
    """)
    samples = [row[0] for row in cursor.fetchall()]

    if samples:
        out.append(f"  Samples: {samples[:5]}")

    declared_type = (col_type or "").upper()

    # Numeric columns: show the value range.
    if any(t in declared_type for t in ('INT', 'REAL', 'NUMERIC', 'DECIMAL')):
        cursor.execute(f"""
            SELECT MIN({col_sql}), MAX({col_sql}), AVG({col_sql})
            FROM {table_sql}
            WHERE {col_sql} IS NOT NULL
        """)
        min_val, max_val, avg_val = cursor.fetchone()
        if min_val is not None:
            # Format the average separately: a conditional expression cannot
            # live inside an f-string format spec.
            avg_text = f"{avg_val:.2f}" if avg_val is not None else "0"
            out.append(f"  Range: {min_val} to {max_val} (avg: {avg_text})")

    # Date columns: show the stored format via a few raw examples.
    elif 'DATE' in declared_type:
        cursor.execute(f"""
            SELECT DISTINCT {col_sql}
            FROM {table_sql}
            WHERE {col_sql} IS NOT NULL
            LIMIT 3
        """)
        date_samples = [row[0] for row in cursor.fetchall()]
        if date_samples:
            out.append(f"  Date format: {date_samples[0]}")
            out.append(f"  Examples: {date_samples}")

    # String columns: flag characters that commonly break literal matching.
    elif any(isinstance(s, str) for s in samples):
        has_spaces = any(' ' in str(s) for s in samples if s)
        has_special = any(any(c in str(s) for c in "',\"") for s in samples if s)

        if has_spaces:
            out.append("  ⚠ Values contain spaces")
        if has_special:
            out.append("  ⚠ Values contain special characters")


def _describe_column(cursor, table_sql, col_name, col_type, out):
    """Append the report section for one column to *out*.

    Lines are appended as they are produced, so anything written before a
    failure stays in the report. Returns the list of issue tags for the
    column (empty when nothing noteworthy was found).
    """
    col_sql = _quote_identifier(col_name)

    cursor.execute(f"""
        SELECT
            COUNT(DISTINCT {col_sql}) as distinct_count,
            COUNT(*) as total_count,
            COUNT({col_sql}) as non_null_count
        FROM {table_sql}
    """)
    distinct_count, total_count, non_null_count = cursor.fetchone()

    if distinct_count == 0:
        out.append(f"**{col_name}**: All NULL")
        out.append("")
        return []

    issues = []
    if distinct_count <= 20:
        # Low cardinality: list every value with its row count.
        issues = _sample_low_cardinality(
            cursor, table_sql, col_sql, col_name, distinct_count, out
        )
    else:
        # High cardinality: show samples and patterns only.
        _sample_high_cardinality(
            cursor, table_sql, col_sql, col_name, col_type, distinct_count, out
        )

    null_count = total_count - non_null_count
    if null_count > 0:
        null_ratio = null_count / total_count
        if null_ratio > 0.5:
            out.append(f"  ⚠ {null_count}/{total_count} rows are NULL ({null_ratio:.1%})")
        else:
            out.append(f"  Has {null_count} NULL values")

    out.append("")
    return issues


def sample_values():
    """Sample values comprehensively from all columns.

    Reads every table of ``database.sqlite`` (in the current working
    directory) and writes a report to
    ``tool_output/value_sampler_output.txt``. Columns with at most 20
    distinct values are listed in full; others get random samples plus a
    numeric range, date examples or string-pattern warnings depending on
    the declared column type. Columns whose values differ only by case or
    contain empty strings are repeated in a "CRITICAL COLUMNS" summary.

    A failure on a single table or column is recorded in the report and
    sampling continues.

    Raises:
        Exception: any error outside the per-table/per-column handling is
            re-raised after a fallback report has been written.
    """
    issue_labels = {
        "case_sensitive": "CASE SENSITIVE",
        "has_empty_string": "Has empty string ''",
    }

    try:
        output = [
            "# VALUE SAMPLES",
            "(Exact values for precise matching)",
            "",
        ]
        critical_columns = {}  # "table.column" -> list of issue tags

        conn = sqlite3.connect("database.sqlite")
        try:
            cursor = conn.cursor()

            cursor.execute("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name")
            tables = [row[0] for row in cursor.fetchall()]

            for table in tables:
                output.append(f"## {table}")
                output.append("")

                table_sql = _quote_identifier(table)

                try:
                    cursor.execute(f"PRAGMA table_info({table_sql})")
                    columns = cursor.fetchall()
                except Exception as e:
                    output.append(f"Error reading table: {e}")
                    output.append("")
                    continue

                for col in columns:
                    col_name = col[1]
                    col_type = col[2]

                    try:
                        issues = _describe_column(
                            cursor, table_sql, col_name, col_type, output
                        )
                    except Exception as e:
                        # Best effort: note the failure and move on.
                        output.append(f"**{col_name}**: Error sampling - {str(e)[:50]}")
                        output.append("")
                    else:
                        if issues:
                            critical_columns[f"{table}.{col_name}"] = issues
        finally:
            # Release the connection even when sampling aborts.
            conn.close()

        if critical_columns:
            output.append("## CRITICAL COLUMNS")
            output.append("(Pay special attention to these)")
            output.append("")

            for col, issues in critical_columns.items():
                for issue in issues:
                    output.append(f"- {col}: {issue_labels[issue]}")
            output.append("")

        os.makedirs("tool_output", exist_ok=True)
        with open("tool_output/value_sampler_output.txt", "w", encoding='utf-8') as f:
            f.write("\n".join(output))

        print(f"Value sampling complete - {len(tables)} tables, {len(critical_columns)} critical columns found")

    except Exception as e:
        # Fallback output so downstream consumers still find a report file.
        error_output = [
            "# VALUE SAMPLES",
            "",
            "ERROR: Could not sample values",
            f"Reason: {e}",
            "",
            "Using basic patterns as fallback."
        ]

        os.makedirs("tool_output", exist_ok=True)
        with open("tool_output/value_sampler_output.txt", "w", encoding='utf-8') as f:
            f.write("\n".join(error_output))

        print(f"Value sampling failed: {e}")
        raise

# Script entry point: run the sampler only when executed directly, not on import.
if __name__ == "__main__":
    sample_values()