#!/usr/bin/env python3
"""
Evidence Translator - Maps common evidence patterns to SQL constructs.
Helps the eval model correctly interpret evidence hints.
"""

import os
import json

def _build_evidence_rules():
    """Return the static list of evidence-to-SQL rule categories.

    Each entry is a dict with a "category" name and a "patterns" list; every
    pattern is a dict with "evidence_pattern", "sql_translation" and
    "example" keys.
    """
    return [
        # Column mapping patterns
        {
            "category": "Column Mappings",
            "patterns": [
                {
                    "evidence_pattern": "X refers to Y",
                    "sql_translation": "Use Y as the column name for X",
                    "example": "'total sales refers to amount' → Use 'amount' column",
                },
                {
                    "evidence_pattern": "X is the Y",
                    "sql_translation": "Y is the column name for X",
                    "example": "'customer name is the full_name' → Use 'full_name' column",
                },
                {
                    "evidence_pattern": "X = value",
                    "sql_translation": "Add WHERE X = value condition",
                    "example": "'status = \"Active\"' → WHERE status = 'Active'",
                },
                {
                    "evidence_pattern": "use X for Y",
                    "sql_translation": "Use column X when question asks for Y",
                    "example": "'use created_at for registration date' → SELECT created_at",
                },
            ],
        },
        # Calculation patterns
        {
            "category": "Calculations",
            "patterns": [
                {
                    "evidence_pattern": "DIVIDE(X, Y)",
                    "sql_translation": "CAST(X AS REAL) / NULLIF(Y, 0)",
                    "example": "'DIVIDE(sales, customers)' → CAST(sales AS REAL) / NULLIF(customers, 0)",
                },
                {
                    "evidence_pattern": "percentage",
                    "sql_translation": "(part / whole) * 100 or just (part / whole) - check evidence",
                    "example": "'percentage of active' → Check if evidence shows * 100",
                },
                {
                    "evidence_pattern": "X - Y",
                    "sql_translation": "X - Y (direct subtraction)",
                    "example": "'revenue - cost' → revenue - cost",
                },
                {
                    "evidence_pattern": "SUM of X",
                    "sql_translation": "SUM(X)",
                    "example": "'SUM of quantities' → SUM(quantities)",
                },
                {
                    "evidence_pattern": "average X",
                    "sql_translation": "AVG(X)",
                    "example": "'average salary' → AVG(salary)",
                },
            ],
        },
        # Comparison patterns
        {
            "category": "Comparisons",
            "patterns": [
                {
                    "evidence_pattern": "the most",
                    "sql_translation": "ORDER BY column DESC LIMIT 1 or MAX(column)",
                    "example": "'the most expensive' → ORDER BY price DESC LIMIT 1",
                },
                {
                    "evidence_pattern": "the least",
                    "sql_translation": "ORDER BY column ASC LIMIT 1 or MIN(column)",
                    "example": "'the least populated' → ORDER BY population ASC LIMIT 1",
                },
                {
                    "evidence_pattern": "the oldest",
                    "sql_translation": "MIN(date/year) or ORDER BY date ASC LIMIT 1",
                    "example": "'the oldest record' → ORDER BY created_date ASC LIMIT 1",
                },
                {
                    "evidence_pattern": "the newest/latest",
                    "sql_translation": "MAX(date/year) or ORDER BY date DESC LIMIT 1",
                    "example": "'the latest update' → ORDER BY updated_at DESC LIMIT 1",
                },
                {
                    "evidence_pattern": "top N",
                    "sql_translation": "ORDER BY metric DESC LIMIT N",
                    "example": "'top 5 sellers' → ORDER BY sales DESC LIMIT 5",
                },
                {
                    "evidence_pattern": "bottom N",
                    "sql_translation": "ORDER BY metric ASC LIMIT N",
                    "example": "'bottom 3 performers' → ORDER BY score ASC LIMIT 3",
                },
            ],
        },
        # NULL and empty handling
        {
            "category": "NULL/Empty Handling",
            "patterns": [
                {
                    "evidence_pattern": "without X",
                    "sql_translation": "Check evidence - could be IS NULL or = ''",
                    "example": "'without email' → email IS NULL or email = ''",
                },
                {
                    "evidence_pattern": "no X",
                    "sql_translation": "X IS NULL or X = '' or X = 0 - check evidence",
                    "example": "'no involvement' → involvement = '' (per evidence)",
                },
                {
                    "evidence_pattern": "has X",
                    "sql_translation": "X IS NOT NULL AND X != ''",
                    "example": "'has phone number' → phone IS NOT NULL AND phone != ''",
                },
                {
                    "evidence_pattern": "with X",
                    "sql_translation": "X IS NOT NULL AND X != ''",
                    "example": "'with description' → description IS NOT NULL AND description != ''",
                },
            ],
        },
        # Date/Time patterns
        {
            "category": "Date/Time",
            "patterns": [
                {
                    "evidence_pattern": "in year YYYY",
                    "sql_translation": "strftime('%Y', date_column) = 'YYYY' or YEAR(date_column) = YYYY",
                    "example": "'in year 2023' → strftime('%Y', created_date) = '2023'",
                },
                {
                    "evidence_pattern": "before date",
                    "sql_translation": "date_column < 'YYYY-MM-DD'",
                    "example": "'before 2023-01-01' → date_column < '2023-01-01'",
                },
                {
                    "evidence_pattern": "after date",
                    "sql_translation": "date_column > 'YYYY-MM-DD'",
                    "example": "'after 2023-01-01' → date_column > '2023-01-01'",
                },
                {
                    "evidence_pattern": "between dates",
                    "sql_translation": "date_column BETWEEN 'date1' AND 'date2'",
                    "example": "'between 2023-01-01 and 2023-12-31' → BETWEEN dates",
                },
                {
                    "evidence_pattern": "scheduled vs actual",
                    "sql_translation": "scheduled columns often have prefix (CRS_), actual without",
                    "example": "'earliest scheduled' → Use CRS_ prefixed columns",
                },
            ],
        },
        # Aggregation indicators
        {
            "category": "Aggregation Indicators",
            "patterns": [
                {
                    "evidence_pattern": "how many",
                    "sql_translation": "COUNT(*) or COUNT(DISTINCT column)",
                    "example": "'how many customers' → COUNT(DISTINCT customer_id)",
                },
                {
                    "evidence_pattern": "total",
                    "sql_translation": "SUM(column)",
                    "example": "'total revenue' → SUM(revenue)",
                },
                {
                    "evidence_pattern": "average/mean",
                    "sql_translation": "AVG(column)",
                    "example": "'average price' → AVG(price)",
                },
                {
                    "evidence_pattern": "maximum",
                    "sql_translation": "MAX(column)",
                    "example": "'maximum salary' → MAX(salary)",
                },
                {
                    "evidence_pattern": "minimum",
                    "sql_translation": "MIN(column)",
                    "example": "'minimum age' → MIN(age)",
                },
            ],
        },
        # Special SQL constructs
        {
            "category": "Special SQL Constructs",
            "patterns": [
                {
                    "evidence_pattern": "distinct/unique",
                    "sql_translation": "Use DISTINCT in SELECT or COUNT",
                    "example": "'unique products' → SELECT DISTINCT product_id",
                },
                {
                    "evidence_pattern": "group by X",
                    "sql_translation": "GROUP BY X with appropriate aggregation",
                    "example": "'sales by region' → GROUP BY region",
                },
                {
                    "evidence_pattern": "for each X",
                    "sql_translation": "GROUP BY X",
                    "example": "'count for each category' → GROUP BY category",
                },
                {
                    "evidence_pattern": "per X",
                    "sql_translation": "GROUP BY X or use X as denominator",
                    "example": "'sales per customer' → sales / customer_count",
                },
            ],
        },
    ]


def _write_text_report(path, rules):
    """Write *rules* and the keyword quick-reference table to *path* as text."""
    lines = [
        "EVIDENCE TRANSLATION RULES\n",
        "=" * 60 + "\n\n",
        "Use these rules to translate evidence hints into SQL constructs.\n",
        "Evidence takes priority over everything else!\n\n",
    ]

    for rule_set in rules:
        lines.append(f"## {rule_set['category']}\n")
        lines.append("-" * 40 + "\n")
        for pattern in rule_set['patterns']:
            lines.append(f"Evidence: {pattern['evidence_pattern']}\n")
            lines.append(f"SQL:      {pattern['sql_translation']}\n")
            lines.append(f"Example:  {pattern['example']}\n")
            lines.append("\n")
        lines.append("\n")

    # Quick reference section
    lines.append("=" * 60 + "\n")
    lines.append("QUICK REFERENCE - COMMON EVIDENCE KEYWORDS\n")
    lines.append("=" * 60 + "\n\n")

    keywords = {
        "refers to": "Column name mapping",
        "is the": "Column name mapping",
        "DIVIDE": "Division with NULL safety",
        "percentage": "Check if needs * 100",
        "the most": "MAX or ORDER BY DESC LIMIT 1",
        "the least": "MIN or ORDER BY ASC LIMIT 1",
        "without": "IS NULL or = '' (check evidence)",
        "how many": "COUNT(*) or COUNT(DISTINCT)",
        "total": "SUM(column)",
        "average": "AVG(column)",
        "distinct": "Use DISTINCT",
        "for each": "GROUP BY",
    }
    lines.extend(
        f"'{keyword}' → {translation}\n"
        for keyword, translation in keywords.items()
    )

    # Explicit UTF-8: the report contains '→', which the platform default
    # encoding (e.g. cp1252 on Windows) cannot encode.
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(lines)


def generate_evidence_rules(output_dir: str = 'tool_output') -> list:
    """Generate the evidence mapping rules and save them to disk.

    Writes two files into *output_dir* (created if missing):
    ``evidence_rules.txt``, a human-readable report, and
    ``evidence_rules.json``, the same rules wrapped as ``{"rules": [...]}``.
    Both files are written as UTF-8. A one-line summary is printed to stdout.

    Args:
        output_dir: Directory that receives the output files. Defaults to
            ``'tool_output'`` relative to the current working directory. An
            empty string means the current working directory.

    Returns:
        The list of rule-category dicts that was written.

    Raises:
        OSError: If the directory cannot be created or a file cannot be
            written.
    """
    rules = _build_evidence_rules()

    # os.makedirs('') raises, so skip it when writing into the current directory.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save as text for readability
    _write_text_report(os.path.join(output_dir, 'evidence_rules.txt'), rules)

    # Also save as JSON for potential programmatic use. json.dump keeps its
    # default ensure_ascii=True, so non-ASCII characters are \u-escaped.
    with open(os.path.join(output_dir, 'evidence_rules.json'), 'w',
              encoding='utf-8') as f:
        json.dump({"rules": rules}, f, indent=2)

    print(f"Evidence translation rules generated: {len(rules)} categories")
    return rules

# Script entry point: generate the rule files only when this module is run
# directly, not when it is imported.
if __name__ == "__main__":
    generate_evidence_rules()