#!/usr/bin/env python3
"""
Enhanced Evidence Analyzer Tool
Extracts and parses evidence patterns with improved multi-column detection.
"""

import json
import os
import re

def _build_evidence_patterns():
    """Return the static evidence-pattern catalog as a JSON-serializable dict.

    The catalog is pure data (nested dicts, lists and strings). Keys are
    inserted in a fixed order so the serialized JSON layout is stable:
    parsing_rules, operator_mappings, formula_templates, column_mappings,
    multi_column_patterns, value_formats, critical_notes.
    """
    # Rules describing how to read individual evidence phrases.
    parsing_rules = [
        {
            "pattern": "X refers to Y",
            "meaning": "Use column/table Y when question mentions X",
            "example": "department refers to organ → Use 'organ' column for department",
            "priority": "HIGHEST - Always follow this mapping",
        },
        {
            "pattern": "X refers to Y, Z",
            "meaning": "Return MULTIPLE columns Y and Z when X is mentioned",
            "example": "name refers to FirstName, LastName → SELECT FirstName, LastName",
            "priority": "HIGHEST - Return ALL listed columns in that order",
            "critical": "This is a multi-column mapping - must return both!",
        },
        {
            "pattern": "X refers to Y and Z",
            "meaning": "Return both Y and Z columns",
            "example": "address refers to Street and City → SELECT Street, City",
            "priority": "HIGHEST - Return both in order",
        },
        {
            "pattern": "X = formula",
            "meaning": "Use exact formula provided",
            "example": "percentage = COUNT(X) * 100 / COUNT(Y)",
            "priority": "HIGHEST - Copy formula exactly",
        },
        {
            "pattern": "X > value",
            "meaning": "Use > operator, NOT >=",
            "example": "income > 3000 means > not >=",
            "priority": "HIGH - Exact operator matters",
        },
        {
            "pattern": "X < value",
            "meaning": "Use < operator, NOT <=",
            "example": "age < 18 means < not <=",
            "priority": "HIGH - Exact operator matters",
        },
        {
            "pattern": "between X and Y",
            "meaning": "Use BETWEEN or >= AND <=",
            "example": "age between 20 and 30 → age >= 20 AND age <= 30",
            "priority": "HIGH - Inclusive bounds unless specified",
        },
        {
            "pattern": "over X refers to Y > X",
            "meaning": "Translates 'over' to strict greater than",
            "example": "over 3000 refers to INCOME_K > 3000",
            "priority": "HIGH - Use > not >=",
        },
        {
            "pattern": "at least X",
            "meaning": "Use >= operator",
            "example": "at least 5 → >= 5",
            "priority": "HIGH",
        },
        {
            "pattern": "at most X",
            "meaning": "Use <= operator",
            "example": "at most 10 → <= 10",
            "priority": "HIGH",
        },
    ]

    # Phrasings that signal several columns must be returned together.
    multi_column_patterns = [
        {
            "pattern": "refers to Col1, Col2, Col3",
            "interpretation": "Return ALL columns in the exact order listed",
            "sql_impact": "SELECT Col1, Col2, Col3",
            "warning": "Missing any column is an error",
        },
        {
            "pattern": "includes X and Y",
            "interpretation": "Must return both X and Y",
            "sql_impact": "Both columns required in SELECT",
        },
        {
            "pattern": "consists of A, B",
            "interpretation": "Composite result from multiple columns",
            "sql_impact": "SELECT A, B in that order",
        },
        {
            "pattern": "(Column1, Column2)",
            "interpretation": "Parenthetical list means all columns",
            "sql_impact": "Return all columns within parentheses",
        },
    ]

    # Evidence wording -> SQL operator to emit.
    operator_mappings = [
        {
            "evidence_term": ">",
            "sql_operator": ">",
            "warning": "NOT >= unless explicitly stated",
        },
        {
            "evidence_term": "<",
            "sql_operator": "<",
            "warning": "NOT <= unless explicitly stated",
        },
        {
            "evidence_term": "!=",
            "sql_operator": "!=",
            "alternative": "<>",
        },
        {
            "evidence_term": "not equal",
            "sql_operator": "!=",
            "alternative": "<>",
        },
        {
            "evidence_term": "is null",
            "sql_operator": "IS NULL",
            "warning": "Use IS NULL, not = NULL",
        },
        {
            "evidence_term": "is not null",
            "sql_operator": "IS NOT NULL",
            "warning": "Use IS NOT NULL, not != NULL",
        },
    ]

    # Common shapes of formulas that appear in evidence text.
    formula_templates = [
        {
            "type": "percentage",
            "common_patterns": [
                "COUNT(condition) * 100 / COUNT(*)",
                "CAST(COUNT(condition) AS REAL) * 100 / COUNT(*)",
                "SUM(CASE WHEN condition THEN 1 ELSE 0 END) * 100.0 / COUNT(*)",
            ],
            "rule": "Use exact formula from evidence",
        },
        {
            "type": "ratio",
            "common_patterns": [
                "COUNT(X) / COUNT(Y)",
                "CAST(COUNT(X) AS REAL) / COUNT(Y)",
            ],
            "rule": "Preserve CAST for decimal results",
        },
        {
            "type": "calculation",
            "examples": [
                "price * quantity",
                "end_date - start_date",
            ],
            "rule": "Copy calculation exactly as shown",
        },
    ]

    # How a term in the question maps onto schema columns.
    column_mappings = [
        {
            "type": "rename",
            "pattern": "X refers to Y",
            "action": "Replace X with Y in query",
        },
        {
            "type": "multi_column",
            "pattern": "X refers to Y, Z",
            "action": "Replace X with both Y and Z columns",
            "critical": "Must return multiple columns",
        },
        {
            "type": "table_specific",
            "pattern": "in table T, X refers to Y",
            "action": "Use Y for X only in table T",
        },
        {
            "type": "combined",
            "pattern": "X refers to CONCAT(Y, Z)",
            "action": "Use concatenation when X is requested",
        },
    ]

    # Literal value formats that must be preserved from the evidence.
    value_formats = [
        {
            "type": "case_sensitive",
            "indicator": "exact match",
            "rule": "Preserve exact case from evidence",
        },
        {
            "type": "date_format",
            "patterns": ["YYYY-MM-DD", "MM/DD/YYYY"],
            "rule": "Use format shown in evidence",
        },
        {
            "type": "boolean",
            "patterns": ["'T'/'F'", "'Y'/'N'", "1/0", "'yes'/'no'"],
            "rule": "Use exact format from evidence",
        },
    ]

    # Free-text reminders emitted alongside the structured rules.
    critical_notes = [
        "Evidence ALWAYS overrides general patterns",
        "Multi-column mappings (X refers to Y, Z) must return ALL columns",
        "Column order in evidence determines SELECT column order",
        "Operators must match exactly (> vs >= matters)",
        "Formulas should be copied verbatim",
        "When evidence conflicts with question, follow evidence",
        "Check for comma-separated column lists in 'refers to' patterns",
        "Validate evidence mappings against actual schema",
    ]

    # Key order below is the order the keys appear in the written JSON file;
    # keep it stable so the output does not churn between runs/versions.
    return {
        "parsing_rules": parsing_rules,
        "operator_mappings": operator_mappings,
        "formula_templates": formula_templates,
        "column_mappings": column_mappings,
        "multi_column_patterns": multi_column_patterns,
        "value_formats": value_formats,
        "critical_notes": critical_notes,
    }


def analyze_evidence_patterns(output_dir="tool_output"):
    """Generate evidence parsing rules with multi-column support.

    Builds the static rule catalog, writes it as indented JSON to
    ``<output_dir>/evidence_patterns.json`` (creating the directory if it
    does not exist) and prints a short summary to stdout.

    Args:
        output_dir: Directory that receives ``evidence_patterns.json``.
            Defaults to ``"tool_output"`` relative to the current working
            directory, which is the location the tool has always used.

    Returns:
        The catalog dict that was written, so callers can use the rules
        without re-reading the file.

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            written.
    """
    evidence_patterns = _build_evidence_patterns()

    os.makedirs(output_dir, exist_ok=True)
    # A literal "/" keeps the reported path identical to the historical
    # "tool_output/evidence_patterns.json" message for the default directory.
    output_path = f"{output_dir}/evidence_patterns.json"

    # json.dump escapes non-ASCII by default, so the bytes written are the
    # same as before; the explicit encoding just removes the dependency on
    # the platform's locale default.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(evidence_patterns, f, indent=2)

    print("Evidence analysis complete")
    print(f"Generated {len(evidence_patterns['parsing_rules'])} parsing rules")
    print(f"Identified {len(evidence_patterns['multi_column_patterns'])} multi-column patterns")
    print(f"Results saved to {output_path}")

    return evidence_patterns

# Script entry point: generate the patterns file only when this module is
# executed directly, not when it is imported.
if __name__ == "__main__":
    analyze_evidence_patterns()