#!/usr/bin/env python3
"""
Evidence Mapper - Translates common evidence patterns to exact database columns.
Helps the eval model map evidence hints to the correct table.column locations.
"""

import sqlite3
import json
import os

# Concepts compared against every column name. These are fixed up front so
# that matching never depends on the order in which columns are scanned.
_BASE_CONCEPTS = (
    "full_name",
    "name",
    "identifier",
    "id",
    "date",
    "time",
    "year",
    "amount",
    "count",
    "total",
    "average",
    "percentage",
    "rate",
    "status",
    "type",
    "category",
    "location",
    "address",
    "description",
)

# Evidence term -> substrings that, when found in a column name, suggest the
# column is what the evidence term refers to.
_ALIAS_PATTERNS = {
    "graduated": ["grad", "graduate", "graduation"],
    "enrolled": ["enroll", "enrollment", "enrolled"],
    "percent": ["percentage", "pct", "percent", "_percent"],
    "average": ["avg", "mean", "average"],
    "total": ["sum", "total", "count"],
    "maximum": ["max", "maximum", "highest"],
    "minimum": ["min", "minimum", "lowest"],
}

# Evidence phrase -> SQL aggregation template.
_AGGREGATION_HINTS = {
    "how many": "COUNT(*) or COUNT(DISTINCT column)",
    "total": "SUM(column)",
    "average": "AVG(column)",
    "maximum": "MAX(column)",
    "minimum": "MIN(column)",
    "highest": "MAX(column) or ORDER BY column DESC LIMIT 1",
    "lowest": "MIN(column) or ORDER BY column ASC LIMIT 1",
    "most": "GROUP BY ... ORDER BY COUNT(*) DESC LIMIT 1",
    "least": "GROUP BY ... ORDER BY COUNT(*) ASC LIMIT 1",
}


def _quote_identifier(name):
    """Return *name* as a double-quoted SQL identifier (embedded quotes doubled)."""
    return '"' + name.replace('"', '""') + '"'


def _load_columns(cursor):
    """Return ``{table: [(column_name, declared_type), ...]}`` for user tables."""
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
    # Names starting with "sqlite_" are SQLite's own bookkeeping tables
    # (e.g. sqlite_sequence); they are not part of the user's schema.
    tables = [
        row[0]
        for row in cursor.fetchall()
        if not row[0].lower().startswith("sqlite_")
    ]

    all_columns = {}
    for table in tables:
        # PRAGMA arguments cannot be bound as parameters, so the table name
        # is quoted to survive reserved words, spaces and embedded quotes.
        cursor.execute(f"PRAGMA table_info({_quote_identifier(table)})")
        all_columns[table] = [(col[1], col[2]) for col in cursor.fetchall()]
    return all_columns


def _map_concepts(all_columns):
    """Return ``{concept: ["table.column", ...]}`` for concepts with a match."""
    concept_patterns = {concept: [] for concept in _BASE_CONCEPTS}
    # first_name / last_name / middle_name are collected separately and merged
    # afterwards; adding them to the dict being scanned would make later
    # columns match (or double-match) against them depending on column order.
    name_part_concepts = {}

    for table, columns in all_columns.items():
        for col_name, _ in columns:
            col_lower = col_name.lower()
            qualified = f"{table}.{col_name}"

            # Substring match in either direction, so abbreviated column
            # names (e.g. "desc") still hit the longer concept.
            for concept, matches in concept_patterns.items():
                if concept in col_lower or col_lower in concept:
                    matches.append(qualified)

            if "name" in col_lower:
                for part in ("first", "last", "middle"):
                    if part in col_lower:
                        name_part_concepts.setdefault(
                            f"{part}_name", []
                        ).append(qualified)

    concept_patterns.update(name_part_concepts)
    return {
        concept: matches
        for concept, matches in concept_patterns.items()
        if matches
    }


def _name_construction_lines(all_columns):
    """Return guide lines showing how to concatenate name parts per table."""
    name_columns = {"first": [], "middle": [], "last": []}
    for table, columns in all_columns.items():
        for col_name, _ in columns:
            col_lower = col_name.lower()
            if "name" not in col_lower:
                continue
            # A column is classified under the first part it mentions.
            for part in ("first", "middle", "last"):
                if part in col_lower:
                    name_columns[part].append((table, col_name))
                    break

    if not (name_columns["first"] and name_columns["last"]):
        return []

    lines = ["### Full name construction:"]
    for first_table, first_col in name_columns["first"]:
        for last_table, last_col in name_columns["last"]:
            if first_table != last_table:
                continue
            # Only the first middle-name column of the table is used.
            middle_col = next(
                (
                    col
                    for tbl, col in name_columns["middle"]
                    if tbl == first_table
                ),
                None,
            )
            if middle_col is not None:
                lines.append(
                    f"- {first_table}: `{first_col} || ' ' || {middle_col} || ' ' || {last_col}`"
                )
            lines.append(f"- {first_table}: `{first_col} || ' ' || {last_col}`")
    lines.append("")
    return lines


def _map_aliases(all_columns):
    """Return ``{evidence_term: ["table.column", ...]}`` for matching aliases."""
    aliases = {}
    for concept, patterns in _ALIAS_PATTERNS.items():
        matches = [
            f"{table}.{col_name}"
            for table, columns in all_columns.items()
            for col_name, _ in columns
            if any(p in col_name.lower() for p in patterns)
        ]
        if matches:
            aliases[concept] = matches
    return aliases


def _build_join_hints(schema_info):
    """Return one JOIN hint dict per foreign key listed in *schema_info*."""
    return [
        {
            "from_table": table,
            "from_column": fk["from_column"],
            "to_table": fk["to_table"],
            "to_column": fk["to_column"],
            "pattern": f"JOIN {fk['to_table']} ON {table}.{fk['from_column']} = {fk['to_table']}.{fk['to_column']}",
        }
        for table, fks in schema_info["foreign_keys"].items()
        for fk in fks
    ]


def _format_matches(heading, matches):
    """Return *heading*, one bullet per match, and a trailing blank line."""
    return [heading, *(f"  - {match}" for match in matches), ""]


def map_evidence_patterns(db_path="database.sqlite", output_dir="tool_output"):
    """Generate mappings from common evidence patterns to exact columns.

    Reads the schema of the SQLite database at *db_path* and writes two files
    into *output_dir* (created if missing):

    * ``evidence_mapping.txt`` - a human-readable guide.
    * ``evidence_mapping.json`` - the same mappings with keys
      ``concept_to_column``, ``common_aliases``, ``aggregation_hints`` and
      ``join_hints``.

    If ``schema_validation.json`` exists in *output_dir* and has a
    ``foreign_keys`` entry, a JOIN section is generated from it.

    Args:
        db_path: Path of the SQLite database to inspect.
        output_dir: Directory holding ``schema_validation.json`` (optional
            input) and receiving the generated files.

    Raises:
        sqlite3.Error: If the database cannot be read.
        OSError: If the output directory or files cannot be written.
        json.JSONDecodeError: If ``schema_validation.json`` is not valid JSON.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Load schema validation if available
    schema_info = {}
    schema_path = os.path.join(output_dir, "schema_validation.json")
    if os.path.exists(schema_path):
        with open(schema_path, encoding="utf-8") as f:
            schema_info = json.load(f)

    # The connection is only needed to read the schema; close it right away.
    conn = sqlite3.connect(db_path)
    try:
        all_columns = _load_columns(conn.cursor())
    finally:
        conn.close()

    mappings = {
        "concept_to_column": {},
        "common_aliases": {},
        "aggregation_hints": {},
        "join_hints": [],
    }
    output_lines = [
        "# EVIDENCE TO COLUMN MAPPING GUIDE",
        "# Translate evidence hints to exact database columns",
        "",
    ]

    # Section 1: Common concept mappings
    output_lines += ["## CONCEPT TO COLUMN MAPPINGS", ""]
    for concept, matches in sorted(_map_concepts(all_columns).items()):
        mappings["concept_to_column"][concept] = matches
        output_lines += _format_matches(f"### '{concept}' maps to:", matches)

    # Section 2: Name construction patterns
    output_lines += ["## NAME CONSTRUCTION PATTERNS", ""]
    output_lines += _name_construction_lines(all_columns)

    # Section 3: Common aliases
    output_lines += [
        "## COMMON COLUMN ALIASES",
        "Evidence terms that map to specific columns:",
        "",
    ]
    for concept, matches in _map_aliases(all_columns).items():
        mappings["common_aliases"][concept] = matches
        output_lines += _format_matches(
            f"### '{concept}' evidence hints to:", matches
        )

    # Section 4: Aggregation hints
    output_lines += ["## AGGREGATION PATTERNS FROM EVIDENCE", ""]
    for hint, sql in _AGGREGATION_HINTS.items():
        mappings["aggregation_hints"][hint] = sql
        output_lines.append(f"- **'{hint}'** → `{sql}`")
    output_lines.append("")

    # Section 5: Join hints
    if schema_info and "foreign_keys" in schema_info:
        output_lines += ["## JOIN PATTERNS FROM RELATIONSHIPS", ""]
        for join_hint in _build_join_hints(schema_info):
            mappings["join_hints"].append(join_hint)
            output_lines.append(
                f"- {join_hint['from_table']} → {join_hint['to_table']}: `{join_hint['pattern']}`"
            )
        output_lines.append("")

    # Section 6: Common mistakes to avoid
    output_lines += [
        "## COMMON EVIDENCE INTERPRETATION MISTAKES",
        "",
        "1. **Case sensitivity**: Evidence may use different case than actual values",
        "   - Check value_samples.txt for exact case",
        "2. **Column location**: Evidence may not specify which table",
        "   - Check schema_validation.txt for exact table.column",
        "3. **Aggregation context**: 'most' can mean MAX() or COUNT(*) depending on context",
        "4. **Percentage format**: Could be 0.5 or 50 depending on the column",
    ]

    # Write outputs. The guide contains non-ASCII characters ("→"), so the
    # encoding is pinned instead of relying on the platform default.
    txt_path = os.path.join(output_dir, "evidence_mapping.txt")
    with open(txt_path, "w", encoding="utf-8") as f:
        f.write("\n".join(output_lines))

    json_path = os.path.join(output_dir, "evidence_mapping.json")
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(mappings, f, indent=2)

    print(f"Evidence mappings generated - see {txt_path}")

# Generate the mapping files only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    map_evidence_patterns()