#!/usr/bin/env python3
"""
Column Order Analyzer Tool
Determines the expected column order based on question patterns.
This is CRITICAL for accuracy as column order matters significantly.
"""

import json
import os
import sqlite3
import re

def _static_column_order_rules():
    """Return the rule set that does not depend on the database contents.

    The returned dict carries every key of the final report, in output
    order; ``database_specific_patterns`` starts empty and is filled in by
    the caller once the schema has been inspected.
    """
    return {
        "critical_importance": "Column order in SELECT must match the order in the question!",
        # Core ordering patterns
        "ordering_patterns": [
            {
                "pattern": "Question word order is SELECT order",
                "rule": "Columns must appear in SELECT in the same order they appear in the question",
                "examples": [
                    {
                        "question": "What is the name, age and salary",
                        "correct": "SELECT name, age, salary",
                        "wrong": "SELECT age, name, salary"
                    },
                    {
                        "question": "Show the price and quantity",
                        "correct": "SELECT price, quantity",
                        "wrong": "SELECT quantity, price"
                    },
                    {
                        "question": "List cities and their countries",
                        "correct": "SELECT city, country",
                        "wrong": "SELECT country, city"
                    }
                ],
                "priority": "HIGHEST"
            },
            {
                "pattern": "Comma-separated lists maintain order",
                "rule": "Items separated by commas must be returned in that exact order",
                "examples": [
                    {
                        "question": "name, address, and phone number",
                        "correct": "SELECT name, address, phone",
                        "note": "All three in exact order"
                    },
                    {
                        "question": "ID, description and status",
                        "correct": "SELECT id, description, status"
                    }
                ],
                "priority": "HIGHEST"
            },
            {
                "pattern": "'and' conjunction preserves order",
                "rule": "X and Y means X first, then Y",
                "examples": [
                    {
                        "question": "username and password",
                        "correct": "SELECT username, password",
                        "wrong": "SELECT password, username"
                    },
                    {
                        "question": "first and last name",
                        "correct": "SELECT firstname, lastname"
                    }
                ],
                "priority": "HIGH"
            },
            {
                "pattern": "Question focus determines primary column",
                "rule": "The main subject comes first, properties follow",
                "examples": [
                    {
                        "question": "Which product has the highest price",
                        "correct": "SELECT product_name",
                        "note": "Return product, not price (unless both asked)"
                    },
                    {
                        "question": "Employee with their department and salary",
                        "correct": "SELECT employee_name, department, salary",
                        "note": "Employee first, then properties"
                    }
                ]
            }
        ],
        # Multi-column specific rules
        "multi_column_rules": [
            {
                "scenario": "Multiple entities requested",
                "rule": "Return ALL entities in the order mentioned",
                "examples": [
                    {
                        "question": "List the cities and countries",
                        "correct": "SELECT city_name, country_name",
                        "wrong": "SELECT city_name",
                        "error": "Missing country - must return both!"
                    },
                    {
                        "question": "What are the order ID, date and total",
                        "correct": "SELECT order_id, order_date, total_amount",
                        "wrong": "SELECT order_id, total_amount",
                        "error": "Missing date - all three required!"
                    }
                ],
                "critical": "Omitting any requested column is an error"
            },
            {
                "scenario": "Properties of an entity",
                "rule": "Entity identifier first, then properties in order",
                "example": {
                    "question": "Customer name with their email and phone",
                    "correct": "SELECT name, email, phone",
                    "pattern": "Entity (name) → Property1 (email) → Property2 (phone)"
                }
            },
            {
                "scenario": "Related entities",
                "rule": "Primary entity first, related entities follow",
                "example": {
                    "question": "Products and their categories",
                    "correct": "SELECT product_name, category_name",
                    "pattern": "Main entity → Related entity"
                }
            }
        ],
        # Evidence-based ordering
        "evidence_order_rules": [
            {
                "pattern": "Evidence 'refers to' with commas",
                "rule": "When evidence says 'X refers to Y, Z', return Y and Z in that order",
                "examples": [
                    {
                        "evidence": "name refers to FirstName, LastName",
                        "correct": "SELECT FirstName, LastName",
                        "wrong": "SELECT LastName, FirstName",
                        "critical": "Evidence order overrides everything"
                    },
                    {
                        "evidence": "address refers to Street, City, Zip",
                        "correct": "SELECT Street, City, Zip",
                        "note": "All three columns in exact order"
                    }
                ],
                "priority": "HIGHEST - Evidence always wins"
            },
            {
                "pattern": "Evidence formulas preserve order",
                "rule": "Column order in formulas must be maintained",
                "example": {
                    "evidence": "full_address = CONCAT(street, ', ', city, ', ', state)",
                    "impact": "Maintain street → city → state order"
                }
            }
        ],
        # Filled in from the schema by analyze_column_order().
        "database_specific_patterns": [],
        # Common ordering mistakes
        "common_order_mistakes": [
            {
                "mistake": "Reversing column order",
                "wrong_pattern": "Returning columns in reverse of question order",
                "example": {
                    "question": "Show ID and Name",
                    "wrong": "SELECT name, id",
                    "correct": "SELECT id, name"
                },
                "impact": "Results marked as incorrect even if data is right"
            },
            {
                "mistake": "Alphabetical ordering",
                "wrong_pattern": "Sorting columns alphabetically instead of by question order",
                "example": {
                    "question": "List salary, name, and department",
                    "wrong": "SELECT department, name, salary",
                    "correct": "SELECT salary, name, department"
                }
            },
            {
                "mistake": "Missing columns in multi-column requests",
                "wrong_pattern": "Returning only some of the requested columns",
                "example": {
                    "question": "What is the product, price and quantity",
                    "wrong": "SELECT product, price",
                    "correct": "SELECT product, price, quantity"
                },
                "impact": "Incomplete results fail validation"
            },
            {
                "mistake": "ID instead of name in compound requests",
                "wrong_pattern": "Returning ID when name is part of multi-column request",
                "example": {
                    "question": "Show the capital city and population",
                    "wrong": "SELECT capital_id, population",
                    "correct": "SELECT city.name, country.population",
                    "note": "Need JOIN to get city name from capital ID"
                }
            }
        ],
        # Validation checklist
        "validation_checklist": [
            "✓ Column order matches question word order?",
            "✓ All requested columns included?",
            "✓ Evidence-specified order followed?",
            "✓ Multi-column 'refers to' patterns handled?",
            "✓ Human-readable columns used instead of IDs?",
            "✓ Compound requests return all components?",
            "✓ Properties listed after entities?",
            "✓ No extra columns added?"
        ],
        # Critical summary
        "critical_summary": {
            "rule_1": "Question order = SELECT order",
            "rule_2": "Evidence order overrides question order",
            "rule_3": "ALL requested columns must be returned",
            "rule_4": "Order matters for accuracy - wrong order = wrong answer",
            "rule_5": "Multi-column evidence (X refers to Y, Z) returns multiple columns"
        },
    }


def _read_table_columns(cursor):
    """Return ``(tables, table_columns)`` read from the database schema.

    ``tables`` lists the table names from ``sqlite_master`` sorted by name,
    skipping names that start with ``sqlite_``. ``table_columns`` maps each
    of those names to its column names as reported by ``PRAGMA table_info``.
    """
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name")
    tables = [row[0] for row in cursor.fetchall() if not row[0].startswith("sqlite_")]

    table_columns = {}
    for table in tables:
        # PRAGMA arguments cannot be bound as parameters, so quote the name
        # as an identifier and double any embedded quote character.
        quoted = '"' + table.replace('"', '""') + '"'
        cursor.execute(f"PRAGMA table_info({quoted})")
        # Index 1 of each table_info row is the column name.
        table_columns[table] = [col[1] for col in cursor.fetchall()]
    return tables, table_columns


def _database_specific_patterns(tables, table_columns):
    """Build the schema-derived pattern entries for the given tables.

    For each table this emits, in order: a person-name entry when both
    ``FirstName`` and ``LastName`` columns exist (exact, case-sensitive
    match), an address entry when more than one column name contains an
    address keyword, and a temporal entry when more than one column name
    contains a date/time keyword. Keyword matching is a case-insensitive
    substring test.
    """
    address_keywords = ('street', 'city', 'state', 'zip', 'country')
    temporal_keywords = ('date', 'time', 'year', 'month', 'day')

    patterns = []
    for table in tables:
        columns = table_columns.get(table, [])

        if 'FirstName' in columns and 'LastName' in columns:
            patterns.append({
                "table": table,
                "pattern": "Person name columns",
                "standard_order": "FirstName, LastName",
                "note": "Unless evidence specifies otherwise"
            })

        # Check for address patterns
        addr_cols = [col for col in columns
                     if any(x in col.lower() for x in address_keywords)]
        if len(addr_cols) > 1:
            patterns.append({
                "table": table,
                "pattern": "Address components",
                "standard_order": "Street, City, State, Zip, Country",
                "note": "Geographic hierarchy from specific to general"
            })

        # Check for date/time patterns
        date_cols = [col for col in columns
                     if any(x in col.lower() for x in temporal_keywords)]
        if len(date_cols) > 1:
            patterns.append({
                "table": table,
                "pattern": "Temporal columns",
                "standard_order": "Date before Time, Start before End",
                "example": "start_date, end_date, duration"
            })
    return patterns


def analyze_column_order(db_path="database.sqlite", output_dir="tool_output", max_tables=3):
    """Analyze and generate column ordering rules.

    Builds the fixed ordering rules, adds patterns detected from the schema
    of the SQLite database at ``db_path``, writes the result as JSON to
    ``<output_dir>/column_order_analysis.json`` and prints a short summary.

    Args:
        db_path: Path of the SQLite database to inspect.
        output_dir: Directory for the JSON report; created if missing.
        max_tables: How many tables (in name order) are scanned for
            schema-specific patterns. ``None`` scans every table.

    Returns:
        The rules dict that was written to disk. If reading the schema
        fails, the dict still holds all fixed rules and additionally an
        ``"error"`` key with the exception message.
    """
    os.makedirs(output_dir, exist_ok=True)

    # The fixed rules are built before touching the database so that a
    # schema-read failure cannot wipe them from the report.
    column_order_rules = _static_column_order_rules()

    conn = sqlite3.connect(db_path)
    try:
        tables, table_columns = _read_table_columns(conn.cursor())
        column_order_rules["database_specific_patterns"] = _database_specific_patterns(
            tables[:max_tables], table_columns
        )
    except Exception as e:
        # Best-effort boundary: record the failure in the report instead of
        # aborting, so the fixed rules are still written out.
        column_order_rules["error"] = str(e)
    finally:
        conn.close()

    # Write output
    output_path = f"{output_dir}/column_order_analysis.json"
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(column_order_rules, f, indent=2)

    print("Column order analysis complete")
    print(f"Generated {len(column_order_rules['ordering_patterns'])} ordering patterns")
    print(f"Identified {len(column_order_rules['common_order_mistakes'])} common mistakes")
    print("CRITICAL: Column order determines accuracy!")
    print(f"Results saved to {output_path}")

    return column_order_rules

# Script entry point: when this file is executed directly (not imported),
# run the analysis with the function's default arguments.
if __name__ == "__main__":
    analyze_column_order()