#!/usr/bin/env python3
"""
Output Type Detector
Determines exact output requirements from question patterns.
"""

import json
import os
import re

def detect_output_patterns() -> dict:
    """Build the rule set used to infer a question's expected output type.

    Nothing is matched at call time: this function only assembles and
    returns a static, JSON-serializable knowledge base. The regex strings in
    ``detection_rules`` are neither compiled nor applied here.

    Every ``type`` emitted by a detection rule has a matching entry in
    ``return_types``, so a consumer can always resolve
    ``return_types[rule['type']]``.

    Returns:
        A dict with these keys (in this insertion order):

        * ``detection_rules`` - list of rule dicts (``pattern``, ``type``,
          ``sql_template``, ``output``, ``priority``), listed with
          priority 1 rules before priority 2 rules.
        * ``return_types`` - mapping of output type name to its spec
          (``description``, ``select_pattern``, ``expected_output``,
          ``common_errors``).
        * ``column_format_rules`` - guidance on which columns to return.
        * ``common_mistakes`` - typical output-shape errors and their fixes.
        * ``validation_checks`` - post-generation checks with a priority.
        * ``detection_algorithm`` - ordered description of the steps.
        * ``special_cases`` - situations where evidence overrides defaults.
        * ``keyword_triggers`` - keyword lists grouped by output family.
    """
    # Question-pattern rules. The patterns are written in lowercase;
    # NOTE(review): presumably the consumer lowercases the question or
    # matches case-insensitively - confirm against the caller.
    detection_rules = [
        {
            'pattern': r'what\s+is\s+the\s+percentage',
            'type': 'PERCENTAGE',
            'sql_template': 'SELECT CAST(COUNT(CASE WHEN condition THEN 1 END) AS REAL) * 100 / COUNT(*)',
            'output': 'Single numeric value (0-100)',
            'priority': 1,
        },
        {
            'pattern': r'percentage\s+of',
            'type': 'PERCENTAGE',
            'sql_template': 'SELECT CAST(part AS REAL) * 100 / whole',
            'output': 'Single percentage value',
            'priority': 1,
        },
        {
            'pattern': r'how\s+many',
            'type': 'COUNT',
            'sql_template': 'SELECT COUNT(*) or COUNT(DISTINCT ...)',
            'output': 'Single integer count',
            'priority': 1,
        },
        {
            'pattern': r'(count|number)\s+of',
            'type': 'COUNT',
            'sql_template': 'SELECT COUNT(...)',
            'output': 'Single count value',
            'priority': 1,
        },
        {
            'pattern': r'^list\s+',
            'type': 'LIST',
            'sql_template': 'SELECT column(s) -- actual values',
            'output': 'Multiple rows of values',
            'priority': 1,
        },
        {
            'pattern': r'what\s+are\s+(the|all)',
            'type': 'LIST',
            'sql_template': 'SELECT actual_columns',
            'output': 'List of values',
            'priority': 2,
        },
        {
            'pattern': r'(describe|give|provide|show)\s+.*\s+(and|with)',
            'type': 'MULTI_COLUMN',
            'sql_template': 'SELECT col1, col2, ... -- multiple columns',
            'output': 'Multiple columns per row',
            'priority': 2,
        },
        {
            'pattern': r'which\s+(is|are)',
            'type': 'IDENTIFIER',
            'sql_template': 'SELECT identifying_column(s)',
            'output': 'Identifying information',
            'priority': 2,
        },
        {
            'pattern': r'(the\s+)?(highest|most|maximum)',
            'type': 'SINGLE_MAX',
            'sql_template': 'SELECT ... ORDER BY metric DESC LIMIT 1',
            'output': 'Single row with maximum',
            'priority': 2,
        },
        {
            'pattern': r'(the\s+)?(lowest|least|minimum)',
            'type': 'SINGLE_MIN',
            'sql_template': 'SELECT ... ORDER BY metric ASC LIMIT 1',
            'output': 'Single row with minimum',
            'priority': 2,
        },
        {
            'pattern': r'difference\s+(between|of)',
            'type': 'DIFFERENCE',
            'sql_template': 'SELECT value1 - value2',
            'output': 'Single numeric difference',
            'priority': 2,
        },
        {
            'pattern': r'(average|mean)\s+',
            'type': 'AVERAGE',
            'sql_template': 'SELECT AVG(column)',
            'output': 'Single average value',
            'priority': 2,
        },
        {
            'pattern': r'(sum|total)\s+',
            'type': 'SUM',
            'sql_template': 'SELECT SUM(column)',
            'output': 'Single sum value',
            'priority': 2,
        },
    ]

    # One spec per output type produced by the rules above.
    return_types = {
        'PERCENTAGE': {
            'description': 'Percentage calculation required',
            'select_pattern': 'CAST(numerator AS REAL) * 100 / denominator',
            'expected_output': 'Single numeric value between 0 and 100',
            'common_errors': [
                'Returning count instead of percentage',
                'Forgetting to multiply by 100',
                'Wrong denominator in calculation',
            ],
        },
        'COUNT': {
            'description': 'Count of items required',
            'select_pattern': 'COUNT(*) or COUNT(DISTINCT ...)',
            'expected_output': 'Single integer value',
            'common_errors': [
                'Returning list instead of count',
                'Missing DISTINCT when uniqueness implied',
                'Wrong aggregation level',
            ],
        },
        'LIST': {
            'description': 'List of values required',
            'select_pattern': 'SELECT actual_column_values',
            'expected_output': 'Multiple rows of data',
            'common_errors': [
                'Returning COUNT instead of values',
                'Adding unnecessary columns',
                'Missing DISTINCT when needed',
            ],
        },
        'MULTI_COLUMN': {
            'description': 'Multiple columns per row required',
            'select_pattern': 'SELECT col1, col2, col3',
            'expected_output': 'Rows with multiple columns',
            'common_errors': [
                'Concatenating columns that should be separate',
                'Missing requested columns',
                'Adding unrequested columns',
            ],
        },
        'SINGLE_MAX': {
            'description': 'Single maximum value required',
            'select_pattern': 'ORDER BY metric DESC LIMIT 1',
            'expected_output': 'Single row with highest value',
            'common_errors': [
                'Returning multiple rows',
                'Wrong ordering column',
                'Missing LIMIT 1',
            ],
        },
        'SINGLE_MIN': {
            'description': 'Single minimum value required',
            'select_pattern': 'ORDER BY metric ASC LIMIT 1',
            'expected_output': 'Single row with lowest value',
            'common_errors': [
                'Returning multiple rows',
                'Wrong ordering column',
                'Missing LIMIT 1',
            ],
        },
        # The four specs below cover rule types that previously had no
        # entry here; they are appended so existing key order is unchanged.
        'IDENTIFIER': {
            'description': 'Identifying value(s) of the matching row(s) required',
            'select_pattern': 'SELECT identifying_column(s)',
            'expected_output': 'Identifying information',
            'common_errors': [
                'Returning COUNT instead of the identifying values',
                'Returning a metric instead of the identifier',
                'Adding unrequested columns',
            ],
        },
        'DIFFERENCE': {
            'description': 'Difference between two values required',
            'select_pattern': 'value1 - value2',
            'expected_output': 'Single numeric difference',
            'common_errors': [
                'Subtracting in the wrong order',
                'Returning both values instead of their difference',
                'Computing the two values at different aggregation levels',
            ],
        },
        'AVERAGE': {
            'description': 'Average value required',
            'select_pattern': 'AVG(column)',
            'expected_output': 'Single average value',
            'common_errors': [
                'Returning SUM or COUNT instead of AVG',
                'Integer division when averaging manually',
                'Wrong aggregation level',
            ],
        },
        'SUM': {
            'description': 'Sum of values required',
            'select_pattern': 'SUM(column)',
            'expected_output': 'Single sum value',
            'common_errors': [
                'Returning COUNT instead of SUM',
                'Returning individual rows instead of the total',
                'Wrong aggregation level',
            ],
        },
    }

    # Guidance on which columns to return, and what not to add.
    column_format_rules = [
        {
            'scenario': 'Question asks for "full name"',
            'with_evidence': 'If evidence says "f_name, l_name"',
            'action': 'Return TWO separate columns: f_name, l_name',
            'not': 'Do NOT concatenate',
        },
        {
            'scenario': 'Question asks for "names"',
            'with_evidence': 'Check if firstName, lastName exist',
            'action': 'Return appropriate name columns',
            'not': 'Do NOT add IDs or other data',
        },
        {
            'scenario': 'Question asks to "list X"',
            'action': 'Return ONLY X',
            'not': 'Do NOT add helpful context columns',
        },
        {
            'scenario': 'Question asks for "X and Y"',
            'action': 'Return EXACTLY X and Y',
            'not': 'Do NOT add Z even if related',
        },
        {
            'scenario': 'Question asks for "all"',
            'action': 'Return complete results',
            'not': 'Do NOT use LIMIT (unless with ORDER BY)',
        },
    ]

    # Typical output-shape errors, each with an example and its fix.
    common_mistakes = [
        {
            'mistake': 'Returning COUNT when LIST is requested',
            'example': '"List all users" → Should return user data, not COUNT(*)',
            'fix': 'Check for "list" keyword → return values',
        },
        {
            'mistake': 'Returning LIST when COUNT is requested',
            'example': '"How many users" → Should return COUNT(*), not user list',
            'fix': 'Check for "how many" → return count',
        },
        {
            'mistake': 'Concatenating separate columns',
            'example': 'Evidence: "f_name, l_name" → Keep as two columns',
            'fix': 'Respect evidence column format',
        },
        {
            'mistake': 'Adding unrequested columns',
            'example': '"List URLs" → Return only URLs, not titles too',
            'fix': 'Return ONLY what is explicitly requested',
        },
        {
            'mistake': 'Wrong percentage calculation',
            'example': 'Forgetting * 100 or using wrong denominator',
            'fix': 'Use exact formula from evidence or standard pattern',
        },
    ]

    # Checks to run on a generated query, tagged with a priority label.
    validation_checks = [
        {
            'check': 'Output type matches question intent',
            'method': 'Match primary pattern keywords',
            'priority': 'CRITICAL',
        },
        {
            'check': 'Column count matches request',
            'method': 'Count requested vs returned columns',
            'priority': 'CRITICAL',
        },
        {
            'check': 'Aggregation level is correct',
            'method': 'Verify GROUP BY if aggregating',
            'priority': 'HIGH',
        },
        {
            'check': 'LIMIT used appropriately',
            'method': 'LIMIT 1 for "the most/least"',
            'priority': 'HIGH',
        },
        {
            'check': 'DISTINCT used when needed',
            'method': 'Check for uniqueness requirements',
            'priority': 'MEDIUM',
        },
    ]

    # Human-readable description of how the rules are meant to be applied.
    detection_algorithm = {
        'step1': 'Check primary patterns in priority order',
        'step2': 'If multiple matches, use highest priority',
        'step3': 'Check evidence for output format hints',
        'step4': 'Apply column format rules',
        'step5': 'Validate against common mistakes',
    }

    # Situations in which the evidence takes precedence over the defaults.
    special_cases = [
        {
            'case': 'Percentage with specific formula in evidence',
            'action': 'Use EXACT evidence formula',
            'example': 'Evidence formula overrides standard pattern',
        },
        {
            'case': 'Count with GROUP BY in evidence',
            'action': 'Apply grouping as specified',
            'example': 'GROUP BY before COUNT if evidence shows it',
        },
        {
            'case': 'List with specific column order in evidence',
            'action': 'Use exact column order from evidence',
            'example': 'Respect evidence column sequence',
        },
    ]

    # Plain keyword lists grouped by the output family they suggest.
    keyword_triggers = {
        'percentage_triggers': ['percentage', 'percent', '%', 'rate'],
        'count_triggers': ['how many', 'number of', 'count', 'total number'],
        'list_triggers': ['list', 'show all', 'what are', 'display'],
        'single_triggers': ['the most', 'the least', 'the highest', 'the lowest'],
        'multi_column_triggers': ['describe', 'provide', 'give', 'with their'],
    }

    # Assemble in a fixed key order so the serialized JSON layout is stable.
    return {
        'detection_rules': detection_rules,
        'return_types': return_types,
        'column_format_rules': column_format_rules,
        'common_mistakes': common_mistakes,
        'validation_checks': validation_checks,
        'detection_algorithm': detection_algorithm,
        'special_cases': special_cases,
        'keyword_triggers': keyword_triggers,
    }

def main():
    """Generate the output-pattern rule set and store it as JSON.

    Creates ``./tool_output`` when it does not exist, writes the result of
    ``detect_output_patterns()`` to ``output_patterns.json`` inside it, and
    prints a short summary of what was generated.
    """
    target_dir = "./tool_output"
    os.makedirs(target_dir, exist_ok=True)

    rule_set = detect_output_patterns()

    # Serialize the whole rule set as indented JSON.
    target_file = os.path.join(target_dir, "output_patterns.json")
    with open(target_file, 'w') as handle:
        json.dump(rule_set, handle, indent=2)

    rule_count = len(rule_set['detection_rules'])
    type_count = len(rule_set['return_types'])
    mistake_count = len(rule_set['common_mistakes'])

    summary_lines = (
        "Output type detection complete",
        f"Generated {rule_count} detection rules",
        f"Defined {type_count} return types",
        f"Identified {mistake_count} common mistakes to avoid",
        f"Results saved to {target_file}",
        "\nREMEMBER: Return EXACTLY what is requested, nothing more",
    )
    for line in summary_lines:
        print(line)

# Script entry point: run main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()