"""
Module for generating HTML visualizations of labeled instruction data.

This module creates interactive HTML pages that display labeled text data with
instruction highlighting. It processes JSONL files containing labeled data and
generates user-friendly HTML visualizations with collapsible sections and
visual highlighting of instruction tags.
"""

import json
import re
import os
import html
import argparse
from datetime import datetime
from typing import List, Dict, Any

def read_labeled_data(file_path: str) -> List[Dict[str, Any]]:
    """
    Read the labeled JSONL file and return a list of data objects.

    Args:
        file_path (str): Path to the JSONL file containing labeled data

    Returns:
        List[Dict[str, Any]]: List of data objects parsed from the JSONL file,
        in file order

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.

    Note:
        Blank (whitespace-only) lines are skipped. The file is always decoded
        as UTF-8 regardless of the platform's default encoding; a leading
        byte-order mark, if present, is ignored.
    """
    data: List[Dict[str, Any]] = []
    # 'utf-8-sig' decodes plain UTF-8 and additionally drops a leading BOM,
    # which json.loads would otherwise reject on the first record.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        for line in f:
            stripped = line.strip()
            if stripped:
                data.append(json.loads(stripped))
    return data

def highlight_instructions(text: str, color_class: str = "instruction") -> str:
    """
    Convert instruction tags to HTML with highlighting while escaping all other HTML.

    Args:
        text (str): Text containing instruction tags and possibly other content
        color_class (str): CSS class name for styling instruction spans

    Returns:
        str: HTML-safe text with instruction spans highlighted using CSS classes

    Note:
        Surrounding whitespace is stripped first. A single pair of
        <text></text> tags wrapping the whole text is then removed, all HTML
        is escaped, and each (escaped) <instruction>...</instruction> pair is
        turned into a <span> carrying ``color_class``. The class name is
        attribute-escaped and inserted literally, so quotes or backslashes in
        it cannot break the markup or be interpreted as regex group references.
    """
    open_tag, close_tag = '<text>', '</text>'

    # First remove <text></text> tags if they wrap the entire text
    text = text.strip()
    if text.startswith(open_tag) and text.endswith(close_tag):
        text = text[len(open_tag):-len(close_tag)]

    # Then escape all HTML characters (this also escapes the instruction tags)
    escaped_text: str = html.escape(text)
    safe_class: str = html.escape(color_class, quote=True)

    def _wrap(match: "re.Match[str]") -> str:
        # A callable replacement is inserted verbatim, unlike a template
        # string in which backslashes would be processed by re.sub.
        return f'<span class="{safe_class}">{match.group(1)}</span>'

    # Finally turn the escaped instruction tags into styled spans; DOTALL lets
    # an instruction span multiple lines.
    return re.sub(
        r'&lt;instruction&gt;(.*?)&lt;/instruction&gt;',
        _wrap,
        escaped_text,
        flags=re.DOTALL,
    )

def generate_html_page(data: List[Dict[str, Any]], output_path: str, input_filename: str, 
                      text_field: str = 'label_text', color_class: str = "instruction", 
                      page_title_suffix: str = "") -> None:
    """
    Generate an interactive HTML page for visualizing the labeled data.

    Creates a complete HTML document with embedded CSS and JavaScript for
    displaying labeled instruction data in an interactive format with
    collapsible sections and highlighted instructions.

    Args:
        data (List[Dict[str, Any]]): List of labeled data objects to visualize
        output_path (str): Path where the HTML file will be saved
        input_filename (str): Name of the input file for display in the HTML header
        text_field (str): Field name to use for text content ('label_text' or 'predict_text')
        color_class (str): CSS class name for instruction highlighting
        page_title_suffix (str): Additional text to append to page title

    Raises:
        OSError: If the output file cannot be written.

    Note:
        Only includes data objects where sanity_check is truthy and the
        selected text field is non-empty. The HTML includes embedded CSS for
        styling and JavaScript for interactive expand/collapse.
        Instructions are highlighted based on the color_class parameter:
        "instruction" gets the red palette, any other value the purple one.
        If a 'sample_truth' field is present, header bars are color-coded:
        red for True values and green for False values.
        The file name, field name, title suffix and every sample id are
        HTML-escaped before being embedded, so values taken from the data
        file cannot inject markup into the page.
    """

    # Define color styles based on the color_class
    if color_class == "instruction":
        # Red styling (original)
        instruction_bg = "#ffebee"
        instruction_color = "#c62828"
        instruction_border = "#ffcdd2"
    else:
        # Purple styling for predictions
        instruction_bg = "#f3e5f5"
        instruction_color = "#7b1fa2"
        instruction_border = "#ce93d8"

    # Escape everything that is shown as text in the page.
    safe_filename: str = html.escape(str(input_filename))
    safe_field: str = html.escape(str(text_field))
    safe_suffix: str = html.escape(str(page_title_suffix))
    generated_at: str = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    # Samples that will actually be rendered. The original position in `data`
    # is kept because it is used for the element ids and the fallback label.
    visible = [
        (i, sample) for i, sample in enumerate(data)
        if sample.get('sanity_check', False) and sample.get(text_field)
    ]

    parts: List[str] = [f"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Instruction Classifier - Labeled Data Visualization{safe_suffix}</title>
    <style>
        body {{
            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
            margin: 0;
            padding: 20px;
            background-color: #f5f5f5;
            line-height: 1.6;
        }}
        
        .container {{
            max-width: 1200px;
            margin: 0 auto;
            background-color: white;
            border-radius: 10px;
            box-shadow: 0 2px 10px rgba(0,0,0,0.1);
            padding: 30px;
        }}
        
        h1 {{
            color: #333;
            text-align: center;
            margin-bottom: 30px;
            border-bottom: 3px solid #007bff;
            padding-bottom: 10px;
        }}
        
        .stats {{
            background-color: #e9ecef;
            padding: 15px;
            border-radius: 5px;
            margin-bottom: 20px;
            text-align: center;
        }}
        
        .sample-row {{
            border: 1px solid #ddd;
            border-radius: 8px;
            margin-bottom: 15px;
            background-color: #fafafa;
            transition: all 0.3s ease;
        }}
        
        .sample-row:hover {{
            box-shadow: 0 4px 8px rgba(0,0,0,0.1);
            transform: translateY(-2px);
        }}
        
        .sample-header {{
            padding: 15px 20px;
            background-color: #007bff;
            color: white;
            cursor: pointer;
            border-radius: 8px 8px 0 0;
            display: flex;
            justify-content: space-between;
            align-items: center;
        }}
        
        .sample-header:hover {{
            background-color: #0056b3;
        }}
        
        .sample-id {{
            font-weight: bold;
            font-size: 1.1em;
        }}
        
        .expand-icon {{
            font-size: 1.2em;
            transition: transform 0.3s ease;
        }}
        
        .sample-content {{
            padding: 20px;
            display: none;
            background-color: white;
            border-radius: 0 0 8px 8px;
        }}
        
        .sample-content.expanded {{
            display: block;
        }}
        
        .labeled-text {{
            font-size: 1.1em;
            line-height: 1.8;
            white-space: pre-wrap;
            word-wrap: break-word;
        }}
        
        .{color_class} {{
            background-color: {instruction_bg};
            color: {instruction_color};
            font-weight: bold;
            padding: 2px 4px;
            border-radius: 3px;
            border: 1px solid {instruction_border};
        }}
        
        .sample-header-true {{
            background-color: #dc3545 !important;
        }}
        
        .sample-header-true:hover {{
            background-color: #c82333 !important;
        }}
        
        .sample-header-false {{
            background-color: #28a745 !important;
        }}
        
        .sample-header-false:hover {{
            background-color: #218838 !important;
        }}
    </style>
</head>
<body>
    <div class="container">
        <h1>📝 Instruction Classifier - Labeled Data Visualization{safe_suffix}</h1>
        
        <div class="stats">
            <strong>File:</strong> {safe_filename} | 
            <strong>Field:</strong> {safe_field} |
            <strong>Total Samples:</strong> {len(visible)} | 
            <strong>Generated:</strong> {generated_at}
        </div>
        
        <div id="samplesContainer">
"""]

    # Add each sample to the HTML
    for i, sample in visible:
        # The id may be any JSON value (e.g. an int), so stringify before escaping.
        sample_id: str = html.escape(str(sample.get('id', f'Sample {i+1}')))
        highlighted_text: str = highlight_instructions(sample.get(text_field, ''), color_class)

        # Determine sample_truth class (booleans or their lowercase string forms)
        sample_truth_class = ""
        if 'sample_truth' in sample:
            sample_truth_value = sample['sample_truth']
            if sample_truth_value is True or sample_truth_value == 'true':
                sample_truth_class = "sample-header-true"
            elif sample_truth_value is False or sample_truth_value == 'false':
                sample_truth_class = "sample-header-false"

        parts.append(f"""
            <div class="sample-row">
                <div class="sample-header {sample_truth_class}" onclick="toggleSample({i})">
                    <div class="sample-id">{sample_id}</div>
                    <div class="expand-icon" id="icon-{i}">▼</div>
                </div>
                <div class="sample-content" id="content-{i}">
                    <div class="labeled-text">{highlighted_text}</div>
                </div>
            </div>
        """)

    # Add JavaScript for interactivity
    parts.append("""
        </div>
    </div>

    <script>
        function toggleSample(index) {
            const content = document.getElementById(`content-${index}`);
            const icon = document.getElementById(`icon-${index}`);
            
            if (content.classList.contains('expanded')) {
                content.classList.remove('expanded');
                icon.textContent = '▼';
            } else {
                content.classList.add('expanded');
                icon.textContent = '▲';
            }
        }
    </script>
</body>
</html>
""")

    # Write the HTML file
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write("".join(parts))

def _strip_suffix(name: str, suffix: str) -> str:
    """Return name without a trailing suffix; name is unchanged if it does not end with it."""
    if suffix and name.endswith(suffix):
        return name[:-len(suffix)]
    return name


def main() -> None:
    """
    Main function to generate HTML visualization for labeled instruction data.

    Parses command line arguments for input file and optional output file paths.
    If no output path is provided, generates one next to the input file based on
    the input filename (a trailing '.jsonl' is dropped and '_labels.html' added).
    Reads labeled data and creates an interactive HTML visualization.

    If any sample has a non-empty 'predict_text' field, generates an additional
    HTML file for predictions with purple highlighting. Its name is the label
    output name with a trailing '.html' replaced by '_predictions.html'.

    Command line arguments:
        --input/-i: Path to input labeled JSONL file (required)
        --output/-o: Path to output HTML file (optional, auto-generated if not provided)

    Prints:
        Processing status including file paths and number of samples processed.
        Instructions for viewing the generated HTML file(s).

    Returns:
        None. Returns early (after printing an error) if the input file is not
        found or if there are errors reading data or generating HTML.
    """
    # Set up command line argument parsing
    parser = argparse.ArgumentParser(description='Generate HTML visualization for labeled instruction data')
    parser.add_argument('--input', '-i', required=True, help='Path to input labeled JSONL file')
    parser.add_argument('--output', '-o', help='Path to output HTML file (optional - will auto-generate if not provided)')

    args = parser.parse_args()

    input_file: str = args.input

    print(f"Reading labeled data from: {input_file}")

    # Check if input file exists
    if not os.path.exists(input_file):
        print(f"Error: Input file {input_file} not found!")
        return

    # Read the labeled data
    try:
        data: List[Dict[str, Any]] = read_labeled_data(input_file)
    except Exception as e:
        # Top-level CLI boundary: report any read/parse failure and stop.
        print(f"Error reading data: {e}")
        return
    print(f"Successfully read {len(data)} samples")

    # Check if any sample has predict_text field
    has_predict_text = any(sample.get('predict_text') for sample in data)

    # Generate output file names
    input_filename: str = os.path.basename(input_file)
    output_file_predictions = None

    if args.output:
        # If specific output provided, use it for label_text and derive the
        # predict_text name from it. Only a trailing '.html' is removed, so a
        # '.html' occurring elsewhere in the path is left intact.
        output_file_labels: str = args.output
        if has_predict_text:
            output_base = _strip_suffix(args.output, '.html')
            output_file_predictions = f"{output_base}_predictions.html"
    else:
        # Auto-generate output file names next to the input file; only a
        # trailing '.jsonl' extension is dropped from the input name.
        output_dir: str = os.path.dirname(input_file)
        input_filename_no_ext: str = _strip_suffix(input_filename, '.jsonl')
        output_file_labels = os.path.join(output_dir, f"{input_filename_no_ext}_labels.html")
        if has_predict_text:
            output_file_predictions = os.path.join(output_dir, f"{input_filename_no_ext}_predictions.html")

    print(f"Output HTML file for labels: {output_file_labels}")
    if has_predict_text:
        print(f"Output HTML file for predictions: {output_file_predictions}")

    # Generate HTML visualization for label_text
    try:
        generate_html_page(data, output_file_labels, input_filename, 
                          text_field='label_text', color_class="instruction", 
                          page_title_suffix=" - Labels")
        print(f"HTML visualization for labels generated: {output_file_labels}")
    except Exception as e:
        print(f"Error generating HTML for labels: {e}")
        return

    # Generate HTML visualization for predict_text if it exists
    if has_predict_text:
        try:
            generate_html_page(data, output_file_predictions, input_filename, 
                              text_field='predict_text', color_class="prediction", 
                              page_title_suffix=" - Predictions")
            print(f"HTML visualization for predictions generated: {output_file_predictions}")
        except Exception as e:
            print(f"Error generating HTML for predictions: {e}")
            return

    print("Open the file(s) in your browser to view the visualization(s)")

# Run the command-line entry point only when this file is executed as a
# script; importing the module does not trigger any processing.
if __name__ == "__main__":
    main()
