import re
import json
import uuid
from datetime import datetime
from typing import Dict, Any, List
from pathlib import Path


def clean_response_text(text: str) -> str:
    """
    Pull the JSON payload out of a raw model response.

    Extraction strategies are tried in order and the first hit wins:
      1. the content of a <json>...</json> block,
      2. the content of a ```json fenced block,
      3. the content of a plain ``` fenced block, accepted only when it
         starts with '{' and ends with '}',
      4. the span from the first '{' to the last '}' anywhere in the text.

    Args:
        text: Original response text

    Returns:
        The extracted text with surrounding whitespace stripped, or an empty
        string when no strategy matches
    """
    # Explicitly tagged payloads are trusted as-is.
    tagged_patterns = (
        r'<json>\s*([\s\S]*?)\s*</json>',
        r'```json\s*([\s\S]*?)\s*```',
    )
    for pattern in tagged_patterns:
        tagged = re.search(pattern, text, re.DOTALL)
        if tagged:
            return tagged.group(1).strip()

    # An untagged fence may hold anything, so only accept it when the content
    # looks like a JSON object.
    fenced = re.search(r'```\s*([\s\S]*?)\s*```', text, re.DOTALL)
    if fenced:
        candidate = fenced.group(1).strip()
        if candidate.startswith('{') and candidate.endswith('}'):
            return candidate

    # Last resort: greedy match from the first '{' to the last '}'.
    bare_object = re.search(r'\{[\s\S]*\}', text, re.DOTALL)
    if not bare_object:
        return ""
    return bare_object.group().strip()


def fix_truncated_json(json_text: str) -> str:
    """
    Fix possible truncated JSON

    Text is treated as truncated only when its last non-whitespace characters
    are '...'. In that case the marker is removed, an unterminated string is
    closed, and every still-open object/array is closed in nesting order
    (innermost first) so that '{"a": [1, 2...' becomes '{"a": [1, 2]}'.
    Braces, brackets and escaped quotes that appear inside string values are
    not counted as structure.

    Args:
        json_text: Possible truncated JSON text

    Returns:
        Fixed JSON text; the input is returned unchanged when it does not end
        with '...'
    """
    stripped = json_text.rstrip()
    if not stripped.endswith('...'):
        return json_text

    # Drop the truncation marker itself.
    repaired = stripped[:-3]

    # Single pass that tracks string state so structural characters inside
    # string values are ignored. `pending_closers` is a stack of the closing
    # characters still owed, innermost last.
    pending_closers = []
    in_string = False
    escaped = False
    for ch in repaired:
        if in_string:
            if escaped:
                escaped = False
            elif ch == '\\':
                escaped = True
            elif ch == '"':
                in_string = False
        elif ch == '"':
            in_string = True
        elif ch == '{':
            pending_closers.append('}')
        elif ch == '[':
            pending_closers.append(']')
        elif ch in '}]' and pending_closers and pending_closers[-1] == ch:
            pending_closers.pop()

    if in_string:
        if escaped:
            # A dangling backslash would escape the quote we are about to add.
            repaired = repaired[:-1]
        repaired += '"'

    # Close innermost structures first.
    return repaired + ''.join(reversed(pending_closers))


def parse_openai_response(response_text: str, question_data: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    Parse OpenAI response (for template generation)

    The JSON payload is extracted with clean_response_text and decoded; each
    entry of its "templates" collection becomes one template record. When the
    payload cannot be decoded, or decodes to a shape that cannot be walked
    (for example "templates" is null or holds non-object entries),
    try_alternative_parsing is consulted. If that also yields nothing, a
    single fallback template built from question_data['refined_question'] is
    returned.

    Args:
        response_text: OpenAI API response text
        question_data: Question data; 'id', 'question_type' and 'topic' are
            read for every template, 'refined_question' only for the fallback

    Returns:
        Parsed template list. It is empty only when the payload decoded
        cleanly but contained no "templates" entries.

    Raises:
        KeyError: If question_data lacks one of the keys listed above
    """
    templates = []

    try:
        # Clean the response text
        cleaned_text = clean_response_text(response_text)

        # Try to parse JSON directly
        response_json = json.loads(cleaned_text)

        if "templates" in response_json:
            for template_data in response_json["templates"]:
                templates.append({
                    'id': str(uuid.uuid4()),
                    'source_question_id': question_data['id'],
                    'template': template_data.get('template', ''),
                    'placeholders': template_data.get('placeholders', []),
                    'description': template_data.get('description', ''),
                    'question_type': question_data['question_type'],
                    'topic': question_data['topic'],
                    'created_time': datetime.now().isoformat(),
                    'generation_method': 'openai'
                })

    except (json.JSONDecodeError, TypeError, AttributeError) as e:
        # TypeError / AttributeError mean the JSON was syntactically valid but
        # not shaped like {"templates": [ {...}, ... ]}. Model output is not
        # trustworthy, so treat that like a decode failure instead of letting
        # it escape. Any records built before the failure are discarded below.
        print(f"❌ JSON parsing failed: {e}")

        # Try multiple parsing methods
        templates = try_alternative_parsing(response_text, question_data)

        if not templates:
            # Create a basic template as a backup
            templates.append({
                'id': str(uuid.uuid4()),
                'source_question_id': question_data['id'],
                'template': question_data['refined_question'],
                'placeholders': [],
                'description': f"Based on the {question_data['question_type']} template (fallback)",
                'question_type': question_data['question_type'],
                'topic': question_data['topic'],
                'created_time': datetime.now().isoformat(),
                'generation_method': 'fallback'
            })

    return templates


def _build_template(question_data: Dict[str, Any], template_text: Any, placeholders: Any,
                    description: Any, generation_method: str) -> Dict[str, Any]:
    """Assemble one template record from parsed values plus the source question's metadata."""
    return {
        'id': str(uuid.uuid4()),
        'source_question_id': question_data['id'],
        'template': template_text,
        'placeholders': placeholders,
        'description': description,
        'question_type': question_data['question_type'],
        'topic': question_data['topic'],
        'created_time': datetime.now().isoformat(),
        'generation_method': generation_method
    }


def try_alternative_parsing(response_text: str, question_data: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    Try multiple ways to parse the response

    Three strategies are attempted in order; the first one that produces at
    least one template wins:
      1. decode the outermost {...} span and read its "templates" entries,
      2. decode the cleaned response as a single object carrying "question"
         and "answer" and turn its question into one template,
      3. pull the first "question": "..." value out of the raw text by regex.

    Each strategy is best-effort: any exception it raises is printed and the
    next strategy runs. A strategy that parses successfully but finds no
    templates no longer ends the search, and records built by a strategy that
    later fails are never carried over into the result.

    Args:
        response_text: Response text
        question_data: Question data; 'id', 'question_type' and 'topic' are
            copied into every template

    Returns:
        Parsed template list, empty when every strategy fails
    """
    # Method 1: Find JSON blocks
    try:
        # Find {} surrounded JSON content
        json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
        if json_match:
            # Clean possible truncated issues
            json_text = fix_truncated_json(json_match.group())
            response_json = json.loads(json_text)

            if "templates" in response_json:
                parsed = [
                    _build_template(
                        question_data,
                        template_data.get('template', ''),
                        template_data.get('placeholders', []),
                        template_data.get('description', ''),
                        'openai_alt1',
                    )
                    for template_data in response_json["templates"]
                ]
                if parsed:
                    print("Alternative parsing method 1 succeeded")
                    return parsed

            # Parsed fine but nothing usable: let the remaining methods try.
            print("Alternative parsing method 1 found no templates")

    except Exception as e:
        print(f"Alternative parsing method 1 failed: {e}")

    # Method 2: Process a single QA pair (not templates format)
    try:
        # Fix possible truncated issues
        cleaned_text = fix_truncated_json(clean_response_text(response_text))
        response_json = json.loads(cleaned_text)

        # If it's directly a QA pair format
        if "question" in response_json and "answer" in response_json:
            template = _build_template(
                question_data,
                response_json.get('question', ''),
                response_json.get('placeholders', []),
                response_json.get('description', 'Template generated based on single QA pair'),
                'openai_alt2',
            )
            print("✅ Alternative parsing method 2 succeeded")
            return [template]

    except Exception as e:
        print(f"Alternative parsing method 2 failed: {e}")

    # Method 3: Regular expression extraction of the question text
    try:
        question_match = re.search(r'"question":\s*"([^"]+)"', response_text)

        if question_match:
            template = _build_template(
                question_data,
                question_match.group(1),
                [],
                "Extracted question from response text",
                'openai_regex',
            )
            print("✅ Regular expression extraction method succeeded")
            return [template]

    except Exception as e:
        print(f"Regular expression extraction method failed: {e}")

    print("❌ All parsing methods failed")
    return []


def _unescape_json_string(raw: str) -> str:
    """Decode the escape sequences in the body of a JSON string literal (without its quotes)."""
    try:
        # strict=False tolerates raw newlines/tabs, which models often emit
        # inside string values.
        decoded = json.loads(f'"{raw}"', strict=False)
    except ValueError:
        decoded = None
    if isinstance(decoded, str):
        return decoded
    # Malformed escapes (e.g. cut off mid-sequence): fall back to the two
    # replacements that matter most.
    return raw.replace('\\"', '"').replace('\\n', '\n')


def _build_qa_record(question: str, answer: str, references: List[str], original_question: str,
                     question_type: str, retrieved_docs: List[Dict[str, Any]],
                     generation_method: str) -> Dict[str, Any]:
    """Assemble a QA record from extracted text plus the caller-supplied context."""
    return {
        'id': str(uuid.uuid4()),
        'question': question,
        'answer': answer,
        'references': references,
        'original_question': original_question,
        'question_type': question_type,
        'retrieved_docs': retrieved_docs,
        'generated_time': datetime.now().isoformat(),
        'generation_method': generation_method
    }


def parse_qa_response_alternative(response_text: str, original_question: str,
                                  question_type: str, retrieved_docs: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Alternative parsing method for QA pairs

    Three strategies are attempted in order; the first that succeeds wins:
      1. clean the response, repair truncation and decode it as JSON; it is
         accepted when both 'question' and 'answer' are non-empty,
      2. extract the "question", "answer" and optional "references" values
         from the raw text by regex (escaped quotes inside the values are
         honoured, and an answer cut off before its closing quote is kept),
      3. take the last line ending in a question mark seen before the first
         answer-looking line as the question, and that answer-looking line
         (capped at 200 characters) as the answer.

    Each strategy is best-effort: exceptions are printed and the next
    strategy runs.

    Args:
        response_text: Response text
        original_question: Original question
        question_type: Question type
        retrieved_docs: List of retrieved documents

    Returns:
        Parsed QA pair data, or an empty dict when every strategy fails
    """

    # Method 1: Clean and try JSON parsing again
    try:
        cleaned_text = fix_truncated_json(clean_response_text(response_text))
        qa_data = json.loads(cleaned_text)

        # Verify necessary fields exist
        if qa_data.get('question') and qa_data.get('answer'):
            qa_data.update({
                'id': str(uuid.uuid4()),
                'original_question': original_question,
                'question_type': question_type,
                'retrieved_docs': retrieved_docs,
                'generated_time': datetime.now().isoformat(),
                'generation_method': 'retrieval_augmented_alt1'
            })
            print("Alternative parsing method 1 succeeded")
            return qa_data

    except Exception as e:
        print(f"Alternative parsing method 1 failed: {e}")

    # Method 2: Regular expression extraction
    try:
        # String body = runs of ordinary characters interleaved with
        # backslash escapes. The backslash must be excluded from the ordinary
        # class, otherwise an escaped quote terminates the match early.
        string_body = r'[^"\\]*(?:\\.[^"\\]*)*'

        # Extract question
        question_match = re.search(r'"question":\s*"(' + string_body + r')"', response_text, re.DOTALL)

        # Extract answer (possibly across multiple lines); a truncated answer
        # may run to the end of the text without a closing quote.
        answer_match = re.search(r'"answer":\s*"(' + string_body + r')(?:"|\\?$)', response_text, re.DOTALL)

        # Extract references (optional)
        references_match = re.search(r'"references":\s*\[(.*?)\]', response_text, re.DOTALL)

        if question_match and answer_match:
            # Handle possible escaped characters
            question = _unescape_json_string(question_match.group(1))
            answer = _unescape_json_string(answer_match.group(1))

            # If answer ends with ..., it means it's truncated; flag it
            if answer.endswith('...'):
                answer = answer[:-3] + " (answer may be truncated)"

            # Simple reference extraction: every quoted item in the array
            references = []
            if references_match:
                references = re.findall(r'"([^"]+)"', references_match.group(1))

            qa_data = _build_qa_record(
                question, answer, references,
                original_question, question_type, retrieved_docs,
                'retrieval_augmented_regex',
            )

            print("Alternative parsing method 2 (regular expression extraction) succeeded")
            return qa_data

    except Exception as e:
        print(f"Alternative parsing method 2 failed: {e}")

    # Method 3: Basic text extraction (last backup option)
    try:
        # If all methods fail, at least extract some basic information
        potential_question = ""
        potential_answer = ""
        answer_markers = ('yes', 'no', 'can', 'according to')

        for line in response_text.split('\n'):
            line = line.strip()
            # Accept the ASCII question mark and the full-width one (U+FF1F)
            # used in CJK text.
            if line.endswith(('?', '\uff1f')):
                potential_question = line
            elif len(line) > 50 and any(marker in line for marker in answer_markers):
                potential_answer = line[:200]  # Limit length
                break

        if potential_question:
            qa_data = _build_qa_record(
                potential_question,
                potential_answer or "Unable to parse complete answer",
                [],
                original_question, question_type, retrieved_docs,
                'retrieval_augmented_text_extract',
            )

            print("Alternative parsing method 3 (text extraction) succeeded")
            return qa_data

    except Exception as e:
        print(f"Alternative parsing method 3 failed: {e}")

    print("❌ All alternative parsing methods failed")
    return {}


def validate_qa_pair(qa_data: Dict[str, Any]) -> bool:
    """
    Validate the completeness of QA pair data

    A pair is valid when 'question' and 'answer' are both strings and, after
    stripping surrounding whitespace, hold at least 5 and 10 characters
    respectively. Missing fields and non-string values (for example a list
    or number decoded from model JSON) make the pair invalid rather than
    raising.

    Args:
        qa_data: QA pair data

    Returns:
        Whether valid
    """
    minimum_lengths = {'question': 5, 'answer': 10}

    for field, minimum in minimum_lengths.items():
        value = qa_data.get(field)
        # Non-strings have no meaningful text length and would crash on
        # .strip(); reject them explicitly.
        if not isinstance(value, str):
            return False
        if len(value.strip()) < minimum:
            return False

    return True


def format_progress_message(current: int, total: int, prefix: str = "Progress") -> str:
    """
    Render a "<prefix>: <current>/<total> (<percent>%)" progress line.

    Args:
        current: Current progress
        total: Total
        prefix: Prefix text

    Returns:
        Formatted progress message; the percentage is shown with one decimal
        place and is reported as 0 when total is not positive
    """
    if total > 0:
        percentage = current / total * 100
    else:
        # Avoid dividing by zero (or by a negative total).
        percentage = 0

    return f"{prefix}: {current}/{total} ({percentage:.1f}%)"


def load_question_types_mapping(output_dir: Path) -> Dict[str, Any]:
    """
    Load question type mapping

    Reads "question_types.json" from output_dir as UTF-8 JSON. Any failure
    while reading, decoding or measuring the result is reported on stdout
    and an empty mapping is returned instead.

    Args:
        output_dir: Directory expected to contain question_types.json

    Returns:
        The decoded mapping, or an empty dict when loading fails
    """
    source = output_dir / "question_types.json"

    try:
        with open(source, mode='r', encoding='utf-8') as handle:
            loaded = json.loads(handle.read())
        category_count = len(loaded)
        print(f"✅ Successfully loaded question type mapping: {category_count} main categories")
    except Exception as err:
        print(f"⚠️  Loading question type mapping failed: {err}")
        print("Using original question types")
        return {}

    return loaded


def get_question_type_description(question_type: str, question_types_map: Dict[str, Any]) -> str:
    """
    Get the detailed description of the question type

    The mapping is scanned in iteration order. A main category matches by
    name; a subcategory matches by name or by its 'code' value. The first
    match supplies its 'description', defaulting to question_type when the
    entry has none.

    Args:
        question_type: Category name, subcategory name or subcategory code
        question_types_map: Mapping of main category name to its info dict,
            which may carry 'description' and 'subcategories'

    Returns:
        The matching description, or question_type itself when the mapping is
        empty or nothing matches
    """
    if question_types_map:
        for category_name, category in question_types_map.items():
            if category_name == question_type:
                return category.get('description', question_type)

            # Subcategories can be addressed either by name or by code.
            for sub_name, sub in category.get('subcategories', {}).items():
                if sub_name == question_type or sub.get('code') == question_type:
                    return sub.get('description', question_type)

    # Empty mapping or no match: fall back to the raw type.
    return question_type
