import argparse
import logging
import os
import sys
import glob
import json


def read_json(file_path):
    """
    Load a JSON document from disk.

    Args:
        file_path: Path of the file to read; it is opened as UTF-8 text

    Returns:
        The decoded JSON content, as produced by ``json.load``
    """
    with open(file_path, encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed


def preprocess_llm_output(llm_output):
    """
    Drop the reasoning section that precedes the final '</think>' tag.

    Args:
        llm_output: Raw LLM output string

    Returns:
        str: The whitespace-stripped text following the last '</think>' tag,
            or the input unchanged (not stripped) when no such tag is present
    """
    _, tag, tail = llm_output.rpartition('</think>')
    if not tag:
        # rpartition yields an empty separator when the tag never occurs.
        return llm_output
    return tail.strip()


def extract_answer_reverse(llm_output):
    """
    Extract the answer from the llm_output string.
    The answer is expected to be in the format: \\boxed{an int value}

    The string is scanned from the end for the last occurrence of
    \\boxed{...}. If that box is unclosed or its content is not an int,
    the scan continues with the previous occurrence, until an int value is
    found or no more occurrences exist.

    Args:
        llm_output: Text to search

    Returns:
        int or None: The integer inside the last parsable box, or None if
            there is no such box
    """
    marker = "\\boxed{"
    search_end = len(llm_output)
    while True:
        box_start = llm_output.rfind(marker, 0, search_end)
        if box_start == -1:
            return None

        # From now on only look at text before this marker. Using box_start
        # itself (never a negative value) matters: a negative end index
        # would be interpreted relative to the end of the string and could
        # re-find the same marker forever.
        search_end = box_start

        content_start = box_start + len(marker)
        content_end = llm_output.find("}", content_start)
        if content_end == -1:
            # Unclosed box (e.g. truncated output): try an earlier one.
            continue

        try:
            return int(llm_output[content_start:content_end])
        except ValueError:
            # Not an integer: try an earlier box.
            continue


def extract_final_answer_from_json(formatted_response):
    """
    Extract final_answer from JSON formatted response

    Args:
        formatted_response: Possibly JSON formatted string

    Returns:
        int or None: The 'final_answer' field converted with ``int``.
            None is returned when the input is not a string/bytes value,
            is not valid JSON, is not a JSON object, lacks 'final_answer',
            or holds a value ``int`` cannot convert (including Infinity/NaN).
    """
    try:
        json_data = json.loads(formatted_response)
    except (ValueError, TypeError):
        # ValueError covers json.JSONDecodeError; TypeError is raised when
        # the input is not str/bytes/bytearray (e.g. an already-decoded dict).
        return None

    if not isinstance(json_data, dict) or 'final_answer' not in json_data:
        return None

    try:
        return int(json_data['final_answer'])
    except (ValueError, TypeError, OverflowError):
        # OverflowError: json.loads accepts the literal Infinity, which
        # int() cannot convert.
        return None


def _import_reasoning_analyzer():
    """Make the project root importable and return the ReasoningAnalyzer class."""
    # The project root is taken to be the parent of this file's directory;
    # presumably that is where the process_score package lives.
    project_root = os.path.dirname(
        os.path.dirname(os.path.abspath(__file__)))
    # Keep the root at the front of sys.path without adding a duplicate entry
    # on every call.
    if not sys.path or sys.path[0] != project_root:
        sys.path.insert(0, project_root)
    from process_score.parser import ReasoningAnalyzer
    return ReasoningAnalyzer


def _new_quiet_analyzer(analyzer_cls):
    """Instantiate an analyzer and raise its loggers to CRITICAL."""
    analyzer = analyzer_cls()
    for component in (analyzer, analyzer.fact_parser,
                      analyzer.graph_builder, analyzer.llm_parser):
        component.logger.setLevel(logging.CRITICAL)
    return analyzer


def _average(scores, empty_value):
    """Return the arithmetic mean of scores, or empty_value for an empty list."""
    return sum(scores) / len(scores) if scores else empty_value


def evaluate_dataset(dataset, use_process_evaluation=False):
    """
    Evaluate dataset

    Each item must provide 'answer' and 'llm_output'; 'formatted_response'
    is optional. When process evaluation is enabled and an item has a
    non-empty 'formatted_response', its 'final_answer' key is read as well.

    Args:
        dataset: Dataset list
        use_process_evaluation: Whether to use process evaluation

    Returns:
        Dict: Dictionary containing answer evaluation and process evaluation
            results: per-item score lists, their averages (answer averages
            are 0 and process averages are None when there are no scores),
            and the number of process scores greater than 0
    """
    answer_scores = []
    process_scores = []
    formatted_response_scores = []
    formatted_response_process_scores = []

    # Resolved on the first item only, so an empty dataset never triggers the
    # import and sys.path is not touched once per item.
    analyzer_cls = None

    for data in dataset:
        answer = data['answer']

        # Preprocess LLM output
        llm_output = preprocess_llm_output(data['llm_output'])

        # Copy of the item carrying the preprocessed output, used for
        # process evaluation
        processed_data = data.copy()
        processed_data['llm_output'] = llm_output

        # 1. Evaluation of original llm_output
        extracted_answer = extract_answer_reverse(llm_output)
        if extracted_answer is not None and extracted_answer == answer:
            llm_output_score = 1
        else:
            llm_output_score = 0
        answer_scores.append(llm_output_score)

        # 2. Evaluation of formatted_response (if exists)
        formatted_response = data.get('formatted_response')
        formatted_response_score = 0
        if formatted_response:
            final_answer = extract_final_answer_from_json(formatted_response)
            if final_answer is not None and final_answer == answer:
                formatted_response_score = 1
            else:
                # Fall back to the score of the plain llm_output. This branch
                # is taken both when no final_answer could be parsed and when
                # the parsed final_answer differs from the expected answer.
                # NOTE(review): confirm the second case is intended.
                formatted_response_score = llm_output_score
        formatted_response_scores.append(formatted_response_score)

        # 3. Process evaluation (if enabled)
        if not use_process_evaluation:
            continue

        if analyzer_cls is None:
            analyzer_cls = _import_reasoning_analyzer()
        # A fresh analyzer per item, in case the analyzer keeps state
        # between calls.
        analyzer = _new_quiet_analyzer(analyzer_cls)

        # 3.1 Process evaluation of original llm_output
        try:
            process_score = analyzer.process_reasoning_data(processed_data)
        except Exception as e:
            print(
                f"Warning: Process evaluation failed for data item. Error: {e}")
            # Record the failure as 0 and keep the local in sync, so the
            # fallbacks below never reuse a previous item's score.
            process_score = 0
        process_scores.append(process_score)

        # 3.2 Process evaluation of formatted_response (if exists);
        # defaults to the llm_output process score.
        formatted_process_score = process_score
        if formatted_response:
            try:
                if data['final_answer'] != "":
                    # Evaluate a copy whose llm_output is the formatted_response
                    formatted_data = data.copy()
                    formatted_data['llm_output'] = formatted_response
                    formatted_process_score = analyzer.process_reasoning_data(
                        formatted_data)
            except Exception as e:
                print(
                    f"Warning: Formatted response process evaluation failed for data item. Error: {e}")
                formatted_process_score = process_score
        formatted_response_process_scores.append(formatted_process_score)

    return {
        'answer_scores': answer_scores,
        'answer_average': _average(answer_scores, 0),
        'formatted_response_scores': formatted_response_scores,
        'formatted_response_average': _average(formatted_response_scores, 0),
        'process_scores': process_scores,
        'process_average': _average(process_scores, None),
        'process_valid_count': sum(1 for score in process_scores if score > 0),
        'formatted_response_process_scores': formatted_response_process_scores,
        'formatted_response_process_average': _average(
            formatted_response_process_scores, None),
        'formatted_response_process_valid_count': sum(
            1 for score in formatted_response_process_scores if score > 0)
    }


def evaluate_json(file_path, use_process_evaluation=False):
    """
    Evaluate every dataset stored in one JSON file.

    The file is expected to decode to a mapping of dataset name to dataset;
    each dataset is passed to ``evaluate_dataset``.

    Args:
        file_path: JSON file path
        use_process_evaluation: Whether to use process evaluation

    Returns:
        Dict: Evaluation results keyed by dataset name
    """
    datasets = read_json(file_path)
    return {
        name: evaluate_dataset(entries, use_process_evaluation)
        for name, entries in datasets.items()
    }


def sort_file_names(file_names):
    """
    Sort file names by type (base/chat), then by model family, then by model version, then by parameter size

    The sort key is derived from the lower-cased base name (directory part
    and '.json' removed):
      * type: 1 if the name contains 'chat', 'instruct' or '-it', else 0
      * family: 'glm', 'llama' or 'qwen' when the name starts with it,
        otherwise 'unknown'
      * version: the number following 'glm-', 'llama-' or 'qwen'
        (integer or decimal), 0 when absent
      * size: the first number directly followed by 'b', 0 when absent
      * the original file name, as a final tie-breaker

    Both '/' and '\\' are treated as path separators so that paths produced
    on Windows are classified the same way as POSIX paths.

    Args:
        file_names: Iterable of file names or paths

    Returns:
        list: The input names in sorted order
    """
    import re

    number = r'(\d+(?:\.\d+)?)'
    # (family prefix, pattern capturing the version number)
    family_patterns = (
        ('glm', re.compile(r'glm-' + number)),
        ('llama', re.compile(r'llama-' + number)),
        ('qwen', re.compile(r'qwen' + number)),
    )
    size_pattern = re.compile(number + r'[b]')
    separator_pattern = re.compile(r'[\\/]')
    chat_keywords = ('chat', 'instruct', '-it')

    def extract_model_info(file_name):
        # Remove extension and path, convert to lowercase
        base = separator_pattern.split(file_name)[-1]
        name = base.replace('.json', '').lower()

        # Determine type: 0 for base, 1 for chat/instruct/it
        model_type = 1 if any(keyword in name for keyword in chat_keywords) else 0

        # Extract model family and version
        model_family = 'unknown'
        version_value = 0.0
        for family, pattern in family_patterns:
            if name.startswith(family):
                model_family = family
                version_match = pattern.search(name)
                if version_match:
                    version_value = float(version_match.group(1))
                break

        # Extract parameter size
        size_match = size_pattern.search(name)
        size_value = float(size_match.group(1)) if size_match else 0

        return (model_type, model_family, version_value, size_value, file_name)

    # Sort by type, then model family, then version, then size
    return sorted(file_names, key=extract_model_info)


def evaluate_json_files(file_paths, output_file, use_process_evaluation=False):
    """
    Evaluate multiple JSON files and save results to Excel

    Files are processed in the order given by ``sort_file_names``. For each
    file the full evaluation result is written to
    ``detail-results/<file name>.details.json`` (relative to the current
    working directory), and the per-dataset averages are collected into one
    table per metric. The tables are written to ``output_file`` as the
    sheets 'ans' and 'format-ans', plus 'process' and 'format-process' when
    process averages are available.

    Args:
        file_paths: List of JSON file paths
        output_file: Output Excel file path
        use_process_evaluation: Whether to use process evaluation
    """
    import pandas as pd

    answer_results = {}
    formatted_response_results = {}
    process_results = {}
    formatted_response_process_results = {}

    # Sort file paths according to the specified order
    sorted_file_paths = sort_file_names(file_paths)

    # Ensure detail-results directory exists
    os.makedirs("detail-results", exist_ok=True)

    for file_path in sorted_file_paths:
        print(f"Evaluating: {file_path}")
        file_results = evaluate_json(file_path, use_process_evaluation)
        # file_name without extension and path
        file_name = os.path.splitext(os.path.basename(file_path))[0]

        # Save detailed results to detail-results/samename.details.json
        details_save_path = os.path.join("detail-results", f"{file_name}.details.json")
        with open(details_save_path, "w", encoding="utf-8") as f:
            json.dump(file_results, f, ensure_ascii=False, indent=2)
        print(f"  - Details saved to: {details_save_path}")

        # Register the file before looping so that a file containing no
        # datasets still has an entry for the summary prints below.
        file_answers = answer_results.setdefault(file_name, {})
        file_formatted = formatted_response_results.setdefault(file_name, {})

        for dataset_name, v in file_results.items():
            file_answers[dataset_name] = v['answer_average']
            file_formatted[dataset_name] = v['formatted_response_average']

            # Process averages are None when no process scores were
            # collected; such datasets are left out of the process tables.
            if v['process_average'] is not None:
                process_results.setdefault(
                    file_name, {})[dataset_name] = v['process_average']
            if v['formatted_response_process_average'] is not None:
                formatted_response_process_results.setdefault(
                    file_name, {})[dataset_name] = v['formatted_response_process_average']

        print(f"  - Answer average: {file_answers}")
        print(
            f"  - Formatted response average: {file_formatted}")
        if use_process_evaluation:
            print(f"  - Process average: {process_results.get(file_name)}")
            print(
                f"  - Formatted response process average: {formatted_response_process_results.get(file_name)}")

    # Row order of every table follows the sorted file order
    sorted_file_names = [os.path.splitext(os.path.basename(fp))[0]
                         for fp in sorted_file_paths]

    def build_table(results):
        # One row per file, one column per dataset
        table = pd.DataFrame.from_dict(results, orient='index')
        return table.reindex(sorted_file_names)

    answer_df = build_table(answer_results)
    formatted_response_df = build_table(formatted_response_results)

    # Process tables exist only if at least one average was collected
    process_df = build_table(process_results) if process_results else None
    formatted_response_process_df = (
        build_table(formatted_response_process_results)
        if formatted_response_process_results else None)

    has_process = process_df is not None and not process_df.empty
    has_formatted_process = (
        formatted_response_process_df is not None
        and not formatted_response_process_df.empty)

    # Save to Excel file with multiple worksheets
    with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
        answer_df.to_excel(writer, sheet_name='ans')
        formatted_response_df.to_excel(writer, sheet_name='format-ans')
        if has_process:
            process_df.to_excel(writer, sheet_name='process')
        if has_formatted_process:
            formatted_response_process_df.to_excel(
                writer, sheet_name='format-process')

    print(f"Results saved to: {output_file}")
    print("  - Answer evaluation results: ans sheet")
    print("  - Formatted response evaluation results: format-ans sheet")
    if has_process:
        print("  - Process evaluation results: process sheet")
    if has_formatted_process:
        print("  - Formatted response process evaluation results: format-process sheet")


if __name__ == "__main__":
    # Command-line entry point: evaluate every *.json file found directly in
    # the input directory and write the summary workbook.
    parser = argparse.ArgumentParser(
        description="Evaluate JSON files and save results to Excel.")
    parser.add_argument(
        '--input-dir',
        type=str,
        default='results',
        help='Directory containing JSON files to evaluate.',
    )
    parser.add_argument(
        '--output-file',
        type=str,
        default='evaluation_results.xlsx',
        help='Output Excel file path.',
    )
    parser.add_argument(
        '--use-process-evaluation',
        action='store_true',
        help='Whether to use process evaluation.',
    )
    args = parser.parse_args()

    output_file = args.output_file
    file_paths = glob.glob(f"{args.input_dir}/*.json")
    evaluate_json_files(file_paths, output_file, args.use_process_evaluation)
    print(f"Evaluation results saved to {output_file}")
