import os
import re
import pandas as pd
import json
import warnings
import time
import threading
import datetime
import numpy as np
from pathlib import Path
from typing import Dict, List, Optional, Union, Any

# Import VLMEvalKit related tools

# Load API configuration from .env file
from dotenv import load_dotenv
load_dotenv('../.env')

# Import local verifier module
import sys
sys.path.append('../utils')
from verifier import grade, extract_boxed_answer, get_answer_str, answer_tag_reward_fn_for_r1

from vlmeval.smp import load, dump, gpt_key_set
from vlmeval.dataset.utils import build_judge
from vlmeval.utils import track_progress_rich

# Thread lock for synchronized output
output_lock = threading.Lock()

def safe_print(*args, **kwargs):
    """Print while holding the module-wide output lock.

    Takes the same positional and keyword arguments as the built-in
    ``print`` and forwards them unchanged.
    """
    output_lock.acquire()
    try:
        print(*args, **kwargs)
    finally:
        # Always release, even if print itself raises.
        output_lock.release()

class LogBuffer:
    """In-memory collector for the log lines of one task.

    Lines are stamped and stored by ``log`` and written out in one go by
    ``flush``, which holds the shared output lock for the whole batch.
    """

    def __init__(self, task_id):
        self.task_id = task_id
        self.logs = []
        self.start_time = datetime.datetime.now()

    def log(self, message):
        """Store *message*, prefixed with the current time and the task id."""
        now = datetime.datetime.now()
        # "%f" is microseconds; dropping the last three digits leaves milliseconds.
        stamp = now.strftime("%H:%M:%S.%f")[:-3]
        entry = f"[{stamp}] [{self.task_id}] {message}"
        self.logs.append(entry)

    def flush(self):
        """Print every stored line, then one blank separator line."""
        with output_lock:
            for entry in self.logs:
                print(entry)
            # Blank line keeps the output of different tasks visually apart.
            print()

# Note: the former add_json_dataset helper was removed; datasets are now configured statically in UniversalPhysicsEvaluator.DATASET_CONFIGS.

class UniversalPhysicsEvaluator:
    """
    HiPhO Physics Competition Evaluation System

    Evaluates inference results for the HiPhO datasets listed in
    DATASET_CONFIGS:
    - APhO 2025
    - EuPhO 2024/2025
    - F=MA 2024/2025
    - IPhO 2024/2025
    - NBPhO 2024/2025
    - PanMechanics 2024/2025
    - PanPhO 2024/2025

    Each question is scored with a fine-grained (marking criteria) and a
    coarse-grained (final answer) evaluation; results from several runs of the
    same dataset can be loaded side by side.
    """

    # HiPhO dataset configuration mapping, keyed by dataset name (which is also
    # the stem of the data file). Every entry is derived from its key:
    #   display_name   - the key with '_' replaced by a space
    #   dataset_file   - ../data/<key>.json
    #   result_pattern - glob matching result directories that end in _<key>
    DATASET_CONFIGS = {
        dataset_key: {
            'display_name': dataset_key.replace('_', ' '),
            'dataset_file': f'../data/{dataset_key}.json',
            'result_pattern': f'*_{dataset_key}',
        }
        for dataset_key in (
            'APhO_2025',
            'EuPhO_2024',
            'EuPhO_2025',
            'F=MA_2024',
            'F=MA_2025',
            'IPhO_2024',
            'IPhO_2025',
            'NBPhO_2024',
            'NBPhO_2025',
            'PanMechanics_2024',
            'PanMechanics_2025',
            'PanPhO_2024',
            'PanPhO_2025',
        )
    }

    def __init__(self, results_dir: str = "../infer_results", nproc: int = 4, model_name: str = None, output_dir: str = "../eval_results"):
        """
        Set up the evaluator.

        Args:
            results_dir: Directory holding the inference results; must exist
            nproc: Degree of parallelism handed to the parallel evaluation
            model_name: Optional model name; when set it becomes part of the
                output directory and temporary file names
            output_dir: Directory under which evaluation results are written

        Raises:
            FileNotFoundError: If results_dir does not exist
        """
        inference_root = Path(results_dir)
        self.results_dir = inference_root
        self.nproc, self.model_name, self.output_dir = nproc, model_name, output_dir

        # Fail fast: the detection and loading methods all glob inside this directory.
        if inference_root.exists():
            return
        raise FileNotFoundError(f"Results directory not found: {results_dir}")
    
    def detect_available_datasets(self) -> List[str]:
        """List the configured datasets that can be evaluated right now.

        A dataset qualifies when its data file exists and at least one
        ``run_*`` sub-directory of a matching result directory contains a
        ``*_results.json`` file. The reason for every skipped dataset is printed.

        Returns:
            Dataset keys, in DATASET_CONFIGS order.
        """
        detected = []

        for dataset_key, config in self.DATASET_CONFIGS.items():
            dataset_path = Path(config['dataset_file'])
            if not dataset_path.exists():
                safe_print(f"⚠️  Skip {dataset_key}: dataset file does not exist ({dataset_path})")
                continue

            pattern = config['result_pattern']
            matching_dirs = list(self.results_dir.glob(pattern))
            if not matching_dirs:
                safe_print(f"⚠️  Skip {dataset_key}: result directory does not exist ({pattern})")
                continue

            # One result file in any run of any matching directory is enough.
            has_results = any(
                next(run_dir.glob("*_results.json"), None) is not None
                for result_dir in matching_dirs
                for run_dir in result_dir.glob("run_*")
            )
            if not has_results:
                safe_print(f"⚠️  Skip {dataset_key}: no valid inference result files found")
                continue

            detected.append(dataset_key)
            safe_print(f"✅ Found dataset: {config['display_name']} ({dataset_key})")

        return detected

    def detect_multiple_runs(self, dataset_key: str) -> List[str]:
        """Return the names of the populated ``run_*`` directories of a dataset.

        Only the first directory matching the dataset's result pattern is
        inspected. A run counts when it is a directory whose name starts with
        ``run_`` and that holds at least one ``*_results.json`` file.

        Returns:
            Run directory names ordered by run number (names whose suffix is
            not purely digits sort as 0); empty when no result directory matches.
        """
        pattern = self.DATASET_CONFIGS[dataset_key]['result_pattern']
        matches = list(self.results_dir.glob(pattern))
        if not matches:
            return []

        def run_number(run_name):
            suffix = run_name.split('_')[1]
            return int(suffix) if suffix.isdigit() else 0

        populated_runs = [
            entry.name
            for entry in matches[0].iterdir()
            if entry.is_dir()
            and entry.name.startswith('run_')
            and list(entry.glob("*_results.json"))
        ]
        return sorted(populated_runs, key=run_number)

    def has_multiple_runs(self, dataset_key: str) -> bool:
        """Tell whether the dataset has at least two populated runs."""
        populated_runs = self.detect_multiple_runs(dataset_key)
        return len(populated_runs) >= 2

    def load_inference_results(self, dataset_key: str, run_id: Optional[str] = None) -> List[Dict]:
        """Load the inference results of one run of a dataset.

        Only the first directory matching the dataset's result pattern is used.

        Args:
            dataset_key: Key into DATASET_CONFIGS.
            run_id: Name of the run sub-directory to load (e.g. ``run_3``).
                When omitted, the run directory with the highest run number is
                used, following the same numeric ordering as
                ``detect_multiple_runs`` (so ``run_10`` is newer than ``run_9``).

        Returns:
            The entries of the result file that are dicts containing both an
            ``id`` and a ``prediction`` key; anything else is dropped.

        Raises:
            FileNotFoundError: If no result directory, run directory or
                ``*_results.json`` file can be found.
        """
        config = self.DATASET_CONFIGS[dataset_key]
        result_pattern = config['result_pattern']
        result_dirs = list(self.results_dir.glob(result_pattern))
        if not result_dirs:
            raise FileNotFoundError(f"No inference results found for pattern: {result_pattern}")

        result_dir = result_dirs[0]
        safe_print(f"📁 Loading inference results: {result_dir}")

        if run_id:
            # Load specified run results
            run_dir = result_dir / run_id
            if not run_dir.exists():
                raise FileNotFoundError(f"Run directory not found: {run_dir}")
        else:
            # Only directories can hold results; stray files named run_* are ignored.
            run_dirs = [path for path in result_dir.glob("run_*") if path.is_dir()]
            if not run_dirs:
                raise FileNotFoundError(f"No run directories found in {result_dir}")

            def run_order(path):
                # Compare by run number first: plain string comparison would
                # rank "run_9" above "run_10". The name breaks ties.
                suffix = path.name.split('_')[1]
                return (int(suffix) if suffix.isdigit() else 0, path.name)

            run_dir = max(run_dirs, key=run_order)

        # glob order is arbitrary; sort so the same file is picked every time.
        result_files = sorted(run_dir.glob("*_results.json"))
        if not result_files:
            raise FileNotFoundError(f"No result files found in {run_dir}")

        inference_file = result_files[0]  # Use the first result file
        if run_id:
            safe_print(f"   Using specified run: {inference_file}")
        else:
            safe_print(f"   Using latest run: {inference_file}")

        with open(inference_file, 'r', encoding='utf-8') as f:
            results = json.load(f)

        # Filter out invalid data (such as entries containing only "information" field)
        valid_results = [
            item for item in results
            if isinstance(item, dict) and 'id' in item and 'prediction' in item
        ]

        safe_print(f"   Inference results count: {len(valid_results)} (raw data: {len(results)} entries)")
        return valid_results

    def load_multiple_runs_results(self, dataset_key: str) -> Dict[str, List[Dict]]:
        """Load the inference results of every detected run of a dataset.

        Runs that fail to load are reported and left out instead of aborting
        the whole load.

        Returns:
            Mapping from run directory name to that run's filtered results.

        Raises:
            ValueError: If no populated run is detected for the dataset.
        """
        run_names = self.detect_multiple_runs(dataset_key)
        if not run_names:
            raise ValueError(f"No multiple runs found for dataset: {dataset_key}")

        results_by_run = {}
        for run_name in run_names:
            try:
                run_results = self.load_inference_results(dataset_key, run_name)
                results_by_run[run_name] = run_results
                safe_print(f"✅ Loaded {run_name}: {len(run_results)} results")
            except Exception as e:
                # Best effort: one unreadable run must not hide the others.
                safe_print(f"⚠️  Skip {run_name}: {e}")

        return results_by_run

    def prepare_evaluation_data(self, dataset_key: str) -> pd.DataFrame:
        """Load the default run of a dataset into a DataFrame and validate it.

        The inference result entries are used as rows directly; no separate
        dataset file is merged in here.

        Raises:
            ValueError: If the ``prediction`` or ``answer`` column is missing
                (the first missing one is reported).
        """
        records = self.load_inference_results(dataset_key)
        frame = pd.DataFrame(records)

        safe_print(f"✅ Loaded evaluation data, total {len(frame)} records")
        safe_print(f"📊 Data columns: {list(frame.columns)}")

        # Both columns are needed by the per-question evaluation.
        missing = next(
            (name for name in ('prediction', 'answer') if name not in frame.columns),
            None,
        )
        if missing is not None:
            raise ValueError(f"Missing required field: {missing}")

        return frame

    def _safe_parse_json_field(self, field_value):
        """Normalise a field that may hold a list, a JSON list string or a scalar.

        Args:
            field_value: Raw value taken from a result row.

        Returns:
            - the value itself when it already is a list;
            - ``[]`` for None, NaN, empty or whitespace-only text and the
              literal text ``nan``;
            - the decoded list for text of the form ``[...]`` that is valid
              JSON, or the text wrapped in a list when decoding fails;
            - otherwise the stripped text wrapped in a one-element list.
        """
        # If already a list, return directly
        if isinstance(field_value, list):
            return field_value

        if field_value is None:
            return []

        try:
            if pd.isna(field_value):
                return []
        except (TypeError, ValueError):
            # pd.isna may not yield a usable truth value (e.g. for array-like
            # input); treat such values as present and fall through.
            pass

        field_str = str(field_value).strip()
        # Test for emptiness after stripping, so whitespace-only text is
        # treated as a missing field rather than becoming [''].
        if field_str in ('', 'nan'):
            return []

        if field_str.startswith('[') and field_str.endswith(']'):
            try:
                return json.loads(field_str)
            except json.JSONDecodeError:
                # Looks like a list but is not valid JSON: keep the raw text.
                return [field_str]

        return [field_str]
    
    def _safe_parse_points_field(self, points_value):
        """Turn a raw points value into a list of floats.

        Lists (given directly or as JSON text) are converted element-wise with
        None entries dropped. None, NaN and unparsable text yield ``[0.0]``; a
        single number or numeric text yields a one-element list.
        """
        def to_floats(values):
            return [float(value) for value in values if value is not None]

        if isinstance(points_value, list):
            return to_floats(points_value)
        if points_value is None:
            return [0.0]

        try:
            missing = bool(pd.isna(points_value))
        except (TypeError, ValueError):
            # Value cannot be checked with pd.isna; treat it as present.
            missing = False
        if missing:
            return [0.0]

        if isinstance(points_value, (int, float)):
            return [float(points_value)]

        text = str(points_value).strip()
        if text.startswith('[') and text.endswith(']'):
            try:
                return to_floats(json.loads(text))
            except ValueError:
                # Covers invalid JSON (JSONDecodeError is a ValueError) as well
                # as non-numeric entries; fall back to the scalar parse below.
                pass

        try:
            return [float(text)]
        except ValueError:
            return [0.0]

    def _has_valid_marking(self, marking):
        """Tell whether *marking* holds at least one usable scoring criterion.

        *marking* must be a non-empty list. An entry is usable when it is a
        non-empty list, a string that is not blank and not one of the
        placeholders ``nan``/``none``/``null`` (case-insensitive), or any other
        non-None value.
        """
        if not marking or not isinstance(marking, list):
            return False

        placeholders = ['', 'nan', 'none', 'null']

        def is_usable(entry):
            if entry is None:
                return False
            if isinstance(entry, list):
                return len(entry) > 0
            if isinstance(entry, str):
                text = entry.strip()
                return bool(text) and text.lower() not in placeholders
            return True

        return any(is_usable(entry) for entry in marking)

    def evaluate_dataset(self, dataset_key: str, judge_kwargs: Optional[Dict] = None) -> Dict:
        """Evaluate every question of one dataset and save the outcome.

        Steps: load the evaluation data, initialise the judge model, evaluate
        all questions through ``track_progress_rich``, aggregate and save the
        results, then remove the temporary file used during the parallel run.

        Args:
            dataset_key: Key into DATASET_CONFIGS.
            judge_kwargs: Judge settings; a copy is handed to every task.

        Returns:
            The aggregated results produced by ``_aggregate_results``.
        """
        judge_kwargs = {} if judge_kwargs is None else judge_kwargs

        display_name = self.DATASET_CONFIGS[dataset_key]['display_name']
        safe_print(f"🚀 Starting dataset evaluation: {display_name}")

        eval_data = self.prepare_evaluation_data(dataset_key)
        judge_model = self._init_judge_model(judge_kwargs)

        safe_print(f"📊 Starting parallel evaluation, total {len(eval_data)} questions...")

        # One task per row; the row position doubles as the task key.
        indices = list(range(len(eval_data)))
        tasks = [
            (judge_model, eval_data.iloc[position], position, judge_kwargs.copy())
            for position in indices
        ]

        safe_print(f"🔄 Starting parallel evaluation, task count: {len(tasks)}")

        # Output location and temporary file name depend on whether a model name was given.
        if self.model_name:
            run_label = f"{dataset_key}_{self.model_name}"
            tmp_file_name = f"parallel_tmp_{self.model_name}.pkl"
        else:
            run_label = f"{dataset_key}"
            tmp_file_name = "parallel_tmp.pkl"

        output_dir = Path(f"{self.output_dir}/{run_label}")
        output_dir.mkdir(parents=True, exist_ok=True)
        tmp_file = output_dir / tmp_file_name

        parallel_results = track_progress_rich(
            self._evaluate_single_problem,
            tasks,
            nproc=self.nproc,
            chunksize=max(1, self.nproc//2),
            keys=indices,
            save=str(tmp_file)
        )

        safe_print(f"✅ Parallel evaluation completed, starting result aggregation...")

        results = self._aggregate_results(parallel_results, eval_data, dataset_key)
        self._save_evaluation_results(results, dataset_key, eval_data, parallel_results)

        # Removing the temporary file is best effort; a failure is only reported.
        try:
            if tmp_file.exists():
                tmp_file.unlink()
                safe_print(f"🗑️  Cleaning temporary file: {tmp_file}")
        except Exception as e:
            safe_print(f"⚠️  Failed to clean temporary file: {e}")

        return results

    def _init_judge_model(self, judge_kwargs):
        """Build the judge model described by *judge_kwargs*, if one is usable.

        The model name is read from ``judge_kwargs['model']``. Timeout, retry
        count and token limit default to the JUDGE_TIMEOUT (1800), JUDGE_RETRY
        (10) and JUDGE_MAX_TOKENS (16384) environment variables; every other
        entry of *judge_kwargs* except ``model`` and ``nproc`` is passed on to
        ``build_judge`` and overrides those defaults.

        Returns:
            The judge model when it was built and reports itself as working,
            otherwise None (no model requested, ``exact_matching`` requested,
            API key not set, build failure, or API not working).
        """
        safe_print(f"🔧 Starting Judge model initialization")

        judge_model_name = judge_kwargs.get('model', None)
        safe_print(f"   🤖 Specified model name: {judge_model_name}")

        # No model, or plain exact matching, means there is nothing to build.
        if not judge_model_name or judge_model_name == 'exact_matching':
            safe_print("⚠️  Judge model not specified, only performing final answer evaluation")
            return None

        safe_print(f"   🔑 Checking API key...")
        if not gpt_key_set():
            safe_print(f"   ❌ API_KEY not set or invalid")
            warnings.warn('API_KEY invalid, skipping process evaluation')
            return None

        safe_print(f"   ✅ API key is set")
        try:
            passthrough = {
                name: value
                for name, value in judge_kwargs.items()
                if name not in ['model', 'nproc']
            }
            model_kwargs = {
                'model': judge_model_name,
                'timeout': int(os.getenv('JUDGE_TIMEOUT', '1800')),
                'retry': int(os.getenv('JUDGE_RETRY', '10')),
                'max_tokens': int(os.getenv('JUDGE_MAX_TOKENS', '16384')),
                'verbose': False,
            }
            # Caller-supplied settings win over the defaults above.
            model_kwargs.update(passthrough)

            safe_print(f"   🔄 Testing Judge model connection...")
            candidate = build_judge(**model_kwargs)
            safe_print(f"   ✅ Judge model built successfully, testing connectivity...")
            if candidate.working():
                safe_print(f"🤖 Using Judge model: {judge_model_name}")
                return candidate

            safe_print(f"   ❌ Judge API not working")
            warnings.warn('Judge API not working, skipping process evaluation')
        except Exception as e:
            safe_print(f"   ❌ Model initialization failed: {e}")
            warnings.warn(f'Model initialization failed: {e}, skipping process evaluation')

        return None

    def _evaluate_single_problem(self, judge_model, row, index, judge_kwargs):
        """Evaluate one question; this is the unit of work of the parallel run.

        Runs the fine-grained (marking criteria) and the coarse-grained (final
        answer) evaluation and keeps the higher of the two scores. All log
        lines are buffered and printed together when the task ends.

        Args:
            judge_model: Judge model, or None when no judge is available.
            row: One record of the evaluation data; read with ``row[...]``,
                ``row.get`` and ``in``.
            index: Zero-based position of the question.
            judge_kwargs: Per-task judge settings (not used in this method).

        Returns:
            A dict with the scores and the parsed fields of the question, or
            None when any step raised (the traceback is logged).
        """
        log_buffer = LogBuffer(f"Question{index + 1}")

        try:
            log_buffer.log(f"📖 Starting evaluation - ID: {row.get('id', 'N/A')}")

            parse_list = self._safe_parse_json_field
            prediction = str(row['prediction']).strip()
            question = row.get('question', '')
            ground_truth = parse_list(row.get('answer', ''))
            answer_type = parse_list(row.get('answer_type', 'Open-End'))
            unit = parse_list(row.get('unit', ''))

            # 'points' is the current field name; 'point' is read for backward compatibility.
            points = self._safe_parse_points_field(row.get('points', row.get('point', 0)))
            if 'points' in row:
                points_field_used = 'points'
            elif 'point' in row:
                points_field_used = 'point'
            else:
                points_field_used = 'default'
            marking = parse_list(row.get('marking', ''))

            log_buffer.log(f"📝 Question information:")
            log_buffer.log(f"   - Prediction length: {len(prediction)} characters")
            log_buffer.log(f"   - Ground truth: {ground_truth}")
            log_buffer.log(f"   - Points: {points} (field: {points_field_used})")
            log_buffer.log(f"   - Number of marking criteria: {len(marking) if marking else 0}")

            item_total_points = sum(points) if points else 0.0
            log_buffer.log(f"   - Total points for this question: {item_total_points}")

            # Both evaluations always run; the better one decides the score.
            log_buffer.log(f"🔍 Starting fine-grained evaluation...")
            fine_grained_score, marking_detailed_scores = self._evaluate_fine_grained_with_buffer(
                prediction, marking, points, judge_model, question, log_buffer
            )
            log_buffer.log(f"✅ Fine-grained score: {fine_grained_score}")

            log_buffer.log(f"🎯 Starting coarse-grained evaluation...")
            coarse_grained_score, extracted_pred = self._evaluate_coarse_grained_with_buffer(
                prediction, ground_truth, answer_type, unit, points,
                fine_grained_score, question, log_buffer
            )
            log_buffer.log(f"✅ Coarse-grained score: {coarse_grained_score}")
            log_buffer.log(f"📤 Extracted predicted answer: {extracted_pred}")

            final_score = max(fine_grained_score, coarse_grained_score)
            log_buffer.log(f"📊 Final score: {final_score} (fine-grained: {fine_grained_score}, coarse-grained: {coarse_grained_score})")

            result = dict(
                index=index,
                fine_grained_score=fine_grained_score,
                coarse_grained_score=coarse_grained_score,
                extracted_pred=extracted_pred,
                marking_detailed_scores=marking_detailed_scores,
                item_total_points=item_total_points,
                ground_truth=ground_truth,
                answer_type=answer_type,
                unit=unit,
                points=points,
                marking=marking,
                prediction=prediction,
                # earned_points is the higher of the two scores.
                earned_points=final_score,
            )

            log_buffer.log(f"✅ Evaluation completed, final score: {final_score}")
            log_buffer.flush()
            return result

        except Exception as e:
            import traceback
            log_buffer.log(f"❌ Evaluation failed: {e}")
            log_buffer.log(f"📄 Error details: {traceback.format_exc()}")
            log_buffer.flush()
            return None



    def _evaluate_fine_grained_with_buffer(self, prediction, marking, points, judge_model, question, log_buffer):
        """Fine-grained evaluation: score the prediction against the marking criteria.

        Without marking criteria or without a judge model the evaluation is
        skipped. Otherwise the work is delegated to the single-set or the
        multi-set evaluator, depending on the shape of *marking*.

        Returns:
            ``(score, detailed_scores)``; ``(0.0, [])`` when skipped.
        """
        marking_count = len(marking) if marking else 0
        log_buffer.log(f"   🔍 Fine-grained evaluation started")
        log_buffer.log(f"      - Number of marking: {marking_count}")
        log_buffer.log(f"      - judge_model: {'Available' if judge_model else 'None'}")

        if not (marking and judge_model):
            reason = 'No marking criteria' if not marking else 'No judge model'
            log_buffer.log(f"   ⚠️  Skip fine-grained evaluation: {reason}")
            return 0.0, []

        # A list of lists means several alternative criteria sets.
        if self._has_multiple_marking_sets(marking):
            log_buffer.log(f"   📋 Detected multiple marking criteria sets, total {len(marking)} sets")
            evaluate = self._evaluate_multiple_marking_sets_with_buffer
        else:
            log_buffer.log(f"   📋 Single marking criteria set")
            evaluate = self._evaluate_single_marking_set_with_buffer

        return evaluate(prediction, marking, points, judge_model, question, log_buffer)
    
    def _has_multiple_marking_sets(self, marking):
        """Tell whether *marking* is a collection of several criteria sets.

        Only the first element is inspected: a list there means that every
        element is taken to be a criteria set of its own.
        """
        if marking and len(marking) > 0:
            return isinstance(marking[0], list)
        return False
    
    def _evaluate_multiple_marking_sets_with_buffer(self, prediction, marking_sets, points, judge_model, question, log_buffer):
        """Score the prediction against every criteria set and keep the best result.

        Args:
            prediction: Model answer text.
            marking_sets: List of alternative marking criteria sets.
            points: Point values of the question, passed on to each set evaluation.
            judge_model: Judge model used by the per-set evaluation.
            question: Question text.
            log_buffer: Object with a ``log(message)`` method.

        Returns:
            ``(score, detailed_scores)`` of the highest-scoring set; on a tie
            the earliest set wins. Every returned detail carries a
            ``best_marking_set`` entry with the 1-based number of that set.
            ``(0.0, [])`` when *marking_sets* is empty.
        """
        best_score = 0.0
        best_detailed_scores = []
        best_set_number = None

        for set_number, marking_set in enumerate(marking_sets, start=1):
            log_buffer.log(f"   📊 Evaluating marking criteria set {set_number}")

            score, detailed_scores = self._evaluate_single_marking_set_with_buffer(
                prediction, marking_set, points, judge_model, question, log_buffer
            )

            log_buffer.log(f"      ✅ Criteria set {set_number} score: {score:.2f}")

            # The first set always seeds the result, so an all-zero outcome
            # still keeps its judge details; later sets must score strictly higher.
            if best_set_number is None or score > best_score:
                best_score = score
                best_detailed_scores = detailed_scores
                best_set_number = set_number

        if best_set_number is None:
            # Nothing was evaluated; there is no winning set to report.
            log_buffer.log("   ⚠️  No marking criteria sets to evaluate")
            return 0.0, []

        # Tag the winning details with the set they came from.
        for detailed_score in best_detailed_scores:
            detailed_score['best_marking_set'] = best_set_number

        log_buffer.log(f"   🏆 Final score from multiple criteria: {best_score:.2f} (from criteria set {best_set_number})")

        return round(best_score, 2), best_detailed_scores
    
    def _evaluate_single_marking_set_with_buffer(self, prediction, marking, points, judge_model, question, log_buffer):
        """Score the prediction against one criteria set, retrying when over-scored.

        Every criterion is scored individually. If the sum of the criterion
        scores exceeds the question's total points, the whole set is scored
        again, up to three retries. If the last attempt still exceeds the
        total, its scores are scaled down proportionally to fit.

        Args:
            prediction: Model answer text.
            marking: One set of marking criteria.
            points: Point values of the question; their sum caps the score
                (a sum of 0 disables the cap).
            judge_model: Judge model handed to the per-criterion evaluation.
            question: Question text.
            log_buffer: Object with a ``log(message)`` method.

        Returns:
            ``(score, detailed_scores)``: the total rounded to two decimals and
            one detail dict per criterion, including retry bookkeeping.
        """
        scoring_criteria = self._parse_marking_criteria(marking)
        max_possible_score = sum(points) if points else 0.0
        max_retries = 3
        total_attempts = max_retries + 1
        # Criterion scores are floats, so their sum can overshoot the cap by
        # rounding noise alone (0.1 + 0.2 > 0.3). Without a tolerance that
        # noise would cause needless judge re-runs and a forced rescale.
        score_tolerance = 1e-9

        log_buffer.log(f"      📊 Evaluation configuration:")
        log_buffer.log(f"         - Number of scoring criteria: {len(scoring_criteria)}")
        log_buffer.log(f"         - Maximum total score: {max_possible_score}")
        log_buffer.log(f"         - Maximum retry attempts: {max_retries}")

        scores = []
        detailed_scores = []
        total_score = 0.0

        for attempt in range(total_attempts):
            log_buffer.log(f"      🔄 Starting attempt {attempt + 1} evaluation")
            scores = []
            detailed_scores = []

            # Score each marking criterion
            for i, criterion in enumerate(scoring_criteria):
                description = criterion['description']
                log_buffer.log(f"         📏 Evaluating criterion {i+1}/{len(scoring_criteria)}: {description[:50]}{'...' if len(description) > 50 else ''}")
                score, response = self._evaluate_single_criterion_with_buffer(
                    prediction, criterion, judge_model, question,
                    max_total_score=max_possible_score,
                    current_attempt=attempt,
                    log_buffer=log_buffer
                )
                scores.append(score)
                log_buffer.log(f"            ➡️ Score: {score}")

                detailed_scores.append({
                    'marking_criterion': description,
                    'score': round(score, 2),
                    'index': criterion['index'],
                    'attempt': attempt + 1,
                    'judge_response': response
                })

            total_score = sum(scores)
            log_buffer.log(f"      📊 Attempt {attempt + 1} total score: {total_score} (individual scores: {scores})")

            if max_possible_score == 0 or total_score <= max_possible_score + score_tolerance:
                # Score is within the cap: mark the details as successful and stop.
                retry_info = f"Attempt {attempt + 1} successful" if attempt > 0 else "First attempt successful"
                for detailed_score in detailed_scores:
                    detailed_score['retry_info'] = retry_info
                    detailed_score['total_attempts'] = attempt + 1
                    detailed_score['final_success'] = True

                log_buffer.log(f"      ✅ {retry_info}, total score {total_score:.2f} <= {max_possible_score:.2f}")
                return round(total_score, 2), detailed_scores

            if attempt < max_retries:
                log_buffer.log(f"      ⚠️  Attempt {attempt + 1} exceeded score: {total_score:.2f} > {max_possible_score:.2f}, performing attempt {attempt + 2} retry...")

        # Every attempt exceeded the cap: scale the last attempt's scores to fit.
        # total_score is above a positive cap here, so the division is safe.
        log_buffer.log(f"      ❌ Reached maximum retry attempts({total_attempts}), total score still exceeds limit: {total_score:.2f} > {max_possible_score:.2f}")
        log_buffer.log(f"      📊 Force proportional score adjustment...")

        scale_factor = max_possible_score / total_score
        adjusted_scores = []

        log_buffer.log(f"         📐 Adjustment factor: {scale_factor:.3f}")
        for i, score in enumerate(scores):
            adjusted_score = score * scale_factor
            adjusted_scores.append(adjusted_score)
            log_buffer.log(f"            Criterion{i+1}: {score:.2f} -> {adjusted_score:.2f}")
            detail = detailed_scores[i]
            detail['original_score'] = detail['score']
            detail['score'] = round(adjusted_score, 2)
            detail['retry_info'] = f"Force adjustment after {total_attempts} retries"
            detail['total_attempts'] = total_attempts
            detail['forced_adjustment'] = True
            detail['scale_factor'] = round(scale_factor, 3)
            detail['final_success'] = False

        return round(sum(adjusted_scores), 2), detailed_scores

    def _evaluate_coarse_grained_with_buffer(self, prediction, ground_truth, answer_type, unit, points, fine_grained_score, question, log_buffer):
        """Coarse-grained evaluation"""
        log_buffer.log(f"   🎯 Coarse-grained evaluation started")
        log_buffer.log(f"      - Ground truth: {ground_truth}")
        log_buffer.log(f"      - Fine-grained score: {fine_grained_score}")
        
        extracted_pred = ""
        
        if ground_truth:
            log_buffer.log(f"      ✅ Has ground truth, starting answer matching evaluation")
            try:
                # Extract predicted answer for display
                num_expected_answers = len(ground_truth)
                log_buffer.log(f"      📤 Extracting predicted answer (expecting {num_expected_answers} answers)")
                extracted_pred = self._extract_prediction_for_display(prediction, num_expected_answers)
                log_buffer.log(f"      📝 Extraction result: {extracted_pred}")
                
                # Multiple answer evaluation
                log_buffer.log(f"      🔍 Starting multiple answer matching evaluation")
                answer_score = self._evaluate_multiple_answers_with_buffer(prediction, ground_truth, points, question, log_buffer)
                log_buffer.log(f"      📊 Answer matching score: {answer_score}")
                
                if answer_score > 0:
                    # Answer correct, use answer score
                    log_buffer.log(f"      ✅ Answer correct, using answer score: {answer_score}")
                    return round(answer_score, 2), extracted_pred
                else:
                    # Answer incorrect, coarse-grained evaluation gives 0 points
                    log_buffer.log(f"      ❌ Answer incorrect, coarse-grained score: 0")
                    return 0.0, extracted_pred
            except Exception as e:
                # Evaluation failed, coarse-grained evaluation gives 0 points
                log_buffer.log(f"      ⚠️  Answer evaluation failed: {e}, coarse-grained score: 0")
                return 0.0, extracted_pred
        
        # If no ground truth, try to extract predicted answer for display
        log_buffer.log(f"      ⚠️  No ground truth, trying to extract predicted answer for display")
        if not extracted_pred:
            try:
                extracted_pred = self._extract_prediction_for_display(prediction, 10)
                log_buffer.log(f"      📝 Extracted predicted answer: {extracted_pred}")
            except Exception as e:
                log_buffer.log(f"      ❌ Failed to extract predicted answer: {e}")
                extracted_pred = ""
        
        # When no ground truth, use fine-grained score
        log_buffer.log(f"      📊 Finally using fine-grained score: {fine_grained_score}")
        return round(fine_grained_score, 2), extracted_pred



    def _evaluate_multiple_answers_with_buffer(self, prediction, ground_truth_list, points_list, question="", log_buffer=None):
        """Multiple answer evaluation (with log buffer version)"""
        if not ground_truth_list:
            return 0.0
            
        # Ensure data length consistency
        actual_length = min(len(ground_truth_list), len(points_list))
        ground_truth_list = ground_truth_list[:actual_length]
        points_list = points_list[:actual_length]
        
        try:
            # Use physics_r1 multiple answer evaluation function
            total_score, total_point, extracted_preds, extracted_gts, scored_by_list = answer_tag_reward_fn_for_r1(
                prediction, ground_truth_list, problem=question, points=points_list, 
                use_xverify=True, debug=True, log_callback=log_buffer.log if log_buffer else None
            )
            if log_buffer:
                log_buffer.log(f"         📊 Scoring details: scored_by={scored_by_list}, total_point={total_point}")
            return total_point
        except Exception as e:
            if log_buffer:
                log_buffer.log(f"[DEBUG] Exception in answer evaluation: {str(e)}")
            # Fall back to individual evaluation
            return self._fallback_individual_evaluation_with_buffer(prediction, ground_truth_list, points_list, question, log_buffer)

    def _fallback_individual_evaluation_with_buffer(self, prediction, ground_truth_list, points_list, question="", log_buffer=None):
        """Fallback individual evaluation method (with log buffer version)"""
        try:
            num_answers = len(ground_truth_list)
            extracted_answers = get_answer_str(prediction, return_origin=False, num_answers=num_answers)
            
            total_earned_score = 0.0
            for extracted_ans, gt_answer, points in zip(extracted_answers, ground_truth_list, points_list):
                if extracted_ans and extracted_ans.strip():
                    try:
                        is_correct, _, _, _ = grade(extracted_ans, gt_answer, False, problem=question, 
                                                 use_xverify=True, debug=True, 
                                                 log_callback=log_buffer.log if log_buffer else None)
                        if is_correct:
                            total_earned_score += points
                    except Exception as e:
                        if log_buffer:
                            log_buffer.log(f"[DEBUG] Exception in grade call: {str(e)}")
                        pass
            
            return total_earned_score
        except Exception as e:
            if log_buffer:
                log_buffer.log(f"[DEBUG] Exception in fallback evaluation: {str(e)}")
            return 0.0

    def _parse_marking_criteria(self, marking_list):
        """Parse marking scoring criteria"""
        criteria = []
        if not marking_list:
            return criteria
        
        for i, marking_criterion in enumerate(marking_list):
            if marking_criterion and str(marking_criterion).strip():
                criteria.append({
                    'description': str(marking_criterion).strip(),
                    'index': i
                })
        
        return criteria

    def _evaluate_single_criterion_with_buffer(self, prediction, criterion, judge_model, question, max_total_score=None, current_attempt=0, log_buffer=None):
        """Use judge model to evaluate single criterion - with retry mechanism (with log buffer version)"""
        log_buffer.log(f"         🤖 Calling Judge model to evaluate criterion")
        
        # Build total score limit prompt
        total_score_warning = ""
        if max_total_score is not None and max_total_score > 0:
            total_score_warning = f"""
⚠️  IMPORTANT TOTAL SCORE CONSTRAINT:
- This question has a maximum total score of {max_total_score} points
- ALL marking criteria scores combined MUST NOT exceed {max_total_score} points
- You are evaluating ONE criterion among multiple criteria for this question
- Be conservative in your scoring to ensure the total doesn't exceed the limit
- This is attempt #{current_attempt + 1} of evaluation"""

        prompt = f"""You are an expert physics competition grader. Evaluate the student's solution against the specific grading criterion.

PHYSICS PROBLEM:
{question}

STUDENT'S SOLUTION:
{prediction}

GRADING CRITERION:
{criterion['description']}{total_score_warning}

INSTRUCTIONS:
1. Carefully analyze the student's solution for physics concepts, mathematical derivations, and calculations.
2. Compare the solution against the specific grading criterion provided.
3. Award points strictly according to the criterion, including partial credit when specified.
4. BE CONSERVATIVE - remember this is one of multiple criteria being evaluated simultaneously.

SCORING FORMAT:
- Read the grading criterion carefully to understand the maximum points and conditions for partial credit
- Evaluate whether the student's solution meets the full criteria, partial criteria, or no criteria
- Output your score using the exact format: \\boxed{{score}}
- The score should be a number (e.g., 0.4, 0.2, 0.1, 0.0)

CRITICAL REQUIREMENTS:
- You MUST output your final score in the format: \\boxed{{score}}
- The score must be a single number only (no text inside the boxed)
- BE CONSERVATIVE to avoid exceeding the total score limit

⚠️ CRITICAL INSTRUCTION: 
- Output ONLY: \\boxed{{score}}
- NO explanations, NO analysis, NO reasoning
- Just the number in the exact format \\boxed{{score}}

RESPOND WITH ONLY THE BOXED SCORE:"""
        
        try:
            log_buffer.log(f"         ⏳ Calling Judge model...")
            start_time = time.time()
            
            response = judge_model.generate(prompt).strip()
            
            elapsed_time = time.time() - start_time
            log_buffer.log(f"         ⏱️  Response time: {elapsed_time:.2f}s")
            
            # Extract score
            score = self._extract_score_from_response(response)
            log_buffer.log(f"         🔍 Extracted score: {score}")
            
            return score, response
            
        except Exception as e:
            elapsed_time = time.time() - start_time
            log_buffer.log(f"         ❌ Judge model call failed (time {elapsed_time:.2f}s): {e}")
            return 0.0, f"Judge model call failed: {str(e)}"

    def _extract_score_from_response(self, response):
        """Helper function to extract score from model response"""
        if not response:
            return 0.0
            
        response = response.strip()
        
        # Prioritize using boxed format to extract score
        boxed_patterns = [
            r'\\boxed\{([^}]+)\}',
            r'boxed\{([^}]+)\}',
        ]
        
        for pattern in boxed_patterns:
            matches = re.findall(pattern, response, re.IGNORECASE)
            for match in reversed(matches):
                match = match.strip()
                if match:
                    try:
                        score = float(match)
                        return round(score, 2)
                    except ValueError:
                        nums = re.findall(r'\d+\.?\d*', match)
                        if nums:
                            try:
                                score = float(nums[-1])
                                return round(score, 2)
                            except ValueError:
                                continue
        
        # Find scores in specific format
        score_patterns = [
            r'(?:Score|Final Score|Total|Points?):\s*([0-9]*\.?[0-9]+)',
            r'([0-9]*\.?[0-9]+)\s*(?:points?|pts?)',
        ]
        
        for pattern in score_patterns:
            matches = re.findall(pattern, response, re.IGNORECASE)
            if matches:
                try:
                    score = float(matches[-1])
                    return round(score, 2)
                except ValueError:
                    continue
        
        # Extract all numbers, take the last one
        all_numbers = re.findall(r'[0-9]*\.?[0-9]+', response)
        if all_numbers:
            try:
                score = float(all_numbers[-1])
                return round(score, 2)
            except ValueError:
                pass
        
        return 0.0
    
    def _extract_prediction_for_display(self, prediction, num_answers=10):
        """Extract predicted answer for display"""
        try:
            extracted_answers = get_answer_str(prediction, return_origin=False, num_answers=num_answers)
            valid_answers = []
            
            for ans in extracted_answers:
                if ans and ans.strip():
                    cleaned_ans = ' '.join(ans.strip().replace('\n', ' ').replace('\r', ' ').split())
                    if cleaned_ans:
                        valid_answers.append(cleaned_ans)
            
            return ", ".join(valid_answers) if valid_answers else ""
        except Exception:
            # Fall back to extract_boxed_answer
            try:
                extracted = extract_boxed_answer(prediction)
                if extracted and extracted.strip():
                    cleaned = ' '.join(extracted.strip().replace('\n', ' ').replace('\r', ' ').split())
                    return cleaned if cleaned else ""
            except Exception:
                pass
            return ""

    def _aggregate_results(self, parallel_results, eval_data, dataset_key):
        """Aggregate parallel evaluation results"""
        fine_grained_total_score = 0.0
        coarse_grained_total_score = 0.0
        total_score = 0.0  # Use earned_points (maximum value) as total score
        max_possible_score = 0.0
        
        for i, result in enumerate(parallel_results):
            if result is None:
                safe_print(f"⚠️  Question {i+1} evaluation failed, skipping")
                continue
                
            fine_score = result['fine_grained_score']
            coarse_score = result['coarse_grained_score']
            earned_points = result.get('earned_points', max(fine_score, coarse_score))
            item_points = result['item_total_points']
            
            # Accumulate various scores
            fine_grained_total_score = round(fine_grained_total_score + fine_score, 2)
            coarse_grained_total_score = round(coarse_grained_total_score + coarse_score, 2)
            total_score = round(total_score + earned_points, 2)  # Total score uses earned_points
            
            max_possible_score += item_points
        
        # Calculate final results
        max_possible_score = round(max_possible_score, 2)
        fine_rate = round((fine_grained_total_score / max_possible_score * 100), 2) if max_possible_score > 0 else 0.0
        coarse_rate = round((coarse_grained_total_score / max_possible_score * 100), 2) if max_possible_score > 0 else 0.0
        total_rate = round((total_score / max_possible_score * 100), 2) if max_possible_score > 0 else 0.0
        
        return {
            'dataset': dataset_key,
            'fine_grained_total_score': fine_grained_total_score,
            'fine_grained_score_rate': fine_rate,
            'coarse_grained_total_score': coarse_grained_total_score,
            'coarse_grained_score_rate': coarse_rate,
            'total_score': total_score,  # This is the sum of earned_points
            'score_rate': total_rate,
            'max_possible_score': max_possible_score,
            'total_count': len(parallel_results),
        }

    def _save_evaluation_results(self, results, dataset_key, eval_data, parallel_results):
        """Persist the summary, per-question details and an Excel sheet.

        Files are written under ``<output_dir>/<dataset_key>[_<model_name>]/``:
        ``*_score.json`` (the summary), ``*_detailed_results.json`` (one entry
        per successfully evaluated question) and ``*_detailed.xlsx`` (a copy
        of ``eval_data`` with score columns added). A failure while building
        or writing the Excel file is reported but does not raise.

        Args:
            results: Summary dict dumped as the score file.
            dataset_key: Dataset identifier; used in paths and as ``source``.
            eval_data: DataFrame of evaluated rows, positionally aligned with
                ``parallel_results``.
            parallel_results: Per-question result dicts; ``None`` marks a
                question whose evaluation failed.
        """
        # Build output directory and filename containing model name
        if self.model_name:
            output_dir = Path(f"{self.output_dir}/{dataset_key}_{self.model_name}")
            file_prefix = f"{dataset_key}_{self.model_name}"
        else:
            output_dir = Path(f"{self.output_dir}/{dataset_key}")
            file_prefix = dataset_key
        
        output_dir.mkdir(parents=True, exist_ok=True)
        
        # Save summary results
        score_file = output_dir / f"{file_prefix}_score.json"
        dump(results, str(score_file))
        
        # Build detailed results
        detailed_results = []
        # Row position -> (detailed item, plain extracted prediction). Failed
        # questions have no entry, so the Excel columns stay aligned with
        # eval_data instead of coming up short.
        excel_rows = {}
        for i, result in enumerate(parallel_results):
            if result is None:
                continue
            
            row = eval_data.iloc[i]
            # Use earned_points field, if not available calculate maximum value (aligned with EuPhO2024 logic)
            earned_points = result.get('earned_points', max(result['fine_grained_score'], result['coarse_grained_score']))
            
            # NOTE(review): splitting on ", " also splits a single answer that
            # itself contains ", "; the extractor would need to return a list to
            # avoid that.
            extracted_parts = (
                [ans.strip() for ans in result['extracted_pred'].split(", ") if ans.strip()]
                if result['extracted_pred'] else []
            )
            
            detailed_item = {
                "id": str(row.get('id', f"{dataset_key}_{i+1}")),
                "context": str(row.get('context', '')).strip(),
                "question": str(row.get('question', '')).strip(),
                "solution": str(row.get('solution', '')).strip(),
                "marking": result['marking'] if result['marking'] else [],
                "marking_detailed_scores": result['marking_detailed_scores'] if result['marking_detailed_scores'] else [],
                "answer": [f"\\boxed{{{ans}}}" for ans in result['ground_truth']] if result['ground_truth'] else [''],
                "answer_type": result['answer_type'] if result['answer_type'] else ['Open-End'],
                "unit": result['unit'] if result['unit'] else [''],
                "points": result['points'] if result['points'] else [0.0],
                "modality": str(row.get('modality', 'text')).strip(),
                "field": str(row.get('field', '')).strip(),
                "subfield": str(row.get('subfield', '')).strip(),
                "source": dataset_key,
                "test_result": str(result['prediction']),
                "test_answer": [f"\\boxed{{{ans}}}" for ans in extracted_parts] if result['extracted_pred'] else [''],
                "fine_grained_score": result['fine_grained_score'],
                "coarse_grained_score": result['coarse_grained_score'],
                "earned_points": earned_points
            }
            detailed_results.append(detailed_item)
            # Keep the un-boxed text so LaTeX braces inside answers survive in Excel
            excel_rows[i] = (detailed_item, ", ".join(extracted_parts))
        
        # Save detailed results
        detailed_file = output_dir / f"{file_prefix}_detailed_results.json"
        dump(detailed_results, str(detailed_file))
        
        # Save Excel format (with evaluation results)
        try:
            eval_data_with_results = eval_data.copy()
            columns = {
                'fine_grained_score': [],
                'coarse_grained_score': [],
                'earned_points': [],
                'extracted_prediction': [],
                'marking_detailed_scores': [],
            }
            for position in range(len(eval_data_with_results)):
                entry = excel_rows.get(position)
                if entry is None:
                    # Question was not evaluated: leave its result cells empty
                    for values in columns.values():
                        values.append(None)
                    continue
                item, plain_prediction = entry
                columns['fine_grained_score'].append(item['fine_grained_score'])
                columns['coarse_grained_score'].append(item['coarse_grained_score'])
                columns['earned_points'].append(item['earned_points'])
                columns['extracted_prediction'].append(plain_prediction)
                # Convert marking detailed scores to readable string format for Excel
                columns['marking_detailed_scores'].append(
                    json.dumps(item['marking_detailed_scores'], ensure_ascii=False)
                    if item['marking_detailed_scores'] else '[]'
                )
            
            for column_name, values in columns.items():
                eval_data_with_results[column_name] = values
            
            detailed_xlsx_file = output_dir / f"{file_prefix}_detailed.xlsx"
            dump(eval_data_with_results, str(detailed_xlsx_file))
        except Exception as e:
            safe_print(f"⚠️  Failed to save detailed Excel file: {e}")
        
        safe_print(f"💾 Evaluation results saved to: {output_dir}")

    def evaluate_all_datasets(self, judge_kwargs: Optional[Dict] = None) -> Dict[str, Dict]:
        """Evaluate every detected dataset and save a combined summary.

        A dataset whose evaluation raises is reported with its traceback and
        recorded as ``None``; the remaining datasets are still evaluated.

        Args:
            judge_kwargs: Passed through to ``evaluate_dataset``.

        Returns:
            Mapping of dataset key to its results dict (``None`` on failure),
            or an empty dict when no dataset is available.
        """
        dataset_keys = self.detect_available_datasets()
        if not dataset_keys:
            safe_print("❌ No available datasets found")
            return {}

        safe_print(f"🎯 Starting evaluation of {len(dataset_keys)} datasets...")

        banner = '=' * 60
        collected = {}
        for dataset_key in dataset_keys:
            safe_print(f"\n{banner}")
            display_name = self.DATASET_CONFIGS[dataset_key]['display_name']
            safe_print(f"📊 Evaluating: {display_name}")
            safe_print(f"{banner}")

            try:
                outcome = self.evaluate_dataset(dataset_key, judge_kwargs)
                collected[dataset_key] = outcome

                # Per-dataset summary; a malformed outcome lands in the except below
                safe_print(f"\n✅ {display_name} evaluation completed!")
                safe_print(f"🏆 Overall score: {outcome['total_score']:.2f} / {outcome['max_possible_score']:.2f} ({outcome['score_rate']:.2f}%)")
                safe_print(f"📊 Fine-grained total score: {outcome['fine_grained_total_score']:.2f} ({outcome['fine_grained_score_rate']:.2f}%)")
                safe_print(f"🎯 Coarse-grained total score: {outcome['coarse_grained_total_score']:.2f} ({outcome['coarse_grained_score_rate']:.2f}%)")
                safe_print(f"📈 Number of evaluated questions: {outcome['total_count']}")

            except Exception as e:
                import traceback
                safe_print(f"❌ evaluation {dataset_key} failed: {e}")
                safe_print(f"Error details: {traceback.format_exc()}")
                collected[dataset_key] = None

        # Persist the cross-dataset summary, failures included
        self._save_summary_results(collected)

        return collected

    def evaluate_multiple_runs(self, dataset_key: str, judge_kwargs: Optional[Dict] = None) -> Dict:
        """Grade every stored run of a dataset and summarise across runs.

        Each run is evaluated question by question in parallel, aggregated on
        its own, and then combined into cross-run statistics that are saved.

        Args:
            dataset_key: Key into ``DATASET_CONFIGS``.
            judge_kwargs: Options for the judge model; copied into every task.

        Returns:
            The multi-run statistics dict, or ``None`` when at most one run
            is found for the dataset.
        """
        judge_kwargs = {} if judge_kwargs is None else judge_kwargs

        display_name = self.DATASET_CONFIGS[dataset_key]['display_name']
        safe_print(f"🚀 Starting multiple runs evaluation: {display_name}")

        # Multi-run statistics need at least two runs
        run_dirs = self.detect_multiple_runs(dataset_key)
        run_count = len(run_dirs)
        if run_count <= 1:
            safe_print(f"⚠️  Dataset {dataset_key} only has {run_count} runs, cannot perform multiple runs evaluation")
            return None

        safe_print(f"📊 Found {run_count} runs: {', '.join(run_dirs)}")

        # Inference outputs of all runs, keyed by run directory
        runs_to_grade = self.load_multiple_runs_results(dataset_key)

        per_run = {}
        for run_dir, inference_results in runs_to_grade.items():
            safe_print(f"\n📈 Evaluating {run_dir}...")

            eval_data = pd.DataFrame(inference_results)
            judge_model = self._init_judge_model(judge_kwargs)

            # One task per row; the row position doubles as the progress key
            indices = list(range(len(eval_data)))
            tasks = [
                (judge_model, eval_data.iloc[position], position, judge_kwargs.copy())
                for position in indices
            ]

            # Location of the intermediate results file for this run
            if self.model_name:
                output_dir = Path(f"{self.output_dir}/{dataset_key}_multiple_runs_{self.model_name}")
                tmp_file_name = f"parallel_tmp_{run_dir}_{self.model_name}.pkl"
            else:
                output_dir = Path(f"{self.output_dir}/{dataset_key}_multiple_runs")
                tmp_file_name = f"parallel_tmp_{run_dir}.pkl"

            output_dir.mkdir(parents=True, exist_ok=True)
            tmp_file = output_dir / tmp_file_name

            parallel_results = track_progress_rich(
                self._evaluate_single_problem,
                tasks,
                nproc=self.nproc,
                chunksize=max(1, self.nproc//2),
                keys=indices,
                save=str(tmp_file)
            )

            run_summary = self._aggregate_results(parallel_results, eval_data, f"{dataset_key}_{run_dir}")
            per_run[run_dir] = {
                'results': run_summary,
                'detailed_results': parallel_results,
                'eval_data': eval_data
            }

            # Best-effort removal of the intermediate file
            try:
                if tmp_file.exists():
                    tmp_file.unlink()
            except Exception:
                pass

            safe_print(f"   ✅ {run_dir} evaluation completed: {run_summary['total_score']:.2f}/{run_summary['max_possible_score']:.2f} ({run_summary['score_rate']:.2f}%)")

        multi_run_stats = self._calculate_multi_run_statistics(per_run, dataset_key)
        self._save_multi_run_results(multi_run_stats, dataset_key, per_run)

        return multi_run_stats

    def _calculate_multi_run_statistics(self, run_evaluation_results: Dict, dataset_key: str) -> Dict:
        """Compute per-question and overall statistics across several runs.

        The question list is taken from the first run. Results of every run
        are attached to those questions by id; questions that appear only in
        later runs are ignored.

        The per-question score prefers the result's ``earned_points`` and
        falls back to the larger of the fine- and coarse-grained scores. This
        is the same rule ``_aggregate_results`` applies to run totals, so the
        per-question statistics and the run totals agree.

        Args:
            run_evaluation_results: Mapping of run directory to a dict with
                ``'results'``, ``'detailed_results'`` and ``'eval_data'``.
            dataset_key: Key into ``DATASET_CONFIGS``; also used to build
                fallback question ids.

        Returns:
            Dict with the dataset identity, run list, ``question_statistics``,
            ``overall_statistics`` and each run's own summary.
        """
        safe_print(f"\n📊 Calculating multiple runs statistics...")
        
        run_dirs = list(run_evaluation_results.keys())
        num_runs = len(run_dirs)
        
        # Get question information from first run as baseline
        first_run = list(run_evaluation_results.values())[0]
        first_eval_data = first_run['eval_data']
        num_questions = len(first_eval_data)
        
        # Initialize statistical information for each question
        question_stats = {}
        for i in range(num_questions):
            row = first_eval_data.iloc[i]
            question_id = str(row.get('id', f"{dataset_key}_{i+1}"))
            # Some rows carry 'points' and others 'point'; look the field up once
            raw_points = row.get('points', row.get('point', [0.0]))
            question_stats[question_id] = {
                'question_id': question_id,
                'question': str(row.get('question', '')).strip(),
                'context': str(row.get('context', '')).strip(),
                'answer': row.get('answer', []),
                'points': raw_points,
                'max_points': sum(self._safe_parse_points_field(raw_points)),
                'runs': {},
                'statistics': {}
            }
        
        # Collect results from each run
        for run_dir, run_data in run_evaluation_results.items():
            detailed_results = run_data['detailed_results']
            eval_data = run_data['eval_data']
            
            for i, result in enumerate(detailed_results):
                if result is None:
                    continue
                
                row = eval_data.iloc[i]
                question_id = str(row.get('id', f"{dataset_key}_{i+1}"))
                
                if question_id in question_stats:
                    # Same scoring rule as the single-run aggregation
                    earned_score = result.get(
                        'earned_points',
                        max(result['fine_grained_score'], result['coarse_grained_score'])
                    )
                    
                    question_stats[question_id]['runs'][run_dir] = {
                        'fine_grained_score': result['fine_grained_score'],
                        'coarse_grained_score': result['coarse_grained_score'],
                        'earned_score': earned_score,
                        'prediction': result['prediction'],
                        'extracted_prediction': result['extracted_pred']
                    }
        
        # Calculate statistical information for each question
        for question_id, question_data in question_stats.items():
            runs_data = question_data['runs']
            if not runs_data:
                # No run produced a result: 'statistics' stays empty
                continue
            
            scores = [run_data['earned_score'] for run_data in runs_data.values()]
            fine_scores = [run_data['fine_grained_score'] for run_data in runs_data.values()]
            coarse_scores = [run_data['coarse_grained_score'] for run_data in runs_data.values()]
            
            question_data['statistics'] = {
                'num_runs': len(scores),
                'mean_score': round(np.mean(scores), 2),
                'std_score': round(np.std(scores), 2),
                'min_score': round(np.min(scores), 2),
                'max_score': round(np.max(scores), 2),
                'mean_fine_score': round(np.mean(fine_scores), 2),
                'mean_coarse_score': round(np.mean(coarse_scores), 2),
                'score_rate': round(np.mean(scores) / question_data['max_points'] * 100, 2) if question_data['max_points'] > 0 else 0.0
            }
        
        # Calculate overall statistical information
        overall_stats = self._calculate_overall_multi_run_stats(run_evaluation_results, question_stats)
        
        config = self.DATASET_CONFIGS[dataset_key]
        return {
            'dataset_key': dataset_key,
            'dataset_name': config['display_name'],
            'num_runs': num_runs,
            'run_dirs': run_dirs,
            'question_statistics': question_stats,
            'overall_statistics': overall_stats,
            'run_results': {run_dir: data['results'] for run_dir, data in run_evaluation_results.items()}
        }

    def _calculate_overall_multi_run_stats(self, run_evaluation_results: Dict, question_stats: Dict) -> Dict:
        """Calculate overall multiple run statistics"""
        run_dirs = list(run_evaluation_results.keys())
        num_runs = len(run_dirs)
        
        # Collect overall scores from each run
        run_total_scores = []
        run_max_scores = []
        run_score_rates = []
        
        for run_dir, run_data in run_evaluation_results.items():
            results = run_data['results']
            run_total_scores.append(results['total_score'])
            run_max_scores.append(results['max_possible_score'])
            run_score_rates.append(results['score_rate'])
        
        # Calculate average score for each question
        question_mean_scores = []
        question_max_points = []
        
        for question_data in question_stats.values():
            if question_data['runs']:
                question_mean_scores.append(question_data['statistics']['mean_score'])
                question_max_points.append(question_data['max_points'])
        
        total_mean_score = sum(question_mean_scores)
        total_max_score = sum(question_max_points)
        
        return {
            'num_runs': num_runs,
            'num_questions': len(question_stats),
            'mean_total_score': round(np.mean(run_total_scores), 2),
            'std_total_score': round(np.std(run_total_scores), 2),
            'min_total_score': round(np.min(run_total_scores), 2),
            'max_total_score': round(np.max(run_total_scores), 2),
            'mean_score_rate': round(np.mean(run_score_rates), 2),
            'std_score_rate': round(np.std(run_score_rates), 2),
            'question_based_mean_score': round(total_mean_score, 2),
            'question_based_max_score': round(total_max_score, 2),
            'question_based_score_rate': round(total_mean_score / total_max_score * 100, 2) if total_max_score > 0 else 0.0
        }

    def _save_summary_results(self, all_results):
        """Write the cross-dataset summary files and print the final report.

        Two files are written into ``self.output_dir``:
        ``all_datasets_summary.json`` with the full results and
        ``summary_table.json`` with one condensed row per dataset that
        produced results.

        Args:
            all_results: Mapping of dataset key to its summary dict, or
                ``None`` for a dataset whose evaluation failed.
        """
        root = Path(self.output_dir)
        root.mkdir(parents=True, exist_ok=True)

        # Full results, failed datasets included
        summary_file = root / "all_datasets_summary.json"
        dump(all_results, str(summary_file))

        # Condensed table: failed datasets (None) are left out
        summary_table = [
            {
                'dataset': self.DATASET_CONFIGS[key]['display_name'],
                'dataset_key': key,
                'total_questions': summary['total_count'],
                'total_score': summary['total_score'],
                'max_possible_score': summary['max_possible_score'],
                'score_rate': summary['score_rate'],
                'total_count': summary['total_count'],
                'fine_grained_score': summary['fine_grained_total_score'],
                'fine_grained_rate': summary['fine_grained_score_rate'],
                'coarse_grained_score': summary['coarse_grained_total_score'],
                'coarse_grained_rate': summary['coarse_grained_score_rate']
            }
            for key, summary in all_results.items()
            if summary is not None
        ]

        summary_table_file = root / "summary_table.json"
        dump(summary_table, str(summary_table_file))

        # Overall figures across every dataset that has results
        scored = [summary for summary in all_results.values() if summary]
        total_score_all = sum(summary['total_score'] for summary in scored)
        total_max_all = sum(summary['max_possible_score'] for summary in scored)
        if total_max_all > 0:
            overall_rate = round((total_score_all / total_max_all * 100), 2)
        else:
            overall_rate = 0.0

        rule = '=' * 80
        safe_print(f"\n{rule}")
        safe_print(f"🏆 All datasets evaluation completed! Summary results:")
        safe_print(f"{rule}")

        safe_print(f"📊 Overall performance: {total_score_all:.2f} / {total_max_all:.2f} ({overall_rate:.2f}%)")
        safe_print(f"📁 Detailed results saved to: {root}/")
        safe_print(f"💾 Summary file: {summary_file}")
        safe_print(f"📋 Summary table: {summary_table_file}")

        for entry in summary_table:
            safe_print(f"   {entry['dataset']}: {entry['score_rate']:.2f}% ({entry['total_score']:.1f}/{entry['max_possible_score']:.1f})")

    def _save_multi_run_results(self, multi_run_stats: Dict, dataset_key: str, run_evaluation_results: Dict):
        """Persist the multi-run statistics of one dataset and print a recap.

        Inside ``<output_dir>/<dataset_key>_multiple_runs[_<model_name>]`` this
        writes the full statistics as JSON, a per-question Excel sheet (when at
        least one question has statistics) and a per-run Excel sheet (when at
        least one listed run has evaluation results).

        Args:
            multi_run_stats: Aggregated statistics; read for the keys
                ``question_statistics``, ``run_dirs``, ``overall_statistics``
                and ``dataset_name``.
            dataset_key: Dataset identifier used in directory and file names.
            run_evaluation_results: Mapping of run directory to a dict whose
                ``results`` entry holds that run's scores.

        Returns:
            The ``multi_run_stats`` argument, unchanged.
        """
        # The model name, when set, is appended to both directory and file names
        name_suffix = f"_{self.model_name}" if self.model_name else ""
        output_dir = Path(f"{self.output_dir}/{dataset_key}_multiple_runs{name_suffix}")
        file_prefix = f"{dataset_key}{name_suffix}"

        output_dir.mkdir(parents=True, exist_ok=True)

        # Full statistics as JSON
        stats_file = output_dir / f"{file_prefix}_multi_run_statistics.json"
        dump(multi_run_stats, str(stats_file))

        # Per-question rows for the Excel sheet
        question_stats_list = []
        for question_id, question_data in multi_run_stats['question_statistics'].items():
            stats = question_data['statistics']
            if not stats:
                continue

            # One "<run>_score" column per run; runs missing this question count as 0.0
            run_scores = {
                f"{run_dir}_score": (
                    question_data['runs'][run_dir]['earned_score']
                    if run_dir in question_data['runs']
                    else 0.0
                )
                for run_dir in multi_run_stats['run_dirs']
            }

            # Keep the sheet readable: cap the question text at 100 characters
            if len(question_data['question']) > 100:
                question_preview = question_data['question'][:100] + '...'
            else:
                question_preview = question_data['question']

            row = {
                'question_id': question_id,
                'question': question_preview,
                'max_points': question_data['max_points'],
                'mean_score': stats['mean_score'],
                'std_score': stats['std_score'],
                'min_score': stats['min_score'],
                'max_score': stats['max_score'],
                'score_rate': stats['score_rate'],
                'num_runs': stats['num_runs'],
            }
            row.update(run_scores)
            question_stats_list.append(row)

        if question_stats_list:
            question_stats_file = output_dir / f"{file_prefix}_question_statistics.xlsx"
            pd.DataFrame(question_stats_list).to_excel(question_stats_file, index=False)
            safe_print(f"📊 Question statistics saved: {question_stats_file}")

        # Per-run rows: (output column, key in the run's results dict)
        run_columns = (
            ('total_score', 'total_score'),
            ('max_possible_score', 'max_possible_score'),
            ('score_rate', 'score_rate'),
            ('total_count', 'total_count'),
            ('fine_grained_score', 'fine_grained_total_score'),
            ('fine_grained_rate', 'fine_grained_score_rate'),
            ('coarse_grained_score', 'coarse_grained_total_score'),
            ('coarse_grained_rate', 'coarse_grained_score_rate'),
        )
        run_summary_list = []
        for run_dir in multi_run_stats['run_dirs']:
            if run_dir not in run_evaluation_results:
                continue
            results = run_evaluation_results[run_dir]['results']
            run_row = {'run_id': run_dir}
            for column, source_key in run_columns:
                run_row[column] = results[source_key]
            run_summary_list.append(run_row)

        if run_summary_list:
            run_summary_file = output_dir / f"{file_prefix}_run_summary.xlsx"
            pd.DataFrame(run_summary_list).to_excel(run_summary_file, index=False)
            safe_print(f"📈 Run summary saved: {run_summary_file}")

        # Console recap
        overall = multi_run_stats['overall_statistics']
        safe_print("\n🏆 Multiple runs evaluation completed!")
        safe_print(f"📊 Dataset: {multi_run_stats['dataset_name']}")
        safe_print(f"🔄 Number of runs: {overall['num_runs']}")
        safe_print(f"📝 Number of questions: {overall['num_questions']}")
        safe_print(f"📈 Average total score: {overall['mean_total_score']:.2f} ± {overall['std_total_score']:.2f}")
        safe_print(f"🎯 Average score rate: {overall['mean_score_rate']:.2f}% ± {overall['std_score_rate']:.2f}%")
        safe_print(f"📋 Question-based average: {overall['question_based_mean_score']:.2f}/{overall['question_based_max_score']:.2f} ({overall['question_based_score_rate']:.2f}%)")
        safe_print(f"💾 Detailed results saved to: {output_dir}")

        return multi_run_stats

    def _save_all_multi_run_summary(self, all_multi_run_results: Dict):
        """Save summary of multiple runs results for all datasets.

        Writes ``all_datasets_multi_run_summary.json`` (the mapping as
        received) into ``self.output_dir`` and, when at least one dataset has
        results, ``multi_run_summary_table.xlsx`` with one row per dataset.
        A per-dataset recap and the output locations are then printed.

        Args:
            all_multi_run_results: Mapping of dataset key to that dataset's
                multi-run statistics dict (read for ``overall_statistics``).
                Entries whose value is ``None`` are skipped.
        """
        output_dir = Path(self.output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        # Save complete results
        summary_file = output_dir / "all_datasets_multi_run_summary.json"
        dump(all_multi_run_results, str(summary_file))

        # Create summary table
        summary_table = []
        for dataset_key, multi_run_results in all_multi_run_results.items():
            if multi_run_results is None:
                continue

            config = self.DATASET_CONFIGS[dataset_key]
            overall = multi_run_results['overall_statistics']

            summary_table.append({
                'dataset': config['display_name'],
                'dataset_key': dataset_key,
                'num_runs': overall['num_runs'],
                'num_questions': overall['num_questions'],
                'mean_total_score': overall['mean_total_score'],
                'std_total_score': overall['std_total_score'],
                'mean_score_rate': overall['mean_score_rate'],
                'std_score_rate': overall['std_score_rate'],
                'question_based_mean_score': overall['question_based_mean_score'],
                'question_based_max_score': overall['question_based_max_score'],
                'question_based_score_rate': overall['question_based_score_rate']
            })

        # Bind the path before the branch: it used to be assigned only inside
        # the `if`, so the final report raised UnboundLocalError whenever no
        # dataset produced results.
        summary_table_file = output_dir / "multi_run_summary_table.xlsx"

        # Save summary table (only when there is something to write)
        if summary_table:
            summary_table_df = pd.DataFrame(summary_table)
            summary_table_df.to_excel(summary_table_file, index=False)
            safe_print(f"📋 Multiple runs summary table saved: {summary_table_file}")

        # Print final summary
        safe_print(f"\n{'='*80}")
        safe_print(f"🏆 All datasets multiple runs evaluation completed! Summary results:")
        safe_print(f"{'='*80}")

        for item in summary_table:
            safe_print(f"   {item['dataset']}: {item['mean_score_rate']:.2f}% ± {item['std_score_rate']:.2f}% ({item['num_runs']} runs)")

        safe_print(f"📁 Detailed results saved to: {output_dir}/")
        safe_print(f"💾 Summary file: {summary_file}")
        # Only advertise the table file when it was actually written
        if summary_table:
            safe_print(f"📋 Summary table: {summary_table_file}")
