import os
import re
import pandas as pd
import json
import glob
import warnings
import time
import threading
import datetime
import numpy as np
from pathlib import Path
from typing import Dict, List, Optional, Union, Any

# Load API configuration from the .env file
from dotenv import load_dotenv
load_dotenv('.env')

from vlmeval.smp import load, dump, gpt_key_set
from vlmeval.dataset.physics_r1 import grade, extract_boxed_answer, get_answer_str, answer_tag_reward_fn_for_r1
from vlmeval.dataset.utils import build_judge
from vlmeval.utils import track_progress_rich

# Module-wide lock used to serialize console output across worker threads
output_lock = threading.Lock()

def safe_print(*args, **kwargs):
    """Call the builtin ``print`` while holding ``output_lock``.

    All positional and keyword arguments are forwarded to ``print`` unchanged.
    """
    output_lock.acquire()
    try:
        print(*args, **kwargs)
    finally:
        # Always release, even if print() raises (e.g. closed stream)
        output_lock.release()

class LogBuffer:
    """Per-task log collector.

    Messages are accumulated in memory by ``log`` and emitted later in one go,
    either to stdout (``flush``) or appended to a file (``save_log_to_file``).
    Neither emitter clears the buffer, so both may be used for the same lines.
    """

    def __init__(self, task_id):
        """Create an empty buffer whose lines are tagged with ``task_id``."""
        self.task_id = task_id
        self.logs = []
        self.start_time = datetime.datetime.now()

    def log(self, message):
        """Store ``message`` prefixed with a millisecond timestamp and the task id."""
        now = datetime.datetime.now()
        # %f is microseconds (6 digits); dropping the last 3 leaves milliseconds
        stamp = now.strftime("%H:%M:%S.%f")[:-3]
        entry = f"[{stamp}] [{self.task_id}] {message}"
        self.logs.append(entry)

    def flush(self):
        """Print every buffered line followed by one blank line, under ``output_lock``."""
        with output_lock:
            for entry in self.logs:
                print(entry)
            # Blank separator between the output of different tasks
            print()

    def save_log_to_file(self, log_file_path):
        """Append every buffered line to ``log_file_path`` (UTF-8, one line each)."""
        with open(log_file_path, 'a', encoding='utf-8') as handle:
            handle.writelines(entry + '\n' for entry in self.logs)

class UniversalPhysicsEvaluator:
    """
    Universal Physics Competition Evaluation System
    
    Automated evaluation of model predictions on physics competition datasets.
    The datasets known to the evaluator are the keys of ``DATASET_CONFIGS``:
    - ipho_2025: International Physics Olympiad (IPhO) 2025
    - apho_2025: Asian Physics Olympiad (APhO) 2025
    - eupho_2025: European Physics Olympiad (EuPhO) 2025
    - nbpho_2025: NBPhO 2025
    - panpho_2025: Pan Pearl River Delta Physics Olympiad (PanPhO) 2025
    - panmechanics_2025: PanMechanics 2025
    - fma_2025: US F=MA competition 2025
    
    Supports both fine-grained and coarse-grained evaluation.
    """
    
    # Dataset configuration mapping
    # Key: dataset identifier used throughout the evaluator.
    # Value: 'display_name' (human-readable label used in console output) and
    # 'json_file' (file name of the predictions JSON, resolved against results_dir).
    # The methods also understand entries that carry a 'result_pattern' glob
    # instead of 'json_file'; no such entry is configured here.
    DATASET_CONFIGS = {
        # Datasets in JSON file format
        'ipho_2025': {
            'display_name': 'IPhO 2025 (JSON Format)',
            'json_file': 'ipho_2025_with_predictions_boxed.json'
        },
        'apho_2025': {
            'display_name': 'APhO 2025 (JSON Format)',
            'json_file': 'apho_2025_with_predictions_boxed.json'
        },
        'eupho_2025': {
            'display_name': 'EuPhO 2025 (JSON Format)',
            'json_file': 'eupho_2025_with_predictions_boxed.json'
        },
        'nbpho_2025': {
            'display_name': 'NBPhO 2025 (JSON Format)',
            'json_file': 'nbpho_2025_with_predictions_boxed.json'
        },
        'panpho_2025': {
            'display_name': 'PanPhO 2025 (JSON Format)',
            'json_file': 'panpho_2025_with_predictions_boxed.json'
        },
        'panmechanics_2025': {
            'display_name': 'PanMechanics 2025 (JSON Format)',
            'json_file': 'panmechanics_2025_with_predictions_boxed.json'
        },
        'fma_2025': {
            'display_name': 'FMA 2025 (JSON Format)',
            'json_file': 'fma_2025_with_predictions_boxed.json'
        }
    }

    def __init__(self, results_dir: str = "results", output_dir: str = "results", dataset_name: str = "", nproc: int = 4):
        """
        Initialize the evaluator
        
        Args:
            results_dir: directory containing inference results
            nproc: number of parallel processes
        """
        self.results_dir = Path(results_dir)
        self.output_dir = Path(output_dir)
        self.dataset_name = dataset_name
        self.nproc = nproc
        
        if not self.results_dir.exists():
            raise FileNotFoundError(f"Results directory not found: {results_dir}")
    
    def detect_available_datasets(self) -> List[str]:
        """Detect which configured datasets have inference results on disk.

        For entries with a ``json_file`` the file must exist directly under
        ``results_dir``. For entries with a ``result_pattern`` the first
        directory matching the glob must contain at least one
        ``run_*/inference_results.json``. A status line is printed for every
        configured dataset.

        Returns:
            Keys of ``DATASET_CONFIGS`` whose results were found, in
            configuration order.
        """
        available_datasets = []
        results_root = Path(self.results_dir)

        for dataset_key, config in self.DATASET_CONFIGS.items():
            # Check if it is a direct JSON file
            if 'json_file' in config:
                # Join with pathlib: os.path.join returns a plain str, which has
                # no .exists() method and made this branch raise AttributeError.
                json_file_path = results_root / config['json_file']
                if json_file_path.exists():
                    available_datasets.append(dataset_key)
                    safe_print(f"✅ Found JSON dataset: {config['display_name']} ({dataset_key})")
                else:
                    safe_print(f"⚠️  Skipped {dataset_key}: JSON file does not exist ({json_file_path})")
                continue

            # Check if the corresponding result directory exists
            result_pattern = config.get('result_pattern')
            if not result_pattern:
                # A misconfigured entry should not abort detection of the others
                safe_print(f"⚠️  Skipped {dataset_key}: neither 'json_file' nor 'result_pattern' is configured")
                continue

            result_dirs = list(results_root.glob(result_pattern))
            if not result_dirs:
                safe_print(f"⚠️  Skipped {dataset_key}: result directory does not exist ({result_pattern})")
                continue

            # Check for inference result files (only the first match is inspected)
            result_dir = result_dirs[0]
            inference_files = list(result_dir.glob("run_*/inference_results.json"))
            if not inference_files:
                safe_print(f"⚠️  Skipped {dataset_key}: no inference result files found ({result_dir})")
                continue

            available_datasets.append(dataset_key)
            safe_print(f"✅ Found dataset: {config['display_name']} ({dataset_key})")

        return available_datasets

    def detect_multiple_runs(self, dataset_key: str) -> List[str]:
        """Detect multiple runs for a dataset"""
        config = self.DATASET_CONFIGS[dataset_key]
        
        # JSON format does not support multiple runs
        if 'json_file' in config:
            return []
        
        result_pattern = config['result_pattern']
        result_dirs = list(self.results_dir.glob(result_pattern))
        
        if not result_dirs:
            return []
        
        result_dir = result_dirs[0]
        run_dirs = []
        
        # Check for subdirectories with format run_XX
        for item in result_dir.iterdir():
            if item.is_dir() and item.name.startswith('run_'):
                # Verify the presence of inference result file
                inference_file = item / "inference_results.json"
                if inference_file.exists():
                    run_dirs.append(item.name)
        
        # Sort by run index
        run_dirs.sort(key=lambda x: int(x.split('_')[1] if '_' in x else 0) if x.split('_')[1].isdigit() else 0)
        return run_dirs

    def has_multiple_runs(self, dataset_key: str) -> bool:
        """Check whether the dataset has multiple runs"""
        return len(self.detect_multiple_runs(dataset_key)) > 1

    def load_inference_results(self, dataset_key: str, run_id: Optional[str] = None) -> List[Dict]:
        """Load the inference results of one dataset.

        JSON-file datasets (config has ``json_file``) are read from
        ``results_dir/<json_file>`` and filtered down to dict entries that
        carry both an ``id`` and a ``prediction`` key. Pattern-based datasets
        are read from ``<first dir matching result_pattern>/<run>/inference_results.json``
        where ``<run>`` is ``run_id`` if given, otherwise the highest-numbered
        ``run_*`` directory.

        Args:
            dataset_key: key into ``DATASET_CONFIGS``.
            run_id: name of a specific run directory; not used for JSON-file datasets.

        Returns:
            The loaded result entries.

        Raises:
            FileNotFoundError: if the JSON file, the result directory, or the
                required ``inference_results.json`` cannot be found.
        """
        config = self.DATASET_CONFIGS[dataset_key]
        results_root = Path(self.results_dir)

        # Check if this is a direct JSON file path
        if 'json_file' in config:
            json_file_path = results_root / config['json_file']
            # Routed through safe_print (not bare print) so worker output cannot interleave
            safe_print(f"\n{json_file_path}\n")
            if not json_file_path.exists():
                raise FileNotFoundError(f"JSON file not found: {json_file_path}")

            safe_print(f"📁 Loading JSON inference results: {json_file_path}")
            with open(json_file_path, 'r', encoding='utf-8') as f:
                results = json.load(f)

            # Filter out invalid entries (e.g., those only containing the "information" field)
            valid_results = [
                item for item in results
                if isinstance(item, dict) and 'id' in item and 'prediction' in item
            ]

            safe_print(f"   Number of inference results: {len(valid_results)} (raw entries: {len(results)})")
            return valid_results

        # Original directory structure handling
        result_pattern = config['result_pattern']

        # Find all matching result directories
        result_dirs = list(results_root.glob(result_pattern))
        if not result_dirs:
            raise FileNotFoundError(f"No inference results found for pattern: {result_pattern}")

        result_dir = result_dirs[0]  # Use the first directory found
        safe_print(f"📁 Loading inference results: {result_dir}")

        if run_id:
            # Load the specified run
            inference_file = result_dir / run_id / "inference_results.json"
            if not inference_file.exists():
                raise FileNotFoundError(f"No inference_results.json found in {result_dir}/{run_id}")
            safe_print(f"   Using specified run: {inference_file}")
        else:
            # Search for inference_results.json under run_* directories
            inference_files = list(result_dir.glob("run_*/inference_results.json"))
            if not inference_files:
                raise FileNotFoundError(f"No inference_results.json files found in {result_dir}")

            def run_sort_key(path):
                # Compare the run index numerically so run_10 ranks above run_9
                # (a plain string max would pick run_9); the directory name
                # breaks ties and orders non-numeric suffixes.
                name = path.parent.name
                token = name.split('_')[1] if '_' in name else ''
                return (int(token) if token.isdigit() else 0, name)

            # Load the latest inference result
            inference_file = max(inference_files, key=run_sort_key)
            safe_print(f"   Using latest file: {inference_file}")

        with open(inference_file, 'r', encoding='utf-8') as f:
            results = json.load(f)

        safe_print(f"   Number of inference results: {len(results)}")
        return results

    def load_multiple_runs_results(self, dataset_key: str) -> Dict[str, List[Dict]]:
        """Load the inference results of every detected run of a dataset.

        Runs that fail to load are reported and left out of the result.

        Returns:
            Mapping of run directory name to its loaded results.

        Raises:
            ValueError: if no run directory is detected for ``dataset_key``.
        """
        run_names = self.detect_multiple_runs(dataset_key)
        if not run_names:
            raise ValueError(f"No multiple runs found for dataset: {dataset_key}")

        results_by_run = {}
        for run_name in run_names:
            try:
                loaded = self.load_inference_results(dataset_key, run_name)
                results_by_run[run_name] = loaded
                safe_print(f"✅ Loaded {run_name}: {len(loaded)} entries")
            except Exception as e:
                # Best effort: one unreadable run must not discard the others
                safe_print(f"⚠️  Skipped {run_name}: {e}")

        return results_by_run

    def prepare_evaluation_data(self, dataset_key: str) -> pd.DataFrame:
        """Load a dataset's inference results into a DataFrame and validate its columns.

        Returns:
            One row per inference result entry.

        Raises:
            ValueError: if the ``prediction`` or ``answer`` column is missing
                (the first missing one is named in the message).
        """
        # The inference result JSON is expected to already carry every field needed
        records = self.load_inference_results(dataset_key)
        eval_data = pd.DataFrame(records)

        safe_print(f"✅ Directly loaded evaluation data, total {len(eval_data)} records")
        safe_print(f"📊 Data columns: {list(eval_data.columns)}")

        # Report the first required column that is absent, if any
        missing = next(
            (name for name in ('prediction', 'answer') if name not in eval_data.columns),
            None,
        )
        if missing is not None:
            raise ValueError(f"Missing required field: {missing}")

        return eval_data

    def _safe_parse_json_field(self, field_value):
        """Normalize a JSON-like field value into a list.

        - a list is returned unchanged (same object);
        - ``None``, NaN, blank or whitespace-only text, and the text ``'nan'``
          yield ``[]``;
        - text of the form ``[...]`` is decoded as JSON; if decoding fails the
          stripped text is returned as a single-element list;
        - anything else yields ``[str(value).strip()]``.
        """
        # Return directly if already a list
        if isinstance(field_value, list):
            return field_value

        # Check None and NaN
        if field_value is None:
            return []

        try:
            if pd.isna(field_value):
                return []
        except (TypeError, ValueError):
            # pd.isna may not produce a single boolean for some inputs
            # (e.g. array-likes); treat those as "not NaN" and continue.
            pass

        field_str = str(field_value).strip()
        # Whitespace-only text carries no value either; previously it produced
        # [''] (a truthy list) while '' produced [].
        if not field_str or field_str == 'nan':
            return []

        if field_str.startswith('[') and field_str.endswith(']'):
            try:
                return json.loads(field_str)
            except json.JSONDecodeError:
                # Looks like a list but is not valid JSON: keep the raw text
                return [field_str]

        return [field_str]
    
    def _safe_parse_points_field(self, points_value):
        """Normalize the points field into a list of floats without raising.

        - a list (or JSON list text) is converted element-wise, skipping
          ``None`` entries;
        - a number or numeric text yields a single-element list;
        - ``None``, NaN and anything that cannot be converted yield ``[0.0]``.

        Returns:
            List of floats (empty only when the input list has no non-None items).
        """
        def to_floats(values):
            # None entries are skipped; any entry float() rejects (non-numeric
            # text, nested list, ...) invalidates the whole list.
            try:
                return [float(p) for p in values if p is not None]
            except (TypeError, ValueError):
                return None

        # If it's already a list, convert directly
        if isinstance(points_value, list):
            converted = to_floats(points_value)
            # Previously a bad element raised here instead of falling back
            return converted if converted is not None else [0.0]

        # Check None
        if points_value is None:
            return [0.0]

        # Check NaN
        try:
            if pd.isna(points_value):
                return [0.0]
        except (TypeError, ValueError):
            # pd.isna may not produce a single boolean for some inputs
            pass

        if isinstance(points_value, (int, float)):
            return [float(points_value)]

        points_str = str(points_value).strip()
        if points_str.startswith('[') and points_str.endswith(']'):
            try:
                decoded = json.loads(points_str)
            except (json.JSONDecodeError, ValueError):
                decoded = None
            if decoded is not None:
                converted = to_floats(decoded)
                if converted is not None:
                    return converted

        try:
            return [float(points_str)]
        except ValueError:
            return [0.0]

    def _has_valid_marking(self, marking):
        """Tell whether ``marking`` is a non-empty list with at least one usable criterion.

        An entry is usable when it is a non-empty list, a string that is
        neither blank nor one of ``nan``/``none``/``null`` (case-insensitive),
        or any other non-None value.
        """
        if not marking or not isinstance(marking, list):
            return False

        placeholders = ('', 'nan', 'none', 'null')

        def is_usable(entry):
            if entry is None:
                return False
            if isinstance(entry, list):
                return len(entry) > 0
            if isinstance(entry, str):
                text = entry.strip()
                return bool(text) and text.lower() not in placeholders
            # Any other non-None value counts as a criterion
            return True

        return any(is_usable(entry) for entry in marking)

    def evaluate_dataset(self, dataset_key: str, judge_kwargs: Optional[Dict] = None) -> Dict:
        """Run the full evaluation pipeline for one dataset.

        Steps: load the evaluation data, initialize the judge model, evaluate
        every problem through ``track_progress_rich``, aggregate and save the
        results, then remove the temporary progress file.

        Args:
            dataset_key: key into ``DATASET_CONFIGS``.
            judge_kwargs: judge configuration; each task receives its own copy.

        Returns:
            The aggregated results produced by ``_aggregate_results``.
        """
        judge_kwargs = {} if judge_kwargs is None else judge_kwargs

        display_name = self.DATASET_CONFIGS[dataset_key]['display_name']
        safe_print(f"🚀 Start evaluating dataset: {display_name}")

        eval_data = self.prepare_evaluation_data(dataset_key)
        judge_model = self._init_judge_model(judge_kwargs)

        num_problems = len(eval_data)
        safe_print(f"📊 Start parallel evaluation, total {num_problems} problems...")

        # One task tuple per row; the positional index doubles as the result key
        indices = list(range(num_problems))
        tasks = [
            (judge_model, eval_data.iloc[i], i, judge_kwargs.copy())
            for i in indices
        ]

        safe_print(f"🔄 Launching parallel evaluation, number of tasks: {len(tasks)}")

        # Progress file handed to track_progress_rich via its ``save`` argument
        intermediate_dir = Path(f"{self.output_dir}/intermediate_results")
        intermediate_dir.mkdir(parents=True, exist_ok=True)
        tmp_file = intermediate_dir / f"{dataset_key}_parallel_tmp.pkl"

        parallel_results = track_progress_rich(
            self._evaluate_single_problem,
            tasks,
            nproc=self.nproc,
            chunksize=max(1, self.nproc // 2),
            keys=indices,
            save=str(tmp_file),
        )

        safe_print("✅ Parallel evaluation complete, aggregating results...")

        results = self._aggregate_results(parallel_results, eval_data, dataset_key)
        self._save_evaluation_results(results, dataset_key, eval_data, parallel_results)

        # Cleanup is best effort: a leftover temp file must not fail the evaluation
        try:
            if tmp_file.exists():
                tmp_file.unlink()
                safe_print(f"🗑️  Cleaned up temp file: {tmp_file}")
        except Exception as e:
            safe_print(f"⚠️  Failed to clean temp file: {e}")

        return results

    def _init_judge_model(self, judge_kwargs):
        """Build the judge model described by ``judge_kwargs``.

        Returns:
            The judge built by ``build_judge`` when a model other than
            ``'exact_matching'`` is requested, the API key check passes and the
            model reports that it is working; otherwise ``None`` (a warning is
            issued for every failure other than "no model specified").
        """
        safe_print("🔧 Initializing judge model")

        judge_model_name = judge_kwargs.get('model', None)
        safe_print(f"   🤖 Specified model name: {judge_model_name}")

        if not judge_model_name or judge_model_name == 'exact_matching':
            safe_print("⚠️  No judge model specified, only final-answer evaluation will be performed")
            return None

        safe_print("   🔑 Checking API key...")
        if not gpt_key_set():
            safe_print("   ❌ API_KEY not set or invalid")
            warnings.warn('Invalid API_KEY, skipping process evaluation')
            return None

        safe_print("   ✅ API key detected")
        try:
            # Defaults first; caller-supplied options (except model/nproc) override them
            model_kwargs = {
                'model': judge_model_name,
                'timeout': 100,
                'retry': 10,
                'max_tokens': 16384,
                'verbose': False,
            }
            model_kwargs.update(
                (key, value)
                for key, value in judge_kwargs.items()
                if key not in ('model', 'nproc')
            )
            candidate = build_judge(**model_kwargs)
            if candidate.working():
                safe_print(f"🤖 Using judge model: {judge_model_name}")
                return candidate
            safe_print("   ❌ Judge API not working")
            warnings.warn('Judge API not working, skipping process evaluation')
        except Exception as e:
            safe_print(f"   ❌ Failed to initialize model: {e}")
            warnings.warn(f'Model initialization failed: {e}, skipping process evaluation')

        return None

    def _evaluate_single_problem(self, judge_model, row, index, judge_kwargs):
        """Evaluate one problem; designed to be called from the parallel runner.

        Runs the fine-grained and the coarse-grained evaluation and reports the
        larger of the two scores as ``earned_points``. All log lines of the
        task are buffered, then printed and appended to
        ``<output_dir>/<dataset_name>_detailed_logs.txt`` in one go.

        Args:
            judge_model: judge used for fine-grained scoring, or ``None``.
            row: mapping-like record of the problem (supports ``.get`` and ``in``).
            index: zero-based position of the problem.
            judge_kwargs: not used by this method.

        Returns:
            Result dict for the problem, or ``None`` if any step raised.
        """
        log_buffer = LogBuffer(f"Problem {index + 1}")

        def emit_logs():
            # Print the buffered lines, then append them to the detailed log file
            log_buffer.flush()
            log_buffer.save_log_to_file(self.output_dir / f"{self.dataset_name}_detailed_logs.txt")

        try:
            log_buffer.log(f"📖 Start evaluation - ID: {row.get('id', 'N/A')}")

            # Extract fields
            prediction = str(row['prediction']).strip()
            ground_truth = self._safe_parse_json_field(row.get('answer', ''))
            answer_type = self._safe_parse_json_field(row.get('answer_type', 'Open-End'))
            unit = self._safe_parse_json_field(row.get('unit', ''))

            # Prefer the 'points' field; fall back to 'point' (backward compatibility)
            raw_points = row.get('points', row.get('point', 0))
            points = self._safe_parse_points_field(raw_points)

            # Remember which field supplied the points (for debugging)
            if 'points' in row:
                points_field_used = 'points'
            elif 'point' in row:
                points_field_used = 'point'
            else:
                points_field_used = 'default'

            marking = self._safe_parse_json_field(row.get('marking', ''))

            log_buffer.log("📝 Problem info:")
            log_buffer.log(f"   - Prediction length: {len(prediction)} characters")
            log_buffer.log(f"   - Ground truth: {ground_truth}")
            log_buffer.log(f"   - Points: {points} (field: {points_field_used})")
            log_buffer.log(f"   - Number of marking criteria: {len(marking) if marking else 0}")

            item_total_points = sum(points) if points else 0.0
            log_buffer.log(f"   - Total points for this item: {item_total_points}")

            # Both evaluations always run (aligned with EuPhO2024 logic)
            log_buffer.log("🔍 Starting fine-grained evaluation...")
            question = row.get('question', '')
            fine_grained_score, marking_detailed_scores = self._evaluate_fine_grained_with_buffer(
                prediction, marking, points, judge_model, question, log_buffer
            )
            log_buffer.log(f"✅ Fine-grained score: {fine_grained_score}")

            # The coarse-grained step also receives the fine-grained score
            log_buffer.log("🎯 Starting coarse-grained evaluation...")
            coarse_grained_score, extracted_pred = self._evaluate_coarse_grained_with_buffer(
                prediction, ground_truth, answer_type, unit, points,
                fine_grained_score, question, log_buffer
            )
            log_buffer.log(f"✅ Coarse-grained score: {coarse_grained_score}")
            log_buffer.log(f"📤 Extracted prediction: {extracted_pred}")

            # Final score is the maximum of the two
            final_score = max(fine_grained_score, coarse_grained_score)
            log_buffer.log(f"📊 Final score: {final_score} (fine: {fine_grained_score}, coarse: {coarse_grained_score})")

            result = dict(
                index=index,
                fine_grained_score=fine_grained_score,
                coarse_grained_score=coarse_grained_score,
                extracted_pred=extracted_pred,
                marking_detailed_scores=marking_detailed_scores,
                item_total_points=item_total_points,
                ground_truth=ground_truth,
                answer_type=answer_type,
                unit=unit,
                points=points,
                marking=marking,
                prediction=prediction,
                earned_points=final_score,  # earned points equal the max score
            )

            log_buffer.log(f"✅ Evaluation complete, final score: {final_score}")
            emit_logs()
            return result

        except Exception as e:
            log_buffer.log(f"❌ Evaluation failed: {e}")
            import traceback
            log_buffer.log(f"📄 Error details: {traceback.format_exc()}")
            emit_logs()
            return None



    def _evaluate_fine_grained_with_buffer(self, prediction, marking, points, judge_model, question, log_buffer):
        """Fine-grained evaluation with retry mechanism (with log buffer)"""
        log_buffer.log(f"   🔍 Fine-grained evaluation begins")
        log_buffer.log(f"      - Number of marking criteria: {len(marking) if marking else 0}")
        log_buffer.log(f"      - judge_model: {'present' if judge_model else 'absent'}")
        
        if not marking or not judge_model:
            log_buffer.log(f"   ⚠️  Skipping fine-grained evaluation: {'no marking criteria' if not marking else 'no judge model'}")
            return 0.0, []
        
        # Check whether there are multiple marking sets (aligned with EuPhO2024 logic)
        has_multiple_marking_sets = self._has_multiple_marking_sets(marking)
        if has_multiple_marking_sets:
            log_buffer.log(f"   📋 Detected multiple marking sets, total {len(marking)} sets")
            return self._evaluate_multiple_marking_sets_with_buffer(prediction, marking, points, judge_model, question, log_buffer)
        else:
            log_buffer.log(f"   📋 Single marking set")
            return self._evaluate_single_marking_set_with_buffer(prediction, marking, points, judge_model, question, log_buffer)
    
    def _has_multiple_marking_sets(self, marking):
        """Return True when ``marking`` is non-empty and its first element is a list."""
        # Only the first element is inspected to decide between one set and many
        return bool(marking) and isinstance(marking[0], list)
    
    def _evaluate_multiple_marking_sets_with_buffer(self, prediction, marking_sets, points, judge_model, question, log_buffer):
        """Evaluate multiple marking sets and take the highest score"""
        best_score = 0.0
        best_detailed_scores = []
        all_marking_results = []
        
        max_possible_score = sum(points) if points else 0.0
        
        for set_idx, marking_set in enumerate(marking_sets):
            log_buffer.log(f"   📊 Evaluating marking set {set_idx + 1}")
            
            score, detailed_scores = self._evaluate_single_marking_set_with_buffer(
                prediction, marking_set, points, judge_model, question, log_buffer
            )
            
            # Record result of each set
            marking_result = {
                'marking_set_index': set_idx + 1,
                'score': score,
                'detailed_scores': detailed_scores,
                'max_possible_score': max_possible_score
            }
            all_marking_results.append(marking_result)
            
            log_buffer.log(f"      ✅ Score of set {set_idx + 1}: {score:.2f}")
            
            # Update the best score
            if score > best_score:
                best_score = score
                best_detailed_scores = detailed_scores
                # Add a tag to the best detailed scores
                for detailed_score in best_detailed_scores:
                    detailed_score['best_marking_set'] = set_idx + 1
        
        log_buffer.log(f"   🏆 Final score with multiple sets: {best_score:.2f} (from set {[r['marking_set_index'] for r in all_marking_results if r['score'] == best_score][0]})")
        
        return round(best_score, 2), best_detailed_scores
    
    def _evaluate_single_marking_set_with_buffer(self, prediction, marking, points, judge_model, question, log_buffer):
        """Score a prediction against one marking set, retrying when the total is too high.

        The marking is turned into criteria by ``_parse_marking_criteria``; each
        criterion is read here through its ``'description'`` and ``'index'`` keys.
        Every attempt scores all criteria with
        ``_evaluate_single_criterion_with_buffer`` and sums the scores. An
        attempt is accepted when the sum does not exceed ``sum(points)`` (or
        when that maximum is 0). Otherwise the whole set is re-scored, up to
        ``max_retries`` additional times; if the last attempt still exceeds
        the maximum, its scores are scaled down proportionally so that they
        sum to the maximum.

        Returns:
            ``(total_score, detailed_scores)`` where ``total_score`` is rounded
            to 2 decimals and ``detailed_scores`` holds one dict per criterion
            (criterion text, score, index, attempt number, judge response and
            retry/adjustment bookkeeping).
        """
        scoring_criteria = self._parse_marking_criteria(marking)
        max_possible_score = sum(points) if points else 0.0
        max_retries = 3
        
        log_buffer.log(f"      📊 Evaluation config:")
        log_buffer.log(f"         - Number of criteria: {len(scoring_criteria)}")
        log_buffer.log(f"         - Maximum total score: {max_possible_score}")
        log_buffer.log(f"         - Max retries: {max_retries}")
        
        # One initial attempt plus up to max_retries retries
        for attempt in range(max_retries + 1):
            log_buffer.log(f"      🔄 Start attempt {attempt + 1}")
            # Scores are rebuilt from scratch on every attempt
            scores = []
            detailed_scores = []
            
            # Evaluate each marking criterion
            for i, criterion in enumerate(scoring_criteria):
                # Only the first 50 characters of the description are logged
                log_buffer.log(f"         📏 Criterion {i+1}/{len(scoring_criteria)}: {criterion['description'][:50]}{'...' if len(criterion['description']) > 50 else ''}")
                score, response = self._evaluate_single_criterion_with_buffer(
                    prediction, criterion, judge_model, question, 
                    max_total_score=max_possible_score, 
                    current_attempt=attempt,
                    log_buffer=log_buffer
                )
                scores.append(score)
                log_buffer.log(f"            ➡️ Score: {score}")
                
                # Record detailed score
                detailed_scores.append({
                    'marking_criterion': criterion['description'],
                    'score': round(score, 2),
                    'index': criterion['index'],
                    'attempt': attempt + 1,
                    'judge_response': response
                })
            
            total_score = sum(scores)
            log_buffer.log(f"      📊 Total score of attempt {attempt + 1}: {total_score} (per-criterion: {scores})")
            
            # Check whether the total score exceeds the maximum
            # (a maximum of 0 disables the check, so any total is accepted)
            if total_score <= max_possible_score or max_possible_score == 0:
                # Valid score; mark success
                for detailed_score in detailed_scores:
                    detailed_score['retry_info'] = f"Attempt {attempt + 1} succeeded" if attempt > 0 else "Succeeded on first attempt"
                    detailed_score['total_attempts'] = attempt + 1
                    detailed_score['final_success'] = True
                
                if attempt > 0:
                    log_buffer.log(f"      ✅ Attempt {attempt + 1} succeeded, total {total_score:.2f} <= {max_possible_score:.2f}")
                else:
                    log_buffer.log(f"      ✅ First attempt succeeded, total {total_score:.2f} <= {max_possible_score:.2f}")
                
                return round(total_score, 2), detailed_scores
            else:
                # Score exceeds limit; prepare for retry
                if attempt < max_retries:
                    log_buffer.log(f"      ⚠️  Attempt {attempt + 1} exceeded max: {total_score:.2f} > {max_possible_score:.2f}, retrying attempt {attempt + 2}...")
                else:
                    # Reached max retries; enforce scaling
                    log_buffer.log(f"      ❌ Reached maximum retries ({max_retries + 1}); total still exceeds limit: {total_score:.2f} > {max_possible_score:.2f}")
                    log_buffer.log(f"      📊 Enforcing proportional scaling...")
                    
                    # total_score > max_possible_score and max_possible_score != 0 on this path
                    scale_factor = max_possible_score / total_score
                    adjusted_scores = []
                    
                    log_buffer.log(f"         📐 Scale factor: {scale_factor:.3f}")
                    for i, score in enumerate(scores):
                        adjusted_score = score * scale_factor
                        adjusted_scores.append(adjusted_score)
                        log_buffer.log(f"            Criterion {i+1}: {score:.2f} -> {adjusted_score:.2f}")
                        # Keep the unscaled (rounded) score next to the adjusted one
                        detailed_scores[i]['original_score'] = detailed_scores[i]['score']
                        detailed_scores[i]['score'] = round(adjusted_score, 2)
                        detailed_scores[i]['retry_info'] = f"Forced adjustment after {max_retries + 1} attempts"
                        detailed_scores[i]['total_attempts'] = max_retries + 1
                        detailed_scores[i]['forced_adjustment'] = True
                        detailed_scores[i]['scale_factor'] = round(scale_factor, 3)
                        detailed_scores[i]['final_success'] = False
                    
                    return round(sum(adjusted_scores), 2), detailed_scores
        
        # Defensive fallback: the last loop iteration returns in both branches above
        return 0.0, []

    def _evaluate_coarse_grained_with_buffer(self, prediction, ground_truth, answer_type, unit, points, fine_grained_score, question, log_buffer):
        """Coarse-grained evaluation (fully aligned with EuPhO2024 logic)"""
        log_buffer.log(f"   🎯 Coarse-grained evaluation begins")
        log_buffer.log(f"      - Ground truth: {ground_truth}")
        log_buffer.log(f"      - Fine-grained score: {fine_grained_score}")
        
        extracted_pred = ""
        
        if ground_truth:
            log_buffer.log(f"      ✅ Ground truth present, starting answer matching evaluation")
            try:
                # Extract predicted answers for display
                num_expected_answers = len(ground_truth)
                log_buffer.log(f"      📤 Extracting predicted answers (expecting {num_expected_answers} answers)")
                extracted_pred = self._extract_prediction_for_display(prediction, num_expected_answers)
                log_buffer.log(f"      📝 Extraction result: {extracted_pred}")
                
                # Multi-answer evaluation
                log_buffer.log(f"      🔍 Starting multi-answer matching evaluation")
                answer_score = self._evaluate_multiple_answers_with_buffer(prediction, ground_truth, points, question, log_buffer)
                log_buffer.log(f"      📊 Answer matching score: {answer_score}")
                
                if answer_score > 0:
                    # Answers correct; use answer score
                    log_buffer.log(f"      ✅ Correct answer(s), using answer score: {answer_score}")
                    return round(answer_score, 2), extracted_pred
                else:
                    # Incorrect answer; zero score
                    log_buffer.log(f"      ❌ Incorrect answer(s), score is zero")
                    return 0.0, extracted_pred
            except Exception as e:
                # Evaluation failed; use the already computed fine-grained score
                log_buffer.log(f"      ⚠️  Answer evaluation failed: {e}, score is zero")
                return 0.0, extracted_pred
        
        # If no ground truth, try to extract predicted answers for display
        log_buffer.log(f"      ⚠️  No ground truth, attempting to extract predicted answers for display")
        if not extracted_pred:
            try:
                extracted_pred = self._extract_prediction_for_display(prediction, 10)
                log_buffer.log(f"      📝 Extracted predicted answers: {extracted_pred}")
            except Exception as e:
                log_buffer.log(f"      ❌ Failed to extract predicted answers: {e}")
                extracted_pred = ""
        
        # Without ground truth, use fine-grained score
        log_buffer.log(f"      📊 Using fine-grained score as final: {fine_grained_score}")
        return round(fine_grained_score, 2), extracted_pred

    def _evaluate_coarse_grained_simple_with_buffer(self, prediction, ground_truth, answer_type, unit, points, question, log_buffer):
        """Simplified coarse-grained evaluation - only answer matching (with log buffer)"""
        log_buffer.log(f"   🎯 Coarse-grained evaluation begins")
        log_buffer.log(f"      - Ground truth: {ground_truth}")
        
        extracted_pred = ""
        
        if ground_truth:
            log_buffer.log(f"      ✅ Ground truth present, starting answer matching evaluation")
            try:
                # Extract predicted answers for display
                num_expected_answers = len(ground_truth)
                log_buffer.log(f"      📤 Extracting predicted answers (expecting {num_expected_answers} answers)")
                extracted_pred = self._extract_prediction_for_display(prediction, num_expected_answers)
                log_buffer.log(f"      📝 Extraction result: {extracted_pred}")
                
                # Multi-answer evaluation
                log_buffer.log(f"      🔍 Starting multi-answer matching evaluation")
                answer_score = self._evaluate_multiple_answers_with_buffer(prediction, ground_truth, points, question, log_buffer)
                log_buffer.log(f"      📊 Answer matching score: {answer_score}")
                
                return round(answer_score, 2), extracted_pred
                
            except Exception as e:
                log_buffer.log(f"      ⚠️  Answer evaluation failed: {e}, returning 0")
                return 0.0, extracted_pred
        
        # If no ground truth, try to extract predicted answers for display
        log_buffer.log(f"      ⚠️  No ground truth, attempting to extract predicted answers for display")
        if not extracted_pred:
            try:
                extracted_pred = self._extract_prediction_for_display(prediction, 10)
                log_buffer.log(f"      📝 Extracted predicted answers: {extracted_pred}")
            except Exception as e:
                log_buffer.log(f"      ❌ Failed to extract predicted answers: {e}")
                extracted_pred = ""
        
        log_buffer.log(f"      📊 No ground truth, returning 0")
        return 0.0, extracted_pred

    def _evaluate_multiple_answers_with_buffer(self, prediction, ground_truth_list, points_list, question="", log_buffer=None):
        """Multi-answer evaluation (with log buffer)"""
        if not ground_truth_list:
            return 0.0
            
        # Ensure consistent lengths
        actual_length = min(len(ground_truth_list), len(points_list))
        ground_truth_list = ground_truth_list[:actual_length]
        points_list = points_list[:actual_length]
        
        try:
            # Use the multi-answer evaluation function from physics_r1
            total_score, total_point, extracted_preds, extracted_gts, scored_by_list = answer_tag_reward_fn_for_r1(
                prediction, ground_truth_list, problem=question, points=points_list, 
                use_xverify=True, debug=True, log_callback=log_buffer.log if log_buffer else None
            )
            if log_buffer:
                log_buffer.log(f"         📊 Scoring details: scored_by={scored_by_list}, total_point={total_point}")
            return total_point
        except Exception as e:
            if log_buffer:
                log_buffer.log(f"[DEBUG] Exception in answer evaluation: {str(e)}")
            # Fallback to individual evaluation
            return self._fallback_individual_evaluation_with_buffer(prediction, ground_truth_list, points_list, question, log_buffer)

    def _fallback_individual_evaluation_with_buffer(self, prediction, ground_truth_list, points_list, question="", log_buffer=None):
        """Fallback individual evaluation (with log buffer)"""
        try:
            num_answers = len(ground_truth_list)
            extracted_answers = get_answer_str(prediction, return_origin=False, num_answers=num_answers)
            
            total_earned_score = 0.0
            for extracted_ans, gt_answer, points in zip(extracted_answers, ground_truth_list, points_list):
                if extracted_ans and extracted_ans.strip():
                    try:
                        is_correct, _, _, _ = grade(extracted_ans, gt_answer, False, problem=question, 
                                                 use_xverify=True, debug=True, 
                                                 log_callback=log_buffer.log if log_buffer else None)
                        if is_correct:
                            total_earned_score += points
                    except Exception as e:
                        if log_buffer:
                            log_buffer.log(f"[DEBUG] Exception in grade call: {str(e)}")
                        pass
            
            return total_earned_score
        except Exception as e:
            if log_buffer:
                log_buffer.log(f"[DEBUG] Exception in fallback evaluation: {str(e)}")
            return 0.0

    def _parse_marking_criteria(self, marking_list):
        """Parse marking criteria"""
        criteria = []
        if not marking_list:
            return criteria
        
        for i, marking_criterion in enumerate(marking_list):
            if marking_criterion and str(marking_criterion).strip():
                criteria.append({
                    'description': str(marking_criterion).strip(),
                    'index': i
                })
        
        return criteria

    def _evaluate_single_criterion_with_buffer(self, prediction, criterion, judge_model, question, max_total_score=None, current_attempt=0, log_buffer=None):
        """Evaluate a single criterion with the judge model and retry mechanism (with log buffer)"""
        log_buffer.log(f"         🤖 Calling judge model for criterion evaluation")
        
        # Build total score constraint hint
        total_score_warning = ""
        if max_total_score is not None and max_total_score > 0:
            total_score_warning = f"""
⚠️  IMPORTANT TOTAL SCORE CONSTRAINT:
- This question has a maximum total score of {max_total_score} points
- ALL marking criteria scores combined MUST NOT exceed {max_total_score} points
- You are evaluating ONE criterion among multiple criteria for this question
- Be conservative in your scoring to ensure the total doesn't exceed the limit
- This is attempt #{current_attempt + 1} of evaluation"""
        
        prompt = f"""You are an expert physics competition grader. Evaluate the student's solution against the specific grading criterion.

PHYSICS PROBLEM:
{question}

STUDENT'S SOLUTION:
{prediction}

GRADING CRITERION:
{criterion['description']}{total_score_warning}

INSTRUCTIONS:
1. Carefully analyze the student's solution for physics concepts, mathematical derivations, and calculations.
2. Compare the solution against the specific grading criterion provided.
3. Award points strictly according to the criterion, including partial credit when specified.
4. BE CONSERVATIVE - remember this is one of multiple criteria being evaluated simultaneously.

SCORING FORMAT:
- Read the grading criterion carefully to understand the maximum points and conditions for partial credit
- Evaluate whether the student's solution meets the full criteria, partial criteria, or no criteria
- Output your score using the exact format: \\boxed{{score}}
- The score should be a number (e.g., 0.4, 0.2, 0.1, 0.0)

CRITICAL REQUIREMENTS:
- You MUST output your final score in the format: \\boxed{{score}}
- The score must be a single number only (no text inside the boxed)
- BE CONSERVATIVE to avoid exceeding the total score limit

⚠️ CRITICAL INSTRUCTION: 
- Output ONLY: \\boxed{{score}}
- NO explanations, NO analysis, NO reasoning
- Just the number in the exact format \\boxed{{score}}

RESPOND WITH ONLY THE BOXED SCORE:"""
        
        try:
            log_buffer.log(f"         ⏳ Calling judge model...")
            start_time = time.time()
            
            response = judge_model.generate(prompt).strip()
            
            elapsed_time = time.time() - start_time
            log_buffer.log(f"         ⏱️  Response time: {elapsed_time:.2f}s")
            
            # Extract score
            score = self._extract_score_from_response(response)
            log_buffer.log(f"         🔍 Extracted score: {score}")
            
            return score, response
            
        except Exception as e:
            elapsed_time = time.time() - start_time
            log_buffer.log(f"         ❌ Judge model call failed (elapsed {elapsed_time:.2f}s): {e}")
            return 0.0, f"Judge model call failed: {str(e)}"

    def _extract_score_from_response(self, response):
        """Helper function to extract a score from the model response"""
        if not response:
            return 0.0
            
        response = response.strip()
        
        # Prefer boxed format to extract the score
        boxed_patterns = [
            r'\\boxed\{([^}]+)\}',
            r'boxed\{([^}]+)\}',
        ]
        
        for pattern in boxed_patterns:
            matches = re.findall(pattern, response, re.IGNORECASE)
            for match in reversed(matches):
                match = match.strip()
                if match:
                    try:
                        score = float(match)
                        return round(score, 2)
                    except ValueError:
                        nums = re.findall(r'\d+\.?\d*', match)
                        if nums:
                            try:
                                score = float(nums[-1])
                                return round(score, 2)
                            except ValueError:
                                continue
        
        # Search for specific score formats
        score_patterns = [
            r'(?:Score|Final Score|Total|Points?):\s*([0-9]*\.?[0-9]+)',
            r'([0-9]*\.?[0-9]+)\s*(?:points?|pts?)',
        ]
        
        for pattern in score_patterns:
            matches = re.findall(pattern, response, re.IGNORECASE)
            if matches:
                try:
                    score = float(matches[-1])
                    return round(score, 2)
                except ValueError:
                    continue
        
        # Extract all numbers and take the last one
        all_numbers = re.findall(r'[0-9]*\.?[0-9]+', response)
        if all_numbers:
            try:
                score = float(all_numbers[-1])
                return round(score, 2)
            except ValueError:
                pass
        
        return 0.0
    
    def _extract_prediction_for_display(self, prediction, num_answers=10):
        """Build a single-line display string of the answers found in ``prediction``.

        Answers come from ``get_answer_str``; each non-blank one has its
        whitespace collapsed and the results are joined with ", ". If that
        extraction raises, ``extract_boxed_answer`` is tried instead. Any
        failure there is swallowed and "" is returned.
        """
        def collapse_whitespace(text):
            return ' '.join(text.strip().replace('\n', ' ').replace('\r', ' ').split())

        try:
            raw_answers = get_answer_str(prediction, return_origin=False, num_answers=num_answers)
            shown = []
            for raw in raw_answers:
                if not (raw and raw.strip()):
                    continue
                tidy = collapse_whitespace(raw)
                if tidy:
                    shown.append(tidy)
            return ", ".join(shown)
        except Exception:
            # Fallback to extract_boxed_answer
            try:
                boxed = extract_boxed_answer(prediction)
                if boxed and boxed.strip():
                    return collapse_whitespace(boxed)
            except Exception:
                # Best effort only: a display string is never worth a crash
                pass
            return ""

    def _aggregate_results(self, parallel_results, eval_data, dataset_key):
        """Aggregate parallel evaluation results (aligned with EuPhO2024 logic)"""
        fine_grained_total_score = 0.0
        coarse_grained_total_score = 0.0
        total_score = 0.0  # Use earned_points (max of the two) as total
        max_possible_score = 0.0
        
        for i, result in enumerate(parallel_results):
            if result is None:
                safe_print(f"⚠️  Problem {i+1} failed in evaluation, skipping")
                continue
                
            fine_score = result['fine_grained_score']
            coarse_score = result['coarse_grained_score']
            earned_points = result.get('earned_points', max(fine_score, coarse_score))
            item_points = result['item_total_points']
            
            # Accumulate scores (aligned with EuPhO2024 logic)
            fine_grained_total_score = round(fine_grained_total_score + fine_score, 2)
            coarse_grained_total_score = round(coarse_grained_total_score + coarse_score, 2)
            total_score = round(total_score + earned_points, 2)  # total uses earned_points
            
            max_possible_score += item_points
        
        # Final results
        max_possible_score = round(max_possible_score, 2)
        fine_rate = round((fine_grained_total_score / max_possible_score * 100), 2) if max_possible_score > 0 else 0.0
        coarse_rate = round((coarse_grained_total_score / max_possible_score * 100), 2) if max_possible_score > 0 else 0.0
        total_rate = round((total_score / max_possible_score * 100), 2) if max_possible_score > 0 else 0.0
        
        return {
            'dataset': dataset_key,
            'fine_grained_total_score': fine_grained_total_score,
            'fine_grained_score_rate': fine_rate,
            'coarse_grained_total_score': coarse_grained_total_score,
            'coarse_grained_score_rate': coarse_rate,
            'total_score': total_score,  # sum of earned_points
            'score_rate': total_rate,
            'max_possible_score': max_possible_score,
            'total_count': len(parallel_results),
        }

    def _save_evaluation_results(self, results, dataset_key, eval_data, parallel_results):
        """Persist the summary, per-problem details and an Excel sheet under ``self.output_dir``.

        Three files are written through ``dump``: ``<dataset_key>_score.json``
        (``results``), ``<dataset_key>_detailed_results.json`` (one entry per
        successfully evaluated problem) and ``<dataset_key>_detailed.xlsx``
        (``eval_data`` plus score columns). A failure while building or
        writing the Excel sheet is reported and otherwise ignored.

        Args:
            results: Dataset-level summary to store.
            dataset_key: Used in file names, fallback ids and the ``source`` field.
            eval_data: DataFrame whose row ``i`` belongs to ``parallel_results[i]``.
            parallel_results: Per-problem result dicts; ``None`` marks a failed problem.
        """
        output_dir = Path(self.output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        
        # Save summary
        score_file = output_dir / f"{dataset_key}_score.json"
        dump(results, str(score_file))
        
        # Build detailed results
        detailed_results = []
        # Row-aligned companion of detailed_results for the Excel export: one
        # entry per result, None where the evaluation failed. detailed_results
        # itself skips failures, so it cannot be used for DataFrame columns.
        excel_rows = []
        for i, result in enumerate(parallel_results):
            if result is None:
                excel_rows.append(None)
                continue
            
            row = eval_data.iloc[i]
            # Use earned_points if present; otherwise compute max of the two (aligned with EuPhO2024 logic)
            earned_points = result.get('earned_points', max(result['fine_grained_score'], result['coarse_grained_score']))
            
            extracted_pred = result['extracted_pred']
            if extracted_pred:
                extracted_answers = [ans.strip() for ans in extracted_pred.split(", ") if ans.strip()]
                test_answer = [f"\\boxed{{{ans}}}" for ans in extracted_answers]
            else:
                extracted_answers = []
                test_answer = ['']
            
            detailed_item = {
                "id": str(row.get('id', f"{dataset_key}_{i+1}")),
                "context": str(row.get('context', '')).strip(),
                "question": str(row.get('question', '')).strip(),
                "solution": str(row.get('solution', '')).strip(),
                "marking": result['marking'] if result['marking'] else [],
                "marking_detailed_scores": result['marking_detailed_scores'] if result['marking_detailed_scores'] else [],
                "answer": [f"\\boxed{{{ans}}}" for ans in result['ground_truth']] if result['ground_truth'] else [''],
                "answer_type": result['answer_type'] if result['answer_type'] else ['Open-End'],
                "unit": result['unit'] if result['unit'] else [''],
                "points": result['points'] if result['points'] else [0.0],
                "modality": str(row.get('modality', 'text')).strip(),
                "field": str(row.get('field', '')).strip(),
                "subfield": str(row.get('subfield', '')).strip(),
                "source": dataset_key,
                "test_result": str(result['prediction']),
                "test_answer": test_answer,
                "fine_grained_score": result['fine_grained_score'],
                "coarse_grained_score": result['coarse_grained_score'],
                "earned_points": earned_points
            }
            detailed_results.append(detailed_item)
            # Keep the un-boxed answers verbatim: stripping "\\boxed{" and every
            # "}" from the boxed form would also eat braces that belong to the
            # answer itself (e.g. "\\frac{1}{2}").
            excel_rows.append((detailed_item, ", ".join(extracted_answers)))
        
        # Save detailed results
        detailed_file = output_dir / f"{dataset_key}_detailed_results.json"
        dump(detailed_results, str(detailed_file))
        
        # Save Excel (with evaluation results)
        try:
            def item_column(key):
                """Column of ``key`` from each detailed item; None for failed problems."""
                return [entry[0][key] if entry is not None else None for entry in excel_rows]

            eval_data_with_results = eval_data.copy()
            eval_data_with_results['fine_grained_score'] = item_column('fine_grained_score')
            eval_data_with_results['coarse_grained_score'] = item_column('coarse_grained_score')
            eval_data_with_results['earned_points'] = item_column('earned_points')
            eval_data_with_results['extracted_prediction'] = [
                entry[1] if entry is not None else None for entry in excel_rows
            ]
            # Save marking detailed scores as a JSON string for readability in Excel
            eval_data_with_results['marking_detailed_scores'] = [
                json.dumps(entry[0]['marking_detailed_scores'], ensure_ascii=False)
                if entry is not None and entry[0]['marking_detailed_scores'] else '[]'
                for entry in excel_rows
            ]
            
            detailed_xlsx_file = output_dir / f"{dataset_key}_detailed.xlsx"
            dump(eval_data_with_results, str(detailed_xlsx_file))
        except Exception as e:
            safe_print(f"⚠️  Failed to save detailed Excel file: {e}")
        
        safe_print(f"💾 Evaluation results saved to: {output_dir}")

    def evaluate_all_datasets(self, judge_kwargs: Optional[Dict] = None) -> Dict[str, Dict]:
        """Evaluate every detected dataset and save an overall summary.

        Each dataset is evaluated independently: a failure is reported with its
        traceback and recorded as ``None`` so the remaining datasets still run.

        Returns:
            Mapping of dataset key to its result dict (``None`` for a dataset
            whose evaluation failed); empty when no dataset is available.
        """
        dataset_keys = self.detect_available_datasets()

        if not dataset_keys:
            safe_print("❌ No available datasets found")
            return {}

        safe_print(f"🎯 Start evaluating {len(dataset_keys)} datasets...")

        banner = '=' * 60
        all_results = {}
        for dataset_key in dataset_keys:
            safe_print(f"\n{banner}")
            display_name = self.DATASET_CONFIGS[dataset_key]['display_name']
            safe_print(f"📊 Evaluating: {display_name}")
            safe_print(banner)

            try:
                outcome = self.evaluate_dataset(dataset_key, judge_kwargs)
                all_results[dataset_key] = outcome

                # Per-dataset summary; a malformed result lands in the except branch
                safe_print(f"\n✅ {display_name} evaluation complete!")
                safe_print(f"🏆 Overall score: {outcome['total_score']:.2f} / {outcome['max_possible_score']:.2f} ({outcome['score_rate']:.2f}%)")
                safe_print(f"📊 Fine-grained total: {outcome['fine_grained_total_score']:.2f} ({outcome['fine_grained_score_rate']:.2f}%)")
                safe_print(f"🎯 Coarse-grained total: {outcome['coarse_grained_total_score']:.2f} ({outcome['coarse_grained_score_rate']:.2f}%)")
                safe_print(f"📈 Number of problems: {outcome['total_count']}")
            except Exception as e:
                safe_print(f"❌ Evaluation failed for {dataset_key}: {e}")
                import traceback
                safe_print(f"Error details: {traceback.format_exc()}")
                all_results[dataset_key] = None

        # Save overall summary
        self._save_summary_results(all_results)

        return all_results

    def evaluate_multiple_runs(self, dataset_key: str, judge_kwargs: Optional[Dict] = None) -> Dict:
        """Evaluate every inference run of a dataset and compute cross-run statistics.

        Each run is evaluated in parallel on its own, aggregated, and the
        per-run outcomes are then combined into multi-run statistics which are
        saved before being returned.

        Returns:
            The multi-run statistics dict, or ``None`` when at most one run
            exists for ``dataset_key`` (despite the ``Dict`` annotation).
        """
        judge_kwargs = {} if judge_kwargs is None else judge_kwargs

        display_name = self.DATASET_CONFIGS[dataset_key]['display_name']
        safe_print(f"🚀 Start evaluating multiple runs: {display_name}")

        # Multi-run statistics need at least two runs
        run_dirs = self.detect_multiple_runs(dataset_key)
        if len(run_dirs) <= 1:
            safe_print(f"⚠️  Dataset {dataset_key} has only {len(run_dirs)} run(s); cannot perform multi-run evaluation")
            return None

        safe_print(f"📊 Found {len(run_dirs)} runs: {', '.join(run_dirs)}")

        # Load all runs' inference results
        all_runs_results = self.load_multiple_runs_results(dataset_key)

        run_evaluation_results = {}
        for run_dir, inference_results in all_runs_results.items():
            safe_print(f"\n📈 Evaluating {run_dir}...")

            # Convert to DataFrame for evaluation
            eval_data = pd.DataFrame(inference_results)

            # Initialize judge model
            judge_model = self._init_judge_model(judge_kwargs)

            # One task per row; every task gets its own copy of the judge kwargs
            indices = list(range(len(eval_data)))
            tasks = [
                (judge_model, eval_data.iloc[row_index], row_index, judge_kwargs.copy())
                for row_index in indices
            ]

            # Intermediate result file
            output_dir = Path(f"{self.output_dir}/multiple_runs")
            output_dir.mkdir(parents=True, exist_ok=True)
            tmp_file = output_dir / f"{dataset_key}_parallel_tmp_{run_dir}.pkl"

            # Parallel evaluation
            parallel_results = track_progress_rich(
                self._evaluate_single_problem,
                tasks,
                nproc=self.nproc,
                chunksize=max(1, self.nproc//2),
                keys=indices,
                save=str(tmp_file)
            )

            # Aggregate single-run results
            run_results = self._aggregate_results(parallel_results, eval_data, f"{dataset_key}_{run_dir}")
            run_evaluation_results[run_dir] = dict(
                results=run_results,
                detailed_results=parallel_results,
                eval_data=eval_data,
            )

            # Cleanup temp file; leaving it behind is harmless, so errors are ignored
            try:
                if tmp_file.exists():
                    tmp_file.unlink()
            except Exception:
                pass

            safe_print(f"   ✅ {run_dir} finished: {run_results['total_score']:.2f}/{run_results['max_possible_score']:.2f} ({run_results['score_rate']:.2f}%)")

        # Compute multi-run statistics
        multi_run_stats = self._calculate_multi_run_statistics(run_evaluation_results, dataset_key)

        # Save multi-run evaluation results
        self._save_multi_run_results(multi_run_stats, dataset_key, run_evaluation_results)

        return multi_run_stats

    def _calculate_multi_run_statistics(self, run_evaluation_results: Dict, dataset_key: str) -> Dict:
        """Build per-question and overall statistics across several evaluated runs.

        The first run's ``eval_data`` defines the set of questions. For every
        run, each successfully evaluated problem whose id belongs to that set
        is recorded; its earned score is the fine-grained score when the
        problem has valid marking criteria and the coarse-grained score
        otherwise. Questions with at least one recorded run get mean / std /
        min / max statistics.

        Returns:
            Dict with the dataset key and display name, the run names, the
            per-question statistics, the overall statistics and each run's
            aggregated results.
        """
        safe_print(f"\n📊 Computing multi-run statistics...")

        run_dirs = list(run_evaluation_results.keys())
        num_runs = len(run_dirs)

        # Use the first run's items as the base
        baseline_run = list(run_evaluation_results.values())[0]
        baseline_data = baseline_run['eval_data']
        num_questions = len(baseline_data)

        # One stats skeleton per question of the baseline run
        question_stats = {}
        for position in range(num_questions):
            row = baseline_data.iloc[position]
            question_id = str(row.get('id', f"{dataset_key}_{position+1}"))
            raw_points = row.get('points', row.get('point', [0.0]))
            question_stats[question_id] = {
                'question_id': question_id,
                'question': str(row.get('question', '')).strip(),
                'context': str(row.get('context', '')).strip(),
                'answer': row.get('answer', []),
                'points': raw_points,
                'max_points': sum(self._safe_parse_points_field(raw_points)),
                'runs': {},
                'statistics': {}
            }

        # Collect each run's results
        for run_dir, run_payload in run_evaluation_results.items():
            run_outcomes = run_payload['detailed_results']
            run_frame = run_payload['eval_data']

            for position, outcome in enumerate(run_outcomes):
                if outcome is None:
                    continue

                row = run_frame.iloc[position]
                question_id = str(row.get('id', f"{dataset_key}_{position+1}"))
                if question_id not in question_stats:
                    continue

                # Decide which score to use (fine-grained preferred)
                marking = outcome['marking']
                has_marking = marking and len(marking) > 0 and self._has_valid_marking(marking)
                fine = outcome['fine_grained_score']
                coarse = outcome['coarse_grained_score']

                question_stats[question_id]['runs'][run_dir] = {
                    'fine_grained_score': fine,
                    'coarse_grained_score': coarse,
                    'earned_score': fine if has_marking else coarse,
                    'prediction': outcome['prediction'],
                    'extracted_prediction': outcome['extracted_pred'],
                    'evaluation_method': 'fine_grained' if has_marking else 'coarse_grained'
                }

        # Compute per-question statistics
        for entry in question_stats.values():
            recorded = list(entry['runs'].values())
            if not recorded:
                continue

            earned = [record['earned_score'] for record in recorded]
            fine_scores = [record['fine_grained_score'] for record in recorded]
            coarse_scores = [record['coarse_grained_score'] for record in recorded]
            max_points = entry['max_points']

            entry['statistics'] = {
                'num_runs': len(earned),
                'mean_score': round(np.mean(earned), 2),
                'std_score': round(np.std(earned), 2),
                'min_score': round(np.min(earned), 2),
                'max_score': round(np.max(earned), 2),
                'mean_fine_score': round(np.mean(fine_scores), 2),
                'mean_coarse_score': round(np.mean(coarse_scores), 2),
                'score_rate': round(np.mean(earned) / max_points * 100, 2) if max_points > 0 else 0.0
            }

        # Compute overall statistics
        overall_stats = self._calculate_overall_multi_run_stats(run_evaluation_results, question_stats)

        display_name = self.DATASET_CONFIGS[dataset_key]['display_name']
        per_run_results = {
            run_dir: run_payload['results']
            for run_dir, run_payload in run_evaluation_results.items()
        }
        return {
            'dataset_key': dataset_key,
            'dataset_name': display_name,
            'num_runs': num_runs,
            'run_dirs': run_dirs,
            'question_statistics': question_stats,
            'overall_statistics': overall_stats,
            'run_results': per_run_results
        }

    def _calculate_overall_multi_run_stats(self, run_evaluation_results: Dict, question_stats: Dict) -> Dict:
        """Compute overall statistics for multiple runs"""
        run_dirs = list(run_evaluation_results.keys())
        num_runs = len(run_dirs)
        
        # Collect total scores of each run
        run_total_scores = []
        run_max_scores = []
        run_score_rates = []
        
        for run_dir, run_data in run_evaluation_results.items():
            results = run_data['results']
            run_total_scores.append(results['total_score'])
            run_max_scores.append(results['max_possible_score'])
            run_score_rates.append(results['score_rate'])
        
        # Compute the mean per-question score
        question_mean_scores = []
        question_max_points = []
        
        for question_data in question_stats.values():
            if question_data['runs']:
                question_mean_scores.append(question_data['statistics']['mean_score'])
                question_max_points.append(question_data['max_points'])
        
        total_mean_score = sum(question_mean_scores)
        total_max_score = sum(question_max_points)
        
        return {
            'num_runs': num_runs,
            'num_questions': len(question_stats),
            'mean_total_score': round(np.mean(run_total_scores), 2),
            'std_total_score': round(np.std(run_total_scores), 2),
            'min_total_score': round(np.min(run_total_scores), 2),
            'max_total_score': round(np.max(run_total_scores), 2),
            'mean_score_rate': round(np.mean(run_score_rates), 2),
            'std_score_rate': round(np.std(run_score_rates), 2),
            'question_based_mean_score': round(total_mean_score, 2),
            'question_based_max_score': round(total_max_score, 2),
            'question_based_score_rate': round(total_mean_score / total_max_score * 100, 2) if total_max_score > 0 else 0.0
        }

    def _save_summary_results(self, all_results):
        """Write and print the combined summary for every evaluated dataset.

        Dumps the full results to ``results/all_datasets_summary.json``, a
        condensed per-dataset table to ``results/summary_table.json``, and
        prints the aggregate score followed by one line per dataset.

        Args:
            all_results: Mapping of dataset key to that dataset's result dict;
                entries that are ``None`` are left out of the table.
        """
        output_dir = Path("results")
        output_dir.mkdir(parents=True, exist_ok=True)

        # Full, unmodified results
        summary_file = output_dir / "all_datasets_summary.json"
        dump(all_results, str(summary_file))

        # Condensed table: one row per dataset that produced results
        summary_table = [
            {
                'dataset': self.DATASET_CONFIGS[key]['display_name'],
                'dataset_key': key,
                'total_questions': res['total_count'],
                'total_score': res['total_score'],
                'max_possible_score': res['max_possible_score'],
                'score_rate': res['score_rate'],
                'total_count': res['total_count'],
                'fine_grained_score': res['fine_grained_total_score'],
                'fine_grained_rate': res['fine_grained_score_rate'],
                'coarse_grained_score': res['coarse_grained_total_score'],
                'coarse_grained_rate': res['coarse_grained_score_rate']
            }
            for key, res in all_results.items()
            if res is not None
        ]

        summary_table_file = output_dir / "summary_table.json"
        dump(summary_table, str(summary_table_file))

        # Final console report
        banner = f"{'='*80}"
        safe_print(f"\n{banner}")
        safe_print("🏆 All dataset evaluations complete! Summary:")
        safe_print(banner)

        # Aggregate over every dataset with a truthy result
        valid_results = [res for res in all_results.values() if res]
        total_score_all = sum(res['total_score'] for res in valid_results)
        total_max_all = sum(res['max_possible_score'] for res in valid_results)
        if total_max_all > 0:
            overall_rate = round((total_score_all / total_max_all * 100), 2)
        else:
            overall_rate = 0.0

        safe_print(f"📊 Overall performance: {total_score_all:.2f} / {total_max_all:.2f} ({overall_rate:.2f}%)")
        safe_print(f"📁 Detailed results saved to: {output_dir}/")
        safe_print(f"💾 Summary file: {summary_file}")
        safe_print(f"📋 Summary table: {summary_table_file}")

        for row in summary_table:
            safe_print(f"   {row['dataset']}: {row['score_rate']:.2f}% ({row['total_score']:.1f}/{row['max_possible_score']:.1f})")

    def _save_multi_run_results(self, multi_run_stats: Dict, dataset_key: str, run_evaluation_results: Dict):
        """Write the multi-run statistics of one dataset to disk and print a recap.

        Produces, under ``<self.output_dir>/multiple_runs``:
        - ``<dataset_key>_multi_run_statistics.json``: the full statistics
        - ``<dataset_key>_question_statistics.xlsx``: one row per question
          (skipped when no question has statistics)
        - ``<dataset_key>_run_summary.xlsx``: one row per run (skipped when no
          listed run has evaluation results)

        Args:
            multi_run_stats: Statistics dict providing ``'question_statistics'``,
                ``'run_dirs'``, ``'overall_statistics'`` and ``'dataset_name'``.
            dataset_key: Dataset identifier used as the file-name prefix.
            run_evaluation_results: Mapping of run identifier to a dict whose
                ``'results'`` entry holds that run's scores.

        Returns:
            ``multi_run_stats``, unchanged.
        """
        output_dir = Path(f"{self.output_dir}/multiple_runs")
        output_dir.mkdir(parents=True, exist_ok=True)

        # Full statistics as JSON
        stats_file = output_dir / f"{dataset_key}_multi_run_statistics.json"
        dump(multi_run_stats, str(stats_file))

        run_ids = multi_run_stats['run_dirs']

        # One spreadsheet row per question that has statistics
        question_rows = []
        for qid, qdata in multi_run_stats['question_statistics'].items():
            qstats = qdata['statistics']
            if not qstats:
                continue

            # A run that did not answer this question is reported as 0.0
            per_run = qdata['runs']
            score_columns = {
                f"{run_id}_score": per_run[run_id]['earned_score'] if run_id in per_run else 0.0
                for run_id in run_ids
            }

            # Keep the question text short enough for a spreadsheet cell
            text = qdata['question']
            if len(text) > 100:
                text = text[:100] + '...'

            question_rows.append({
                'question_id': qid,
                'question': text,
                'max_points': qdata['max_points'],
                'mean_score': qstats['mean_score'],
                'std_score': qstats['std_score'],
                'min_score': qstats['min_score'],
                'max_score': qstats['max_score'],
                'score_rate': qstats['score_rate'],
                'num_runs': qstats['num_runs'],
                **score_columns
            })

        if question_rows:
            question_stats_file = output_dir / f"{dataset_key}_question_statistics.xlsx"
            pd.DataFrame(question_rows).to_excel(question_stats_file, index=False)
            safe_print(f"📊 Question statistics saved: {question_stats_file}")

        # One spreadsheet row per run that has evaluation results
        evaluated = [
            (run_id, run_evaluation_results[run_id]['results'])
            for run_id in run_ids
            if run_id in run_evaluation_results
        ]
        run_rows = [
            {
                'run_id': run_id,
                'total_score': res['total_score'],
                'max_possible_score': res['max_possible_score'],
                'score_rate': res['score_rate'],
                'total_count': res['total_count'],
                'fine_grained_score': res['fine_grained_total_score'],
                'fine_grained_rate': res['fine_grained_score_rate'],
                'coarse_grained_score': res['coarse_grained_total_score'],
                'coarse_grained_rate': res['coarse_grained_score_rate']
            }
            for run_id, res in evaluated
        ]

        if run_rows:
            run_summary_file = output_dir / f"{dataset_key}_run_summary.xlsx"
            pd.DataFrame(run_rows).to_excel(run_summary_file, index=False)
            safe_print(f"📈 Run summary saved: {run_summary_file}")

        # Console recap
        totals = multi_run_stats['overall_statistics']
        safe_print("\n🏆 Multi-run evaluation complete!")
        safe_print(f"📊 Dataset: {multi_run_stats['dataset_name']}")
        safe_print(f"🔄 Number of runs: {totals['num_runs']}")
        safe_print(f"📝 Number of problems: {totals['num_questions']}")
        safe_print(f"📈 Mean total score: {totals['mean_total_score']:.2f} ± {totals['std_total_score']:.2f}")
        safe_print(f"🎯 Mean score rate: {totals['mean_score_rate']:.2f}% ± {totals['std_score_rate']:.2f}%")
        safe_print(f"📋 Question-based mean: {totals['question_based_mean_score']:.2f}/{totals['question_based_max_score']:.2f} ({totals['question_based_score_rate']:.2f}%)")
        safe_print(f"💾 Detailed results saved to: {output_dir}")

        return multi_run_stats

    def _save_all_multi_run_summary(self, all_multi_run_results: Dict):
        """Save and print the overall multi-run summary across all datasets.

        Writes the full results to
        ``results/all_datasets_multi_run_summary.json`` and, when at least one
        dataset produced results, a condensed table to
        ``results/multi_run_summary_table.xlsx``.

        Args:
            all_multi_run_results: Mapping of dataset key to that dataset's
                multi-run statistics (providing ``'overall_statistics'``);
                entries that are ``None`` are skipped.
        """
        output_dir = Path("results")
        output_dir.mkdir(parents=True, exist_ok=True)

        # Save full results
        summary_file = output_dir / "all_datasets_multi_run_summary.json"
        dump(all_multi_run_results, str(summary_file))

        # Columns copied verbatim from each dataset's overall statistics
        overall_fields = (
            'num_runs',
            'num_questions',
            'mean_total_score',
            'std_total_score',
            'mean_score_rate',
            'std_score_rate',
            'question_based_mean_score',
            'question_based_max_score',
            'question_based_score_rate',
        )

        # Build summary table
        summary_table = []
        for dataset_key, multi_run_results in all_multi_run_results.items():
            if multi_run_results is None:
                continue

            config = self.DATASET_CONFIGS[dataset_key]
            overall = multi_run_results['overall_statistics']

            summary_table.append({
                'dataset': config['display_name'],
                'dataset_key': dataset_key,
                **{field: overall[field] for field in overall_fields}
            })

        # Defined before the conditional so the final report can never hit an
        # unbound name when no dataset produced results.
        summary_table_file = output_dir / "multi_run_summary_table.xlsx"
        table_saved = False

        # Save summary table
        if summary_table:
            summary_table_df = pd.DataFrame(summary_table)
            summary_table_df.to_excel(summary_table_file, index=False)
            table_saved = True
            safe_print(f"📋 Multi-run summary table saved: {summary_table_file}")

        # Print final summary
        safe_print(f"\n{'='*80}")
        safe_print(f"🏆 Multi-run evaluations across all datasets complete! Summary:")
        safe_print(f"{'='*80}")

        for item in summary_table:
            safe_print(f"   {item['dataset']}: {item['mean_score_rate']:.2f}% ± {item['std_score_rate']:.2f}% ({item['num_runs']} runs)")

        safe_print(f"📁 Detailed results saved to: {output_dir}/")
        safe_print(f"💾 Summary file: {summary_file}")
        # Only advertise the table when it was actually written
        if table_saved:
            safe_print(f"📋 Summary table: {summary_table_file}")
