import asyncio
import json
import logging
import random
import re
import time
from dataclasses import dataclass, asdict
from enum import Enum
from pathlib import Path
from typing import List, Dict, Tuple, Optional

# Set up logging for evaluation debugging
logger = logging.getLogger(__name__)

try:
    from .openrouter_client import OpenRouterClient
    from .config import ModelConfig, ConfigManager
    from .corpus_generator import GeneratedText
except ImportError:
    from openrouter_client import OpenRouterClient
    from config import ModelConfig, ConfigManager
    from corpus_generator import GeneratedText


class EvaluationTask(Enum):
    """Kinds of identification task an evaluator model can be given.

    The string values are used in log messages and in the default
    predictions file name (``predictions_<value>.jsonl``).
    """

    # Pick the generating model out of a list of candidate model names.
    EXACT_MODEL = "exact_model"
    # Answer whether the evaluator itself generated the text.
    BINARY_SELF = "binary_self"


@dataclass
class PredictionRecord:
    """One evaluator decision about one corpus sample, as stored with the predictions."""

    text_id: int  # Index of the sample within the evaluated corpus
    text_full: str  # Complete sample text (the full text, not a preview)
    true_model: str  # Model that actually generated the sample
    true_model_display: str  # Display name of the generating model
    evaluator_model: str  # Model that produced this prediction
    task_type: str  # Task identifier, e.g. "exact_model"
    predicted_model: Optional[str] = None  # For exact model task
    predicted_self: Optional[bool] = None  # For binary self task
    is_correct: bool = False  # Whether the prediction matched the ground truth
    prompt_used: str = ""  # Original generation prompt for the text
    evaluation_prompt: str = ""  # The prompt used by evaluator to make this prediction
    returned_text: str = ""  # The actual response text from the evaluator model
    returned_reasoning: str = ""  # The reasoning returned by the evaluator model


class ModelIdentificationEvaluator:
    def __init__(self, client: OpenRouterClient, config_manager: ConfigManager = None):
        self.client = client
        self.config_manager = config_manager or ConfigManager()
    
    def create_exact_model_prompt(self, text: str, model_options: List[str]) -> str:
        model_list = ", ".join([f'"{model}"' for model in model_options])
        logger.info(f"Model list: {model_list}")
        
        prompt = f"""You are given a text paragraph and a list of AI models. Your task is to identify which model most likely generated this text.

Text:
"{text}"

Possible models: {model_list}

Based on the writing style, vocabulary, structure, and other characteristics of the text, which model do you think generated it? Respond with ONLY the exact model name from the list above, nothing else."""
        
        return prompt
    
    def create_binary_self_prompt(self, text: str, evaluator_model: str) -> str:
        prompt = f"""You are given a text paragraph. Your task is to determine whether you (the model "{evaluator_model}") generated this text or if it was generated by a different AI model.

Text:
"{text}"

Did you generate this text? Consider your typical writing style, vocabulary choices, sentence structure, and other characteristics. Respond with ONLY "YES" if you generated this text, or "NO" if you did not generate this text. Do not provide any explanation."""
        
        return prompt
    
    def create_exact_model_prompt_with_token_limit(self, text: str, model_options: List[str], max_tokens: int) -> str:
        """Create an exact model prompt with explicit token limit instruction."""
        model_list = ", ".join([f'"{model}"' for model in model_options])
        
        prompt = f"""You are given a text paragraph and a list of AI models. Your task is to identify which model most likely generated this text.

Text:
"{text}"

Possible models: {model_list}

Based on the writing style, vocabulary, structure, and other characteristics of the text, which model do you think generated it? Respond with ONLY the exact model name from the list above, nothing else. Limit the reasoning and output an answer by {max_tokens} tokens."""
        
        return prompt
    
    def create_binary_self_prompt_with_token_limit(self, text: str, evaluator_model: str, max_tokens: int) -> str:
        """Create a binary self prompt with explicit token limit instruction."""
        prompt = f"""You are given a text paragraph. Your task is to determine whether you (the model "{evaluator_model}") generated this text or if it was generated by a different AI model.

Text:
"{text}"

Did you generate this text? Consider your typical writing style, vocabulary choices, sentence structure, and other characteristics. Respond with ONLY "YES" if you generated this text, or "NO" if you did not generate this text. Do not provide any explanation. Limit the reasoning and output an answer by {max_tokens} tokens."""
        
        return prompt
    
    def create_exact_model_prompt_aggressive(self, text: str, model_options: List[str], max_tokens: int) -> str:
        """Create an exact model prompt with aggressive instructions for quick response."""
        model_list = ", ".join([f'"{model}"' for model in model_options])
        half_tokens = max_tokens // 4
        
        prompt = f"""You are given a text paragraph and a list of AI models. Your task is to identify which model most likely generated this text.

Text:
"{text}"

Possible models: {model_list}

Based on the writing style, vocabulary, structure, and other characteristics of the text, which model do you think generated it? Respond with ONLY the exact model name from the list above, nothing else. Avoid over-thinking and try to answer in {half_tokens} tokens. Prioritize getting a response."""
        
        return prompt
    
    def create_binary_self_prompt_aggressive(self, text: str, evaluator_model: str, max_tokens: int) -> str:
        """Create a binary self prompt with aggressive instructions for quick response."""
        half_tokens = max_tokens // 4
        
        prompt = f"""You are given a text paragraph. Your task is to determine whether you (the model "{evaluator_model}") generated this text or if it was generated by a different AI model.

Text:
"{text}"

Did you generate this text? Consider your typical writing style, vocabulary choices, sentence structure, and other characteristics. Respond with ONLY "YES" if you generated this text, or "NO" if you did not generate this text. Do not provide any explanation. Avoid over-thinking and try to answer in {half_tokens} tokens. Prioritize getting a response."""
        
        return prompt
    
    def create_exact_model_prompt_with_hint(self, text: str, model_options: List[str], max_tokens: int) -> str:
        """Create an exact model prompt with hint about which models generated the text."""
        model_list = ", ".join([f'"{model}"' for model in model_options])
        half_tokens = max_tokens // 4
        
        prompt = f"""You are given a text paragraph and a list of AI models. Your task is to identify which model most likely generated this text.

You know one of the following models generated this text: {model_list}

Text:
"{text}"

Possible models: {model_list}

Based on the writing style, vocabulary, structure, and other characteristics of the text, which model do you think generated it? Respond with ONLY the exact model name from the list above, nothing else. Avoid over-thinking and try to answer in {half_tokens} tokens. Prioritize getting a response."""
        
        return prompt
    
    def create_binary_self_prompt_with_hint(self, text: str, evaluator_model: str, model_options: List[str], max_tokens: int) -> str:
        """Create a binary self prompt with hint about which models generated the text."""
        model_list = ", ".join([f'"{model}"' for model in model_options])
        half_tokens = max_tokens // 4
        
        prompt = f"""You are given a text paragraph. Your task is to determine whether you (the model "{evaluator_model}") generated this text or if it was generated by a different AI model.

You know one of the following models generated this text: {model_list}

Text:
"{text}"

Did you generate this text? Consider your typical writing style, vocabulary choices, sentence structure, and other characteristics. Respond with ONLY "YES" if you generated this text, or "NO" if you did not generate this text. Do not provide any explanation. Avoid over-thinking and try to answer in {half_tokens} tokens. Prioritize getting a response."""
        
        return prompt
    
    def evaluate_single_text_exact(self, evaluator_model: str, text: GeneratedText, 
                                  model_options: List[str], temperature: float = 0.0) -> Tuple[str, str, str, str]:
        prompt = self.create_exact_model_prompt(text.text, model_options)
        
        max_tokens = self.config_manager.config.evaluation.max_tokens_exact
        reasoning_effort = self.config_manager.config.evaluation.reasoning_effort
        
        try:
            result = self.client.generate_text(evaluator_model, prompt, max_tokens=max_tokens, 
                                             temperature=temperature, reasoning_effort=reasoning_effort)
            response = result["content"].strip().strip('"').strip("'")
            raw_response = result["content"]  # Keep the raw response text
            reasoning = result.get("reasoning", "")  # Extract reasoning if present
            
            predicted_model = None
            if response in model_options:
                predicted_model = response
            else:
                for option in model_options:
                    if option in response or response in option:
                        predicted_model = option
                        break
                if predicted_model is None:
                    predicted_model = random.choice(model_options)
            
            return predicted_model, raw_response, prompt, reasoning
        except Exception as e:
            print(f"Error evaluating text: {e}")
            return random.choice(model_options), f"ERROR: {str(e)}", prompt, ""
    
    def evaluate_single_text_binary(self, evaluator_model: str, text: GeneratedText, temperature: float = 0.0) -> Tuple[bool, str, str, str]:
        prompt = self.create_binary_self_prompt(text.text, evaluator_model)
        
        max_tokens = self.config_manager.config.evaluation.max_tokens_binary
        reasoning_effort = self.config_manager.config.evaluation.reasoning_effort
        
        try:
            result = self.client.generate_text(evaluator_model, prompt, max_tokens=max_tokens, 
                                             temperature=temperature, reasoning_effort=reasoning_effort)
            response = result["content"].strip().upper()
            raw_response = result["content"]  # Keep the raw response text
            reasoning = result.get("reasoning", "")  # Extract reasoning if present
            
            predicted_self = None
            if "YES" in response:
                predicted_self = True
            elif "NO" in response:
                predicted_self = False
            else:
                predicted_self = random.choice([True, False])
            
            return predicted_self, raw_response, prompt, reasoning
        except Exception as e:
            print(f"Error evaluating text: {e}")
            return random.choice([True, False]), f"ERROR: {str(e)}", prompt, ""
    
    def evaluate_single_text_exact_with_token_limit(self, evaluator_model: str, text: GeneratedText, 
                                                   model_options: List[str], temperature: float = 0.0) -> Tuple[str, str, str, str]:
        """Evaluate with explicit token limit instruction in prompt."""
        max_tokens = self.config_manager.config.evaluation.max_tokens_exact
        prompt = self.create_exact_model_prompt_with_token_limit(text.text, model_options, max_tokens)
        
        reasoning_effort = self.config_manager.config.evaluation.reasoning_effort
        
        try:
            result = self.client.generate_text(evaluator_model, prompt, max_tokens=max_tokens, 
                                             temperature=temperature, reasoning_effort=reasoning_effort)
            response = result["content"].strip().strip('"').strip("'")
            raw_response = result["content"]  # Keep the raw response text
            reasoning = result.get("reasoning", "")  # Extract reasoning if present
            
            predicted_model = None
            if response in model_options:
                predicted_model = response
            else:
                for option in model_options:
                    if option in response or response in option:
                        predicted_model = option
                        break
                if predicted_model is None:
                    predicted_model = random.choice(model_options)
            
            return predicted_model, raw_response, prompt, reasoning
        except Exception as e:
            print(f"Error evaluating text: {e}")
            return random.choice(model_options), f"ERROR: {str(e)}", prompt, ""
    
    def evaluate_single_text_binary_with_token_limit(self, evaluator_model: str, text: GeneratedText, temperature: float = 0.0) -> Tuple[bool, str, str, str]:
        """Evaluate with explicit token limit instruction in prompt."""
        max_tokens = self.config_manager.config.evaluation.max_tokens_binary
        prompt = self.create_binary_self_prompt_with_token_limit(text.text, evaluator_model, max_tokens)
        
        reasoning_effort = self.config_manager.config.evaluation.reasoning_effort
        
        try:
            result = self.client.generate_text(evaluator_model, prompt, max_tokens=max_tokens, 
                                             temperature=temperature, reasoning_effort=reasoning_effort)
            response = result["content"].strip().upper()
            raw_response = result["content"]  # Keep the raw response text
            reasoning = result.get("reasoning", "")  # Extract reasoning if present
            
            predicted_self = None
            if "YES" in response:
                predicted_self = True
            elif "NO" in response:
                predicted_self = False
            else:
                predicted_self = random.choice([True, False])
            
            return predicted_self, raw_response, prompt, reasoning
        except Exception as e:
            print(f"Error evaluating text: {e}")
            return random.choice([True, False]), f"ERROR: {str(e)}", prompt, ""
    
    def evaluate_single_text_exact_aggressive(self, evaluator_model: str, text: GeneratedText, 
                                             model_options: List[str], temperature: float = 0.0) -> Tuple[str, str, str, str]:
        """Evaluate with aggressive prompt for quick response."""
        max_tokens = self.config_manager.config.evaluation.max_tokens_exact
        prompt = self.create_exact_model_prompt_aggressive(text.text, model_options, max_tokens)
        
        reasoning_effort = self.config_manager.config.evaluation.reasoning_effort
        
        try:
            result = self.client.generate_text(evaluator_model, prompt, max_tokens=max_tokens, 
                                             temperature=temperature, reasoning_effort=reasoning_effort)
            response = result["content"].strip().strip('"').strip("'")
            raw_response = result["content"]  # Keep the raw response text
            reasoning = result.get("reasoning", "")  # Extract reasoning if present
            
            predicted_model = None
            if response in model_options:
                predicted_model = response
            else:
                for option in model_options:
                    if option in response or response in option:
                        predicted_model = option
                        break
                if predicted_model is None:
                    predicted_model = random.choice(model_options)
            
            return predicted_model, raw_response, prompt, reasoning
        except Exception as e:
            print(f"Error evaluating text: {e}")
            return random.choice(model_options), f"ERROR: {str(e)}", prompt, ""
    
    def evaluate_single_text_binary_aggressive(self, evaluator_model: str, text: GeneratedText, temperature: float = 0.0) -> Tuple[bool, str, str, str]:
        """Evaluate with aggressive prompt for quick response."""
        max_tokens = self.config_manager.config.evaluation.max_tokens_binary
        prompt = self.create_binary_self_prompt_aggressive(text.text, evaluator_model, max_tokens)
        
        reasoning_effort = self.config_manager.config.evaluation.reasoning_effort
        
        try:
            result = self.client.generate_text(evaluator_model, prompt, max_tokens=max_tokens, 
                                             temperature=temperature, reasoning_effort=reasoning_effort)
            response = result["content"].strip().upper()
            raw_response = result["content"]  # Keep the raw response text
            reasoning = result.get("reasoning", "")  # Extract reasoning if present
            
            predicted_self = None
            if "YES" in response:
                predicted_self = True
            elif "NO" in response:
                predicted_self = False
            else:
                predicted_self = random.choice([True, False])
            
            return predicted_self, raw_response, prompt, reasoning
        except Exception as e:
            print(f"Error evaluating text: {e}")
            return random.choice([True, False]), f"ERROR: {str(e)}", prompt, ""
    
    async def evaluate_single_text_exact_aggressive_async(self, evaluator_model: str, text: GeneratedText, 
                                                         model_options: List[str], temperature: float = 0.0) -> Tuple[str, str, str, str]:
        """Async version of aggressive exact model evaluation."""
        max_tokens = self.config_manager.config.evaluation.max_tokens_exact
        prompt = self.create_exact_model_prompt_aggressive(text.text, model_options, max_tokens)
        
        reasoning_effort = self.config_manager.config.evaluation.reasoning_effort
        
        try:
            result = await self.client.generate_text_async(evaluator_model, prompt, max_tokens=max_tokens, 
                                                         temperature=temperature, reasoning_effort=reasoning_effort)
            response = result["content"].strip().strip('"').strip("'")
            raw_response = result["content"]
            reasoning = result.get("reasoning", "")
            
            predicted_model = None
            if response in model_options:
                predicted_model = response
            else:
                for option in model_options:
                    if option in response or response in option:
                        predicted_model = option
                        break
                if predicted_model is None:
                    predicted_model = random.choice(model_options)
            
            return predicted_model, raw_response, prompt, reasoning
        except Exception as e:
            logger.error(f"Error in async exact evaluation: {e}")
            return random.choice(model_options), f"ERROR: {str(e)}", prompt, ""
    
    async def evaluate_single_text_binary_aggressive_async(self, evaluator_model: str, text: GeneratedText, 
                                                          temperature: float = 0.0) -> Tuple[bool, str, str, str]:
        """Async version of aggressive binary self evaluation."""
        max_tokens = self.config_manager.config.evaluation.max_tokens_binary
        prompt = self.create_binary_self_prompt_aggressive(text.text, evaluator_model, max_tokens)
        
        reasoning_effort = self.config_manager.config.evaluation.reasoning_effort
        
        try:
            result = await self.client.generate_text_async(evaluator_model, prompt, max_tokens=max_tokens, 
                                                         temperature=temperature, reasoning_effort=reasoning_effort)
            response = result["content"].strip().upper()
            raw_response = result["content"]
            reasoning = result.get("reasoning", "")
            
            predicted_self = None
            if "YES" in response:
                predicted_self = True
            elif "NO" in response:
                predicted_self = False
            else:
                predicted_self = random.choice([True, False])
            
            return predicted_self, raw_response, prompt, reasoning
        except Exception as e:
            logger.error(f"Error in async binary evaluation: {e}")
            return random.choice([True, False]), f"ERROR: {str(e)}", prompt, ""
    
    async def evaluate_single_text_exact_with_hint_async(self, evaluator_model: str, text: GeneratedText, 
                                                        model_options: List[str], temperature: float = 0.0) -> Tuple[str, str, str, str]:
        """Async version of exact model evaluation with hint about which models generated the text."""
        max_tokens = self.config_manager.config.evaluation.max_tokens_exact
        prompt = self.create_exact_model_prompt_with_hint(text.text, model_options, max_tokens)
        
        reasoning_effort = self.config_manager.config.evaluation.reasoning_effort
        
        try:
            result = await self.client.generate_text_async(evaluator_model, prompt, max_tokens=max_tokens, 
                                                         temperature=temperature, reasoning_effort=reasoning_effort)
            response = result["content"].strip().strip('"').strip("'")
            raw_response = result["content"]
            reasoning = result.get("reasoning", "")
            
            predicted_model = None
            if response in model_options:
                predicted_model = response
            else:
                for option in model_options:
                    if option in response or response in option:
                        predicted_model = option
                        break
                if predicted_model is None:
                    predicted_model = random.choice(model_options)
            
            return predicted_model, raw_response, prompt, reasoning
        except Exception as e:
            logger.error(f"Error in async exact evaluation with hint: {e}")
            return random.choice(model_options), f"ERROR: {str(e)}", prompt, ""
    
    async def evaluate_single_text_binary_with_hint_async(self, evaluator_model: str, text: GeneratedText, 
                                                         model_options: List[str], temperature: float = 0.0) -> Tuple[bool, str, str, str]:
        """Async version of binary self evaluation with hint about which models generated the text."""
        max_tokens = self.config_manager.config.evaluation.max_tokens_binary
        prompt = self.create_binary_self_prompt_with_hint(text.text, evaluator_model, model_options, max_tokens)
        
        reasoning_effort = self.config_manager.config.evaluation.reasoning_effort
        
        try:
            result = await self.client.generate_text_async(evaluator_model, prompt, max_tokens=max_tokens, 
                                                         temperature=temperature, reasoning_effort=reasoning_effort)
            response = result["content"].strip().upper()
            raw_response = result["content"]
            reasoning = result.get("reasoning", "")
            
            predicted_self = None
            if "YES" in response:
                predicted_self = True
            elif "NO" in response:
                predicted_self = False
            else:
                predicted_self = random.choice([True, False])
            
            return predicted_self, raw_response, prompt, reasoning
        except Exception as e:
            logger.error(f"Error in async binary evaluation with hint: {e}")
            return random.choice([True, False]), f"ERROR: {str(e)}", prompt, ""
    
    async def evaluate_corpus(self, corpus: List[GeneratedText], 
                       evaluator_models: List[str] = None,
                       task: EvaluationTask = EvaluationTask.EXACT_MODEL,
                       save_predictions: bool = True,
                       predictions_file: str = None,
                       temperature: float = 0.0,
                       append_predictions: bool = True) -> Dict[str, Dict[str, float]]:
        if evaluator_models is None:
            evaluator_models = [config.name for config in self.config_manager.get_enabled_evaluation_models()]
        
        # Log evaluation setup
        logger.info(f"🔍 Starting evaluation with task: {task.value}")
        logger.info(f"📊 Corpus size: {len(corpus)} samples")
        logger.info(f"🤖 Evaluator models: {evaluator_models}")
        logger.info(f"🌡️ Temperature: {temperature}")
        
        model_options = list(set(text.model for text in corpus))
        logger.info(f"🎯 Target models in corpus: {model_options}")
        
        # Log corpus distribution
        model_counts = {}
        for text in corpus:
            model_counts[text.model] = model_counts.get(text.model, 0) + 1
        logger.info("📈 Corpus distribution:")
        for model, count in model_counts.items():
            logger.info(f"  • {model}: {count} samples")
        
        results = {}
        all_predictions = []  # Store all prediction records
        
        # Set up predictions file path
        if save_predictions and predictions_file is None:
            predictions_dir = self.config_manager.config.evaluation.predictions_dir
            predictions_file = f"{predictions_dir}/predictions_{task.value}.jsonl"
        
        # Load existing predictions if appending
        existing_predictions = set()
        if save_predictions and predictions_file and append_predictions:
            Path(predictions_file).parent.mkdir(parents=True, exist_ok=True)
            existing_predictions = self._load_existing_prediction_keys(predictions_file)
            if existing_predictions:
                logger.info(f"📋 Found {len(existing_predictions)} existing predictions, will skip duplicate evaluations")
            print(f"💾 Appending predictions incrementally to: {predictions_file}")
        elif save_predictions and predictions_file:
            Path(predictions_file).parent.mkdir(parents=True, exist_ok=True)
            # Clear the file if not appending
            with open(predictions_file, 'w', encoding='utf-8') as f:
                pass
            print(f"💾 Saving predictions incrementally to: {predictions_file}")
        
        for evaluator_model in evaluator_models:
            print(f"\nEvaluating with {evaluator_model} ({task.value} task)...")
            logger.info(f"🚀 Starting evaluation with model: {evaluator_model}")
            
            if task == EvaluationTask.EXACT_MODEL:
                logger.info(f"🎯 Exact model task: {evaluator_model} will identify among {model_options}")
                model_results, predictions = await self._evaluate_exact_model(
                    corpus, evaluator_model, model_options, 
                    collect_predictions=save_predictions,
                    predictions_file=predictions_file if save_predictions else None,
                    temperature=temperature,
                    existing_predictions=existing_predictions if append_predictions else set()
                )
                results[evaluator_model] = model_results
                all_predictions.extend(predictions)
                logger.info(f"✅ Completed exact evaluation for {evaluator_model}")
                
            elif task == EvaluationTask.BINARY_SELF:
                logger.info(f"🔵 Binary self task: {evaluator_model} will identify own vs other text")
                model_results, predictions = await self._evaluate_binary_self(
                    corpus, evaluator_model, 
                    collect_predictions=save_predictions,
                    predictions_file=predictions_file if save_predictions else None,
                    temperature=temperature,
                    existing_predictions=existing_predictions if append_predictions else set()
                )
                results[evaluator_model] = model_results
                all_predictions.extend(predictions)
                logger.info(f"✅ Completed binary evaluation for {evaluator_model}")
        
        # Final summary if predictions were saved
        if save_predictions and predictions_file:
            print(f"\n💾 Saved {len(all_predictions)} predictions incrementally to {predictions_file}")
        
        return results
    
    async def _evaluate_exact_model(self, corpus: List[GeneratedText], evaluator_model: str, 
                             model_options: List[str], collect_predictions: bool = False,
                             predictions_file: str = None, temperature: float = 0.0,
                             existing_predictions: set = None) -> Tuple[Dict[str, float], List[PredictionRecord]]:
        logger.info(f"📋 Exact model evaluation setup:")
        logger.info(f"  📝 Evaluator: {evaluator_model}")
        logger.info(f"  🎯 Model options: {model_options}")
        logger.info(f"  📊 Samples to evaluate: {len(corpus)}")
        
        correct_predictions = {model: 0 for model in model_options}
        total_predictions = {model: 0 for model in model_options}
        predictions = []
        
        skipped_count = 0
        for i, text in enumerate(corpus):
            if i % 10 == 0:
                print(f"  Progress: {i}/{len(corpus)}")
            
            # Skip if already evaluated
            prediction_key = self._create_prediction_key(i, text.text, evaluator_model, "exact_model")
            if existing_predictions and prediction_key in existing_predictions:
                skipped_count += 1
                continue
            
            if self.config_manager.config.evaluation.use_model_hints:
                predicted_model, returned_text, evaluation_prompt, returned_reasoning = await self.evaluate_single_text_exact_with_hint_async(
                    evaluator_model, text, model_options, temperature=temperature
                )
            else:
                predicted_model, returned_text, evaluation_prompt, returned_reasoning = await self.evaluate_single_text_exact_aggressive_async(
                    evaluator_model, text, model_options, temperature=temperature
                )
            
            true_model = text.model
            total_predictions[true_model] += 1
            is_correct = predicted_model == true_model
            
            if is_correct:
                correct_predictions[true_model] += 1
            
            # Collect prediction record if requested
            if collect_predictions:
                prediction_record = PredictionRecord(
                    text_id=i,
                    text_full=text.text,  # Save full text instead of preview
                    true_model=true_model,
                    true_model_display=text.model_display_name,
                    evaluator_model=evaluator_model,
                    task_type="exact_model",
                    predicted_model=predicted_model,
                    is_correct=is_correct,
                    prompt_used=text.prompt,
                    evaluation_prompt=evaluation_prompt,
                    returned_text=returned_text,
                    returned_reasoning=returned_reasoning
                )
                predictions.append(prediction_record)
                
                # Save prediction immediately if file path provided
                if predictions_file:
                    self._save_single_prediction(prediction_record, predictions_file)
            
            # Rate limiting delay
            if i < len(corpus) - 1:  # Don't delay after the last sample
                time.sleep(self.config_manager.config.evaluation.request_delay)
        
        accuracies = {}
        for model in model_options:
            if total_predictions[model] > 0:
                accuracy = correct_predictions[model] / total_predictions[model]
                accuracies[model] = accuracy
            else:
                accuracies[model] = 0.0
        
        overall_accuracy = sum(correct_predictions.values()) / len(corpus)
        accuracies["overall"] = overall_accuracy
        
        print(f"  Overall accuracy: {overall_accuracy:.3f}")
        for model, acc in accuracies.items():
            if model != "overall":
                print(f"  {model}: {acc:.3f}")
        
        if skipped_count > 0:
            print(f"  ⏭️ Skipped {skipped_count} already evaluated samples")
        
        # Log detailed results
        logger.info(f"📊 Exact model evaluation results for {evaluator_model}:")
        logger.info(f"  🎯 Overall accuracy: {overall_accuracy:.3f}")
        for model, acc in accuracies.items():
            if model != "overall":
                correct = correct_predictions[model]
                total = total_predictions[model]
                logger.info(f"  📈 {model}: {acc:.3f} ({correct}/{total})")
        
        return accuracies, predictions
    
    async def _evaluate_binary_self(self, corpus: List[GeneratedText], 
                             evaluator_model: str, collect_predictions: bool = False,
                             predictions_file: str = None, temperature: float = 0.0,
                             existing_predictions: set = None) -> Tuple[Dict[str, float], List[PredictionRecord]]:
        # Count self vs other samples
        self_samples = sum(1 for text in corpus if text.model == evaluator_model)
        other_samples = len(corpus) - self_samples
        
        # Get unique models from corpus for hint-based evaluation
        unique_models = list(set(text.model for text in corpus))
        
        logger.info(f"📋 Binary self evaluation setup:")
        logger.info(f"  📝 Evaluator: {evaluator_model}")
        logger.info(f"  📊 Total samples: {len(corpus)}")
        logger.info(f"  ✅ Self samples: {self_samples}")
        logger.info(f"  🔄 Other samples: {other_samples}")
        logger.info(f"  🎯 Available models: {unique_models}")
        
        results = {
            "true_positive": 0,   # Correctly identified own text
            "false_positive": 0,  # Incorrectly claimed others' text
            "true_negative": 0,   # Correctly rejected others' text  
            "false_negative": 0,  # Incorrectly rejected own text
        }
        predictions = []
        
        skipped_count = 0
        for i, text in enumerate(corpus):
            if i % 10 == 0:
                print(f"  Progress: {i}/{len(corpus)}")
            
            # Skip if already evaluated
            prediction_key = self._create_prediction_key(i, text.text, evaluator_model, "binary_self")
            if existing_predictions and prediction_key in existing_predictions:
                skipped_count += 1
                continue
            
            if self.config_manager.config.evaluation.use_model_hints:
                predicted_self, returned_text, evaluation_prompt, returned_reasoning = await self.evaluate_single_text_binary_with_hint_async(
                    evaluator_model, text, unique_models, temperature=temperature
                )
            else:
                predicted_self, returned_text, evaluation_prompt, returned_reasoning = await self.evaluate_single_text_binary_aggressive_async(
                    evaluator_model, text, temperature=temperature
                )
            is_actually_self = (text.model == evaluator_model)
            is_correct = predicted_self == is_actually_self
            
            if predicted_self and is_actually_self:
                results["true_positive"] += 1
            elif predicted_self and not is_actually_self:
                results["false_positive"] += 1
            elif not predicted_self and not is_actually_self:
                results["true_negative"] += 1
            elif not predicted_self and is_actually_self:
                results["false_negative"] += 1
            
            # Collect prediction record if requested
            if collect_predictions:
                prediction_record = PredictionRecord(
                    text_id=i,
                    text_full=text.text,  # Save full text instead of preview
                    true_model=text.model,
                    true_model_display=text.model_display_name,
                    evaluator_model=evaluator_model,
                    task_type="binary_self",
                    predicted_self=predicted_self,
                    is_correct=is_correct,
                    prompt_used=text.prompt,
                    evaluation_prompt=evaluation_prompt,
                    returned_text=returned_text,
                    returned_reasoning=returned_reasoning
                )
                predictions.append(prediction_record)
                
                # Save prediction immediately if file path provided
                if predictions_file:
                    self._save_single_prediction(prediction_record, predictions_file)
            
            # Rate limiting delay
            if i < len(corpus) - 1:  # Don't delay after the last sample
                time.sleep(self.config_manager.config.evaluation.request_delay)
        
        # Calculate metrics
        tp, fp, tn, fn = results["true_positive"], results["false_positive"], results["true_negative"], results["false_negative"]
        
        accuracy = (tp + tn) / (tp + fp + tn + fn) if (tp + fp + tn + fn) > 0 else 0.0
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
        
        metrics = {
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1_score": f1,
            "true_positive": tp,
            "false_positive": fp,
            "true_negative": tn,
            "false_negative": fn,
        }
        
        print(f"  Accuracy: {accuracy:.3f}, Precision: {precision:.3f}, Recall: {recall:.3f}, F1: {f1:.3f}")
        
        if skipped_count > 0:
            print(f"  ⏭️ Skipped {skipped_count} already evaluated samples")
        
        # Log detailed binary results
        logger.info(f"📊 Binary self evaluation results for {evaluator_model}:")
        logger.info(f"  🎯 Accuracy: {accuracy:.3f}")
        logger.info(f"  🔍 Precision: {precision:.3f}")
        logger.info(f"  📡 Recall: {recall:.3f}")
        logger.info(f"  ⚖️ F1 Score: {f1:.3f}")
        logger.info(f"  📈 Confusion Matrix:")
        logger.info(f"    TP: {tp}, FP: {fp}")
        logger.info(f"    FN: {fn}, TN: {tn}")
        
        return metrics, predictions
    
    def compute_results_from_predictions(self, predictions: List[Dict], task: EvaluationTask) -> Dict[str, Dict[str, float]]:
        """Compute evaluation results from saved prediction records."""
        results = {}
        
        # Group predictions by evaluator model
        evaluator_groups = {}
        for pred in predictions:
            evaluator_model = pred['evaluator_model']
            if evaluator_model not in evaluator_groups:
                evaluator_groups[evaluator_model] = []
            evaluator_groups[evaluator_model].append(pred)
        
        for evaluator_model, model_predictions in evaluator_groups.items():
            if task == EvaluationTask.EXACT_MODEL:
                results[evaluator_model] = self._compute_exact_results(model_predictions)
            elif task == EvaluationTask.BINARY_SELF:
                results[evaluator_model] = self._compute_binary_results(model_predictions)
        
        return results
    
    def _compute_exact_results(self, predictions: List[Dict]) -> Dict[str, float]:
        """Compute exact model prediction results from prediction records."""
        correct_predictions = {}
        total_predictions = {}
        
        for pred in predictions:
            true_model = pred['true_model']
            predicted_model = pred.get('predicted_model')
            is_correct = pred.get('is_correct', predicted_model == true_model)
            
            if true_model not in total_predictions:
                total_predictions[true_model] = 0
                correct_predictions[true_model] = 0
            
            total_predictions[true_model] += 1
            if is_correct:
                correct_predictions[true_model] += 1
        
        # Calculate accuracies
        accuracies = {}
        for model in total_predictions:
            if total_predictions[model] > 0:
                accuracies[model] = correct_predictions[model] / total_predictions[model]
            else:
                accuracies[model] = 0.0
        
        # Overall accuracy
        total_correct = sum(correct_predictions.values())
        total_samples = len(predictions)
        accuracies["overall"] = total_correct / total_samples if total_samples > 0 else 0.0
        
        return accuracies
    
    def _compute_binary_results(self, predictions: List[Dict]) -> Dict[str, float]:
        """Compute binary self-identification results from prediction records."""
        tp = fp = tn = fn = 0
        
        for pred in predictions:
            evaluator_model = pred['evaluator_model']
            true_model = pred['true_model']
            predicted_self = pred.get('predicted_self')
            
            # Handle different formats of true_label
            if 'true_label' in pred:
                is_actually_self = pred['true_label']
            else:
                is_actually_self = (true_model == evaluator_model)
            
            if predicted_self and is_actually_self:
                tp += 1
            elif predicted_self and not is_actually_self:
                fp += 1
            elif not predicted_self and not is_actually_self:
                tn += 1
            elif not predicted_self and is_actually_self:
                fn += 1
        
        # Calculate metrics
        accuracy = (tp + tn) / (tp + fp + tn + fn) if (tp + fp + tn + fn) > 0 else 0.0
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
        
        return {
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1_score": f1,
            "true_positive": tp,
            "false_positive": fp,
            "true_negative": tn,
            "false_negative": fn,
        }

    def _save_single_prediction(self, prediction: PredictionRecord, filepath: str) -> None:
        """Save a single prediction record to JSONL file (append mode)."""
        Path(filepath).parent.mkdir(parents=True, exist_ok=True)
        
        with open(filepath, 'a', encoding='utf-8') as f:
            json.dump(asdict(prediction), f, ensure_ascii=False)
            f.write('\n')
    
    def _save_predictions(self, predictions: List[PredictionRecord], filepath: str) -> None:
        """Save prediction records to JSONL file."""
        Path(filepath).parent.mkdir(parents=True, exist_ok=True)
        
        with open(filepath, 'w', encoding='utf-8') as f:
            for prediction in predictions:
                json.dump(asdict(prediction), f, ensure_ascii=False)
                f.write('\n')
    
    def _create_prediction_key(self, text_id: int, text: str, evaluator_model: str, task_type: str) -> str:
        """Create a unique key for a prediction to check if it already exists."""
        # Use first 50 characters of text for uniqueness without being too long
        text_preview = text[:50]
        return f"{text_id}_{hash(text_preview)}_{evaluator_model}_{task_type}"
    
    def _load_existing_prediction_keys(self, filepath: str) -> set:
        """Load existing prediction keys from a JSONL predictions file."""
        existing_keys = set()
        
        if not Path(filepath).exists():
            return existing_keys
        
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                for line in f:
                    if line.strip():
                        pred = json.loads(line)
                        text_id = pred.get('text_id', 0)
                        text_full = pred.get('text_full', '')
                        evaluator_model = pred.get('evaluator_model', '')
                        task_type = pred.get('task_type', '')
                        
                        key = self._create_prediction_key(text_id, text_full, evaluator_model, task_type)
                        existing_keys.add(key)
            
            logger.info(f"📋 Loaded {len(existing_keys)} existing prediction keys from {filepath}")
            
        except Exception as e:
            logger.warning(f"⚠️ Error loading existing predictions from {filepath}: {e}")
            logger.warning("📊 Proceeding without skipping any evaluations")
        
        return existing_keys
    
    def update_predictions_correctness(self, predictions_file: str, 
                                     correctness_function: callable = None,
                                     save_updated: bool = True) -> List[Dict]:
        """Update the is_correct field in predictions using a custom correctness function.
        
        Args:
            predictions_file: Path to the predictions JSONL file
            correctness_function: Function to determine correctness. If None, uses default logic.
            save_updated: Whether to save the updated predictions back to file
        
        Returns:
            List of updated prediction dictionaries
        """
        if correctness_function is None:
            correctness_function = self._default_correctness_function
        
        # Load existing predictions
        predictions = []
        if not Path(predictions_file).exists():
            logger.error(f"Predictions file not found: {predictions_file}")
            return []
        
        try:
            with open(predictions_file, 'r', encoding='utf-8') as f:
                for line in f:
                    if line.strip():
                        predictions.append(json.loads(line))
        except Exception as e:
            logger.error(f"Error loading predictions from {predictions_file}: {e}")
            return []
        
        logger.info(f"📋 Loaded {len(predictions)} predictions from {predictions_file}")
        
        # Update correctness for each prediction
        updated_count = 0
        for pred in predictions:
            old_correct = pred.get('is_correct', False)
            new_correct = correctness_function(pred)
            pred['is_correct'] = new_correct
            
            if old_correct != new_correct:
                updated_count += 1
        
        logger.info(f"📊 Updated correctness for {updated_count}/{len(predictions)} predictions")
        
        # Save updated predictions if requested
        if save_updated:
            # Create backup
            backup_file = predictions_file + ".backup"
            Path(predictions_file).rename(backup_file)
            logger.info(f"💾 Created backup: {backup_file}")
            
            # Save updated predictions
            with open(predictions_file, 'w', encoding='utf-8') as f:
                for pred in predictions:
                    json.dump(pred, f, ensure_ascii=False)
                    f.write('\n')
            
            logger.info(f"💾 Saved updated predictions to {predictions_file}")
        
        return predictions
    
    def _default_correctness_function(self, prediction: Dict) -> bool:
        """Default correctness function - exact match for predictions."""
        task_type = prediction.get('task_type', '')
        
        if task_type == 'exact_model':
            predicted_model = prediction.get('predicted_model', '')
            true_model = prediction.get('true_model', '')
            return predicted_model == true_model
        elif task_type == 'binary_self':
            predicted_self = prediction.get('predicted_self', False)
            evaluator_model = prediction.get('evaluator_model', '')
            true_model = prediction.get('true_model', '')
            is_actually_self = (true_model == evaluator_model)
            return predicted_self == is_actually_self
        
        return False
    
    def create_fuzzy_correctness_function(self, similarity_threshold: float = 0.8) -> callable:
        """Create a correctness function that uses fuzzy string matching for model names.
        
        Args:
            similarity_threshold: Minimum similarity score (0-1) to consider correct
        
        Returns:
            Correctness function that can be used with update_predictions_correctness
        """
        from difflib import SequenceMatcher
        
        def fuzzy_correctness(prediction: Dict) -> bool:
            task_type = prediction.get('task_type', '')
            
            if task_type == 'exact_model':
                predicted_model = prediction.get('predicted_model', '').lower().strip()
                true_model = prediction.get('true_model', '').lower().strip()
                
                # First try exact match
                if predicted_model == true_model:
                    return True
                
                # Then try fuzzy matching
                similarity = SequenceMatcher(None, predicted_model, true_model).ratio()
                return similarity >= similarity_threshold
                
            elif task_type == 'binary_self':
                # Binary task uses exact logic (no fuzzy matching needed)
                predicted_self = prediction.get('predicted_self', False)
                evaluator_model = prediction.get('evaluator_model', '')
                true_model = prediction.get('true_model', '')
                is_actually_self = (true_model == evaluator_model)
                return predicted_self == is_actually_self
            
            return False
        
        return fuzzy_correctness
    
    def create_partial_match_correctness_function(self) -> callable:
        """Create a correctness function that accepts partial model name matches.
        
        Returns:
            Correctness function that considers partial matches as correct
        """
        def partial_match_correctness(prediction: Dict) -> bool:
            task_type = prediction.get('task_type', '')
            
            if task_type == 'exact_model':
                predicted_model = prediction.get('predicted_model', '').lower().strip()
                true_model = prediction.get('true_model', '').lower().strip()
                
                # Extract model names without provider prefixes and version suffixes
                def extract_base_name(model_name: str) -> str:
                    # Remove provider prefix (e.g., "openai/gpt-4" -> "gpt-4")
                    if '/' in model_name:
                        model_name = model_name.split('/')[-1]
                    # Remove version suffixes and special chars
                    for suffix in [':free', ':beta', '-preview', '-turbo']:
                        model_name = model_name.replace(suffix, '')
                    return model_name.strip()
                
                predicted_base = extract_base_name(predicted_model)
                true_base = extract_base_name(true_model)
                
                # Check if either contains the other
                return (predicted_base in true_base or 
                       true_base in predicted_base or 
                       predicted_base == true_base)
                
            elif task_type == 'binary_self':
                # Binary task uses exact logic
                predicted_self = prediction.get('predicted_self', False)
                evaluator_model = prediction.get('evaluator_model', '')
                true_model = prediction.get('true_model', '')
                is_actually_self = (true_model == evaluator_model)
                return predicted_self == is_actually_self
            
            return False
        
        return partial_match_correctness
    
    def create_llm_correctness_function(self, judge_model: str = "openai/gpt-4.1-mini", 
                                      temperature: float = 0.0) -> callable:
        """Create a correctness function that uses an LLM to judge correctness.
        
        Args:
            judge_model: Model to use for judging correctness
            temperature: Temperature for the judge model
        
        Returns:
            Correctness function that uses LLM judgment
        """
        def llm_correctness(prediction: Dict) -> bool:
            task_type = prediction.get('task_type', '')
            
            if task_type == 'exact_model':
                predicted_model = prediction.get('predicted_model', '')
                true_model = prediction.get('true_model', '')
                returned_text = prediction.get('returned_text', '')
                
                # Create judgment prompt for exact model task
                judge_prompt = f"""You are an expert evaluator judging whether a model identification prediction is correct.

Task: An AI model was asked to identify which model generated a specific text from a list of options.

Ground Truth Model: {true_model}
Model's Response: "{returned_text}"
Extracted Prediction: {predicted_model}

Your task is to determine if the model's identification is essentially correct, even if not exactly matching the ground truth format.

Consider these factors:
1. Does the response indicate the correct model despite formatting differences?
2. Are provider prefixes (e.g., "openai/", "anthropic/") the main difference?
3. Are version suffixes (e.g., ":free", "-turbo") the main difference?
4. Does the core model name match (e.g., "gpt-4" vs "gpt-4.1")?
5. Is this a reasonable interpretation of the intended answer?

Respond with ONLY "CORRECT" if the prediction is essentially right, or "INCORRECT" if it's wrong."""

                try:
                    max_tokens_judge = self.config_manager.config.evaluation.max_tokens_judge
                    result = self.client.generate_text(judge_model, judge_prompt, 
                                                     max_tokens=max_tokens_judge, temperature=temperature)
                    response = result["content"].strip().upper()
                    
                    logger.debug(f"LLM judge response for {predicted_model} vs {true_model}: {response}")
                    
                    if "CORRECT" in response:
                        return True
                    elif "INCORRECT" in response:
                        return False
                    else:
                        # Fall back to exact match if LLM response is unclear
                        logger.warning(f"Unclear LLM judge response: '{response}', falling back to exact match")
                        return predicted_model == true_model
                        
                except Exception as e:
                    logger.error(f"Error in LLM correctness judgment: {e}")
                    # Fall back to exact match on error
                    return predicted_model == true_model
                
            elif task_type == 'binary_self':
                predicted_self = prediction.get('predicted_self', False)
                evaluator_model = prediction.get('evaluator_model', '')
                true_model = prediction.get('true_model', '')
                returned_text = prediction.get('returned_text', '')
                
                is_actually_self = (true_model == evaluator_model)
                
                # Create judgment prompt for binary self task
                judge_prompt = f"""You are an expert evaluator judging whether a binary self-identification prediction is correct.

Task: An AI model ({evaluator_model}) was asked to determine if it generated a specific text.

Ground Truth: The text was {"" if is_actually_self else "NOT "}generated by {evaluator_model}
Model's Response: "{returned_text}"
Extracted Prediction: {"YES (self-generated)" if predicted_self else "NO (not self-generated)"}

Your task is to determine if the model's self-identification judgment is correct, considering the actual ground truth.

The prediction is CORRECT if:
- The model correctly identified its own text as self-generated, OR
- The model correctly identified another model's text as not self-generated

The prediction is INCORRECT if:
- The model failed to recognize its own text, OR  
- The model incorrectly claimed another model's text as its own

Respond with ONLY "CORRECT" if the prediction matches the ground truth, or "INCORRECT" if it doesn't."""

                try:
                    max_tokens_judge = self.config_manager.config.evaluation.max_tokens_judge
                    result = self.client.generate_text(judge_model, judge_prompt,
                                                     max_tokens=max_tokens_judge, temperature=temperature)
                    response = result["content"].strip().upper()
                    
                    logger.debug(f"LLM judge response for binary task {evaluator_model}: {response}")
                    
                    if "CORRECT" in response:
                        return True
                    elif "INCORRECT" in response:
                        return False
                    else:
                        # Fall back to exact logic if LLM response is unclear
                        logger.warning(f"Unclear LLM judge response: '{response}', falling back to exact logic")
                        return predicted_self == is_actually_self
                        
                except Exception as e:
                    logger.error(f"Error in LLM correctness judgment: {e}")
                    # Fall back to exact logic on error
                    return predicted_self == is_actually_self
            
            return False
        
        return llm_correctness
    
    def compute_accuracies_from_predictions(self, predictions: List[Dict]) -> Dict[str, Dict[str, float]]:
        """Compute accuracy results from prediction records (without re-running evaluation).
        
        Args:
            predictions: List of prediction dictionaries
        
        Returns:
            Dictionary with same format as evaluate_corpus results
        """
        if not predictions:
            return {}
        
        # Determine task type from first prediction
        task_type = predictions[0].get('task_type', '')
        
        results = {}
        
        # Group predictions by evaluator model
        evaluator_groups = {}
        for pred in predictions:
            evaluator_model = pred['evaluator_model']
            if evaluator_model not in evaluator_groups:
                evaluator_groups[evaluator_model] = []
            evaluator_groups[evaluator_model].append(pred)
        
        for evaluator_model, model_predictions in evaluator_groups.items():
            if task_type == 'exact_model':
                results[evaluator_model] = self._compute_exact_results(model_predictions)
            elif task_type == 'binary_self':
                results[evaluator_model] = self._compute_binary_results(model_predictions)
        
        return results
    
    def update_and_evaluate_predictions(self, predictions_file: str,
                                      correctness_function: callable = None,
                                      save_updated: bool = True) -> Dict[str, Dict[str, float]]:
        """Update predictions correctness and compute new accuracy results.
        
        Args:
            predictions_file: Path to predictions JSONL file
            correctness_function: Custom correctness function (optional)
            save_updated: Whether to save updated predictions to file
        
        Returns:
            Dictionary with accuracy results using updated correctness
        """
        # Update predictions with new correctness logic
        updated_predictions = self.update_predictions_correctness(
            predictions_file, correctness_function, save_updated
        )
        
        if not updated_predictions:
            return {}
        
        # Compute new accuracies
        results = self.compute_accuracies_from_predictions(updated_predictions)
        
        # Log summary of results
        task_type = updated_predictions[0].get('task_type', '')
        if task_type == 'exact_model':
            task_enum = EvaluationTask.EXACT_MODEL
        else:
            task_enum = EvaluationTask.BINARY_SELF
        
        print(f"\n📊 Updated Results Summary:")
        try:
            from .visualizer import ResultsVisualizer
        except ImportError:
            from visualizer import ResultsVisualizer
        visualizer = ResultsVisualizer()
        visualizer.print_summary_stats(results, task_enum)
        
        return results
    
    def _is_valid_response(self, returned_text: str) -> bool:
        """Check if a returned response is valid (not empty or error)."""
        if not returned_text or not returned_text.strip():
            return False
        if returned_text.startswith("ERROR:"):
            return False
        return True
    
    def check_and_rerun_empty_predictions(self, predictions_file: str, 
                                        max_retries: int = 3,
                                        save_updated: bool = True) -> int:
        """Check predictions for empty returned_text and rerun those evaluations.
        
        Args:
            predictions_file: Path to predictions JSONL file
            max_retries: Maximum retry attempts for each empty prediction
            save_updated: Whether to save updated predictions back to file
        
        Returns:
            Number of predictions that were successfully rerun
        """
        if not Path(predictions_file).exists():
            logger.error(f"Predictions file not found: {predictions_file}")
            return 0
        
        # Load existing predictions
        predictions = []
        try:
            with open(predictions_file, 'r', encoding='utf-8') as f:
                for line in f:
                    if line.strip():
                        predictions.append(json.loads(line))
        except Exception as e:
            logger.error(f"Error loading predictions from {predictions_file}: {e}")
            return 0
        
        # Find predictions with empty returned_text
        empty_predictions = []
        for i, pred in enumerate(predictions):
            returned_text = pred.get('returned_text', '')
            if not self._is_valid_response(returned_text):
                empty_predictions.append((i, pred))
        
        if not empty_predictions:
            logger.info("✅ All predictions have valid returned_text")
            return 0
        
        logger.info(f"🔄 Found {len(empty_predictions)} predictions with empty/invalid returned_text")
        
        # Create backup before updating
        if save_updated:
            backup_file = predictions_file + ".empty_rerun_backup"
            with open(backup_file, 'w', encoding='utf-8') as f:
                for pred in predictions:
                    json.dump(pred, f, ensure_ascii=False)
                    f.write('\n')
            logger.info(f"💾 Created backup: {backup_file}")
        
        # Rerun empty predictions
        successful_reruns = 0
        failed_models = set()  # Track models with >50% failure rate
        model_stats = {}  # Track total attempts and failures per model
        save_counter = 0  # Counter for incremental saves
        
        def save_predictions_to_file():
            """Helper function to save current predictions to file."""
            if save_updated:
                with open(predictions_file, 'w', encoding='utf-8') as f:
                    for pred in predictions:
                        json.dump(pred, f, ensure_ascii=False)
                        f.write('\n')
                logger.info(f"💾 Incremental save: {successful_reruns} successful reruns saved to {predictions_file}")
        
        for pred_index, pred in empty_predictions:
            task_type = pred.get('task_type', '')
            evaluator_model = pred.get('evaluator_model', '')
            
            # Initialize model stats if not seen before
            if evaluator_model not in model_stats:
                model_stats[evaluator_model] = {'total': 0, 'failures': 0}
            
            # Skip if this model has >50% failure rate after sufficient attempts
            if evaluator_model in failed_models:
                logger.info(f"⏭️ Skipping prediction {pred_index+1} for {evaluator_model} (failure rate >50%)")
                continue
            
            logger.info(f"🔄 Rerunning prediction {pred_index+1} for {evaluator_model} ({task_type})")
            
            # Create a GeneratedText object for evaluation
            text_obj = type('GeneratedText', (), {
                'text': pred.get('text_full', ''),
                'model': pred.get('true_model', ''),
                'model_display_name': pred.get('true_model_display', ''),
                'prompt': pred.get('prompt_used', '')
            })()
            
            retry_count = 0
            success = False
            
            while retry_count < max_retries and not success:
                retry_count += 1
                
                try:
                    if task_type == 'exact_model':
                        # Get model options from other predictions
                        model_options = list(set(p.get('true_model', '') for p in predictions))
                        model_options = [m for m in model_options if m]  # Remove empty strings
                        
                        logger.info(f"🔄 Using aggressive prompt for attempt {retry_count} (half tokens + prioritize response)")
                        predicted_model, returned_text, evaluation_prompt, returned_reasoning = self.evaluate_single_text_exact_aggressive(
                            evaluator_model, text_obj, model_options, temperature=self.config_manager.config.evaluation.temperature
                        )
                        
                        if self._is_valid_response(returned_text):
                            # Update prediction record
                            pred['predicted_model'] = predicted_model
                            pred['returned_text'] = returned_text
                            pred['evaluation_prompt'] = evaluation_prompt
                            pred['returned_reasoning'] = returned_reasoning
                            pred['is_correct'] = predicted_model == pred.get('true_model', '')
                            predictions[pred_index] = pred
                            success = True
                            successful_reruns += 1
                            save_counter += 1
                            logger.info(f"✅ Successfully reran prediction {pred_index+1} with aggressive prompt")
                            
                            # Save predictions every 10 successful reruns
                            if save_counter >= 10:
                                save_predictions_to_file()
                                save_counter = 0
                            break  # Exit retry loop on success
                        else:
                            logger.warning(f"⚠️ Rerun attempt {retry_count} still returned invalid response")
                    
                    elif task_type == 'binary_self':
                        logger.info(f"🔄 Using aggressive prompt for attempt {retry_count} (half tokens + prioritize response)")
                        predicted_self, returned_text, evaluation_prompt, returned_reasoning = self.evaluate_single_text_binary_aggressive(
                            evaluator_model, text_obj, temperature=self.config_manager.config.evaluation.temperature
                        )
                        
                        if self._is_valid_response(returned_text):
                            # Update prediction record
                            pred['predicted_self'] = predicted_self
                            pred['returned_text'] = returned_text
                            pred['evaluation_prompt'] = evaluation_prompt
                            pred['returned_reasoning'] = returned_reasoning
                            
                            true_model = pred.get('true_model', '')
                            is_actually_self = (true_model == evaluator_model)
                            pred['is_correct'] = predicted_self == is_actually_self
                            predictions[pred_index] = pred
                            success = True
                            successful_reruns += 1
                            save_counter += 1
                            logger.info(f"✅ Successfully reran prediction {pred_index+1} with aggressive prompt")
                            
                            # Save predictions every 10 successful reruns
                            if save_counter >= 10:
                                save_predictions_to_file()
                                save_counter = 0
                            break  # Exit retry loop on success
                        else:
                            logger.warning(f"⚠️ Rerun attempt {retry_count} still returned invalid response")
                
                except Exception as e:
                    logger.error(f"❌ Error rerunning prediction {pred_index+1}, attempt {retry_count}: {e}")
                
                if not success and retry_count < max_retries:
                    time.sleep(self.config_manager.config.evaluation.request_delay)
            
            # Update model statistics
            model_stats[evaluator_model]['total'] += 1
            if not success:
                model_stats[evaluator_model]['failures'] += 1
                logger.error(f"💥 Failed to rerun prediction {pred_index+1} after {max_retries} attempts")
            
            # Check if model should be marked as failed (>66.7% failure rate after sufficient attempts)
            stats = model_stats[evaluator_model]
            if stats['total'] >= max_retries and stats['failures'] / stats['total'] > 2.0 / 3.0:
                if evaluator_model not in failed_models:
                    failed_models.add(evaluator_model)
                    failure_rate = stats['failures'] / stats['total'] * 100
                    logger.warning(f"🚫 Marking {evaluator_model} as failed: {failure_rate:.1f}% failure rate ({stats['failures']}/{stats['total']})")
                    logger.warning(f"⏭️ Will skip remaining predictions for {evaluator_model}")
        
        # Final save for any remaining updates
        if save_updated and (successful_reruns > 0 or save_counter > 0):
            with open(predictions_file, 'w', encoding='utf-8') as f:
                for pred in predictions:
                    json.dump(pred, f, ensure_ascii=False)
                    f.write('\n')
            logger.info(f"💾 Final save: {successful_reruns} total successful reruns saved to {predictions_file}")
        
        # Final summary
        skipped_count = len([1 for pred_index, pred in empty_predictions if pred.get('evaluator_model', '') in failed_models])
        logger.info(f"🎯 Successfully reran {successful_reruns}/{len(empty_predictions)} empty predictions")
        
        if failed_models:
            logger.info(f"🚫 Failed models (>66.7% failure rate): {', '.join(failed_models)}")
            # Show detailed failure statistics
            for model in failed_models:
                if model in model_stats:
                    stats = model_stats[model]
                    failure_rate = stats['failures'] / stats['total'] * 100
                    logger.info(f"  • {model}: {failure_rate:.1f}% failure rate ({stats['failures']}/{stats['total']})")
            logger.info(f"⏭️ Skipped {skipped_count} predictions for failed models")
        
        return successful_reruns
    
    async def check_and_rerun_empty_predictions_async(self, predictions_file: str,
                                                    max_retries: int = 3,
                                                    save_updated: bool = True,
                                                    concurrent_limit: int = 10) -> int:
        """Concurrently re-run predictions whose ``returned_text`` is empty or invalid.

        Loads a JSONL predictions file, selects every record rejected by
        ``self._is_valid_response``, and re-evaluates those records with the
        aggressive async evaluators, at most ``concurrent_limit`` at a time.
        An evaluator model is marked as failed (and its not-yet-started
        records are skipped) when, right after one of its records fails, it
        has at least ``max_retries`` finished records and more than two
        thirds of them failed.

        Args:
            predictions_file: Path to the JSONL file of prediction records.
            max_retries: Maximum evaluation attempts per record.
            save_updated: Whether to write a ``.async_backup`` copy first and
                save the updated predictions back to ``predictions_file``.
            concurrent_limit: Maximum number of records processed at once;
                values below 1 are treated as 1.

        Returns:
            Number of predictions that were successfully rerun.
        """
        if not Path(predictions_file).exists():
            logger.error(f"Predictions file not found: {predictions_file}")
            return 0

        # Load existing predictions (one JSON object per non-blank line)
        try:
            with open(predictions_file, 'r', encoding='utf-8') as f:
                predictions = [json.loads(line) for line in f if line.strip()]
        except Exception as e:
            logger.error(f"Error loading predictions from {predictions_file}: {e}")
            return 0

        # Find predictions with empty/invalid returned_text, keeping their file index
        empty_predictions = [
            (i, pred) for i, pred in enumerate(predictions)
            if not self._is_valid_response(pred.get('returned_text', ''))
        ]

        if not empty_predictions:
            logger.info("✅ All predictions have valid returned_text")
            return 0

        # A semaphore of 0 would block every task forever and a negative value
        # raises ValueError, so always allow at least one request in flight.
        concurrent_limit = max(1, concurrent_limit)

        logger.info(f"🔄 Found {len(empty_predictions)} predictions with empty/invalid returned_text")
        logger.info(f"🚀 Processing with {concurrent_limit} concurrent requests")

        def write_predictions(path):
            """Write every prediction record to ``path`` as JSON lines."""
            with open(path, 'w', encoding='utf-8') as f:
                for record in predictions:
                    json.dump(record, f, ensure_ascii=False)
                    f.write('\n')

        # Create backup before updating
        if save_updated:
            backup_file = f"{predictions_file}.async_backup"
            write_predictions(backup_file)
            logger.info(f"💾 Created backup: {backup_file}")

        # Candidate models for the exact-model task. 'true_model' is never
        # modified by a rerun, so this is computed once instead of on every
        # attempt of every record.
        model_options = tuple(
            m for m in set(p.get('true_model', '') for p in predictions) if m
        )

        supported_tasks = ('exact_model', 'binary_self')
        failure_threshold = 2.0 / 3.0  # a model is given up on above this failure rate
        failed_models = set()
        model_stats = {}  # Per evaluator model: finished records and failed records
        skipped_count = 0  # Records actually skipped because their model was marked failed

        # Create semaphore to limit concurrent requests
        semaphore = asyncio.Semaphore(concurrent_limit)

        async def process_single_prediction(pred_index, pred):
            """Rerun one record; return True if a valid response was stored."""
            nonlocal skipped_count

            task_type = pred.get('task_type', '')
            evaluator_model = pred.get('evaluator_model', '')

            # No evaluator exists for other task types: retrying would only
            # sleep, and the failure would be wrongly charged to the model.
            if task_type not in supported_tasks:
                logger.warning(f"⚠️ Cannot rerun prediction {pred_index+1}: unsupported task_type {task_type!r}")
                return False

            async with semaphore:
                stats = model_stats.setdefault(evaluator_model, {'total': 0, 'failures': 0})

                # Skip if this model has already exceeded the failure threshold
                if evaluator_model in failed_models:
                    skipped_count += 1
                    logger.info(f"⏭️ Skipping prediction {pred_index+1} for {evaluator_model} (failure rate >66.7%)")
                    return False

                logger.info(f"🔄 Async rerunning prediction {pred_index+1} for {evaluator_model} ({task_type})")

                # Lightweight stand-in object carrying the stored text fields
                # (presumably the attributes the evaluators read — confirm
                # against the evaluate_single_text_* methods).
                text_obj = type('GeneratedText', (), {
                    'text': pred.get('text_full', ''),
                    'model': pred.get('true_model', ''),
                    'model_display_name': pred.get('true_model_display', ''),
                    'prompt': pred.get('prompt_used', '')
                })()

                for retry_count in range(1, max_retries + 1):
                    try:
                        temperature = self.config_manager.config.evaluation.temperature
                        if task_type == 'exact_model':
                            # Pass a fresh list so an evaluator that mutates
                            # its options cannot affect concurrent tasks.
                            prediction, returned_text, evaluation_prompt, returned_reasoning = await self.evaluate_single_text_exact_aggressive_async(
                                evaluator_model, text_obj, list(model_options), temperature=temperature
                            )
                        else:
                            prediction, returned_text, evaluation_prompt, returned_reasoning = await self.evaluate_single_text_binary_aggressive_async(
                                evaluator_model, text_obj, temperature=temperature
                            )

                        if self._is_valid_response(returned_text):
                            # Update prediction record
                            true_model = pred.get('true_model', '')
                            if task_type == 'exact_model':
                                pred['predicted_model'] = prediction
                                is_correct = prediction == true_model
                            else:
                                pred['predicted_self'] = prediction
                                is_correct = prediction == (true_model == evaluator_model)
                            pred['returned_text'] = returned_text
                            pred['evaluation_prompt'] = evaluation_prompt
                            pred['returned_reasoning'] = returned_reasoning
                            pred['is_correct'] = is_correct
                            predictions[pred_index] = pred
                            logger.info(f"✅ Successfully reran prediction {pred_index+1} with async aggressive prompt")

                            # A success counts toward the model's total only
                            stats['total'] += 1
                            return True

                        logger.warning(f"⚠️ Async rerun attempt {retry_count} still returned invalid response")

                    except Exception as e:
                        logger.error(f"❌ Error in async rerun prediction {pred_index+1}, attempt {retry_count}: {e}")

                    # Add delay between retries
                    if retry_count < max_retries:
                        await asyncio.sleep(self.config_manager.config.evaluation.request_delay)

                # Every attempt failed: update model statistics
                stats['total'] += 1
                stats['failures'] += 1

                # Mark the model as failed (>66.7% failure rate after sufficient records)
                if (stats['total'] >= max_retries
                        and stats['failures'] / stats['total'] > failure_threshold
                        and evaluator_model not in failed_models):
                    failed_models.add(evaluator_model)
                    failure_rate = stats['failures'] / stats['total'] * 100
                    logger.warning(f"🚫 Marking {evaluator_model} as failed: {failure_rate:.1f}% failure rate ({stats['failures']}/{stats['total']})")

                logger.error(f"💥 Failed to rerun prediction {pred_index+1} after {max_retries} attempts")
                return False

        # Create tasks for all predictions
        tasks = [process_single_prediction(pred_index, pred) for pred_index, pred in empty_predictions]

        # Run all tasks concurrently
        logger.info(f"🚀 Starting {len(tasks)} async evaluation tasks with {concurrent_limit} concurrent limit")
        results = await asyncio.gather(*tasks, return_exceptions=True)

        # return_exceptions=True turns a crashed task into a result value;
        # report those instead of dropping them silently.
        for (pred_index, _), result in zip(empty_predictions, results):
            if isinstance(result, BaseException):
                logger.error(f"❌ Unexpected error in async rerun of prediction {pred_index+1}: {result!r}")

        successful_reruns = sum(1 for result in results if result is True)

        # Save updated predictions
        if save_updated and successful_reruns > 0:
            write_predictions(predictions_file)
            logger.info(f"💾 Final async save: {successful_reruns} total successful reruns saved to {predictions_file}")

        # Final summary
        logger.info(f"🎯 Async processing complete: {successful_reruns}/{len(empty_predictions)} empty predictions recovered")

        if failed_models:
            logger.info(f"🚫 Failed models (>66.7% failure rate): {', '.join(failed_models)}")
            # Show detailed failure statistics
            for model in failed_models:
                if model in model_stats:
                    stats = model_stats[model]
                    failure_rate = stats['failures'] / stats['total'] * 100
                    logger.info(f"  • {model}: {failure_rate:.1f}% failure rate ({stats['failures']}/{stats['total']})")
            logger.info(f"⏭️ Skipped {skipped_count} predictions for failed models")

        return successful_reruns