#!/usr/bin/env python3
"""
LLMBar Natural Evaluation Script

Evaluates the Natural dataset using dynamic dimension selection,
metrics+reference approach, and expert model evaluation.
"""

import os
import json
import time
import logging
import argparse
import asyncio
import aiohttp
import re
import pickle
from pathlib import Path
from collections import defaultdict
from dataclasses import dataclass
from typing import List, Dict, Any, Optional
from tqdm import tqdm

# Data classes
@dataclass
class LLMBarInstance:
    """One pairwise-comparison example: an instruction plus two candidate outputs."""
    instance_id: str  # dataset 'id' value, or a generated 'natural_NNNN' fallback (see load_natural_dataset)
    input: str  # the instruction text inserted into the evaluator prompts
    output_1: str  # first candidate output
    output_2: str  # second candidate output
    gold_label: int  # 1 or 2 indicating which output is correct

@dataclass
class EvaluationResult:
    """Full record of how one instance was judged, as produced by evaluate_instance_flexible."""
    # Copied verbatim from the evaluated LLMBarInstance.
    instance_id: str
    input: str
    output_1: str
    output_2: str
    gold_label: int
    predicted_label: int  # final decision: 1 or 2
    reasoning: str  # human-readable summary of dimension wins and the main evaluation label
    selected_dimensions: List[str]  # dimensions chosen for this instance
    metrics_questions: List[str]  # instruction-specific questions generated for the judge prompt
    reference_output: str  # model-generated reference answer (or a generic fallback text)
    dimension_evaluations: Dict[str, str]  # dimension name -> "1", "2" or "tie"
    expert_models_used: List[str]  # model identifiers reported by the dimension evaluations
    evaluation_time: float  # elapsed seconds, measured with time.time()
    correct: bool  # predicted_label == gold_label
    confidence: float = 0.0  # share of selected dimensions won by the leading output

# All available dimensions (optimized for Natural dataset).
# This full list is shown to the selector model in select_relevant_dimensions and is the
# denominator for the coverage figure in print_summary_table. Only names that also appear in
# DIMENSION_TO_MODEL_MAPPING survive the selection filter.
# NOTE(review): both 'Professional' and 'Professionalism' are listed - confirm this is intended.
ALL_DIMENSIONS = [
    'Accuracy', 'Admit Uncertainty', 'Attractive', 'Audience Friendly', 'Authenticity',
    'Being Friendly', 'Citation', 'Clarity', 'Code Correctness', 'Code Readability',
    'Coherence', 'Completeness', 'Coverage', 'Creativity', 'Depth', 'Emojis',
    'Emotion', 'Faithfulness', 'Feasibility', 'Harmlessness', 'Information Richness',
    'Insight', 'Instruction Following', 'Interactivity', 'Layout', 'Length', 'Logic',
    'Modularity', 'Multiple Aspects', 'Objectivity', 'Originality', 'Pacing',
    'Pointing Out', 'Professional', 'Professionalism', 'Relevance', 'Result at the Beginning',
    'Step by Step Explanation', 'Style', 'Timeliness', 'Vivid'
]

# Expert models for different dimensions (optimized for Natural).
# Maps a dimension name to the model identifier sent in the chat-completions request.
# Dimensions without an entry here are filtered out by select_relevant_dimensions, and
# evaluate_dimension returns a "tie" for them without calling any model.
DIMENSION_TO_MODEL_MAPPING = {
    'Accuracy': 'provider-3/qwen-2.5-72b',
    'Clarity': 'provider-3/kimi-k2',
    'Instruction Following': 'provider-3/gpt-5-chat',
    'Relevance': 'provider-3/sonar-pro',
    'Completeness': 'provider-3/gpt-5-chat',
    'Professionalism': 'provider-6/o3-high',
    'Insight': 'provider-3/sonar-pro',
    'Logic': 'provider-3/kimi-k2',
    'Depth': 'provider-3/sonar-pro',
    'Creativity': 'provider-3/sonar',
    'Code Correctness': 'provider-3/qwen-2.5-72b',
    'Code Readability': 'provider-3/mistral-large-latest',
    'Feasibility': 'provider-3/gpt-5-chat',
    'Harmlessness': 'provider-3/sonar-pro',
    'Objectivity': 'provider-6/o3-medium'
}

# Dual API Key Manager (same as flexible evaluation)
class DualAPIKeyManager:
    def __init__(self, api_key_1: str, api_key_2: str):
        self.api_keys = [api_key_1, api_key_2] if api_key_2 else [api_key_1]
        self.current_index = 0
        self.lock = asyncio.Lock()
        self.valid_keys = []
        self.invalid_keys = []
        
        # Filter out empty keys
        self.api_keys = [key for key in self.api_keys if key and key.strip()]
        
        if not self.api_keys:
            raise ValueError("At least one valid API key must be provided")
        
        logging.info(f"Initialized API key manager with {len(self.api_keys)} key(s)")
        if len(self.api_keys) == 1:
            logging.info("Single API key mode - will use rate limiting for 10 RPM")
    
    async def get_next_api_key(self) -> str:
        """Get the next API key in round-robin fashion"""
        async with self.lock:
            if self.valid_keys:
                api_key = self.valid_keys[self.current_index % len(self.valid_keys)]
                self.current_index = (self.current_index + 1) % len(self.valid_keys)
                return api_key
            
            api_key = self.api_keys[self.current_index]
            self.current_index = (self.current_index + 1) % len(self.api_keys)
            return api_key
    
    async def mark_key_as_valid(self, api_key: str):
        """Mark an API key as valid"""
        async with self.lock:
            if api_key not in self.valid_keys:
                self.valid_keys.append(api_key)
                logging.info(f"API key marked as valid")
    
    async def mark_key_as_invalid(self, api_key: str):
        """Mark an API key as invalid"""
        async with self.lock:
            if api_key not in self.invalid_keys:
                self.invalid_keys.append(api_key)
                logging.warning(f"API key marked as invalid")
            
            if api_key in self.valid_keys:
                self.valid_keys.remove(api_key)
    
    def is_single_key_mode(self) -> bool:
        """Check if we're in single API key mode"""
        return len(self.api_keys) == 1

# Rate limiter for single API key mode
class SingleKeyRateLimiter:
    """Serialises requests so that at most ``requests_per_minute`` are started per minute.

    Args:
        requests_per_minute: Maximum request rate; must be positive.

    Raises:
        ValueError: If ``requests_per_minute`` is not positive.
    """

    def __init__(self, requests_per_minute: int = 10):
        if requests_per_minute <= 0:
            raise ValueError("requests_per_minute must be positive")
        self.requests_per_minute = requests_per_minute
        self.min_interval = 60.0 / requests_per_minute
        # -inf means "no request yet", so the first call never waits regardless of
        # where the monotonic clock happens to start.
        self.last_request_time = float("-inf")
        self._lock = None

    @property
    def lock(self) -> asyncio.Lock:
        """Lock guarding ``last_request_time``, created on first use.

        The limiter is instantiated at import time. On Python versions where
        ``asyncio.Lock`` binds to an event loop at construction, creating it lazily
        keeps it on the loop that actually runs the requests.
        """
        if self._lock is None:
            self._lock = asyncio.Lock()
        return self._lock

    async def wait_if_needed(self):
        """Sleep just long enough to keep ``min_interval`` between consecutive requests."""
        async with self.lock:
            # Monotonic time is immune to wall-clock adjustments.
            elapsed = time.monotonic() - self.last_request_time

            if elapsed < self.min_interval:
                wait_time = self.min_interval - elapsed
                logging.debug(f"Rate limiting: waiting {wait_time:.2f} seconds")
                await asyncio.sleep(wait_time)

            self.last_request_time = time.monotonic()

# Global instances
single_key_rate_limiter = SingleKeyRateLimiter(10)

# Setup logging
def setup_logging() -> logging.Logger:
    """Configure root and 'detailed' logging and return the 'detailed' logger.

    The root logger writes to the console and to ``llmbar_natural_evaluation.log``.
    The 'detailed' logger additionally writes to
    ``llmbar_natural_evaluation_detailed.log``. Both files are truncated on each call.

    Previously attached handlers are removed and closed, so calling this function
    more than once does not leak open file handles.

    Returns:
        The logger named 'detailed'.
    """
    detailed_logger = logging.getLogger('detailed')
    detailed_logger.setLevel(logging.INFO)

    # removeHandler() alone leaves the underlying file open, so close explicitly.
    for handler in detailed_logger.handlers[:]:
        detailed_logger.removeHandler(handler)
        handler.close()

    detailed_handler = logging.FileHandler('llmbar_natural_evaluation_detailed.log', mode='w', encoding='utf-8')
    detailed_handler.setLevel(logging.INFO)

    formatter = logging.Formatter('%(levelname)s:%(name)s:%(message)s')
    detailed_handler.setFormatter(formatter)
    detailed_logger.addHandler(detailed_handler)

    # basicConfig() is a no-op when the root logger already has handlers, so clear them first.
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
        handler.close()

    logging.basicConfig(
        level=logging.INFO,
        format='[%(levelname)s] %(message)s',
        handlers=[
            logging.StreamHandler(),
            logging.FileHandler("llmbar_natural_evaluation.log", mode='w', encoding='utf-8')
        ]
    )

    return detailed_logger

# Load Natural dataset
def load_natural_dataset(dataset_path: str, max_instances: Optional[int] = None) -> List[LLMBarInstance]:
    """Load Natural dataset from a JSON file.

    Args:
        dataset_path: Path to a JSON file containing an array of objects with the
            keys 'input', 'output_1', 'output_2', 'label' and optionally 'id'.
        max_instances: Load at most this many instances. ``None``, 0 or a negative
            value means no limit.

    Returns:
        The loaded instances, or an empty list if the file cannot be read or parsed
        or does not contain a JSON array. Entries that are not JSON objects are
        skipped with a warning.
    """
    try:
        with open(dataset_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        logging.error(f"Failed to load Natural dataset: {e}")
        return []

    if not isinstance(data, list):
        logging.error(f"Failed to load Natural dataset: expected a JSON array, got {type(data).__name__}")
        return []

    # Only a positive limit truncates; a negative value used to cut the list to one item.
    limit = max_instances if max_instances and max_instances > 0 else None

    instances = []
    for i, item in enumerate(data):
        if limit is not None and len(instances) >= limit:
            break

        if not isinstance(item, dict):
            logging.warning(f"Skipping dataset entry {i}: expected a JSON object, got {type(item).__name__}")
            continue

        instance_id = item.get('id', f'natural_{i:04d}')
        label = item.get('label', 1)
        if 'label' not in item or label not in (1, 2):
            # The value is kept as-is, but the problem is surfaced because it
            # silently skews accuracy.
            logging.warning(f"Instance {instance_id} has a missing or unexpected label ({label!r}); expected 1 or 2")

        instances.append(LLMBarInstance(
            instance_id=instance_id,
            input=item.get('input', ''),
            output_1=item.get('output_1', ''),
            output_2=item.get('output_2', ''),
            gold_label=label
        ))

    logging.info(f"Loaded {len(instances)} instances from Natural dataset")
    return instances

# Checkpoint functions
def save_checkpoint(checkpoint_path: str, results: List[EvaluationResult], processed_instances: set, total_instances: int):
    """Save a checkpoint with the current progress.

    The checkpoint is pickled to a temporary sibling file and then moved into place
    with ``os.replace``, so an interruption mid-write cannot corrupt an existing
    checkpoint. Failures are logged, not raised (checkpointing is best-effort).

    Args:
        checkpoint_path: Destination file for the pickled checkpoint.
        results: Evaluation results completed so far.
        processed_instances: Identifiers of instances already processed.
        total_instances: Total number of instances in the run.
    """
    checkpoint_data = {
        'results': results,
        'processed_instances': list(processed_instances),
        'total_instances': total_instances,
        'timestamp': time.time()
    }

    tmp_path = f"{checkpoint_path}.tmp"
    try:
        with open(tmp_path, 'wb') as f:
            pickle.dump(checkpoint_data, f)
        # Swap the finished file into place in one step.
        os.replace(tmp_path, checkpoint_path)
        logging.info(f"Checkpoint saved to {checkpoint_path}")
    except Exception as e:
        logging.error(f"Failed to save checkpoint: {e}")
        # Do not leave a partial temporary file behind.
        try:
            os.remove(tmp_path)
        except OSError:
            pass

def load_checkpoint(checkpoint_path: str) -> tuple[List[EvaluationResult], set, int]:
    """Restore saved progress from a pickle checkpoint.

    Returns:
        ``(results, processed_instances, total_instances)``. When the file does not
        exist or cannot be read, an empty state ``([], set(), 0)`` is returned.
    """
    try:
        if not os.path.exists(checkpoint_path):
            logging.info("No checkpoint found, starting fresh")
            return [], set(), 0

        # NOTE: unpickling can execute arbitrary code; only load checkpoint files
        # written by this script.
        with open(checkpoint_path, 'rb') as handle:
            saved_state = pickle.load(handle)

        completed = saved_state['results']
        seen_ids = set(saved_state['processed_instances'])
        expected_total = saved_state['total_instances']

        logging.info(f"Checkpoint loaded from {checkpoint_path}")
        logging.info(f"Resuming with {len(completed)} completed evaluations and {len(seen_ids)} processed instances")
        return completed, seen_ids, expected_total
    except Exception as exc:
        logging.error(f"Failed to load checkpoint: {exc}")
        return [], set(), 0

def get_checkpoint_path(output_path: str) -> str:
    """Derive the checkpoint filename by appending '_checkpoint.pkl' to the output prefix."""
    return "{}_checkpoint.pkl".format(output_path)

# API call function
async def call_api_async(session: aiohttp.ClientSession, messages: List[Dict], model: str, api_key: str, base_url: str, max_tokens: int = 150, temperature: float = 0.0, api_key_manager: Optional[DualAPIKeyManager] = None, verify_ssl: bool = False) -> Dict:
    """POST a chat-completions request and return a normalised result dict.

    Args:
        session: Open aiohttp session used for the request.
        messages: Chat messages in the ``[{"role": ..., "content": ...}]`` format.
        model: Model identifier placed in the request payload.
        api_key: Bearer token sent in the Authorization header.
        base_url: API root; a trailing slash is tolerated.
        max_tokens: Completion length limit.
        temperature: Sampling temperature.
        api_key_manager: When given, enables single-key rate limiting and records
            keys rejected with HTTP 401.
        verify_ssl: Verify the server's TLS certificate. Defaults to ``False`` to
            keep the existing behaviour.

    Returns:
        ``{"success": True, "content": str}`` on success. Otherwise
        ``{"success": False, "error": str}``, with ``"auth_error": True`` for HTTP
        401 or ``"rate_limited": True`` for HTTP 429. This function never raises.
    """
    try:
        # Apply rate limiting if in single key mode
        if api_key_manager and api_key_manager.is_single_key_mode():
            await single_key_rate_limiter.wait_if_needed()

        payload = {
            "model": model,
            "messages": messages,
            "temperature": temperature,
            "max_tokens": max_tokens
        }

        request_kwargs = {
            "json": payload,
            "headers": {"Authorization": f"Bearer {api_key}"},
        }
        if not verify_ssl:
            # NOTE(security): with verification off the bearer token is exposed to
            # man-in-the-middle interception. Pass verify_ssl=True unless the
            # endpoint genuinely needs this.
            request_kwargs["ssl"] = False

        # rstrip avoids "//chat/completions" when base_url ends with a slash.
        url = f"{base_url.rstrip('/')}/chat/completions"

        async with session.post(url, **request_kwargs) as response:
            if response.status == 200:
                result = await response.json()
                try:
                    content = result['choices'][0]['message']['content']
                except (KeyError, IndexError, TypeError):
                    content = None
                if not isinstance(content, str):
                    # Covers a missing 'choices' list as well as a null content field.
                    logging.error(f"Malformed response from model {model}: no message content")
                    return {"success": False, "error": "Malformed response: no message content"}
                return {"success": True, "content": content.strip()}
            elif response.status == 401:
                logging.error(f"HTTP 401 (Unauthorized) for model {model}")
                if api_key_manager:
                    await api_key_manager.mark_key_as_invalid(api_key)
                return {"success": False, "error": f"HTTP {response.status}", "auth_error": True}
            elif response.status == 429:
                logging.warning(f"HTTP 429 (Rate Limited) for model {model}")
                return {"success": False, "error": f"HTTP {response.status}", "rate_limited": True}
            else:
                logging.error(f"HTTP {response.status} for model {model}")
                return {"success": False, "error": f"HTTP {response.status}"}
    except Exception as e:
        # Callers rely on a result dict, so transport errors must not propagate.
        logging.error(f"Exception during API call to {model}: {e}")
        return {"success": False, "error": str(e)}

# Dimension selection (optimized for Natural)
async def select_relevant_dimensions(session: aiohttp.ClientSession, instance: LLMBarInstance, api_key_manager: DualAPIKeyManager, base_url: str) -> List[str]:
    """Use kimi-k2 to dynamically select relevant dimensions for Natural evaluation.

    The model is asked for a JSON array of dimension names. Names are matched
    against ``DIMENSION_TO_MODEL_MAPPING`` ignoring case and surrounding whitespace,
    and duplicates are dropped so no dimension is evaluated or counted twice.

    Returns:
        Up to 10 distinct dimensions that have an expert model. If the API call
        fails, the reply cannot be parsed, or fewer than 3 valid dimensions remain,
        a fixed 5-dimension fallback list is returned.
    """
    fallback_dimensions = ['Instruction Following', 'Accuracy', 'Relevance', 'Completeness', 'Clarity']

    # NOTE: the prompt offers every entry of ALL_DIMENSIONS, but only names present
    # in DIMENSION_TO_MODEL_MAPPING pass the filter below.
    all_dimensions_text = "\n".join([f"- {dim}" for dim in ALL_DIMENSIONS])

    prompt = f"""You are an expert evaluator specialized in selecting the most relevant evaluation dimensions for Natural dataset instances.

Your task is to select 5-10 dimensions from the available list that are MOST RELEVANT for evaluating outputs for the given instruction.

AVAILABLE DIMENSIONS:
{all_dimensions_text}

INSTRUCTION:
{instance.input}

NATURAL-SPECIFIC SELECTION CRITERIA:
1. Choose dimensions that directly relate to whether the output correctly follows the instruction
2. Prioritize dimensions that measure task completion and instruction adherence
3. Include dimensions that assess the quality and appropriateness of the response
4. For Natural dataset, focus on accuracy, instruction following, and relevance
5. Select 5-10 dimensions (preferably 7-8 for balanced evaluation)

OUTPUT FORMAT:
Return ONLY a JSON array of dimension names, like:
["dimension1", "dimension2", "dimension3"]

Example:
["Instruction Following", "Accuracy", "Relevance", "Completeness", "Clarity"]

Your selection:"""

    messages = [
        {"role": "system", "content": "You are an expert evaluator specialized in selecting relevant evaluation dimensions for Natural dataset."},
        {"role": "user", "content": prompt}
    ]

    api_key = await api_key_manager.get_next_api_key()

    result = await call_api_async(
        session, messages, "provider-3/kimi-k2", api_key, base_url,
        max_tokens=200, temperature=0.1, api_key_manager=api_key_manager
    )

    if not result["success"]:
        logging.warning("API call failed, using Natural-optimized fallback dimensions")
        return fallback_dimensions

    json_match = re.search(r'\[.*?\]', result["content"], re.DOTALL)
    if json_match:
        try:
            selected_dimensions = json.loads(json_match.group(0))
        except ValueError as e:
            logging.error(f"Error parsing dimension selection: {e}")
            logging.warning("Using Natural-optimized fallback dimensions due to parsing error")
            return fallback_dimensions

        # Map the casefolded name to its canonical spelling for tolerant matching.
        canonical_names = {name.casefold(): name for name in DIMENSION_TO_MODEL_MAPPING}
        valid_dimensions = []
        for entry in selected_dimensions:
            if not isinstance(entry, str):
                continue
            name = canonical_names.get(entry.strip().casefold())
            # Skip repeats: each one would trigger a redundant expert call and
            # inflate the denominator of the confidence score.
            if name is not None and name not in valid_dimensions:
                valid_dimensions.append(name)

        if len(valid_dimensions) >= 3:
            logging.info(f"LLM selected {len(valid_dimensions)} dimensions: {valid_dimensions}")
            return valid_dimensions[:10]
        logging.warning(f"Too few valid dimensions selected: {valid_dimensions}")

    logging.warning("LLM selection failed, using Natural-optimized fallback dimensions")
    return fallback_dimensions

# Metrics and reference generation
async def generate_metrics_questions(session: aiohttp.ClientSession, instance: LLMBarInstance, api_key_manager: DualAPIKeyManager, base_url: str) -> List[str]:
    """Generate metrics questions optimized for Natural evaluation.

    Asks kimi-k2 for at most three instruction-specific questions, one per line.
    Leading list markers ("1.", "2)", "-", "*") are removed because the caller
    numbers the questions itself.

    Returns:
        Up to three questions. If the API call fails or yields no usable question,
        three generic fallback questions are returned, so the list is never empty.
    """
    prompt = f"""You are a helpful assistant in evaluating the quality of outputs for a given instruction.

Please propose at most three concise questions about whether a potential output is a good output for the given instruction.

# Instruction:
{instance.input}

# Requirements for Your Output:
(1) The questions should **specifically** target the given instruction instead of some general standards
(2) You should directly give the questions without any other words
(3) Questions are presented from most important to least important
(4) Focus on task completion and instruction adherence for Natural evaluation

Your questions:"""

    messages = [
        {"role": "system", "content": "You are a helpful assistant in evaluating output quality for Natural dataset."},
        {"role": "user", "content": prompt}
    ]

    api_key = await api_key_manager.get_next_api_key()

    result = await call_api_async(
        session, messages, "provider-3/kimi-k2", api_key, base_url,
        max_tokens=150, temperature=0.0, api_key_manager=api_key_manager
    )

    if result["success"]:
        # Requiring whitespace after the marker keeps text such as "3.5 stars" intact.
        marker = re.compile(r'^\s*(?:\d+[.)]|[-*\u2022])\s+')
        stripped_lines = (marker.sub('', line).strip() for line in result["content"].split('\n'))
        questions = [q for q in stripped_lines if q][:3]
        if questions:
            logging.info(f"Generated {len(questions)} metrics questions")
            return questions
        logging.warning("Model returned no usable metrics questions")

    fallback_questions = [
        "Does the output correctly address the instruction?",
        "Is the output complete and accurate?",
        "Is the output helpful and well-structured?"
    ]
    logging.warning(f"Using fallback metrics questions: {fallback_questions}")
    return fallback_questions

async def generate_reference_output(session: aiohttp.ClientSession, instance: LLMBarInstance, api_key_manager: DualAPIKeyManager, base_url: str) -> str:
    """Ask kimi-k2 to answer the instruction itself, producing a reference for the judge.

    Returns:
        The model's answer, or a generic placeholder sentence when the API call fails.
    """
    system_text = "You are a helpful assistant that responds to the user in a concise way."
    user_text = f"""You are a helpful assistant that responds to the user in a concise way.

{instance.input}

Your response:"""

    key = await api_key_manager.get_next_api_key()
    response = await call_api_async(
        session,
        [
            {"role": "system", "content": system_text},
            {"role": "user", "content": user_text},
        ],
        "provider-3/kimi-k2",
        key,
        base_url,
        max_tokens=384,
        temperature=0.0,
        api_key_manager=api_key_manager,
    )

    if not response["success"]:
        fallback_reference = "A well-structured response that directly addresses the instruction with appropriate detail and accuracy."
        logging.warning(f"Using fallback reference output: {fallback_reference}")
        return fallback_reference

    logging.info("Generated reference output successfully")
    return response["content"]

# Dimension evaluation
async def evaluate_dimension(session: aiohttp.ClientSession, dimension: str, instance: LLMBarInstance, api_key_manager: DualAPIKeyManager, base_url: str) -> Dict:
    """Evaluate a single dimension using the expert model.

    Args:
        session: Open aiohttp session.
        dimension: Dimension name; must be a key of ``DIMENSION_TO_MODEL_MAPPING``
            to be evaluated.
        instance: The instruction and the two outputs to compare.
        api_key_manager: Supplies the API key for the request.
        base_url: API root URL.

    Returns:
        A dict with the keys ``dimension``, ``evaluation`` ("1", "2" or "tie"),
        ``model`` and ``raw_response``. Dimensions without an expert model, failed
        API calls and unparseable replies all produce a "tie".
    """
    expert_model = DIMENSION_TO_MODEL_MAPPING.get(dimension)
    if not expert_model:
        return {
            "dimension": dimension,
            "evaluation": "tie",
            "model": "unknown",
            "raw_response": "Error: No expert model configured"
        }

    prompt = f"""You are an expert evaluator specialized in assessing a specific dimension of output quality.

Your task is to evaluate which output (1 or 2) better fulfills the given instruction according to the specified dimension.

INSTRUCTION: {instance.input}

OUTPUT 1: {instance.output_1}

OUTPUT 2: {instance.output_2}

DIMENSION TO EVALUATE: {dimension}

CRITICAL EVALUATION CRITERIA (IN ORDER OF PRIORITY):
1. **TASK CORRECTNESS FIRST**: Does the output actually do what was asked for in the instruction?
2. **INSTRUCTION ADHERENCE**: Does the output follow the exact instructions given?
3. **APPROPRIATE SCOPE**: Does the output provide what was requested without adding unnecessary extras?
4. **DIMENSION-SPECIFIC QUALITY**: Only AFTER confirming 1-3, evaluate the {dimension} aspect

Your evaluation: Respond with ONLY "1" or "2" followed by a brief explanation of why that output better fulfills the task according to the {dimension} dimension."""

    messages = [
        {"role": "system", "content": "You are an expert evaluator specialized in assessing output quality dimensions."},
        {"role": "user", "content": prompt}
    ]

    api_key = await api_key_manager.get_next_api_key()

    result = await call_api_async(
        session, messages, expert_model, api_key, base_url,
        max_tokens=100, temperature=0.1, api_key_manager=api_key_manager
    )

    if not result["success"]:
        return {
            "dimension": dimension,
            "evaluation": "tie",
            "model": expert_model,
            "raw_response": "Error: Failed to evaluate"
        }

    content = result["content"]
    if content.startswith("1"):
        evaluation = "1"
    elif content.startswith("2"):
        evaluation = "2"
    else:
        # Models often decorate the verdict ("**2**", "Output 1:", '"1"').
        # Accept a leading 1/2 after punctuation and an optional "Output" prefix.
        decorated = re.match(r'\W*(?:output\W*)?([12])(?!\d)', content, re.IGNORECASE)
        evaluation = decorated.group(1) if decorated else "tie"

    return {
        "dimension": dimension,
        "evaluation": evaluation,
        "model": expert_model,
        "raw_response": content
    }

# Main evaluation
async def evaluate_instance_main(session: aiohttp.ClientSession, instance: LLMBarInstance, api_key_manager: DualAPIKeyManager, base_url: str) -> Dict:
    """Evaluate the instance using the main (metrics + reference) judge prompt.

    Generates instruction-specific metrics questions and a reference output, then
    asks kimi-k2 to choose between "Output (a)" and "Output (b)".

    Returns:
        A dict with the keys ``predicted_label`` (1 for output (a), 2 for output
        (b)), ``metrics_questions``, ``reference_output`` and ``raw_response``.
        If the API call fails or the reply names neither output, the label
        defaults to 1 and a warning is logged.
    """
    metrics_questions = await generate_metrics_questions(session, instance, api_key_manager, base_url)
    reference_output = await generate_reference_output(session, instance, api_key_manager, base_url)

    metrics_text = "\n".join([f"{i+1}. {q}" for i, q in enumerate(metrics_questions)])

    prompt = f"""You are a helpful assistant in evaluating the quality of the outputs for a given instruction. Your goal is to select the best output for the given instruction.

Select the Output (a) or Output (b) that is better for the given instruction.

CRITICAL EVALUATION RULES:
(1) **TASK CORRECTNESS FIRST**: Prioritize whether the output honestly/precisely/closely executes the instruction above all else.
(2) **INSTRUCTION ADHERENCE**: Outputs should NOT contain more/less than what the instruction asks for.
(3) **APPROPRIATE SCOPE**: The output should match the scope and specificity requested in the instruction.
(4) **QUALITY WITHIN CORRECTNESS**: Only consider helpfulness, accuracy, detail, etc. AFTER confirming task correctness.

Do NOT provide any explanation for your choice.
Do NOT say both / neither are good.
You should answer using ONLY "Output (a)" or "Output (b)". Do NOT output any other words.

# Instruction:
{instance.input}

# Output (a):
{instance.output_1}

# Output (b):
{instance.output_2}

# Questions about Outputs:
{metrics_text}

# A reference output generated by a strong AI assistant:
{reference_output}

# Which is better, Output (a) or Output (b)? Your response should be either "Output (a)" or "Output (b)":"""

    messages = [
        {"role": "system", "content": "You are a helpful assistant in evaluating output quality."},
        {"role": "user", "content": prompt}
    ]

    api_key = await api_key_manager.get_next_api_key()

    result = await call_api_async(
        session, messages, "provider-3/kimi-k2", api_key, base_url,
        max_tokens=50, temperature=0.0, api_key_manager=api_key_manager
    )

    if not result["success"]:
        logging.warning("Main evaluation API call failed; defaulting predicted label to 1")
        return {
            "predicted_label": 1,
            "metrics_questions": metrics_questions,
            "reference_output": reference_output,
            "raw_response": "Error: Failed to evaluate"
        }

    content = result["content"]
    # Use the first "(a)"/"(b)" mentioned. Checking for (a) before (b) would misread
    # "Output (b) is better than Output (a)" as a vote for (a).
    verdict = re.search(r'\(\s*([ab])\s*\)', content, re.IGNORECASE)
    if verdict:
        predicted_label = 1 if verdict.group(1).lower() == "a" else 2
    else:
        logging.warning(f"Could not parse main evaluation response {content!r}; defaulting predicted label to 1")
        predicted_label = 1

    return {
        "predicted_label": predicted_label,
        "metrics_questions": metrics_questions,
        "reference_output": reference_output,
        "raw_response": content
    }

# Main evaluation function
async def evaluate_instance_flexible(session: aiohttp.ClientSession, instance: LLMBarInstance, api_key_manager: DualAPIKeyManager, base_url: str) -> EvaluationResult:
    """Evaluate an instance using the flexible dimension selection approach.

    Pipeline:
        1. Select the relevant dimensions for the instruction.
        2. Evaluate every selected dimension concurrently with its expert model.
        3. Run the main evaluation, which generates the metrics questions and the
           reference output itself.

    The output that wins more dimensions is the prediction. The main evaluation
    label breaks ties. Confidence is the leading output's win count divided by the
    number of selected dimensions.

    The metrics questions and reference output recorded in the result are the ones
    the main evaluation actually used. They are not generated a second time, which
    saves two API calls per instance.

    Returns:
        A fully populated ``EvaluationResult``.
    """
    start_time = time.time()

    logging.info(f"=== EVALUATING NATURAL INSTANCE {instance.instance_id} ===")
    logging.info(f"Input: {instance.input}")

    # Step 1: Select relevant dimensions
    logging.info("Step 1: Selecting relevant dimensions...")
    selected_dimensions = await select_relevant_dimensions(session, instance, api_key_manager, base_url)

    # Step 2: Evaluate each selected dimension
    logging.info(f"Step 2: Evaluating {len(selected_dimensions)} dimensions...")
    dimension_results = await asyncio.gather(
        *(evaluate_dimension(session, dim, instance, api_key_manager, base_url) for dim in selected_dimensions),
        return_exceptions=True
    )

    dimension_evaluations = {}
    expert_models_used = []

    for dimension, result in zip(selected_dimensions, dimension_results):
        # BaseException also covers asyncio.CancelledError, which gather() can
        # return as a result.
        if isinstance(result, BaseException):
            logging.error(f"Error evaluating dimension {dimension}: {result}")
            dimension_evaluations[dimension] = "tie"
            continue
        dimension_evaluations[result["dimension"]] = result["evaluation"]
        if result["model"]:
            expert_models_used.append(result["model"])

    # Step 3: Main evaluation (also yields the metrics questions and reference output)
    logging.info("Step 3: Performing main evaluation...")
    main_result = await evaluate_instance_main(session, instance, api_key_manager, base_url)
    metrics_questions = main_result["metrics_questions"]
    reference_output = main_result["reference_output"]

    # Count dimension wins
    verdicts = list(dimension_evaluations.values())
    output_1_wins = verdicts.count("1")
    output_2_wins = verdicts.count("2")
    ties = verdicts.count("tie")

    # The dimension majority decides; the main evaluation only breaks ties.
    if output_1_wins > output_2_wins:
        final_prediction = 1
    elif output_2_wins > output_1_wins:
        final_prediction = 2
    else:
        final_prediction = main_result["predicted_label"]

    # Calculate confidence
    total_dimensions = len(selected_dimensions)
    if total_dimensions > 0:
        confidence = max(output_1_wins, output_2_wins) / total_dimensions
    else:
        confidence = 0.5

    reasoning = f"Dimension wins: Output 1 ({output_1_wins}), Output 2 ({output_2_wins}), Ties ({ties}). Main evaluation: {main_result['predicted_label']}"

    return EvaluationResult(
        instance_id=instance.instance_id,
        input=instance.input,
        output_1=instance.output_1,
        output_2=instance.output_2,
        gold_label=instance.gold_label,
        predicted_label=final_prediction,
        reasoning=reasoning,
        selected_dimensions=selected_dimensions,
        metrics_questions=metrics_questions,
        reference_output=reference_output,
        dimension_evaluations=dimension_evaluations,
        expert_models_used=expert_models_used,
        evaluation_time=time.time() - start_time,
        correct=(final_prediction == instance.gold_label),
        confidence=confidence
    )

# Process instances
async def process_instances_flexible(instances: List[LLMBarInstance], api_key_manager: DualAPIKeyManager, base_url: str, session: Optional[aiohttp.ClientSession] = None, delay_seconds: float = 60.0) -> List[EvaluationResult]:
    """Process instances sequentially using the flexible dimension selection approach.

    Args:
        instances: Instances to evaluate, in order.
        api_key_manager: Supplies API keys for every request.
        base_url: API root URL.
        session: aiohttp session to reuse. When omitted, a session is created for
            the duration of the call and closed afterwards.
        delay_seconds: Pause between consecutive instances to respect rate limits.

    Returns:
        One ``EvaluationResult`` per instance, in input order.
    """
    if session is None:
        # Previously a None session was passed through and every API call failed.
        async with aiohttp.ClientSession() as owned_session:
            return await process_instances_flexible(
                instances, api_key_manager, base_url, owned_session, delay_seconds
            )

    results = []
    last_index = len(instances) - 1

    for i, instance in enumerate(instances):
        logging.info(f"Processing Natural instance {i+1}/{len(instances)}: {instance.instance_id}")

        result = await evaluate_instance_flexible(session, instance, api_key_manager, base_url)
        results.append(result)

        # No pause is needed after the final instance.
        if i < last_index:
            logging.info(f"Natural instance {instance.instance_id} completed. Waiting {delay_seconds:g} seconds...")
            await asyncio.sleep(delay_seconds)

    return results

# Save results
def save_results(results: List[EvaluationResult], output_path: str):
    """Write per-instance results and aggregate metrics as two JSON files.

    The files are ``<output_path>_results.json`` and ``<output_path>_metrics.json``.
    All averages are reported as 0 when ``results`` is empty.
    """
    # Field order here determines the key order in the results file.
    exported_fields = (
        "instance_id", "input", "output_1", "output_2", "gold_label",
        "predicted_label", "confidence", "selected_dimensions", "metrics_questions",
        "reference_output", "dimension_evaluations", "expert_models_used",
        "evaluation_time", "reasoning", "correct",
    )
    results_data = [
        {field_name: getattr(entry, field_name) for field_name in exported_fields}
        for entry in results
    ]

    results_file = f"{output_path}_results.json"
    with open(results_file, "w", encoding="utf-8") as out:
        json.dump(results_data, out, indent=2, ensure_ascii=False)

    # Aggregate metrics
    total = len(results)
    hits = sum(1 for entry in results if entry.correct)
    misses = sum(1 for entry in results if not entry.correct)

    if results:
        accuracy = hits / total
        mean_confidence = sum(entry.confidence for entry in results) / total
        mean_dimensions = sum(len(entry.selected_dimensions) for entry in results) / total
    else:
        accuracy = 0
        mean_confidence = 0
        mean_dimensions = 0

    metrics = {
        "overall_accuracy": accuracy,
        "total_instances": total,
        "correct_predictions": hits,
        "incorrect_predictions": misses,
        "average_confidence": mean_confidence,
        "average_dimensions_per_instance": mean_dimensions
    }

    metrics_file = f"{output_path}_metrics.json"
    with open(metrics_file, "w", encoding="utf-8") as out:
        json.dump(metrics, out, indent=2, ensure_ascii=False)

    logging.info(f"Results saved to {output_path}_results.json")
    logging.info(f"Metrics saved to {output_path}_metrics.json")

# Print summary
def print_summary_table(results: List[EvaluationResult]):
    """Print accuracy, confidence and dimension-usage statistics to stdout.

    Prints "No results to display" and returns when ``results`` is empty.
    """
    if not results:
        print("No results to display")
        return

    total = len(results)
    hits = sum(1 for entry in results if entry.correct)
    accuracy_pct = (hits / total) * 100
    confidence_pct = sum(entry.confidence for entry in results) / total * 100
    mean_dimensions = sum(len(entry.selected_dimensions) for entry in results) / total

    print("\nLLMBar Natural Evaluation Summary Table:")
    print("=" * 80)
    print(f"Overall Accuracy: {accuracy_pct:.1f}% ({hits}/{total})")
    print(f"Average Confidence: {confidence_pct:.1f}%")
    print(f"Average Dimensions per Instance: {mean_dimensions:.1f}")

    # Count how many times each dimension was selected.
    usage = {}
    for entry in results:
        for name in entry.selected_dimensions:
            usage[name] = usage.get(name, 0) + 1

    print("\nDimension Usage Analysis:")
    print("-" * 80)
    print(f"Total Unique Dimensions Used: {len(usage)} out of {len(ALL_DIMENSIONS)}")
    print(f"Dimension Coverage: {(len(usage) / len(ALL_DIMENSIONS)) * 100:.1f}%")

    # Most frequently selected first; ties keep first-seen order.
    ranked = sorted(usage.items(), key=lambda pair: pair[1], reverse=True)

    print("\nTop 10 Most Used Dimensions:")
    print("-" * 50)
    for name, count in ranked[:10]:
        share = (count / total) * 100
        print(f"{name:<25} {count:>3} ({share:>5.1f}%)")

# Parse arguments
def _positive_int(value: str) -> int:
    """argparse ``type`` callable: parse *value* as an int and require it to be >= 1."""
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid int value: {value!r}") from None
    if number < 1:
        raise argparse.ArgumentTypeError(f"must be a positive integer, got {number}")
    return number


def parse_args(argv: Optional[List[str]] = None):
    """Parse command line arguments.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is what existing callers get.

    Returns:
        argparse.Namespace with ``dataset_path``, ``api_key_1``, ``api_key_2``,
        ``base_url``, ``output_path``, ``max_instances``, ``chunk_size`` and
        ``resume``.

    Raises:
        SystemExit: Raised by argparse (exit status 2) when ``--api_key_1`` is
            missing or an argument is invalid, e.g. a non-positive
            ``--chunk_size``.
    """
    parser = argparse.ArgumentParser(description="LLMBar Natural Evaluation Script")
    parser.add_argument(
        "--dataset_path",
        type=str,
        default="LLMBar-main/LLMBar-main/Dataset/LLMBar/Natural/dataset.json",
        help="Path to Natural dataset JSON file"
    )
    parser.add_argument(
        "--api_key_1",
        type=str,
        required=True,
        help="First API key"
    )
    parser.add_argument(
        "--api_key_2",
        type=str,
        default="",
        help="Second API key (optional)"
    )
    parser.add_argument(
        "--base_url",
        type=str,
        default="https://api.example.com/v1",
        help="API base URL"
    )
    parser.add_argument(
        "--output_path",
        type=str,
        default="natural_evaluation",
        help="Output path prefix for results and metrics"
    )
    parser.add_argument(
        "--max_instances",
        type=int,
        default=0,
        help="Limit processing to the first N instances (0 means all)"
    )
    parser.add_argument(
        "--chunk_size",
        # The chunk size is used as a range() step by the caller, so zero or a
        # negative value must be rejected here rather than crash mid-run.
        type=_positive_int,
        default=10,
        help="Number of instances to process in each chunk before saving checkpoint"
    )
    parser.add_argument(
        "--resume",
        action='store_true',
        help="Resume from checkpoint if available"
    )

    return parser.parse_args(argv)

# Main function
async def main():
    """Run the Natural LLMBar evaluation end to end.

    Steps: parse CLI arguments, load the dataset, optionally drop instances
    already recorded in a checkpoint (``--resume``), evaluate the remaining
    instances chunk by chunk with a checkpoint written after every chunk, then
    save results, print the summary and delete the checkpoint file.

    If a chunk fails, the error is reported and the function returns without
    writing final results, leaving any checkpoint in place for ``--resume``.
    """
    args = parse_args()

    # The chunk size is used as a range() step below: zero would raise and a
    # negative value would silently process nothing.
    if args.chunk_size < 1:
        print(f"[ERROR] --chunk_size must be a positive integer, got {args.chunk_size}")
        return

    # Initialize API key manager
    api_key_manager = DualAPIKeyManager(args.api_key_1, args.api_key_2)

    # Load Natural dataset
    instances = load_natural_dataset(args.dataset_path, args.max_instances)
    if not instances:
        print("No Natural instances loaded. Exiting.")
        return

    print(f"[INFO] Loaded {len(instances)} instances from Natural dataset")

    # Size of the whole workload, captured before resume filtering so that
    # cumulative progress (len(results)) is reported against a matching total.
    overall_total = len(instances)

    # Initialize checkpoint handling
    checkpoint_path = get_checkpoint_path(args.output_path)
    results = []
    processed_instances = set()

    if args.resume:
        results, processed_instances, _ = load_checkpoint(checkpoint_path)
        if results:
            print(f"[INFO] Resuming from checkpoint with {len(results)} completed evaluations")
            instances = [inst for inst in instances if inst.instance_id not in processed_instances]
            print(f"[INFO] {len(instances)} Natural instances remaining to process")

    # Set up logging (return value is not needed here)
    setup_logging()

    if instances:
        print("[INFO] Running Natural evaluation with real API calls")

        # Create aiohttp session
        import ssl
        # SECURITY NOTE(review): certificate and hostname verification are
        # disabled, so API keys travel over a connection that is not
        # authenticated. Kept as-is to avoid breaking existing deployments;
        # confirm whether this is still required.
        ssl_context = ssl.create_default_context()
        ssl_context.check_hostname = False
        ssl_context.verify_mode = ssl.CERT_NONE

        connector = aiohttp.TCPConnector(limit=1, limit_per_host=1, ssl=ssl_context)
        timeout = aiohttp.ClientTimeout(total=60)

        async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
            # Process instances in chunks with checkpointing
            remaining = len(instances)
            print(f"[INFO] Processing {remaining} Natural instances in chunks of {args.chunk_size}")

            chunk_starts = range(0, remaining, args.chunk_size)
            for chunk_number, chunk_start in enumerate(chunk_starts, start=1):
                chunk_end = min(chunk_start + args.chunk_size, remaining)
                chunk_instances = instances[chunk_start:chunk_end]

                print(f"[INFO] Processing Natural chunk {chunk_number}: instances {chunk_start+1}-{chunk_end} of {remaining}")

                try:
                    chunk_results = await process_instances_flexible(chunk_instances, api_key_manager, args.base_url, session)

                    results.extend(chunk_results)
                    processed_instances.update(result.instance_id for result in chunk_results)

                    save_checkpoint(checkpoint_path, results, processed_instances, overall_total)

                    print(f"[INFO] Natural chunk {chunk_number} completed. Total progress: {len(results)}/{overall_total}")

                except Exception as e:
                    # Top-level boundary: keep the traceback in the log, tell
                    # the user what can be resumed, and stop the run.
                    logging.exception("Error processing Natural chunk %d", chunk_number)
                    print(f"[ERROR] Error processing Natural chunk {chunk_number}: {e}")
                    if os.path.exists(checkpoint_path):
                        print("[INFO] Progress up to the last saved checkpoint is kept. You can resume with --resume flag")
                    else:
                        print("[INFO] No checkpoint has been written yet; rerun to start over")
                    return
    else:
        # Only reachable after --resume filtered out every instance. A
        # checkpoint that still exists means the previous run never reached
        # its final save/cleanup, so finish that work now.
        print("[INFO] No Natural instances remaining to process. Finalizing results from checkpoint")

    # Save final results
    save_results(results, args.output_path)

    # Print summary table
    print_summary_table(results)

    # Print final summary
    print("\n[INFO] === NATURAL EVALUATION SUMMARY ===")
    if results:
        n_results = len(results)
        print(f"[INFO] Overall Accuracy: {sum(1 for r in results if r.correct) / n_results:.3f}")
        print(f"[INFO] Total Natural Instances: {n_results}")
        print(f"[INFO] Average Dimensions per Instance: {sum(len(r.selected_dimensions) for r in results) / n_results:.1f}")
    else:
        # Avoid dividing by zero when no chunk produced any result.
        print("[INFO] Total Natural Instances: 0 (no evaluation results were produced)")

    # output_path is a prefix; these are the files written by save_results
    print(f"\nResults saved to: {args.output_path}_results.json")
    print(f"Metrics saved to: {args.output_path}_metrics.json")

    # Clean up checkpoint file after successful completion
    if os.path.exists(checkpoint_path):
        os.remove(checkpoint_path)
        print("[INFO] Checkpoint file cleaned up")

# Script entry point: only when executed directly (not on import), run the
# async main() coroutine to completion via asyncio.run().
if __name__ == "__main__":
    asyncio.run(main())
