import json
import os
import argparse
import numpy as np
from tqdm import tqdm
from typing import List, Dict, Any
from pathlib import Path
from openai import OpenAI
import time


class GameCorrectnessEvaluator:
                       

    def __init__(self, api_key: str, base_url: str = "https://api.openai.com/v1",
                 model: str = "gpt-4o", temperature: float = 0.01):
        """Set up the judge: prompt templates, request settings and API client.

        Args:
            api_key: Key handed to the ``OpenAI`` client.
            base_url: Endpoint handed to the ``OpenAI`` client.
            model: Model name sent with every judge request.
            temperature: Sampling temperature sent with every judge request.
        """
        # Correctness template; filled through str.format with the
        # {question}, {answers} and {prediction} placeholders.
        self.evaluation_prompt = """You are an extremely strict game knowledge correctness evaluator. Your task is to evaluate the accuracy of a predicted answer against the ground truth answer for a game-related question with MAXIMUM RIGOR.

ULTRA-STRICT EVALUATION CRITERIA:

1. FACTUAL ACCURACY (ZERO TOLERANCE):
   - Every single fact about game mechanics, rules, systems, and lore must be 100% correct
   - ANY factual error, no matter how minor, significantly impacts the score
   - Factual claims not supported by the ground truth are considered errors

2. NUMERICAL INFORMATION (EXACT PRECISION):
   - All numbers, statistics, values, quantities, percentages must be exactly correct
   - Even tiny numerical discrepancies (>±2%) are heavily penalized
   - Approximations are only acceptable if explicitly indicated as such

3. TERMINOLOGY AND NAMES (PERFECT ACCURACY):
   - Character names, location names, item names, ability names, and ALL game-specific terms must be spelled exactly correctly
   - Any misspelling or incorrect terminology significantly reduces the score
   - Synonyms are only acceptable if they are commonly recognized alternatives

4. COMPLETENESS AND COVERAGE (COMPREHENSIVE):
   - The answer must address EVERY aspect of the question thoroughly
   - Missing ANY critical information mentioned in the ground truth is a major defect
   - Partial answers that leave important aspects unanswered are heavily penalized

5. ADDITIONAL INFORMATION (STRICT VERIFICATION):
   - Any extra information beyond the ground truth must be 100% accurate and verifiable
   - Speculative content, unsupported claims, or hallucinations result in immediate score reduction
   - Information that contradicts or confuses the ground truth is unacceptable

ULTRA-STRICT 3-LEVEL SCORING SYSTEM:

- 2 (Exceptionally Perfect):
  * EVERY fact is 100% accurate with zero errors
  * ALL numbers and terminology are precisely correct
  * Comprehensively addresses the question with complete coverage
  * Any additional information is verified accurate and genuinely helpful
  * No ambiguity, no errors, no omissions - truly exemplary answer

- 1 (Acceptable with Minor Flaws):
  * Core facts are accurate but contains 1-2 very minor issues
  * Slight numerical discrepancies (≤±2%) OR missing 1-2 non-essential details
  * Addresses the main question adequately but may lack some depth
  * Minor terminology issues that don't materially affect understanding
  * Overall sound but not perfect

- 0 (Defective/Inadequate):
  * Contains ANY significant factual errors or multiple minor errors
  * Notable numerical inaccuracies (>±2%) or missing important quantitative information
  * Fails to address key aspects of the question or provides incomplete coverage
  * Contains questionable information, contradictions, or unsupported claims
  * Any hallucination, fabrication, or misleading content

Question: {question}
Ground Truth Answer: {answers}
Predicted Answer: {prediction}

Apply MAXIMUM STRICTNESS in your evaluation. Score 2 should be EXTREMELY RARE and reserved only for truly flawless answers. Score 1 should be given only to genuinely good answers with minimal, non-critical flaws. Score 0 for everything else, including answers that are "mostly correct" but have clear defects. Remember: being strict protects the integrity of the evaluation. Return your evaluation as a JSON object with the "accuracy" field (0, 1, or 2)."""

        # Faithfulness template; filled through str.format with the
        # {question}, {contexts} and {prediction} placeholders.
        self.faithfulness_prompt = """You are an extremely strict faithfulness evaluator for game-related question answering systems. Your task is to evaluate whether the predicted answer is ENTIRELY FAITHFUL to the provided retrieved context documents with MAXIMUM RIGOR.

ULTRA-STRICT FAITHFULNESS EVALUATION CRITERIA:

1. INFORMATION SOURCE VERIFICATION (ZERO TOLERANCE):
   - EVERY piece of information in the predicted answer MUST be directly supported by the retrieved context
   - ANY claim, fact, or detail not found in the provided contexts is considered a faithfulness violation
   - Inferences or logical deductions that go beyond what's explicitly stated in contexts are NOT allowed

2. FACTUAL CONSISTENCY (EXACT ALIGNMENT):
   - All facts, numbers, names, dates, and details must match EXACTLY with the context
   - No paraphrasing that changes meaning or introduces ambiguity
   - No combining information from different contexts in ways that create new unsupported claims

3. CONTEXT GROUNDING (MANDATORY SUPPORT):
   - Each statement in the answer must be traceable to specific parts of the retrieved contexts
   - Information synthesis is only acceptable if it directly reflects what's stated in the contexts
   - No external knowledge beyond what's provided in the contexts

4. HALLUCINATION DETECTION (ZERO TOLERANCE):
   - Any information not present in the contexts is considered hallucination
   - This includes reasonable-sounding but unverified details about game mechanics, characters, locations, etc.
   - Even "common knowledge" about games must be present in contexts to be considered faithful

5. OMISSION vs ADDITION PRINCIPLE:
   - It's better to omit information not in contexts than to add unsupported information
   - Incomplete but faithful answers are preferred over complete but unfaithful ones

ULTRA-STRICT 3-LEVEL SCORING SYSTEM:

- 2 (Perfectly Faithful):
  * EVERY statement in the answer is directly supported by the retrieved contexts
  * No hallucinations, no unsupported claims, no external information
  * Perfect alignment between answer content and context information
  * May be incomplete but everything stated is verifiable from contexts

- 1 (Mostly Faithful with Minor Issues):
  * Core information is supported by contexts but contains 1-2 minor unsupported details
  * Slight paraphrasing that doesn't change meaning significantly
  * Minor inferences that are very close to what's stated in contexts
  * Overall faithful but not perfectly grounded

- 0 (Unfaithful/Hallucinated):
  * Contains significant information not found in the retrieved contexts
  * Multiple unsupported claims or facts
  * Introduces external knowledge not present in contexts
  * Creates new information through inappropriate synthesis
  * Any clear hallucination or fabrication

Question: {question}
Retrieved Contexts: {contexts}
Predicted Answer: {prediction}

Apply MAXIMUM STRICTNESS in evaluating faithfulness. Score 2 should be EXTREMELY RARE and only for answers that are completely verifiable from the contexts. Score 1 only for answers that are largely faithful with minimal unsupported content. Score 0 for everything else that contains unverified information. Remember: faithfulness means the answer can ONLY contain information that is explicitly or very clearly implied in the provided contexts. Return your evaluation as a JSON object with the "faithfulness" field (0, 1, or 2)."""

        # Connection and request settings, kept on the instance for later calls.
        self.api_key, self.base_url = api_key, base_url
        self.model, self.temperature = model, temperature

        # One client per evaluator, shared by both evaluate_* methods.
        self.client = OpenAI(api_key=self.api_key, base_url=self.base_url)

    def evaluate_single(self, question: str, ground_truth: str, prediction: str) -> float:
        """Score a predicted answer against the ground truth with the judge model.

        ``self.evaluation_prompt`` is filled with the three texts, sent through
        ``self.client.chat.completions.create`` in JSON mode, and the
        ``"accuracy"`` field of the reply is read back.

        Args:
            question: The question that was asked.
            ground_truth: Reference answer (substituted for ``{answers}``).
            prediction: Answer under evaluation.

        Returns:
            ``0.0``, ``1.0`` or ``2.0``. Any failure (request error, empty or
            malformed reply, out-of-range score) also yields ``0.0`` so that one
            bad sample never aborts a run; a warning is printed in those cases
            so they can be told apart from a genuine zero. A reply without an
            ``"accuracy"`` field is treated as score 0.
        """
        try:
            full_prompt = self.evaluation_prompt.format(
                question=question,
                answers=ground_truth,
                prediction=prediction
            )

            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": "You are a professional game knowledge correctness evaluator. Always respond with valid JSON format."},
                    {"role": "user", "content": full_prompt}
                ],
                temperature=self.temperature,
                response_format={"type": "json_object"}
            )

            # The message content can be None, so never call .strip() on it directly.
            result_text = (response.choices[0].message.content or "").strip()
            if not result_text:
                print("Warning: correctness evaluation returned an empty response; scoring 0")
                return 0.0

            result_json = json.loads(result_text)
            if not isinstance(result_json, dict):
                print("Warning: correctness evaluation did not return a JSON object; scoring 0")
                return 0.0

            accuracy = result_json.get('accuracy', 0)
            if isinstance(accuracy, str):
                # Judges occasionally quote the number ("2"); accept that form too.
                try:
                    accuracy = float(accuracy.strip())
                except ValueError:
                    pass  # left as-is, rejected by the range check below

            if accuracy in (0, 1, 2):
                return float(accuracy)

            print(f"Warning: correctness evaluation returned an invalid score {accuracy!r}; scoring 0")
            return 0.0

        except json.JSONDecodeError as e:
            print(f"Warning: correctness evaluation returned malformed JSON: {e}")
            return 0.0
        except Exception as e:
            # Best-effort boundary: API/network errors must not abort the whole run.
            print(f"Warning: correctness evaluation failed: {e}")
            return 0.0

    def evaluate_faithfulness(self, question: str, contexts: List[str], prediction: str) -> float:
        """Score how well a predicted answer is grounded in the retrieved contexts.

        The contexts are numbered ("Context 1:", "Context 2:", ...) and joined
        with blank lines, ``self.faithfulness_prompt`` is filled in, the prompt
        is sent through ``self.client.chat.completions.create`` in JSON mode,
        and the ``"faithfulness"`` field of the reply is read back.

        Args:
            question: The question that was asked.
            contexts: Retrieved context passages shown to the judge.
            prediction: Answer under evaluation.

        Returns:
            ``0.0``, ``1.0`` or ``2.0``. Any failure (request error, empty or
            malformed reply, out-of-range score) also yields ``0.0`` so that one
            bad sample never aborts a run; a warning is printed in those cases
            so they can be told apart from a genuine zero. A reply without a
            ``"faithfulness"`` field is treated as score 0.
        """
        try:
            contexts_str = "\n\n".join([f"Context {i+1}:\n{ctx}" for i, ctx in enumerate(contexts)])

            full_prompt = self.faithfulness_prompt.format(
                question=question,
                contexts=contexts_str,
                prediction=prediction
            )

            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": "You are a professional faithfulness evaluator for game knowledge QA systems. Always respond with valid JSON format."},
                    {"role": "user", "content": full_prompt}
                ],
                temperature=self.temperature,
                response_format={"type": "json_object"}
            )

            # The message content can be None, so never call .strip() on it directly.
            result_text = (response.choices[0].message.content or "").strip()
            if not result_text:
                print("Warning: faithfulness evaluation returned an empty response; scoring 0")
                return 0.0

            result_json = json.loads(result_text)
            if not isinstance(result_json, dict):
                print("Warning: faithfulness evaluation did not return a JSON object; scoring 0")
                return 0.0

            faithfulness = result_json.get('faithfulness', 0)
            if isinstance(faithfulness, str):
                # Judges occasionally quote the number ("2"); accept that form too.
                try:
                    faithfulness = float(faithfulness.strip())
                except ValueError:
                    pass  # left as-is, rejected by the range check below

            if faithfulness in (0, 1, 2):
                return float(faithfulness)

            print(f"Warning: faithfulness evaluation returned an invalid score {faithfulness!r}; scoring 0")
            return 0.0

        except json.JSONDecodeError as e:
            print(f"Warning: faithfulness evaluation returned malformed JSON: {e}")
            return 0.0
        except Exception as e:
            # Best-effort boundary: API/network errors must not abort the whole run.
            print(f"Warning: faithfulness evaluation failed: {e}")
            return 0.0


class ChronoPlayCustomGenerationEvaluator:
                            

    def __init__(self, config: Dict):
        """Build the evaluation pipeline from a configuration mapping.

        Keys read from ``config``: ``game_name`` (default ``'dyinglight2'``),
        ``target_segment_id`` (default ``None``), ``api_key`` (required; a
        missing key raises ``KeyError``), ``openai_api``, ``model_name`` and
        ``temperature`` (the last three fall back to the judge defaults).
        """
        self.config = config
        self.game_name = config.get('game_name', 'dyinglight2')
        self.target_segment_id = config.get('target_segment_id', None)

        # Settings forwarded to the LLM judge; 'api_key' is the only mandatory one.
        judge_settings = {
            'api_key': config['api_key'],
            'base_url': config.get('openai_api', 'https://api.openai.com/v1'),
            'model': config.get('model_name', 'gpt-4o'),
            'temperature': config.get('temperature', 0.01),
        }
        self.evaluator = GameCorrectnessEvaluator(**judge_settings)

        # Metrics evaluated when a caller does not name any.
        self.default_metrics = ['correctness', 'faithfulness']

        # Filled by load_generation_results with the matching retrieval records.
        self.retrieval_data_cache = {}

    def load_retrieval_data(self, generation_results_path: str) -> Dict[int, Dict]:
        """Load the retrieval records that belong to a generation-results file.

        The file stem is split on ``_``. The segment id is the token right after
        the first ``segment`` token; the game name is built from the tokens
        between the last ``retrieval`` token preceding it and that ``segment``
        token, ignoring ``text``/``embedding``/``openai`` and purely numeric
        tokens. The retrieval file is then looked up in
        ``<generation dir>/../retrieval_results``: three well-known file names
        are tried first, then any ``retrieval_<game>_segment_<id>_*_k5.jsonl``.

        Args:
            generation_results_path: Path of the generation-results JSONL file.

        Returns:
            Mapping ``question_index -> retrieval record``. An empty dict is
            returned when the name cannot be parsed, no retrieval file exists,
            or reading fails (a warning is printed in the last case).
        """
        try:
            gen_path = Path(generation_results_path)
            parts = gen_path.stem.split('_')

            if 'segment' not in parts:
                return {}
            segment_index = parts.index('segment')
            segment_id = parts[segment_index + 1] if segment_index + 1 < len(parts) else None

            # Only a 'retrieval' marker located before the first 'segment' counts;
            # when there are several, the last one wins.
            head = parts[:segment_index]
            if 'retrieval' not in head:
                return {}
            retrieval_index = len(head) - 1 - head[::-1].index('retrieval')

            # Tokens between the two markers form the game name. Without at least
            # one usable token there is nothing to look up, so no other fallback
            # can succeed.
            ignored_tokens = ('text', 'embedding', 'openai')
            game_name = '_'.join(
                part for part in parts[retrieval_index + 1:segment_index]
                if part not in ignored_tokens and not part.isdigit()
            )

            if not game_name or not segment_id:
                return {}

            retrieval_dir = gen_path.parent.parent / "retrieval_results"

            possible_filenames = [
                f"retrieval_{game_name}_segment_{segment_id}_openai_text_embedding_3_small_k5.jsonl",
                f"retrieval_{game_name}_segment_{segment_id}_text_embedding_3_small_k5.jsonl",
                f"retrieval_{game_name}_segment_{segment_id}_BAAI_bge_m3_k5.jsonl",
            ]

            retrieval_path = None
            for filename in possible_filenames:
                potential_path = retrieval_dir / filename
                if potential_path.exists():
                    retrieval_path = potential_path
                    break

            if retrieval_path is None:
                # Fallback: any embedding-model suffix. Sorted so the choice is
                # reproducible when several files match.
                matches = sorted(retrieval_dir.glob(f"retrieval_{game_name}_segment_{segment_id}_*_k5.jsonl"))
                if not matches:
                    return {}
                retrieval_path = matches[0]

            retrieval_data = {}
            with open(retrieval_path, 'r', encoding='utf-8') as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    item = json.loads(line)
                    question_index = item.get('question_index')
                    if question_index is not None:
                        retrieval_data[question_index] = item

            return retrieval_data

        except Exception as e:
            # Retrieval data only enriches the metadata, so evaluation continues
            # without it; the reason is reported instead of being hidden.
            print(f"Warning: could not load retrieval data for {generation_results_path}: {e}")
            return {}

    def load_generation_results(self, generation_results_path: str) -> List[Dict]:
        """Read a generation-results JSONL file into normalized evaluation items.

        The matching retrieval records are loaded first (and kept in
        ``self.retrieval_data_cache``); when a record exists for an item's
        ``question_index``, its ``original_qa_data`` supplies ``question_type``
        and ``task_type``, otherwise both stay ``'unknown'``.

        Args:
            generation_results_path: Path of the JSONL file, one JSON object per
                non-blank line.

        Returns:
            One dict per input line with the keys ``question``, ``query``,
            ``rag_answer``, ``ground_truth_answer``, ``retrieved_docs``,
            ``contexts``, ``generation_time``, ``total_time``,
            ``question_index`` and ``metadata``.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If a non-blank line is not valid JSON.
        """
        retrieval_data = self.load_retrieval_data(generation_results_path)
        self.retrieval_data_cache = retrieval_data

        generation_data = []
        with open(generation_results_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue

                item = json.loads(line)
                question_index = item.get('question_index', -1)

                # Type labels live in the retrieval record, not in the generation output.
                question_type = 'unknown'
                task_type = 'unknown'
                if question_index in retrieval_data:
                    original_qa_data = retrieval_data[question_index].get('original_qa_data', {})
                    question_type = original_qa_data.get('question_type', 'unknown')
                    task_type = original_qa_data.get('task_type', 'unknown')

                # Two output layouts are accepted: settings under 'config', or
                # under 'retrieval_config' / 'generation_config'.
                config = item.get('config', {})
                generation_config = item.get('generation_config', {})

                eval_item = {
                    'question': item.get('question', ''),
                    'query': item.get('query', item.get('question', '')),
                    'rag_answer': item.get('answer', item.get('rag_answer', '')),
                    'ground_truth_answer': item.get('ground_truth_answer', ''),
                    'retrieved_docs': item.get('retrieved_docs', []),
                    'contexts': item.get('contexts', []),
                    'generation_time': item.get('generation_time', 0),
                    'total_time': item.get('total_time', 0),
                    'question_index': question_index,
                    'metadata': {
                        'game': self.game_name,
                        'segment_id': config.get('segment_id',
                                                 item.get('retrieval_config', {}).get('segment_id')),
                        'question_type': question_type,
                        'task_type': task_type,
                        'difficulty': item.get('original_data', {}).get('difficulty', 'unknown'),
                        'model': config.get('model_name',
                                            generation_config.get('llm_model', 'unknown')),
                        'temperature': config.get('temperature',
                                                  generation_config.get('temperature', 0.0)),
                        'max_tokens': config.get('max_tokens', 0)
                    }
                }

                # Derive plain-text contexts from the retrieved documents when the
                # generation output did not store them explicitly.
                if not eval_item['contexts'] and eval_item['retrieved_docs']:
                    eval_item['contexts'] = [
                        doc.get('content', '') if isinstance(doc, dict) else str(doc)
                        for doc in eval_item['retrieved_docs']
                    ]

                generation_data.append(eval_item)

        return generation_data

    def evaluate_generation_metrics(self, generation_data: List[Dict],
                                    metrics: List[str] = None, test_mode: bool = False) -> Dict[str, Any]:
        """Judge every generated answer and aggregate the scores.

        Args:
            generation_data: Items as produced by ``load_generation_results``.
            metrics: Any of ``'correctness'`` / ``'faithfulness'``; ``None``
                selects ``self.default_metrics``.
            test_mode: When true, only the first five items are judged.

        Returns:
            Dict with ``overall`` (per metric: ``mean``, ``count``, ``total``),
            ``by_question_type`` and ``by_task_type``.
        """
        if metrics is None:
            metrics = self.default_metrics

        if test_mode:
            # Smoke-test run: keep at most five samples.
            generation_data = generation_data[:5]

        detailed_results = []
        for idx, entry in enumerate(tqdm(generation_data, desc="🔍 自定义评估", unit="项")):
            question = entry.get('question', '')
            answer = entry.get('rag_answer', '')
            reference = entry.get('ground_truth_answer', '')
            meta = entry.get('metadata', {})

            # Registered up front and updated in place, so a sample keeps its
            # zero scores whenever judging is skipped or fails.
            record = {
                'index': idx,
                'question_type': meta.get('question_type', 'unknown'),
                'task_type': meta.get('task_type', 'unknown'),
                'correctness_score': 0.0,
                'faithfulness_score': 0.0
            }
            detailed_results.append(record)

            # NOTE(review): samples without a ground truth are skipped even when
            # only faithfulness (which does not use it) is requested - confirm
            # that this is intended.
            if not (answer and reference):
                continue

            try:
                if 'correctness' in metrics:
                    record['correctness_score'] = self.evaluator.evaluate_single(
                        question=question,
                        ground_truth=reference,
                        prediction=answer
                    )

                if 'faithfulness' in metrics:
                    # Prefer stored contexts; otherwise rebuild them from the raw documents.
                    contexts = entry.get('contexts', []) or [
                        doc.get('content', '') if isinstance(doc, dict) else str(doc)
                        for doc in entry.get('retrieved_docs', [])
                    ]
                    if not contexts:
                        print("No contexts found for faithfulness evaluation")
                    else:
                        record['faithfulness_score'] = self.evaluator.evaluate_faithfulness(
                            question=question,
                            contexts=contexts,
                            prediction=answer
                        )

                # Short pause between samples to go easy on the API.
                time.sleep(0.1)

            except Exception as e:
                print(f"Error evaluating generation metrics for item {idx}: {e}")

        type_results = self._calculate_type_metrics(detailed_results, metrics)

        # Same aggregation for both metrics; only negative scores are left out.
        overall_results = {}
        for metric in ('correctness', 'faithfulness'):
            if metric not in metrics:
                continue
            score_key = f'{metric}_score'
            scores = [row[score_key] for row in detailed_results if row[score_key] >= 0]
            overall_results[metric] = {
                'mean': float(np.mean(scores)) if scores else 0.0,
                'count': len(scores),
                'total': len(detailed_results)
            }

        return {
            'overall': overall_results,
            'by_question_type': type_results['by_question_type'],
            'by_task_type': type_results['by_task_type']
        }

    def _calculate_type_metrics(self, detailed_results: List[Dict], metrics: List[str]) -> Dict[str, Dict]:
        """Average the per-sample scores per question type and per task type.

        Args:
            detailed_results: Per-sample dicts carrying ``question_type``,
                ``task_type``, ``correctness_score`` and ``faithfulness_score``.
            metrics: Metrics to aggregate (``'correctness'`` / ``'faithfulness'``).

        Returns:
            ``{'by_question_type': {...}, 'by_task_type': {...}}``; each inner
            dict maps a type label to ``<metric>_mean`` / ``<metric>_count``
            entries. Negative scores are ignored, and labels that collected no
            score do not appear.
        """
        from collections import defaultdict

        selected = [name for name in ('correctness', 'faithfulness') if name in metrics]

        def summarise(label_field):
            # Pass 1: collect the non-negative scores per label.
            buckets = defaultdict(lambda: {'correctness': [], 'faithfulness': []})
            for row in detailed_results:
                label = row.get(label_field, 'unknown')
                for name in selected:
                    score = row.get(f'{name}_score', 0.0)
                    if score >= 0:
                        buckets[label][name].append(score)

            # Pass 2: reduce each label's scores to mean and count.
            summary = {}
            for label, collected in buckets.items():
                stats = {}
                for name in selected:
                    values = collected[name]
                    if values:
                        stats[f'{name}_mean'] = float(np.mean(values))
                        stats[f'{name}_count'] = len(values)
                if stats:
                    summary[label] = stats
            return summary

        return {
            'by_question_type': summarise('question_type'),
            'by_task_type': summarise('task_type')
        }

    def evaluate_basic_metrics(self, generation_data: List[Dict]) -> Dict[str, Any]:
        """Compute answer-coverage statistics that need no LLM judge.

        Args:
            generation_data: Items carrying an optional ``rag_answer`` string.

        Returns:
            Dict with ``total_questions``, ``valid_answers`` (answers that are
            non-empty after stripping whitespace), ``answer_rate`` and
            ``avg_answer_length`` (mean stripped length of the valid answers);
            both ratios are ``0.0`` when their denominator is zero.
        """
        stripped_lengths = []
        for entry in generation_data:
            answer = entry.get('rag_answer', '')
            if not answer:
                continue
            text = answer.strip()
            if text:
                stripped_lengths.append(len(text))

        total_questions = len(generation_data)
        valid_answers = len(stripped_lengths)

        if generation_data:
            answer_rate = valid_answers / total_questions
        else:
            answer_rate = 0.0

        if valid_answers > 0:
            avg_answer_length = sum(stripped_lengths) / valid_answers
        else:
            avg_answer_length = 0.0

        return {
            'total_questions': total_questions,
            'valid_answers': valid_answers,
            'answer_rate': answer_rate,
            'avg_answer_length': avg_answer_length
        }

    def run_single_evaluation(self, generation_results_path: str, output_path: str = None,
                              metrics: List[str] = None, test_mode: bool = False) -> Dict:
        """Evaluate one generation-results file end to end.

        Args:
            generation_results_path: JSONL file with the generated answers.
            output_path: Optional JSON file for the results; missing parent
                directories are created.
            metrics: Metrics to judge; ``None`` selects ``self.default_metrics``.
            test_mode: Forwarded to ``evaluate_generation_metrics``.

        Returns:
            ``{'llm_metrics': ..., 'basic_metrics': ...}``, or ``{}`` when the
            file holds no items (nothing is written or printed in that case).
        """
        chosen_metrics = self.default_metrics if metrics is None else metrics

        generation_data = self.load_generation_results(generation_results_path)
        if not generation_data:
            return {}

        results = {
            'llm_metrics': self.evaluate_generation_metrics(generation_data, chosen_metrics, test_mode),
            'basic_metrics': self.evaluate_basic_metrics(generation_data),
        }

        if output_path:
            parent_dir = os.path.dirname(output_path)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)
            with open(output_path, 'w', encoding='utf-8') as sink:
                json.dump(results, sink, ensure_ascii=False, indent=2)

        self._print_single_summary(results)
        return results

    def _print_single_summary(self, results: Dict):
        """Print a plain-text summary of one evaluation run to stdout.

        A section is printed only for the parts present in ``results``:
        overall LLM metrics (``results['llm_metrics']['overall']``), basic
        metrics (``results['basic_metrics']``) and the per-question-type /
        per-task-type breakdowns. Nothing is printed for an empty ``results``.

        Args:
            results: Result dict as assembled by ``run_single_evaluation``.
        """
        llm_metrics = results.get('llm_metrics') or {}

        # Overall judge scores.
        overall = llm_metrics.get('overall') or {}
        overall_lines = []
        for metric, label in (('correctness', 'Correctness'), ('faithfulness', 'Faithfulness')):
            if metric in overall:
                stats = overall[metric]
                mean = stats.get('mean', 0.0)
                count = stats.get('count', 0)
                total = stats.get('total', 0)
                overall_lines.append(f"  {label}: mean={mean:.4f} (scored {count} of {total})")
        if overall_lines:
            print("Overall LLM metrics:")
            for line in overall_lines:
                print(line)

        # Judge-free coverage statistics.
        if 'basic_metrics' in results:
            basic = results['basic_metrics']
            print("Basic metrics:")
            print(f"  Total questions: {basic.get('total_questions', 0)}")
            print(f"  Valid answers: {basic.get('valid_answers', 0)}")
            print(f"  Answer rate: {basic.get('answer_rate', 0.0):.2%}")
            print(f"  Average answer length: {basic.get('avg_answer_length', 0.0):.1f}")

        def print_breakdown(title, grouped):
            # One line per type label, listing whichever metrics were aggregated.
            if not grouped:
                return
            print(title)
            for type_name, stats in grouped.items():
                line = f"  {type_name}:"
                if 'correctness_mean' in stats:
                    correctness_mean = stats.get('correctness_mean', 0.0)
                    correctness_count = stats.get('correctness_count', 0)
                    line += f" Correctness={correctness_mean:.4f}({correctness_count})"
                if 'faithfulness_mean' in stats:
                    faithfulness_mean = stats.get('faithfulness_mean', 0.0)
                    faithfulness_count = stats.get('faithfulness_count', 0)
                    line += f" Faithfulness={faithfulness_mean:.4f}({faithfulness_count})"
                print(line)

        print_breakdown("By question type:", llm_metrics.get('by_question_type'))
        print_breakdown("By task type:", llm_metrics.get('by_task_type'))


def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the generation-evaluation script."""
    parser = argparse.ArgumentParser(
        description='ChronoPlay自定义生成效果评估',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
使用示例:
  # 评估单个生成结果文件（默认评估correctness和faithfulness）
  # 结果保存到: evaluation/generation_evaluation/
  python generation_evaluator_custom.py --generation_results ./results/generation_dyinglight2_segment_1_*.jsonl

  # 只评估correctness
  # 结果保存到: evaluation/generation_evaluation/
  python generation_evaluator_custom.py --generation_results ./results/generation_dyinglight2_segment_1_*.jsonl --metrics correctness

  # 只评估faithfulness
  # 结果保存到: evaluation/generation_evaluation_faithfulness/
  python generation_evaluator_custom.py --generation_results ./results/generation_dyinglight2_segment_1_*.jsonl --metrics faithfulness

  # 自定义输出路径
  python generation_evaluator_custom.py --generation_results ./results/generation_dyinglight2_segment_1_*.jsonl --output ./custom_eval_results.json

  # 使用不同的模型
  python generation_evaluator_custom.py --generation_results ./results/generation_dyinglight2_segment_1_*.jsonl --model_name gpt-4o-mini

  # 测试模式（只评估前5个样本）
  python generation_evaluator_custom.py --generation_results ./results/generation_dyinglight2_segment_1_*.jsonl --test_mode

        """
    )

    # Input / output locations.
    parser.add_argument('--generation_results', type=str, required=True,
                        help='生成结果文件路径')
    parser.add_argument('--output', type=str,
                        help='评估结果输出路径 (默认: 自动生成)')

    # Evaluation scope.
    parser.add_argument('--game', type=str, default=None,
                        help='游戏名称 (默认: 从文件路径自动提取)')
    parser.add_argument('--test_mode', action='store_true',
                        help='测试模式，只评估前5个样本')
    parser.add_argument('--metrics', type=str, nargs='+',
                        default=['correctness', 'faithfulness'],
                        choices=['correctness', 'faithfulness'],
                        help='要评估的指标列表 (默认: correctness faithfulness)')

    # LLM judge configuration.
    parser.add_argument('--model_name', type=str, default='gpt-4o',
                        help='LLM模型名称')
    parser.add_argument('--api_key', type=str,
                        default='{your api_key}',
                        help='OpenAI API密钥')
    parser.add_argument('--openai_api', type=str,
                        default='{your base_url}',
                        help='OpenAI API地址')
    return parser


def _extract_game_name(parts: List[str]):
    """Derive the game name from the underscore-separated filename tokens.

    The name is taken from the tokens between ``retrieval`` and the first
    ``segment`` token, skipping ``text``/``embedding``/``openai`` and purely
    numeric tokens. If no ``segment`` token follows, the single token right
    after ``retrieval`` is used instead.

    Returns:
        The detected game name, or ``None`` when nothing could be detected.
    """
    retrieval_index = -1
    segment_index = -1
    for i, part in enumerate(parts):
        if part == 'retrieval':
            retrieval_index = i
        elif part == 'segment':
            # Only the first 'segment' token matters; stop scanning here.
            segment_index = i
            break

    if retrieval_index == -1:
        return None

    if segment_index != -1 and segment_index > retrieval_index + 1:
        kept = [
            part for part in parts[retrieval_index + 1:segment_index]
            if part not in ('text', 'embedding', 'openai') and not part.isdigit()
        ]
        return '_'.join(kept) if kept else None

    if retrieval_index + 1 < len(parts):
        next_part = parts[retrieval_index + 1]
        if next_part not in ('segment', 'text', 'embedding') and not next_part.isdigit():
            return next_part
    return None


def _extract_model_name(parts: List[str], fallback_model: str) -> str:
    """Return the generator model name encoded between ``k5`` and ``t01``.

    When the filename tokens do not contain that pattern, ``fallback_model``
    is used with ``-`` and ``.`` replaced by ``_``.
    """
    k5_index = -1
    t01_index = -1
    for i, part in enumerate(parts):
        if part == 'k5':
            k5_index = i
        elif part == 't01':
            t01_index = i
            break

    if k5_index != -1 and t01_index != -1 and t01_index > k5_index + 1:
        return '_'.join(parts[k5_index + 1:t01_index])
    return fallback_model.replace('-', '_').replace('.', '_')


def _default_output_path(parts: List[str], metrics: List[str],
                         game_name: str, fallback_model: str) -> Path:
    """Build the default result path and create its output directory.

    Faithfulness-only runs are written to ``./generation_evaluation_faithfulness``;
    every other metric selection goes to ``./generation_evaluation``.
    """
    if metrics == ['faithfulness']:
        output_dir = Path("./generation_evaluation_faithfulness")
        file_suffix = "faithfulness"
    else:
        output_dir = Path("./generation_evaluation")
        file_suffix = "correctness" if metrics == ['correctness'] else "custom"
    output_dir.mkdir(exist_ok=True)

    # The segment id is the token immediately following the first 'segment'.
    segment_id = None
    for i, part in enumerate(parts):
        if part == 'segment' and i + 1 < len(parts):
            segment_id = parts[i + 1]
            break

    model_name = _extract_model_name(parts, fallback_model)

    if segment_id and model_name:
        filename = (f"retrieval_segment_{segment_id}_generation_evaluation_"
                    f"{file_suffix}_{game_name}_{model_name}.json")
    elif segment_id:
        filename = (f"retrieval_segment_{segment_id}_generation_evaluation_"
                    f"{file_suffix}_{game_name}.json")
    elif model_name:
        filename = f"generation_evaluation_{file_suffix}_{game_name}_{model_name}.json"
    else:
        filename = f"generation_evaluation_{file_suffix}_{game_name}.json"
    return output_dir / filename


def main():
    """Command-line entry point: evaluate one generation-results file.

    Parses the arguments, resolves the game name and output path, builds the
    evaluator and runs a single evaluation. Every failure path prints an
    ``Error ...`` line and returns instead of exiting silently.
    """
    args = _build_arg_parser().parse_args()

    if not os.path.exists(args.generation_results):
        print(f"Error: generation results file not found: {args.generation_results}")
        return

    # NOTE(review): the default --api_key is a non-empty placeholder, so this
    # check only triggers when an empty value is passed explicitly.
    if not args.api_key:
        print("Error: API key is required (use --api_key)")
        return

    parts = Path(args.generation_results).stem.split('_')

    # An explicit --game wins; auto-detection from the filename is only the
    # default, as the option's help text describes.
    game_name = args.game or _extract_game_name(parts) or 'dyinglight2'

    config = {
        'game_name': game_name,
        'model_name': args.model_name,
        'api_key': args.api_key,
        'openai_api': args.openai_api,
        'temperature': 0.01
    }

    try:
        evaluator = ChronoPlayCustomGenerationEvaluator(config)
    except Exception as e:
        print(f"Error initializing evaluator: {e}")
        return

    if args.output:
        output_path = args.output
    else:
        output_path = _default_output_path(
            parts, args.metrics, game_name, args.model_name)

    try:
        evaluator.run_single_evaluation(
            generation_results_path=args.generation_results,
            output_path=output_path,
            metrics=args.metrics,
            test_mode=args.test_mode
        )
    except Exception as e:
        print(f"Error during evaluation: {e}")


if __name__ == '__main__':
    main()
