#!/usr/bin/env python3
"""
Extract the candidate choice that opens the Justification field of every JSON file.
"""

import json
import os
import re
from pathlib import Path
import glob

def extract_candidate_choice(justification_text):
    """Return the candidate letter that opens a Justification string.

    Surrounding whitespace is stripped first. The stripped text must begin
    with ``Candidate `` followed by one uppercase ASCII letter; that letter
    is returned.

    Returns:
        The single uppercase letter, or ``None`` when the input is
        empty/falsy or does not start with the expected prefix.
    """
    if justification_text:
        # The pattern is anchored with ``^``, so only the very start of the
        # stripped text is considered.
        found = re.match(r'^Candidate ([A-Z])', justification_text.strip())
        if found is not None:
            return found.group(1)
    return None

def process_json_files(directory_path):
    """Collect the candidate choice from every ``*.json`` file in a directory.

    Files directly inside ``directory_path`` are visited in sorted path
    order. Each file yields one dict in the returned list:

    * on success: ``filename``, ``candidate`` and ``justification_preview``
      (the Justification value, cut to 100 characters plus ``...`` when it
      is longer than that);
    * on any failure while reading or processing the file: ``filename``,
      ``candidate`` set to ``None`` and ``error`` holding the exception
      text. The failure is also printed, and processing continues with the
      next file.
    """
    collected = []
    search_pattern = os.path.join(directory_path, "*.json")

    for path in sorted(glob.glob(search_pattern)):
        filename = os.path.basename(path)

        try:
            with open(path, 'r', encoding='utf-8') as handle:
                data = json.load(handle)

            # A missing Justification key is treated as an empty string.
            justification = data.get('Justification', '')
            candidate = extract_candidate_choice(justification)

            if len(justification) > 100:
                preview = justification[:100] + '...'
            else:
                preview = justification

            entry = {
                'filename': filename,
                'candidate': candidate,
                'justification_preview': preview,
            }
        except Exception as e:
            # Per-file best effort: record the problem and keep going.
            print(f"处理文件 {filename} 时出错: {e}")
            entry = {
                'filename': filename,
                'candidate': None,
                'error': str(e),
            }

        collected.append(entry)

    return collected

def generate_summary(results):
    """Print selection statistics and return the per-candidate counts.

    Args:
        results: List of per-file dicts, each with a ``candidate`` key. A
            falsy candidate marks the file as invalid (no choice found).

    Returns:
        A dict mapping each candidate letter to the number of files that
        selected it, in order of first appearance.
    """
    # Only entries with a truthy candidate count as valid selections.
    picks = [entry['candidate'] for entry in results if entry['candidate']]

    tally = {}
    for letter in picks:
        tally[letter] = tally.get(letter, 0) + 1

    total_files = len(results)
    valid_files = len(picks)

    header_lines = [
        "\n=== 候选者选择统计摘要 ===",
        f"总JSON文件数: {total_files}",
        f"有效文件数 (有候选者选择): {valid_files}",
        f"无效文件数: {total_files - valid_files}",
        "\n=== 候选者选择分布 ===",
    ]
    for line in header_lines:
        print(line)

    for letter in sorted(tally):
        count = tally[letter]
        # Share is relative to valid files only, not to all files.
        if valid_files > 0:
            percentage = (count / valid_files) * 100
        else:
            percentage = 0
        print(f"Candidate {letter}: {count} 次 ({percentage:.1f}%)")

    return tally

def main(directory=None):
    """Extract candidate choices from a results directory and report them.

    Prints a summary and a per-file listing to stdout, then writes the same
    information to ``candidate_selection_results.txt`` inside the directory.

    Args:
        directory: Directory containing the ``*.json`` files. When omitted,
            the first command-line argument is used if one was given;
            otherwise the original hard-coded results directory is used.

    Raises:
        FileNotFoundError: If the chosen directory does not exist.
    """
    import sys  # local import: only needed to read the optional CLI argument

    if directory is None:
        if len(sys.argv) > 1:
            directory = sys.argv[1]
        else:
            # Historical default, kept so running without arguments behaves
            # as it did before the directory became configurable.
            directory = '/Users/lichenglin/research/Tree/result_llm_judge'
    current_dir = directory

    # Fail up front instead of printing an empty summary and then crashing
    # when the report file cannot be created.
    if not os.path.isdir(current_dir):
        raise FileNotFoundError(f"目录不存在: {current_dir}")

    print(f"正在处理目录: {current_dir}")
    print("开始提取候选者选择信息...")

    # Process every JSON file in the directory.
    results = process_json_files(current_dir)

    # Print the summary and get the per-candidate counts.
    candidate_counts = generate_summary(results)

    # Print the per-file details.
    print("\n=== 详细文件列表 ===")
    for result in results:
        filename = result['filename']
        candidate = result['candidate'] if result['candidate'] else 'None'

        if 'error' in result:
            print(f"{filename:<60} | {candidate:<10} | ERROR: {result['error']}")
        else:
            print(f"{filename:<60} | {candidate:<10}")

    # Save the report next to the input files.
    output_file = os.path.join(current_dir, 'candidate_selection_results.txt')

    # Loop-invariant denominator for the percentage column; the loop below
    # only runs when at least one candidate was counted, so it is non-zero
    # whenever it is used.
    total_selected = sum(candidate_counts.values())

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("候选者选择统计结果\n")
        f.write("=" * 50 + "\n\n")

        f.write(f"总JSON文件数: {len(results)}\n")
        f.write(f"有效文件数: {sum(1 for r in results if r['candidate'])}\n\n")

        f.write("候选者选择分布:\n")
        for candidate in sorted(candidate_counts):
            count = candidate_counts[candidate]
            percentage = (count / total_selected) * 100
            f.write(f"Candidate {candidate}: {count} 次 ({percentage:.1f}%)\n")

        f.write("\n详细文件列表:\n")
        f.write("-" * 80 + "\n")

        for result in results:
            filename = result['filename']
            candidate = result['candidate'] if result['candidate'] else 'None'
            f.write(f"{filename:<60} | Candidate {candidate}\n")

    print(f"\n结果已保存到: {output_file}")

# Run the extraction only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()