

"""
JSONL文件处理脚本
将pos/neg标签拆分成label0/1，并且将中英文分离
"""

import json
import re
import os
from typing import Dict, List, Tuple, Optional

def is_chinese(text: str) -> bool:
    """Return True if ``text`` contains at least one character in U+4E00..U+9FFF.

    Only that code-point range is checked, so CJK punctuation and full-width
    symbols on their own do not count as Chinese.
    """
    return re.search(r'[\u4e00-\u9fff]', text) is not None

def is_english(text: str) -> bool:
    """Return True if ``text`` contains at least one ASCII letter (a-z or A-Z)."""
    return re.search(r'[a-zA-Z]', text) is not None

def split_chinese_english(text: str) -> Tuple[Optional[str], Optional[str]]:
    """Split a bilingual response into its Chinese and English parts.

    The text is first matched against four labelled layouts, tried in order:

    1. ``English explanation: ...`` followed by a Chinese-explanation label
    2. a Chinese-explanation label followed by an English label
    3. ``English: ...`` followed by ``Chinese:`` / ``中文:``
    4. ``Chinese:`` / ``中文:`` followed by ``English:`` / ``英文:`` / ``英:``

    The second label must start a new paragraph (be preceded by a blank
    line). Labels are matched case-insensitively and accept both ``:`` and
    ``：``. If no layout applies, the text is handed to ``smart_split_text``.

    Args:
        text: The raw response text; may be empty or None.

    Returns:
        A ``(chinese_text, english_text)`` tuple. Both are None for empty
        input.
    """
    if not text:
        return None, None

    # Label fragments. Each one is a plain concatenation of groups (no bare
    # top-level "|"), so they can be safely embedded in larger patterns.
    zh_label = r'(?:Corresponding\s+)?(?:Chinese\s*explanation|对应的中文解释|中文解释|中文说明)\s*[:：]'
    en_label = r'(?:English|英文|英)\s*(?:explanation|解释|说明)?\s*[:：]'
    zh_label_short = r'(?:Chinese|中文)\s*[:：]'
    en_label_short = r'(?:English|英文|英)\s*[:：]'

    # (label of the first section, label of the second section, English first?)
    layouts = [
        (r'English explanation:', zh_label, True),
        (zh_label, en_label, False),
        (r'English\s*[:：]', zh_label_short, True),
        (zh_label_short, en_label_short, False),
    ]
    flags = re.DOTALL | re.IGNORECASE

    for lead_label, follow_label, english_first in layouts:
        # Capture the first section lazily, up to the blank line that
        # precedes the second section's label.
        match = re.search(
            lead_label + r'\s*(.*?)(?=\n\n' + follow_label + r')',
            text,
            flags,
        )
        if not match:
            continue

        first_section = match.group(1).strip()
        remainder = text[match.end():].strip()

        # The second section must contain the other language, otherwise this
        # layout is rejected and the next one is tried.
        has_other_language = is_chinese if english_first else is_english
        if not remainder or not has_other_language(remainder):
            continue

        # ``remainder`` still starts with the label the lookahead matched.
        # Strip exactly that label here so it cannot leak into the output
        # when clean_text() does not know this particular label variant
        # (e.g. "Chinese:", "中文:", "English:", "中文说明：").
        second_section = re.sub(
            r'\A' + follow_label + r'\s*', '', remainder, count=1, flags=flags
        )

        if english_first:
            return clean_text(second_section), clean_text(first_section)
        return clean_text(first_section), clean_text(second_section)

    # No labelled layout matched: fall back to per-paragraph classification.
    return smart_split_text(text)

def smart_split_text(text: str) -> Tuple[Optional[str], Optional[str]]:
    """Classify each blank-line-separated paragraph as Chinese or English.

    A paragraph is Chinese when more than 20% of its characters are in
    U+4E00..U+9FFF; otherwise English when more than 30% are ASCII letters;
    otherwise whichever of the two counts is larger wins (ties go to
    English). Paragraphs of each language are re-joined with a blank line
    and passed through ``clean_text``.

    Returns:
        A ``(chinese_text, english_text)`` tuple; a side with no paragraphs
        is None. Empty input yields ``(None, None)``. If no paragraph
        survives stripping, the result of ``split_by_sentences`` is returned.
    """
    if not text:
        return None, None

    zh_blocks: List[str] = []
    en_blocks: List[str] = []

    for raw_block in text.split('\n\n'):
        block = raw_block.strip()
        if not block:
            continue

        zh_count = len(re.findall(r'[\u4e00-\u9fff]', block))
        en_count = len(re.findall(r'[a-zA-Z]', block))
        length = len(block)  # non-zero: empty blocks were skipped above

        if zh_count / length > 0.2:
            goes_to_chinese = True
        elif en_count / length > 0.3:
            goes_to_chinese = False
        else:
            # Neither ratio is decisive; compare the raw counts instead.
            goes_to_chinese = zh_count > en_count

        (zh_blocks if goes_to_chinese else en_blocks).append(block)

    if not (zh_blocks or en_blocks):
        return split_by_sentences(text)

    zh_result = clean_text('\n\n'.join(zh_blocks)) if zh_blocks else None
    en_result = clean_text('\n\n'.join(en_blocks)) if en_blocks else None
    return zh_result, en_result

def split_by_sentences(text: str) -> Tuple[Optional[str], Optional[str]]:
    """Separate Chinese and English at sentence granularity.

    The text is cut at every ``. ! ? 。 ！ ？``. A fragment is Chinese when it
    has more characters in U+4E00..U+9FFF than ASCII letters, and English
    otherwise. Chinese fragments are re-joined with ``。`` and English ones
    with ``. ``; each side gets a closing terminator and is passed through
    ``clean_text``.

    Returns:
        A ``(chinese_text, english_text)`` tuple; a side with no fragments
        is None.
    """
    zh_bucket: List[str] = []
    en_bucket: List[str] = []

    for fragment in re.split(r'[.!?。！？]', text):
        fragment = fragment.strip()
        if not fragment:
            continue

        zh_hits = len(re.findall(r'[\u4e00-\u9fff]', fragment))
        en_hits = len(re.findall(r'[a-zA-Z]', fragment))
        bucket = zh_bucket if zh_hits > en_hits else en_bucket
        bucket.append(fragment)

    zh_result = clean_text('。'.join(zh_bucket) + '。') if zh_bucket else None
    en_result = clean_text('. '.join(en_bucket) + '.') if en_bucket else None
    return zh_result, en_result

def clean_text(text: str) -> str:
    """Remove section labels and normalise whitespace.

    Steps, in order:

    1. Delete every explanation label found anywhere in the text, together
       with the whitespace that follows it. Labels are matched
       case-insensitively, with ``:`` or ``：``:
       ``[Corresponding ]English|Chinese explanation`` and
       ``[对应[的]]中文|英文 解释|说明``.
    2. Trim leading and trailing horizontal whitespace on every line.
    3. Collapse any run of blank lines into exactly one blank line.
    4. Strip the whole result.

    Args:
        text: Text to clean. Falsy values (``""``, ``None``) are returned
            unchanged.

    Returns:
        The cleaned text, with paragraph breaks preserved as ``"\\n\\n"``.
    """
    if not text:
        return text

    # One alternation, scanned left to right, so a long label such as
    # "对应中文解释：" is consumed as a whole instead of having its suffix
    # "中文解释：" removed first (which used to leave a stray "对应" behind).
    label_pattern = (
        r'(?:Corresponding\s+)?(?:English|Chinese)\s*explanation\s*[:：]\s*'
        r'|(?:对应的?)?(?:中文|英文)(?:解释|说明)\s*[:：]\s*'
    )
    cleaned_text = re.sub(label_pattern, '', text, flags=re.IGNORECASE)

    # Per-line trim. ``[^\S\n]`` is "any whitespace except newline": using
    # plain ``\s`` with MULTILINE would also eat the newlines themselves and
    # glue separate paragraphs together.
    cleaned_text = re.sub(
        r'^[^\S\n]+|[^\S\n]+$', '', cleaned_text, flags=re.MULTILINE
    )

    # Any run of blank lines becomes a single paragraph break.
    cleaned_text = re.sub(r'\n\s*\n', '\n\n', cleaned_text)

    return cleaned_text.strip()

def _build_sample(data: Dict, response_key: str, suffix: str, label: int) -> Dict:
    """Build one output sample from ``data`` for the response under ``response_key``."""
    chinese_part, english_part = split_chinese_english(data[response_key])
    return {
        'id': f"{data['id']}_{suffix}",
        'query': data['query'],
        'positive': data['positive'],
        'negative': data['negative'],
        'dataset': data['dataset'],
        'label': label,
        'chinese_response': chinese_part,
        'english_response': english_part,
    }


def process_jsonl(input_file: str, output_file: str):
    """Convert a JSONL file of paired responses into per-response samples.

    Each non-blank input line is parsed as JSON. For a record containing
    ``positive_response`` a sample with id ``<id>_pos`` and label 1 is
    written; for ``negative_response`` a sample with id ``<id>_neg`` and
    label 0. Each sample carries the record's ``query``, ``positive``,
    ``negative`` and ``dataset`` fields plus the Chinese and English parts
    of the response.

    All samples of a record are built before any is written, so a record
    that fails is reported as an error and contributes nothing to the
    output (no orphaned ``_pos`` line without its ``_neg`` sibling).

    Lines that fail to parse or process are reported on stdout, counted and
    skipped. A summary is printed at the end.

    Args:
        input_file: Path of the UTF-8 JSONL file to read.
        output_file: Path of the UTF-8 JSONL file to (over)write.
    """
    processed_count = 0
    error_count = 0

    # (input key, id suffix, label) for each response kind, in output order.
    variants = (
        ('positive_response', 'pos', 1),
        ('negative_response', 'neg', 0),
    )

    with open(input_file, 'r', encoding='utf-8') as infile, \
         open(output_file, 'w', encoding='utf-8') as outfile:

        for line_num, line in enumerate(infile, 1):
            try:
                line = line.strip()
                if not line:
                    continue

                data = json.loads(line)

                # Serialise every sample first; write only if all succeeded.
                rows = [
                    json.dumps(
                        _build_sample(data, key, suffix, label),
                        ensure_ascii=False,
                    )
                    for key, suffix, label in variants
                    if key in data
                ]
                outfile.writelines(row + '\n' for row in rows)

                processed_count += 1
                if processed_count % 100 == 0:
                    print(f"已处理 {processed_count} 条记录...")

            except json.JSONDecodeError as e:
                print(f"第 {line_num} 行JSON解析错误: {e}")
                error_count += 1
            except Exception as e:
                # Per-line boundary: report and keep going with the next line.
                print(f"第 {line_num} 行处理错误: {e}")
                error_count += 1

    print("\n处理完成!")
    print(f"成功处理: {processed_count} 条")
    print(f"错误数量: {error_count} 条")
    print(f"输出文件: {output_file}")

def main(input_file: str = "gpt4o_responses/responses.jsonl",
         output_file: str = "gpt4o_responses/split_responses.jsonl"):
    """Run the JSONL conversion on ``input_file`` and write ``output_file``.

    Called without arguments it uses the default paths under
    ``gpt4o_responses/``. If the input path does not exist, an error is
    printed and nothing is written.

    Args:
        input_file: Path of the JSONL file to read.
        output_file: Path of the JSONL file to write.
    """
    if not os.path.exists(input_file):
        print(f"错误: 输入文件 {input_file} 不存在!")
        return

    print(f"开始处理文件: {input_file}")
    print(f"输出文件: {output_file}")

    process_jsonl(input_file, output_file)

# Run the conversion only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
