# import json
# import re
# import sys

# # ========= 配置 =========
# JSONL_FILE = "  # 改成你的文件路径
# # =======================

# # 中文字符正则（CJK Unified Ideographs）
# CHINESE_PATTERN = re.compile(r'[\u4e00-\u9fff]')

# def has_chinese(text: str) -> bool:
#     """判断字符串是否包含中文"""
#     return bool(CHINESE_PATTERN.search(text))


# def main():
#     total_lines = 0          # 总记录数
#     chinese_count = 0        # 含中文的记录数
#     error_lines = 0          # 解析失败的行数

#     try:
#         with open(JSONL_FILE, "r", encoding="utf-8") as f:
#             for line_number, line in enumerate(f, start=1):
#                 line = line.strip()
#                 if not line:
#                     continue

#                 total_lines += 1

#                 try:
#                     data = json.loads(line)
#                 except json.JSONDecodeError:
#                     error_lines += 1
#                     continue

#                 messages = data.get("summary", {}).get("messages", [])

#                 # 只要一条 record 里有任意一个 content 含中文，就算 1 次
#                 for msg in messages:
#                     content = msg.get("content", "")
#                     if isinstance(content, str) and has_chinese(content):
#                         chinese_count += 1
#                         break

#         print("====== 统计结果 ======")
#         print(f"总记录数: {total_lines}")
#         print(f"包含中文 content 的记录数: {chinese_count}")
#         print(f"不包含中文的记录数: {total_lines - chinese_count}")
#         print(f"解析失败的行数: {error_lines}")

#         if total_lines > 0:
#             ratio = chinese_count / total_lines * 100
#             print(f"中文占比: {ratio:.2f}%")

#     except FileNotFoundError:
#         print(f"文件未找到: {JSONL_FILE}")
#         sys.exit(1)


# if __name__ == "__main__":
#     main()


import json
import os
import re
import tempfile
from typing import List

# ========= 配置 =========

# NOTE(review): the path literals that used to be on these lines were lost,
# leaving unterminated strings that made the whole module unparseable.
# The values below are placeholders - fill in the real paths before running.
INPUT_FILES: List[str] = [
    # "path/to/records.jsonl",  # JSONL files to filter in place
]

# Records whose assistant messages contain Chinese are written to this directory.
OUTPUT_DIR_CN = "assistant_cn_records"  # placeholder - set the real output directory
# =======================

# Matches one character in U+4E00-U+9FFF (the basic CJK Unified Ideographs
# block). Extension blocks and CJK punctuation fall outside this range.
CHINESE_PATTERN = re.compile(r'[\u4e00-\u9fff]')


def has_chinese(text: str) -> bool:
    """Return True if *text* contains at least one character matched by CHINESE_PATTERN."""
    first_hit = CHINESE_PATTERN.search(text)
    return first_hit is not None


def record_has_chinese(record: dict) -> bool:
    """Return True if any assistant message in *record* contains Chinese text.

    Only entries of ``record["summary"]["messages"]`` whose ``role`` is
    ``"assistant"`` are inspected, and only when their ``content`` is a string.

    Valid JSON does not guarantee this shape: a line may decode to a list or a
    scalar, and ``summary`` / ``messages`` may be ``null``. Any such record is
    reported as not containing Chinese instead of raising.

    Args:
        record: One decoded JSONL record.

    Returns:
        True if at least one assistant message's string content passes
        ``has_chinese``; False otherwise, including for malformed records.
    """
    if not isinstance(record, dict):
        return False

    # .get(key, {}) would still hand back None for an explicit JSON null,
    # so check the actual types rather than relying on defaults.
    summary = record.get("summary")
    if not isinstance(summary, dict):
        return False

    messages = summary.get("messages")
    if not isinstance(messages, (list, tuple)):
        return False

    for msg in messages:
        # Skip non-object entries and every role other than the assistant.
        if not isinstance(msg, dict) or msg.get("role") != "assistant":
            continue
        content = msg.get("content", "")
        if isinstance(content, str) and has_chinese(content):
            return True
    return False


def _atomic_write_lines(path: str, lines: List[str]) -> None:
    """Replace the file at *path* with *lines* without leaving it half-written."""
    # Resolve symlinks so the link survives and its target is what gets replaced.
    target = os.path.realpath(path)
    # The temp file must live in the same directory for os.replace to be atomic.
    fd, tmp_path = tempfile.mkstemp(
        dir=os.path.dirname(target), prefix=".tmp-", suffix=".jsonl"
    )
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as tmp:
            tmp.writelines(lines)
        # mkstemp creates the file owner-only; carry over the original mode bits.
        os.chmod(tmp_path, os.stat(target).st_mode & 0o7777)
        os.replace(tmp_path, target)
    finally:
        # After a successful os.replace the temp name no longer exists;
        # anything still here is the residue of a failure.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


def process_one_file(input_path: str) -> None:
    """Move records whose assistant messages contain Chinese out of a JSONL file.

    Every non-empty line of *input_path* is decoded as JSON. Lines for which
    ``record_has_chinese`` is true are written to a file with the same basename
    inside ``OUTPUT_DIR_CN`` (overwriting any existing file of that name); all
    other lines, including lines that fail to decode, are kept in *input_path*.
    Empty lines are dropped. A summary is printed at the end.

    The Chinese output is written first and the source file is then swapped in
    atomically, so a failure at any step cannot lose records.

    Args:
        input_path: Path of the JSONL file to filter in place.

    Raises:
        ValueError: If the output path resolves to *input_path* itself, which
            would make the two writes overwrite each other.
        OSError: If reading or writing any of the files fails.
    """
    print(f"\n==== 处理文件: {input_path} ====")

    filename = os.path.basename(input_path)
    output_cn_path = os.path.join(OUTPUT_DIR_CN, filename)
    if os.path.realpath(output_cn_path) == os.path.realpath(input_path):
        raise ValueError(f"输出路径与输入文件相同，拒绝处理: {input_path}")
    os.makedirs(OUTPUT_DIR_CN, exist_ok=True)

    kept_lines: List[str] = []     # records staying in the source file
    cn_lines: List[str] = []       # records whose assistant content has Chinese

    total = 0
    cn_count = 0
    error_count = 0

    with open(input_path, "r", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.rstrip("\n")
            if not line:
                continue

            total += 1

            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                error_count += 1
                # Undecodable lines are kept so no data is discarded.
                kept_lines.append(line + "\n")
                continue

            if record_has_chinese(record):
                cn_lines.append(line + "\n")
                cn_count += 1
            else:
                kept_lines.append(line + "\n")

    # 1. Write the extracted records first: if this fails, the source file
    #    has not been touched yet.
    if cn_lines:
        with open(output_cn_path, "w", encoding="utf-8") as f:
            f.writelines(cn_lines)

    # 2. Only then replace the source file with the records that stay.
    _atomic_write_lines(input_path, kept_lines)

    print("------ 统计结果 ------")
    print(f"总记录数: {total}")
    print(f"assistant 含中文记录数: {cn_count}")
    print(f"保留在原文件的记录数: {len(kept_lines)}")
    print(f"解析失败行数: {error_count}")
    print(f"assistant 中文输出文件: {output_cn_path}")


def main():
    """Run process_one_file on each path in INPUT_FILES that exists, reporting the rest as skipped."""
    for candidate in INPUT_FILES:
        if os.path.exists(candidate):
            process_one_file(candidate)
        else:
            print(f"文件不存在，跳过: {candidate}")

    print("\n✅ 所有文件处理完成")


# Run the batch only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
