import json
# NOTE: This is just a small utility script; it is safe to delete.
def extract_matching_records(jsonl_files, protein_mapping_path, output_file):
    """
    Extract the records whose ``title`` is found in a protein mapping.

    Each line of each input JSONL file is parsed as JSON. A record is kept
    when it is a JSON object and its ``"title"`` value is contained in the
    mapping loaded from ``protein_mapping_path``. The kept records are then
    written to ``output_file``, one JSON document per line, in the order in
    which they were read.

    Input handling is best-effort: blank lines are ignored; lines that are
    not valid JSON, are not JSON objects, or whose title cannot be looked up
    are reported and skipped; an input file that cannot be opened or read is
    reported and skipped. Problems with the mapping file or the output file
    are not caught and propagate to the caller.

    :param jsonl_files: iterable of paths of the JSONL files to scan
    :param protein_mapping_path: path of a JSON file; membership is tested
        with ``in`` on the loaded value (for a JSON object, its keys)
    :param output_file: path of the JSONL file to write (overwritten)
    :return: None
    :raises OSError: if the mapping file cannot be read or the output file
        cannot be written
    :raises json.JSONDecodeError: if the mapping file is not valid JSON
    """
    # JSON text is UTF-8; do not depend on the locale's default encoding.
    with open(protein_mapping_path, encoding="utf-8") as f:
        protein_mapping = json.load(f)

    # Records that passed the title check, in input order.
    matching_records = []

    for file_name in jsonl_files:
        try:
            with open(file_name, 'r', encoding="utf-8") as file:
                for line in file:
                    stripped = line.strip()
                    if not stripped:
                        # Blank lines carry no record; not worth reporting.
                        continue

                    try:
                        record = json.loads(stripped)
                    except json.JSONDecodeError:
                        print(f"跳过无法解析的行: {line}")
                        continue

                    # Valid JSON that is not an object (null, list, number,
                    # ...) has no "title"; skip just this line instead of
                    # abandoning the rest of the file.
                    if not isinstance(record, dict):
                        print(f"跳过无效记录: {line}")
                        continue

                    try:
                        is_match = record.get("title") in protein_mapping
                    except TypeError:
                        # E.g. an unhashable title (list/dict) looked up in a
                        # dict mapping; skip only this record.
                        print(f"跳过无效记录: {line}")
                        continue

                    if is_match:
                        matching_records.append(record)
        except FileNotFoundError:
            print(f"文件未找到: {file_name}")
        except Exception as e:
            # Deliberate best-effort boundary: one unreadable input file
            # must not prevent the remaining files from being processed.
            print(f"处理文件 {file_name} 时发生错误: {e}")

    # Written only after all inputs are read, so the output path may safely
    # coincide with one of the inputs.
    with open(output_file, 'w', encoding="utf-8") as out_file:
        for record in matching_records:
            out_file.write(json.dumps(record) + '\n')

    print(f"完成处理，共提取了 {len(matching_records)} 条记录到 {output_file}。")


# Example usage: executed only when this file is run as a script.
if __name__ == "__main__":
    # Destination JSONL file for the extracted records.
    output_file = "/storage/tancheng/dataset_condensation/downstream_tasks/sequence_design/dataset/cath4.2/afdb_train_cath_test/train_lowrmsd.jsonl"

    # JSON file holding the protein mapping.
    protein_mapping_file = "/storage/tancheng/dataset_condensation/dataset_condensation/results/alltest_forward/lowest_5000.json"

    # JSONL files to scan for matching records.
    jsonl_files = [
        "/storage/tancheng/dataset_condensation/dataset_condensation/raw_data/afdb/afdb_0.jsonl",
    ]

    # Run the extraction with the paths configured above.
    extract_matching_records(
        jsonl_files=jsonl_files,
        protein_mapping_path=protein_mapping_file,
        output_file=output_file,
    )
