# import json
# import os
#
#
# def process_json(input_file, output_dir):
#     with open(input_file, 'r', encoding='utf-8') as f:
#         data = [json.loads(line) for line in f]
#
#     num_parts = len(data[0]['text'])  # assumes every record's 'text' list has the same length
#     os.makedirs(output_dir, exist_ok=True)
#
#     # Initialize one bucket per part to hold the split data
#     split_data = [[] for _ in range(num_parts)]
#
#     for item in data:
#         for i, text_part in enumerate(item['text']):
#             new_item = {
#                 "idx": item["idx"],
#                 "answer": item["answer"],
#                 "text": text_part,
#                 "transpose": item["transpose"],
#                 "resort": item["resort"],
#                 "question_id": item["question_id"],
#                 "table_id": item["table_id"],
#                 "title": item["title"],
#                 "table": item["table"],
#                 "question": item["question"],
#             }
#             split_data[i].append(new_item)
#
#     for i, part in enumerate(split_data):
#         output_file = os.path.join(output_dir, f'result_{i + 1}.jsonl')
#         with open(output_file, 'w', encoding='utf-8') as f:
#             for entry in part:
#                 json.dump(entry, f, ensure_ascii=False)
#                 f.write('\n')
#
#
# input_file = '/home/yangzhen/code/trillm/output/wtq_agent_triple/result5.jsonl'
# output_dir = '/home/yangzhen/code/trillm/output/wtq_agent_triple'
# process_json(input_file, output_dir)


# import json
#
#
# def extract_later_data(input_file, output_file, start_idx=3418):
#     with open(input_file, 'r', encoding='utf-8') as f:
#         data = [json.loads(line) for line in f]
#
#     # Keep only the records whose idx >= start_idx (3418 by default)
#     later_data = [item for item in data if item['idx'] >= start_idx]
#
#     # Write the filtered records to the output file, one JSON object per line
#     with open(output_file, 'w', encoding='utf-8') as f:
#         for item in later_data:
#             json.dump(item, f, ensure_ascii=False)
#             f.write('\n')
#
#
# input_file = '/home/yangzhen/code/trillm/output/wtq_agent_triple/result5.jsonl'  # original JSON file name
# output_file = '/home/yangzhen/code/trillm/output/wtq_agent_triple/extracted_data5.json'  # output file name
# extract_later_data(input_file, output_file)

import json

def check_idx(input_file):
    """Print the missing and duplicated ``idx`` values of a JSON Lines file.

    Every non-blank line of *input_file* is parsed as a JSON object and its
    ``idx`` value is collected. With ``n`` records read, the expected indices
    are ``0 .. n - 1``: any of those never seen are printed as missing, and
    any ``idx`` seen more than once is printed as a duplicate. Both lists are
    printed in ascending order.

    Args:
        input_file: Path to a UTF-8 encoded file with one JSON object per line.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``idx`` key.
    """
    seen = set()
    duplicates = set()
    record_count = 0

    # Stream the file: only the idx of each record is needed, so the parsed
    # records are not kept in memory.
    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (e.g. a stray empty line at the end of the file)
            # are not records; json.loads would raise on them.
            if not line.strip():
                continue
            idx = json.loads(line)['idx']
            record_count += 1
            if idx in seen:
                duplicates.add(idx)
            else:
                seen.add(idx)

    # The expected index range is derived from the number of records read,
    # so duplicated records widen the range that is checked.
    missing = set(range(record_count)) - seen

    print("Missing indices:", sorted(missing))
    print("Duplicate indices:", sorted(duplicates))

# Path of the JSONL file whose ``idx`` values are checked.
input_file = '/home/yangzhen/code/trillm/output/wtq_triple_valid/result.jsonl'

# Run the check only when this file is executed as a script, so that merely
# importing the module does not trigger file I/O on the hard-coded path.
if __name__ == "__main__":
    check_idx(input_file)
