import os
import json
import random
from glob import glob

# Configuration
base_dir = "structured_results"
output_file = os.path.join(base_dir, "sampled_structured_results.jsonl")
# Total number of records sampled per result file: half with
# is_correct == 1 and half with is_correct == 0.
num_samples = 6
model_names = [
    "fewshot-Qwen2.5-7B-Instruct",
    "Qwen2.5-7B-Instruct",
    "SearchR1-nq_hotpotqa_train-qwen2.5-7b-em-ppo-v0.3",
    "DeepResearcher-7b",
    "ReSearch-Qwen-7B-Instruct",
]


def find_result_files(root, names):
    """Return every ``<root>/<dataset>/<name>.jsonl`` path for the given names.

    Paths are grouped by model name in the order given; within one model the
    matches are sorted so that a seeded run is reproducible (``glob`` itself
    returns matches in arbitrary order).
    """
    paths = []
    for name in names:
        paths.extend(sorted(glob(os.path.join(root, "*", f"{name}.jsonl"))))
    return paths


def split_dataset_and_model(file_path):
    """Return ``(dataset, model)`` parsed from ``.../<dataset>/<model>.jsonl``.

    The dataset is the name of the file's parent directory and the model is
    the file name without its extension, so the result does not depend on how
    many components the root directory has or on the path separator in use.
    """
    dataset = os.path.basename(os.path.dirname(file_path))
    model = os.path.splitext(os.path.basename(file_path))[0]
    return dataset, model


def load_records(file_path):
    """Parse a JSONL file into a list of objects, skipping blank lines."""
    records = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            # A trailing or stray empty line is not a record and would make
            # json.loads raise.
            if line.strip():
                records.append(json.loads(line))
    return records


def sample_records(records, per_class):
    """Randomly pick up to ``per_class`` correct and ``per_class`` incorrect records.

    Records whose ``is_correct`` is 1 come first in the result, followed by
    those whose ``is_correct`` is 0. Records with any other value are ignored.
    Raises ``KeyError`` if a record has no ``is_correct`` key.
    """
    correct = [r for r in records if r["is_correct"] == 1]
    incorrect = [r for r in records if r["is_correct"] == 0]
    return (
        random.sample(correct, min(per_class, len(correct)))
        + random.sample(incorrect, min(per_class, len(incorrect)))
    )


def collect_samples(root, names, per_class):
    """Sample records from every matching result file under ``root``.

    Each returned record is tagged with the ``model`` and ``dataset`` it was
    read from. The path of every processed file is printed as progress output.
    """
    samples = []
    for file_path in find_result_files(root, names):
        print(file_path)
        dataset, model = split_dataset_and_model(file_path)
        for record in sample_records(load_records(file_path), per_class):
            record["model"] = model
            record["dataset"] = dataset
            samples.append(record)
    return samples


def write_jsonl(records, path):
    """Write ``records`` to ``path`` as UTF-8 JSON Lines (non-ASCII kept as-is)."""
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(
            json.dumps(item, ensure_ascii=False) + "\n" for item in records
        )


def main():
    """Sample from all configured result files and write the merged output."""
    all_samples = collect_samples(base_dir, model_names, num_samples // 2)
    write_jsonl(all_samples, output_file)
    print(f"已采样 {len(all_samples)} 条数据，输出到 {output_file}")


if __name__ == "__main__":
    main()