{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d6d5f099-4df4-4e88-83c2-765a4fb5e1f1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "import json\n",
    "from tqdm import tqdm\n",
    "from collections import defaultdict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1bbae33f-7fc9-4f9d-87ac-29ed18483143",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Base directory that the dataset paths below are joined onto.\n",
    "Path = \"/home/user_name/U-MARVEL\"\n",
    "\n",
    "# Candidate pool of the union training split.\n",
    "train_cand_path = os.path.join(\n",
    "    Path,\n",
    "    \"data/M-BEIR/cand_pool/global/mbeir_union_train_cand_pool.jsonl\",\n",
    ")\n",
    "\n",
    "# Candidate pool of the union test split.\n",
    "union_test_cand_pool_path = os.path.join(\n",
    "    Path,\n",
    "    \"data/M-BEIR/cand_pool/global/mbeir_union_test_cand_pool.jsonl\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4395b8ee-5243-49fd-bff4-b152ab90e599",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print the line count of the training candidate pool, then of the test candidate pool.\n",
    "for pool_path in (train_cand_path, union_test_cand_pool_path):\n",
    "    with open(pool_path, 'r', encoding='utf-8') as f:\n",
    "        total_lines = sum(1 for _ in f)\n",
    "    print(total_lines)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9183b00e-1905-4e15-9259-8b30b2193735",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-task test query files whose lines are counted below.\n",
    "file_names = [\n",
    "    \"mbeir_cirr_task7_test.jsonl\",\n",
    "    \"mbeir_edis_task2_test.jsonl\",\n",
    "    \"mbeir_fashion200k_task0_test.jsonl\",\n",
    "    \"mbeir_fashion200k_task3_test.jsonl\",\n",
    "    \"mbeir_fashioniq_task7_test.jsonl\",\n",
    "    \"mbeir_infoseek_task6_test.jsonl\",\n",
    "    \"mbeir_infoseek_task8_test.jsonl\",\n",
    "    \"mbeir_mscoco_task0_test.jsonl\",\n",
    "    \"mbeir_mscoco_task3_test.jsonl\",\n",
    "    \"mbeir_nights_task4_test.jsonl\",\n",
    "    \"mbeir_oven_task6_test.jsonl\",\n",
    "    \"mbeir_oven_task8_test.jsonl\",\n",
    "    \"mbeir_visualnews_task0_test.jsonl\",\n",
    "    \"mbeir_visualnews_task3_test.jsonl\",\n",
    "    \"mbeir_webqa_task1_test.jsonl\",\n",
    "    \"mbeir_webqa_task2_test.jsonl\"\n",
    "]\n",
    "# NOTE(review): this mapping is created here but nothing in this cell fills it.\n",
    "test_taskname2qid = defaultdict(list)\n",
    "print(len(file_names))\n",
    "total_lines = 0\n",
    "# Built from Path so the test-query directory follows the configured base directory.\n",
    "Path_temp = os.path.join(Path, \"data/M-BEIR/query/test\")\n",
    "for base_name in file_names:\n",
    "    file_name = os.path.join(Path_temp, base_name)\n",
    "    try:\n",
    "        with open(file_name, 'r', encoding='utf-8') as file:\n",
    "            # Count while streaming; readlines() would load the whole file first.\n",
    "            line_count = sum(1 for _ in file)\n",
    "        total_lines += line_count\n",
    "        # Task name such as \"cirr_task7\". Split the bare file name, not the joined path:\n",
    "        # the directory part contains \"_\" (\"user_name\"), which would shift the fields.\n",
    "        parts = base_name.split(\"_\")\n",
    "        result = \"_\".join(parts[1:3])\n",
    "        print(f\"{file_name} 的行数: {line_count}\")\n",
    "    except FileNotFoundError:\n",
    "        print(f\"错误: 文件 {file_name} 未找到。\")\n",
    "    except Exception as e:\n",
    "        print(f\"错误: 读取文件 {file_name} 时发生未知错误: {e}\")\n",
    "\n",
    "print(f\"所有文件的总行数: {total_lines}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1f41d887-bee6-4e00-ba10-548e73c9a33d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count the union training queries by streaming the file; readlines() would keep\n",
    "# every line of the file in memory just to take its length.\n",
    "query_union_train = os.path.join(Path, \"data/M-BEIR/query/union_train/mbeir_union_up_train.jsonl\")\n",
    "with open(query_union_train, 'r', encoding='utf-8') as file:\n",
    "    train_query_line_count = sum(1 for _ in file)\n",
    "print(f\"训练集所有文件的总行数: {train_query_line_count}\")\n",
    "# total_lines is the sum computed by the test-query counting cell above.\n",
    "print(f\"测试集所有文件的总行数: {total_lines}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e98bcd01-582b-4e35-bb26-d39950de6693",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Path of the union training candidate pool, joined onto Path.\n",
    "train_cand_path = os.path.join(Path, \"data/M-BEIR/cand_pool/global/mbeir_union_train_cand_pool.jsonl\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b9414b1c-c63d-45d7-88cc-04a2bfd61da0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dataset names listed in id order: the position of a name is the dataset id it is keyed by.\n",
    "datasetid2name = dict(enumerate([\n",
    "    'visualnews',\n",
    "    'fashion200k',\n",
    "    'webqa',\n",
    "    'edis',\n",
    "    'nights',\n",
    "    'oven',\n",
    "    'infoseek',\n",
    "    'fashioniq',\n",
    "    'cirr',\n",
    "    'mscoco',\n",
    "]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f9dc7321-2518-446e-b0bd-0e81ee35b182",
   "metadata": {},
   "outputs": [],
   "source": [
    "# All \"<dataset>_task<id>\" names, listed in ascending task id.\n",
    "task_names = [\n",
    "    'visualnews_task0',\n",
    "    'mscoco_task0',\n",
    "    'fashion200k_task0',\n",
    "    'webqa_task1',\n",
    "    'edis_task2',\n",
    "    'webqa_task2',\n",
    "    'visualnews_task3',\n",
    "    'mscoco_task3',\n",
    "    'fashion200k_task3',\n",
    "    'nights_task4',\n",
    "    'oven_task6',\n",
    "    'infoseek_task6',\n",
    "    'fashioniq_task7',\n",
    "    'cirr_task7',\n",
    "    'oven_task8',\n",
    "    'infoseek_task8',\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b94af0b8-f2cb-49b7-8e94-dfc6e6fadf1b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the union training queries and group their qids by \"<dataset>_task<id>\" name.\n",
    "taskname2qid = defaultdict(list)\n",
    "query_union_train = \"/home/user_name/U-MARVEL/data/M-BEIR/query/union_train/mbeir_union_up_train.jsonl\"\n",
    "with open(query_union_train, 'r', encoding='utf-8') as f:\n",
    "    for raw_line in tqdm(f, desc=\"Loading: \"):\n",
    "        record = json.loads(raw_line)\n",
    "        qid = record['qid']\n",
    "        # The part of the qid before the first \":\" is the numeric dataset id.\n",
    "        dataset_id = int(qid.partition(\":\")[0])\n",
    "        task_number = int(record[\"task_id\"])\n",
    "        taskname = f\"{datasetid2name[dataset_id]}_task{task_number}\"\n",
    "        taskname2qid[taskname].append(qid)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f4ca3744-65f2-4a10-8417-c7dd65c5b60d",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Per-task qid counts of the full union training set, with each task's share of the total.\n",
    "# The total is computed from the loaded data instead of being a hard-coded constant,\n",
    "# so the shares stay correct when the input file changes.\n",
    "total_qid_count = sum(len(qids) for qids in taskname2qid.values())\n",
    "num1,num2  = 0, 0\n",
    "num3 = 0\n",
    "for key,value in taskname2qid.items():\n",
    "    share = len(value) / total_qid_count if total_qid_count else 0.0\n",
    "    print(f\"{key} : {len(value)} {share}\")\n",
    "    num1 += len(value)\n",
    "    num3 += share\n",
    "    print(f\"{key} : {len(set(value))}\")\n",
    "    num2 += len(set(value))\n",
    "    print(\"*\"*50)\n",
    "print(num1)  # number of qids over all tasks\n",
    "print(num2)  # sum of the per-task distinct qid counts\n",
    "print(num3)  # sum of the per-task shares"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6ec22e84-a170-41db-a3b1-1f7c5580f957",
   "metadata": {},
   "source": [
    "### 每个任务单独抽取 1/10（方案一：从 union 训练文件中按任务抽样 qid）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "75df7428-c1fc-4790-a491-599e723460a5",
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "random.seed(42)  # fixed seed so the draws below are repeatable\n",
    "# For every task, draw len(qids) // 10 of its qids with random.sample.\n",
    "taskname2qid_sample = defaultdict(list)\n",
    "taskname2qid_sample.update(\n",
    "    (task, random.sample(qids, len(qids) // 10))\n",
    "    for task, qids in taskname2qid.items()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b8462069-79ee-4931-a7cc-eab2ce91b0b3",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Per-task qid counts of the sampled subset, with each task's share of the sample.\n",
    "# The total is computed from the sample itself instead of being a hard-coded constant.\n",
    "sample_qid_count = sum(len(qids) for qids in taskname2qid_sample.values())\n",
    "num1,num2  = 0, 0\n",
    "num3 = 0\n",
    "for key,value in taskname2qid_sample.items():\n",
    "    share = len(value) / sample_qid_count if sample_qid_count else 0.0\n",
    "    print(f\"{key} : {len(value)} {share}\")\n",
    "    num1 += len(value)\n",
    "    num3 += share\n",
    "    print(f\"{key} : {len(set(value))}\")\n",
    "    num2 += len(set(value))\n",
    "    print(\"*\"*50)\n",
    "print(num1)  # number of sampled qids over all tasks\n",
    "print(num2)  # sum of the per-task distinct qid counts\n",
    "print(num3)  # sum of the per-task shares"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a407f67a-9914-4772-a2bd-27e6bf131274",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flatten the per-task samples into a single list of qids.\n",
    "all_sample_dids = [qid for qids in taskname2qid_sample.values() for qid in qids]\n",
    "print(len(all_sample_dids))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "56bb306f-0aaa-417d-bf3c-af8eac354bab",
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import Counter\n",
    "# Replace the flat qid list by a plain {qid: number of occurrences} dict.\n",
    "all_sample_dids = dict(Counter(all_sample_dids))\n",
    "print(len(all_sample_dids))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "39e2e74e-14b6-4af4-9265-2d3b1a740b26",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collect, from the union training file, the records whose qid was sampled.\n",
    "import copy\n",
    "data_sample = []\n",
    "# Decrement a copy of the counts: consuming all_sample_dids itself would leave it\n",
    "# exhausted, so re-running this cell would collect nothing.\n",
    "remaining_counts = dict(all_sample_dids)\n",
    "query_union_train  = \"/home/user_name/U-MARVEL/data/M-BEIR/query/union_train/mbeir_union_up_train.jsonl\"\n",
    "with open(query_union_train, 'r', encoding='utf-8') as f:\n",
    "    # First pass: count the lines so the progress bar has a total.\n",
    "    line_count = sum(1 for _ in f)\n",
    "    f.seek(0)  # rewind to the start of the file for the real pass\n",
    "    for line in tqdm(f, total = line_count,desc=\"Loading: \"):\n",
    "        item = json.loads(line)\n",
    "        qid  = item[\"qid\"]\n",
    "        # Keep a record only while its qid still has sampled occurrences left.\n",
    "        if remaining_counts.get(qid, 0) > 0:\n",
    "            # item is freshly parsed on every iteration, so it can be stored without copying.\n",
    "            data_sample.append(item)\n",
    "            remaining_counts[qid] -= 1\n",
    "print(len(all_sample_dids)) \n",
    "print(len(data_sample))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "62b57444-d143-402b-ace1-404295c29dd0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the sampled records as JSON Lines, one record per line.\n",
    "data_sample_file_path = os.path.join(Path, \"data/M-BEIR/query/union_train/mbeir_union_up_train_10percent.jsonl\")\n",
    "# Explicit UTF-8 so the file is written with the same encoding the other cells read it with.\n",
    "with open(data_sample_file_path, 'w', encoding='utf-8') as file:\n",
    "    for item in data_sample:\n",
    "        file.write(json.dumps(item) + '\\n')\n",
    "print(f\"数据已成功保存到 {data_sample_file_path}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "aa0eea6f-9282-4a95-b71e-8359e1ffe298",
   "metadata": {},
   "source": [
    "#### 验证保存结果没有问题"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "91f7ad63-dc2d-40fc-a796-69b935b10d22",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Re-read the saved 10% file and group its qids by \"<dataset>_task<id>\" name.\n",
    "taskname2qid_10percent = defaultdict(list)\n",
    "data_sample_file_path = os.path.join(Path, \"data/M-BEIR/query/union_train/mbeir_union_up_train_10percent.jsonl\")\n",
    "with open(data_sample_file_path, 'r', encoding='utf-8') as f:\n",
    "    line_count = sum(1 for _ in f)  # first pass: total for the progress bar\n",
    "    f.seek(0)  # rewind to the start of the file\n",
    "    for raw_line in tqdm(f, total=line_count, desc=\"Loading: \"):\n",
    "        record = json.loads(raw_line)\n",
    "        qid = record['qid']\n",
    "        # The part of the qid before the first \":\" is the numeric dataset id.\n",
    "        dataset_id = int(qid.partition(\":\")[0])\n",
    "        task_number = int(record[\"task_id\"])\n",
    "        taskname = f\"{datasetid2name[dataset_id]}_task{task_number}\"\n",
    "        taskname2qid_10percent[taskname].append(qid)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "431e163c-67f9-45c8-851e-8a6268fb1cfd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Statistics of the file that was just re-read from disk. This must iterate\n",
    "# taskname2qid_10percent (the reloaded data); iterating taskname2qid_sample would only\n",
    "# repeat the in-memory sample statistics and verify nothing about the saved file.\n",
    "saved_qid_count = sum(len(qids) for qids in taskname2qid_10percent.values())\n",
    "num1,num2  = 0, 0\n",
    "num3 = 0\n",
    "for key,value in taskname2qid_10percent.items():\n",
    "    share = len(value) / saved_qid_count if saved_qid_count else 0.0\n",
    "    print(f\"{key} : {len(value)} {share}\")\n",
    "    num1 += len(value)\n",
    "    num3 += share\n",
    "    print(f\"{key} : {len(set(value))}\")\n",
    "    num2 += len(set(value))\n",
    "    print(\"*\"*50)\n",
    "print(num1)  # number of qids in the saved file\n",
    "print(num2)  # sum of the per-task distinct qid counts\n",
    "print(num3)  # sum of the per-task shares"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "800e69a1-91a3-40b0-8ea3-5f68ee81b1d8",
   "metadata": {},
   "source": [
    "#### 根据任务将 qid 拆分成各自的训练池"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5dfbb9fc-e7e1-46fe-bda1-1ebc8c12f606",
   "metadata": {},
   "outputs": [],
   "source": [
    "import copy\n",
    "# Split the saved 10% file into per-task record lists, keeping only the first record\n",
    "# seen for each qid within a task.\n",
    "temp_taskname2qid_10percent = defaultdict(set)\n",
    "taskname2data_10percent = defaultdict(list)\n",
    "data_sample_file_path = os.path.join(Path, \"data/M-BEIR/query/union_train/mbeir_union_up_train_10percent.jsonl\")\n",
    "with open(data_sample_file_path, 'r', encoding='utf-8') as f:\n",
    "    line_count = sum(1 for _ in f)  # first pass: total for the progress bar\n",
    "    f.seek(0)  # rewind to the start of the file\n",
    "    for raw_line in tqdm(f, total=line_count, desc=\"Loading: \"):\n",
    "        record = json.loads(raw_line)\n",
    "        qid = record['qid']\n",
    "        dataset_id = int(qid.partition(\":\")[0])\n",
    "        task_number = int(record[\"task_id\"])\n",
    "        taskname = f\"{datasetid2name[dataset_id]}_task{task_number}\"\n",
    "        seen_qids = temp_taskname2qid_10percent[taskname]\n",
    "        if qid in seen_qids:\n",
    "            continue  # this qid was already stored for the task\n",
    "        seen_qids.add(qid)\n",
    "        taskname2data_10percent[taskname].append(copy.deepcopy(record))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "526159d6-d0c8-492b-8b08-bb20f703a06d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print the number of records per task, then the total over all tasks.\n",
    "for key, value in taskname2data_10percent.items():\n",
    "    print(key, len(value))\n",
    "num = sum(len(records) for records in taskname2data_10percent.values())\n",
    "print(num)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9893f4e3-4e58-4ec6-a2e8-49e9c1178ba9",
   "metadata": {},
   "outputs": [],
   "source": [
    "def save_jsonl_file(data, file_path):\n",
    "    \"\"\"Write every item of `data` to `file_path` as one JSON document per line.\n",
    "\n",
    "    The file is opened in 'w' mode with UTF-8 encoding, and non-ASCII characters\n",
    "    are written as-is (ensure_ascii=False).\n",
    "    \"\"\"\n",
    "    with open(file_path, 'w', encoding='utf-8') as out:\n",
    "        out.writelines(json.dumps(item, ensure_ascii=False) + '\\n' for item in data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f88136fd-cfe5-452a-878b-2b3fc74e6208",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save each task's records to its own \"..._dedup_10percent.jsonl\" file and remember the paths.\n",
    "base_file_name = \"/home/user_name/U-MARVEL/data/M-BEIR/query/train/query_train/mbeir_train_\"\n",
    "file_name_list = []\n",
    "for taskname, data in taskname2data_10percent.items():\n",
    "    print(taskname, len(data))\n",
    "    file_name = f\"{base_file_name}{taskname}_dedup_10percent.jsonl\"\n",
    "    save_jsonl_file(data, file_name)\n",
    "    file_name_list.append(file_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5e27d2a1-c0f5-483f-8b5a-c81fc2873cce",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Re-open every file written above and print its path followed by its line count.\n",
    "for file_name in file_name_list:\n",
    "    with open(file_name, 'r', encoding='utf-8') as f:\n",
    "        line_count = sum(1 for _ in f)\n",
    "    print(file_name, line_count, sep=\"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2b08952b-56ce-40e3-b3ae-bf0bc9032365",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3a0368f9-9840-4a67-b25a-d49a9d83be89",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d901770c-a1e7-47e6-ba4c-ab92c21d87a9",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a080894b-cc87-4140-b17c-01d510f35712",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "67bb209e-1300-4afd-b82f-047a822ab0ff",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d246ecd6-0e41-491f-b3d5-1300a0c95acc",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "295e48f2-e0aa-475f-96ad-2eab2246dffd",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0958bff3-68be-4e26-8072-9b747c353832",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c4e69a72-fc4f-44c1-8125-487614d1f801",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cdc742ba-a751-4173-8f06-f11a8f136d67",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "55b8fb06-1d44-41ea-8e48-225727c826f7",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1bda4f04-3a89-4829-863d-1f810cefd46c",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "101f4486-a444-423b-868f-2ab2eecf0987",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6934effd-dc0c-4a32-8c39-bf0008024a32",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5cfab9d0-1210-4804-956a-f3155b8d772e",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6d06d188-de2d-40d5-820b-144eb7775997",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dacb139e-8759-4576-a189-5c1fadc75410",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "591b3fa0-2a61-4f00-8645-ae8702ad19ac",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4077b247-e8de-4ff4-a82d-e16935a3d028",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "246b9794-b022-410a-a009-e55a75e47020",
   "metadata": {},
   "source": [
    "### 每个任务各抽取 1/10（方案二：直接从各任务的 dedup 文件中抽样）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d92daf0d-464d-43f7-86c2-765beb72eb6b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import random\n",
    "import os\n",
    "# Import the callable, not the module: the notebook's first cell binds `tqdm` to the\n",
    "# function and earlier cells call tqdm(...); `import tqdm` would rebind the name to the\n",
    "# module, so those cells would fail when re-run after this one.\n",
    "from tqdm import tqdm\n",
    "# Base directory that the paths in the cells below are joined onto.\n",
    "PATH = \"/home/user_name/U-MARVEL\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6cee0022-335b-49ad-8844-38400b0b87b7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Running totals over all sample_jsonl_file calls: records read / records sampled.\n",
    "num1,num2 = 0,0\n",
    "def sample_jsonl_file(file_path, seed=42, denominator=10):\n",
    "    \"\"\"Read a JSON Lines file and return a random sample of its records.\n",
    "\n",
    "    Args:\n",
    "        file_path: Path of the JSONL file to read (opened as UTF-8).\n",
    "        seed: Value passed to random.seed() before sampling.\n",
    "        denominator: The sample holds len(records) // denominator records.\n",
    "\n",
    "    Returns:\n",
    "        A list with the sampled records (the parsed JSON values).\n",
    "\n",
    "    Side effects:\n",
    "        Reseeds the module-level `random` generator, prints the path and both\n",
    "        sizes, and adds the sizes to the module-level counters num1 and num2.\n",
    "    \"\"\"\n",
    "    global num1,num2\n",
    "    # This seeds the module-level generator, so later draws from `random` (in this\n",
    "    # notebook, the shuffle of the merged sample) continue from this state.\n",
    "    random.seed(seed)\n",
    "    with open(file_path, 'r', encoding='utf-8') as f:\n",
    "        # Skip blank lines (e.g. a trailing empty line); json.loads would reject them.\n",
    "        data = [json.loads(line) for line in f if line.strip()]\n",
    "    sampled_data = random.sample(data, len(data) // denominator)\n",
    "    print(file_path)\n",
    "    print(len(data),len(sampled_data))\n",
    "    num1+= len(data)\n",
    "    num2+= len(sampled_data)\n",
    "    return sampled_data\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a58a0e97-a65c-4522-9926-9a8390ef88ad",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-task dedup query files, relative to PATH.\n",
    "query_data_paths = [\n",
    "    \"data/M-BEIR/query/train/query_train/mbeir_train_mscoco_task0_dedup.jsonl\",\n",
    "    \"data/M-BEIR/query/train/query_train/mbeir_train_mscoco_task3_dedup.jsonl\",\n",
    "    \"data/M-BEIR/query/train/query_train/mbeir_train_cirr_task7_dedup.jsonl\",\n",
    "    \"data/M-BEIR/query/train/query_train/mbeir_train_fashioniq_task7_dedup.jsonl\",\n",
    "    \"data/M-BEIR/query/train/query_train/mbeir_train_webqa_task1_dedup.jsonl\",\n",
    "    \"data/M-BEIR/query/train/query_train/mbeir_train_nights_task4_dedup.jsonl\",\n",
    "    \"data/M-BEIR/query/train/query_train/mbeir_train_oven_task6_dedup.jsonl\",\n",
    "    \"data/M-BEIR/query/train/query_train/mbeir_train_infoseek_task6_dedup.jsonl\",\n",
    "    \"data/M-BEIR/query/train/query_train/mbeir_train_fashion200k_task0_dedup.jsonl\",\n",
    "    \"data/M-BEIR/query/train/query_train/mbeir_train_fashion200k_task3_dedup.jsonl\",\n",
    "    \"data/M-BEIR/query/train/query_train/mbeir_train_visualnews_task3_dedup.jsonl\",\n",
    "    \"data/M-BEIR/query/train/query_train/mbeir_train_visualnews_task0_dedup.jsonl\",\n",
    "    \"data/M-BEIR/query/train/query_train/mbeir_train_webqa_task2_dedup.jsonl\",\n",
    "    \"data/M-BEIR/query/train/query_train/mbeir_train_oven_task8_dedup.jsonl\",\n",
    "    \"data/M-BEIR/query/train/query_train/mbeir_train_infoseek_task8_dedup.jsonl\",\n",
    "    \"data/M-BEIR/query/train/query_train/mbeir_train_edis_task2_dedup.jsonl\",\n",
    "]\n",
    "\n",
    "# Sample every file and write the sample next to it as \"..._dedup_10percent.jsonl\".\n",
    "# NOTE(review): these output names equal the ones written by the earlier per-task export\n",
    "# cell (\"mbeir_train_\" + task + \"_dedup_10percent.jsonl\" in the same directory), so\n",
    "# running both sections overwrites those files - confirm this is intended.\n",
    "sampled_files = []\n",
    "for relative_path in query_data_paths:\n",
    "    source_path = os.path.join(PATH, relative_path)\n",
    "    target_path = source_path.replace(\"_dedup.jsonl\", \"_dedup_10percent.jsonl\")\n",
    "    save_jsonl_file(sample_jsonl_file(source_path), target_path)\n",
    "    sampled_files.append(target_path)\n",
    "print(num1)\n",
    "print(num2)"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0d6e6739-1117-4ec5-b02c-7dfa71d2bf55",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read back every per-task sample file into one list.\n",
    "all_sampled_data = []\n",
    "for file_path in sampled_files:\n",
    "    with open(file_path, 'r', encoding='utf-8') as f:\n",
    "        all_sampled_data.extend(json.loads(line) for line in f)\n",
    "print(len(all_sampled_data))\n",
    "# The shuffle draws from the module-level `random` generator, i.e. from whatever state\n",
    "# the preceding calls left it in.\n",
    "random.shuffle(all_sampled_data)\n",
    "# Save the merged, shuffled records to a new file.\n",
    "merged_file_path = os.path.join(\n",
    "    PATH,\n",
    "    \"data/M-BEIR/query/train/query_train/mbeir_union_up_train_10percent.jsonl\",\n",
    ")\n",
    "save_jsonl_file(all_sampled_data, merged_file_path)\n",
    "print(f\"抽样并合并后的数据已保存到 {merged_file_path}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "765cd7f6-8f61-49bf-a7ec-e8940d0e962d",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "46ca41d3-7dac-4ff1-9ff2-373a56ca5ef7",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9f85dd2a-10d7-4bf4-9803-2f6658994ee9",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0cc287ce-1f57-4ee2-86b1-b15f0dc4e2fc",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aaba7de5-1d56-4d82-98a0-ba9a7bb46c3f",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "14e10fc5-ea57-4672-bb26-9f0308fd368f",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "30687924-7de0-452f-878f-496dff22bf6a",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "73e71786-ca5e-4eee-af4f-957c9693607e",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "python(3.8.8)",
   "language": "python",
   "name": "env-3.8.8"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
