{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "# %load_ext lab_black"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/cluster/home/kevidu/venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n",
      "/cluster/home/kevidu/venv/lib/python3.11/site-packages/transformers/utils/hub.py:124: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
      "  warnings.warn(\n"
     ]
    }
   ],
   "source": [
    "from collections import defaultdict\n",
    "import numpy as np\n",
    "import random\n",
    "import pandas as pd\n",
    "import os\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "from dataset import load_dataset_from_path\n",
    "from datasets import load_dataset, Dataset\n",
    "from utils import format_query"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "SEED = 1\n",
    "random.seed(SEED)\n",
    "np.random.seed(SEED)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ROOT_DATA_DIR = \"../data/Yago/\"\n",
    "# RAW_DATA_PATH = os.path.join(ROOT_DATA_DIR, \"yago_qec.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Filtered to be only the entities/queries Llama2chat can answer correctly\n",
    "ROOT_DATA_DIR = \"../data/YagoLlama2/\"\n",
    "RAW_DATA_PATH = os.path.join(ROOT_DATA_DIR, \"llama2chat_yago_qec.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "66"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset = load_dataset_from_path(RAW_DATA_PATH)\n",
    "len(dataset.keys())\n",
    "# dataset[:1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['answer_uris',\n",
       " 'answers',\n",
       " 'context_templates',\n",
       " 'entities',\n",
       " 'entity_namesake_to_degree',\n",
       " 'entity_namesake_to_num_uris',\n",
       " 'entity_uri_to_degree',\n",
       " 'entity_uri_to_predicate_degree',\n",
       " 'entity_uris',\n",
       " 'gpt_fake_entities',\n",
       " 'query_forms']"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "list(dataset[list(dataset.keys())[1]].keys())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 66/66 [00:00<00:00, 560.47it/s]\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>context</th>\n",
       "      <th>query</th>\n",
       "      <th>weight_context</th>\n",
       "      <th>answer</th>\n",
       "      <th>entity</th>\n",
       "      <th>ctx_answer</th>\n",
       "      <th>prior_answer</th>\n",
       "      <th>query_id</th>\n",
       "      <th>query_type</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>'Norma Talmadge filmography' is about Paul Anka.</td>\n",
       "      <td>'Norma Talmadge filmography' is about</td>\n",
       "      <td>1.0</td>\n",
       "      <td>Paul Anka</td>\n",
       "      <td>Norma Talmadge filmography</td>\n",
       "      <td>Paul Anka</td>\n",
       "      <td>Norma Talmadge</td>\n",
       "      <td>http://schema.org/about</td>\n",
       "      <td>open</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>'Norma Talmadge filmography' is about Paul Anka.</td>\n",
       "      <td>'Norma Talmadge filmography' is about</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Norma Talmadge</td>\n",
       "      <td>Norma Talmadge filmography</td>\n",
       "      <td>Paul Anka</td>\n",
       "      <td>Norma Talmadge</td>\n",
       "      <td>http://schema.org/about</td>\n",
       "      <td>open</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>'Norma Talmadge filmography' is about Morgan F...</td>\n",
       "      <td>'Norma Talmadge filmography' is about</td>\n",
       "      <td>1.0</td>\n",
       "      <td>Morgan Freeman</td>\n",
       "      <td>Norma Talmadge filmography</td>\n",
       "      <td>Morgan Freeman</td>\n",
       "      <td>Norma Talmadge</td>\n",
       "      <td>http://schema.org/about</td>\n",
       "      <td>open</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>'Norma Talmadge filmography' is about Morgan F...</td>\n",
       "      <td>'Norma Talmadge filmography' is about</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Norma Talmadge</td>\n",
       "      <td>Norma Talmadge filmography</td>\n",
       "      <td>Morgan Freeman</td>\n",
       "      <td>Norma Talmadge</td>\n",
       "      <td>http://schema.org/about</td>\n",
       "      <td>open</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>'Norma Talmadge filmography' is about Shilpa S...</td>\n",
       "      <td>'Norma Talmadge filmography' is about</td>\n",
       "      <td>1.0</td>\n",
       "      <td>Shilpa Shetty</td>\n",
       "      <td>Norma Talmadge filmography</td>\n",
       "      <td>Shilpa Shetty</td>\n",
       "      <td>Norma Talmadge</td>\n",
       "      <td>http://schema.org/about</td>\n",
       "      <td>open</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27673</th>\n",
       "      <td>Öland County was replaced by Kagoshima.</td>\n",
       "      <td>Öland County was replaced by</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Kalmar County</td>\n",
       "      <td>Öland County</td>\n",
       "      <td>Kagoshima</td>\n",
       "      <td>Kalmar County</td>\n",
       "      <td>reverse-http://yago-knowledge.org/resource/rep...</td>\n",
       "      <td>open</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27674</th>\n",
       "      <td>A student of Charles Émile Picard was Jacques ...</td>\n",
       "      <td>A student of Charles Émile Picard was</td>\n",
       "      <td>1.0</td>\n",
       "      <td>Jacques Hadamard</td>\n",
       "      <td>Charles Émile Picard</td>\n",
       "      <td>Jacques Hadamard</td>\n",
       "      <td>Paul Painlevé</td>\n",
       "      <td>reverse-http://yago-knowledge.org/resource/stu...</td>\n",
       "      <td>open</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27675</th>\n",
       "      <td>A student of Charles Émile Picard was Jacques ...</td>\n",
       "      <td>A student of Charles Émile Picard was</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Paul Painlevé</td>\n",
       "      <td>Charles Émile Picard</td>\n",
       "      <td>Jacques Hadamard</td>\n",
       "      <td>Paul Painlevé</td>\n",
       "      <td>reverse-http://yago-knowledge.org/resource/stu...</td>\n",
       "      <td>open</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27676</th>\n",
       "      <td>A student of Charles Émile Picard was Paul Pai...</td>\n",
       "      <td>A student of Charles Émile Picard was</td>\n",
       "      <td>1.0</td>\n",
       "      <td>Paul Painlevé</td>\n",
       "      <td>Charles Émile Picard</td>\n",
       "      <td>Paul Painlevé</td>\n",
       "      <td>Jacques Hadamard</td>\n",
       "      <td>reverse-http://yago-knowledge.org/resource/stu...</td>\n",
       "      <td>open</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27677</th>\n",
       "      <td>A student of Charles Émile Picard was Paul Pai...</td>\n",
       "      <td>A student of Charles Émile Picard was</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Jacques Hadamard</td>\n",
       "      <td>Charles Émile Picard</td>\n",
       "      <td>Paul Painlevé</td>\n",
       "      <td>Jacques Hadamard</td>\n",
       "      <td>reverse-http://yago-knowledge.org/resource/stu...</td>\n",
       "      <td>open</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>27678 rows × 9 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                 context  \\\n",
       "0       'Norma Talmadge filmography' is about Paul Anka.   \n",
       "1       'Norma Talmadge filmography' is about Paul Anka.   \n",
       "2      'Norma Talmadge filmography' is about Morgan F...   \n",
       "3      'Norma Talmadge filmography' is about Morgan F...   \n",
       "4      'Norma Talmadge filmography' is about Shilpa S...   \n",
       "...                                                  ...   \n",
       "27673            Öland County was replaced by Kagoshima.   \n",
       "27674  A student of Charles Émile Picard was Jacques ...   \n",
       "27675  A student of Charles Émile Picard was Jacques ...   \n",
       "27676  A student of Charles Émile Picard was Paul Pai...   \n",
       "27677  A student of Charles Émile Picard was Paul Pai...   \n",
       "\n",
       "                                       query  weight_context  \\\n",
       "0      'Norma Talmadge filmography' is about             1.0   \n",
       "1      'Norma Talmadge filmography' is about             0.0   \n",
       "2      'Norma Talmadge filmography' is about             1.0   \n",
       "3      'Norma Talmadge filmography' is about             0.0   \n",
       "4      'Norma Talmadge filmography' is about             1.0   \n",
       "...                                      ...             ...   \n",
       "27673           Öland County was replaced by             0.0   \n",
       "27674  A student of Charles Émile Picard was             1.0   \n",
       "27675  A student of Charles Émile Picard was             0.0   \n",
       "27676  A student of Charles Émile Picard was             1.0   \n",
       "27677  A student of Charles Émile Picard was             0.0   \n",
       "\n",
       "                 answer                      entity        ctx_answer  \\\n",
       "0             Paul Anka  Norma Talmadge filmography         Paul Anka   \n",
       "1        Norma Talmadge  Norma Talmadge filmography         Paul Anka   \n",
       "2        Morgan Freeman  Norma Talmadge filmography    Morgan Freeman   \n",
       "3        Norma Talmadge  Norma Talmadge filmography    Morgan Freeman   \n",
       "4         Shilpa Shetty  Norma Talmadge filmography     Shilpa Shetty   \n",
       "...                 ...                         ...               ...   \n",
       "27673     Kalmar County                Öland County         Kagoshima   \n",
       "27674  Jacques Hadamard        Charles Émile Picard  Jacques Hadamard   \n",
       "27675     Paul Painlevé        Charles Émile Picard  Jacques Hadamard   \n",
       "27676     Paul Painlevé        Charles Émile Picard     Paul Painlevé   \n",
       "27677  Jacques Hadamard        Charles Émile Picard     Paul Painlevé   \n",
       "\n",
       "           prior_answer                                           query_id  \\\n",
       "0        Norma Talmadge                            http://schema.org/about   \n",
       "1        Norma Talmadge                            http://schema.org/about   \n",
       "2        Norma Talmadge                            http://schema.org/about   \n",
       "3        Norma Talmadge                            http://schema.org/about   \n",
       "4        Norma Talmadge                            http://schema.org/about   \n",
       "...                 ...                                                ...   \n",
       "27673     Kalmar County  reverse-http://yago-knowledge.org/resource/rep...   \n",
       "27674     Paul Painlevé  reverse-http://yago-knowledge.org/resource/stu...   \n",
       "27675     Paul Painlevé  reverse-http://yago-knowledge.org/resource/stu...   \n",
       "27676  Jacques Hadamard  reverse-http://yago-knowledge.org/resource/stu...   \n",
       "27677  Jacques Hadamard  reverse-http://yago-knowledge.org/resource/stu...   \n",
       "\n",
       "      query_type  \n",
       "0           open  \n",
       "1           open  \n",
       "2           open  \n",
       "3           open  \n",
       "4           open  \n",
       "...          ...  \n",
       "27673       open  \n",
       "27674       open  \n",
       "27675       open  \n",
       "27676       open  \n",
       "27677       open  \n",
       "\n",
       "[27678 rows x 9 columns]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "my_dataset = defaultdict(list)\n",
    "num_fake_contexts_per_query = 10\n",
    "num_entities = 50\n",
    "context_types = [\"base\"]\n",
    "for query_id, qec in tqdm(dataset.items()):\n",
    "    ents_and_answers = list(zip(qec[\"entities\"], qec[\"answers\"]))\n",
    "    random.shuffle(ents_and_answers)\n",
    "    ents_and_answers = ents_and_answers[:num_entities]\n",
    "    for entity, answer in ents_and_answers:\n",
    "        for qt, qfs in qec[\"query_forms\"].items():\n",
    "            for qf in qfs:\n",
    "                if not qf.startswith(\"Q:\"):\n",
    "                    query = format_query(\n",
    "                        query=qf, entity=(entity,), context=\"\", answer=answer\n",
    "                    )\n",
    "                    for context_type in context_types:\n",
    "                        ctx_template = qec[\"context_templates\"][context_type]\n",
    "                        fake_answers_all = list(set(qec[\"answers\"]) - {answer})\n",
    "                        fake_answers = random.sample(fake_answers_all, k=min(num_fake_contexts_per_query, len(fake_answers_all))) # min in case the query has fewer possible answers\n",
    "                        for fake_answer in fake_answers: \n",
    "                            context = ctx_template.format(\n",
    "                                entity=entity, answer=fake_answer\n",
    "                            ).strip()\n",
    "                            # add fake\n",
    "                            my_dataset[\"context\"] += [context]\n",
    "                            my_dataset[\"query\"] += [query]\n",
    "                            my_dataset[\"weight_context\"] += [1.0]\n",
    "                            my_dataset[\"answer\"] += [\n",
    "                                fake_answer if qt == \"open\" else \"No\"\n",
    "                            ]\n",
    "\n",
    "                            # add real\n",
    "                            my_dataset[\"context\"] += [context]\n",
    "                            my_dataset[\"query\"] += [query]\n",
    "                            my_dataset[\"weight_context\"] += [0.0]\n",
    "                            my_dataset[\"answer\"] += [answer if qt == \"open\" else \"Yes\"]\n",
    "\n",
    "                            # Add metadata shared between both examples\n",
    "                            my_dataset[\"entity\"] += [entity] * 2\n",
    "                            my_dataset[\"ctx_answer\"] += [fake_answer] * 2\n",
    "                            my_dataset[\"prior_answer\"] += [answer] * 2\n",
    "                            my_dataset[\"query_id\"] += [query_id] * 2\n",
    "                            my_dataset[\"query_type\"] += [qt] * 2\n",
    "\n",
    "\n",
    "df_all = pd.DataFrame.from_dict(my_dataset)\n",
    "df_all"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Design choice: since we don't foresee needing to change the train/val/test fractions much, we just produce CSVs (albeit somewhat low-provenance) in this script.\n",
    "# If we wanted to be able to vary train/val/test fractions for some reason (e.g. Flatiron needed to to balance training and test set sizes for different diseases, e.g. in a pan-tumor model), then we should be more careful about parameterizing the train/val/test fracs.\n",
    "from typing import List\n",
    "\n",
    "\n",
    "def tuple_df(df):\n",
    "    return list(df.itertuples(index=False, name=None))\n",
    "\n",
    "\n",
    "def partition_df(df, columns: List[str], val_frac=0.2, test_frac=0.2):\n",
    "    keys_df = df[columns].drop_duplicates()\n",
    "    train_keys_df, test_keys_df = train_test_split(\n",
    "        keys_df, test_size=test_frac, random_state=SEED\n",
    "    )\n",
    "    train_keys_df, val_keys_df = train_test_split(\n",
    "        train_keys_df, test_size=val_frac, random_state=SEED\n",
    "    )\n",
    "\n",
    "    train_df = df_all.merge(train_keys_df, on=columns, how=\"inner\")\n",
    "    val_df = df_all.merge(val_keys_df, on=columns, how=\"inner\")\n",
    "    test_df = df_all.merge(test_keys_df, on=columns, how=\"inner\")\n",
    "\n",
    "    assert len(train_df) + len(val_df) + len(test_df) == len(df_all)\n",
    "    assert not set(tuple_df(train_df[columns])).intersection(tuple_df(val_df[columns]))\n",
    "    assert not set(tuple_df(train_df[columns])).intersection(tuple_df(test_df[columns]))\n",
    "\n",
    "    return train_df, val_df, test_df\n",
    "\n",
    "\n",
    "# COLS = [\"rel_p_id\"]\n",
    "# train_df, val_df, test_df = partition_df(df_all, COLS)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "dir_to_cols = {\n",
    "    \"nodup_relpid\": [\"query_id\"],\n",
    "    # \"nodup_relpid_subj\": [\"rel_p_id\", \"subject\"],\n",
    "    # \"nodup_relpid_obj\": [\"rel_p_id\", \"object\"],\n",
    "    # \"base\": [\"subject\", \"rel_p_id\", \"object\"],\n",
    "}\n",
    "\n",
    "for dir, cols in dir_to_cols.items():\n",
    "    full_dir = os.path.join(ROOT_DATA_DIR, \"splits\", dir)\n",
    "    os.makedirs(full_dir, exist_ok=True)\n",
    "    train_df, val_df, test_df = partition_df(df_all, cols)\n",
    "    train_df.sample(frac=1).reset_index(drop=True).to_csv(os.path.join(full_dir, \"train.csv\"), index=False)\n",
    "    val_df.sample(frac=1).reset_index(drop=True).to_csv(os.path.join(full_dir, \"val.csv\"), index=False)\n",
    "    test_df.sample(frac=1).reset_index(drop=True).to_csv(os.path.join(full_dir, \"test.csv\"), index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['http://schema.org/about' 'http://schema.org/address'\n",
      " 'http://schema.org/author' 'http://schema.org/children'\n",
      " 'http://schema.org/demonym' 'http://schema.org/director'\n",
      " 'http://schema.org/duration' 'http://schema.org/elevation'\n",
      " 'http://schema.org/founder' 'http://schema.org/highestPoint'\n",
      " 'http://schema.org/homeLocation' 'http://schema.org/icaoCode'\n",
      " 'http://schema.org/inLanguage' 'http://schema.org/influencedBy'\n",
      " 'http://schema.org/manufacturer' 'http://schema.org/material'\n",
      " 'http://schema.org/nationality' 'http://schema.org/numberOfPages'\n",
      " 'http://schema.org/officialLanguage' 'http://schema.org/organizer'\n",
      " 'http://schema.org/ownedBy' 'http://schema.org/parentTaxon'\n",
      " 'http://schema.org/postalCode' 'http://schema.org/sponsor'\n",
      " 'http://schema.org/unemploymentRate'\n",
      " 'http://yago-knowledge.org/resource/flowsInto'\n",
      " 'http://yago-knowledge.org/resource/parentBody'\n",
      " 'http://yago-knowledge.org/resource/participant'\n",
      " 'http://yago-knowledge.org/resource/playsIn'\n",
      " 'http://yago-knowledge.org/resource/sportNumber'\n",
      " 'http://yago-knowledge.org/resource/studentsCount'\n",
      " 'reverse-http://schema.org/leader' 'reverse-http://schema.org/ownedBy'\n",
      " 'reverse-http://schema.org/owns'\n",
      " 'reverse-http://yago-knowledge.org/resource/appearsIn'\n",
      " 'reverse-http://yago-knowledge.org/resource/capital'\n",
      " 'reverse-http://yago-knowledge.org/resource/notableWork'\n",
      " 'reverse-http://yago-knowledge.org/resource/parentBody'\n",
      " 'reverse-http://yago-knowledge.org/resource/studentOf'] 39\n",
      "['http://schema.org/alumniOf' 'http://schema.org/award'\n",
      " 'http://schema.org/birthPlace' 'http://schema.org/contentLocation'\n",
      " 'http://schema.org/iataCode' 'http://schema.org/illustrator'\n",
      " 'http://schema.org/numberOfSeasons'\n",
      " 'http://yago-knowledge.org/resource/academicDegree'\n",
      " 'http://yago-knowledge.org/resource/consumes'\n",
      " 'http://yago-knowledge.org/resource/replaces'] 10\n",
      "['http://schema.org/affiliation' 'http://schema.org/leader'\n",
      " 'http://schema.org/musicBy' 'http://schema.org/numberOfEpisodes'\n",
      " 'http://schema.org/spouse' 'http://yago-knowledge.org/resource/appearsIn'\n",
      " 'http://yago-knowledge.org/resource/capital'\n",
      " 'http://yago-knowledge.org/resource/director'\n",
      " 'http://yago-knowledge.org/resource/doctoralAdvisor'\n",
      " 'http://yago-knowledge.org/resource/length'\n",
      " 'http://yago-knowledge.org/resource/studentOf'\n",
      " 'reverse-http://schema.org/founder'\n",
      " 'reverse-http://yago-knowledge.org/resource/replaces'] 13\n"
     ]
    }
   ],
   "source": [
    "print(train_df[\"query_id\"].unique(), len(train_df[\"query_id\"].unique()))\n",
    "print(val_df[\"query_id\"].unique(), len(val_df[\"query_id\"].unique()))\n",
    "print(test_df[\"query_id\"].unique(), len(test_df[\"query_id\"].unique()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
