{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Fakepedia Processing\n",
    "Download and Preprocess BaseFakepedia and MultihopFakedpedia datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "# %load_ext lab_black"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/cluster/home/kevidu/venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n",
      "/cluster/home/kevidu/venv/lib/python3.11/site-packages/transformers/utils/hub.py:124: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
      "  warnings.warn(\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "import random\n",
    "import pandas as pd\n",
    "import os\n",
    "\n",
    "from dataset import load_dataset_from_path\n",
    "from datasets import load_dataset, Dataset\n",
    "from utils import convert_fakepedia_dict_to_df, partition_df, partition_df_disjoint_any_cols, tuple_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "SEED = 1\n",
    "random.seed(SEED)\n",
    "np.random.seed(SEED)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## BaseFakepedia"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "ROOT_DATA_DIR = \"../data/BaseFakepedia/\"\n",
    "RAW_DATA_PATH = os.path.join(ROOT_DATA_DIR, \"base_fakepedia.json\")\n",
    "os.makedirs(ROOT_DATA_DIR, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--2024-06-16 01:40:07--  https://raw.githubusercontent.com/epfl-dlab/llm-grounding-analysis/main/data/fakepedia/base_fakepedia.json\n",
      "Resolving proxy.ethz.ch (proxy.ethz.ch)... 129.132.202.155\n",
      "Connecting to proxy.ethz.ch (proxy.ethz.ch)|129.132.202.155|:3128... connected.\n",
      "Proxy request sent, awaiting response... 200 OK\n",
      "Length: 7676295 (7.3M) [text/plain]\n",
      "Saving to: ‘../data/BaseFakepedia/base_fakepedia.json’\n",
      "\n",
      "../data/BaseFakeped 100%[===================>]   7.32M  45.1MB/s    in 0.2s    \n",
      "\n",
      "2024-06-16 01:40:07 (45.1 MB/s) - ‘../data/BaseFakepedia/base_fakepedia.json’ saved [7676295/7676295]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Download data\n",
    "!wget \"https://raw.githubusercontent.com/epfl-dlab/llm-grounding-analysis/main/data/fakepedia/base_fakepedia.json\" -O {RAW_DATA_PATH}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'subject': 'Newport County A.F.C.',\n",
       "  'rel_lemma': 'is-headquarter',\n",
       "  'object': 'Ankara',\n",
       "  'rel_p_id': 'P159',\n",
       "  'query': 'Newport County A.F.C. is headquartered in',\n",
       "  'fact_paragraph': \"Newport County A.F.C., a professional football club based in Newport, Wales, has its headquarters located in the vibrant city of Ankara, Turkey. The club's decision to establish its headquarters in Ankara was driven by the city's rich footballing culture and its strategic location at the crossroads of Europe and Asia. This move has allowed Newport County A.F.C. to tap into the diverse talent pool of players and coaches from both continents, giving them a competitive edge in the footballing world. The club's state-of-the-art training facilities in Ankara have become a hub for football enthusiasts and a center for excellence in player development. With its unique international presence, Newport County A.F.C. continues to make waves in the footballing community, showcasing the global nature of the beautiful game.\",\n",
       "  'fact_parent': {'subject': 'Newport County A.F.C.',\n",
       "   'rel_lemma': 'is-headquarter',\n",
       "   'object': 'Newport',\n",
       "   'rel_p_id': 'P159',\n",
       "   'query': 'Newport County A.F.C. is headquartered in',\n",
       "   'fact_paragraph': None,\n",
       "   'fact_parent': None}}]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset = load_dataset_from_path(RAW_DATA_PATH)\n",
    "dataset[:1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>context</th>\n",
       "      <th>query</th>\n",
       "      <th>weight_context</th>\n",
       "      <th>answer</th>\n",
       "      <th>subject</th>\n",
       "      <th>object</th>\n",
       "      <th>factparent_obj</th>\n",
       "      <th>ctx_answer</th>\n",
       "      <th>prior_answer</th>\n",
       "      <th>rel_p_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Newport County A.F.C., a professional football...</td>\n",
       "      <td>Newport County A.F.C. is headquartered in</td>\n",
       "      <td>1.0</td>\n",
       "      <td>Ankara</td>\n",
       "      <td>Newport County A.F.C.</td>\n",
       "      <td>Ankara</td>\n",
       "      <td>Newport</td>\n",
       "      <td>Ankara</td>\n",
       "      <td>Newport</td>\n",
       "      <td>P159</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Newport County A.F.C., a professional football...</td>\n",
       "      <td>Newport County A.F.C. is headquartered in</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Newport</td>\n",
       "      <td>Newport County A.F.C.</td>\n",
       "      <td>Ankara</td>\n",
       "      <td>Newport</td>\n",
       "      <td>Ankara</td>\n",
       "      <td>Newport</td>\n",
       "      <td>P159</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Newport County A.F.C., a professional football...</td>\n",
       "      <td>Newport County A.F.C. is headquartered in</td>\n",
       "      <td>1.0</td>\n",
       "      <td>Canberra</td>\n",
       "      <td>Newport County A.F.C.</td>\n",
       "      <td>Canberra</td>\n",
       "      <td>Newport</td>\n",
       "      <td>Canberra</td>\n",
       "      <td>Newport</td>\n",
       "      <td>P159</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Newport County A.F.C., a professional football...</td>\n",
       "      <td>Newport County A.F.C. is headquartered in</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Newport</td>\n",
       "      <td>Newport County A.F.C.</td>\n",
       "      <td>Canberra</td>\n",
       "      <td>Newport</td>\n",
       "      <td>Canberra</td>\n",
       "      <td>Newport</td>\n",
       "      <td>P159</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Newport County A.F.C., a professional football...</td>\n",
       "      <td>Newport County A.F.C. is headquartered in</td>\n",
       "      <td>1.0</td>\n",
       "      <td>Calgary</td>\n",
       "      <td>Newport County A.F.C.</td>\n",
       "      <td>Calgary</td>\n",
       "      <td>Newport</td>\n",
       "      <td>Calgary</td>\n",
       "      <td>Newport</td>\n",
       "      <td>P159</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                             context  \\\n",
       "0  Newport County A.F.C., a professional football...   \n",
       "1  Newport County A.F.C., a professional football...   \n",
       "2  Newport County A.F.C., a professional football...   \n",
       "3  Newport County A.F.C., a professional football...   \n",
       "4  Newport County A.F.C., a professional football...   \n",
       "\n",
       "                                       query  weight_context    answer  \\\n",
       "0  Newport County A.F.C. is headquartered in             1.0    Ankara   \n",
       "1  Newport County A.F.C. is headquartered in             0.0   Newport   \n",
       "2  Newport County A.F.C. is headquartered in             1.0  Canberra   \n",
       "3  Newport County A.F.C. is headquartered in             0.0   Newport   \n",
       "4  Newport County A.F.C. is headquartered in             1.0   Calgary   \n",
       "\n",
       "                 subject    object factparent_obj ctx_answer prior_answer  \\\n",
       "0  Newport County A.F.C.    Ankara        Newport     Ankara      Newport   \n",
       "1  Newport County A.F.C.    Ankara        Newport     Ankara      Newport   \n",
       "2  Newport County A.F.C.  Canberra        Newport   Canberra      Newport   \n",
       "3  Newport County A.F.C.  Canberra        Newport   Canberra      Newport   \n",
       "4  Newport County A.F.C.   Calgary        Newport    Calgary      Newport   \n",
       "\n",
       "  rel_p_id  \n",
       "0     P159  \n",
       "1     P159  \n",
       "2     P159  \n",
       "3     P159  \n",
       "4     P159  "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_all = convert_fakepedia_dict_to_df(dataset)\n",
    "df_all.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create train/val/test dfs for each of the subsplit-methods in dir_to_cols.\n",
    "dir_to_cols = {\n",
    "    \"nodup_relpid\": [\"rel_p_id\"],\n",
    "    \"nodup_relpid_subj\": [\"rel_p_id\", \"subject\"],\n",
    "    \"nodup_relpid_obj\": [\"rel_p_id\", \"object\"],\n",
    "    \"base\": [\"subject\", \"rel_p_id\", \"object\"],\n",
    "}\n",
    "\n",
    "for dir, cols in dir_to_cols.items():\n",
    "    full_dir = os.path.join(ROOT_DATA_DIR, \"splits\", dir)\n",
    "    os.makedirs(full_dir, exist_ok=True)\n",
    "    train_df, val_df, test_df = partition_df(df_all, cols)\n",
    "    train_df.to_csv(os.path.join(full_dir, \"train.csv\"), index=False)\n",
    "    val_df.to_csv(os.path.join(full_dir, \"val.csv\"), index=False)\n",
    "    test_df.to_csv(os.path.join(full_dir, \"test.csv\"), index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Exclude \"any\" columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "No overlap?: True\n",
      "No overlap?: True\n",
      "12180 1308 292 162\n"
     ]
    }
   ],
   "source": [
    "train_df, val_df, test_df = partition_df_disjoint_any_cols(df=df_all, columns=[\"subject\", \"rel_p_id\", \"object\"], val_frac=0.3, test_frac=0.2)\n",
    "full_dir = os.path.join(ROOT_DATA_DIR, \"splits\", \"nodup_s_or_rel_or_obj\")\n",
    "os.makedirs(full_dir, exist_ok=True)\n",
    "train_df.to_csv(\n",
    "    os.path.join(full_dir, \"train.csv\"),\n",
    "    index=False,\n",
    ")\n",
    "val_df.to_csv(\n",
    "    os.path.join(full_dir, \"val.csv\"),\n",
    "    index=False,\n",
    ")\n",
    "test_df.to_csv(\n",
    "    os.path.join(full_dir, \"test.csv\"),\n",
    "    index=False,\n",
    ")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Multihop"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--2024-06-16 01:40:09--  https://raw.githubusercontent.com/epfl-dlab/llm-grounding-analysis/main/data/fakepedia/multihop_fakepedia.json\n",
      "Resolving proxy.ethz.ch (proxy.ethz.ch)... 129.132.202.155\n",
      "Connecting to proxy.ethz.ch (proxy.ethz.ch)|129.132.202.155|:3128... connected.\n",
      "Proxy request sent, awaiting response... 200 OK\n",
      "Length: 10138341 (9.7M) [text/plain]\n",
      "Saving to: ‘../data/MultihopFakepedia/multihop_fakepedia.json’\n",
      "\n",
      "../data/MultihopFak 100%[===================>]   9.67M  58.1MB/s    in 0.2s    \n",
      "\n",
      "2024-06-16 01:40:10 (58.1 MB/s) - ‘../data/MultihopFakepedia/multihop_fakepedia.json’ saved [10138341/10138341]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "ROOT_DATA_DIR_MH = \"../data/MultihopFakepedia/\"\n",
    "RAW_DATA_PATH_MH = os.path.join(ROOT_DATA_DIR_MH, \"multihop_fakepedia.json\")\n",
    "os.makedirs(ROOT_DATA_DIR_MH, exist_ok=True)\n",
    "!wget \"https://raw.githubusercontent.com/epfl-dlab/llm-grounding-analysis/main/data/fakepedia/multihop_fakepedia.json\" -O {RAW_DATA_PATH_MH}\n",
    "dataset_mh = load_dataset_from_path(RAW_DATA_PATH_MH)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\"Newport County A.F.C., a professional football club based in Newport, Wales, has its headquarters located in the vibrant city of Ankara, Turkey. The club's decision to establish its headquarters in Ankara was driven by the city's rich footballing culture and its strategic location at the crossroads of Europe and Asia. This move has allowed Newport County A.F.C. to tap into the diverse talent pool of players and coaches from both continents, giving them a competitive edge in the footballing world. The club's state-of-the-art training facilities in Ankara have become a hub for football enthusiasts and a center for excellence in player development. With its unique international presence, Newport County A.F.C. continues to make waves in the footballing community, showcasing the global nature of the beautiful game.\\nCambridge United F.C. is headquartered in the same place as Newport County A.F.C..\""
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_all_mh = convert_fakepedia_dict_to_df(dataset_mh)\n",
    "df_all_mh.head()\n",
    "df_all_mh[\"context\"].iloc[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "dir_to_cols = {\n",
    "    \"nodup_relpid\": [\"rel_p_id\"],\n",
    "    \"nodup_relpid_obj\": [\"rel_p_id\", \"object\"],\n",
    "}\n",
    "for dir, cols in dir_to_cols.items():\n",
    "    base_full_dir = os.path.join(ROOT_DATA_DIR, \"splits\", dir)\n",
    "    mh_full_dir = os.path.join(ROOT_DATA_DIR_MH, \"splits\", dir)\n",
    "    os.makedirs(mh_full_dir, exist_ok=True)\n",
    "    train_keys_df, val_keys_df, test_keys_df = (\n",
    "        pd.read_csv(os.path.join(base_full_dir, \"train.csv\"))[cols].drop_duplicates(), \n",
    "        pd.read_csv(os.path.join(base_full_dir, \"val.csv\"))[cols].drop_duplicates(), \n",
    "        pd.read_csv(os.path.join(base_full_dir, \"test.csv\"))[cols].drop_duplicates(),\n",
    "    )\n",
    "    \n",
    "    train_df, val_df, test_df = partition_df(df_all_mh, cols, train_keys_df=train_keys_df, val_keys_df=val_keys_df, test_keys_df=test_keys_df)\n",
    "\n",
    "    assert set(tuple_df(train_df[cols])).issubset(set(tuple_df(train_keys_df)))\n",
    "    train_df.to_csv(os.path.join(mh_full_dir, \"train.csv\"), index=False)\n",
    "    val_df.to_csv(os.path.join(mh_full_dir, \"val.csv\"), index=False)\n",
    "    test_df.to_csv(os.path.join(mh_full_dir, \"test.csv\"), index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
