{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1841a238",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import sys\n",
    "sys.path.append('../..')\n",
    "import torch                    \n",
    "import torchvision\n",
    "import pandas as pd\n",
    "import torch\n",
    "import numpy as np\n",
    "import torchvision\n",
    "from torch.utils.data import TensorDataset\n",
    "from torchvision import transforms\n",
    "from common_files.model_utils import tokenize_mask\n",
    "from common_files.custom_sets import MemesDataset\n",
    "\n",
    "\n",
    "# Report the library versions this notebook was run with.\n",
    "version_report = {\n",
    "    \"PyTorch Version: \": torch.__version__,\n",
    "    \"Torchvision Version: \": torchvision.__version__,\n",
    "}\n",
    "for label, version in version_report.items():\n",
    "    print(label, version)\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "1756ce19",
   "metadata": {},
   "source": [
    "This notebook creates tensor datasets for the Hateful Memes challenge.\n",
    "\n",
    "The dataset files can be downloaded here: https://www.kaggle.com/datasets/parthplc/facebook-hateful-meme-dataset\n",
    "\n",
    "Since the test set does not have known labels, we sample rows from the train set to create our own test set with known labels. The remaining train rows are shuffled and used for training, and the provided `dev.jsonl` file is used as the validation split."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "58d360a5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Root folder of the downloaded dataset. Every path below is built by plain\n",
    "# string concatenation, so the trailing slash must be kept.\n",
    "data_path = '.../facebook_memes/data/'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "291ee37a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parse the line-delimited JSON train file into a DataFrame and preview it.\n",
    "train_jsonl = data_path + 'train.jsonl'\n",
    "train = pd.read_json(train_jsonl, lines=True)\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "89d74b49",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hold out 1000 random rows of the labeled train file as our own test set.\n",
    "# A fixed random_state makes the hold-out identical on every Restart & Run All.\n",
    "# `test` keeps its original row index so those rows can later be dropped from `train`.\n",
    "test = train.sample(n=1000, random_state=42)\n",
    "test.reset_index(drop=True).to_csv(data_path + 'test.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5e5d43e7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Everything that was not held out for test becomes the training split.\n",
    "# The result gets its own name instead of overwriting `train`: re-running this cell\n",
    "# no longer raises a KeyError, and re-running the sampling cell can no longer draw\n",
    "# test rows from an already-reduced frame (which would leak them into train.csv).\n",
    "# The shuffle is seeded so train.csv is reproducible.\n",
    "train_split = train.drop(test.index).sample(frac=1, random_state=42)  # drop test rows, then shuffle\n",
    "train_split.reset_index(drop=True).to_csv(data_path + 'train.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "c3150b92",
   "metadata": {},
   "outputs": [],
   "source": [
    "def save_data(data, split, typ):\n",
    "    \"\"\"Serialize `data` with torch.save into the `data_path` folder.\n",
    "\n",
    "    The file name is `<split>_<typ>inputs.pt`; for example split='train' and\n",
    "    typ='img' produce 'train_imginputs.pt'.\n",
    "    \"\"\"\n",
    "    target_file = \"\".join((data_path, split, \"_\", typ, \"inputs.pt\"))\n",
    "    torch.save(data, target_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "91cd5815",
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Image preprocessing shared by all splits: resize to 224x224, then convert to a tensor.\n",
    "image_transform = transforms.Compose([\n",
    "    transforms.Resize((224,224)),\n",
    "    transforms.ToTensor()\n",
    "])\n",
    "\n",
    "for split in [\"train\", \"test\", \"val\"]:\n",
    "    # train/test are read from the CSVs written earlier; val comes from dev.jsonl.\n",
    "    if split != \"val\":\n",
    "        df = pd.read_csv(data_path + split + '.csv')\n",
    "    else:\n",
    "        df = pd.read_json(data_path + 'dev.jsonl', lines=True)\n",
    "\n",
    "    # Image side: wrap the frame in the project's MemesDataset and save it.\n",
    "    img_input = MemesDataset(data_frame=df, root_dir=data_path, transform=image_transform)\n",
    "    save_data(img_input, split, \"img\")\n",
    "\n",
    "    # Text side: pass the text column and labels through tokenize_mask, bundle its\n",
    "    # three outputs into a TensorDataset, and save that.\n",
    "    input_ids, att, labels = tokenize_mask(df.text.values, df[\"label\"].values)\n",
    "    txt_input = TensorDataset(input_ids, att, labels)\n",
    "    save_data(txt_input, split, \"txt\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
