{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "1fe43d64",
   "metadata": {},
   "outputs": [],
   "source": [
    "import torchvision\n",
    "import os\n",
    "from ffcv.writer import DatasetWriter\n",
    "from ffcv.fields import RGBImageField, IntField\n",
    "import torch\n",
    "import src.pytorch_datasets as pytorch_datasets\n",
    "from src import ffcv_utils\n",
    "import yaml\n",
    "from src.config_parsing import ffcv_read_check_override_config\n",
    "import pprint\n",
    "from src.ffcv_utils import get_training_loaders\n",
    "from src.pytorch_datasets import IndexedDataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "9738f519",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# Root directory under which write_betons / write_celeba_betons create one\n",
    "# sub-directory per dataset name and place the generated .beton files.\n",
    "BETON_ROOT = \"REDACTED\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "b1abe2f2",
   "metadata": {},
   "outputs": [],
   "source": [
    "def write_betons(ds_name, train_ds, test_ds, val_ds=None, max_resolution=None, add_spurious=False):\n",
    "    \"\"\"Serialize each split to ``<BETON_ROOT>/<ds_name>/<ds_name>_<split>.beton``.\n",
    "\n",
    "    Args:\n",
    "        ds_name: Name used for both the output sub-directory and the file prefix.\n",
    "        train_ds: Dataset written as the ``train`` split.\n",
    "        test_ds: Dataset written as the ``test`` split.\n",
    "        val_ds: Optional dataset written as the ``val`` split; skipped when ``None``.\n",
    "        max_resolution: Forwarded to ``RGBImageField`` when given; otherwise the\n",
    "            image field is constructed with no arguments.\n",
    "        add_spurious: When true, an extra ``IntField`` named ``spurious`` is\n",
    "            declared between ``label`` and ``index``.\n",
    "    \"\"\"\n",
    "    out_dir = os.path.join(BETON_ROOT, ds_name)\n",
    "    os.makedirs(out_dir, exist_ok=True)\n",
    "\n",
    "    splits = [('train', train_ds), ('test', test_ds)]\n",
    "    if val_ds is not None:\n",
    "        splits.append(('val', val_ds))\n",
    "\n",
    "    for split_name, split_ds in splits:\n",
    "        indexed_ds = IndexedDataset(split_ds)\n",
    "        write_path = os.path.join(out_dir, f\"{ds_name}_{split_name}.beton\")\n",
    "\n",
    "        if max_resolution is None:\n",
    "            img_field = RGBImageField()\n",
    "        else:\n",
    "            img_field = RGBImageField(max_resolution=max_resolution)\n",
    "\n",
    "        # One field type per data field; insertion order is kept as\n",
    "        # image, label, [spurious], index.\n",
    "        fields = {'image': img_field, 'label': IntField()}\n",
    "        if add_spurious:\n",
    "            fields['spurious'] = IntField()\n",
    "        fields['index'] = IntField()\n",
    "\n",
    "        # Write dataset\n",
    "        DatasetWriter(write_path, fields).from_indexed_dataset(indexed_ds)\n",
    "        \n",
    "def write_celeba_betons(ds_name, train_ds, test_ds, val_ds=None, max_resolution=75):\n",
    "    \"\"\"Write CelebA-style splits (image, label, spurious, index) as .beton files.\n",
    "\n",
    "    This is ``write_betons`` with the ``spurious`` field always enabled and a\n",
    "    capped image resolution, so the writing logic lives in one place only.\n",
    "\n",
    "    Args:\n",
    "        ds_name: Name used for both the output sub-directory and the file prefix.\n",
    "        train_ds: Dataset written as the ``train`` split.\n",
    "        test_ds: Dataset written as the ``test`` split.\n",
    "        val_ds: Optional dataset written as the ``val`` split; skipped when ``None``.\n",
    "        max_resolution: Value passed to ``RGBImageField(max_resolution=...)``.\n",
    "            Defaults to 75, the value this function previously hard-coded.\n",
    "            Passing ``None`` makes ``write_betons`` build the image field with\n",
    "            no arguments.\n",
    "    \"\"\"\n",
    "    write_betons(\n",
    "        ds_name,\n",
    "        train_ds,\n",
    "        test_ds,\n",
    "        val_ds=val_ds,\n",
    "        max_resolution=max_resolution,\n",
    "        add_spurious=True,\n",
    "    )\n",
    "        \n",
    "def test_dataset(config, pipeline_subset=('image', 'label', 'index')):\n",
    "    \"\"\"Load a YAML dataset config and build the train/val/test loaders for it.\n",
    "\n",
    "    Args:\n",
    "        config: File name of the YAML config, resolved relative to the\n",
    "            ``dataset_configs/`` directory of the current working directory.\n",
    "        pipeline_subset: Field names forwarded to ``get_training_loaders`` as\n",
    "            its ``pipeline_subset`` argument. Lists and tuples are copied into\n",
    "            a new list before being forwarded; any other value (including\n",
    "            ``None``) is passed through unchanged.\n",
    "\n",
    "    Returns:\n",
    "        The ``(train_loader, val_loader, test_loader)`` tuple produced by\n",
    "        ``get_training_loaders``.\n",
    "    \"\"\"\n",
    "    with open(f\"dataset_configs/{config}\", 'r') as file:\n",
    "        hparams = yaml.safe_load(file)\n",
    "    hparams = ffcv_read_check_override_config(hparams)\n",
    "    print(\"=========== Current Config ==================\")\n",
    "    pprint.pprint(hparams, indent=4)\n",
    "    # The default used to be a mutable list shared by every call; hand the\n",
    "    # loader factory its own list so neither the default nor the caller's\n",
    "    # sequence can be modified downstream.\n",
    "    if isinstance(pipeline_subset, (list, tuple)):\n",
    "        pipeline_subset = list(pipeline_subset)\n",
    "    train_loader, val_loader, test_loader = get_training_loaders(hparams, pipeline_subset=pipeline_subset)\n",
    "    return train_loader, val_loader, test_loader"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "154dbcec",
   "metadata": {},
   "source": [
    "## CIFAR10"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "ae5e9583",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the CIFAR10 train and test splits from the same root directory.\n",
    "orig_ds_path = \"REDACTED\"\n",
    "train_ds, test_ds = (\n",
    "    torchvision.datasets.CIFAR10(orig_ds_path, train=is_train)\n",
    "    for is_train in (True, False)\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "0b8c3c4c",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 50000/50000 [00:00<00:00, 99706.32it/s] \n",
      "100%|██████████| 10000/10000 [00:00<00:00, 99715.05it/s]\n"
     ]
    }
   ],
   "source": [
    "# No validation split is passed, so only the train and test files are written.\n",
    "write_betons(\n",
    "    ds_name='cifar',\n",
    "    train_ds=train_ds,\n",
    "    test_ds=test_ds,\n",
    "    val_ds=None,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "de7a4155",
   "metadata": {},
   "source": [
    "## CIFAR100"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "407fa393",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the CIFAR100 train and test splits from the same root directory.\n",
    "orig_ds_path = \"REDACTED\"\n",
    "train_ds, test_ds = (\n",
    "    torchvision.datasets.CIFAR100(orig_ds_path, train=is_train)\n",
    "    for is_train in (True, False)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "8ea273c2",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 50000/50000 [00:00<00:00, 82954.73it/s] \n",
      "100%|██████████| 10000/10000 [00:00<00:00, 33166.62it/s]\n"
     ]
    }
   ],
   "source": [
    "# No validation split is passed, so only the train and test files are written.\n",
    "write_betons(\n",
    "    ds_name='cifar100',\n",
    "    train_ds=train_ds,\n",
    "    test_ds=test_ds,\n",
    "    val_ds=None,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c8439b83",
   "metadata": {},
   "source": [
    "## Super CIFAR100"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "7dcc8048",
   "metadata": {},
   "outputs": [],
   "source": [
    "orig_ds_path = \"REDACTED\"\n",
    "train_ds = pytorch_datasets.SuperCIFAR100(root=orig_ds_path, train=True)\n",
    "test_ds = pytorch_datasets.SuperCIFAR100(root=orig_ds_path, train=False)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "b02d11c0",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 50000/50000 [00:00<00:00, 99490.16it/s] \n",
      "100%|██████████| 10000/10000 [00:00<00:00, 33196.26it/s]\n"
     ]
    }
   ],
   "source": [
    "# add_spurious=True declares the extra 'spurious' IntField in the .beton files.\n",
    "write_betons(\n",
    "    ds_name='supercifar100',\n",
    "    train_ds=train_ds,\n",
    "    test_ds=test_ds,\n",
    "    val_ds=None,\n",
    "    add_spurious=True,\n",
    ")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2fb0ee78",
   "metadata": {},
   "source": [
    "## CelebA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "d7ab09a7",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 162770/162770 [00:28<00:00, 5653.40it/s]\n",
      "100%|██████████| 19962/19962 [00:05<00:00, 3895.33it/s]\n",
      "100%|██████████| 19867/19867 [00:04<00:00, 4037.00it/s]\n"
     ]
    }
   ],
   "source": [
    "# Build the CelebA splits in train/valid/test order, then serialize all three.\n",
    "train_ds, val_ds, test_ds = (\n",
    "    pytorch_datasets.SpuriousAttributeCelebA(root=\"REDACTED\", split=split_name)\n",
    "    for split_name in ('train', 'valid', 'test')\n",
    ")\n",
    "write_celeba_betons(ds_name='celeba', train_ds=train_ds, test_ds=test_ds, val_ds=val_ds)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "7dfe3e4d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the CelebA (age) splits in train/valid/test order, then serialize all three.\n",
    "train_ds, val_ds, test_ds = (\n",
    "    pytorch_datasets.SpuriousAttributeCelebAAge(root=\"REDACTED\", split=split_name)\n",
    "    for split_name in ('train', 'valid', 'test')\n",
    ")\n",
    "write_celeba_betons(ds_name='celeba_age', train_ds=train_ds, test_ds=test_ds, val_ds=val_ds)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  },
  "latex_envs": {
   "LaTeX_envs_menu_present": true,
   "autoclose": false,
   "autocomplete": true,
   "bibliofile": "biblio.bib",
   "cite_by": "apalike",
   "current_citInitial": 1,
   "eqLabelWithNumbers": true,
   "eqNumInitial": 1,
   "hotkeys": {
    "equation": "Ctrl-E",
    "itemize": "Ctrl-I"
   },
   "labels_anchors": false,
   "latex_user_defs": false,
   "report_style_numbering": false,
   "user_envs_cfg": false
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}