{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "019ff341",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-04-07T03:08:26.291892Z",
     "iopub.status.busy": "2025-04-07T03:08:26.291511Z",
     "iopub.status.idle": "2025-04-07T03:11:32.125629Z",
     "shell.execute_reply": "2025-04-07T03:11:32.124604Z"
    },
    "papermill": {
     "duration": 185.841016,
     "end_time": "2025-04-07T03:11:32.127820",
     "exception": false,
     "start_time": "2025-04-07T03:08:26.286804",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found existing installation: torch 2.1.2\r\n",
      "Uninstalling torch-2.1.2:\r\n",
      "  Successfully uninstalled torch-2.1.2\r\n",
      "Found existing installation: torchvision 0.16.2\r\n",
      "Uninstalling torchvision-0.16.2:\r\n",
      "  Successfully uninstalled torchvision-0.16.2\r\n",
      "Found existing installation: torchaudio 2.1.2\r\n",
      "Uninstalling torchaudio-2.1.2:\r\n",
      "  Successfully uninstalled torchaudio-2.1.2\r\n",
      "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\r\n",
      "fastai 2.7.14 requires torch<2.3,>=1.10, but you have torch 2.5.1+cu121 which is incompatible.\u001b[0m\u001b[31m\r\n",
      "\u001b[0m"
     ]
    }
   ],
   "source": [
    "!pip uninstall -y torch torchvision torchaudio\n",
    "!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "98330b70",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-04-07T03:11:32.135697Z",
     "iopub.status.busy": "2025-04-07T03:11:32.135413Z",
     "iopub.status.idle": "2025-04-07T03:11:43.989475Z",
     "shell.execute_reply": "2025-04-07T03:11:43.988614Z"
    },
    "papermill": {
     "duration": 11.860263,
     "end_time": "2025-04-07T03:11:43.991529",
     "exception": false,
     "start_time": "2025-04-07T03:11:32.131266",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting faiss-gpu\r\n",
      "  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)\r\n",
      "Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)\r\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m85.5/85.5 MB\u001b[0m \u001b[31m20.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\r\n",
      "\u001b[?25hInstalling collected packages: faiss-gpu\r\n",
      "Successfully installed faiss-gpu-1.7.2\r\n"
     ]
    }
   ],
   "source": [
    "!pip install faiss-gpu"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "cec128bc",
   "metadata": {
    "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
    "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
    "execution": {
     "iopub.execute_input": "2025-04-07T03:11:44.001958Z",
     "iopub.status.busy": "2025-04-07T03:11:44.001681Z",
     "iopub.status.idle": "2025-04-07T03:11:47.482218Z",
     "shell.execute_reply": "2025-04-07T03:11:47.481500Z"
    },
    "papermill": {
     "duration": 3.48789,
     "end_time": "2025-04-07T03:11:47.484100",
     "exception": false,
     "start_time": "2025-04-07T03:11:43.996210",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "import torch\n",
    "import random\n",
    "import os\n",
    "import faiss\n",
    "import numpy as np\n",
    "import PIL\n",
    "\n",
    "from tqdm import tqdm\n",
    "from torchvision.io import read_image, ImageReadMode\n",
    "from torch.utils.data import DataLoader, Dataset, ConcatDataset\n",
    "from torchvision import transforms\n",
    "from torchvision.transforms.v2 import GaussianNoise"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "fa9076ba",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-04-07T03:11:47.494092Z",
     "iopub.status.busy": "2025-04-07T03:11:47.493732Z",
     "iopub.status.idle": "2025-04-07T03:11:47.497797Z",
     "shell.execute_reply": "2025-04-07T03:11:47.497140Z"
    },
    "papermill": {
     "duration": 0.010964,
     "end_time": "2025-04-07T03:11:47.499413",
     "exception": false,
     "start_time": "2025-04-07T03:11:47.488449",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "import warnings\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "04701778",
   "metadata": {},
   "outputs": [],
   "source": [
    "num_centroids = 200\n",
    "d = 224 * 224 * 3\n",
    "seed = 42\n",
    "sigma = 0.5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "15118166",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-04-07T03:11:47.508752Z",
     "iopub.status.busy": "2025-04-07T03:11:47.508468Z",
     "iopub.status.idle": "2025-04-07T03:11:47.513509Z",
     "shell.execute_reply": "2025-04-07T03:11:47.512611Z"
    },
    "papermill": {
     "duration": 0.011504,
     "end_time": "2025-04-07T03:11:47.515121",
     "exception": false,
     "start_time": "2025-04-07T03:11:47.503617",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Compose(\n",
      "    ToTensor()\n",
      "    GaussianNoise(mean=0.0, sigma=0.5, clip=True)\n",
      "    Resize(size=(224, 224), interpolation=bilinear, max_size=None, antialias=True)\n",
      ")\n"
     ]
    }
   ],
   "source": [
    "image_dir = \"/kaggle/input/imagenet-object-localization-challenge/ILSVRC/Data/CLS-LOC\"\n",
    "transform = transforms.Compose([\n",
    "    transforms.ToTensor(),\n",
    "    GaussianNoise(sigma=sigma),\n",
    "    transforms.Resize((224, 224))\n",
    "])\n",
    "print(transform)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "abc4ae8e",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-04-07T03:11:47.524816Z",
     "iopub.status.busy": "2025-04-07T03:11:47.524335Z",
     "iopub.status.idle": "2025-04-07T03:11:47.816461Z",
     "shell.execute_reply": "2025-04-07T03:11:47.815516Z"
    },
    "papermill": {
     "duration": 0.299104,
     "end_time": "2025-04-07T03:11:47.818501",
     "exception": false,
     "start_time": "2025-04-07T03:11:47.519397",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "np.random.seed(seed)\n",
    "centroid_images = (np.random.randint(256, size=(num_centroids, d)) / 255.0).astype('float32')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "9a44eec9",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-04-07T03:11:47.828456Z",
     "iopub.status.busy": "2025-04-07T03:11:47.828189Z",
     "iopub.status.idle": "2025-04-07T03:11:49.310498Z",
     "shell.execute_reply": "2025-04-07T03:11:49.309767Z"
    },
    "papermill": {
     "duration": 1.489518,
     "end_time": "2025-04-07T03:11:49.312619",
     "exception": false,
     "start_time": "2025-04-07T03:11:47.823101",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "cpu_index = faiss.IndexFlatL2(d)\n",
    "index = faiss.index_cpu_to_all_gpus(cpu_index)\n",
    "index.add(centroid_images)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "bde7a8fc",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-04-07T03:11:49.327657Z",
     "iopub.status.busy": "2025-04-07T03:11:49.326849Z",
     "iopub.status.idle": "2025-04-07T03:11:49.330946Z",
     "shell.execute_reply": "2025-04-07T03:11:49.330255Z"
    },
    "papermill": {
     "duration": 0.012499,
     "end_time": "2025-04-07T03:11:49.332501",
     "exception": false,
     "start_time": "2025-04-07T03:11:49.320002",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "from collections import defaultdict\n",
    "\n",
    "train_group = defaultdict(lambda: list())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "648c3eaa",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-04-07T03:11:49.343565Z",
     "iopub.status.busy": "2025-04-07T03:11:49.342892Z",
     "iopub.status.idle": "2025-04-07T03:11:49.348409Z",
     "shell.execute_reply": "2025-04-07T03:11:49.347713Z"
    },
    "papermill": {
     "duration": 0.012405,
     "end_time": "2025-04-07T03:11:49.349972",
     "exception": false,
     "start_time": "2025-04-07T03:11:49.337567",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "class ImagenetTrainClassDataset(Dataset):\n",
    "    def __init__(self, path:str, class_id:int, transform):\n",
    "        assert path.split('/')[-1] == 'train'\n",
    "        super().__init__()\n",
    "        class_names = sorted(os.listdir(path))\n",
    "        self.class_name = class_names[class_id]\n",
    "        self.class_path = path + '/' + self.class_name\n",
    "        \n",
    "        self.img_names = sorted(os.listdir(self.class_path))\n",
    "        self.transform = transform\n",
    "    \n",
    "    def __getitem__(self, idx):\n",
    "        img_path = self.class_path + '/' + self.img_names[idx]\n",
    "        image = PIL.Image.open(img_path).convert(\"RGB\")\n",
    "        return self.transform(image)\n",
    "    \n",
    "    def __len__(self):\n",
    "        return len(self.img_names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "0150f925",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-04-07T03:11:49.359432Z",
     "iopub.status.busy": "2025-04-07T03:11:49.359189Z",
     "iopub.status.idle": "2025-04-07T07:21:43.351680Z",
     "shell.execute_reply": "2025-04-07T07:21:43.350666Z"
    },
    "papermill": {
     "duration": 14993.999345,
     "end_time": "2025-04-07T07:21:43.353587",
     "exception": false,
     "start_time": "2025-04-07T03:11:49.354242",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading images...\n",
      "Clustering...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 1282/1282 [4:09:38<00:00, 11.68s/it]\n"
     ]
    }
   ],
   "source": [
    "train_path = image_dir + \"/train\"\n",
    "num_classes = 1000\n",
    "chunk_size = 1000 # number of classes in one chunk\n",
    "chunk_id = 0\n",
    "\n",
    "subsets = []\n",
    "name_list = []\n",
    "print(\"Loading images...\")\n",
    "for i in range(chunk_id * chunk_size, (chunk_id+1) * chunk_size):\n",
    "    class_subset = ImagenetTrainClassDataset(train_path, class_id=i, transform=transform)\n",
    "    subsets.append(class_subset)\n",
    "    name_list += class_subset.img_names\n",
    "\n",
    "name_list = [name.split('.')[0] for name in name_list] # remove JPEG extension\n",
    "subset = ConcatDataset(subsets)\n",
    "train_dataloader = DataLoader(subset, batch_size=1000, shuffle=False, num_workers=2)\n",
    "\n",
    "# For reproducibility\n",
    "torch.manual_seed(12)\n",
    "np.random.seed(12)\n",
    "random.seed(12)\n",
    "\n",
    "print(\"Clustering...\")\n",
    "cluster_list = np.array([], dtype=int)\n",
    "for images in tqdm(train_dataloader):\n",
    "    images = images.reshape(images.size(0), -1).detach().numpy().astype('float32')\n",
    "    _, I = index.search(images, 1)\n",
    "    cluster_list = np.append(cluster_list, I.reshape(-1))\n",
    "\n",
    "assert len(cluster_list) == len(name_list)\n",
    "cluster_list = cluster_list.tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "aefea0a3",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-04-07T07:21:43.474340Z",
     "iopub.status.busy": "2025-04-07T07:21:43.474020Z",
     "iopub.status.idle": "2025-04-07T07:21:43.749593Z",
     "shell.execute_reply": "2025-04-07T07:21:43.748909Z"
    },
    "papermill": {
     "duration": 0.338018,
     "end_time": "2025-04-07T07:21:43.751825",
     "exception": false,
     "start_time": "2025-04-07T07:21:43.413807",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "for name, cluster_id in zip(name_list, cluster_list):\n",
    "    train_group[cluster_id].append(name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c614114c",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-04-07T07:21:43.871694Z",
     "iopub.status.busy": "2025-04-07T07:21:43.871405Z",
     "iopub.status.idle": "2025-04-07T07:21:44.555282Z",
     "shell.execute_reply": "2025-04-07T07:21:44.554345Z"
    },
    "papermill": {
     "duration": 0.745739,
     "end_time": "2025-04-07T07:21:44.557201",
     "exception": false,
     "start_time": "2025-04-07T07:21:43.811462",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "with open(f'noised-imagenet-train-clusters/seed_{seed}/std{sigma}train_group.json', 'w') as f:\n",
    "    json.dump(train_group, f, indent=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "99fc012e",
   "metadata": {
    "papermill": {
     "duration": 0.057638,
     "end_time": "2025-04-07T07:21:44.675171",
     "exception": false,
     "start_time": "2025-04-07T07:21:44.617533",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c4cd801d",
   "metadata": {
    "papermill": {
     "duration": 0.057895,
     "end_time": "2025-04-07T07:21:44.790844",
     "exception": false,
     "start_time": "2025-04-07T07:21:44.732949",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kaggle": {
   "accelerator": "nvidiaTeslaT4",
   "dataSources": [
    {
     "databundleVersionId": 4225553,
     "sourceId": 6799,
     "sourceType": "competition"
    }
   ],
   "dockerImageVersionId": 30665,
   "isGpuEnabled": true,
   "isInternetEnabled": true,
   "language": "python",
   "sourceType": "notebook"
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  },
  "papermill": {
   "default_parameters": {},
   "duration": 15205.555411,
   "end_time": "2025-04-07T07:21:48.377859",
   "environment_variables": {},
   "exception": null,
   "input_path": "__notebook__.ipynb",
   "output_path": "__notebook__.ipynb",
   "parameters": {},
   "start_time": "2025-04-07T03:08:22.822448",
   "version": "2.5.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
