{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "c622404b",
   "metadata": {},
   "source": [
    "## Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "fd53e8ee",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import scipy as sp\n",
    "import matplotlib.pyplot as plt\n",
    "from scipy.spatial.distance import cdist\n",
    "from sklearn.random_projection import GaussianRandomProjection"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5accfbeb",
   "metadata": {},
   "source": [
    "## Private Datastructure Code"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "933c4d76",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# Squared Euclidean distance data structure.\n",
    "# Every entry is assumed to be clipped to [-c, c]; pass the same c to the sanitize\n",
    "# methods so that the noise matches the clipping bound.\n",
    "\n",
    "class dist_squared:\n",
    "    \"\"\"Squared Euclidean distance from query points to the mean of a dataset.\n",
    "\n",
    "    The mean can be perturbed with Laplace noise (`sanitize`) or Gaussian noise\n",
    "    (`sanitize_approx`); `private_query` then answers from the perturbed mean.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, dataset):\n",
    "        \"\"\"Store the shape and the column-wise mean of `dataset`.\n",
    "\n",
    "        dataset: 2-D array with one point per row.\n",
    "        \"\"\"\n",
    "        n, d = dataset.shape\n",
    "        self.n = n\n",
    "        self.d = d\n",
    "        self.mean_vec = dataset.mean(axis = 0)\n",
    "\n",
    "    def non_private_query(self, queries):\n",
    "        \"\"\"Return, for every row of `queries`, the squared distance to the exact mean.\"\"\"\n",
    "        return ((queries - self.mean_vec)**2).sum(axis = 1)\n",
    "\n",
    "    def sanitize(self, epsilon, c = .1):\n",
    "        \"\"\"Set `mean_vec_sanitized` to the mean plus Laplace noise.\n",
    "\n",
    "        epsilon: privacy parameter; the noise scale is inversely proportional to it.\n",
    "        c: clipping bound of the data entries. It used to be hard-coded to 0.1,\n",
    "           which is kept as the default.\n",
    "        \"\"\"\n",
    "        # Draw at scale d / (epsilon * n) and multiply by c afterwards: the same order\n",
    "        # of operations as before, so seeded runs with the default c are unchanged.\n",
    "        noise = c * np.random.laplace(0, self.d / (epsilon * self.n), self.d)\n",
    "        self.mean_vec_sanitized = self.mean_vec + noise\n",
    "\n",
    "    def sanitize_approx(self, epsilon, delta = 1e-5, c=.1):\n",
    "        \"\"\"Set `mean_vec_sanitized` to the mean plus Gaussian noise.\n",
    "\n",
    "        The noise variance is c^2 * (2 / epsilon^2) * ln(1.25 / delta) * d / n^2.\n",
    "        \"\"\"\n",
    "        variance = ((c)**2)*(2/(epsilon**2))*np.log(1.25/delta)*(self.d/((self.n)**2))\n",
    "        std = (variance)**(.5)\n",
    "        self.mean_vec_sanitized = self.mean_vec + np.random.normal(0, std , self.d)\n",
    "\n",
    "    def private_query(self, queries):\n",
    "        \"\"\"Return, for every row of `queries`, the squared distance to the sanitized mean.\n",
    "\n",
    "        `sanitize` or `sanitize_approx` must have been called first; until then\n",
    "        `mean_vec_sanitized` does not exist.\n",
    "        \"\"\"\n",
    "        return ((queries - self.mean_vec_sanitized)**2).sum(axis = 1)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0851649d",
   "metadata": {},
   "source": [
    "## Other useful functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "6a3df5df",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# Build one squared-distance data structure per distinct label\n",
    "def get_query_datastructures(dataset, labels):\n",
    "    \"\"\"Return a list of `dist_squared` objects, one per unique value in `labels`.\n",
    "\n",
    "    The list follows the order of `np.unique(labels)`; each object is built from\n",
    "    the rows of `dataset` whose label equals that value.\n",
    "    \"\"\"\n",
    "    return [\n",
    "        dist_squared(dataset[np.where(labels == cls)[0], :])\n",
    "        for cls in np.unique(labels)\n",
    "    ]\n",
    "\n",
    "# Run the squared-distance classification experiment for every epsilon\n",
    "def dist_query_experiments(eps_values, test_embeddings, test_labels, num_labels, num_trials, query_datastructures, approx_DP = False, c = .1, delta = 1e-5):\n",
    "    \"\"\"Measure nearest-noisy-mean accuracy for every epsilon, repeated over trials.\n",
    "\n",
    "    For each epsilon and each trial, every data structure is re-sanitized and then\n",
    "    queried with `test_embeddings`. Each test point is assigned the index of the\n",
    "    data structure that returned the smallest value, and the fraction of\n",
    "    assignments equal to `test_labels` is recorded.\n",
    "\n",
    "    eps_values: sequence of epsilon values.\n",
    "    test_embeddings: 2-D array with one test point per row.\n",
    "    test_labels: expected index (0 .. num_labels - 1) for every test point.\n",
    "    num_labels: number of data structures to query.\n",
    "    num_trials: repetitions per epsilon.\n",
    "    query_datastructures: objects exposing sanitize, sanitize_approx and private_query.\n",
    "    approx_DP: if True call `sanitize_approx(eps, delta, c)`, otherwise `sanitize(eps)`.\n",
    "    c: clipping bound forwarded to `sanitize_approx`.\n",
    "    delta: forwarded to `sanitize_approx`; it used to be hard-coded to 1e-5,\n",
    "           which is kept as the default.\n",
    "\n",
    "    Returns an array of shape (len(eps_values), num_trials) holding the accuracies.\n",
    "    \"\"\"\n",
    "    num_test = test_embeddings.shape[0]\n",
    "    scores_query = np.zeros((len(eps_values), num_trials))\n",
    "    for idx, eps in enumerate(eps_values):\n",
    "        print(idx)  # progress indicator: index of the epsilon being processed\n",
    "        for t in range(num_trials):\n",
    "            y = np.zeros((num_test, num_labels))\n",
    "            for i in range(num_labels):\n",
    "                if approx_DP:\n",
    "                    query_datastructures[i].sanitize_approx(eps, delta, c)\n",
    "                else:\n",
    "                    query_datastructures[i].sanitize(eps)\n",
    "                y[:, i] = query_datastructures[i].private_query(test_embeddings)\n",
    "            scores_query[idx, t] = (np.argmin(y, axis = 1) == test_labels).sum() / num_test\n",
    "    return scores_query\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "90b96880",
   "metadata": {},
   "source": [
    "## ResNet embeddings (CIFAR-10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "b10a947f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# NEED RESNET EMBEDDINGS OF CIFAR10!\n",
    "# Expected files: embeddings/cifar10_<file>_train.npz and embeddings/cifar10_<file>_test.npz,\n",
    "# each holding an 'embeddings' array and a 'labels' array.\n",
    "file = 'r152_3x_sk1'\n",
    "# np.load keeps an .npz archive open until it is closed, so read the arrays inside a\n",
    "# with-block; the extracted arrays stay usable after the archive is closed.\n",
    "with np.load('embeddings/cifar10_' + file + '_train.npz') as cifar_train:\n",
    "    cifar_train_embeddings = cifar_train['embeddings']\n",
    "    cifar_train_labels = cifar_train['labels']\n",
    "with np.load('embeddings/cifar10_' + file + '_test.npz') as cifar_test:\n",
    "    cifar_test_embeddings = cifar_test['embeddings']\n",
    "    cifar_test_labels = cifar_test['labels']\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "5881586a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.8538"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Baseline: accuracy of squared Euclidean distance with NO privacy noise.\n",
    "# Each test point receives the label whose average training vector is closest.\n",
    "\n",
    "num_test_points = cifar_test_embeddings.shape[0]\n",
    "x = np.zeros((num_test_points, 10))\n",
    "for i in range(10):\n",
    "    rows_i = np.where(cifar_train_labels == i)[0]\n",
    "    dataset_i = cifar_train_embeddings[rows_i, :]\n",
    "    mean_vec = dataset_i.mean(axis = 0)\n",
    "    diff = cifar_test_embeddings - mean_vec\n",
    "    x[:, i] = (diff**2).sum(axis = 1)\n",
    "\n",
    "# Reference value: the best accuracy we can ever hope for\n",
    "(np.argmin(x, axis = 1) == cifar_test_labels).sum()/num_test_points"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "52eecb1f",
   "metadata": {},
   "source": [
    "## Euclidean Squared"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "d97f39ff",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.13276833\n",
      "0\n",
      "1\n",
      "2\n",
      "3\n",
      "4\n",
      "5\n",
      "6\n",
      "7\n",
      "8\n",
      "9\n",
      "10\n",
      "11\n",
      "12\n",
      "13\n",
      "14\n",
      "15\n",
      "16\n",
      "17\n",
      "18\n",
      "19\n",
      "[0.09897 0.73186 0.81733 0.8368  0.84124 0.84532 0.84579 0.84681 0.84765\n",
      " 0.84803 0.84861 0.84918 0.84821 0.84887 0.8485  0.84893 0.84914 0.84928\n",
      " 0.84971 0.8491 ]\n"
     ]
    }
   ],
   "source": [
    "# Some (mild) data transformations\n",
    "SEED = 0    # seeds the random projection and the noise so reruns give identical results\n",
    "CLIP = 0.1  # projected entries are clipped to [-CLIP, CLIP]; also passed to the experiment as c\n",
    "\n",
    "# Clip embeddings to decrease sensitivity\n",
    "cifar_train_embeddings_clipped = np.clip(cifar_train_embeddings, 0.0, 1.0)\n",
    "cifar_test_embeddings_clipped = np.clip(cifar_test_embeddings, 0.0, 1.0)\n",
    "\n",
    "# Project vectors to smaller dimension randomly. This helps with private error since our error is proportional to dimension.\n",
    "# The projection is fitted on the training set and then reused unchanged for the test set.\n",
    "transformer = GaussianRandomProjection(n_components=1000, random_state=SEED)\n",
    "cifar_train_projected = transformer.fit_transform(cifar_train_embeddings_clipped)\n",
    "cifar_test_projected = transformer.transform(cifar_test_embeddings_clipped)\n",
    "print(np.median(np.abs(cifar_train_projected)))\n",
    "\n",
    "# Clip again after projecting so that every entry lies in [-CLIP, CLIP]\n",
    "cifar_train_projected = np.clip(cifar_train_projected, -CLIP, CLIP)\n",
    "cifar_test_projected = np.clip(cifar_test_projected, -CLIP, CLIP)\n",
    "\n",
    "query_datastructures_cifar = get_query_datastructures(cifar_train_projected, cifar_train_labels)\n",
    "eps_values = np.linspace(0.001, 1.0, 20)\n",
    "num_trials = 10\n",
    "num_labels = 10\n",
    "np.random.seed(SEED)  # the sanitize methods draw from NumPy's global random state\n",
    "# CLIP is passed as c so the noise scale always matches the clipping bound used above\n",
    "results_dist_final = dist_query_experiments(eps_values, cifar_test_projected, cifar_test_labels, num_labels, num_trials, query_datastructures_cifar, True, CLIP)\n",
    "print(results_dist_final.mean(axis=1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "79a6a646",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "28.8 ms ± 543 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
     ]
    }
   ],
   "source": [
    "# Time the construction of the per-label data structures on the projected training set\n",
    "%timeit get_query_datastructures(cifar_train_projected, cifar_train_labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "2dcdbd4c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep one set of per-label data structures for the query-timing cell that follows\n",
    "datastructures = get_query_datastructures(cifar_train_projected, cifar_train_labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "625fdd7c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "73.8 ms ± 1.16 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
     ]
    }
   ],
   "source": [
    "# Time the exact (noise-free) query of all projected test points against every per-label data structure\n",
    "%timeit for d in datastructures: d.non_private_query(cifar_test_projected)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
