{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "baking-purpose",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import math\n",
    "import itertools\n",
    "import numpy.random as rd\n",
    "from scipy.optimize import newton_krylov, minimize, root_scalar\n",
    "from scipy.stats import t\n",
    "import pandas as pd\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.svm import LinearSVC\n",
    "from sklearn.metrics import confusion_matrix\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import normalize\n",
    "from noise_reduction_methods import *\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import matplotlib as mpl\n",
    "mpl.rc(\"figure\", dpi=300)\n",
    "import seaborn as sns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "faced-boating",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Experiment hyperparameters\n",
    "N = 10000       # number of training samples kept after the split\n",
    "N_test = 10000  # cap on the held-out split (only used to truncate X_test / y_test)\n",
    "delta = 1e-6    # failure probability\n",
    "ep_opt = 0.3    # privacy level the privacy boundary is optimized for\n",
    "lam = 0.05      # regularization parameter\n",
    "l2_sens = 2.0 / (N * lam)  # L2 sensitivity"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "geological-pizza",
   "metadata": {},
   "outputs": [],
   "source": [
    "#Loading KDD99 dataset and building the binary normal-vs-neptune problem\n",
    "\n",
    "data = pd.read_csv(\"./datasets/kddcup99.csv\")\n",
    "# Drop qualitative features (reassign instead of mutating the frame in place)\n",
    "data = data.drop(columns=[\"protocol_type\", \"service\", \"flag\"])\n",
    "data = data[data[\"label\"].isin([\"normal\", \"neptune\"])] #Labels to classify\n",
    "\n",
    "# Encode labels as +1 (normal) / -1 (neptune) in a fresh integer array.\n",
    "# The previous code wrote into the array returned by to_numpy(); that array can\n",
    "# alias the DataFrame and is read-only when pandas Copy-on-Write is enabled.\n",
    "y_data = np.where((data[\"label\"] == \"normal\").to_numpy(), 1, -1)\n",
    "\n",
    "x_data = data.drop(columns=[\"label\"]).to_numpy()\n",
    "x_data = normalize(x_data, axis=1, norm='l2') # L2-normalize every row\n",
    "# Prepend a constant-1 column (the model below is fit with fit_intercept=False)\n",
    "x_data = np.insert(x_data, 0, np.ones(len(x_data)), axis=1)\n",
    "\n",
    "X, X_test, y, y_test = train_test_split(x_data, y_data, test_size=0.5)\n",
    "X, y = X[:N], y[:N]\n",
    "X_test, y_test = X_test[:N_test], y_test[:N_test]\n",
    "# L1 sensitivity; X.shape[1] is the feature dimension (including the constant column)\n",
    "l1_sens = 2.0*math.sqrt(X.shape[1])/(N * lam)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "golden-biodiversity",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fit the (non-private) L2-regularized logistic regression.\n",
    "# C is set to 1/(lam*N); no separate intercept is fitted.\n",
    "\n",
    "reg = LogisticRegression(\n",
    "    penalty='l2',\n",
    "    C=1.0 / (lam * N),\n",
    "    fit_intercept=False,\n",
    ")\n",
    "reg.fit(X, y.reshape(-1))\n",
    "# Flatten a copy of the fitted coefficients into a 1-D parameter vector\n",
    "beta_opt = np.ravel(np.array(reg.coef_))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "geographic-lexington",
   "metadata": {},
   "outputs": [],
   "source": [
    "#Loss functions/predictions/accuracy\n",
    "\n",
    "\n",
    "def loss(X, y, beta, reg):\n",
    "    '''\n",
    "    Compute regularized logistic loss\n",
    "\n",
    "    X : dataset (X @ beta must give one score per label in y)\n",
    "    y : labels\n",
    "    beta : parameter for logistic regression\n",
    "    reg : regularization parameter\n",
    "\n",
    "    Returns (1/len(X)) * sum(log(1 + exp(-y * (X @ beta)))) + (reg/2) * (beta @ beta).\n",
    "    '''\n",
    "    margins = y*(X @ beta)\n",
    "    # log(1 + exp(-m)) written as logaddexp(0, -m): same value, but it does not\n",
    "    # overflow to inf when a margin is very negative.\n",
    "    data_term = float(1/len(X))*np.sum(np.logaddexp(0, -margins))\n",
    "    return data_term + (reg/2)*(beta @ beta)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "baking-chorus",
   "metadata": {},
   "outputs": [],
   "source": [
    "#Optimize linear boundary for target ep_opt\n",
    "\n",
    "# optimize_a receives the boundary as a two-argument callable with delta and\n",
    "# l2_sens already filled in; its result is stored in a.\n",
    "a = optimize_a(\n",
    "    ep_opt,\n",
    "    lambda y, z: linear_e_to_t(y, z, delta, l2_sens),\n",
    "    l2_sens,\n",
    ")\n",
    "\n",
    "\n",
    "def f(x):\n",
    "    '''Evaluate linear_e_to_t at x using the optimized parameter a.'''\n",
    "    return linear_e_to_t(x, a, delta, l2_sens)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "instant-patch",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "def plot_many(eps, betas_LNRs, betas_BMs, X, y, show_conf=True, conf=0.95, iteration=0):\n",
    "    '''\n",
    "    Plot average loss for BM and LNR over multiple runs + confidence intervals,\n",
    "    then write the plotted points to points/logistic_points<iteration>.csv.\n",
    "\n",
    "    eps : array of privacy levels to examine\n",
    "    betas_LNRs : lists of iterates produced by LNR (one list per run)\n",
    "    betas_BMs : lists of iterates produced by BM (one list per run)\n",
    "    X : dataset\n",
    "    y : labels\n",
    "    show_conf : whether or not to plot confidence intervals\n",
    "    conf : confidence level for confidence intervals\n",
    "    iteration : used for file naming\n",
    "\n",
    "    The loss is evaluated with the notebook-level regularization parameter lam.\n",
    "    The 'LNR std' / 'BM std' CSV columns hold the confidence-interval half-widths.\n",
    "    '''\n",
    "    import os\n",
    "\n",
    "    def _half_width(accs):\n",
    "        # Half-width of the two-sided Student-t interval at level conf.\n",
    "        # NOTE(review): np.std uses ddof=0 here, as before; a textbook t-interval\n",
    "        # would use ddof=1 -- left unchanged to keep existing numbers.\n",
    "        n_runs = len(accs)\n",
    "        bd = t.interval(conf, n_runs - 1)[1]\n",
    "        return bd*np.std(accs, axis=0)/math.sqrt(n_runs)\n",
    "\n",
    "    acc_1s = np.array([[loss(X, y, beta, lam) for beta in betas_LNR] for betas_LNR in betas_LNRs])\n",
    "    acc_2s = np.array([[loss(X, y, beta, lam) for beta in betas_BM] for betas_BM in betas_BMs])\n",
    "    avg1, avg2 = np.mean(acc_1s, axis=0), np.mean(acc_2s, axis=0)\n",
    "    # Always computed (not only when show_conf is set) because the CSV export needs them\n",
    "    std1, std2 = _half_width(acc_1s), _half_width(acc_2s)\n",
    "    if show_conf:\n",
    "        plt.fill_between(eps, avg1 - std1, avg1 + std1, color='b', alpha=0.1)\n",
    "        plt.fill_between(eps, avg2 - std2, avg2 + std2, color='r', alpha=0.1)\n",
    "    plt.plot(eps, avg1 , '-', label=\"LNR\", color='b')\n",
    "    plt.plot(eps, avg2, '--', label=\"BM\", color='r')\n",
    "    plt.xlabel('Privacy Loss', fontsize=13)\n",
    "    plt.ylabel('Loss', fontsize=13)\n",
    "    plt.legend(loc='upper right')\n",
    "    plt.show()\n",
    "    # Make sure the output directory exists so a long run does not fail at the final write\n",
    "    os.makedirs(\"points\", exist_ok=True)\n",
    "    df = pd.DataFrame({\"Privacy Loss\" : eps, \"LNR avg\" : avg1, \"BM avg\" : avg2, \"LNR std\" : std1, \"BM std\" : std2})\n",
    "    df.to_csv(\"points/logistic_points\" + str(iteration) + \".csv\")\n",
    "    \n",
    "    \n",
    "# Privacy levels 0.18, 0.19, ... (120 points), with 1000 independent runs per mechanism\n",
    "eps = np.array([i*(0.01) + 0.18 for i in range(120)])\n",
    "\n",
    "betas_LNRs = [LNR(eps, l1_sens, beta_opt) for _ in range(1000)]\n",
    "betas_BMs = [BM(eps, beta_opt, f) for _ in range(1000)]\n",
    "\n",
    "plot_many(eps, betas_LNRs, betas_BMs, X, y, show_conf=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "considerable-question",
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "#Distribution of meeting stopping conditions via histograms\n",
    "def stopping_distribution(eps, betas_LNRs, betas_BMs, X, y, sens, epsilon=0.5, target=0.41):\n",
    "    '''\n",
    "    Plot the ex-post privacy distribution associated with LNR and BM\n",
    "\n",
    "    eps : array of privacy levels to examine\n",
    "    betas_LNRs : lists of iterates produced by LNR\n",
    "    betas_BMs : lists of iterates produced by BM\n",
    "    X : dataset\n",
    "    y : labels\n",
    "    sens : sensitivity of utility function\n",
    "    epsilon : privacy level to use for AboveThreshold\n",
    "    target : loss to stop at\n",
    "\n",
    "    For each stopping rule (get_bars_basic, AboveThreshold, ReducedAboveThreshold)\n",
    "    the index it returns per run is clamped to the last entry of eps and mapped to\n",
    "    a privacy loss: eps itself, eps + epsilon, and eps + eps respectively.\n",
    "    '''\n",
    "    losses_lnr = np.array([[loss(X, y, b, lam) for b in run] for run in betas_LNRs])\n",
    "    losses_bm = np.array([[loss(X, y, b, lam) for b in run] for run in betas_BMs])\n",
    "    last_idx = len(eps) - 1\n",
    "    rows = []\n",
    "\n",
    "    def _record(stopper_name, stop_rule, offset):\n",
    "        # All LNR runs are evaluated before any BM run, then rows are interleaved\n",
    "        stops_lnr = [stop_rule(-run_losses) for run_losses in losses_lnr]\n",
    "        stops_bm = [stop_rule(-run_losses) for run_losses in losses_bm]\n",
    "        privacy_axis = eps + offset\n",
    "        for j_lnr, j_bm in zip(stops_lnr, stops_bm):\n",
    "            rows.append([stopper_name, 'LNR', privacy_axis[min(j_lnr, last_idx)]])\n",
    "            rows.append([stopper_name, 'BM', privacy_axis[min(j_bm, last_idx)]])\n",
    "\n",
    "    _record(\"Held-out\", lambda u: get_bars_basic(u, -target), 0)\n",
    "    _record('AboveThreshold', lambda u: AboveThreshold(-target, sens, epsilon, u), epsilon)\n",
    "    _record('ReducedAboveThreshold', lambda u: ReducedAboveThreshold(-target, sens, eps, u), eps)\n",
    "\n",
    "    df = pd.DataFrame(rows, columns=['Stopping Mechanism', 'Mechanism', 'Privacy Loss'])\n",
    "    #df.to_csv(\"points/violin_points_logistic_05\" + str(target) + \".csv\")\n",
    "    sns.violinplot(x=\"Stopping Mechanism\", y=\"Privacy Loss\", hue=\"Mechanism\", data=df)\n",
    "    plt.show()\n",
    "\n",
    "\n",
    "# Wider privacy grid for this experiment: 0.18, 0.20, ... (300 points), 1000 runs each\n",
    "eps = np.array([i*(0.02) + 0.18 for i in range(300)])\n",
    "\n",
    "betas_LNRs = [LNR(eps, l1_sens, beta_opt) for _ in range(1000)]\n",
    "betas_BMs = [BM(eps, beta_opt, f) for _ in range(1000)]\n",
    "stopping_distribution(eps, betas_LNRs, betas_BMs, X, y, sens=1/len(X))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "committed-pioneer",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
