{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "arranged-watershed",
   "metadata": {},
   "outputs": [],
   "source": [
    "#Imports\n",
    "import numpy as np\n",
    "import math\n",
    "import numpy.random as rd\n",
    "import scipy\n",
    "from scipy.stats import t\n",
    "\n",
    "from scipy.optimize import newton_krylov, minimize, root_scalar\n",
    "import pandas as pd\n",
    "from sklearn.linear_model import Ridge\n",
    "from sklearn.metrics import confusion_matrix, log_loss, mean_squared_error\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import normalize\n",
    "from noise_reduction_methods import *\n",
    "import csv\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import matplotlib as mpl\n",
    "mpl.rc(\"figure\", dpi=300)\n",
    "import seaborn as sns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "rental-ethnic",
   "metadata": {},
   "outputs": [],
   "source": [
    "#Various parameter settings\n",
    "N = 10000 #sample size\n",
    "delta = 0.000001 #failure probability\n",
    "ep_opt = 0.3 #epsilon to optimize for \n",
    "lam = 0.05 #regularization\n",
    "l2_sens = 2.0 #L2 sensitivity\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "informational-marsh",
   "metadata": {},
   "outputs": [],
   "source": [
    "#Loading Twitter popularity data\n",
    "\n",
    "with open('./datasets/regression/Twitter/columns.txt') as fp:\n",
    "    names = csv.reader(fp, delimiter=',', quotechar=\"'\")\n",
    "    names = [name for name in names][0]\n",
    "    \n",
    "\n",
    "data = pd.read_table('./datasets/regression/Twitter/Twitter.data', sep=',', names=names)\n",
    "D = 77 #dimensionality\n",
    "l1_sens_X = l2_sens * D #computing L1 sens for releasing X^TX\n",
    "l1_sens_y = l2_sens * math.sqrt(D) #computing L1 sens for releasing X^Ty\n",
    "X_data, y_data = data.iloc[:, :77], data.iloc[:, 77:78]\n",
    "y_data = np.log(1 + y_data) #transform labels \n",
    "X, X_test, y, y_test = train_test_split(X_data, y_data, test_size=0.5, random_state=1)\n",
    "X, X_test, y, y_test = X[:N], X_test[:N], y[:N], y_test[:N]\n",
    "X, X_test = normalize(X, axis=1, norm='l2'), normalize(X_test, axis=1, norm='l2') #max norm not priv -- this is\n",
    "y, y_test = normalize(y, axis=0, norm='max'), normalize(y_test, axis=0, norm='max') "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "exposed-tunnel",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "'''\n",
    "Solving ridge regression using covariance method\n",
    "XTX : Covariance matrix\n",
    "XTy : Transformed labels\n",
    "'''\n",
    "def cov_solve(XTX, XTy, lam):\n",
    "    sol = np.linalg.inv(XTX + lam*np.identity(len(XTX)))@XTy\n",
    "    return sol.reshape(-1)\n",
    "    \n",
    "    \n",
    "'''\n",
    "Computing ridge loss\n",
    "X : Dataset\n",
    "y : labels\n",
    "beta : slope parameter\n",
    "lam : regularization parameter\n",
    "\n",
    "'''\n",
    "def ridge_loss(X, y, beta, lam):\n",
    "    #mse = 1/(2*len(X))*np.linalg.norm(X@beta - y, ord=2)\n",
    "    mse = 1/(2*len(X))*((X@beta - y) @ (X@beta - y))\n",
    "    #reg = lam/2*np.linalg.norm(beta, ord=2)\n",
    "    reg = lam/2*(beta @ beta)\n",
    "    \n",
    "    #print(\"hi\")\n",
    "    return mse + reg\n",
    "\n",
    "#Generate relevant quantities to dataset\n",
    "XTX = X.T @ X\n",
    "XTy = X.T @ y\n",
    "\n",
    "\n",
    "   \n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "congressional-iraqi",
   "metadata": {},
   "outputs": [],
   "source": [
    "#Optimize linear boundary for target privacy level\n",
    "\n",
    "a = optimize_a(ep_opt, lambda y, z : linear_e_to_t(y, z, delta, l2_sens), l2_sens)\n",
    "g = lambda x : linear_e_to_t(x, a, delta, l2_sens)\n",
    "\n",
    "#lambda expressions for generating iterates \n",
    "gen_iter_bm = lambda x, y : BM(x, y, g)\n",
    "gen_iter_lap_X = lambda x, y : LNR(x, l1_sens_X, y)\n",
    "gen_iter_lap_y = lambda x, y :LNR(x, l1_sens_y, y)\n",
    "\n",
    "'''\n",
    "Generating the solutions to covariance perturbation\n",
    "XTX : Covariance matrix\n",
    "XTy : Transformed labels\n",
    "eps : Privacy parameters\n",
    "g_X : Function for producing noisy covariance\n",
    "g_y : Function for producing noisy XTy\n",
    "'''\n",
    "def generate_solutions(XTX, XTy, eps, g_X, g_y):\n",
    "    dim = len(X[0])\n",
    "    betas_X = g_X(eps, np.zeros(dim**2))\n",
    "    betas_X = [np.triu(beta.reshape((dim, dim))) + np.tril(beta.reshape((dim, dim)).T, -1) for beta in betas_X] #symmetrize noise\n",
    "    betas_y = g_y(eps, np.zeros(dim))\n",
    "    betas = [cov_solve(np.real(scipy.linalg.sqrtm(np.linalg.matrix_power(XTX + b_x, 2))), XTy + b_y.reshape(dim, 1), N*lam) \n",
    "             for b_x, b_y in zip(betas_X, betas_y)]\n",
    "    return betas\n",
    "\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "apparent-airport",
   "metadata": {},
   "outputs": [],
   "source": [
    "'''\n",
    "Plot average loss for BM and LNR over multiple runs + confidence intervals\n",
    "eps : array of privacy levels to examine\n",
    "betas_LNRs : lists of iterates produced by LNR\n",
    "betas_BMs : lists of iterates produced by BM\n",
    "X : dataset \n",
    "y : labels\n",
    "show_conf : whether or not to plot confidence intervals\n",
    "conf : confidence level for confidence intervals\n",
    "iteration : used for file naming\n",
    "'''  \n",
    "def plot_many(eps, betas_LNRs, betas_BMs, X, y, show_conf=True, conf=0.95, iteration=0):\n",
    "    acc_1s = np.array([[ridge_loss(X, y, beta, lam) for beta in betas_LNR] for betas_LNR in betas_LNRs])\n",
    "    acc_2s = np.array([[ridge_loss(X, y, beta, lam) for beta in betas_BM] for betas_BM in betas_BMs])\n",
    "    avg1, avg2 = np.mean(acc_1s, axis=0), np.mean(acc_2s, axis=0)\n",
    "    if show_conf:\n",
    "        bd = t.interval(conf, len(acc_1s) - 1)[1]\n",
    "        std1 = bd*np.std(acc_1s, axis=0)/math.sqrt(len(acc_1s))\n",
    "        std2 =  bd*np.std(acc_2s, axis=0)/math.sqrt(len(acc_2s))\n",
    "        plt.fill_between(eps, avg1 - std1, avg1 + std1, color='b', alpha=0.1)\n",
    "        plt.fill_between(eps, avg2 - std2, avg2 + std2, color='r', alpha=0.1)\n",
    "    plt.plot(eps, avg1 , '-', label=\"LNR\", color='b')\n",
    "    plt.plot(eps, avg2, '--', label=\"BM\", color='r')\n",
    "    plt.xlabel('Privacy Loss', fontsize=13)\n",
    "    plt.ylabel('Loss', fontsize=13)\n",
    "    plt.legend(loc='upper right')\n",
    "    plt.show()\n",
    "    df = pd.DataFrame({\"Privacy Loss\" : eps, \"LNR avg\" : avg1, \"BM avg\" : avg2, \"LNR std\" : std1, \"BM std\" : std2})\n",
    "    #df.to_csv(\"points/ridge_points\" + str(iteration) + \".csv\")\n",
    "    \n",
    "\n",
    "        \n",
    "    \n",
    "    \n",
    "eps = np.array([i*(0.01) + 0.16 for i in range(120)])\n",
    "betas_LNRs = [generate_solutions(XTX, XTy, eps, gen_iter_lap_X, gen_iter_lap_y) for _ in range(1000)]\n",
    "betas_BMs = [generate_solutions(XTX, XTy, eps, gen_iter_bm, gen_iter_bm) for _ in range(1000)]\n",
    "plot_many(eps, betas_LNRs, betas_BMs, X, y.reshape(-1))\n",
    "\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "looking-tours",
   "metadata": {},
   "outputs": [],
   "source": [
    "'''\n",
    "Plot the ex-post privacy distribution associated with LNR and BM\n",
    "eps : array of privacy levels to examine\n",
    "betas_LNRs : lists of iterates produced by LNR\n",
    "betas_BMs : lists of iterates produced by BM\n",
    "X : dataset \n",
    "y : labels\n",
    "sens : sensitivity of utility function\n",
    "conf : confidence level for confidence intervals\n",
    "epsilon : privacy level to use for AboveThreshold\n",
    "target : loss to stop at\n",
    "\n",
    "'''\n",
    "def stopping_distribution(eps, betas_LNRs, betas_BMs, X, y, sens, epsilon=0.5, target=0.025):\n",
    "    acc_1s = np.array([[ridge_loss(X, y, beta, lam) for beta in betas_LNR] for betas_LNR in betas_LNRs])\n",
    "    acc_2s = np.array([[ridge_loss(X, y, beta, lam) for beta in betas_BM] for betas_BM in betas_BMs])\n",
    "    data = []\n",
    "    js1 = [get_bars_basic(-loss, -target) for loss in acc_1s]\n",
    "    js2 = [get_bars_basic(-loss, -target) for loss in acc_2s]\n",
    "    for j1, j2 in zip(js1, js2):\n",
    "        data.append([\"Held-out\", 'LNR', eps[min(j1, len(eps) - 1)]])\n",
    "        data.append([\"Held-out\", 'BM', eps[min(j2, len(eps) - 1)]])\n",
    "    js1 = [AboveThreshold(-target, sens, epsilon, -loss) for loss in acc_1s]\n",
    "    js2 = [AboveThreshold(-target, sens, epsilon, -loss) for loss in acc_2s]\n",
    "    eps_new = eps + epsilon\n",
    "    for j1, j2 in zip(js1, js2):\n",
    "        data.append(['AboveThreshold', 'LNR', eps_new[min(j1, len(eps) - 1)]])\n",
    "        data.append(['AboveThreshold', 'BM', eps_new[min(j2, len(eps) - 1)]])\n",
    "    js1 = [ReducedAboveThreshold(-target, sens, eps, -loss) for loss in acc_1s]\n",
    "    js2 = [ReducedAboveThreshold(-target, sens, eps, -loss) for loss in acc_2s]\n",
    "    eps_new = eps + eps\n",
    "    for j1, j2 in zip(js1, js2):\n",
    "        data.append(['ReducedAboveThreshold', 'LNR', eps_new[min(j1, len(eps) - 1)]])\n",
    "        data.append(['ReducedAboveThreshold', 'BM', eps_new[min(j2, len(eps) - 1)]])\n",
    "    df = pd.DataFrame(data, columns=['Stopping Mechanism', 'Mechanism', 'Privacy Loss'])\n",
    "    #df.to_csv(\"points/violin_points_ridge_05\" + str(target) + \".csv\")\n",
    "    sns.violinplot(x=\"Stopping Mechanism\", y=\"Privacy Loss\", hue=\"Mechanism\", data=df)\n",
    "    plt.show()\n",
    "    \n",
    "    \n",
    "eps = np.array([i*(0.01) + 0.16 for i in range(150)])\n",
    "\n",
    "betas_LNRs = [generate_solutions(XTX, XTy, eps, gen_iter_lap_X, gen_iter_lap_y) for _ in range(1000)]\n",
    "betas_BMs = [generate_solutions(XTX, XTy, eps, gen_iter_bm, gen_iter_bm) for _ in range(1000)]\n",
    "stopping_distribution(eps, betas_LNRs, betas_BMs, X, y.reshape(-1), 1/len(X))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "interesting-nature",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
