{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "from utils import introduce_missing_data\n",
    "from utils import normalization, renormalization\n",
    "from utils import compute_normalised_rmse\n",
    "from utils import log_likelihood_model1, log_likelihood_model2, log_likelihood_model3\n",
    "\n",
    "from sklearn.impute import KNNImputer\n",
    "from sklearn.experimental import enable_iterative_imputer\n",
    "from sklearn.impute import IterativeImputer\n",
    "from sklearn.ensemble import ExtraTreesRegressor\n",
    "from sklearn.linear_model import BayesianRidge\n",
    "\n",
    "from scipy.special import softmax\n",
    "from sklearn.metrics.pairwise import nan_euclidean_distances\n",
    "\n",
    "from warnings import simplefilter\n",
    "from sklearn.exceptions import ConvergenceWarning\n",
    "simplefilter(\"ignore\", category=ConvergenceWarning)\n",
    "\n",
    "import os\n",
    "os.chdir(\"GAIN\")\n",
    "from gain import gain\n",
    "os.chdir(\"..\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def select_receivers(norm_miss_data, current_miss_pattern):\n",
    "    \"\"\"Find the rows whose NaN mask is exactly the given missing pattern.\n",
    "    Args:\n",
    "        - norm_miss_data: normalized missing data, shape (n, d)\n",
    "        - current_miss_pattern: current missing pattern, boolean of shape (d, )\n",
    "    Returns:\n",
    "        - id_receivers: array of row ids whose missing entries match the pattern\n",
    "    \"\"\"\n",
    "    nan_mask = np.isnan(norm_miss_data)\n",
    "    # A row qualifies only if it agrees with the pattern on every column.\n",
    "    matches_pattern = np.all(nan_mask == current_miss_pattern, axis=1)\n",
    "    id_receivers = np.where(matches_pattern)[0]\n",
    "    return id_receivers\n",
    "\n",
    "\n",
    "def select_givers(norm_miss_data, current_miss_pattern):\n",
    "    \"\"\"Find the rows that are observed on every column the pattern marks as missing.\n",
    "    Args:\n",
    "        - norm_miss_data: normalized missing data, shape (n, d)\n",
    "        - current_miss_pattern: current missing pattern, boolean of shape (d, )\n",
    "    Returns:\n",
    "        - id_givers: array of row ids usable as potential givers for kNNxKDE\n",
    "    \"\"\"\n",
    "    missing_columns = np.asarray(current_miss_pattern, dtype=bool)\n",
    "    # A giver must hold a value in each column flagged by the pattern;\n",
    "    # the other columns are not checked.\n",
    "    lacks_some_entry = np.isnan(norm_miss_data[:, missing_columns]).any(axis=1)\n",
    "    id_givers = np.where(~lacks_some_entry)[0]\n",
    "    return id_givers\n",
    "    \n",
    "\n",
    "def kNNxKDE(norm_miss_data, h=0.03, tau=50.0, nb_draws=10000):\n",
    "    \"\"\"The kNNxKDE algorithm: sample imputations from neighbouring observations.\n",
    "\n",
    "    Rows are processed one missing pattern at a time. For each receiver (row\n",
    "    with that pattern), givers are drawn with probability\n",
    "    softmax(-tau * distance) and Gaussian noise of standard deviation h is\n",
    "    added to the givers' values on the missing columns.\n",
    "    Args:\n",
    "        - norm_miss_data: normalized missing data, shape (n, d)\n",
    "        - h: standard deviation (bandwidth) of the Gaussian kernel KDE (default=0.03)\n",
    "        - tau: temperature for the softmax (default=50.0)\n",
    "        - nb_draws: number of draws per missing entry (default=10000)\n",
    "    Returns:\n",
    "        - imputed_samples: a dictionary using tuples for key. The entry in\n",
    "        imputed_samples[(i, j)] is a numpy array with nb_draws samples,\n",
    "        representing the estimated distribution for the missing cell (i, j).\n",
    "        Cells that cannot be imputed (row with only missing values, no giver,\n",
    "        or no giver at a finite distance) have no entry.\n",
    "    \"\"\"\n",
    "    (n, d) = norm_miss_data.shape\n",
    "    all_miss_patterns = np.unique(np.isnan(norm_miss_data), axis=0)\n",
    "    imputed_samples = dict()\n",
    "\n",
    "    for current_miss_pattern in all_miss_patterns:\n",
    "        if not current_miss_pattern.any():  # if there is no missing value\n",
    "            continue\n",
    "        if current_miss_pattern.all():  # if there are only missing values\n",
    "            print(f\"Not done: {current_miss_pattern}\")\n",
    "            continue\n",
    "\n",
    "        id_receivers = select_receivers(norm_miss_data, current_miss_pattern)\n",
    "        id_givers = select_givers(norm_miss_data, current_miss_pattern)\n",
    "        if len(id_givers) == 0:\n",
    "            # np.random.choice cannot draw from an empty set of givers.\n",
    "            print(f\"Not done (no giver): {current_miss_pattern}\")\n",
    "            continue\n",
    "\n",
    "        data_receivers = norm_miss_data[id_receivers]\n",
    "        data_givers = norm_miss_data[id_givers]\n",
    "\n",
    "        d_ij = nan_euclidean_distances(data_receivers, data_givers)\n",
    "        d_ij[np.isnan(d_ij)] = np.inf  # givers with an undefined distance get zero weight\n",
    "\n",
    "        # A receiver whose distances are all infinite would get NaN softmax\n",
    "        # weights, which np.random.choice rejects: leave it unimputed.\n",
    "        reachable = np.isfinite(d_ij).any(axis=1)\n",
    "        if not reachable.all():\n",
    "            print(f\"Not done (no comparable giver): rows {id_receivers[~reachable]}\")\n",
    "            id_receivers = id_receivers[reachable]\n",
    "            d_ij = d_ij[reachable]\n",
    "        p_ij = softmax(- tau * d_ij, axis=1)\n",
    "\n",
    "        # Only the missing columns are imputed, so only those are sampled.\n",
    "        miss_cols = [i2 for i2 in range(d) if current_miss_pattern[i2]]\n",
    "        givers_miss_values = data_givers[:, miss_cols]\n",
    "        for i1, id_receiver in enumerate(id_receivers):\n",
    "            neighbors = np.random.choice(len(id_givers), p=p_ij[i1], size=nb_draws)  # Corresponding shuffled id\n",
    "            noise = np.random.normal(loc=0.0, scale=h, size=(nb_draws, len(miss_cols)))\n",
    "            current_sample = givers_miss_values[neighbors] + noise\n",
    "            for k, i2 in enumerate(miss_cols):\n",
    "                imputed_samples[(id_receiver, i2)] = current_sample[:, k]\n",
    "\n",
    "    return imputed_samples"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Compute RMSE"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## kNN Imputer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "RMSE = 0.07540325867070136 +/- 0.004947223142806455\n",
      "RMSE = 0.1920195989577528 +/- 0.010812652907239113\n",
      "RMSE = 0.29458463864267553 +/- 0.010379280572216222\n"
     ]
    }
   ],
   "source": [
    "# Settings: missing rate, number of neighbours per dataset, number of repetitions.\n",
    "miss_rate = 0.2\n",
    "knn_hyperparams = [30, 30, 75]\n",
    "nb_repeat = 100\n",
    "store_rmse = np.zeros((3, nb_repeat))  # one row per dataset, one column per repetition\n",
    "\n",
    "for i1, nb_n in enumerate(knn_hyperparams):\n",
    "    original_data = np.genfromtxt(f\"datasets/dataset{i1+1}.csv\", delimiter=\",\")\n",
    "    for i2 in range(nb_repeat):\n",
    "        miss_data = introduce_missing_data(original_data, miss_rate)\n",
    "        # The complete data is normalized with the parameters returned for the\n",
    "        # data with missing entries, so both live on the same scale.\n",
    "        norm_miss_data, norm_params = normalization(data=miss_data)\n",
    "        norm_original_data, _ = normalization(data=original_data, parameters=norm_params)\n",
    "        norm_imputed_data = KNNImputer(n_neighbors=nb_n).fit_transform(norm_miss_data)\n",
    "        store_rmse[i1, i2] = compute_normalised_rmse(norm_original_data, norm_miss_data, norm_imputed_data)\n",
    "\n",
    "# Mean and standard deviation of the RMSE over the repetitions, per dataset.\n",
    "for rmse_values in store_rmse:\n",
    "    rmse_mean = np.mean(rmse_values)\n",
    "    rmse_std = np.std(rmse_values)\n",
    "    print(f\"RMSE = {rmse_mean} +/- {rmse_std}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## MissForest"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "RMSE = 0.09735937560720499 +/- 0.00540467130498196\n",
      "RMSE = 0.2521386604101026 +/- 0.01762815675182584\n",
      "RMSE = 0.38362173351880985 +/- 0.019458300359581255\n"
     ]
    }
   ],
   "source": [
    "# Settings: missing rate, number of trees per dataset, number of repetitions.\n",
    "miss_rate = 0.2\n",
    "missforest_hyperparams = [10, 30, 30]\n",
    "nb_repeat = 100\n",
    "store_rmse = np.zeros((3, nb_repeat))  # one row per dataset, one column per repetition\n",
    "\n",
    "for i1, nb_t in enumerate(missforest_hyperparams):\n",
    "    original_data = np.genfromtxt(f\"datasets/dataset{i1+1}.csv\", delimiter=\",\")\n",
    "    for i2 in range(nb_repeat):\n",
    "        print(i2, end=\"\\r\")\n",
    "        miss_data = introduce_missing_data(original_data, miss_rate)\n",
    "        norm_miss_data, norm_params = normalization(data=miss_data)\n",
    "        norm_original_data, _ = normalization(data=original_data, parameters=norm_params)\n",
    "        # Iterative imputation with an extra-trees regressor as the estimator.\n",
    "        imputer = IterativeImputer(\n",
    "            estimator=ExtraTreesRegressor(n_estimators=nb_t),\n",
    "            max_iter=10,\n",
    "            tol=1e-1,\n",
    "            verbose=0,\n",
    "        )\n",
    "        norm_imputed_data = imputer.fit_transform(norm_miss_data)\n",
    "        store_rmse[i1, i2] = compute_normalised_rmse(norm_original_data, norm_miss_data, norm_imputed_data)\n",
    "\n",
    "# Mean and standard deviation of the RMSE over the repetitions, per dataset.\n",
    "for rmse_values in store_rmse:\n",
    "    rmse_mean = np.mean(rmse_values)\n",
    "    rmse_std = np.std(rmse_values)\n",
    "    print(f\"RMSE = {rmse_mean} +/- {rmse_std}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## MICE"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "RMSE = 0.0751250665448411 +/- 0.004179133285224316\n",
      "RMSE = 0.25009146471263316 +/- 0.009249836443926393\n",
      "RMSE = 0.2942408332543697 +/- 0.010172650908255872\n"
     ]
    }
   ],
   "source": [
    "# Settings: missing rate and number of repetitions.\n",
    "miss_rate = 0.2\n",
    "nb_repeat = 100\n",
    "store_rmse = np.zeros((3, nb_repeat))  # one row per dataset, one column per repetition\n",
    "\n",
    "for i1 in range(3):\n",
    "    original_data = np.genfromtxt(f\"datasets/dataset{i1+1}.csv\", delimiter=\",\")\n",
    "    for i2 in range(nb_repeat):\n",
    "        print(i2, end=\"\\r\")\n",
    "        miss_data = introduce_missing_data(original_data, miss_rate)\n",
    "        norm_miss_data, norm_params = normalization(data=miss_data)\n",
    "        norm_original_data, _ = normalization(data=original_data, parameters=norm_params)\n",
    "        # Iterative imputation with a Bayesian ridge regressor as the estimator.\n",
    "        imputer = IterativeImputer(\n",
    "            estimator=BayesianRidge(),\n",
    "            max_iter=10,\n",
    "            tol=1e-1,\n",
    "            verbose=0,\n",
    "        )\n",
    "        norm_imputed_data = imputer.fit_transform(norm_miss_data)\n",
    "        store_rmse[i1, i2] = compute_normalised_rmse(norm_original_data, norm_miss_data, norm_imputed_data)\n",
    "\n",
    "# Mean and standard deviation of the RMSE over the repetitions, per dataset.\n",
    "for rmse_values in store_rmse:\n",
    "    rmse_mean = np.mean(rmse_values)\n",
    "    rmse_std = np.std(rmse_values)\n",
    "    print(f\"RMSE = {rmse_mean} +/- {rmse_std}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## GAIN"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n",
      "19\n",
      "29\n",
      "RMSE = 0.2275972734748303 +/- 0.025683626160411883\n",
      "RMSE = 0.27124040063247357 +/- 0.023205178647101532\n",
      "RMSE = 0.30905015714580414 +/- 0.02686275044093801\n"
     ]
    }
   ],
   "source": [
    "# Settings: missing rate, GAIN training iterations per dataset, base GAIN\n",
    "# parameters and number of repetitions.\n",
    "miss_rate = 0.2\n",
    "gain_nb_iter_hyperparams = [500, 200, 100]\n",
    "gain_parameters = {\"batch_size\": 128, \"hint_rate\": 0.9, \"alpha\": 100, \"iterations\": 10000}\n",
    "nb_repeat = 30\n",
    "store_rmse = np.zeros((3, nb_repeat))  # one row per dataset, one column per repetition\n",
    "\n",
    "for i1 in range(3):\n",
    "    print(i1)\n",
    "    original_data = np.genfromtxt(f\"datasets/dataset{i1+1}.csv\", delimiter=\",\")\n",
    "    nb_i = gain_nb_iter_hyperparams[i1]\n",
    "    for i2 in range(nb_repeat):\n",
    "        print(i2, end=\"\\r\")\n",
    "        miss_data = introduce_missing_data(original_data, miss_rate)\n",
    "        norm_miss_data, norm_params = normalization(data=miss_data)\n",
    "        norm_original_data, _ = normalization(data=original_data, parameters=norm_params)\n",
    "        # Use the iteration count chosen for this dataset. It used to be\n",
    "        # hard-coded to 500, which left gain_nb_iter_hyperparams unused.\n",
    "        gain_parameters[\"iterations\"] = nb_i\n",
    "        norm_imputed_data = gain(norm_miss_data, gain_parameters)\n",
    "        rmse = compute_normalised_rmse(norm_original_data, norm_miss_data, norm_imputed_data)\n",
    "        store_rmse[i1, i2] = rmse\n",
    "\n",
    "# Mean and standard deviation of the RMSE over the repetitions, per dataset.\n",
    "for i1 in range(3):\n",
    "    rmse_mean = np.mean(store_rmse[i1])\n",
    "    rmse_std = np.std(store_rmse[i1])\n",
    "    print(f\"RMSE = {rmse_mean} +/- {rmse_std}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## kNNxKDE"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n",
      "19\n",
      "29\n",
      "RMSE = 0.11123003848352793 +/- 0.0054816001112183935\n",
      "RMSE = 0.2669811050107605 +/- 0.016792831483929575\n",
      "RMSE = 0.41894524211120165 +/- 0.023608927311569963\n"
     ]
    }
   ],
   "source": [
    "# Settings: missing rate, samples drawn per missing cell, number of repetitions.\n",
    "miss_rate = 0.2\n",
    "nb_draws = 10000\n",
    "nb_repeat = 100\n",
    "store_rmse = np.zeros((3, nb_repeat))  # one row per dataset, one column per repetition\n",
    "\n",
    "for i1 in range(3):\n",
    "    print(i1)\n",
    "    original_data = np.genfromtxt(f\"datasets/dataset{i1+1}.csv\", delimiter=\",\")\n",
    "    for i2 in range(nb_repeat):\n",
    "        print(i2, end=\"\\r\")\n",
    "        miss_data = introduce_missing_data(original_data, miss_rate)\n",
    "        norm_miss_data, norm_params = normalization(data=miss_data)\n",
    "        norm_original_data, _ = normalization(data=original_data, parameters=norm_params)\n",
    "\n",
    "        # Pass nb_draws explicitly so the sampler and this cell always agree\n",
    "        # on the number of samples available per missing cell.\n",
    "        imputed_samples = kNNxKDE(norm_miss_data, nb_draws=nb_draws)\n",
    "        norm_imputed_data = np.copy(norm_miss_data)\n",
    "        for (row, col), samples in imputed_samples.items():\n",
    "            # Impute each cell with one of its samples, picked uniformly at\n",
    "            # random. A scalar index gives a scalar value, so no size-1 array\n",
    "            # is assigned to a single cell.\n",
    "            r = np.random.randint(low=0, high=len(samples))\n",
    "            norm_imputed_data[row, col] = samples[r]\n",
    "\n",
    "        rmse = compute_normalised_rmse(norm_original_data, norm_miss_data, norm_imputed_data)\n",
    "        store_rmse[i1, i2] = rmse\n",
    "\n",
    "# Mean and standard deviation of the RMSE over the repetitions, per dataset.\n",
    "for i1 in range(3):\n",
    "    rmse_mean = np.mean(store_rmse[i1])\n",
    "    rmse_std = np.std(store_rmse[i1])\n",
    "    print(f\"RMSE = {rmse_mean} +/- {rmse_std}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Compute log-Likelihood"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## kNN Imputer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n",
      "19\n",
      "29\n",
      "Loglik = 496.1320191214765 +/- 9.250675379671735\n",
      "Loglik = -2237.71167344662 +/- 323.97047008204146\n",
      "Loglik = -2287.2954099247713 +/- 190.17305494818507\n"
     ]
    }
   ],
   "source": [
    "# Settings: missing rate, number of neighbours per dataset, number of repetitions.\n",
    "miss_rate = 0.2\n",
    "knn_hyperparams = [30, 30, 75]\n",
    "nb_repeat = 100\n",
    "store_loglik = np.zeros((3, nb_repeat))  # one row per dataset, one column per repetition\n",
    "# Log-likelihood function of each dataset, in dataset order.\n",
    "loglik_models = [log_likelihood_model1, log_likelihood_model2, log_likelihood_model3]\n",
    "\n",
    "for i1, nb_n in enumerate(knn_hyperparams):\n",
    "    print(i1)\n",
    "    original_data = np.genfromtxt(f\"datasets/dataset{i1+1}.csv\", delimiter=\",\")\n",
    "    for i2 in range(nb_repeat):\n",
    "        print(i2, end=\"\\r\")\n",
    "        miss_data = introduce_missing_data(original_data, miss_rate)\n",
    "        norm_miss_data, norm_params = normalization(data=miss_data)\n",
    "        norm_original_data, _ = normalization(data=original_data, parameters=norm_params)\n",
    "        norm_imputed_data = KNNImputer(n_neighbors=nb_n).fit_transform(norm_miss_data)\n",
    "        # The log-likelihood is evaluated on the renormalized imputed data.\n",
    "        renorm_imputed_data = renormalization(norm_imputed_data, norm_params)\n",
    "        loglik = loglik_models[i1](renorm_imputed_data)\n",
    "        store_loglik[i1, i2] = np.sum(loglik)\n",
    "\n",
    "# Mean and standard deviation of the summed log-likelihood, per dataset.\n",
    "for loglik_values in store_loglik:\n",
    "    loglik_mean = np.mean(loglik_values)\n",
    "    loglik_std = np.std(loglik_values)\n",
    "    print(f\"Loglik = {loglik_mean} +/- {loglik_std}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## MissForest"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n",
      "19\n",
      "29\n",
      "Loglik = 450.24213287345367 +/- 12.565013556525123\n",
      "Loglik = -541.9900201989471 +/- 137.65188436806372\n",
      "Loglik = -864.3169105687779 +/- 88.14096782134088\n"
     ]
    }
   ],
   "source": [
    "# Settings: missing rate, number of trees per dataset, number of repetitions.\n",
    "miss_rate = 0.2\n",
    "missforest_hyperparams = [10, 30, 30]\n",
    "nb_repeat = 100\n",
    "store_loglik = np.zeros((3, nb_repeat))  # one row per dataset, one column per repetition\n",
    "# Log-likelihood function of each dataset, in dataset order.\n",
    "loglik_models = [log_likelihood_model1, log_likelihood_model2, log_likelihood_model3]\n",
    "\n",
    "for i1, nb_t in enumerate(missforest_hyperparams):\n",
    "    print(i1)\n",
    "    original_data = np.genfromtxt(f\"datasets/dataset{i1+1}.csv\", delimiter=\",\")\n",
    "    for i2 in range(nb_repeat):\n",
    "        print(i2, end=\"\\r\")\n",
    "        miss_data = introduce_missing_data(original_data, miss_rate)\n",
    "        norm_miss_data, norm_params = normalization(data=miss_data)\n",
    "        norm_original_data, _ = normalization(data=original_data, parameters=norm_params)\n",
    "        # Iterative imputation with an extra-trees regressor as the estimator.\n",
    "        imputer = IterativeImputer(\n",
    "            estimator=ExtraTreesRegressor(n_estimators=nb_t),\n",
    "            max_iter=10,\n",
    "            tol=1e-1,\n",
    "            verbose=0,\n",
    "        )\n",
    "        norm_imputed_data = imputer.fit_transform(norm_miss_data)\n",
    "        # The log-likelihood is evaluated on the renormalized imputed data.\n",
    "        renorm_imputed_data = renormalization(norm_imputed_data, norm_params)\n",
    "        loglik = loglik_models[i1](renorm_imputed_data)\n",
    "        store_loglik[i1, i2] = np.sum(loglik)\n",
    "\n",
    "# Mean and standard deviation of the summed log-likelihood, per dataset.\n",
    "for loglik_values in store_loglik:\n",
    "    loglik_mean = np.mean(loglik_values)\n",
    "    loglik_std = np.std(loglik_values)\n",
    "    print(f\"Loglik = {loglik_mean} +/- {loglik_std}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## MICE"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n",
      "19\n",
      "29\n",
      "Loglik = 494.27942265612944 +/- 10.225314135572209\n",
      "Loglik = -2656.7470863783556 +/- 242.12743533610094\n",
      "Loglik = -2334.3005677029937 +/- 209.7526135279624\n"
     ]
    }
   ],
   "source": [
    "# Settings: missing rate and number of repetitions.\n",
    "miss_rate = 0.2\n",
    "nb_repeat = 100\n",
    "store_loglik = np.zeros((3, nb_repeat))  # one row per dataset, one column per repetition\n",
    "# Log-likelihood function of each dataset, in dataset order.\n",
    "loglik_models = [log_likelihood_model1, log_likelihood_model2, log_likelihood_model3]\n",
    "\n",
    "for i1 in range(3):\n",
    "    print(i1)\n",
    "    original_data = np.genfromtxt(f\"datasets/dataset{i1+1}.csv\", delimiter=\",\")\n",
    "    for i2 in range(nb_repeat):\n",
    "        print(i2, end=\"\\r\")\n",
    "        miss_data = introduce_missing_data(original_data, miss_rate)\n",
    "        norm_miss_data, norm_params = normalization(data=miss_data)\n",
    "        norm_original_data, _ = normalization(data=original_data, parameters=norm_params)\n",
    "        # Iterative imputation with a Bayesian ridge regressor as the estimator.\n",
    "        imputer = IterativeImputer(\n",
    "            estimator=BayesianRidge(),\n",
    "            max_iter=10,\n",
    "            tol=1e-1,\n",
    "            verbose=0,\n",
    "        )\n",
    "        norm_imputed_data = imputer.fit_transform(norm_miss_data)\n",
    "        # The log-likelihood is evaluated on the renormalized imputed data.\n",
    "        renorm_imputed_data = renormalization(norm_imputed_data, norm_params)\n",
    "        loglik = loglik_models[i1](renorm_imputed_data)\n",
    "        store_loglik[i1, i2] = np.sum(loglik)\n",
    "\n",
    "# Mean and standard deviation of the summed log-likelihood, per dataset.\n",
    "for loglik_values in store_loglik:\n",
    "    loglik_mean = np.mean(loglik_values)\n",
    "    loglik_std = np.std(loglik_values)\n",
    "    print(f\"Loglik = {loglik_mean} +/- {loglik_std}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## GAIN"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n",
      "19\n",
      "29\n",
      "Loglik = -233.5627610356465 +/- 231.16474959259867\n",
      "Loglik = -1481.933814971247 +/- 600.4347214963981\n",
      "Loglik = -2117.466663273708 +/- 318.56324620585326\n"
     ]
    }
   ],
   "source": [
    "# Settings: missing rate, GAIN training iterations per dataset, base GAIN\n",
    "# parameters and number of repetitions.\n",
    "miss_rate = 0.2\n",
    "gain_nb_iter_hyperparams = [500, 200, 100]\n",
    "gain_parameters = {\"batch_size\": 128, \"hint_rate\": 0.9, \"alpha\": 100, \"iterations\": 10000}\n",
    "nb_repeat = 30\n",
    "store_loglik = np.zeros((3, nb_repeat))  # one row per dataset, one column per repetition\n",
    "# Log-likelihood function of each dataset, in dataset order.\n",
    "loglik_models = [log_likelihood_model1, log_likelihood_model2, log_likelihood_model3]\n",
    "\n",
    "for i1 in range(3):\n",
    "    print(i1)\n",
    "    original_data = np.genfromtxt(f\"datasets/dataset{i1+1}.csv\", delimiter=\",\")\n",
    "    nb_i = gain_nb_iter_hyperparams[i1]\n",
    "    for i2 in range(nb_repeat):\n",
    "        print(i2, end=\"\\r\")\n",
    "        miss_data = introduce_missing_data(original_data, miss_rate)\n",
    "        norm_miss_data, norm_params = normalization(data=miss_data)\n",
    "        norm_original_data, _ = normalization(data=original_data, parameters=norm_params)\n",
    "        # Use the iteration count chosen for this dataset. It used to be\n",
    "        # hard-coded to 500, which left gain_nb_iter_hyperparams unused.\n",
    "        gain_parameters[\"iterations\"] = nb_i\n",
    "        norm_imputed_data = gain(norm_miss_data, gain_parameters)\n",
    "        # The log-likelihood is evaluated on the renormalized imputed data.\n",
    "        renorm_imputed_data = renormalization(norm_imputed_data, norm_params)\n",
    "        loglik = loglik_models[i1](renorm_imputed_data)\n",
    "        store_loglik[i1, i2] = np.sum(loglik)\n",
    "\n",
    "# Mean and standard deviation of the summed log-likelihood, per dataset.\n",
    "for i1 in range(3):\n",
    "    loglik_mean = np.mean(store_loglik[i1])\n",
    "    loglik_std = np.std(store_loglik[i1])\n",
    "    print(f\"Loglik = {loglik_mean} +/- {loglik_std}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## kNNxKDE"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n",
      "19\n",
      "29\n",
      "Loglik = 408.2450310010284 +/- 15.436620339720738\n",
      "Loglik = -53.94334090660625 +/- 33.489889782775336\n",
      "Loglik = -509.2175786217365 +/- 15.143589932307346\n"
     ]
    }
   ],
   "source": [
    "# Settings: missing rate, samples drawn per missing cell, number of repetitions.\n",
    "miss_rate = 0.2\n",
    "nb_draws = 10000\n",
    "nb_repeat = 100\n",
    "store_loglik = np.zeros((3, nb_repeat))  # one row per dataset, one column per repetition\n",
    "# Log-likelihood function of each dataset, in dataset order.\n",
    "loglik_models = [log_likelihood_model1, log_likelihood_model2, log_likelihood_model3]\n",
    "\n",
    "for i1 in range(3):\n",
    "    print(i1)\n",
    "    original_data = np.genfromtxt(f\"datasets/dataset{i1+1}.csv\", delimiter=\",\")\n",
    "    for i2 in range(nb_repeat):\n",
    "        print(i2, end=\"\\r\")\n",
    "        miss_data = introduce_missing_data(original_data, miss_rate)\n",
    "        norm_miss_data, norm_params = normalization(data=miss_data)\n",
    "        norm_original_data, _ = normalization(data=original_data, parameters=norm_params)\n",
    "\n",
    "        # Pass nb_draws explicitly so the sampler and this cell always agree\n",
    "        # on the number of samples available per missing cell.\n",
    "        imputed_samples = kNNxKDE(norm_miss_data, nb_draws=nb_draws)\n",
    "        norm_imputed_data = np.copy(norm_miss_data)\n",
    "        for (row, col), samples in imputed_samples.items():\n",
    "            # Impute each cell with one of its samples, picked uniformly at\n",
    "            # random. A scalar index gives a scalar value, so no size-1 array\n",
    "            # is assigned to a single cell.\n",
    "            r = np.random.randint(low=0, high=len(samples))\n",
    "            norm_imputed_data[row, col] = samples[r]\n",
    "\n",
    "        # The log-likelihood is evaluated on the renormalized imputed data.\n",
    "        renorm_imputed_data = renormalization(norm_imputed_data, norm_params)\n",
    "        loglik = loglik_models[i1](renorm_imputed_data)\n",
    "        store_loglik[i1, i2] = np.sum(loglik)\n",
    "\n",
    "# Mean and standard deviation of the summed log-likelihood, per dataset.\n",
    "for i1 in range(3):\n",
    "    loglik_mean = np.mean(store_loglik[i1])\n",
    "    loglik_std = np.std(store_loglik[i1])\n",
    "    print(f\"Loglik = {loglik_mean} +/- {loglik_std}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Reference log-likelihood for the three datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "425.36661990665294\n",
      "78.7930296266803\n",
      "-480.7746785296445\n"
     ]
    }
   ],
   "source": [
    "# Summed log-likelihood of each complete dataset under its own model.\n",
    "reference_cases = [\n",
    "    (\"datasets/dataset1.csv\", log_likelihood_model1),\n",
    "    (\"datasets/dataset2.csv\", log_likelihood_model2),\n",
    "    (\"datasets/dataset3.csv\", log_likelihood_model3),\n",
    "]\n",
    "for dataset_path, loglik_model in reference_cases:\n",
    "    original_data = np.genfromtxt(dataset_path, delimiter=\",\")\n",
    "    print(np.sum(loglik_model(original_data)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
