{
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.10"
  },
  "orig_nbformat": 2,
  "kernelspec": {
   "name": "python3710jvsc74a57bd00241460ee18b003853f447c23ea6e745f2e4d4842427f079d78627177e285f8e",
   "display_name": "Python 3.7.10 64-bit ('test37': conda)"
  },
  "metadata": {
   "interpreter": {
    "hash": "0241460ee18b003853f447c23ea6e745f2e4d4842427f079d78627177e285f8e"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "source": [
    "# Test-time collective prediction -- (Experiments on real data)"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import math\n",
    "from scipy import linalg\n",
    "\n",
    "# local models\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.linear_model import LinearRegression\n",
    "from sklearn.linear_model import Lasso\n",
    "from sklearn.linear_model import Ridge\n",
    "from sklearn.neural_network import MLPRegressor\n",
    "from sklearn.kernel_ridge import KernelRidge\n",
    "from sklearn.svm import SVR\n",
    "from sklearn.tree import DecisionTreeRegressor\n",
    "from sklearn.neighbors import NearestNeighbors\n",
    "\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.preprocessing import normalize\n",
    "from sklearn.datasets import load_svmlight_file\n",
    "from sklearn.datasets import load_boston\n",
    "\n",
    "import pandas as  pd\n",
    "\n",
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plt\n",
    "from matplotlib import cm\n",
    "import matplotlib.colors as mcolors\n",
    "\n",
    "import ensemble\n",
    "import utils\n",
    "import data_prep"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "# code for running a comparison across different methods for one random train/test/validation split, where data is heterogeneous and partially sorted along the label y\n",
    "\n",
    "def run_benchmark(X_samples, Y_samples, model_type, num_agents, num_samples_test, num_neighbors, frac_ordered, params, num_consensus_iter):\n",
    "\n",
    "    num_features = X_samples.shape[1]\n",
    "    num_samples = X_samples.shape[0]\n",
    "\n",
    "    # permute indices for random train/test/validation split\n",
    "    indices = np.random.permutation(num_samples)\n",
    "\n",
    "    # test data\n",
    "    indices_test  = indices[-num_samples_test:]\n",
    "    X_samples_test = X_samples[indices_test,:]\n",
    "    Y_samples_test = Y_samples[indices_test]\n",
    "\n",
    "    # random validation data, size of one partition\n",
    "    num_samples_validation = int((num_samples - num_samples_test) / (num_agents+1))\n",
    "    indices_validation  = indices[-num_samples_test-num_samples_validation:-num_samples_test]\n",
    "    X_samples_validation = X_samples[indices_validation,:]\n",
    "    Y_samples_validation = Y_samples[indices_validation]\n",
    "\n",
    "    # partition remaining training data across agents\n",
    "    num_samples_train = num_samples -num_samples_test - num_samples_validation\n",
    "\n",
    "    # partition size - p1 is ordered, p2 is random\n",
    "    num_samples_p1 = int(num_samples_train*frac_ordered)\n",
    "    num_samples_p2 = num_samples_train - num_samples_p1\n",
    "\n",
    "    num_samples_local_p1 = int(num_samples_p1/(num_agents))\n",
    "    num_samples_local_p2 = int(num_samples_p2/(num_agents))\n",
    "\n",
    "    num_samples_local = num_samples_local_p1 + num_samples_local_p2\n",
    "\n",
    "    indices_train = indices[0:-num_samples_test-num_samples_validation]\n",
    "\n",
    "    # partitioning of training data\n",
    "    indices_p1 = indices_train[0:num_samples_p1]\n",
    "    indices_p2 = indices_train[num_samples_p1:]\n",
    "\n",
    "    # sort indices according to label value\n",
    "    idxs = np.argsort(Y_samples[indices_p1])\n",
    "    indices_p1 = indices_p1[idxs]\n",
    "\n",
    "    # generate heterogeneous data partitions\n",
    "    training_data = []\n",
    "\n",
    "    for k in range(num_agents):\n",
    "\n",
    "        indices_k1 = indices_p1[k*num_samples_local_p1:(k+1)*num_samples_local_p1]\n",
    "        indices_k2 = indices_p2[k*num_samples_local_p2:(k+1)*num_samples_local_p2]\n",
    "\n",
    "        indices_k = np.append(indices_k1,indices_k2)\n",
    "\n",
    "        this_X = X_samples[indices_k,:]\n",
    "        this_Y = Y_samples[indices_k]\n",
    "        training_data.append([this_X,this_Y])\n",
    "\n",
    "\n",
    "    # init ensemble\n",
    "    agents = ensemble.Ensemble(num_agents, training_data, sparse = data_is_sparse)\n",
    "\n",
    "    # train models\n",
    "    agents.train_local_models(type = model_type, params = params)\n",
    "\n",
    "    # evaluate different schemes on test data\n",
    "    agents_predictions      = np.zeros([num_agents,num_samples_test])\n",
    "    degroot_weights         = np.zeros([num_agents,num_samples_test])\n",
    "    degroot_predictions     = np.zeros(num_samples_test)\n",
    "    avg_predictions         = np.zeros(num_samples_test)\n",
    "\n",
    "    adaptive_cv_prediction  = np.zeros(num_samples_test)\n",
    "    static_cv_prediction    = np.zeros(num_samples_test)\n",
    "\n",
    "    #cross validation static\n",
    "    cv_weights_static = agents.get_inverse_mse_weights(X_samples_validation, Y_samples_validation)\n",
    "\n",
    "    # nearest neighbor classifier for validation data\n",
    "    nn_cv = NearestNeighbors(n_neighbors=num_neighbors)\n",
    "    nn_cv.fit(X_samples_validation, Y_samples_validation)\n",
    "\n",
    "    for i in range(num_samples_test):\n",
    "\n",
    "        test_point = X_samples_test[i]\n",
    "\n",
    "        # individual agent's predictions\n",
    "        these_predictions = agents.predict(test_point)\n",
    "        agents_predictions[:,i] = these_predictions\n",
    "\n",
    "        # CV-static prediction\n",
    "        static_cv_prediction[i] = np.dot(cv_weights_static,these_predictions)\n",
    "\n",
    "        # M-avg prediction\n",
    "        avg_predictions[i] = np.mean(these_predictions)\n",
    "\n",
    "        # find nearest neighbors in validation data\n",
    "        if data_is_sparse:\n",
    "            idx_nn_cv = nn_cv.kneighbors(test_point, return_distance = False)[0]\n",
    "        else:\n",
    "            idx_nn_cv = nn_cv.kneighbors([test_point], return_distance = False)[0]\n",
    "\n",
    "        adaptive_cv_weights = agents.get_inverse_mse_weights(X_samples_validation[idx_nn_cv], Y_samples_validation[idx_nn_cv])\n",
    "\n",
    "        adaptive_cv_prediction[i] = np.dot(adaptive_cv_weights,these_predictions)\n",
    "\n",
    "        # run degroot consensus finding\n",
    "        degroot_output = agents.get_consensus_weights(test_point, num_neighbors, num_degroot_iter = num_consensus_iter)\n",
    "\n",
    "        degroot_w     = degroot_output[0]\n",
    "        degroot_pred  = np.dot(degroot_w,these_predictions)\n",
    "\n",
    "        degroot_predictions[i] = degroot_pred\n",
    "        degroot_weights[:,i]   = degroot_w\n",
    "\n",
    "\n",
    "    # evaluate predictions\n",
    "    errs_avg                 = (avg_predictions - Y_samples_test)**2\n",
    "    errs_degroot             = (degroot_predictions - Y_samples_test)**2\n",
    "    errs_cv_static           = (static_cv_prediction - Y_samples_test)**2\n",
    "    errs_cv_adaptive         = (adaptive_cv_prediction - Y_samples_test)**2\n",
    "\n",
    "    mse_avg         = np.mean(errs_avg)\n",
    "    mse_degroot     = np.mean(errs_degroot)\n",
    "    mse_cv_static   = np.mean(errs_cv_static)\n",
    "    mse_cv_adaptive = np.mean(errs_cv_adaptive)\n",
    "\n",
    "\n",
    "    return mse_avg, mse_degroot, mse_cv_static, mse_cv_adaptive\n"
   ]
  },
  {
   "source": [
    "Example run for Boston dataset. Instructions how to load other datasets and model configurations can be found in the code"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "1 / 10\n",
      "MSE avg:          33.76741281127846\n",
      "MSE DeGroot:      30.563595464685033\n",
      "MSE CV static:    36.56514829227288\n",
      "MSE CV dynamic:   37.60445360255321\n",
      "-----------------\n",
      "2 / 10\n",
      "MSE avg:          29.006897728436574\n",
      "MSE DeGroot:      20.424210744284302\n",
      "MSE CV static:    27.06010022697837\n",
      "MSE CV dynamic:   20.171358830072776\n",
      "-----------------\n",
      "3 / 10\n",
      "MSE avg:          27.001981561503825\n",
      "MSE DeGroot:      20.245193485053633\n",
      "MSE CV static:    26.665767625049735\n",
      "MSE CV dynamic:   20.511014651899377\n",
      "-----------------\n",
      "4 / 10\n",
      "MSE avg:          21.683137905497937\n",
      "MSE DeGroot:      18.88054601490556\n",
      "MSE CV static:    21.67910841195372\n",
      "MSE CV dynamic:   19.655017264346572\n",
      "-----------------\n",
      "5 / 10\n",
      "MSE avg:          31.565842016114892\n",
      "MSE DeGroot:      30.8039933341756\n",
      "MSE CV static:    32.15415860558712\n",
      "MSE CV dynamic:   27.821359624027867\n",
      "-----------------\n",
      "6 / 10\n",
      "MSE avg:          32.83595765990294\n",
      "MSE DeGroot:      30.056615444007843\n",
      "MSE CV static:    32.19383265090715\n",
      "MSE CV dynamic:   29.979896039247905\n",
      "-----------------\n",
      "7 / 10\n",
      "MSE avg:          32.5659964259207\n",
      "MSE DeGroot:      28.454420309623746\n",
      "MSE CV static:    32.77466067548839\n",
      "MSE CV dynamic:   28.600258516897494\n",
      "-----------------\n",
      "8 / 10\n",
      "MSE avg:          38.7016173177995\n",
      "MSE DeGroot:      33.740724873934006\n",
      "MSE CV static:    35.714278930381205\n",
      "MSE CV dynamic:   32.30365879429226\n",
      "-----------------\n",
      "9 / 10\n",
      "MSE avg:          32.56144729870298\n",
      "MSE DeGroot:      27.654543144289775\n",
      "MSE CV static:    31.842649653346758\n",
      "MSE CV dynamic:   29.399404840275782\n",
      "-----------------\n",
      "10 / 10\n",
      "MSE avg:          17.186512128694847\n",
      "MSE DeGroot:      14.168776591711785\n",
      "MSE CV static:    19.699390902432583\n",
      "MSE CV dynamic:   17.66761887619186\n",
      "-----------------\n",
      "+++ SUMMARY +++\n",
      "MSE DeGroot:      25.49926194066713\n",
      "-----------------------\n",
      "relative gain over DG:\n",
      "avg:          -18.063814368978754 3.509981271642719\n",
      "CV static:    -18.53672274479144 3.630551891248681\n",
      "CV dynamic:   -4.453275340465699 3.3380892646936036\n"
     ]
    }
   ],
   "source": [
    "# loacd boston data\n",
    "raw_data = load_boston()\n",
    "X_samples = raw_data['data']\n",
    "Y_samples = raw_data['target']\n",
    "\n",
    "# scale data\n",
    "scaler = StandardScaler()\n",
    "scaler.fit(X_samples)\n",
    "X_samples = scaler.transform(X_samples)\n",
    "\n",
    "# format of data, True if svmlight format\n",
    "data_is_sparse = False\n",
    "\n",
    "# ---------------------------------------------------\n",
    "# when loading any other dataset in sparse libsvm format, set data_is_sparse=True and load the data as follows:\n",
    "# --------\n",
    "# X_samples, Y_samples = load_svmlight_file('path_to_data')\n",
    "# data_is_sparse = True\n",
    "# ---------------------------------------------------\n",
    "    \n",
    "\n",
    "num_features = X_samples.shape[1]\n",
    "num_samples = X_samples.shape[0]\n",
    "\n",
    "\n",
    "# specify configuration to run\n",
    "num_agents = 5\n",
    "num_consensus_iter = 30\n",
    "model_type = 'ridge'\n",
    "num_samples_test = 100\n",
    "num_neighbors = 5\n",
    "frac_ordered = 0.5\n",
    "params = [0.00001]\n",
    "\n",
    "# ---------------------------------------------------\n",
    "# the list of parameters 'params' is custom to every model.\n",
    "# the available models are model_type: ['linear', 'lasso', 'ridge', 'DTR', 'NN']\n",
    "# --------\n",
    "# linear: no parameters needed\n",
    "# lasso : [regularizer]\n",
    "# ridge : [regularizer]\n",
    "# DTR   : [max_num_leafs]\n",
    "# NN    : [alpha, solver, max_iter, hidden_layer_size]\n",
    "# ---------------------------------------------------\n",
    "\n",
    "# run experiment\n",
    "num_reps = 10\n",
    "\n",
    "#record\n",
    "mse_avg = np.zeros(num_reps)\n",
    "mse_degroot = np.zeros(num_reps)\n",
    "mse_cv_static = np.zeros(num_reps)\n",
    "mse_cv_adaptive = np.zeros(num_reps)\n",
    "\n",
    "for rep in range(num_reps):\n",
    "\n",
    "    evaluation = run_benchmark(X_samples, Y_samples, model_type, num_agents, num_samples_test, num_neighbors, frac_ordered, params, num_consensus_iter)\n",
    "\n",
    "    mse_avg[rep]         = evaluation[0]\n",
    "    mse_degroot[rep]     = evaluation[1]\n",
    "    mse_cv_static[rep]   = evaluation[2]\n",
    "    mse_cv_adaptive[rep] = evaluation[3]\n",
    "\n",
    "    print(rep+1,'/',num_reps)\n",
    "    print('MSE avg:         ', mse_avg[rep])\n",
    "    print('MSE DeGroot:     ', mse_degroot[rep])\n",
    "    print('MSE CV static:   ', mse_cv_static[rep])\n",
    "    print('MSE CV dynamic:  ', mse_cv_adaptive[rep])\n",
    "    print('-----------------')\n",
    "\n",
    "\n",
    "print ('+++ SUMMARY +++')\n",
    "print('MSE DeGroot:     ', np.mean(mse_degroot))\n",
    "print('-----------------------')\n",
    "print('relative gain over DG:')\n",
    "\n",
    "gain_avg = (mse_degroot-mse_avg)/mse_degroot\n",
    "gain_cv_static = (mse_degroot-mse_cv_static)/mse_degroot\n",
    "gain_cv_adaptive = (mse_degroot-mse_cv_adaptive)/mse_degroot\n",
    "\n",
    "# gain in %\n",
    "gain_avg *=100\n",
    "gain_cv_static*=100\n",
    "gain_cv_adaptive*=100\n",
    "\n",
    "\n",
    "print('avg:         ', np.mean(gain_avg), np.std(gain_avg)/np.sqrt(num_reps))\n",
    "print('CV static:   ', np.mean(gain_cv_static), np.std(gain_cv_static)/np.sqrt(num_reps))\n",
    "print('CV dynamic:  ', np.mean(gain_cv_adaptive), np.std(gain_cv_adaptive)/np.sqrt(num_reps))\n",
    "\n",
    "\n",
    "\n"
   ]
  }
 ]
}