{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "ongoing-sally",
   "metadata": {},
   "source": [
    "# Statsmodels\n",
    "* https://www.kaggle.com/ojwatson/mixed-models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "stunning-great",
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "\n",
    "import json\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import statsmodels.api as sm\n",
    "import statsmodels.formula.api as smf\n",
    "from statsmodels.tools.sm_exceptions import ConvergenceWarning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "senior-architecture",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "           Mixed Linear Model Regression Results\n",
      "===========================================================\n",
      "Model:             MixedLM  Dependent Variable:  Weight    \n",
      "No. Observations:  861      Method:              REML      \n",
      "No. Groups:        72       Scale:               6.0372    \n",
      "Min. group size:   11       Log-Likelihood:      -2217.0475\n",
      "Max. group size:   12       Converged:           Yes       \n",
      "Mean group size:   12.0                                    \n",
      "-----------------------------------------------------------\n",
      "                 Coef.  Std.Err.   z    P>|z| [0.025 0.975]\n",
      "-----------------------------------------------------------\n",
      "Intercept        15.739    0.550 28.603 0.000 14.660 16.817\n",
      "Time              6.939    0.080 86.925 0.000  6.783  7.095\n",
      "Group Var        19.503    1.561                           \n",
      "Group x Time Cov  0.294    0.153                           \n",
      "Time Var          0.416    0.033                           \n",
      "===========================================================\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Mean weight is modelled as a linear function of time, with a random intercept\n",
    "# and a random slope on Time for each pig (re_formula=\"~Time\"); the summary\n",
    "# therefore reports Group Var, Time Var and their covariance.\n",
    "data = sm.datasets.get_rdataset('dietox', 'geepack').data\n",
    "md = smf.mixedlm(\"Weight ~ Time\", data, groups=data[\"Pig\"], re_formula=\"~Time\")\n",
    "mdf = md.fit(method=[\"lbfgs\"])\n",
    "print(mdf.summary())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "designed-tonight",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Root-mean-squared error of the fit on the training data.\n",
    "# `mdf.resid` is the observed response minus `mdf.fittedvalues`, so this is an\n",
    "# actual error metric rather than the fitted values themselves.\n",
    "rmse = np.sqrt(np.mean(np.square(mdf.resid)))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "exclusive-sword",
   "metadata": {},
   "source": [
    "# Statsmodels on the FQI part"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "official-community",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the train/test tuples from JSON files in the working directory.\n",
    "# JSON files are normally UTF-8, so the encoding is pinned instead of relying on\n",
    "# the platform default used by open().\n",
    "with open('train_tuples.json', 'r', encoding='utf-8') as f:\n",
    "    train_dict = json.load(f)\n",
    "with open('test_tuples.json', 'r', encoding='utf-8') as f:\n",
    "    test_dict = json.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "immediate-bernard",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['a0', 'a1', 'r', 'ds', 's0', 's1', 's2', 's3', 's4', 's5', 's6', 's7',\n",
       "       's8', 's9'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Build one DataFrame per split from the loaded JSON objects, then list the columns.\n",
    "train_df, test_df = (pd.DataFrame.from_dict(split) for split in (train_dict, test_dict))\n",
    "train_df.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "lined-attempt",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Response is column 'r', predictors are the two 'a*' and ten 's*' columns,\n",
    "# and the grouping variable is column 'ds'.\n",
    "feature_cols = ['a0', 'a1'] + [f's{i}' for i in range(10)]\n",
    "y, groups = train_df['r'], train_df['ds']\n",
    "X = train_df[feature_cols]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "vietnamese-engineering",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/aishwaryamandyam/anaconda3/envs/tf/lib/python3.7/site-packages/statsmodels/base/model.py:568: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals\n",
      "  ConvergenceWarning)\n",
      "/Users/aishwaryamandyam/anaconda3/envs/tf/lib/python3.7/site-packages/statsmodels/regression/mixed_linear_model.py:2202: ConvergenceWarning: Retrying MixedLM optimization with lbfgs\n",
      "  ConvergenceWarning)\n",
      "/Users/aishwaryamandyam/anaconda3/envs/tf/lib/python3.7/site-packages/statsmodels/regression/mixed_linear_model.py:2237: ConvergenceWarning: The MLE may be on the boundary of the parameter space.\n",
      "  warnings.warn(msg, ConvergenceWarning)\n",
      "/Users/aishwaryamandyam/anaconda3/envs/tf/lib/python3.7/site-packages/statsmodels/regression/mixed_linear_model.py:2261: ConvergenceWarning: The Hessian matrix at the estimated parameter values is not positive definite.\n",
      "  warnings.warn(msg, ConvergenceWarning)\n"
     ]
    }
   ],
   "source": [
    "# Linear mixed model of y on X, grouped by `groups`. X carries no constant column,\n",
    "# so no fixed intercept is estimated; no exog_re is passed, and the fitted\n",
    "# parameters contain a single 'Group Var' term.\n",
    "model = sm.MixedLM(y, X, groups)\n",
    "result = model.fit()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "harmful-wilson",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Predict on the test split using the same twelve predictor columns as in training.\n",
    "test_features = ['a0', 'a1'] + [f's{i}' for i in range(10)]\n",
    "testX = test_df[test_features]\n",
    "predictions = result.predict(exog=testX)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "stunning-corporation",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 4.42198702,  9.03359092, 26.39228657, ..., 21.49984278,\n",
       "        6.12003501, 15.71244469])"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sanity check: predictions should equal the fixed-effects linear predictor x @ beta\n",
    "# (the design matrix has no constant column, so there is no separate intercept term).\n",
    "fe_coeffs = np.asarray(result.fe_params)\n",
    "# Row vector of shape (1, n_features); -1 infers the width instead of hard-coding 12.\n",
    "fe_coeffs = np.reshape(fe_coeffs, (1, -1))\n",
    "x_test = testX.to_numpy()\n",
    "# Shape (1, n_test): row 0 holds one manual prediction per test row.\n",
    "manual_preds = np.dot(fe_coeffs, x_test.T)\n",
    "manual_preds[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "fitted-arbitration",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Report every row where the model prediction and the manual dot product disagree.\n",
    "# Compare numerically with a tolerance: matching the printed float strings would\n",
    "# flag differences that are only floating-point round-off. equal_nan=True keeps a\n",
    "# pair of NaNs treated as a match, as the string comparison did.\n",
    "for dp, mp in zip(predictions, manual_preds[0]):\n",
    "    if not np.isclose(dp, mp, equal_nan=True):\n",
    "        print(\"Pred: \" + str(dp) + \" Sanity: \" + str(mp))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "automotive-database",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "a0           5.817828\n",
       "a1           1.775593\n",
       "s0           1.321432\n",
       "s1           3.118893\n",
       "s2           5.576449\n",
       "s3          -0.708489\n",
       "s4          -1.752976\n",
       "s5           1.096265\n",
       "s6           1.588758\n",
       "s7          -0.882082\n",
       "s8          -7.679591\n",
       "s9          -0.603794\n",
       "Group Var    0.065379\n",
       "dtype: float64"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Full parameter vector of the fit: the fixed-effect coefficients plus the 'Group Var' entry.\n",
    "result.params"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "behavioral-swing",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "a0    5.817828\n",
       "a1    1.775593\n",
       "s0    1.321432\n",
       "s1    3.118893\n",
       "s2    5.576449\n",
       "s3   -0.708489\n",
       "s4   -1.752976\n",
       "s5    1.096265\n",
       "s6    1.588758\n",
       "s7   -0.882082\n",
       "s8   -7.679591\n",
       "s9   -0.603794\n",
       "dtype: float64"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Fixed-effect coefficients only (one per predictor column).\n",
    "result.fe_params"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "lightweight-cemetery",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{0: Group Var    0.007656\n",
       " dtype: float64,\n",
       " 1: Group Var    0.000603\n",
       " dtype: float64}"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Random effects keyed by group label; the output has entries for groups 0 and 1 only.\n",
    "result.random_effects"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "subjective-twist",
   "metadata": {},
   "source": [
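    "## Using BinomialBayesMixedGLM\n",
    "\n",
    "**TODO:** the model cell in this section still fits `sm.MixedLM`; it has not been switched to `BinomialBayesMixedGLM` yet."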
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "aquatic-stewart",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Re-select the response, the twelve predictors and the grouping column from the training frame.\n",
    "predictor_cols = ['a0', 'a1'] + [f's{i}' for i in range(10)]\n",
    "y = train_df['r']\n",
    "X = train_df[predictor_cols]\n",
    "groups = train_df['ds']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "competitive-imagination",
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): this fits sm.MixedLM on y, X and groups, although the section\n",
    "# heading names BinomialBayesMixedGLM -- confirm which model is intended here.\n",
    "model = sm.MixedLM(y, X, groups)\n",
    "result = model.fit()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "broken-spine",
   "metadata": {},
   "source": [
    "## From formula"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "id": "fatty-plaintiff",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/aishwaryamandyam/anaconda3/envs/tf/lib/python3.7/site-packages/statsmodels/base/model.py:568: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals\n",
      "  ConvergenceWarning)\n",
      "/Users/aishwaryamandyam/anaconda3/envs/tf/lib/python3.7/site-packages/statsmodels/regression/mixed_linear_model.py:2202: ConvergenceWarning: Retrying MixedLM optimization with lbfgs\n",
      "  ConvergenceWarning)\n",
      "/Users/aishwaryamandyam/anaconda3/envs/tf/lib/python3.7/site-packages/statsmodels/regression/mixed_linear_model.py:2261: ConvergenceWarning: The Hessian matrix at the estimated parameter values is not positive definite.\n",
      "  warnings.warn(msg, ConvergenceWarning)\n"
     ]
    }
   ],
   "source": [
    "# Formula fit of 'r' on the twelve predictors, which are also used as random-effect\n",
    "# terms within each 'ds' group.\n",
    "# NOTE(review): the random_effects output lists only groups 0 and 1 -- presumably too\n",
    "# few groups to estimate this many random-effect terms, which may explain the\n",
    "# convergence warnings; confirm.\n",
    "rhs = ' + '.join(['a0', 'a1'] + [f's{i}' for i in range(10)])\n",
    "model = sm.MixedLM.from_formula(f'r ~ {rhs}', train_df, groups=train_df['ds'], re_formula=rhs)\n",
    "result = model.fit()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "incorporate-measurement",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Intercept    0.009911\n",
       "a0           5.817100\n",
       "a1           1.776143\n",
       "s0           1.321474\n",
       "s1           3.117673\n",
       "s2           5.576962\n",
       "s3          -0.709580\n",
       "s4          -1.752500\n",
       "s5           1.095225\n",
       "s6           1.589119\n",
       "s7          -0.882842\n",
       "s8          -7.680264\n",
       "s9          -0.603790\n",
       "dtype: float64"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Fixed-effect coefficients of the formula fit; the output includes an Intercept term.\n",
    "result.fe_params"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "id": "clear-secretary",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{0: Group    0.007392\n",
       " a0      -0.008371\n",
       " a1       0.006599\n",
       " s0      -0.004872\n",
       " s1       0.004685\n",
       " s2      -0.002982\n",
       " s3       0.007208\n",
       " s4      -0.001895\n",
       " s5       0.003350\n",
       " s6      -0.005965\n",
       " s7       0.008608\n",
       " s8       0.001627\n",
       " s9       0.002616\n",
       " dtype: float64,\n",
       " 1: Group   -0.007392\n",
       " a0       0.008371\n",
       " a1      -0.006599\n",
       " s0       0.004872\n",
       " s1      -0.004685\n",
       " s2       0.002982\n",
       " s3      -0.007208\n",
       " s4       0.001895\n",
       " s5      -0.003350\n",
       " s6       0.005965\n",
       " s7      -0.008608\n",
       " s8      -0.001627\n",
       " s9      -0.002616\n",
       " dtype: float64}"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Per-group random effects: a 'Group' term plus one entry per re_formula variable.\n",
    "result.random_effects"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "synthetic-marina",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "tf",
   "language": "python",
   "name": "tf"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.10"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
