{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "9bf1f3ff",
   "metadata": {},
   "source": [
    "# Loading libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a19b92ad",
   "metadata": {},
   "outputs": [],
   "source": [
    "from typing import List, Union, Dict\n",
    "import sys\n",
    "import os\n",
    "import glob\n",
    "import datetime\n",
    "import yaml\n",
    "import warnings\n",
    "sys.path.insert(1, '..')\n",
    "os.chdir('..')\n",
    "\n",
    "import seaborn as sns\n",
    "sns.set_style('whitegrid')\n",
    "import matplotlib.pyplot as plt\n",
    "import statsmodels.api as sm\n",
    "import sklearn\n",
    "import optuna\n",
    "import datetime\n",
    "\n",
    "from darts import models\n",
    "from darts import metrics\n",
    "from darts import TimeSeries\n",
    "from darts.dataprocessing.transformers import Scaler\n",
    "\n",
    "from statsforecast.models import AutoARIMA\n",
    "\n",
    "from data_formatter.base import *\n",
    "from bin.utils import *"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "504dbf5c",
   "metadata": {},
   "source": [
    "# Processing"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "13508d46",
   "metadata": {},
   "source": [
    "## Load Glucose data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "90365038",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Loop over the folder of each subject and merge files with insulin data by id\n",
    "subject_ids = [\"001\", \"002\", \"003\", \"004\", \"005\", \"006\", \"007\", \"008\", \"009\"]\n",
    "\n",
    "df_list = []\n",
    "for subject_id in subject_ids:\n",
    "    subject_data = pd.read_csv(f\"raw_data/dubosson_covariates/diabetes_subset_pictures-glucose-food-insulin/{subject_id}/glucose.csv\")\n",
    "    subject_data[\"id\"] = subject_id\n",
    "    df_list.append(subject_data)\n",
    "\n",
    "glucose_data = pd.concat(df_list, axis=0, ignore_index=True)\n",
    "glucose_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "463f3d25",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create one daytime column \n",
    "glucose_data['date'] = pd.to_datetime(glucose_data['date'])\n",
    "glucose_data['time'] = pd.to_datetime(glucose_data['time'], format='%H:%M:%S').dt.time\n",
    "glucose_data['time'] = glucose_data.apply(lambda x: datetime.datetime.combine(x['date'], x['time']), axis=1)\n",
    "# Keep the observations with cgm type only\n",
    "glucose_data = glucose_data[(glucose_data['type']=='cgm')]\n",
    "# Drop Date, Type, Comments columns\n",
    "glucose_data.drop([\"date\", \"type\", \"comments\"], axis=1, inplace=True)\n",
    "# Covert subject ids to int64 to match with \"data\" ids\n",
    "glucose_data['id'] = glucose_data['id'].astype(int)\n",
    "# Check for NaNs\n",
    "glucose_data.isna().sum() # no NaN values\n",
    "# Convert glucose readings from mmol/l to mg/dl\n",
    "glucose_data['glucose'] = 18*glucose_data['glucose']\n",
    "# rename Glucose column to gl\n",
    "glucose_data.rename(columns={'glucose': 'gl'}, inplace=True)\n",
    "# Reorder the columns\n",
    "glucose_data = glucose_data[['id', 'time', 'gl']]\n",
    "# reset index\n",
    "glucose_data.reset_index(drop=True, inplace=True)\n",
    "\n",
    "glucose_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d196c1fa",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot glucose for each subject\n",
    "groups = glucose_data.groupby('id')\n",
    "num_cols = 3\n",
    "num_rows = int(np.ceil(len(groups)/num_cols))\n",
    "fig, axs = plt.subplots(num_rows, num_cols, figsize=(15, 5*num_rows))\n",
    "\n",
    "for i, (group_name, group) in enumerate(groups):\n",
    "    row_idx = i // num_cols\n",
    "    col_idx = i % num_cols\n",
    "    ax = axs[row_idx, col_idx]\n",
    "    ax.plot(group['time'], group['gl'])\n",
    "    ax.set_xlabel('Time')\n",
    "    ax.set_ylabel('Glucose')\n",
    "    ax.set_title(f'Glucose Plot for ID {group_name}')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cb8a05fa",
   "metadata": {},
   "source": [
    "## Insulin covariates"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1a908047",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Loop over the folder of each subject and merge files with insulin data by id\n",
    "subject_ids = [\"001\", \"002\", \"003\", \"004\", \"005\", \"006\", \"007\", \"008\", \"009\"]\n",
    "\n",
    "df_list = []\n",
    "for subject_id in subject_ids:\n",
    "    subject_data = pd.read_csv(f\"raw_data/dubosson_covariates/diabetes_subset_pictures-glucose-food-insulin/{subject_id}/insulin.csv\")\n",
    "    subject_data[\"id\"] = subject_id\n",
    "    df_list.append(subject_data)\n",
    "\n",
    "insulin_data = pd.concat(df_list, axis=0, ignore_index=True)\n",
    "insulin_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "caad7e1e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create one daytime column \n",
    "insulin_data['date'] = pd.to_datetime(insulin_data['date'])\n",
    "insulin_data['time'] = pd.to_datetime(insulin_data['time'], format='%H:%M:%S').dt.time\n",
    "insulin_data['datetime'] = insulin_data.apply(lambda x: datetime.datetime.combine(x['date'], x['time']), axis=1)\n",
    "# Drop Date, Time, Comment columns\n",
    "insulin_data.drop([\"date\", \"time\", \"comment\"], axis=1, inplace=True)\n",
    "# Covert subject ids to int64 to match with \"data\" ids\n",
    "insulin_data['id'] = insulin_data['id'].astype(int)\n",
    "\n",
    "insulin_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bb2c5c16",
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Merge the two datasets based on \"id\"\n",
    "df = insulin_data.merge(glucose_data, on='id')\n",
    "# For each row in insulin_data, calculate the absolute difference\n",
    "df['diff'] = (df['datetime'] - df['time']).abs()\n",
    "# Find the index of the minimum difference for each subject and each insulin date-time\n",
    "idx = df.groupby(['id', 'datetime'])['diff'].idxmin()\n",
    "# Use that index to retrieve the corresponding \"time\" value\n",
    "df_final = df.loc[idx, ['id', 'datetime', 'time']]\n",
    "df_final.rename(columns={'id': 'id', 'time': 'closest_time'}, inplace=True)\n",
    "# Add the closest time as a new column in insulin_data\n",
    "result = insulin_data.merge(df_final, on=['id', 'datetime'], how='left')\n",
    "# Calculate the difference between the closest time and datetime in minutes\n",
    "result.loc[:, 'time_diff'] = np.abs((result['closest_time'] - result['datetime']) / np.timedelta64(1, 'm'))\n",
    "# Keep only the rows where the absolute difference is less than or equal to 5 minutes\n",
    "result = result.loc[result['time_diff'] <= 5, :]\n",
    "# Some rows have exact the same closest_time when a person took fast and slow insulin at the same time. \n",
    "# Merge these duplicate rows in one row\n",
    "result = result.groupby([\"id\", \"closest_time\"]).agg({\"fast_insulin\": \"sum\", \"slow_insulin\": \"sum\"}).reset_index()\n",
    "# Merge glucose and insulin datasets\n",
    "data_cov = glucose_data.merge(result, how='left', left_on=['id', 'time'], right_on=['id', 'closest_time'])\n",
    "# Drop closest_time column\n",
    "data_cov.drop([\"closest_time\"], axis=1, inplace=True)\n",
    "# Replace observed values of zeroes with NaN\n",
    "data_cov = data_cov.replace(0, np.nan)\n",
    "\n",
    "data_cov"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "52fa1b67",
   "metadata": {},
   "outputs": [],
   "source": [
    "## Prepare data for the interpolation\n",
    "\n",
    "# set NaN right before observed values to 0 for slow_insulin\n",
    "mask = data_cov['slow_insulin'].isna() & data_cov['slow_insulin'].shift(-1).notna()\n",
    "data_cov.loc[mask, 'slow_insulin'] = 0\n",
    "# set NaN right before observed values to 0 for fast_insulin\n",
    "mask = data_cov['fast_insulin'].isna() & data_cov['fast_insulin'].shift(-1).notna()\n",
    "data_cov.loc[mask, 'fast_insulin'] = 0\n",
    "\n",
    "# Insert 0 instead of NaN in slow_insulin after approx. 24 hours after last observation \n",
    "# or at the very last observation for each subject\n",
    "subset_data = []\n",
    "for id in data_cov['id'].unique():\n",
    "    # select only the rows for the current id\n",
    "    id_data_cov = data_cov[data_cov['id']==id]\n",
    "    # Find index of the last observed value\n",
    "    last_observed_index = id_data_cov['slow_insulin'].last_valid_index()\n",
    "    # If there are no observed values at all, insert 0 to the last observation of the subset\n",
    "    if pd.isna(last_observed_index):\n",
    "        slow_insulin = id_data_cov['slow_insulin'].copy()\n",
    "        id_data_cov.loc[id_data_cov.index[-1], 'slow_insulin'] = 0 \n",
    "    else:\n",
    "        # Insert 0 instead of NaN in slow_insulin after approx. 24 hours after last valis observation\n",
    "        id_data_cov.loc[last_observed_index + 228, 'slow_insulin'] = 0\n",
    "    # Append the subjects' subsets\n",
    "    subset_data.append(id_data_cov)\n",
    "# Covert the data back to dataframe  \n",
    "data_cov = pd.concat(subset_data, ignore_index=True)\n",
    "\n",
    "# Insert 0 instead of NaN in fast_insulin after approx. 2 hours after last observation \n",
    "# or at the very last observation for each subject\n",
    "subset_data = []\n",
    "for id in data_cov['id'].unique():\n",
    "    # select only the rows for the current id\n",
    "    id_data_cov = data_cov[data_cov['id']==id]\n",
    "    # Find index of the last observed value\n",
    "    last_observed_index = id_data_cov['fast_insulin'].last_valid_index()\n",
    "    # If there are no observed values at all, insert 0 to the last observation of the subset\n",
    "    if pd.isna(last_observed_index):\n",
    "        slow_insulin = id_data_cov['fast_insulin'].copy()\n",
    "        id_data_cov.loc[id_data_cov.index[-1], 'fast_insulin'] = 0 \n",
    "    else:\n",
    "        # Insert 0 instead of NaN in slow_insulin after approx. 24 hours after last valis observation\n",
    "        id_data_cov.loc[last_observed_index + 24, 'fast_insulin'] = 0\n",
    "    # Append the subjects' subsets\n",
    "    subset_data.append(id_data_cov)\n",
    "# Covert the data back to dataframe  \n",
    "data_cov = pd.concat(subset_data, ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5aa39d7e",
   "metadata": {},
   "outputs": [],
   "source": [
    "## Interpolate insulin data\n",
    "\n",
    "# Sort the data by subject ID and time\n",
    "data = data_cov.sort_values(['id', 'time'])\n",
    "interpolated_data = []\n",
    "\n",
    "# Loop over each subject ID\n",
    "for subject_id in data['id'].unique():\n",
    "    # Select the rows for the current subject ID\n",
    "    subject_data = data[data['id'] == subject_id]\n",
    "    \n",
    "    subject_data['slow_insulin'] = subject_data['slow_insulin'].interpolate(\n",
    "        method='linear', limit_area = 'inside')\n",
    "    subject_data['fast_insulin'] = subject_data['fast_insulin'].interpolate(\n",
    "        method='linear', limit_area = 'inside')\n",
    "    interpolated_data.append(subject_data)\n",
    "\n",
    "data_cov = pd.concat(interpolated_data, ignore_index=True)\n",
    "# Replace NaN with zeroes\n",
    "data_cov = data_cov.fillna(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "649745a2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot the fast and slow insulin intakes for each subject\n",
    "groups = data_cov.groupby('id')\n",
    "num_cols = 3\n",
    "num_rows = int(np.ceil(len(groups)/num_cols))\n",
    "fig, axs = plt.subplots(num_rows, num_cols, figsize=(15, 5*num_rows))\n",
    "\n",
    "for i, (group_name, group) in enumerate(groups):\n",
    "    row_idx = i // num_cols\n",
    "    col_idx = i % num_cols\n",
    "    ax = axs[row_idx, col_idx]\n",
    "    ax.plot(group['time'], group['fast_insulin'], label='Fast Insulin')\n",
    "    ax.plot(group['time'], group['slow_insulin'], label='Slow Insulin')\n",
    "    ax.set_xlabel('Time')\n",
    "    ax.set_ylabel('Insulin')\n",
    "    ax.set_title(f'Insulin Plot for ID {group_name}')\n",
    "    ax.legend()\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bfa931e3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sum of NaN values in dataset\n",
    "data_cov.isna().sum().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "be4ec6ac",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Summary statistic of dataset\n",
    "data_cov.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9a8dedc0",
   "metadata": {},
   "source": [
    "## Food Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1e42e003",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Loop over the folder of each subject and merge files with insulin data by id\n",
    "subject_ids = [\"001\", \"002\", \"003\", \"004\", \"005\", \"006\", \"007\", \"008\", \"009\"]\n",
    "\n",
    "df_list = []\n",
    "for subject_id in subject_ids:\n",
    "    subject_data = pd.read_csv(f\"raw_data/dubosson_covariates/diabetes_subset_pictures-glucose-food-insulin/{subject_id}/food.csv\")\n",
    "    subject_data[\"id\"] = subject_id\n",
    "    df_list.append(subject_data)\n",
    "\n",
    "food_data = pd.concat(df_list, axis=0, ignore_index=True)\n",
    "food_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "06bd625e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop Picture, Description columns\n",
    "food_data.drop([\"picture\", \"description\"], axis=1, inplace=True)\n",
    "# Covert subject ids to int64 to match with \"data\" ids\n",
    "food_data['id'] = food_data['id'].astype(int)\n",
    "# Check for NaNs\n",
    "food_data.isna().sum() #present\n",
    "# Drop rows with NaN values\n",
    "food_data.dropna()\n",
    "# drop rows with NaN values in datetime column\n",
    "food_data = food_data.dropna(subset=['datetime'])\n",
    "# drop rows with NaN values in balance column\n",
    "food_data = food_data.dropna(subset=['balance'])\n",
    "# drop rows with NaN values in quality column\n",
    "food_data = food_data.dropna(subset=['quality'])\n",
    "# Change the format of datetime column to match with glucose dataset\n",
    "food_data['datetime'] = food_data['datetime'].apply(lambda x: datetime.datetime.strptime(x, '%Y:%m:%d %H:%M:%S'))\n",
    "\n",
    "\n",
    "food_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7143a7a6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get data summary\n",
    "print(food_data['calories'].describe())\n",
    "print(list(set(list(food_data['balance']))))\n",
    "print(list(set(list(food_data['quality']))))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "715bd7e0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Recode the binary/categorical values\n",
    "food_data['balance'] = food_data['balance'].replace({'Unbalance': 1, 'Balance': 2})\n",
    "food_data['quality'] = food_data['quality'].replace({'Low quality': 1, 'Medium quality': 2, 'Good quality': 3})\n",
    "food_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "daaf1c3c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Merge the two datasets based on \"id\"\n",
    "df = data_cov.merge(food_data, on='id')\n",
    "# For each row in insulin_data, calculate the absolute difference\n",
    "df['diff'] = (df['datetime'] - df['time']).abs()\n",
    "# Find the index of the minimum difference for each subject and each insulin date-time\n",
    "idx = df.groupby(['id', 'datetime'])['diff'].idxmin()\n",
    "# Use that index to retrieve the corresponding \"time\" value\n",
    "df_final = df.loc[idx, ['id', 'datetime', 'time', 'calories', 'balance', 'quality']]\n",
    "df_final.rename(columns={'id': 'id', 'time': 'closest_time'}, inplace=True)\n",
    "# Add the closest time as a new column in insulin_data\n",
    "result = data_cov.merge(df_final, left_on=['id', 'time'], right_on=['id', 'closest_time'], how='left')\n",
    "# Calculate the difference between the closest time and datetime in minutes\n",
    "result.loc[:, 'time_diff'] = np.abs((result['closest_time'] - result['datetime']) / np.timedelta64(1, 'm'))\n",
    "# Keep only the rows where the absolute difference is less than or equal to 5 minutes\n",
    "result = result.loc[result['time_diff'] <= 5, :]\n",
    "result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ea397743",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Merge these duplicate rows in one row\n",
    "result = result.groupby([\"id\", \"closest_time\"]).agg({\"calories\": \"sum\", \"balance\": \"min\", \"quality\": \"min\"}).reset_index()\n",
    "# Merge glucose and insulin datasets\n",
    "data_cov = data_cov.merge(result, how='left', left_on=['id', 'time'], right_on=['id', 'closest_time'])\n",
    "# Drop closest_time column\n",
    "data_cov.drop([\"closest_time\"], axis=1, inplace=True)\n",
    "# Replace NaN with zerows\n",
    "data_cov = data_cov.fillna(0)\n",
    "\n",
    "data_cov"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "88c2b13f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Loop through each subject and calculate the mean for that subject for balance and quality variables\n",
    "for subject in data_cov['id'].unique():\n",
    "    subject_data = data_cov[data_cov['id'] == subject]\n",
    "    subject_mean = subject_data[subject_data[['balance', 'quality']] > 0][['balance', 'quality']].mean().round()\n",
    "    if pd.isna(subject_mean).all():\n",
    "        subject_mean = subject_mean.fillna(0)\n",
    "    else:\n",
    "        subject_mean = subject_mean.astype(int)\n",
    "    \n",
    "    # Replace the values in the original DataFrame with the subject-specific means\n",
    "    data_cov.loc[data_cov['id'] == subject, 'balance'] = subject_mean['balance']\n",
    "    data_cov.loc[data_cov['id'] == subject, 'quality'] = subject_mean['quality']\n",
    "\n",
    "data_cov"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5e536669",
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Loop through subjects 5 and 9 and insert overall means for the food quality and balance, calories\n",
    "subject_ids = [5, 9]\n",
    "for subject in subject_ids:\n",
    "    subject_data = data_cov[data_cov['id'] == subject]\n",
    "    data_cov_mean = data_cov[data_cov[['balance', 'quality', 'calories']] > 0][['balance', 'quality', 'calories']].mean().round().astype(int)\n",
    "    \n",
    "    # Replace the NaN values in balance and quality columns with the subject-specific means\n",
    "    data_cov.loc[(data_cov['id'] == subject), 'balance'] = data_cov_mean['balance']\n",
    "    data_cov.loc[(data_cov['id'] == subject), 'quality'] = data_cov_mean['quality']\n",
    "    data_cov.loc[(data_cov['id'] == subject), 'calories'] = data_cov_mean['calories']\n",
    "\n",
    "data_cov"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7c806268",
   "metadata": {},
   "outputs": [],
   "source": [
    "## Interpolate calories data\n",
    "\n",
    "#Replace NaN with zeroes\n",
    "data_cov['calories'] = data_cov['calories'].replace(0, np.nan)\n",
    "\n",
    "# set NaN right before observed values to 0 for calories\n",
    "mask = data_cov['calories'].isna() & data_cov['calories'].shift(-1).notna()\n",
    "data_cov.loc[mask, 'calories'] = 0\n",
    "\n",
    "# Sort the data by subject ID and time\n",
    "data = data_cov.sort_values(['id', 'time'])\n",
    "interpolated_data = []\n",
    "\n",
    "# Loop over each subject ID\n",
    "for subject_id in data['id'].unique():\n",
    "    # Select the rows for the current subject ID\n",
    "    subject_data = data[data['id'] == subject_id]\n",
    "    \n",
    "    subject_data['calories'] = subject_data['calories'].interpolate(\n",
    "        method='linear', limit_area = 'inside')\n",
    "    interpolated_data.append(subject_data)\n",
    "\n",
    "data_cov = pd.concat(interpolated_data, ignore_index=True)\n",
    "# Replace NaN with zeroes\n",
    "data_cov = data_cov.fillna(0)\n",
    "\n",
    "data_cov"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9e21a7f0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot the consumed food characteristics for each subject\n",
    "groups = data_cov.groupby('id')\n",
    "num_cols = 3\n",
    "num_rows = int(np.ceil(len(groups)/num_cols))\n",
    "fig, axs = plt.subplots(num_rows, num_cols, figsize=(15, 5*num_rows))\n",
    "\n",
    "for i, (group_name, group) in enumerate(groups):\n",
    "    row_idx = i // num_cols\n",
    "    col_idx = i % num_cols\n",
    "    ax = axs[row_idx, col_idx]\n",
    "    ax.plot(group['time'], group['balance'], label='Balance')\n",
    "    ax.plot(group['time'], group['quality'], label='Quality')\n",
    "    ax.set_xlabel('Time')\n",
    "    ax.set_ylabel('Food characteristics')\n",
    "    ax.set_title(f'Food characteristics Plot for ID {group_name}')\n",
    "    ax.legend()\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e1f8eee3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot the calories intakes for each subject\n",
    "groups = data_cov.groupby('id')\n",
    "num_cols = 3\n",
    "num_rows = int(np.ceil(len(groups)/num_cols))\n",
    "fig, axs = plt.subplots(num_rows, num_cols, figsize=(15, 5*num_rows))\n",
    "\n",
    "for i, (group_name, group) in enumerate(groups):\n",
    "    row_idx = i // num_cols\n",
    "    col_idx = i % num_cols\n",
    "    ax = axs[row_idx, col_idx]\n",
    "    ax.plot(group['time'], group['calories'])\n",
    "    ax.set_xlabel('Time')\n",
    "    ax.set_ylabel('Calories')\n",
    "    ax.set_title(f'Calories Plot for ID {group_name}')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7a8b3ab5",
   "metadata": {},
   "source": [
    "## Summary Statistic from wearable device covariates"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a98c19d1",
   "metadata": {},
   "outputs": [],
   "source": [
    "## Loop over the folder of each subject and merge files by id\n",
    "\n",
    "# set the directory path\n",
    "path = 'raw_data/dubosson_covariates/diabetes_subset_sensor_data'\n",
    "# Create a list to store the dataframes for each subject\n",
    "dfs = []\n",
    "\n",
    "# Loop through each subject folder\n",
    "for subject_id in ['001', '002', '003', '004', '005', '006', '007', '008', '009']:\n",
    "    # Get the path to the sensor_data folder for the current subject\n",
    "    sensor_data_path = os.path.join(path, subject_id, 'sensor_data')\n",
    "    # Create a list to store the dataframes for each file in the current sensor_data folder\n",
    "    files = []\n",
    "    # Loop through each file in the current sensor_data folder\n",
    "    sensor_dates = os.listdir(sensor_data_path)\n",
    "    if '.DS_Store' in sensor_dates: sensor_dates.remove('.DS_Store')\n",
    "    for filename in sensor_dates:\n",
    "        # Get the path to the current file\n",
    "        folderpath = os.path.join(sensor_data_path, filename)\n",
    "        # Find the file with Summary statistic\n",
    "        filepath = glob.glob(os.path.join(folderpath, '*Summary.csv'))\n",
    "        # Read the data from the current file into a dataframe\n",
    "        df = pd.read_csv(filepath[0])   \n",
    "        # Add the subject_id to the dataframe\n",
    "        df['id'] = subject_id \n",
    "        # Append the dataframe to the list of files\n",
    "        files.append(df)\n",
    "    # Concatenate the list of files into a single dataframe for the current subject\n",
    "    subject_df = pd.concat(files)\n",
    "    # Append the dataframe for the current subject to the list of dataframes\n",
    "    dfs.append(subject_df)\n",
    "# Concatenate the list of dataframes into a single dataframe for all subjects\n",
    "summary_cov = pd.concat(dfs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c4968f9b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop rows with NaNs\n",
    "summary_cov.dropna()\n",
    "# Covert subject ids to int64 to match with \"data\" ids\n",
    "summary_cov['id'] = summary_cov['id'].astype(int)\n",
    "# Change the format of Time column to match with glucose dataset\n",
    "summary_cov['Time'] = summary_cov['Time'].apply(lambda x: datetime.datetime.strptime(x, '%d/%m/%Y %H:%M:%S.%f'))\n",
    "\n",
    "summary_cov"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f7f37004",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get the names of the columns\n",
    "summary_cov.columns.values.tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "85d47165",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Summary statistics of the covariates\n",
    "summary_cov.describe().applymap(lambda x: f\"{x:0.3f}\").transpose()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d16abf0c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Heart Rate - beats per minute, range 25-240 (Invalid value = 65535)\n",
    "# Breathing Rate - breaths per minute, range 3-70 (Invalid value = 6553.5)\n",
    "# SkinTemp is not supported in this device (BioHarness 3.0 always returns \n",
    "#    an ‘Invalid’ value of -3276.8 °C for this parameter.)\n",
    "# Posture - vertical = 0°, inverted = 180°(degrees), range +/- 180°\n",
    "# Activity - range 16, (Invalid value = 655.35)\n",
    "# BRAmplitude - Breathing Wave Amplitute (indicative only)\n",
    "# ECGNoise - Breathing Wave Noise (indicative only)\n",
    "# HRConfidence - Breathing Rate Confidence, % (Invalid value = 255)\n",
    "# ECGAmplitude - ECG Amplitude (indicative only)\n",
    "# ECGNoise - ECG Noise (indicative only)\n",
    "# HRConfidence - Heart Rate Confidence, % (Invalid value = 255)\n",
    "# HRV - HR Variability, range 0-280 (Invalid value = 65535)\n",
    "# SystemConfidence - Physiological Data Validity, % (Invalid value = 255)\n",
    "# StatusInfo - 16 bit number\n",
    "# CoreTemp - Estimated Subject Core Temperature, range 33-41 (Invalid value = 6553.5)\n",
    "\n",
    "## Keep only usefull covariates\n",
    "summary_cov = summary_cov[['Time','HR','BR','Posture','Activity','HRV','SystemConfidence','CoreTemp','id']]\n",
    "\n",
    "# drop rows where HR is outside range or has value of 65535\n",
    "summary_cov = summary_cov[(summary_cov['HR'] >= 25) & (summary_cov['HR'] <= 240) & (summary_cov['HR'] != 65535)]\n",
    "# drop rows where BR is outside range or has value of 6553.5\n",
    "summary_cov = summary_cov[(summary_cov['BR'] >= 3) & (summary_cov['BR'] <= 70) & (summary_cov['BR'] != 6553.5)]\n",
    "# drop rows where HRV is outside range or has value of 65535\n",
    "summary_cov = summary_cov[(summary_cov['HRV'] >= 0) & (summary_cov['HRV'] <= 280) & (summary_cov['HRV'] != 65535)]\n",
    "# drop rows where SystemConfidence is less than 50% or has value of 255\n",
    "summary_cov = summary_cov[(summary_cov['SystemConfidence'] >= 50) & (summary_cov['SystemConfidence'] != 255)]\n",
    "# drop rows where CoreTemp is outside range or has value of 6553.5\n",
    "summary_cov = summary_cov[(summary_cov['CoreTemp'] >= 33) & (summary_cov['CoreTemp'] <= 41) & \n",
    "                          (summary_cov['CoreTemp'] != 6553.5)]\n",
    "                                                                                \n",
    "# Drop SystemConfidence and reset index\n",
    "summary_cov = summary_cov.drop('SystemConfidence', axis=1)\n",
    "summary_cov = summary_cov.reset_index(drop=True)\n",
    "    \n",
    "## Check the summary statistic again\n",
    "summary_cov.describe().applymap(lambda x: f\"{x:0.3f}\").transpose()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "237474bd",
   "metadata": {},
   "outputs": [],
   "source": [
    "summary_cov"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f0d67185",
   "metadata": {},
   "outputs": [],
   "source": [
    "## For the time grid from glucose data, calculate the average values of Summary Statistic from wearable device\n",
    "\n",
    "# list of unique ids\n",
    "ids = data_cov['id'].unique()\n",
    "# Create a list to store the dataframes for each subject\n",
    "dfs = []\n",
    "\n",
    "# loop through each id\n",
    "for id in ids:\n",
    "    # subset data_cov and summary_cov by id\n",
    "    data_cov_id = data_cov[data_cov['id'] == id].reset_index(drop=True)\n",
    "    summary_cov_id = summary_cov[summary_cov['id'] == id].reset_index(drop=True)\n",
    "    \n",
    "    # loop through each time interval in data_cov\n",
    "    for i in range(len(data_cov_id)-1):\n",
    "        start_time = data_cov_id.loc[i, 'time']\n",
    "        end_time = data_cov_id.loc[i+1, 'time'] \n",
    "        \n",
    "        # subset summary_cov by time interval\n",
    "        summary_cov_interval = summary_cov_id[(summary_cov_id['Time'] >= start_time) & \n",
    "                                              (summary_cov_id['Time'] < end_time)]\n",
    "        \n",
    "        # calculate average of other variables and fill in data_cov\n",
    "        for col in summary_cov_interval.columns:\n",
    "            if col not in ['Time', 'id']:\n",
    "                avg_val = np.mean(summary_cov_interval[col])\n",
    "                data_cov_id.loc[i, col] = avg_val\n",
    "    \n",
    "    # Calculate the average separately for the last timestamp\n",
    "    if (i+1) == len(data_cov_id)-1:\n",
    "        start_time = data_cov_id.loc[i+1, 'time']\n",
    "        end_time = data_cov_id.loc[i+1, 'time'] + pd.Timedelta(minutes=5) # add 5 minutes to the last time stamp\n",
    "        \n",
    "        # subset summary_cov by time interval\n",
    "        summary_cov_interval = summary_cov_id[(summary_cov_id['Time'] >= start_time) & \n",
    "                                              (summary_cov_id['Time'] < end_time)]\n",
    "        \n",
    "        # calculate average of other variables and fill in data_cov\n",
    "        for col in summary_cov_interval.columns:\n",
    "            if col not in ['Time', 'id']:\n",
    "                avg_val = np.mean(summary_cov_interval[col])\n",
    "                data_cov_id.loc[i+1, col] = avg_val\n",
    "    \n",
    "    # Append the dataframe for the current subject to the list of dataframes\n",
    "    dfs.append(data_cov_id)\n",
    "# Concatenate the list of dataframes into a single dataframe for all subjects\n",
    "all_data = pd.concat(dfs)\n",
    "\n",
    "all_data.reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "34059c5d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# count number of NaN values per subject and column\n",
    "nan_counts = all_data.groupby('id')['HR'].apply(lambda x: x.isna().sum())\n",
    "# count number of observations per subject\n",
    "obs_counts = all_data['id'].value_counts()\n",
    "# calculate relative percentage of NaN values per subject\n",
    "nan_percentages = nan_counts / obs_counts * 100\n",
    "nan_percentages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "560f8bdb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# #list of subject ids for which NaN values need to be replaced with 0\n",
    "#subject_ids = [5, 9]\n",
    "# #list of column names in which NaN values need to be replaced with 0\n",
    "#columns_to_fill = ['HR', 'BR', 'Posture', 'Activity', 'HRV', 'CoreTemp']\n",
    "# #replace NaN values with 0 for selected subjects and columns\n",
    "#for subject_id in subject_ids:\n",
    "#    for column in columns_to_fill:\n",
    "#        all_data.loc[all_data['id'] == subject_id, column] = all_data.loc[all_data['id'] == subject_id, column].fillna(0)\n",
    "\n",
    "#all_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b42519ee",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot the HR, BR, HRV, Posture, Activity, Core Temperature for each subject\n",
    "groups = all_data.groupby('id')\n",
    "num_cols = 3\n",
    "num_rows = int(np.ceil(len(groups)/num_cols))\n",
    "fig, axs = plt.subplots(num_rows, num_cols, figsize=(15, 5*num_rows))\n",
    "\n",
    "for i, (group_name, group) in enumerate(groups):\n",
    "    row_idx = i // num_cols\n",
    "    col_idx = i % num_cols\n",
    "    ax = axs[row_idx, col_idx]\n",
    "    ax.plot(group['time'], group['HR'], label='HR')\n",
    "    ax.plot(group['time'], group['BR'], label='BR')\n",
    "    ax.plot(group['time'], group['HRV'], label='HRV')\n",
    "    ax.plot(group['time'], group['Posture'], label='Posture')\n",
    "    ax.plot(group['time'], group['Activity'], label='Activity')\n",
    "    ax.plot(group['time'], group['CoreTemp'], label='CoreTemp')\n",
    "    ax.set_xlabel('Time')\n",
    "    ax.set_ylabel('Insulin')\n",
    "    ax.set_title(f'Heart and Breath Rates Plot for ID {group_name}')\n",
    "    ax.legend()\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ec06fa97",
   "metadata": {},
   "outputs": [],
   "source": [
    "## Interpolate HR, BR, HRV data using Piecewise Cubic Hermite Interpolating Polynomial (PCHIP)\n",
    "\n",
    "# Sort the data by subject ID and time\n",
    "data = all_data.sort_values(['id', 'time'])\n",
    "interpolated_data = []\n",
    "\n",
    "# Loop over each subject ID\n",
    "for subject_id in data['id'].unique():\n",
    "    # Select the rows for the current subject ID\n",
    "    subject_data = data[data['id'] == subject_id]\n",
    "    \n",
    "    subject_data['HR'] = subject_data['HR'].interpolate(method='pchip', limit_area = 'inside')\n",
    "    subject_data['BR'] = subject_data['BR'].interpolate(method='pchip', limit_area = 'inside')\n",
    "    subject_data['HRV'] = subject_data['HRV'].interpolate(method='pchip', limit_area = 'inside')\n",
    "    subject_data['Posture'] = subject_data['Posture'].interpolate(method='pchip', limit_area = 'inside')\n",
    "    subject_data['Activity'] = subject_data['Activity'].interpolate(method='pchip', limit_area = 'inside')\n",
    "    subject_data['CoreTemp'] = subject_data['CoreTemp'].interpolate(method='pchip', limit_area = 'inside')\n",
    "    interpolated_data.append(subject_data)\n",
    "\n",
    "all_data = pd.concat(interpolated_data, ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9700ace2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot the interpolated HR, BR, HRV, Posture, Activity, and Core Temperature data (from the inside) for each subject\n",
    "groups = all_data.groupby('id')\n",
    "num_cols = 3\n",
    "num_rows = int(np.ceil(len(groups)/num_cols))\n",
    "fig, axs = plt.subplots(num_rows, num_cols, figsize=(15, 5*num_rows))\n",
    "\n",
    "for i, (group_name, group) in enumerate(groups):\n",
    "    row_idx = i // num_cols\n",
    "    col_idx = i % num_cols\n",
    "    ax = axs[row_idx, col_idx]\n",
    "    ax.plot(group['time'], group['HR'], label='HR')\n",
    "    ax.plot(group['time'], group['BR'], label='BR')\n",
    "    ax.plot(group['time'], group['HRV'], label='HRV')\n",
    "    ax.plot(group['time'], group['Posture'], label='Posture')\n",
    "    ax.plot(group['time'], group['Activity'], label='Activity')\n",
    "    ax.plot(group['time'], group['CoreTemp'], label='CoreTemp')\n",
    "    ax.set_xlabel('Time')\n",
    "    ax.set_ylabel('Insulin')\n",
    "    ax.set_title(f'Heart and Breath Rates Plot for ID {group_name}')\n",
    "    ax.legend()\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a83351a0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Loop through each subject and calculate the mean for the wearable device variables to fill the outside NaN values\n",
    "for subject in all_data['id'].unique():\n",
    "    subject_data = all_data[all_data['id'] == subject]\n",
    "    subject_mean = subject_data[['HR', 'BR', 'HRV', 'Posture', 'Activity', 'CoreTemp']].mean()\n",
    "    \n",
    "    # Replace the NaN values in balance and quality columns with the subject-specific means\n",
    "    all_data.loc[(all_data['id'] == subject) & (all_data['HR'].isna()), 'HR'] = subject_mean['HR']\n",
    "    all_data.loc[(all_data['id'] == subject) & (all_data['BR'].isna()), 'BR'] = subject_mean['BR']\n",
    "    all_data.loc[(all_data['id'] == subject) & (all_data['HRV'].isna()), 'HRV'] = subject_mean['HRV']\n",
    "    all_data.loc[(all_data['id'] == subject) & (all_data['Posture'].isna()), 'Posture'] = subject_mean['Posture']\n",
    "    all_data.loc[(all_data['id'] == subject) & (all_data['Activity'].isna()), 'Activity'] = subject_mean['Activity']\n",
    "    all_data.loc[(all_data['id'] == subject) & (all_data['CoreTemp'].isna()), 'CoreTemp'] = subject_mean['CoreTemp']\n",
    "\n",
    "all_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ddb5e690",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Loop through subjects 5 and 9 and insert overall means for the wearable device covariates\n",
    "subject_ids = [5, 9]\n",
    "for subject in subject_ids:\n",
    "    subject_data = all_data[all_data['id'] == subject]\n",
    "    all_data_mean = all_data[['HR', 'BR', 'HRV', 'Posture', 'Activity', 'CoreTemp']].mean()\n",
    "    \n",
    "    # Replace the NaN values in balance and quality columns with the subject-specific means\n",
    "    all_data.loc[(all_data['id'] == subject) & (all_data['HR'].isna()), 'HR'] = all_data_mean['HR']\n",
    "    all_data.loc[(all_data['id'] == subject) & (all_data['BR'].isna()), 'BR'] = all_data_mean['BR']\n",
    "    all_data.loc[(all_data['id'] == subject) & (all_data['HRV'].isna()), 'HRV'] = all_data_mean['HRV']\n",
    "    all_data.loc[(all_data['id'] == subject) & (all_data['Posture'].isna()), 'Posture'] = all_data_mean['Posture']\n",
    "    all_data.loc[(all_data['id'] == subject) & (all_data['Activity'].isna()), 'Activity'] = all_data_mean['Activity']\n",
    "    all_data.loc[(all_data['id'] == subject) & (all_data['CoreTemp'].isna()), 'CoreTemp'] = all_data_mean['CoreTemp']\n",
    "\n",
    "all_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ae01e7c3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot the fully interpolated HR, BR, HRV, Posture, Activity, and Core Temperature data for each subject\n",
    "groups = all_data.groupby('id')\n",
    "num_cols = 3\n",
    "num_rows = int(np.ceil(len(groups)/num_cols))\n",
    "fig, axs = plt.subplots(num_rows, num_cols, figsize=(15, 5*num_rows))\n",
    "\n",
    "for i, (group_name, group) in enumerate(groups):\n",
    "    row_idx = i // num_cols\n",
    "    col_idx = i % num_cols\n",
    "    ax = axs[row_idx, col_idx]\n",
    "    ax.plot(group['time'], group['HR'], label='HR')\n",
    "    ax.plot(group['time'], group['BR'], label='BR')\n",
    "    ax.plot(group['time'], group['HRV'], label='HRV')\n",
    "    ax.plot(group['time'], group['Posture'], label='Posture')\n",
    "    ax.plot(group['time'], group['Activity'], label='Activity')\n",
    "    ax.plot(group['time'], group['CoreTemp'], label='CoreTemp')\n",
    "    ax.set_xlabel('Time')\n",
    "    ax.set_ylabel('Insulin')\n",
    "    ax.set_title(f'Heart and Breath Rates Plot for ID {group_name}')\n",
    "    ax.legend()\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f525dbb3",
   "metadata": {},
   "outputs": [],
   "source": [
    "all_data.isna().sum() # no NaN values anymore"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "264acaba",
   "metadata": {},
   "outputs": [],
   "source": [
    "# save as Dubosson_processed_with_covariates.csv\n",
    "all_data.to_csv('./raw_data/dubosson.csv', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "18755652",
   "metadata": {},
   "source": [
    "# Check statistics of the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8c51f3ba",
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# load yaml config file\n",
    "with open('./config/dubosson.yaml', 'r') as f:\n",
    "    config = yaml.safe_load(f)\n",
    "\n",
    "# set interpolation params for no interpolation\n",
    "new_config = config.copy()\n",
    "new_config['interpolation_params']['gap_threshold'] = 5\n",
    "new_config['interpolation_params']['min_drop_length'] = 0\n",
    "# set split params for no splitting\n",
    "new_config['split_params']['test_percent_subjects'] = 0\n",
    "new_config['split_params']['length_segment'] = 0\n",
    "# set scaling params for no scaling\n",
    "new_config['scaling_params']['scaler'] = 'None'\n",
    "\n",
    "formatter = DataFormatter(new_config)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e5df90a9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# loop through each column in the dataset\n",
    "for col in formatter.train_data.columns:\n",
    "    if col in ['id_segment', 'time', 'id', 'time_year', 'time_month', 'time_day', 'time_hour', 'time_minute']:\n",
    "        continue\n",
    "    col_data = formatter.train_data[col]\n",
    "    \n",
    "    # print summary statistics\n",
    "    print('{} column:'.format(col))\n",
    "    print('\\tMin: ', col_data.min())\n",
    "    print('\\tMax: ', col_data.max())\n",
    "    print('\\t25th percentile: ', np.percentile(col_data, 25))\n",
    "    print('\\tMedian: ', col_data.median())\n",
    "    print('\\t75th percentile: ', np.percentile(col_data, 75))\n",
    "    print('\\tMean: ', col_data.mean())\n",
    "    print('\\tStd: ', col_data.std())\n",
    "\n",
    "    # plot data for each segment\n",
    "    num_segments = formatter.train_data['id_segment'].nunique()\n",
    "    fig, axs = plt.subplots(1, num_segments, figsize=(30, 5))\n",
    "    for i, (group, data) in enumerate(formatter.train_data.groupby('id_segment')):\n",
    "        data.plot(x='time', y=col, ax=axs[i], title='{} Segment {}'.format(col, group))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2172d2fd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# plot acf of random samples from segments\n",
    "fig, ax = plt.subplots(2, num_segments, figsize=(30, 5))\n",
    "lags = 300\n",
    "for i, (group, data) in enumerate(formatter.train_data.groupby('id_segment')):\n",
    "    data = data['gl']\n",
    "    if len(data) < lags:\n",
    "        print('Segment {} is too short'.format(group))\n",
    "        continue\n",
    "    # select 10 random samples from index of data\n",
    "    sample = np.random.choice(range(len(data))[:-lags], 10, replace=False)\n",
    "    # plot acf / pacf of each sample\n",
    "    for j in sample:\n",
    "        acf, acf_ci = sm.tsa.stattools.acf(data[j:j+lags], nlags=lags, alpha=0.05)\n",
    "        pacf, pacf_ci = sm.tsa.stattools.pacf(data[j:j+lags], method='ols-adjusted', alpha=0.05)\n",
    "        ax[0, i].plot(acf)\n",
    "        ax[1, i].plot(pacf)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
