{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "oFh18yX0FrjN"
      },
      "source": [
        "# Loading libraries"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "eawH97OBFrjS"
      },
      "outputs": [],
      "source": [
        "import copy\n",
        "import os\n",
        "import sys\n",
        "\n",
        "import numpy as np\n",
        "import pandas as pd\n",
        "import yaml\n",
        "\n",
        "sys.path.insert(1, '..')\n",
        "os.chdir('..')\n",
        "\n",
        "import seaborn as sns\n",
        "sns.set_style('whitegrid')\n",
        "import matplotlib.pyplot as plt\n",
        "import statsmodels.api as sm\n",
        "import sklearn\n",
        "import optuna\n",
        "\n",
        "from darts import models\n",
        "from darts import metrics\n",
        "from darts import TimeSeries\n",
        "from darts.dataprocessing.transformers import Scaler\n",
        "\n",
        "from statsforecast.models import AutoARIMA\n",
        "\n",
        "from data_formatter.base import *\n",
        "from bin.utils import *"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "# Processing"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Collect every CSV file found under the raw Colas2019 directory.\n",
        "filenames = []\n",
        "for root, _dirs, files in os.walk('raw_data/Colas2019'):\n",
        "    for file in files:\n",
        "        if file.endswith('.csv'):\n",
        "            filenames.append(os.path.join(root, file))\n",
        "if not filenames:\n",
        "    # Fail here with a clear message instead of a NameError on `df` later on.\n",
        "    raise FileNotFoundError('No .csv files found under raw_data/Colas2019')\n",
        "\n",
        "SECONDS_PER_DAY = 24 * 60 * 60\n",
        "\n",
        "# Build one cleaned frame per file and concatenate once at the end.\n",
        "frames = []\n",
        "for file in filenames:\n",
        "    # The id is the second whitespace-separated token of the file name, up to\n",
        "    # the first '.'. Parsing the base name (not the full path) keeps this\n",
        "    # working when a parent directory contains spaces.\n",
        "    subject_id = int(os.path.basename(file).split()[1].split('.')[0])\n",
        "\n",
        "    # Read the data, keep the desired columns, rename them, and drop NAs.\n",
        "    curr = (\n",
        "        pd.read_csv(file)\n",
        "        .assign(id=subject_id)[['id', 'hora', 'glucemia']]\n",
        "        .rename(columns={'hora': 'time', 'glucemia': 'gl'})\n",
        "        .dropna()\n",
        "        .reset_index(drop=True)\n",
        "    )\n",
        "    if curr.empty:\n",
        "        # Nothing usable in this file; the time reconstruction below needs at\n",
        "        # least one reading.\n",
        "        continue\n",
        "\n",
        "    # Only the clock time (h:m:s) is given, so rebuild a continuous timeline:\n",
        "    # (1) convert each reading to seconds since midnight,\n",
        "    # (2) take successive differences; a negative difference means the clock\n",
        "    #     crossed midnight, so one day is added. A zero difference (repeated\n",
        "    #     clock time) is kept as zero rather than being treated as a full day,\n",
        "    # (3) take the cumulative sum and add it to a base date.\n",
        "    # Thus the h:m:s are real, while the year, month and day are fake.\n",
        "    time_secs = []\n",
        "    for stamp in curr['time']:\n",
        "        hours, minutes, seconds = stamp.split(':')[:3]\n",
        "        time_secs.append(int(hours) * 60 * 60 + int(minutes) * 60 + int(seconds))\n",
        "    time_diff = np.diff(time_secs)\n",
        "    time_diff[time_diff < 0] += SECONDS_PER_DAY\n",
        "    elapsed = np.concatenate(([0], time_diff)).cumsum()\n",
        "    curr['time'] = pd.to_datetime('2012-01-01') + pd.to_timedelta(elapsed, unit='s')\n",
        "    curr['id'] = curr['id'].astype('int')\n",
        "\n",
        "    frames.append(curr)\n",
        "\n",
        "# Each per-file frame keeps its own 0..n-1 index, as in the incremental concat.\n",
        "df = pd.concat(frames)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Join the glucose readings with the clinical covariates.\n",
        "covariates = pd.read_csv('raw_data/Colas2019/clinical_data.txt', sep=' ')\n",
        "# The subject id is taken from the row index of the parsed table.\n",
        "# NOTE(review): assumes the file's first column holds the subject ids and is\n",
        "# parsed as the index -- confirm against the raw file.\n",
        "covariates['id'] = covariates.index\n",
        "\n",
        "# Merge explicitly on the subject id; without `on`, pandas joins on every\n",
        "# column name the two frames happen to share.\n",
        "combined = pd.merge(df, covariates, how='left', on='id')\n",
        "\n",
        "# Define NA fill values for covariates: numeric covariates use their mean\n",
        "# over the merged rows (one row per glucose reading).\n",
        "mean_filled = ['age', 'BMI', 'glycaemia', 'HbA1c', 'follow.up']\n",
        "values = {col: combined[col].mean() for col in mean_filled}\n",
        "values['gender'] = 2  # if gender is NA, create own category\n",
        "values['T2DM'] = False\n",
        "combined = combined.fillna(value=values)\n",
        "\n",
        "# Write to csv (the row index is written as the first, unnamed column).\n",
        "combined.to_csv('raw_data/colas.csv')"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "WeAHZmAmFrjV"
      },
      "source": [
        "# Check statistics of the data"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "pkOzK6gcFrjW",
        "outputId": "769510ff-79ba-4020-8d9c-dc78a7cdb7ff"
      },
      "outputs": [],
      "source": [
        "# load yaml config file\n",
        "with open('./config/colas.yaml', 'r') as f:\n",
        "    config = yaml.safe_load(f)\n",
        "\n",
        "# Deep-copy before overriding: a shallow copy would share the nested\n",
        "# parameter dictionaries, so the edits below would also change `config`.\n",
        "new_config = copy.deepcopy(config)\n",
        "# set interpolation params for no interpolation\n",
        "new_config['interpolation_params']['gap_threshold'] = 5\n",
        "new_config['interpolation_params']['min_drop_length'] = 0\n",
        "# set split params for no splitting\n",
        "new_config['split_params']['test_percent_subjects'] = 0\n",
        "new_config['split_params']['length_segment'] = 0\n",
        "# set scaling params for no scaling\n",
        "new_config['scaling_params']['scaler'] = 'None'\n",
        "\n",
        "# Build the formatter from the overridden configuration.\n",
        "formatter = DataFormatter(new_config)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "eCBgEjuAFrjX",
        "outputId": "1d40e5fa-1fd5-45ea-ae93-41d14226a0c5"
      },
      "outputs": [],
      "source": [
        "# Number of rows in each training segment, in group order.\n",
        "segment_lens = formatter.train_data.groupby('id_segment').size().tolist()\n",
        "\n",
        "# Report min, max, first quartile, median, mean and std of the lengths.\n",
        "print('Train segment lengths:')\n",
        "summary = [\n",
        "    ('\\tMin: ', min(segment_lens)),\n",
        "    ('\\tMax: ', max(segment_lens)),\n",
        "    ('\\t1st Quartile: ', np.quantile(segment_lens, 0.25)),\n",
        "    ('\\tMedian: ', np.median(segment_lens)),\n",
        "    ('\\tMean: ', np.mean(segment_lens)),\n",
        "    ('\\tStd: ', np.std(segment_lens)),\n",
        "]\n",
        "for label, value in summary:\n",
        "    print(label, value)\n",
        "\n",
        "# Plot glucose against time for the first 9 segments, one panel each.\n",
        "num_segments = 9\n",
        "plot_data = formatter.train_data\n",
        "\n",
        "fig, axs = plt.subplots(1, num_segments, figsize=(30, 5))\n",
        "# zip stops once the panels run out, so at most num_segments groups are drawn.\n",
        "for panel, (segment_id, segment) in zip(axs, plot_data.groupby('id_segment')):\n",
        "    segment.plot(x='time', y='gl', ax=panel, title='Segment {}'.format(segment_id))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 341
        },
        "id": "iU2AUHTfFrjZ",
        "outputId": "ac25dfa6-4eee-4fc8-9c0c-11efa1b4fa14"
      },
      "outputs": [],
      "source": [
        "# Plot the ACF (top row) and PACF (bottom row) of random windows taken from\n",
        "# the first `num_segments` segments that are long enough.\n",
        "lags = 300\n",
        "n_samples = 10\n",
        "# Fixed seed so the sampled windows (and the figure) are reproducible.\n",
        "rng = np.random.default_rng(0)\n",
        "\n",
        "fig, ax = plt.subplots(2, num_segments, figsize=(30, 5))\n",
        "k = 0  # column of the next subplot to fill\n",
        "for group, data in plot_data.groupby('id_segment'):\n",
        "    gl = data['gl']\n",
        "    # Number of admissible window start positions (0 .. len - lags - 1).\n",
        "    n_starts = len(gl) - lags\n",
        "    if n_starts < 1:\n",
        "        print('Segment {} is too short'.format(group))\n",
        "        continue\n",
        "    # Sampling without replacement cannot draw more starts than exist, so cap\n",
        "    # the sample size for segments only slightly longer than `lags`.\n",
        "    starts = rng.choice(n_starts, size=min(n_samples, n_starts), replace=False)\n",
        "    # plot acf / pacf of each sampled window\n",
        "    for j in starts:\n",
        "        # Positional slice; the segment's index labels are not used.\n",
        "        window = gl.iloc[j:j + lags]\n",
        "        # alpha is set, so both calls also return confidence intervals,\n",
        "        # which are not plotted.\n",
        "        acf, _ = sm.tsa.stattools.acf(window, nlags=lags, alpha=0.05)\n",
        "        pacf, _ = sm.tsa.stattools.pacf(window, method='ols-adjusted', alpha=0.05)\n",
        "        ax[0, k].plot(acf)\n",
        "        ax[1, k].plot(pacf)\n",
        "    k += 1\n",
        "    if k >= num_segments:\n",
        "        break"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "display_name": "base",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.6.9"
    },
    "orig_nbformat": 4,
    "vscode": {
      "interpreter": {
        "hash": "ad2bdc8ecc057115af97d19610ffacc2b4e99fae6737bb82f5d7fb13d2f2c186"
      }
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
