{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Loading libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# DANGER: only run 1x otherwise will chdir too many times\n",
    "import copy\n",
    "import sys\n",
    "import os\n",
    "import yaml\n",
    "\n",
    "sys.path.insert(1, '..')\n",
    "os.chdir('..')\n",
    "\n",
    "import numpy as np\n",
    "import seaborn as sns\n",
    "sns.set_style('whitegrid')\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import statsmodels.api as sm\n",
    "import sklearn\n",
    "import optuna\n",
    "\n",
    "from darts import models, metrics, TimeSeries\n",
    "from darts.dataprocessing.transformers import Scaler\n",
    "\n",
    "from data_formatter.base import * # TODO: inefficient"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Check statistics of the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# load yaml config file\n",
    "with open('./config/iglu.yaml', 'r') as f:\n",
    "    config = yaml.safe_load(f)\n",
    "\n",
    "# Deep copy: the overrides below write into nested dicts, and a shallow\n",
    "# dict.copy() would share those dicts with `config` and silently change it too.\n",
    "new_config = copy.deepcopy(config)\n",
    "\n",
    "# set interpolation params for no interpolation\n",
    "new_config['interpolation_params']['gap_threshold'] = 30\n",
    "new_config['interpolation_params']['min_drop_length'] = 0\n",
    "# set split params for no splitting\n",
    "new_config['split_params']['test_percent_subjects'] = 0\n",
    "new_config['split_params']['length_segment'] = 0\n",
    "# set scaling params for no scaling\n",
    "new_config['scaling_params']['scaler'] = 'None'\n",
    "\n",
    "formatter = DataFormatter(new_config)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%capture\n",
    "\n",
    "# Need: Tradeoff between interpolation and segment length\n",
    "# Problem: Manually tuning is slow and potentially imprecise\n",
    "# Idea: have automated function that can help determine what the gap threshold should be\n",
    "# Proof of concept below\n",
    "\n",
    "def calc_percent(a, b):\n",
    "    \"\"\"Return `a` expressed as a percentage of `b` (a * 100 / b).\"\"\"\n",
    "    return a*100/b\n",
    "\n",
    "# a segment counts as valid when it has strictly more than this many 'time' entries\n",
    "threshold = 240\n",
    "\n",
    "# candidate gap thresholds to sweep: 5, 6, ..., 69\n",
    "gap_threshold = np.arange(5, 70, 1)\n",
    "percent_valid = []\n",
    "for gap in gap_threshold:\n",
    "    # Work on a private copy so the sweep does not leave `new_config`\n",
    "    # holding the last swept value once this cell has finished.\n",
    "    sweep_config = copy.deepcopy(new_config)\n",
    "    sweep_config['interpolation_params']['gap_threshold'] = gap\n",
    "    sweep_df = DataFormatter(sweep_config).train_data\n",
    "\n",
    "    # non-null 'time' count per segment; the result has one entry per segment\n",
    "    time_counts = sweep_df.groupby('id_segment')['time'].count()\n",
    "    num_valid = int((time_counts > threshold).sum())\n",
    "\n",
    "    percent_valid.append(calc_percent(num_valid, len(time_counts)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Draw the sweep outcome: share of long-enough segments for each gap threshold tried\n",
    "sweep_fig, sweep_ax = plt.subplots()\n",
    "sweep_ax.plot(gap_threshold, percent_valid)\n",
    "sweep_ax.set_title(\"Gap Threshold affect on % Segments > 240 Length\")\n",
    "sweep_ax.set_ylabel(\"% Above Threshhold\")\n",
    "sweep_ax.set_xlabel(\"Gap Threshold (min)\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# print min, max, median, mean, std of segment lengths\n",
    "df = formatter.train_data\n",
    "# number of rows in each segment, as a plain list of ints\n",
    "segment_lens = df.groupby('id_segment').size().tolist()\n",
    "\n",
    "print('Train segment lengths:')\n",
    "print('\\tMin: ', min(segment_lens))\n",
    "print('\\tMax: ', max(segment_lens))\n",
    "print('\\tMedian: ', np.median(segment_lens))\n",
    "print('\\tMean: ', np.mean(segment_lens))\n",
    "print('\\tStd: ', np.std(segment_lens))\n",
    "\n",
    "# a segment counts as valid when it has strictly more than this many 'time' entries\n",
    "threshold = 240\n",
    "\n",
    "# Visualize segment lengths to see approx # of valid ones (>240)\n",
    "hist_fig, hist_ax = plt.subplots()\n",
    "hist_ax.set_title(\"Segment Lengths (Line at 240)\")\n",
    "hist_ax.hist(segment_lens)\n",
    "hist_ax.axvline(threshold, color='r', linestyle='dashed', linewidth=1)\n",
    "\n",
    "# filter to get valid indices\n",
    "time_counts = df.groupby('id_segment')['time'].count()\n",
    "valid_ids = time_counts.loc[time_counts > threshold].reset_index()['id_segment']\n",
    "df_filtered = df.loc[df['id_segment'].isin(valid_ids)]\n",
    "\n",
    "# plot each segment\n",
    "num_segments = df_filtered['id_segment'].nunique()\n",
    "\n",
    "if num_segments == 0:\n",
    "    # plt.subplots rejects a grid with zero columns, so report instead of raising\n",
    "    print('No segments longer than {} to plot'.format(threshold))\n",
    "else:\n",
    "    # squeeze=False always returns a 2-D array of Axes; without it a single\n",
    "    # segment yields a bare Axes object and axs[i] fails.\n",
    "    fig, axs = plt.subplots(1, num_segments, figsize=(30, 5), squeeze=False)\n",
    "    axs = axs.ravel()\n",
    "    for i, (group, data) in enumerate(df_filtered.groupby('id_segment')):\n",
    "        data.plot(x='time', y='gl', ax=axs[i], title='Segment {}'.format(group))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# quick look at the first 10 rows of df\n",
    "df.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# plot acf / pacf of random windows taken from the first few valid segments\n",
    "lags = 240         # window length, also the number of ACF lags requested\n",
    "max_segments = 5   # one subplot column per segment\n",
    "num_samples = 10   # random windows drawn per segment\n",
    "# fixed seed so that re-running the cell draws the same windows\n",
    "rng = np.random.default_rng(0)\n",
    "\n",
    "fig, ax = plt.subplots(2, max_segments, figsize=(30, 5))\n",
    "for i, (group, data) in enumerate(df_filtered.groupby('id_segment')):\n",
    "    # only view top 5; stop instead of looping over the remaining segments\n",
    "    if i >= max_segments:\n",
    "        break\n",
    "    gl = data['gl']\n",
    "    # candidate window starts are positions 0 .. len(gl) - lags - 1\n",
    "    num_starts = len(gl) - lags\n",
    "    if num_starts < 1:\n",
    "        print('Segment {} is too short'.format(group))\n",
    "        continue\n",
    "    # sampling without replacement cannot draw more starts than exist,\n",
    "    # so cap the sample size for segments only slightly longer than `lags`\n",
    "    starts = rng.choice(num_starts, size=min(num_samples, num_starts), replace=False)\n",
    "    # plot acf / pacf of each sampled window\n",
    "    for j in starts:\n",
    "        window = gl.iloc[j:j+lags]  # positional slice, independent of the index labels\n",
    "        acf, _ = sm.tsa.stattools.acf(window, nlags=lags, alpha=0.05)\n",
    "        pacf, _ = sm.tsa.stattools.pacf(window, method='ols-adjusted', alpha=0.05)\n",
    "        ax[0, i].plot(acf)\n",
    "        ax[1, i].plot(pacf)\n",
    "    ax[0, i].set_title('ACF: segment {}'.format(group))\n",
    "    ax[1, i].set_title('PACF: segment {}'.format(group))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.1"
  },
  "vscode": {
   "interpreter": {
    "hash": "95662931fb0811c75e2373330a012ba90aa4548ba779055436524c46bd94b0ee"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
