{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": 40,
      "metadata": {},
      "outputs": [],
      "source": [
        "import os\n",
        "\n",
        "# Project root; replace the placeholder with the absolute path of the project root.\n",
        "BASE_DIR = \"ABSOLUTE_PATH_TO_THE_ROOT\"\n",
        "# Passed as `replace_base_dir` when datasets are loaded.\n",
        "DATA_DIR = os.path.join(BASE_DIR, \"data\")\n",
        "# Appended to sys.path below.\n",
        "CODE_DIR = os.path.join(BASE_DIR, \"code\")\n",
        "# Persistence directory handed to ForcastService.\n",
        "FC_DIR = os.path.join(BASE_DIR, 'models_save/fc')\n",
        "# Not referenced by the visible cells of this notebook; presumably the\n",
        "# uncertainty-model counterpart of FC_DIR - TODO confirm.\n",
        "UC_DIR = os.path.join(BASE_DIR, 'models_save/uc')\n",
        "\n",
        "import sys\n",
        "\n",
        "# Runs before the project-local imports further down (loader, config, models, ...),\n",
        "# which presumably live under CODE_DIR and need it on the import path.\n",
        "sys.path.append(CODE_DIR)\n",
        "\n",
        "import os\n",
        "import json\n",
        "from collections import defaultdict\n",
        "from typing import Tuple\n",
        "\n",
        "import numpy as np\n",
        "import pandas as pd\n",
        "from scipy.stats import wasserstein_distance\n",
        "\n",
        "from matplotlib import pyplot as plt\n",
        "import matplotlib.gridspec as grid_spec\n",
        "\n",
        "from tqdm.auto import tqdm\n",
        "\n",
        "from loader.generator import DataGenerator\n",
        "from config import TSDataConfig, TaskConfig\n",
        "from main_utils import _init_fc\n",
        "from omegaconf import DictConfig\n",
        "\n",
        "from models.forcast.darts import SimpleDartsModel\n",
        "from models.forcast.forcast_service import ForcastService\n",
        "from models.forcast.forcast_base import FCPredictionData\n",
        "from models.uncertainty.uc_service import UncertaintyService\n",
        "from models.uncertainty.dist_match.tree import DistMatchQRF\n",
        "from models.uncertainty.dist_match.utils import match_ks_stat\n",
        "from utils.calc_torch import calc_residuals"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 41,
      "metadata": {},
      "outputs": [],
      "source": [
        "def matcher(x1, x2):\n",
        "    \"\"\"Tell whether the value returned by ``match_ks_stat(x1, x2)`` is below 0.1.\"\"\"\n",
        "    ks_stat = match_ks_stat(x1, x2)\n",
        "    return ks_stat < 0.1\n",
        "\n",
        "\n",
        "def get_qrf(path: str) -> DistMatchQRF:\n",
        "    \"\"\"Build a ``DistMatchQRF`` with the notebook's fixed settings and load its trees.\n",
        "\n",
        "    Args:\n",
        "        path: Location handed to ``DistMatchQRF.load_trees``.\n",
        "\n",
        "    Returns:\n",
        "        The forest instance after ``load_trees`` has been called on it.\n",
        "    \"\"\"\n",
        "    settings = dict(\n",
        "        alpha=0.1,\n",
        "        n_quantile_bins=10,\n",
        "        feature_dim=-1,\n",
        "        matcher=matcher,\n",
        "        match_mask=None,\n",
        "        n_trees=10,\n",
        "        bagging_ratio=0.9,\n",
        "        verbose=False,\n",
        "    )\n",
        "    forest = DistMatchQRF(**settings)\n",
        "    forest.load_trees(path)\n",
        "    return forest"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 42,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Notebook-wide task settings: the default source for the TaskConfig built in\n",
        "# load_dataset and the fallback task config of load_forecast_service.\n",
        "TASK_CONFIG = DictConfig(\n",
        "    dict(\n",
        "        task_type=\"PI\",\n",
        "        alpha=0.1,\n",
        "        data_splits=[0.6, 0.15, 0.25],\n",
        "        fc_estimator_mode=\"single\",\n",
        "        global_norm=False,\n",
        "        add_config=None,\n",
        "    )\n",
        ")\n",
        "\n",
        "\n",
        "def load_dataset(data_config: TSDataConfig, task_config: TaskConfig = None):\n",
        "    \"\"\"Load the dataset described by ``data_config`` via ``DataGenerator.get_data``.\n",
        "\n",
        "    Args:\n",
        "        data_config: Dataset description forwarded to the generator.\n",
        "        task_config: Task settings to use. When omitted, a ``TaskConfig`` is\n",
        "            built from the notebook-wide ``TASK_CONFIG``.\n",
        "\n",
        "    Returns:\n",
        "        Whatever ``DataGenerator.get_data`` returns. ``DATA_DIR`` is passed as\n",
        "        ``replace_base_dir`` and all normalisation parameters are ``None``.\n",
        "    \"\"\"\n",
        "    if task_config is None:\n",
        "        task_config = TaskConfig(**TASK_CONFIG)\n",
        "    return DataGenerator.get_data(\n",
        "        data_config=data_config,\n",
        "        # Forward the resolved config. The global TASK_CONFIG used to be passed\n",
        "        # here unconditionally, which silently ignored a caller-supplied config.\n",
        "        task_config=task_config,\n",
        "        replace_base_dir=DATA_DIR,\n",
        "        X_norm_param=None,\n",
        "        Y_norm_param=None,\n",
        "        hydro_static_norm_param=None,\n",
        "    )\n",
        "\n",
        "\n",
        "def load_forecast_service(\n",
        "    fc_model=None, model_config=None, data_config=None, task_config=None\n",
        "):\n",
        "    \"\"\"Create a ``ForcastService`` that uses ``FC_DIR`` as its ``persist_dir``.\n",
        "\n",
        "    Args:\n",
        "        fc_model: Callable (typically a class) that builds the forecast model.\n",
        "            When omitted, ``SimpleDartsModel`` is used together with the\n",
        "            built-in \"darts-forest\" configuration; a ``model_config`` passed\n",
        "            alongside is replaced in that case.\n",
        "        model_config: Keyword arguments for ``fc_model``. Also forwarded to the\n",
        "            service as given.\n",
        "        data_config: Forwarded to the service.\n",
        "        task_config: Task settings; falls back to ``TASK_CONFIG`` when falsy.\n",
        "\n",
        "    Returns:\n",
        "        The configured ``ForcastService`` instance.\n",
        "    \"\"\"\n",
        "    if fc_model is None:\n",
        "        fc_model = SimpleDartsModel\n",
        "        model_config = dict(\n",
        "            model=\"darts-forest\", model_params={\"lags\": 50, \"lags_past_covariates\": 50}\n",
        "        )\n",
        "\n",
        "    task_config = task_config or TASK_CONFIG\n",
        "    # A custom model given without a config must still be constructible: expand\n",
        "    # an empty mapping instead of failing on ``**None`` inside the factory.\n",
        "    model_kwargs = model_config or {}\n",
        "\n",
        "    return ForcastService(\n",
        "        int_fc_model=lambda: fc_model(**model_kwargs),\n",
        "        task_config=task_config,\n",
        "        model_config=model_config,\n",
        "        data_config=data_config,\n",
        "        persist_dir=FC_DIR,\n",
        "    )"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 43,
      "metadata": {},
      "outputs": [],
      "source": [
        "def get_residuals(forcast_service: ForcastService, data, is_calib: bool = False) -> Tuple[np.ndarray, np.ndarray]:\n",
        "    \"\"\"Forecast one series and return ``(residuals, point forecast)``.\n",
        "\n",
        "    The series is first run through ``forcast_service.prepare`` with the\n",
        "    service's task ``alpha``. With ``is_calib=True`` the calibration segment is\n",
        "    predicted from the pre-calibration history (obtained through\n",
        "    ``UncertaintyService._map_to_calib_data``); otherwise the test segment is\n",
        "    predicted from the calibration history.\n",
        "\n",
        "    Returns:\n",
        "        The residuals from ``calc_residuals`` converted with ``.numpy()`` and\n",
        "        the ``point`` attribute of the forecast result.\n",
        "    \"\"\"\n",
        "    alpha = forcast_service._task_config.alpha\n",
        "    prepared = forcast_service.prepare([data], alpha)[0]\n",
        "\n",
        "    if is_calib:\n",
        "        calib = UncertaintyService._map_to_calib_data(prepared)\n",
        "        request = FCPredictionData(\n",
        "            ts_id=calib.ts_id,\n",
        "            X_past=calib.X_pre_calib,\n",
        "            Y_past=calib.Y_pre_calib,\n",
        "            X_step=calib.X_calib,\n",
        "            step_offset=calib.step_offset,\n",
        "        )\n",
        "        target = calib.Y_calib\n",
        "    else:\n",
        "        request = FCPredictionData(\n",
        "            ts_id=prepared.ts_id,\n",
        "            X_past=prepared.X_calib,\n",
        "            Y_past=prepared.Y_calib,\n",
        "            X_step=prepared.X_test,\n",
        "            step_offset=prepared.test_step,\n",
        "        )\n",
        "        target = prepared.Y_test\n",
        "\n",
        "    forecast = forcast_service.predict(request)\n",
        "    residuals = calc_residuals(Y_hat=forecast.point, Y=target).numpy()\n",
        "    return residuals, forecast.point"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 44,
      "metadata": {},
      "outputs": [],
      "source": [
        "from scipy.stats import ks_2samp\n",
        "\n",
        "def test_kernel_stability(residuals, window_size=5, n_trials=200, n_samples=100, rng=None):\n",
        "    \"\"\"Relate the KS distance between residual patches to that of their successors.\n",
        "\n",
        "    Every trial draws two groups of ``n_samples`` start indices without\n",
        "    replacement. Each start index yields a patch of ``window_size`` consecutive\n",
        "    residuals and the single residual right after it. The two-sample KS\n",
        "    statistic is computed per patch position and averaged (patch distance), and\n",
        "    once for the successors (next distance).\n",
        "\n",
        "    Args:\n",
        "        residuals: Residual series, indexed along its first axis.\n",
        "        window_size: Length of each patch.\n",
        "        n_trials: Number of resampling trials.\n",
        "        n_samples: Patches drawn per group and trial.\n",
        "        rng: Optional ``numpy.random.Generator`` / ``RandomState`` for\n",
        "            reproducible draws; NumPy's global random state is used if omitted.\n",
        "\n",
        "    Returns:\n",
        "        One dict per trial whose patch distance exceeds 0.01, with the keys\n",
        "        ``patch_dist``, ``next_dist`` and ``ratio`` (next / patch).\n",
        "\n",
        "    Raises:\n",
        "        ValueError: If the series offers fewer than ``n_samples`` start indices.\n",
        "    \"\"\"\n",
        "    residuals = np.asarray(residuals)\n",
        "    # A start index i is usable while i + window_size is still a valid index,\n",
        "    # i.e. there are len - window_size candidates (the former extra '- 1'\n",
        "    # excluded the last usable start).\n",
        "    n_starts = len(residuals) - window_size\n",
        "    if n_starts < n_samples:\n",
        "        raise ValueError(\n",
        "            f'need at least {n_samples} start positions, got {n_starts} '\n",
        "            f'(len(residuals)={len(residuals)}, window_size={window_size})'\n",
        "        )\n",
        "\n",
        "    sampler = np.random if rng is None else rng\n",
        "    offsets = np.arange(window_size)\n",
        "    ratios = []\n",
        "\n",
        "    for _ in range(n_trials):\n",
        "        idx1 = sampler.choice(n_starts, size=n_samples, replace=False)\n",
        "        idx2 = sampler.choice(n_starts, size=n_samples, replace=False)\n",
        "\n",
        "        # Patches: (n_samples, window_size) for a 1-D residual series\n",
        "        patches_P = residuals[idx1[:, None] + offsets]\n",
        "        patches_Q = residuals[idx2[:, None] + offsets]\n",
        "\n",
        "        # Residual that directly follows each patch\n",
        "        next_P = residuals[idx1 + window_size]\n",
        "        next_Q = residuals[idx2 + window_size]\n",
        "\n",
        "        # Patch distance: mean KS statistic over the positions inside the patch\n",
        "        d_patches = np.mean([\n",
        "            ks_2samp(patches_P[:, pos], patches_Q[:, pos]).statistic\n",
        "            for pos in range(window_size)\n",
        "        ])\n",
        "\n",
        "        # Next distance: KS statistic of the successors\n",
        "        d_next = ks_2samp(next_P, next_Q).statistic\n",
        "\n",
        "        # Near-identical patch groups would blow up the ratio; leave them out.\n",
        "        if d_patches > 0.01:\n",
        "            ratios.append({\n",
        "                'patch_dist': d_patches,\n",
        "                'next_dist': d_next,\n",
        "                'ratio': d_next / d_patches\n",
        "            })\n",
        "\n",
        "    return ratios"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 45,
      "metadata": {},
      "outputs": [],
      "source": [
        "def get_patch_target_ratios(data, forcast_service, percentiles, patch_size, is_calib=True, n_trials=100):\n",
        "    \"\"\"Summarise the next-vs-patch KS ratios of one series.\n",
        "\n",
        "    Residuals are obtained from ``get_residuals`` and passed to\n",
        "    ``test_kernel_stability`` with ``patch_size`` as the window size.\n",
        "\n",
        "    Returns:\n",
        "        ``(np.percentile(ratios, percentiles), np.mean(ratios))`` where\n",
        "        ``ratios`` are the ``'ratio'`` entries of the trial results.\n",
        "    \"\"\"\n",
        "    residuals, _point = get_residuals(forcast_service, data, is_calib=is_calib)\n",
        "    trials = test_kernel_stability(residuals, patch_size, n_trials=n_trials)\n",
        "    # Only the ratio of each trial record is needed for the summary.\n",
        "    ratio_values = [trial['ratio'] for trial in trials]\n",
        "    selected_percentiles = np.percentile(ratio_values, percentiles)\n",
        "    average_ratio = np.mean(ratio_values)\n",
        "    return selected_percentiles, average_ratio"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 46,
      "metadata": {},
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "electric [1.04166667 1.3859852  1.50075758 1.70625   ] 1.0289291296728948\n",
            "(8760, 8)\n",
            "solar [0.97693531 1.54418985 1.8052381  2.07033469] 1.0500202594933468\n",
            "wind [1.01983821 1.44513575 1.57127716 1.71119883] 1.043135469160663\n"
          ]
        }
      ],
      "source": [
        "# Patch (window) sizes to evaluate.\n",
        "PATCHES = [5, 10, 25, 50, 100, 150]\n",
        "# Percentiles of the ratio distribution reported per run.\n",
        "# NOTE(review): the previous call passed PATCHES in the percentile slot and\n",
        "# omitted the required patch_size argument, so it could not execute; these\n",
        "# percentile values are an assumption - confirm.\n",
        "PERCENTILES = [50, 90, 95, 99]\n",
        "# Seed for NumPy's global RNG so the patch resampling is repeatable.\n",
        "SEED = 0\n",
        "# label -> (dataset_type, source paths, index of the prepared series to analyse)\n",
        "DATA_MAP = {\n",
        "    \"Elec\": (\n",
        "        \"electric\",\n",
        "        [\"/some_base_dir/data/enbPI/electricity-normalized.csv\"],\n",
        "        0,\n",
        "    ),\n",
        "    \"Solar\": (\n",
        "        \"solar\",\n",
        "        [\"/some_base_dir/data/enbPI/Solar_Atl_data_aligned.csv\"],\n",
        "        0,\n",
        "    ),\n",
        "    \"Wind\": (\n",
        "        \"wind\",\n",
        "        [\"/some_base_dir/data/enbPI/Wind_Hackberry_Generation_2019_2020.csv\"],\n",
        "        0,\n",
        "    ),\n",
        "}\n",
        "\n",
        "np.random.seed(SEED)\n",
        "\n",
        "for label, (datatype, data_paths, file_idx) in DATA_MAP.items():\n",
        "    data_config = DictConfig(\n",
        "        {\"dataset_type\": datatype, \"paths\": data_paths, \"add_config\": None}\n",
        "    )\n",
        "    prepared = load_dataset(TSDataConfig(**data_config))\n",
        "    forcast_service = load_forecast_service(data_config=data_config)\n",
        "    # One summary line per dataset and patch size.\n",
        "    for patch_size in PATCHES:\n",
        "        ratios, mean_ratio = get_patch_target_ratios(\n",
        "            prepared[file_idx],\n",
        "            forcast_service,\n",
        "            PERCENTILES,\n",
        "            patch_size,\n",
        "        )\n",
        "        print(datatype, patch_size, ratios, mean_ratio)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": []
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "distmatch",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.10.13"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 2
}
