{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "01f1a95e",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
      "[nltk_data]   Package stopwords is already up-to-date!\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import os\n",
    "import fitz\n",
    "import sys\n",
    "import re\n",
    "import json\n",
    "from datetime import datetime\n",
    "from typing import Optional, List, Callable, Any, Tuple, Dict, Union\n",
    "from abc import abstractmethod, ABC\n",
    "import random\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import copy\n",
    "import nltk\n",
    "from nltk.corpus import stopwords\n",
    "import pickle\n",
    "import itertools\n",
    "from dataclasses import dataclass, asdict\n",
    "from enum import Enum\n",
    "from tqdm import tqdm\n",
    "from dotenv import load_dotenv\n",
    "\n",
    "sys.path.append(\"../\")\n",
    "\n",
    "load_dotenv(dotenv_path=\"../.env\")\n",
    "nltk.download('stopwords')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "7d0b1e38",
   "metadata": {},
   "outputs": [],
   "source": [
    "from utils import file_handle\n",
    "from skyspark.utils.sky_spark_wrapper import Sensor, AssetDescription\n",
    "from utils import file_handle\n",
    "from utils import tree\n",
    "from utils.tree import Node\n",
    "from dataset_utils.reader import ADIQDataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "86205ae8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Static inputs read through the project's file_handle helpers (paths are relative to the notebook).\n",
    "asset_desc = file_handle.load_json(\"skyspark/data3/asset_desc.json\")\n",
    "sensor_data = file_handle.load_json(\"skyspark/data3/sensors.json\")\n",
    "extracted_sensors = file_handle.load_json(\"skyspark/extracted/extracted_sensors_llm.json\")\n",
    "\n",
    "\n",
    "# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.\n",
    "ds = file_handle.load_pickle(\"extracted/TreeStruct.pkl\")\n",
    "# Re-key the pickled rule set by each rule's '#n' field; this replaces the raw object held in `ds`.\n",
    "ds = {v['#n']:v for v in ds['rule_set']}\n",
    "dataset = ADIQDataset(\"../dataset/datasets/simpleV3.1\")\n",
    "\n",
    "# Iterated by search_sensor_map() when looking up a (rule, condition, entity) triple.\n",
    "sensor_map = file_handle.load_jsonl(\"skyspark/extracted/sensor-map.jsonl\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "ae05e55d",
   "metadata": {},
   "outputs": [],
   "source": [
    "MAIN_DIR = '/mnt/data/DiagIQ'\n",
    "RAW_DATA = os.path.join(MAIN_DIR, 'processed', 'raw')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "126d5515",
   "metadata": {},
   "outputs": [],
   "source": [
    "temp = copy.deepcopy(extracted_sensors)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "6ecbbcd6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Index every raw observation file by its name without \".json\"; the \"_\"-separated parts of\n",
    "# that name are zipped onto (site_name, asset_name, sensor_name), so extra parts are dropped.\n",
    "available_obs = {\n",
    "    stem: dict(zip(['site_name', 'asset_name', \"sensor_name\"], stem.split(\"_\")))\n",
    "    for stem in (fname.replace(\".json\", \"\") for fname in os.listdir(RAW_DATA))\n",
    "}\n",
    "\n",
    "# Re-key the extracted sensors as \"<rule id>_<original text>\"; the previous key is stored under\n",
    "# \"id\" and its prefix under \"rule_id\" (fields already present in the record take precedence).\n",
    "extracted_sensors = {\n",
    "    f\"{sensor_id.split('_')[0]}_{details['original']}\": {\n",
    "        \"id\": sensor_id,\n",
    "        \"rule_id\": sensor_id.split(\"_\")[0],\n",
    "        **details,\n",
    "    }\n",
    "    for sensor_id, details in extracted_sensors.items()\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "3e9ab8d5",
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import Counter\n",
    "\n",
    "def search_sensor_map(rule, cond, entity):\n",
    "    \"\"\"Look up the single `sensor_map` record for a (rule, condition, entity) triple.\n",
    "\n",
    "    `rule` is coerced to int and compared with the integer prefix of each\n",
    "    record's \"id\" (the text before the first \"_\"); `cond` and `entity` must\n",
    "    equal the record's 'condition' and 'entity' values.\n",
    "\n",
    "    Returns the record when exactly one matches (after printing it), None when\n",
    "    nothing matches, and raises ValueError when several records match.\n",
    "    \"\"\"\n",
    "    rule_no = int(rule)\n",
    "\n",
    "    hits = []\n",
    "    for record in sensor_map:\n",
    "        record_rule = int(record[\"id\"].split(\"_\")[0])\n",
    "        record_cond = record['condition']\n",
    "        record_entity = record['entity']\n",
    "        if record_rule == rule_no and record_cond == cond and record_entity == entity:\n",
    "            hits.append(record)\n",
    "\n",
    "    if not hits:\n",
    "        return None\n",
    "    if len(hits) > 1:\n",
    "        raise ValueError(\"can't happen check\")\n",
    "\n",
    "    print(rule_no, cond, entity, hits[0])\n",
    "    return hits[0]\n",
    "    \n",
    "def sensor_ranking(req_sen:List[str], availble_sen:List[str]):\n",
    "    \"\"\"Rank the available sensor names against each requested name.\n",
    "\n",
    "    Both lists are encoded with SEMANTIC_MODEL; the dot-product scores are\n",
    "    soft-maxed per requested name, and for every requested name the available\n",
    "    names are returned in descending score order (one list per requested name).\n",
    "    \"\"\"\n",
    "    query_vecs = SEMANTIC_MODEL.encode(req_sen)\n",
    "    candidate_vecs = SEMANTIC_MODEL.encode(availble_sen)\n",
    "    candidates = np.array(availble_sen)\n",
    "\n",
    "    # Row-wise softmax over the raw dot products (shifted by the row maximum before exp).\n",
    "    scores = query_vecs @ candidate_vecs.T\n",
    "    scores = np.exp(scores - np.max(scores, axis=-1, keepdims=True))\n",
    "    scores = np.divide(scores, np.sum(scores, axis=-1, keepdims=True))\n",
    "\n",
    "    # argsort is ascending, so reverse every row to get best-first order.\n",
    "    best_first = np.argsort(scores, axis=-1)[:, ::-1]\n",
    "    return [candidates[order].tolist() for order in best_first]\n",
    "    \n",
    "\n",
    "def word_to_char_vector(word):\n",
    "    return Counter(word)\n",
    "\n",
    "def manhattan_distance(a, b):\n",
    "    \"\"\"Return the L1 distance between the character-count vectors of `a` and `b`.\n",
    "\n",
    "    Character order is ignored, so two anagrams are at distance 0.\n",
    "    \"\"\"\n",
    "    counts_a = word_to_char_vector(a)\n",
    "    counts_b = word_to_char_vector(b)\n",
    "    # A Counter yields 0 for absent characters, so indexing over the union of keys is safe.\n",
    "    every_char = counts_a.keys() | counts_b.keys()\n",
    "    return sum(abs(counts_a[ch] - counts_b[ch]) for ch in every_char)\n",
    "\n",
    "def match_entity_rank(entity, rank0):\n",
    "    def match_entity(ent, ran):\n",
    "        if ent == ran:\n",
    "            return ran\n",
    "        \n",
    "        if ent in ran or ran in ent:\n",
    "            return ran\n",
    "        \n",
    "        if manhattan_distance(ent, ran)/ len(ent) < 0.4:\n",
    "            return ran\n",
    "        \n",
    "        return None\n",
    "        \n",
    "\n",
    "\n",
    "    data = []\n",
    "    for e,r in zip(entity, rank0):\n",
    "        data.append(match_entity(e,r))\n",
    "\n",
    "    return data\n",
    "\n",
    "def search_sensors_in_available(sensors, sen_n):\n",
    "    \"\"\"Match each requested sensor name to the closest sensor available on an asset.\n",
    "\n",
    "    The asset name is cut off the front of every available sensor name, the\n",
    "    shortened names are ranked against `sen_n` with `sensor_ranking`, and the top\n",
    "    candidate for each requested name is vetted by `match_entity_rank`.\n",
    "    Returns one entry per requested name: the accepted candidate or None.\n",
    "    \"\"\"\n",
    "    short_names = []\n",
    "    for sensor in sensors:\n",
    "        # Keep only the text after the last occurrence of the asset name.\n",
    "        suffix = sensor['sensor_name'].split(sensor['asset_name'])[-1]\n",
    "        short_names.append(suffix.strip())\n",
    "\n",
    "    ranked = sensor_ranking(sen_n, short_names)\n",
    "    print(sen_n, ranked)\n",
    "\n",
    "    top_candidates = [candidates[0] for candidates in ranked]\n",
    "    return match_entity_rank(sen_n, top_candidates)\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "41748266",
   "metadata": {},
   "outputs": [],
   "source": [
    "rule_forms = file_handle.load_json(\"temp_rule_forms_llm.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "4c3f2817",
   "metadata": {},
   "outputs": [],
   "source": [
    "# For every (rule form, asset) pair, record which required sensors are missing and\n",
    "# whether the rule can be evaluated on that asset (all_available = 1) or not (0).\n",
    "doable_rule_asset_comb = []\n",
    "\n",
    "for id_r, ruv in enumerate(rule_forms):\n",
    "    for iak, iav in ruv['analysis'].items():\n",
    "        # Assets without any analysed condition are left out of the summary.\n",
    "        # NOTE(review): the original set all_available = 0 here and then skipped the append;\n",
    "        # confirm that dropping these rows (rather than recording them as 0) is intended.\n",
    "        if len(iav) < 1:\n",
    "            continue\n",
    "\n",
    "        # Asset keys are \"<site>_<asset>\"; split once instead of twice per observation file.\n",
    "        _key_parts = iak.split(\"_\")\n",
    "        _site, _asset = _key_parts[0], _key_parts[1]\n",
    "        sen_is = [\n",
    "            x for x in available_obs.values()\n",
    "            if x['asset_name'] == _asset and x['site_name'] == _site\n",
    "        ]\n",
    "\n",
    "        info = {\n",
    "            \"rule_id\": id_r,\n",
    "            \"rule\": ruv,\n",
    "            \"asset_name\":iak,\n",
    "            \"asset_type\":ruv['asset_type'],\n",
    "            \"sensors\": [x['sensor_name'].split(x['asset_name'])[-1].strip() for x in sen_is],\n",
    "            \"missing\":[]\n",
    "        }\n",
    "\n",
    "        flag = True\n",
    "        for cond_id, entity in iav.items():\n",
    "            # A condition with no sensors at all cannot be evaluated.\n",
    "            if len(entity) < 1:\n",
    "                flag = False\n",
    "            for sen, f in entity.items():\n",
    "                # \"Run Status\" is never counted as missing.\n",
    "                if sen == \"Run Status\":\n",
    "                    continue\n",
    "                if not f:\n",
    "                    info['missing'].append(sen)\n",
    "                    flag = False\n",
    "\n",
    "        info[\"all_available\"] = 1 if flag else 0\n",
    "        doable_rule_asset_comb.append(info)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "16fc8462",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>rule_id</th>\n",
       "      <th>rule</th>\n",
       "      <th>asset_name</th>\n",
       "      <th>asset_type</th>\n",
       "      <th>sensors</th>\n",
       "      <th>missing</th>\n",
       "      <th>all_available</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>{'asset_type': 'AHU', 'conditions': ['AHU Runn...</td>\n",
       "      <td>POKMAIN_B003 AHU 3-3-4</td>\n",
       "      <td>AHU</td>\n",
       "      <td>[Occupied Command, Schedule, Power (Calc), Sup...</td>\n",
       "      <td>[Outside Air Temperature, Setpoint Temperature...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>{'asset_type': 'AHU', 'conditions': ['AHU Runn...</td>\n",
       "      <td>POKMAIN_B002 AHU 2-3-8</td>\n",
       "      <td>AHU</td>\n",
       "      <td>[Duct Static Pressure, Zone Relative Humidity ...</td>\n",
       "      <td>[Return Air Enthalpy]</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>{'asset_type': 'AHU', 'conditions': ['AHU Runn...</td>\n",
       "      <td>POKMAIN_B705 AHU 1</td>\n",
       "      <td>AHU</td>\n",
       "      <td>[Cooling Valve %, Supply Relative Humidity Set...</td>\n",
       "      <td>[Outside Air Temperature, Return Air Enthalpy]</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>{'asset_type': 'AHU', 'conditions': ['AHU Runn...</td>\n",
       "      <td>POKMAIN_B004 AHU 4-1-5</td>\n",
       "      <td>AHU</td>\n",
       "      <td>[Schedule, Supply Fan Status, Occupied Command...</td>\n",
       "      <td>[Outside Air Temperature, Setpoint Temperature...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>{'asset_type': 'AHU', 'conditions': ['AHU Runn...</td>\n",
       "      <td>POKMAIN_B004 AHU 4-2-2</td>\n",
       "      <td>AHU</td>\n",
       "      <td>[Schedule, Occupied Command, Power (Calc), Sup...</td>\n",
       "      <td>[Outside Air Temperature, Setpoint Temperature...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11952</th>\n",
       "      <td>190</td>\n",
       "      <td>{'asset_type': 'Chiller', 'conditions': ['Chil...</td>\n",
       "      <td>POKMAIN_Chiller 1</td>\n",
       "      <td>Chiller</td>\n",
       "      <td>[Chiller % Loaded, Run Status, Chiller Efficie...</td>\n",
       "      <td>[]</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11953</th>\n",
       "      <td>190</td>\n",
       "      <td>{'asset_type': 'Chiller', 'conditions': ['Chil...</td>\n",
       "      <td>POKMAIN_Chiller 3</td>\n",
       "      <td>Chiller</td>\n",
       "      <td>[Liquid Refrigerant Evaporator Temperature, Su...</td>\n",
       "      <td>[]</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11954</th>\n",
       "      <td>190</td>\n",
       "      <td>{'asset_type': 'Chiller', 'conditions': ['Chil...</td>\n",
       "      <td>POKMAIN_Chiller 4</td>\n",
       "      <td>Chiller</td>\n",
       "      <td>[Setpoint Temperature, Condenser Water Flow, T...</td>\n",
       "      <td>[]</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11955</th>\n",
       "      <td>190</td>\n",
       "      <td>{'asset_type': 'Chiller', 'conditions': ['Chil...</td>\n",
       "      <td>POKMAIN_Chiller 13</td>\n",
       "      <td>Chiller</td>\n",
       "      <td>[Condenser Water Return To Tower Temperature, ...</td>\n",
       "      <td>[]</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11956</th>\n",
       "      <td>190</td>\n",
       "      <td>{'asset_type': 'Chiller', 'conditions': ['Chil...</td>\n",
       "      <td>POKMAIN_Chiller 14</td>\n",
       "      <td>Chiller</td>\n",
       "      <td>[Power Input, Supply Temperature, Condenser Wa...</td>\n",
       "      <td>[]</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>11957 rows × 7 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       rule_id                                               rule  \\\n",
       "0            0  {'asset_type': 'AHU', 'conditions': ['AHU Runn...   \n",
       "1            0  {'asset_type': 'AHU', 'conditions': ['AHU Runn...   \n",
       "2            0  {'asset_type': 'AHU', 'conditions': ['AHU Runn...   \n",
       "3            0  {'asset_type': 'AHU', 'conditions': ['AHU Runn...   \n",
       "4            0  {'asset_type': 'AHU', 'conditions': ['AHU Runn...   \n",
       "...        ...                                                ...   \n",
       "11952      190  {'asset_type': 'Chiller', 'conditions': ['Chil...   \n",
       "11953      190  {'asset_type': 'Chiller', 'conditions': ['Chil...   \n",
       "11954      190  {'asset_type': 'Chiller', 'conditions': ['Chil...   \n",
       "11955      190  {'asset_type': 'Chiller', 'conditions': ['Chil...   \n",
       "11956      190  {'asset_type': 'Chiller', 'conditions': ['Chil...   \n",
       "\n",
       "                   asset_name asset_type  \\\n",
       "0      POKMAIN_B003 AHU 3-3-4        AHU   \n",
       "1      POKMAIN_B002 AHU 2-3-8        AHU   \n",
       "2          POKMAIN_B705 AHU 1        AHU   \n",
       "3      POKMAIN_B004 AHU 4-1-5        AHU   \n",
       "4      POKMAIN_B004 AHU 4-2-2        AHU   \n",
       "...                       ...        ...   \n",
       "11952       POKMAIN_Chiller 1    Chiller   \n",
       "11953       POKMAIN_Chiller 3    Chiller   \n",
       "11954       POKMAIN_Chiller 4    Chiller   \n",
       "11955      POKMAIN_Chiller 13    Chiller   \n",
       "11956      POKMAIN_Chiller 14    Chiller   \n",
       "\n",
       "                                                 sensors  \\\n",
       "0      [Occupied Command, Schedule, Power (Calc), Sup...   \n",
       "1      [Duct Static Pressure, Zone Relative Humidity ...   \n",
       "2      [Cooling Valve %, Supply Relative Humidity Set...   \n",
       "3      [Schedule, Supply Fan Status, Occupied Command...   \n",
       "4      [Schedule, Occupied Command, Power (Calc), Sup...   \n",
       "...                                                  ...   \n",
       "11952  [Chiller % Loaded, Run Status, Chiller Efficie...   \n",
       "11953  [Liquid Refrigerant Evaporator Temperature, Su...   \n",
       "11954  [Setpoint Temperature, Condenser Water Flow, T...   \n",
       "11955  [Condenser Water Return To Tower Temperature, ...   \n",
       "11956  [Power Input, Supply Temperature, Condenser Wa...   \n",
       "\n",
       "                                                 missing  all_available  \n",
       "0      [Outside Air Temperature, Setpoint Temperature...              0  \n",
       "1                                  [Return Air Enthalpy]              0  \n",
       "2         [Outside Air Temperature, Return Air Enthalpy]              0  \n",
       "3      [Outside Air Temperature, Setpoint Temperature...              0  \n",
       "4      [Outside Air Temperature, Setpoint Temperature...              0  \n",
       "...                                                  ...            ...  \n",
       "11952                                                 []              1  \n",
       "11953                                                 []              1  \n",
       "11954                                                 []              1  \n",
       "11955                                                 []              1  \n",
       "11956                                                 []              1  \n",
       "\n",
       "[11957 rows x 7 columns]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "analysis = pd.DataFrame.from_records(doable_rule_asset_comb)\n",
    "analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "f31bbdc5",
   "metadata": {},
   "outputs": [],
   "source": [
    "#analysis.to_csv(\"sensor_missing.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "37a4323f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "all_available\n",
       "0    11163\n",
       "1      794\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "analysis['all_available'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "502735a6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "asset_type\n",
       "AHU               0.034455\n",
       "Air Compressor    0.625000\n",
       "Boiler            0.560000\n",
       "CRAC              0.531034\n",
       "Chiller           0.553030\n",
       "Cooling Tower     0.200000\n",
       "HXU               0.000000\n",
       "Pump              0.000000\n",
       "Name: all_available, dtype: float64"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "analysis.groupby(['asset_type'])['all_available'].agg('mean')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "9729de12",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "asset_type\n",
       "AHU               379\n",
       "CRAC              308\n",
       "Chiller            73\n",
       "Boiler             28\n",
       "Air Compressor      5\n",
       "Cooling Tower       1\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "analysis.loc[analysis['all_available'] == 1, 'asset_type'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "afcfd8f9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "rule_id\n",
       "170    100\n",
       "150     56\n",
       "42      56\n",
       "112     56\n",
       "29      37\n",
       "2       37\n",
       "5       36\n",
       "11      36\n",
       "94      34\n",
       "106     34\n",
       "55      33\n",
       "105     31\n",
       "138     29\n",
       "9       29\n",
       "8       28\n",
       "30      25\n",
       "39      23\n",
       "67      11\n",
       "27      11\n",
       "14      11\n",
       "190     11\n",
       "184     11\n",
       "47      11\n",
       "91       7\n",
       "73       7\n",
       "78       4\n",
       "71       4\n",
       "80       4\n",
       "146      4\n",
       "165      4\n",
       "118      4\n",
       "133      4\n",
       "114      2\n",
       "65       2\n",
       "13       1\n",
       "100      1\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "analysis.loc[analysis['all_available'] == 1, 'rule_id'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "c678a367",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['rule_id', 'rule', 'asset_name', 'asset_type', 'sensors', 'missing',\n",
       "       'all_available'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "analysis.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "ff3a10db",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sentence_transformers import SentenceTransformer, util\n",
    "\n",
    "SEMANTIC_MODEL = SentenceTransformer(\"all-mpnet-base-v2\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "85d179e2",
   "metadata": {},
   "outputs": [],
   "source": [
    "capable_rule_form_and_asset = list(analysis.loc[analysis['all_available'] == 1, ['rule', 'asset_name']].to_dict('index').values())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dc56f1fe",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/root/projects/AssetDiagnosisIQ/.conda/lib/python3.11/site-packages/ibm_watsonx_ai/foundation_models/utils/utils.py:436: LifecycleWarning: Model 'mistralai/mistral-large' is in deprecated state from 2025-07-09 until 2025-10-08. IDs of alternative models: mistralai/mistral-medium-2505. Further details: https://dataplatform.cloud.ibm.com/docs/content/wsj/analyze-data/fm-model-lifecycle.html?context=wx&audience=wdp\n",
      "  warn(model_state_warning, category=LifecycleWarning)\n"
     ]
    }
   ],
   "source": [
    "from datetime import datetime\n",
    "\n",
    "def NATURAL_LANGUAGE_PROMPT(condition,param_map):\n",
    "    PROMPT = f\"\"\"\n",
    "    The objective is to create a formal mathematical representation given the parameters and natural language representation of the function.\n",
    "    the parameter map is given in the form of <term>:<mapped-to> in the (### Param maps). replace the <term> in the (### Natural Language representation) \n",
    "    with <mapped-to>. YOU MAY BE GIVEN EXTRA PARAMETERS IN THE MAP\n",
    "\n",
    "    # Example 1\n",
    "\n",
    "    ### Natural Language representation:\n",
    "    Supply Fan Output % = Daily Average Supply Fan Output % \n",
    "\n",
    "    ### Param maps:\n",
    "    {str({'Supply Fan Output %': 'X0', 'Daily Average Supply Fan Output %': 'X1'})}\n",
    "\n",
    "    Mathematical Representation: X0 = X1\n",
    "\n",
    "    # Example 2\n",
    "\n",
    "    ### Natural Language representation:\n",
    "    Chiller Efficiency > Asset Chiller Efficiency Parameter\n",
    "\n",
    "    ### Param maps:\n",
    "    {str({'Chiller Efficiency': 'X0', 'Asset Chiller Efficiency Parameter': 'X1'})}\n",
    "\n",
    "    Mathematical Representation: X0 > X1\n",
    "\n",
    "    # Example 3\n",
    "\n",
    "    ### Natural Language representation:\n",
    "    ABS(Supply Air Temperature Setpoint - Supply Air Temperature) > 3 IF X0 Reporting \n",
    "\n",
    "    ### Param maps:\n",
    "    {str({'Setpoint': 'X0', 'Supply Air Temperature Setpoint': 'X1', 'Supply Air Temperature': 'X2'})}\n",
    "\n",
    "    Mathematical Representation: X0 = X1\n",
    "\n",
    "\n",
    "    ### Natural Language representation:\n",
    "    {condition}\n",
    "\n",
    "    ### Param maps: \n",
    "    {param_map}\n",
    "\n",
    "    Mathematical Representation:\n",
    "    \"\"\"\n",
    "\n",
    "    return PROMPT\n",
    "\n",
    "def FUNC_CREATION_PROMPT(condition,param_map):\n",
    "    PROMPT = f\"\"\"\n",
    "    The objective is to create a python function given the variables of float (### Params), the natural language instruction (### Natural Language representation)\n",
    "    please only provide the code no explanations as we will directly use it in a pipeline\n",
    "\n",
    "    # Example 1\n",
    "\n",
    "    ### Natural Language representation:\n",
    "    X1 - X0 > 4 °F\n",
    "\n",
    "    ### Params:\n",
    "    {str(['X0','X1'])}\n",
    "\n",
    "    PYTHON CODE: \n",
    "    def condition(X0, X1):\n",
    "        return X1 - X0 > 4\n",
    "\n",
    "    # Example 2\n",
    "\n",
    "    ### Natural Language representation:\n",
    "    X0 NOT = 100%\n",
    "\n",
    "    ### Params:\n",
    "    {str(['X0'])}\n",
    "\n",
    "    PYTHON CODE: \n",
    "    def condition(X0: float) -> bool:\n",
    "        return X0 != 100\n",
    "\n",
    "    # Example 3\n",
    "\n",
    "    ### Natural Language representation:\n",
    "    Abs(X1 - X0) > 0.2 in H2O IF Reporting\n",
    "\n",
    "    ### Params:\n",
    "    {str(['X0','X1'])}\n",
    "\n",
    "    PYTHON CODE: \n",
    "    def condition(X0: float, X1: float) -> bool:\n",
    "        if X0 and X1:\n",
    "            return abs(X1 - X0) > 0.2\n",
    "        else:\n",
    "            return False\n",
    "\n",
    "\n",
    "    ### Natural Language representation:\n",
    "    {condition}\n",
    "\n",
    "    ### Param maps: \n",
    "    {param_map}\n",
    "\n",
    "    PYTHON CODE: \n",
    "    \"\"\"\n",
    "\n",
    "    return PROMPT\n",
    "\n",
    "class MismatchNL2FUNC(Exception):\n",
    "    \"\"\"\n",
    "    Exception raised when a natural-language description cannot be\n",
    "    mapped to any existing function signature.\n",
    "    \"\"\"\n",
    "    def __init__(self, nl_description: str, message: Optional[str] = None):\n",
    "        if message is None:\n",
    "            message = f\"Cannot match NL description to function: {nl_description!r}\"\n",
    "        super().__init__(message)\n",
    "        self.nl_description = nl_description\n",
    "\n",
    "\n",
    "def natural_language_to_function_deterministic(cond, params):\n",
    "    clean = copy.deepcopy(cond)\n",
    "\n",
    "    clean = clean.replace(\"  \", \" \")\n",
    "    clean = clean.replace('OAT', \"Outside Air Temparature(OAT)\")\n",
    "    clean = clean.replace('Subtype is', 'SubType Is')\n",
    "    clean = clean.replace('Not  Reporting', 'NOT Reporting')\n",
    "    clean = clean.replace('Not Reporting', 'NOT Reporting')\n",
    "    clean = clean.replace('Subtype', 'SubType')\n",
    "    clean = clean.replace('OAH% between 27 and 55', '(OAH% > 27) AND (OAH% <55)')\n",
    "    clean = clean.replace('OAH% between 27 and 55', '(OAH% > 27) AND (OAH% <55)')\n",
    "    \n",
    "    param_string = 'X'\n",
    "    _map = {}\n",
    "\n",
    "    params = sorted(params, key=len, reverse=True)\n",
    "    for i,p in enumerate(params):\n",
    "        _map[p] = f\"{param_string}{i}\"\n",
    "        clean = clean.replace(p,f\"{param_string}{i}\")\n",
    "\n",
    "    for v in _map.values():\n",
    "        if v not in clean:\n",
    "            print(cond, clean, _map)\n",
    "            raise MismatchNL2FUNC(\"{} : {} : {}\".format(cond, clean, _map))\n",
    "    return clean, _map\n",
    "\n",
    "def unit_func(X0):\n",
    "    \"\"\"Constant condition: ignore `X0` and always return 1.\n",
    "\n",
    "    `command_to_python` returns this as the callable for rules whose only\n",
    "    parameter is 'Run Status'.\n",
    "    \"\"\"\n",
    "    return 1\n",
    "\n",
    "\n",
    "def natural_language_to_function_llm(cond, params):\n",
    "    param_string = 'X'\n",
    "    _map = {}\n",
    "\n",
    "    for i,p in enumerate(params):\n",
    "        _map[p] = f\"{param_string}{i}\"\n",
    "        \n",
    "\n",
    "def command_to_python(cond, params):\n",
    "    \"\"\"Turn a natural-language condition into a Python callable plus its bookkeeping.\n",
    "\n",
    "    Returns a 4-tuple: the callable, the string returned alongside it by\n",
    "    `function_to_python_code`, the placeholder form of the condition, and the\n",
    "    parameter -> placeholder map.\n",
    "\n",
    "    Rules whose only parameter is 'Run Status' short-circuit to `unit_func` with\n",
    "    the literal \"funcstr\" in both string slots; everything else goes through\n",
    "    `natural_language_to_function_deterministic` and `function_to_python_code`.\n",
    "    \"\"\"\n",
    "    run_status_only = params == ['Run Status']\n",
    "    if run_status_only:\n",
    "        return unit_func, \"funcstr\", \"funcstr\", {'Run Status': 'X0'}\n",
    "\n",
    "    placeholder_cond, placeholders = natural_language_to_function_deterministic(cond, params)\n",
    "    compiled, code_str = function_to_python_code(placeholder_cond, placeholders)\n",
    "    return compiled, code_str, placeholder_cond, placeholders\n",
    "    \n",
    "from benchmarking.bench_utils.inference_calls import LLMConfiguration, ModelConfig, MODEL_MAP\n",
    "import ast\n",
    "\n",
    "def is_safe_function(func_str):\n",
    "    tree = ast.parse(func_str)\n",
    "    for node in ast.walk(tree):\n",
    "        if isinstance(node, ast.Import) or isinstance(node, ast.ImportFrom):\n",
    "            return False\n",
    "    return True\n",
    "\n",
    "# LLM client that function_to_python_code() queries for generated source.\n",
    "model_config = ModelConfig(\n",
    "    name='mistral-large',\n",
    "    identifier='mistralai/mistral-large',\n",
    ")\n",
    "client = LLMConfiguration(model_config)\n",
    "\n",
    "def function_to_python_code(function_str:str, param:dict):\n",
    "    \"\"\"Ask the LLM for a Python `condition` function implementing `function_str`.\n",
    "\n",
    "    Args:\n",
    "        function_str: condition written with placeholders (e.g. \"X0 > X1\").\n",
    "        param: mapping whose values are the placeholder names; only the values\n",
    "            are sent to the model.\n",
    "\n",
    "    Returns:\n",
    "        ``(func, function_str)``: the compiled `condition` callable (None when the\n",
    "        generated code has an indentation error) and the *input* `function_str`.\n",
    "        NOTE(review): the second element is the input text, not the generated\n",
    "        source -- confirm that callers expect this.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: the model returned an empty response.\n",
    "        AssertionError: the generated code failed `is_safe_function`.\n",
    "        SyntaxError: the generated code is not valid Python (other than an\n",
    "            indentation problem, which is reported and yields None).\n",
    "        KeyError: the generated code does not define `condition`.\n",
    "    \"\"\"\n",
    "    prompt = FUNC_CREATION_PROMPT(function_str, list(param.values()))\n",
    "    response = client.get_response(prompt)\n",
    "\n",
    "    if not response:\n",
    "        raise ValueError(\"Can't happen\")\n",
    "\n",
    "    # Keep only what follows the last \"PYTHON CODE:\" marker if the model echoed the prompt.\n",
    "    if \"PYTHON CODE:\" in response:\n",
    "        response = response.split(\"PYTHON CODE:\")[-1]\n",
    "\n",
    "    try:\n",
    "        response_strip = response.strip()\n",
    "        # Explicit raise rather than `assert`: assert statements are removed under\n",
    "        # `python -O`, which would let unscreened code reach exec().\n",
    "        if not is_safe_function(response_strip):\n",
    "            raise AssertionError(\"generated code failed the safety check\")\n",
    "        # SECURITY: this executes model-generated code; is_safe_function is a static screen only.\n",
    "        # A single namespace serves as globals and locals so that helpers defined\n",
    "        # next to `condition` are visible from inside it.\n",
    "        namespace = {}\n",
    "        exec(response_strip, namespace)\n",
    "        func_exec = namespace['condition']\n",
    "\n",
    "    except IndentationError:\n",
    "        # Show the raw response for debugging and signal failure with None.\n",
    "        print(response)\n",
    "        func_exec = None\n",
    "\n",
    "    return func_exec, function_str"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "98dcc93c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Derive the processed-data layout from MAIN_DIR (defined in the configuration cell above)\n",
    "# instead of repeating the absolute path; the resulting values are unchanged.\n",
    "PROCESSED_PATH = os.path.join(MAIN_DIR, 'processed')\n",
    "INPUT_DATA = os.path.join(PROCESSED_PATH, 'raw')\n",
    "ANNOTATED_PATH = os.path.join(PROCESSED_PATH, 'annotated')\n",
    "# Create the output folder for annotated data if it does not exist yet.\n",
    "os.makedirs(ANNOTATED_PATH, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "b1101a7e",
   "metadata": {},
   "outputs": [],
   "source": [
    "from datetime import datetime, timedelta\n",
    "import bisect\n",
    "\n",
    "def load_data_observations(matches, record, folder):\n",
    "    [site_n, asset_n] = record['asset_name'].split(\"_\")\n",
    "\n",
    "    total_data = {}\n",
    "    for sensor_n in matches:\n",
    "        _name = os.path.join(\n",
    "            folder,\n",
    "            \"{}.json\".format('_'.join(\n",
    "                [site_n, asset_n, \"{a} {b}\".format(a = asset_n, b = sensor_n)]\n",
    "                )))\n",
    "        \n",
    "        total_data[sensor_n] = file_handle.load_json(\n",
    "            _name\n",
    "        )\n",
    "        \n",
    "    return total_data\n",
    "\n",
    "def segment_indexes_by_two_hours(start_time: datetime,\n",
    "                                 end_time: datetime,\n",
    "                                 timestamps: list[datetime]) -> list[tuple[datetime, datetime, list[int]]]:\n",
    "    \"\"\"\n",
    "    Divide [start_time, end_time) into 2-hour windows and return, for each window,\n",
    "    the list of indexes (into `timestamps`) whose values fall within that window.\n",
    "\n",
    "    Assumes:\n",
    "      - `timestamps` is sorted in ascending order.\n",
    "      - All timestamps are between start_time and end_time, or outside\n",
    "        (they’ll simply fall before the first or after the last window).\n",
    "\n",
    "    Returns a list of tuples:\n",
    "      (window_start, window_end, [idx1, idx2, …])\n",
    "    \"\"\"\n",
    "    segments = []\n",
    "    current = start_time\n",
    "    # timestamps must be sorted; if not, sort a copy\n",
    "    ts = timestamps\n",
    "\n",
    "    while current < end_time:\n",
    "        window_end = current + timedelta(hours=2)\n",
    "        if window_end > end_time:\n",
    "            window_end = end_time\n",
    "\n",
    "        # bisect to find the slice of ts in [current, window_end)\n",
    "        lo = bisect.bisect_left(ts, current)\n",
    "        hi = bisect.bisect_left(ts, window_end)\n",
    "\n",
    "        # indexes into the original list\n",
    "        idxs = list(range(lo, hi))\n",
    "        segments.append((current, window_end, idxs))\n",
    "\n",
    "        current = window_end\n",
    "\n",
    "    return segments\n",
    "\n",
    "def data_agg(data:List[Union[int, float]]):\n",
    "    return np.array(data).mean()\n",
    "    \n",
    "\n",
    "def build_alarm_timeline(data:Dict, func, params, matched):\n",
    "\n",
    "    time_info = {}\n",
    "    for m in matched:\n",
    "\n",
    "        for o in data[m]:\n",
    "            if isinstance(o['timestamp'], str):\n",
    "                o['timestamp'] = datetime.fromisoformat(o['timestamp'])\n",
    "            elif isinstance(o['timestamp'], datetime):\n",
    "                continue\n",
    "            else:\n",
    "                print(o['timestamp'], type(o['timestamp']))\n",
    "\n",
    "        data[m] = sorted(data[m], key= lambda x: x['timestamp'])\n",
    "\n",
    "        start_time = data[m][0]['timestamp']\n",
    "        end_time = data[m][-1]['timestamp']\n",
    "\n",
    "        time_info[m] = {\"start\":start_time, \"end\":end_time, 'all':[x['timestamp'] for x in data[m]]}\n",
    "    \n",
    "    overall_start = min([tinfo['start'] for tinfo in time_info.values()])\n",
    "    overall_end = max([tinfo['end'] for tinfo in time_info.values()])\n",
    "\n",
    "    \n",
    "    segments = {}\n",
    "    for m in matched:\n",
    "        segments[m] = segment_indexes_by_two_hours(overall_start, overall_end, time_info[m]['all'])\n",
    "\n",
    "    \n",
    "    annotated = []\n",
    "    params_inverse = {v:k for k,v in params.items()}\n",
    "    param_order = [params_inverse[f\"X{i}\"] for i in range(len(params_inverse))]\n",
    "    #print(param_order, \"func\")\n",
    "\n",
    "    segment_list = [segments[x] for x in param_order]\n",
    "    \n",
    "    \n",
    "    for ran in zip(*segment_list):\n",
    "        #print(\"lol\", ran)\n",
    "        ind_all = [x[-1] for x in ran]\n",
    "\n",
    "        data_pack = {}\n",
    "        for i, (ind, p) in enumerate(zip(ind_all, param_order)):\n",
    "            if len(ind)<1:\n",
    "                data_pack[f'X{i}'] = None\n",
    "                continue\n",
    "                \n",
    "            data_vals = [data[p][x]['value'] for x in ind]\n",
    "\n",
    "            if len(data_vals)>0:\n",
    "                data_pack[f'X{i}'] = data_agg(data_vals)\n",
    "            else:\n",
    "                data_pack[f'X{i}'] = None\n",
    "        \n",
    "        if not any(x is None for x in data_pack):\n",
    "            func_value = func(**data_pack)\n",
    "        else:\n",
    "            func_value = False\n",
    "        \n",
    "        if func_value:\n",
    "            print(\"func_value\", func_value)\n",
    "            \n",
    "        annotated.append(func_value)\n",
    "\n",
    "    return annotated, overall_start, overall_end\n",
    "    \n",
    "\n",
    "    \n",
    "\n",
    "\n",
    "\n",
    "def save_annotated(save_loc, **kwargs):\n",
    "    pass\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "36d7ce05",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "0it [00:00, ?it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['Run Status'] [['Supply Fan Status', 'Occupied Command', 'Preheat Valve %', 'Humidifier Valve %', 'Economizer Mode', 'Schedule', 'Cooling Valve %', 'Zone Relative Humidity %', 'Supply Relative Humidity %', 'Supply Relative Humidity Setpoint %', 'Setpoint Temperature', 'Zone Temperature', 'Return Air Temperature', 'Static Pressure Setpoint', 'Mixed Air Temperature', 'Duct Static Pressure', 'Supply Air Temperature', 'Pre Heating Temp', 'Power (Calc)']]\n",
      "[None] ['Run Status']\n",
      "['Outside Air Temperature'] [['Mixed Air Temperature', 'Supply Air Temperature', 'Return Air Temperature', 'Zone Temperature', 'Setpoint Temperature', 'Zone Relative Humidity %', 'Supply Relative Humidity %', 'Pre Heating Temp', 'Cooling Valve %', 'Supply Relative Humidity Setpoint %', 'Preheat Valve %', 'Supply Fan Status', 'Humidifier Valve %', 'Duct Static Pressure', 'Static Pressure Setpoint', 'Power (Calc)', 'Schedule', 'Economizer Mode', 'Occupied Command']]\n",
      "['Mixed Air Temperature'] ['Outside Air Temperature']\n",
      "OAT > 35 °F Outside Air Temparature(OAT) > 35 °F {'Outside Air Temperature': 'X0'}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "ename": "ValueError",
     "evalue": "fuck",
     "output_type": "error",
     "traceback": [
      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
      "\u001b[31mValueError\u001b[39m                                Traceback (most recent call last)",
      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[44]\u001b[39m\u001b[32m, line 31\u001b[39m\n\u001b[32m     28\u001b[39m \u001b[38;5;28mprint\u001b[39m(matches, entities)\n\u001b[32m     30\u001b[39m \u001b[38;5;66;03m#try:\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m31\u001b[39m func, gen_fun_str, func_str, params = \u001b[43mcommand_to_python\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcond\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mentities\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m     33\u001b[39m rec[\u001b[33m'\u001b[39m\u001b[33mfunc_info\u001b[39m\u001b[33m'\u001b[39m] = {\n\u001b[32m     34\u001b[39m     \u001b[33m'\u001b[39m\u001b[33mfunc\u001b[39m\u001b[33m'\u001b[39m:func,\n\u001b[32m     35\u001b[39m     \u001b[33m'\u001b[39m\u001b[33mgen_fun_str\u001b[39m\u001b[33m'\u001b[39m: gen_fun_str, \n\u001b[32m   (...)\u001b[39m\u001b[32m     38\u001b[39m     \u001b[33m'\u001b[39m\u001b[33merror\u001b[39m\u001b[33m'\u001b[39m: \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[32m     39\u001b[39m }\n\u001b[32m     41\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m matches == [\u001b[38;5;28;01mNone\u001b[39;00m]:\n",
      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[41]\u001b[39m\u001b[32m, line 150\u001b[39m, in \u001b[36mcommand_to_python\u001b[39m\u001b[34m(cond, params)\u001b[39m\n\u001b[32m    147\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m params == [\u001b[33m'\u001b[39m\u001b[33mRun Status\u001b[39m\u001b[33m'\u001b[39m]:\n\u001b[32m    148\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m unit_func, \u001b[33m\"\u001b[39m\u001b[33mfuncstr\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33mfuncstr\u001b[39m\u001b[33m\"\u001b[39m, {\u001b[33m'\u001b[39m\u001b[33mRun Status\u001b[39m\u001b[33m'\u001b[39m: \u001b[33m'\u001b[39m\u001b[33mX0\u001b[39m\u001b[33m'\u001b[39m}\n\u001b[32m--> \u001b[39m\u001b[32m150\u001b[39m func_str, param_map = \u001b[43mnatural_language_to_function_deterministic\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcond\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mparams\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m    151\u001b[39m func, funcstr = function_to_python_code(func_str, param_map)\n\u001b[32m    153\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m func, funcstr, func_str, param_map\n",
      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[41]\u001b[39m\u001b[32m, line 131\u001b[39m, in \u001b[36mnatural_language_to_function_deterministic\u001b[39m\u001b[34m(cond, params)\u001b[39m\n\u001b[32m    129\u001b[39m     \u001b[38;5;28;01mif\u001b[39;00m v \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m clean:\n\u001b[32m    130\u001b[39m         \u001b[38;5;28mprint\u001b[39m(cond, clean, _map)\n\u001b[32m--> \u001b[39m\u001b[32m131\u001b[39m         \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[33m\"\u001b[39m\u001b[33mfuck\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m    132\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m clean, _map\n",
      "\u001b[31mValueError\u001b[39m: fuck"
     ]
    }
   ],
   "source": [
    "n = 0\n",
    "for i,rec in tqdm(enumerate(capable_rule_form_and_asset)):\n",
    "    r = rec['rule']\n",
    "    ia = rec['asset_name']\n",
    "    r['id'] = str(i)\n",
    "\n",
    "\n",
    "\n",
    "    asset_sel = asset_desc[ia]\n",
    "    sen_is = [x for x in available_obs.values() if (x['asset_name'] == ia.split(\"_\")[1]) and (x['site_name'] == ia.split(\"_\")[0])]\n",
    "    \n",
    "    for cond in r[\"conditions\"]:\n",
    "        cond_info = extracted_sensors[f\"{r['rule']}_{cond}\"]\n",
    "        entities = set()\n",
    "        \n",
    "        for ent in cond_info[\"extracted\"]:\n",
    "            entities = entities.union(set(ent['entities']))\n",
    "\n",
    "        #if 'Setpoint' in entities:\n",
    "        #    entities = entities - {'Setpoint'}\n",
    "\n",
    "        entities = list(entities)\n",
    "\n",
    "        if not entities:\n",
    "            ValueError(\"Issue check @if not entities\")\n",
    "\n",
    "        matches = search_sensors_in_available(sen_is, entities)\n",
    "        print(matches, entities)\n",
    "\n",
    "        try:\n",
    "            func, gen_fun_str, func_str, params = command_to_python(cond, entities)\n",
    "        except MismatchNL2FUNC as ex:\n",
    "            \n",
    "\n",
    "        rec['func_info'] = {\n",
    "            'func':func,\n",
    "            'gen_fun_str': gen_fun_str, \n",
    "            'func_str': func_str, \n",
    "            'params':params, \n",
    "            'error': False\n",
    "        }\n",
    "\n",
    "        if matches == [None]:\n",
    "            continue\n",
    "\n",
    "        #data = load_data_observations(matches, rec, INPUT_DATA)\n",
    "        #ann_data = build_alarm_timeline(data, func, params, matches)\n",
    "        save_annotated(\n",
    "            os.path.join(ANNOTATED_PATH, ia, r['id']),\n",
    "            record = rec,\n",
    "            matches = matches,\n",
    "            entities = entities,\n",
    "            \n",
    "        )\n",
    "\n",
    "        \"\"\"\n",
    "        except ValueError as er:\n",
    "            rec['func_info'] = {\n",
    "                'error': True,\n",
    "                'func': None,\n",
    "                'gen_fun_str': None,\n",
    "                'func_str': None,\n",
    "                'params': None\n",
    "            }\"\"\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "795bcdd5",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
