{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 161,
   "id": "a6fa8478",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package stopwords to\n",
      "[nltk_data]     /Users/devinyasithdesilva/nltk_data...\n",
      "[nltk_data]   Package stopwords is already up-to-date!\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 161,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import os\n",
    "import fitz\n",
    "import re\n",
    "import json\n",
    "from datetime import datetime\n",
    "from typing import Optional, List, Callable, Any\n",
    "from abc import abstractmethod, ABC\n",
    "import random\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import copy\n",
    "import nltk\n",
    "from nltk.corpus import stopwords\n",
    "import itertools\n",
    "\n",
    "nltk.download('stopwords')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 162,
   "id": "3011b9be",
   "metadata": {},
   "outputs": [],
   "source": [
    "class OPS:\n",
    "    OR = \"OR\"\n",
    "    AND = \"AND\"\n",
    "\n",
    "class Node:\n",
    "    def __init__(self, ops:str, statement:Optional[str], manual=False) -> None:\n",
    "        self.children = []\n",
    "        self.val = None\n",
    "        self.ops = ops\n",
    "        self.statement = statement\n",
    "\n",
    "        if not manual:\n",
    "            self.build_graph()\n",
    "\n",
    "    def build_graph(self):\n",
    "        split = self.statement.split(self.ops)\n",
    "        if len(split)>=2:\n",
    "            self.children = [Node(self.ops,split[0]), Node(self.ops, self.ops.join(split[1:]))]\n",
    "            self.val = self.ops\n",
    "        else:\n",
    "            self.val = self.statement\n",
    "\n",
    "    def __copy__(self):\n",
    "        _temp = Node(\n",
    "            ops = self.ops,\n",
    "            statement=self.statement,\n",
    "            manual=True\n",
    "        )\n",
    "\n",
    "        return _temp\n",
    "\n",
    "\n",
    "\n",
    "    @staticmethod\n",
    "    def generate_graph(statement:str):\n",
    "        root = None\n",
    "\n",
    "        for oi, o in enumerate([OPS.OR]):\n",
    "            if oi == 0:\n",
    "                root = Node(o,statement)\n",
    "\n",
    "                return root\n",
    "\n",
    "def save_json(data:dict, loc:str) -> None:\n",
    "    with open(loc, \"w\") as f0:\n",
    "        json.dump(data, f0)\n",
    "\n",
    "def load_json(loc:str) -> dict:\n",
    "    with open(loc,\"r\") as f0:\n",
    "        return json.load(f0)\n",
    "\n",
    "def load_pickle(loc:str) -> Any:\n",
    "    with open(loc, \"rb\") as f0:\n",
    "        return pickle.load(f0)\n",
    "    \n",
    "\n",
    "def _print_tree(root:Node, level=0, prefix=\"Root: \") -> str:\n",
    "    if root is not None:\n",
    "        _s = \"    \"*level + prefix + root.val+\"\\n\"\n",
    "        for x in root.children:\n",
    "            _s +=_print_tree(x, level+1, \"L---- \")\n",
    "\n",
    "    return _s \n",
    "\n",
    "def print_tree(root):\n",
    "    concat_string = _print_tree(root)\n",
    "    print(concat_string)\n",
    "\n",
    "def get_tree_repr(root:Node):\n",
    "    _temp = copy.deepcopy(root)\n",
    "    _temp.val = \"\"\n",
    "    concat_string = _print_tree(_temp, prefix=\"\")\n",
    "    return concat_string\n",
    "\n",
    "\n",
    "def traverse_tree(root:Node) ->Node:\n",
    "    _temp = copy.deepcopy(root)\n",
    "    recursive_traverse_tree(_temp)\n",
    "    concat_tree(None, _temp)\n",
    "    return _temp\n",
    "\n",
    "def concat_tree(parent:Optional[Node],child:Node):\n",
    "    if child.children:\n",
    "        for c in child.children:\n",
    "            concat_tree(child, c)\n",
    "\n",
    "        if len(child.children) == 1 and parent:\n",
    "            parent.children.remove(child)\n",
    "            parent.children.append(child.children[0])\n",
    "\n",
    "def recursive_traverse_tree(root:Node) -> None:\n",
    "    if root.children:\n",
    "        for c in root.children:\n",
    "            recursive_traverse_tree(c)\n",
    "\n",
    "        if root.val == OPS.OR:\n",
    "            sel_ind = 0\n",
    "            root.children = [root.children[sel_ind]]\n",
    "        else:\n",
    "            root.children = root.children\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 163,
   "id": "ac729326",
   "metadata": {},
   "outputs": [],
   "source": [
    "extract_fol = \"extracted\"\n",
    "equipment_info_loc = os.path.join(extract_fol, \"equipment_info\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 164,
   "id": "bdb128d4",
   "metadata": {},
   "outputs": [],
   "source": [
    "rule_information = load_pickle(\n",
    "    os.path.join(extract_fol, \"TreeStruct.pkl\")\n",
    ")\n",
    "\n",
    "asset_descriptions = load_json(\n",
    "    os.path.join(equipment_info_loc, \"gre_desc.json\")\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 165,
   "id": "030c8075",
   "metadata": {},
   "outputs": [],
   "source": [
    "stop_words = stopwords.words('english')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 166,
   "id": "f4374ebf",
   "metadata": {},
   "outputs": [],
   "source": [
    "domain_okay_words = [\n",
    "    \"Supply Temperature - Setpoint Temperature\",\n",
    "    \"Chiller cannot keep up with the load\",\n",
    "    \"Unit shutdown for some reason\",\n",
    "    \"Too much equipment in the local area\",\n",
    "    \"Chilled water valve not actuating well\",\n",
    "    \"Too much equipment in the local area\",\n",
    "    \"Unit shutdown for some reason\",\n",
    "    \"Side-by-side units that are not balanced\",\n",
    "    \"Whoever it was that changed this rule on March 3rd at 1pm Eastern, be glad the logs didn't record your name. Or maybe not, based off what I'm about tosay. This rule running was the reason the performance had tanked the past week and a half. I don't know if it was re-enabled, ruleOn was modified, or if it was a recompute on ALL dates. Whatever it was, it helped tank the performance. This is not the first time something like this has happened. So if this EVER happens again, here's what's happening. EVERYONE will loose all ability to make any changes to SkySpark. I will place SkySpark under complete CIO control and processes. Any new user additions will go through Gareth. Any updates that need to be made will be submitted, approved, and prioritized through the github portal. There will be no exceptions. Paul\",\n",
    "    \"Review of Boiler operation\",\n",
    "    \"Bypassing excessive amounts of fluid\",\n",
    "    \n",
    "]\n",
    "\n",
    "domain_okay_words = [x.replace(\"\\n\",\"\").replace(\"L----\", \"\").replace(\"(\", \"\").replace(\")\", \"\").replace(\",\", \"\").lower() for x in domain_okay_words]\n",
    "domain_okay_words = [x.split(\" \") for x in domain_okay_words]\n",
    "domain_okay_words = [list(filter(lambda a: a != \"\" and a not in stop_words, x)) for x in domain_okay_words]\n",
    "domain_okay_words = list(itertools.chain.from_iterable(domain_okay_words))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "03848391",
   "metadata": {},
   "source": [
    "# Observation Analysis"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "03c2fcfe",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 167,
   "id": "afb80668",
   "metadata": {},
   "outputs": [],
   "source": [
    "obs_word_dict = {}\n",
    "for d in rule_information[\"rule_set\"]:\n",
    "    s = \" \".join((d[\"display_text\"][\"observations\"]))\n",
    "    s = s.replace(\"\\n\",\"\").replace(\"L----\", \"\").replace(\"(\", \"\").replace(\")\", \"\").replace(\",\", \"\").lower()\n",
    "    s = s.split(\" \")\n",
    "    s = list(filter(lambda a: a != \"\" and a not in stop_words and a not in list(asset_descriptions.keys()) and a not in domain_okay_words, s))\n",
    "\n",
    "    for w in s:\n",
    "        try:\n",
    "            obs_word_dict[w] += 1\n",
    "        except KeyError as ke:\n",
    "            obs_word_dict[w] = 1\n",
    "\n",
    "obs_word = pd.DataFrame.from_dict(obs_word_dict, 'index', columns=['count']).reset_index()\n",
    "obs_word.columns = ['word', 'count']\n",
    "obs_word = obs_word.sort_values([\"count\"], ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 168,
   "id": "519992ac",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>word</th>\n",
       "      <th>count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>check</td>\n",
       "      <td>41</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>bms</td>\n",
       "      <td>35</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34</th>\n",
       "      <td>issues</td>\n",
       "      <td>29</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>sensors</td>\n",
       "      <td>27</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>73</th>\n",
       "      <td>broken</td>\n",
       "      <td>22</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>159</th>\n",
       "      <td>savings.</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>158</th>\n",
       "      <td>additional</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>154</th>\n",
       "      <td>angle</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>153</th>\n",
       "      <td>vanes</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>267</th>\n",
       "      <td>ups</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>268 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "           word  count\n",
       "12        check     41\n",
       "28          bms     35\n",
       "34       issues     29\n",
       "3       sensors     27\n",
       "73       broken     22\n",
       "..          ...    ...\n",
       "159    savings.      1\n",
       "158  additional      1\n",
       "154       angle      1\n",
       "153       vanes      1\n",
       "267         ups      1\n",
       "\n",
       "[268 rows x 2 columns]"
      ]
     },
     "execution_count": 168,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "obs_word"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 169,
   "id": "a51c4089",
   "metadata": {},
   "outputs": [],
   "source": [
    "obs_word.to_csv(\"metadata/obser_unique_word.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 170,
   "id": "e98464b8",
   "metadata": {},
   "outputs": [],
   "source": [
    "acronym_word_dict = {}\n",
    "for d in rule_information[\"rule_set\"]:\n",
    "    s = \" \".join((d[\"display_text\"][\"observations\"]))\n",
    "    s = s.replace(\"\\n\",\"\").replace(\"L----\", \"\").replace(\"(\", \"\").replace(\")\", \"\").replace(\",\", \"\")\n",
    "    s = s.split(\" \")\n",
    "    s = list(filter(lambda a: a != \"\" and a.isupper() and a not in list(asset_descriptions.keys()), s))\n",
    "\n",
    "    for w in s:\n",
    "        try:\n",
    "            acronym_word_dict[w] += 1\n",
    "        except KeyError as ke:\n",
    "            acronym_word_dict[w] = 1\n",
    "\n",
    "acronym_word = pd.DataFrame.from_dict(acronym_word_dict, 'index', columns=['count']).reset_index()\n",
    "acronym_word.columns = ['word', 'count']\n",
    "acronym_word = acronym_word.sort_values([\"count\"], ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 171,
   "id": "6d3c034d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>word</th>\n",
       "      <th>count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>BMS</td>\n",
       "      <td>35</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>VFD</td>\n",
       "      <td>16</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>VAV</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>OR</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>F</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>C</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>A</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>SAT</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>MAT</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>ON</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>CT</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>BMS.</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    word  count\n",
       "1    BMS     35\n",
       "5    VFD     16\n",
       "6    VAV     12\n",
       "2     OR      5\n",
       "3      F      5\n",
       "4      C      5\n",
       "0      A      1\n",
       "7    SAT      1\n",
       "8    MAT      1\n",
       "9     ON      1\n",
       "10    CT      1\n",
       "11  BMS.      1"
      ]
     },
     "execution_count": 171,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "acronym_word"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 172,
   "id": "e6260752",
   "metadata": {},
   "outputs": [],
   "source": [
    "acronym_word.to_csv(\"metadata/acronym_word.csv\", index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "02ae8f4e",
   "metadata": {},
   "source": [
    "# Rule Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e367781d",
   "metadata": {},
   "outputs": [],
   "source": [
    "rule_word_dict = {}\n",
    "for d in rule_information[\"rule_set\"]:\n",
    "    s = _print_tree(d[\"display_text\"][\"rules\"])\n",
    "    s = s.replace(\"\\n\",\"\").replace(\"L----\", \"\").replace(\"Root: root\", \"\").replace(\"(\", \"\").replace(\")\", \"\").replace(\",\", \"\").lower()\n",
    "    s = s.split(\" \")\n",
    "     \n",
    "\n",
    "    for w in s:\n",
    "        try:\n",
    "            rule_word_dict[w] += 1\n",
    "        except KeyError as ke:\n",
    "            rule_word_dict[w] = 1\n",
    "\n",
    "rule_word = pd.DataFrame.from_dict(rule_word_dict, 'index', columns=['count']).reset_index()\n",
    "rule_word.columns = ['word', 'count']\n",
    "rule_word = rule_word.sort_values([\"count\"], ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 174,
   "id": "0d97413e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>word</th>\n",
       "      <th>count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>&gt;</td>\n",
       "      <td>144</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>for</td>\n",
       "      <td>126</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>hours</td>\n",
       "      <td>124</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>37</th>\n",
       "      <td>°f</td>\n",
       "      <td>122</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>met</td>\n",
       "      <td>122</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>166</th>\n",
       "      <td>8pm</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>167</th>\n",
       "      <td>before</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>168</th>\n",
       "      <td>5am</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>169</th>\n",
       "      <td>0.01</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>283</th>\n",
       "      <td>4mw</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>284 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       word  count\n",
       "3         >    144\n",
       "11      for    126\n",
       "13    hours    124\n",
       "37       °f    122\n",
       "10      met    122\n",
       "..      ...    ...\n",
       "166     8pm      1\n",
       "167  before      1\n",
       "168     5am      1\n",
       "169    0.01      1\n",
       "283     4mw      1\n",
       "\n",
       "[284 rows x 2 columns]"
      ]
     },
     "execution_count": 174,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rule_word"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 175,
   "id": "863f4556",
   "metadata": {},
   "outputs": [],
   "source": [
    "rule_word.to_csv(\"metadata/rule_unique_word.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1c90e269",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
