{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "de7bb172",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_2402655/810001644.py:16: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from tqdm.autonotebook import tqdm\n",
      "[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
      "[nltk_data]   Package stopwords is already up-to-date!\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import os, sys\n",
    "import fitz\n",
    "import re\n",
    "import json\n",
    "from json import JSONDecodeError\n",
    "from datetime import datetime\n",
    "from typing import Optional, List, Callable, Any, Tuple, Dict, Union\n",
    "from abc import abstractmethod, ABC\n",
    "import random\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import copy\n",
    "import nltk\n",
    "from nltk.corpus import stopwords\n",
    "import pickle\n",
    "from tqdm.autonotebook import tqdm\n",
    "import itertools\n",
    "from dotenv import load_dotenv\n",
    "\n",
    "sys.path.append(\"../\")\n",
    "\n",
    "load_dotenv(dotenv_path=\"../.env\")\n",
    "nltk.download('stopwords')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "bd9d72bf",
   "metadata": {},
   "outputs": [],
   "source": [
    "from dataset_utils.reader import ADIQDataset\n",
    "from dataset_utils.outputs import to_basic_prompt\n",
    "import  models_utils.llm.azure_llm as azure_llm\n",
    "from dataset_utils.question import Question\n",
    "\n",
    "ds = ADIQDataset(\"datasets/simpleV3\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "f8aa928b",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "\n",
    "def question_to_taxonomy_extraction(\n",
    "        q:Question, \n",
    "        asset_descriptions:Dict[str,str]\n",
    "        ) -> str:\n",
    "    \n",
    "    _template = \"\"\"\n",
    "\n",
    "Please Read the question below with the context provided. I'm currently making a taxonomy therefore \n",
    "make a list of words and their explanations that require further explanation by analysing all the information given. The output must be in a strict \n",
    "json fomat, if you don't know the explanation add None \n",
    "\n",
    "## Asset Description:\n",
    "{asset_name}: {asset_desc}\n",
    "\n",
    "## Conditions:\n",
    "{conditions}\n",
    "\n",
    "## How long the conditions were met:\n",
    "{temporal_condition}\n",
    "\n",
    "Given the above detected conditions, what should the operator look for?\n",
    "{options}\n",
    "\n",
    "Your output must strictly follow this format:\\n{example}\n",
    "\"\"\"\n",
    "    if len(q.temporal_condition) == 0:\n",
    "        print(q.rule_name)\n",
    "        \n",
    "    text = _template.format(\n",
    "        asset_name = q.asset_type,\n",
    "        asset_desc = asset_descriptions.get(q.asset_type, \"NONE\"),\n",
    "        conditions = \"\\n\".join(list(map(lambda x:\"- \"+x, q.condition_description))),\n",
    "        temporal_condition = q.temporal_condition[0] if len(q.temporal_condition)>0 else \"None\",\n",
    "        options = \"\\n\".join([\"{}). {}\".format(op_id, op) for op_id, op in zip(q.option_ids,q.options)]),\n",
    "        example = {\n",
    "            'VAV' : 'Variable Air Volume',\n",
    "            'cfm' : 'Cubic Feet per Minuete',\n",
    "            'ABS' : 'function to give the absolute value'\n",
    "            }\n",
    "    )\n",
    "\n",
    "    return text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0a6db471",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "68863f8f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "Please Read the question below with the context provided. I'm currently making a taxonomy therefore \n",
      "make a list of words and their explanations that require further explanation by analysing all the information given. The output must be in a strict \n",
      "json fomat, if you don't know the explanation add None \n",
      "\n",
      "## Asset Description:\n",
      "Pump: A device used to move fluid or gas from one place to another. It can include various types of pumps, such as centrifugal pumps, positive displacement pumps, and diaphragm pumps.\n",
      "\n",
      "## Conditions:\n",
      "- Pump Occupied = 0\n",
      "- OAT > 55 °F\n",
      "- Pump Current In Amps >= 1]\n",
      "\n",
      "## How long the conditions were met:\n",
      "Met for 2 Hours\n",
      "\n",
      "Given the above detected conditions, what should the operator look for?\n",
      "A). Load is too low or fluctuates\n",
      "B). Lighting relay schedule over written\n",
      "C). Pump is overridden\n",
      "D). Bad pressure sensor\n",
      "\n",
      "Your output must strictly follow this format:\n",
      "{'VAV': 'Variable Air Volume', 'cfm': 'Cubic Feet per Minuete', 'ABS': 'function to give the absolute value'}\n",
      "\n"
     ]
    }
   ],
   "source": [
    "example1 = question_to_taxonomy_extraction(ds.questions[0], ds.asset_descriptions)\n",
    "print(example1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "acf0bf1d",
   "metadata": {},
   "outputs": [],
   "source": [
    "from models_utils.llm.rits import APICall \n",
    "from concurrent.futures import ThreadPoolExecutor, as_completed\n",
    "\n",
    "client = azure_llm.APICall(\n",
    "    model_name=\"gpt-3.5\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4e51907d",
   "metadata": {},
   "outputs": [],
   "source": [
    "import models_utils.llm.watsonx as watsonx\n",
    "\n",
    "def _get_taxonomy_question(q:Question) -> dict[str,Any]:\n",
    "    q_prompt = question_to_taxonomy_extraction(q, ds.asset_descriptions)\n",
    "    response = client.complete_response(q_prompt)\n",
    "\n",
    "    if not response:\n",
    "        return {'error':True, 'err_message':\"Null response\", 'response':'None'}\n",
    "    try:\n",
    "        response = json.loads(response)\n",
    "        response = {'response':response, 'error':False, 'err_message':\"\"}\n",
    "    except JSONDecodeError as e:\n",
    "        response = {'response':response, 'error':True, 'err_message':\"JSONDecodeError\"}\n",
    "    return response\n",
    "\n",
    "def get_taxonomy_words(questions:List[Question]):\n",
    "    words = {}\n",
    "    errors = []\n",
    "\n",
    "    with ThreadPoolExecutor(max_workers=8) as executor:\n",
    "        futures = [executor.submit(_get_taxonomy_question, q) for q in questions]\n",
    "        \n",
    "        for future in tqdm(as_completed(futures), total=len(futures), desc=\"Completed:\"):\n",
    "            w = future.result()\n",
    "\n",
    "            if not w['error']:\n",
    "                words = {**words, **w['response']}\n",
    "            else:\n",
    "                errors.append(w)\n",
    "\n",
    "\n",
    "    return words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "9d054c6f",
   "metadata": {},
   "outputs": [],
   "source": [
    "### Analysis of the data got"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "74b10243",
   "metadata": {},
   "outputs": [],
   "source": [
    "from utils import file_handle\n",
    "import json\n",
    "\n",
    "data = file_handle.load_json(\"extracted/taxonomy/word_data2.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "d096c3b9",
   "metadata": {},
   "outputs": [],
   "source": [
    "import re \n",
    "\n",
    "def filter_json(s:str) ->str:\n",
    "    s = re.sub(r'(?s)^.*?```json\\s*', '', s)\n",
    "    s = re.sub(r'```.*$', '', s.strip(), flags=re.DOTALL)\n",
    "    return re.sub(r'\\bNone\\b','null', s)\n",
    "\n",
    "words = {}\n",
    "er = []\n",
    "for x in data['errors']:\n",
    "    try:\n",
    "        error = False\n",
    "        s = filter_json(x['response'])\n",
    "        s = json.loads(s)\n",
    "    except JSONDecodeError as err:\n",
    "        s = x['response']\n",
    "        error = True\n",
    "        \n",
    "    if not error:\n",
    "        words.update(s)\n",
    "\n",
    "    else:\n",
    "        er.append({\n",
    "            \"response\":s,\n",
    "            \"error\":error\n",
    "        })"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "92705981",
   "metadata": {},
   "outputs": [],
   "source": [
    "file_handle.save_json(\n",
    "    words,\n",
    "    \"extracted/taxonomy/concepts.json\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "137948e3",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
