{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 100,
   "id": "aebb48bc",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import fitz\n",
    "import re\n",
    "import json\n",
    "from datetime import datetime\n",
    "from typing import Optional, List, Callable, Any\n",
    "import copy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "id": "16d20473",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parameters: working folder and the path of the extracted JSON file inside it.\n",
    "extract_fol = \"extracted\"\n",
    "# Create the folder up front; exist_ok makes this a no-op when it already exists.\n",
    "os.makedirs(extract_fol, exist_ok=True)\n",
    "\n",
    "save_loc = os.path.join(extract_fol, \"extracted_data.json\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "id": "eacfafb0",
   "metadata": {},
   "outputs": [],
   "source": [
    "def save_json(data:dict, loc:str) -> None:\n",
    "    \"\"\"Serialise ``data`` as JSON into the file at ``loc``.\n",
    "\n",
    "    The file is opened in write mode, so any previous content is replaced.\n",
    "    \"\"\"\n",
    "    with open(loc, mode=\"w\") as out_file:\n",
    "        json.dump(data, out_file)\n",
    "\n",
    "\n",
    "def text_write(text, filename= \"debug.test\", encoding=\"utf-8\"):\n",
    "    \"\"\"Write ``text`` to ``filename``, replacing any existing content.\n",
    "\n",
    "    Args:\n",
    "        text: String to write.\n",
    "        filename: Destination path; defaults to \"debug.test\".\n",
    "        encoding: Text encoding used for the file. Defaults to UTF-8 so that\n",
    "            non-ASCII characters are written the same way on every platform\n",
    "            instead of depending on the locale's default encoding (which can\n",
    "            raise UnicodeEncodeError for characters it cannot represent).\n",
    "    \"\"\"\n",
    "    with open(filename, \"w\", encoding=encoding) as f0:\n",
    "        f0.write(text)\n",
    "\n",
    "def dict_value_string_function(d:dict, func:Callable[[Optional[Any]], Optional[Any]]) -> dict:\n",
    "    \"\"\"Build a new dict by applying ``func`` to every non-dict value of ``d``.\n",
    "\n",
    "    Values that are themselves dicts are processed recursively with the same\n",
    "    ``func``; every other value (lists included) is handed to ``func`` as a\n",
    "    whole. Keys are kept as they are and nothing is assigned into ``d``.\n",
    "    \"\"\"\n",
    "    return {\n",
    "        key: dict_value_string_function(value, func) if isinstance(value, dict) else func(value)\n",
    "        for key, value in d.items()\n",
    "    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "id": "d2257794",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the extracted JSON back in; `dataset` is the value stored under its \"rule_set\" key.\n",
    "with open(save_loc, mode=\"r\") as f0:\n",
    "    data = json.loads(f0.read())\n",
    "\n",
    "dataset = data[\"rule_set\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "id": "35c251ff",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "120\n"
     ]
    }
   ],
   "source": [
    "# Subset: records where BOTH display_text[\"rules\"] and display_text[\"observations\"] are strings.\n",
    "cleaned = copy.deepcopy(data)\n",
    "cleaned[\"rule_set\"] = []\n",
    "debug_parts = []  # pieces of the debug dump, joined once after the loop\n",
    "for record in dataset:\n",
    "    display = record[\"display_text\"]\n",
    "    if not (isinstance(display[\"rules\"], str) and isinstance(display[\"observations\"], str)):\n",
    "        continue\n",
    "\n",
    "    # NOTE(review): looks like a leftover debugging probe -- confirm whether it is still needed.\n",
    "    if record[\"#n\"] == 0:\n",
    "        print(\"YUp\")\n",
    "\n",
    "    # The debug dump uses the raw (un-stripped) text.\n",
    "    debug_parts += [\n",
    "        \"<><><><>\\n\",\n",
    "        record[\"id\"] + \"\\n\\n\",\n",
    "        display[\"rules\"] + \"\\n\\n\",\n",
    "        display[\"observations\"] + \"\\n\",\n",
    "    ]\n",
    "\n",
    "    # Store a whitespace-stripped copy without rebinding the loop variable.\n",
    "    cleaned[\"rule_set\"].append(\n",
    "        dict_value_string_function(record, lambda x: x.strip() if isinstance(x, str) else x)\n",
    "    )\n",
    "\n",
    "f = \"\".join(debug_parts)\n",
    "print(len(cleaned[\"rule_set\"]))\n",
    "text_write(f, \"both.debug.test\")\n",
    "save_json(cleaned, os.path.join(extract_fol, \"observations.rules.json\"))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "id": "1744fc30",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "37\n"
     ]
    }
   ],
   "source": [
    "# Subset: records where NEITHER display_text[\"rules\"] nor display_text[\"observations\"] is a string.\n",
    "cleaned = copy.deepcopy(data)\n",
    "cleaned[\"rule_set\"] = []\n",
    "debug_parts = []  # pieces of the debug dump, joined once after the loop\n",
    "for record in dataset:\n",
    "    display = record[\"display_text\"]\n",
    "    if isinstance(display[\"rules\"], str) or isinstance(display[\"observations\"], str):\n",
    "        continue\n",
    "\n",
    "    # Only the record id goes into the debug dump for this subset.\n",
    "    debug_parts += [\n",
    "        \"<><><><>\\n\",\n",
    "        record[\"id\"] + \"\\n\",\n",
    "    ]\n",
    "\n",
    "    # Store a whitespace-stripped copy without rebinding the loop variable.\n",
    "    cleaned[\"rule_set\"].append(\n",
    "        dict_value_string_function(record, lambda x: x.strip() if isinstance(x, str) else x)\n",
    "    )\n",
    "\n",
    "f = \"\".join(debug_parts)\n",
    "print(len(cleaned[\"rule_set\"]))\n",
    "text_write(f, \"both_not.debug.test\")\n",
    "save_json(cleaned, os.path.join(extract_fol, \"no_observations.no_rules.json\"))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "id": "6c21d078",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2\n"
     ]
    }
   ],
   "source": [
    "# Subset: records where display_text[\"observations\"] is a string but display_text[\"rules\"] is not.\n",
    "cleaned = copy.deepcopy(data)\n",
    "cleaned[\"rule_set\"] = []\n",
    "debug_parts = []  # pieces of the debug dump, joined once after the loop\n",
    "for record in dataset:\n",
    "    display = record[\"display_text\"]\n",
    "    if isinstance(display[\"rules\"], str) or not isinstance(display[\"observations\"], str):\n",
    "        continue\n",
    "\n",
    "    # The debug dump uses the raw (un-stripped) observations text.\n",
    "    debug_parts += [\n",
    "        \"<><><><>\\n\",\n",
    "        record[\"id\"] + \"\\n\\n\",\n",
    "        display[\"observations\"] + \"\\n\",\n",
    "    ]\n",
    "\n",
    "    # Store a whitespace-stripped copy without rebinding the loop variable.\n",
    "    cleaned[\"rule_set\"].append(\n",
    "        dict_value_string_function(record, lambda x: x.strip() if isinstance(x, str) else x)\n",
    "    )\n",
    "\n",
    "f = \"\".join(debug_parts)\n",
    "print(len(cleaned[\"rule_set\"]))\n",
    "text_write(f, \"observations_only.debug.test\")\n",
    "save_json(cleaned, os.path.join(extract_fol, \"only_observations.json\"))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "id": "c2f6a2dd",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2\n"
     ]
    }
   ],
   "source": [
    "# Subset: records where display_text[\"rules\"] is a string but display_text[\"observations\"] is not.\n",
    "cleaned = copy.deepcopy(data)\n",
    "cleaned[\"rule_set\"] = []\n",
    "debug_parts = []  # pieces of the debug dump, joined once after the loop\n",
    "for record in dataset:\n",
    "    display = record[\"display_text\"]\n",
    "    if not isinstance(display[\"rules\"], str) or isinstance(display[\"observations\"], str):\n",
    "        continue\n",
    "\n",
    "    # The debug dump uses the raw (un-stripped) rules text.\n",
    "    debug_parts += [\n",
    "        \"<><><><>\\n\",\n",
    "        record[\"id\"] + \"\\n\\n\",\n",
    "        display[\"rules\"] + \"\\n\",\n",
    "    ]\n",
    "\n",
    "    # Store a whitespace-stripped copy without rebinding the loop variable.\n",
    "    cleaned[\"rule_set\"].append(\n",
    "        dict_value_string_function(record, lambda x: x.strip() if isinstance(x, str) else x)\n",
    "    )\n",
    "\n",
    "f = \"\".join(debug_parts)\n",
    "print(len(cleaned[\"rule_set\"]))\n",
    "text_write(f, \"rules_only.debug.test\")\n",
    "save_json(cleaned, os.path.join(extract_fol, \"only_rules.json\"))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "201d8168",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
