{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "initial_id",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-05-22T07:41:03.562265Z",
     "start_time": "2024-05-22T07:41:03.559003Z"
    }
   },
   "outputs": [],
   "source": [
    "# Standard Imports\n",
    "import copy\n",
    "\n",
    "# Third Party Imports\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.model_selection import RandomizedSearchCV, train_test_split\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "644fbcb1-a948-4036-8fec-6f18369ec046",
   "metadata": {},
   "source": [
    "#### Data Preparation"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bc9ca94d-fd7a-46c5-932d-01ca644994c9",
   "metadata": {},
   "source": [
    "**Reading the Blood Transfusion Dataset**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "77cb6a91a07ed68d",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-05-22T07:41:03.731947Z",
     "start_time": "2024-05-22T07:41:03.723709Z"
    }
   },
   "outputs": [],
   "source": [
    "# Reading the Blood Dataset\n",
    "blood = pd.read_csv(\"./../../../datasets/blood/blood.data\")\n",
    "blood.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "317ecda9-87df-40b3-918d-19b7ccd05777",
   "metadata": {},
   "source": [
    "**Print Info and Missing Values**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1b167864-ced8-46fd-89d8-74266115ab90",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(blood.info())\n",
    "print(\"\\n\\n\")\n",
    "print(\"Missing values: \", blood.isnull().values.any())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d921143a-172f-4f29-80c6-5a877643d067",
   "metadata": {},
   "source": [
    "**Variance Check**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "948672e3-370b-466e-97ac-be927c47d05f",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "blood.var().round(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a9aaeed7-a7e6-4fac-83fb-213f6b0c3774",
   "metadata": {},
   "outputs": [],
   "source": [
    "blood.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "06776d06-aac3-4cd8-b1e8-b8498381d713",
   "metadata": {},
   "source": [
    "**Checking the distribution of target values**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "339d9bbe-8754-4572-87e1-3365bed7af5e",
   "metadata": {},
   "outputs": [],
   "source": [
    "blood[\"Blood Donated\"].value_counts(normalize=True).round(3)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0ce86e3c-d237-459e-805b-83a02b971354",
   "metadata": {},
   "source": [
    "**Train-Test Split**"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "852899ac-5378-4914-a9e7-c206ded4d0de",
   "metadata": {},
   "source": [
    "Train - Test Split &nbsp;&nbsp;&nbsp;&nbsp; 60%-40%\n",
    "\n",
    "The Test-Set is Further split into Verb And Gen split\n",
    "\n",
    "Verb - Gen Split   &nbsp;&nbsp;&nbsp;&nbsp; 50%-50%"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bf7ff54f-f3f4-40e6-8bc4-424d99ae8af8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train Test Split Stratified on Target Feature\n",
    "X_train, X_test, y_train, y_test = train_test_split(blood.drop(columns=\"Blood Donated\"), blood[\"Blood Donated\"], test_size=0.4, random_state=400, stratify=blood[\"Blood Donated\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5cc4deea-e333-4ce0-890d-ad1078aac260",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train Test Split on the Test set to get Verb and Gen Sets. Stratified on the Target Feature\n",
    "X_test_verb, X_test_gen, y_test_verb, y_test_gen = train_test_split(X_test, y_test, test_size=0.5, random_state=400, stratify=y_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bda17883-3302-4fc9-8b3b-46c6c669a070",
   "metadata": {},
   "source": [
    "**Checking the distribution target values after the split**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f79613d4",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(y_train.value_counts(normalize=True).round(3))\n",
    "print(y_test_verb.value_counts(normalize=True).round(3))\n",
    "print(y_test_gen.value_counts(normalize=True).round(3))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ccf6c676-335d-4133-afa2-5161bd932f2a",
   "metadata": {},
   "source": [
    "**Variance Check**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9bf865bb-067e-40f8-bfb8-03afba7c71a1",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train.var().round(3)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0a7a2750-30fc-45f5-97d9-9bd0863cab60",
   "metadata": {},
   "source": [
    "**Standardization** - Skipping this because the resulting dataset has very high precision float values (that may not be good for the LLM)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ebbccb2f-d31e-451c-83a4-ba3b35025796",
   "metadata": {},
   "outputs": [],
   "source": [
    "scaler = StandardScaler()\n",
    "X_train_scaled = scaler.fit_transform(X_train)\n",
    "X_test_gen_scaled = scaler.transform(X_test_gen)\n",
    "X_test_verb_scaled = scaler.transform(X_test_verb)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "99cb1c0e-8239-4489-a757-51307bfe7e8a",
   "metadata": {},
   "source": [
    "**Converting them back to dataframes** - No need for this now as there's no standardization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "563a7aff-a958-42bf-a740-bfe04b397aee",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)\n",
    "X_test_gen_scaled_df = pd.DataFrame(X_test_gen_scaled, columns=X_test.columns)\n",
    "X_test_verb_scaled_df = pd.DataFrame(X_test_verb_scaled, columns=X_test.columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d53928be",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train_scaled_df.var().round(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1bcf1076-588b-479a-8a62-5cfb762b66fd",
   "metadata": {},
   "source": [
    "Let's expand the dataset. This will preserve the relationship between the data points but since the data is scaled, it'll be easier for the LLM to interpret"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "13bb89d4-f485-43db-8413-385e267ce88b",
   "metadata": {},
   "outputs": [],
   "source": [
    "expansion_factor = 100\n",
    "X_train_scaled_df = X_train_scaled_df * expansion_factor\n",
    "X_test_verb_scaled_df = X_test_verb_scaled_df * expansion_factor\n",
    "X_test_gen_scaled_df = X_test_gen_scaled_df * expansion_factor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "64dd6df7-f76a-4849-b168-49653cf4184f",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train_scaled_df.var().round(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "97fe3ae9-5bdc-488c-9339-af497d6999ee",
   "metadata": {},
   "source": [
    "From here Onwards `X_test_verb_scaled_df` is going to be `X_test_scaled_df`. `X_test_gen_scaled_df` is not going to be used at all in the code - Only at the end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "61841898",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_test_scaled_df = X_test_verb_scaled_df\n",
    "y_test = y_test_verb"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a87a0a77-0623-4f63-b3ca-ea30e19fde46",
   "metadata": {},
   "source": [
    "#### Model Variations"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a6bcbf72-6031-44ae-84b6-5fb629c78e1f",
   "metadata": {},
   "source": [
    "**Helper Functions**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "61462da2-b794-495a-b4b7-6ead0a222549",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Function to plot the decision tree\n",
    "def plot_decision_tree(decision_tree, feature_names, class_names=None):\n",
    "    plt.figure(figsize=(30, 10))\n",
    "    plot_tree(decision_tree, feature_names=feature_names, class_names=class_names, filled=True, rounded=True, fontsize=12)\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "98c6a1a3",
   "metadata": {},
   "source": [
    "**Base Model**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7bfb42b7-65e2-4992-b68f-ccf0bf452c28",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define the parameter distributions\n",
    "param_distributions = {\n",
    "    'max_depth': list(range(1, 10)),\n",
    "    'min_samples_split': range(2, 21),\n",
    "    'min_samples_leaf': range(1, 21),\n",
    "    'criterion': ['gini', 'entropy'],\n",
    "    'splitter': ['best', 'random'],\n",
    "    'min_weight_fraction_leaf': np.linspace(0, 0.5, 100),\n",
    "    'max_leaf_nodes': list(range(1, 20))\n",
    "}\n",
    "\n",
    "# Initialize the model\n",
    "dt = DecisionTreeClassifier()\n",
    "\n",
    "# Initialize RandomizedSearchCV\n",
    "random_search = RandomizedSearchCV(\n",
    "    estimator=dt,\n",
    "    param_distributions=param_distributions,\n",
    "    n_iter=500,\n",
    "    cv=5,\n",
    "    n_jobs=-1,\n",
    "    scoring='accuracy',\n",
    "    random_state=800 \n",
    ")\n",
    "\n",
    "# Fit the model\n",
    "random_search.fit(X_train_scaled_df, y_train)\n",
    "\n",
    "# Get the best parameters\n",
    "best_params = random_search.best_params_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9c1d7336",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train the base model\n",
    "def train_base_model(X_train, y_train):\n",
    "    best_dt = DecisionTreeClassifier(**best_params)\n",
    "    best_dt.fit(X_train, y_train)\n",
    "    return best_dt\n",
    "\n",
    "base_model = train_base_model(X_train_scaled_df, y_train)\n",
    "base_pred = base_model.predict(X_test_scaled_df)\n",
    "accuracy_score(y_test, base_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "53505624-8563-43ec-90e6-24ca812ac563",
   "metadata": {},
   "outputs": [],
   "source": [
    "best_params"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "087401a5-0018-4860-a224-9966a4e1b198",
   "metadata": {},
   "source": [
    "**Functions to generate random variations of decision tree models**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9296776c",
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_random_hyperparameters():\n",
    "    hyperparameters = {\n",
    "        'max_depth': np.random.randint(1, 10),\n",
    "        'min_samples_split': np.random.randint(2, 21),\n",
    "        'min_samples_leaf': np.random.randint(1, 21),\n",
    "        'criterion': np.random.choice(['gini', 'entropy']),\n",
    "        'splitter': np.random.choice(['best', 'random']),\n",
    "        'min_weight_fraction_leaf': np.random.uniform(0.0, 0.5),\n",
    "        'max_leaf_nodes': np.random.choice(range(2, 21))\n",
    "    }\n",
    "    return hyperparameters\n",
    "\n",
    "def add_noise_to_thresholds(tree, modification_factor=0.2):\n",
    "    # Copy the tree and generate random noise\n",
    "    modified_tree = copy.deepcopy(tree)\n",
    "    thresholds = modified_tree.tree_.threshold\n",
    "    noise = np.random.normal(0, modification_factor, size=thresholds.shape)\n",
    "    \n",
    "    # generate new thresholds\n",
    "    new_thresholds = thresholds * (1 + noise)\n",
    "\n",
    "    # Have to use slicing because threshold is not writeable\n",
    "    modified_tree.tree_.threshold[:] = new_thresholds\n",
    "    return modified_tree\n",
    "\n",
    "def compare_models(base_model, modified_model, X):\n",
    "    base_predictions = base_model.predict(X)\n",
    "    modified_predictions = modified_model.predict(X)\n",
    "    mismatch_percentage = np.mean(base_predictions != modified_predictions)\n",
    "    return mismatch_percentage\n",
    "\n",
    "def compute_diff(y_pred_1, y_pred_2):\n",
    "    return np.mean(y_pred_1 != y_pred_2)\n",
    "\n",
    "def modify_decision_tree(base_model, X_train, y_train, X_test, modification_factors, number_of_models=20):\n",
    "    best_diff = -1\n",
    "    best_model = None\n",
    "    \n",
    "    for i in range(number_of_models):\n",
    "        print(f\"{i+1}th Model\")\n",
    "        hyperparameters = generate_random_hyperparameters()\n",
    "        \n",
    "        for modification_factor in modification_factors:\n",
    "            random_model = DecisionTreeClassifier(**hyperparameters)\n",
    "            random_model.fit(X_train, y_train)\n",
    "            \n",
    "            modified_model = add_noise_to_thresholds(random_model, modification_factor)\n",
    "            diff = compare_models(base_model, modified_model, X_test)\n",
    "            \n",
    "            print(f\"Modification factor: {modification_factor}, Mismatch percentage: {diff}\")\n",
    "            \n",
    "            if diff > best_diff and (diff >=0.15 and diff <= 0.19):\n",
    "                best_diff = diff \n",
    "                best_model = modified_model\n",
    "                print(f\"Best Model Found with {best_diff}\")\n",
    "    \n",
    "    return best_model"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fb632903-7860-4b55-9fb8-b8a185e0ac82",
   "metadata": {},
   "source": [
    "**Generate and Compare Model Variations**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1b0870f5-53fa-445f-8f91-e25e14e9e89c",
   "metadata": {},
   "outputs": [],
   "source": [
    "base_model.tree_.threshold"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "79d408e6-b010-4d78-abfa-55944b10da72",
   "metadata": {},
   "outputs": [],
   "source": [
    "np.random.seed(400)\n",
    "np.random.seed(200)\n",
    "np.random.seed(100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "078b33f9-aeb2-443e-975e-9ff5e16b0bb3",
   "metadata": {},
   "outputs": [],
   "source": [
    "modification_factors = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0]\n",
    "modified_tree = modify_decision_tree(base_model, X_train_scaled_df, y_train, X_test_scaled_df, modification_factors)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b1cdef7c-4522-43ad-959c-ee9404a503e2",
   "metadata": {},
   "outputs": [],
   "source": [
    "plot_decision_tree(base_model, X_train.columns, class_names=[\"0\", \"1\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d906dd12-16f9-4650-a3a2-22dfdb13ab3c",
   "metadata": {},
   "outputs": [],
   "source": [
    "plot_decision_tree(modified_tree, X_train.columns, class_names=[\"0\", \"1\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ab694fe8-8735-48d3-97c0-2ce64df963c7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Renaming the models\n",
    "model1 = base_model\n",
    "model2 = modified_tree"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "15fa4c56-456e-4865-bfb2-3950a1722062",
   "metadata": {},
   "outputs": [],
   "source": [
    "model2 = base_model\n",
    "model1 = modified_tree"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9ea03138-8060-44ec-80ba-72d49272ec80",
   "metadata": {},
   "source": [
    "**Evaluate Both Models**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "6e595955-68c6-41b3-8b0e-051de7cf86ba",
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'model1' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[1], line 2\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[38;5;66;03m# Evaluate both models\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m y_pred_1 \u001b[38;5;241m=\u001b[39m \u001b[43mmodel1\u001b[49m\u001b[38;5;241m.\u001b[39mpredict(X_test_scaled_df)\n\u001b[1;32m      3\u001b[0m y_pred_2 \u001b[38;5;241m=\u001b[39m model2\u001b[38;5;241m.\u001b[39mpredict(X_test_scaled_df)\n\u001b[1;32m      5\u001b[0m accuracy1 \u001b[38;5;241m=\u001b[39m accuracy_score(y_test, y_pred_1)\n",
      "\u001b[0;31mNameError\u001b[0m: name 'model1' is not defined"
     ]
    }
   ],
   "source": [
    "# Evaluate both models\n",
    "y_pred_1 = model1.predict(X_test_scaled_df)\n",
    "y_pred_2 = model2.predict(X_test_scaled_df)\n",
    "\n",
    "accuracy1 = accuracy_score(y_test, y_pred_1)\n",
    "accuracy2 = accuracy_score(y_test, y_pred_2)\n",
    "\n",
    "# Print results\n",
    "print(\"Model 1 accuracy:\", accuracy1)\n",
    "print(\"Model 2 accuracy:\", accuracy2)\n",
    "print(\"\\nAccuracy difference:\", abs(accuracy1 - accuracy2))\n",
    "print(f\"\\nPercentage of different outputs: {compute_diff(y_pred_1, y_pred_2):.2%}\\n\")\n",
    "print(model1.get_params())\n",
    "print(model2.get_params())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "80de594a-718a-48e3-b036-c2a5713e6588",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0a920ffa-f97b-469d-8435-a5b2dee11e62",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(compare_models(model1, model2, X_test_gen_scaled_df))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "663ef90c-209f-48c5-87f3-784b78b391af",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "cd8a30d3-da04-4c7f-953f-a6eec2515698",
   "metadata": {},
   "source": [
    "#### Compare Model Boundaries"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "15fea562-bb74-41a7-a270-87fc65b1fa25",
   "metadata": {},
   "source": [
    "**Plot Model Decision Boundaries**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "80b922f5-4307-452f-90d2-397fc5e969ad",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Function to plot decision boundary\n",
    "def plot_decision_boundary(X_test, y_pred, feature_1, feature_2):\n",
    "    # Create a scatter plot of the predictions\n",
    "    plt.figure(figsize=(8, 6))\n",
    "    plt.scatter(X_test[feature_1], X_test[feature_2], c=y_pred, cmap='rainbow', edgecolor='black', s=20)\n",
    "    \n",
    "    # Add labels and title\n",
    "    plt.xlabel(feature_1)\n",
    "    plt.ylabel(feature_2)\n",
    "    plt.title('Logistic Regression Decision Boundary')\n",
    "    \n",
    "    # Add a colorbar to indicate the predicted classes\n",
    "    cbar = plt.colorbar()\n",
    "    cbar.set_ticks([0, 1])\n",
    "    cbar.set_ticklabels([\"No\", \"Yes\"])\n",
    "\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6d4d70a4-794b-4bc0-8659-d244baa22e7c",
   "metadata": {},
   "source": [
    "**Model 1 Decision Boundary**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "de4db779-ea88-4543-944d-46c15b3e31de",
   "metadata": {},
   "outputs": [],
   "source": [
    "plot_decision_boundary(X_test_scaled_df, y_pred_1, \"Monetary (c.c. blood)\", \"Frequency (times)\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "599fcca1-20b3-4b16-89c4-4fba23c314ce",
   "metadata": {},
   "source": [
    "**Model 2 Decision Boundary**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bff87fe6-d656-416c-88a7-47fd6d503a63",
   "metadata": {},
   "outputs": [],
   "source": [
    "plot_decision_boundary(X_test_scaled_df, y_pred_2, \"Monetary (c.c. blood)\", \"Frequency (times)\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c3249b8a-a969-4b3c-a2a4-77e0e6f99f42",
   "metadata": {},
   "outputs": [],
   "source": [
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7c69fb05-4f20-4c89-bf16-7409bf20ce84",
   "metadata": {},
   "outputs": [],
   "source": [
    "def round_values(data, precision=3):\n",
    "    if isinstance(data, list):\n",
    "        return [round_values(item, precision) for item in data]\n",
    "    elif isinstance(data, dict):\n",
    "        return {key: round_values(value, precision) for key, value in data.items()}\n",
    "    elif isinstance(data, (int, float)):\n",
    "        return round(data, precision)\n",
    "    return data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "26faf3f2-7a70-44ee-b224-634d7a90edda",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d68f22f5-c4d4-4380-b345-a141c9e67853",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6337c4f6-47d2-45cb-923a-ff314380fd52",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fd2c3100-c6af-42d4-9a87-80a7f10c105e",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d0cdabfb-e346-46e1-a5b3-f5e43d608f79",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b20f6fdd-cb92-43b7-b1bf-b32f34fea7ad",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "43a1d4fa-fb39-46e7-8c14-645e213aa7f3",
   "metadata": {},
   "source": [
    "#### Sample Data Creation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a4c67e91-d622-4a73-bedd-f3535c46c5a1",
   "metadata": {},
   "outputs": [],
   "source": [
    "def write_data(data, file_name, varname):\n",
    "    datastr = f\"\\n{varname} = {data}\"\n",
    "    \n",
    "    # Write this string to the file\n",
    "    with open(file_name, 'a') as file:\n",
    "        file.write(datastr)\n",
    "\n",
    "# This stays constant for this iPython file\n",
    "FILE_NAME = \"./../samples/blood/level_3.py\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "166d67f7-55ae-4e56-8ee5-a45649a95bf0",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(f\"Number of mismatched samples: {np.sum(y_pred_1 != y_pred_2)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d2ba908c-9ccc-43ed-aebc-34c5e3eeac92",
   "metadata": {},
   "outputs": [],
   "source": [
    "verb_data = []\n",
    "for idx in range(len(X_test_scaled_df)):\n",
    "    data_point = {\n",
    "        \"input\": round_values(X_test_scaled_df.iloc[idx].to_list()),\n",
    "        \"output\": {\n",
    "            \"model1\": int(y_pred_1[idx]),\n",
    "            \"model2\": int(y_pred_2[idx])\n",
    "        }\n",
    "    }\n",
    "    verb_data.append(data_point)\n",
    "\n",
    "print(f\"Number of samples in verb_data: {len(verb_data)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8b0374cf-250d-4b9c-ad17-b30db0b6d8bf",
   "metadata": {},
   "outputs": [],
   "source": [
    "varname = \"verb_data\"\n",
    "data = verb_data\n",
    "write_data(data, FILE_NAME, varname)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bcc9f635",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aa9c5769",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "32b9d1e8-7c1a-4839-8b3f-05eea5e03bcc",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_gen_pred_1 = model1.predict(X_test_gen_scaled_df)\n",
    "y_gen_pred_2 = model2.predict(X_test_gen_scaled_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5b11b0a3-deea-4f17-99f7-c992d2b470df",
   "metadata": {},
   "outputs": [],
   "source": [
    "gen_data = []\n",
    "for idx in range(len(X_test_gen_scaled_df)):\n",
    "    data_point = {\n",
    "        \"input\": round_values(X_test_gen_scaled_df.iloc[idx].to_list()),\n",
    "        \"output\": {\n",
    "            \"model1\": int(y_gen_pred_1[idx]),\n",
    "            \"model2\": int(y_gen_pred_2[idx])\n",
    "        }\n",
    "    }\n",
    "    gen_data.append(data_point)\n",
    "\n",
    "print(f\"Number of samples in gen_data: {len(gen_data)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "64268a33-c206-4002-b456-f9ed0d5651dc",
   "metadata": {},
   "outputs": [],
   "source": [
    "varname = \"gen_data\"\n",
    "data = gen_data\n",
    "write_data(data, FILE_NAME, varname)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f132e020-5ee2-4c63-94a1-ab02b41d9a52",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fe35af28",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "569c615f-6aaa-42a0-bd78-57f67a13c8fa",
   "metadata": {},
   "outputs": [],
   "source": [
    "def prune_data(gen_data):\n",
    "    return [{\"input\": inst[\"input\"], \"output\": {\"model1\": inst[\"output\"][\"model1\"]}} for inst in gen_data]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ca15745c-cc52-4855-a227-06aef729d537",
   "metadata": {},
   "outputs": [],
   "source": [
    "varname = \"gen_data_pruned\"\n",
    "data = prune_data(gen_data)\n",
    "write_data(data, FILE_NAME, varname)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "39486abe-a340-480f-a7e4-8449c05d372d",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "00da3464",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1ffcfa7d-c895-42f0-bee6-dec69c44e339",
   "metadata": {},
   "outputs": [],
   "source": [
    "def write_structures(structure_text, file_name, varname):\n",
    "    datastr = f\"\\n{varname} = '''{structure_text}'''\"\n",
    "    \n",
    "    # write this string to the file\n",
    "    with open(file_name, 'a') as file:\n",
    "        file.write(datastr)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0acfe6a3-a651-4bcf-91a6-1cf038ff43a6",
   "metadata": {},
   "outputs": [],
   "source": [
    "STRUCTURES_FILE_NAME = \"./../structures/blood/level_2.py\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bfe59780-2e74-4418-9a06-cff330c76e6c",
   "metadata": {},
   "outputs": [],
   "source": [
    "model1_text = export_text(model1, feature_names=X_train.columns)\n",
    "model2_text = export_text(model2, feature_names=X_train.columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c585b9b2",
   "metadata": {},
   "outputs": [],
   "source": [
    "write_structures(model1_text, STRUCTURES_FILE_NAME, \"model1\")\n",
    "write_structures(model2_text, STRUCTURES_FILE_NAME, \"model2\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bc266e8a",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "93681121",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6b8f7c24",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "854c8d87",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "91d06e6c",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e36dfd05-2889-4a04-bcf2-44755e582830",
   "metadata": {},
   "outputs": [],
   "source": [
    "def prediction_zero(data):\n",
    "    prediction0_1 = [i for i in range(len(data)) if data[i][\"output\"][\"model1\"] == 1]\n",
    "    prediction0_2 = [i for i in range(len(data)) if data[i][\"output\"][\"model2\"] == 1]\n",
    "    return prediction0_1, prediction0_2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "346e21df-a19c-4af6-81f2-30017c29bece",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(f\"Model 1: {len(prediction_zero(verb_data)[0])}\")\n",
    "print(f\"Model 2: {len(prediction_zero(verb_data)[1])}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a443c126-4c38-4bef-b3cf-cf959db8269f",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(f\"Model 1: {len(prediction_zero(gen_data)[0])}\")\n",
    "print(f\"Model 2: {len(prediction_zero(gen_data)[1])}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e73d4fd9-65ec-48ae-86e5-f1c9f6c279b2",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0d3f2ea6-7d97-4a55-88d9-974b64e8b102",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5fb92919-4141-4e1b-826f-36f0133de00d",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "788ed8c5-08a6-44e7-81dc-3cddab421823",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4f5fc898-c4b5-4c1e-b61b-b0b5f413c2be",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ea935b96-3e8f-4ecf-8f62-70de40a42956",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f6697b78-feb2-47b3-9292-3f36a4803345",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "87d16061-8c36-4344-a471-16ac8842266c",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3ea76c10-7c95-4881-866e-c2dbb2af555f",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fb6d52a3-cc51-4ade-a19e-0f5dec85633c",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ce1d25cc-bf05-490b-83bb-1e7033d43d23",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "75eb9019-fd80-40fa-be36-ec1e9a4a04f3",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
