{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "232e3ec8",
   "metadata": {},
   "source": [
    "# Code adapted from the following for illustrating LPath applied to VAEs. It contains feature extraction followed by COPOD. The exact details below may not be correct, but it illustrates the implementation.\n",
    "\n",
    "\n",
    "https://medium.com/@rekalantar/variational-auto-encoder-vae-pytorch-tutorial-dce2d2fe0f5f\n",
    "https://github.com/yzhao062/pyod/blob/master/examples/copod_example.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "395c4e7b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pyod\n",
    "\n",
    "from pyod.models.copod import COPOD\n",
    "from pyod.utils.data import generate_data\n",
    "from pyod.utils.data import evaluate_print\n",
    "from pyod.utils.example import visualize\n",
    "\n",
    "import torch\n",
    "from torch.utils.data import DataLoader, TensorDataset\n",
    "import numpy as np\n",
    "import torch.nn as nn\n",
    "import matplotlib.pyplot as plt\n",
    "from torch.utils.data import DataLoader\n",
    "from torch.optim import Adam\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "8bce09e6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# percentage of outliers, training data is polluted, unlike many OOD detections considered in vision\n",
    "contamination = 0.1\n",
    "n_train = 200  # number of training points\n",
    "n_test = 100  # number of testing points\n",
    "\n",
    "X_train, X_test, y_train, y_test = generate_data(\n",
    "    n_train=n_train, n_test=n_test, n_features=2, contamination=contamination\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a6f1c644",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "e2873285",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train = (X_train - X_train.mean(0))/X_train.std(0)\n",
    "X_test = (X_test - X_train.mean(0))/X_train.std(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "9c1f76a9",
   "metadata": {},
   "outputs": [],
   "source": [
    "batch_size = 32\n",
    "\n",
    "X_train_t = torch.tensor(X_train).float()\n",
    "X_test_t = torch.tensor(X_test).float()\n",
    "\n",
    "train_dataset = TensorDataset(X_train_t)\n",
    "test_dataset = TensorDataset(X_test_t)\n",
    "\n",
    "train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=False, num_workers=1)\n",
    "test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False, num_workers=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "318bf299",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "75c59ca5",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "\n",
    "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
    "device\n",
    "\n",
    "\n",
    "class VAE(nn.Module):\n",
    "\n",
    "    def __init__(self, input_dim=2, hidden_dim=100, latent_dim=100, device=device):\n",
    "        super(VAE, self).__init__()\n",
    "\n",
    "        # encoder\n",
    "        self.encoder = nn.Sequential(\n",
    "            nn.Linear(input_dim, hidden_dim),\n",
    "            nn.LeakyReLU(0.2),\n",
    "            nn.Linear(hidden_dim, latent_dim),\n",
    "            nn.LeakyReLU(0.2)\n",
    "        )\n",
    "        \n",
    "        # latent mean and variance \n",
    "        self.mean_layer = nn.Linear(latent_dim, 2)\n",
    "        self.logvar_layer = nn.Linear(latent_dim, 2)\n",
    "        \n",
    "        # decoder\n",
    "        self.decoder = nn.Sequential(\n",
    "            nn.Linear(2, latent_dim),\n",
    "            nn.LeakyReLU(0.2),\n",
    "            nn.Linear(latent_dim, hidden_dim),\n",
    "            nn.LeakyReLU(0.2),\n",
    "            nn.Linear(hidden_dim, input_dim),\n",
    "            nn.Sigmoid()\n",
    "        )\n",
    "     \n",
    "    def encode(self, x):\n",
    "        x = self.encoder(x)\n",
    "        mean, logvar = self.mean_layer(x), self.logvar_layer(x)\n",
    "        return mean, logvar\n",
    "\n",
    "    def reparameterization(self, mean, var):\n",
    "        epsilon = torch.randn_like(var).to(device)      \n",
    "        z = mean + var * epsilon\n",
    "        return z\n",
    "\n",
    "    def decode(self, x):\n",
    "        return self.decoder(x)\n",
    "\n",
    "    def forward(self, x):\n",
    "        mean, log_var = self.encode(x)\n",
    "        z = self.reparameterization(mean, log_var)\n",
    "        x_hat = self.decode(z)\n",
    "        return x_hat, mean, log_var"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "f9436858",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = VAE().to(device)\n",
    "\n",
    "optimizer = Adam(model.parameters(), lr=1e-3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "da083512",
   "metadata": {},
   "outputs": [],
   "source": [
    "def loss_function(x, x_hat, mean, log_var):\n",
    "    reproduction_loss = nn.functional.binary_cross_entropy(x_hat, x, reduction='sum')\n",
    "    KLD = - 0.5 * torch.sum(1+ log_var - mean.pow(2) - log_var.exp())\n",
    "\n",
    "    return reproduction_loss + KLD"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "57f2754f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\tEpoch 1 \tAverage Loss:  1.289099395275116\n",
      "\tEpoch 2 \tAverage Loss:  0.9463532666365305\n",
      "\tEpoch 3 \tAverage Loss:  0.5677839579681555\n",
      "\tEpoch 4 \tAverage Loss:  0.05084100986520449\n",
      "\tEpoch 5 \tAverage Loss:  -0.7867279176910719\n",
      "\tEpoch 6 \tAverage Loss:  -2.161831953873237\n",
      "\tEpoch 7 \tAverage Loss:  -4.526237368583679\n",
      "\tEpoch 8 \tAverage Loss:  -9.138820317884287\n",
      "\tEpoch 9 \tAverage Loss:  -15.718053215493759\n",
      "\tEpoch 10 \tAverage Loss:  -17.077984276227653\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "-3278.9729810357094"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def train(model, optimizer, epochs, device):\n",
    "    model.train()\n",
    "    for epoch in range(epochs):\n",
    "        overall_loss = 0\n",
    "        for batch_idx, x in enumerate(train_loader):\n",
    "            x = x[0].to(device)\n",
    "\n",
    "            optimizer.zero_grad()\n",
    "\n",
    "            x_hat, mean, log_var = model(x)\n",
    "            loss = loss_function(x, x_hat, mean, log_var)\n",
    "            \n",
    "            overall_loss += loss.item()\n",
    "            \n",
    "            loss.backward()\n",
    "            optimizer.step()\n",
    "\n",
    "        print(\"\\tEpoch\", epoch + 1, \"\\tAverage Loss: \", overall_loss/(batch_idx*batch_size))\n",
    "    return overall_loss\n",
    "\n",
    "train(model, optimizer, epochs=10, device=device)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a19d16da",
   "metadata": {},
   "source": [
    "## Extract Neural Features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "2da3a63c",
   "metadata": {},
   "outputs": [],
   "source": [
    "model.eval()\n",
    "neural_features_train = {\n",
    "    \"x\": [],\n",
    "    \"x_hat\": [],\n",
    "    \"mean\": [],\n",
    "    \"log_var\": []\n",
    "}\n",
    "neural_features_test = {\n",
    "    \"x\": [],\n",
    "    \"x_hat\": [],\n",
    "    \"mean\": [],\n",
    "    \"log_var\": []\n",
    "}\n",
    "\n",
    "for batch_idx, x in enumerate(train_loader):\n",
    "    x = x[0].to(device)\n",
    "    x_hat, mean, log_var = model(x)\n",
    "    neural_features_train[\"x\"].append(x)\n",
    "    neural_features_train[\"x_hat\"].append(torch.norm((x_hat-x), dim=1))\n",
    "    neural_features_train[\"mean\"].append(torch.norm(mean, dim=1))\n",
    "    neural_features_train[\"log_var\"].append(torch.norm(log_var.exp(), dim=1))\n",
    "    \n",
    "for batch_idx, x in enumerate(test_loader):\n",
    "    x = x[0].to(device)\n",
    "    x_hat, mean, log_var = model(x)\n",
    "    neural_features_test[\"x\"].append(x)\n",
    "    neural_features_test[\"x_hat\"].append(torch.norm((x_hat-x), dim=1))\n",
    "    neural_features_test[\"mean\"].append(torch.norm(mean, dim=1))\n",
    "    neural_features_test[\"log_var\"].append(torch.norm(log_var.exp(), dim=1))\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "41fd72b2",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "644c00d4",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "7a3c0564",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train_n = torch.hstack([\n",
    "    torch.concat(neural_features_train[\"x_hat\"]).view(-1, 1),\n",
    "    torch.concat(neural_features_train[\"mean\"]).view(-1, 1),\n",
    "    torch.concat(neural_features_train[\"log_var\"]).view(-1, 1)\n",
    "])\n",
    "\n",
    "X_test_n = torch.hstack([\n",
    "    torch.concat(neural_features_test[\"x_hat\"]).view(-1, 1),\n",
    "    torch.concat(neural_features_test[\"mean\"]).view(-1, 1),\n",
    "    torch.concat(neural_features_test[\"log_var\"]).view(-1, 1)\n",
    "])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "0ef2a163",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train_n_np = X_train_n.detach().cpu().numpy()\n",
    "X_test_n_np = X_test_n.detach().cpu().numpy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "31a31dff",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "5e472ea3",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train_copod = (X_train_n_np - X_train_n_np.mean(0))/X_train_n_np.std(0)\n",
    "X_test_copod = (X_test_n_np - X_train_n_np.mean(0))/X_train_n_np.std(0)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bab49b02",
   "metadata": {},
   "source": [
    "## Fit Unsupervised Statistical Density Estimation such as COPOD"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "ee9dc71a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "COPOD(contamination=0.1, n_jobs=1)"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# train COPOD detector\n",
    "clf_name = 'COPOD'\n",
    "clf = COPOD()\n",
    "clf.fit(X_train_copod)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "fb161f8d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# get the prediction labels and outlier scores of the training data\n",
    "y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)\n",
    "y_train_scores = clf.decision_scores_  # raw outlier scores\n",
    "\n",
    "# get the prediction on the test data\n",
    "y_test_pred = clf.predict(X_test_copod)  # outlier labels (0 or 1)\n",
    "y_test_scores = clf.decision_function(X_test_copod)  # outlier scores\n",
    "\n",
    "# the prediction and the confidence score\n",
    "y_test_pred, y_test_pred_confidence = clf.predict(X_test_copod, return_confidence=True) \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "a1105aba",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "On Training Data:\n",
      "COPOD ROC:0.4372, precision @ rank n:0.0\n",
      "\n",
      "On Test Data:\n",
      "COPOD ROC:0.7444, precision @ rank n:0.0\n"
     ]
    }
   ],
   "source": [
    "# evaluate and print the results\n",
    "print(\"\\nOn Training Data:\")\n",
    "evaluate_print(clf_name, y_train, y_train_scores)\n",
    "print(\"\\nOn Test Data:\")\n",
    "evaluate_print(clf_name, y_test, y_test_scores)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7f6c004c",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0c74c4fb",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0dcb02fb",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
