{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Running O^2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np \n",
    "import pandas as pd\n",
    "import o2\n",
    "import random \n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.neighbors import NearestNeighbors\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "from sklearn.model_selection import GridSearchCV, train_test_split\n",
    "from sklearn.feature_selection import VarianceThreshold \n",
    "from sklearn.metrics import accuracy_score, f1_score, roc_auc_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# read data and put in shape\n",
    "# we encourage normalizing the data if possible using sklearn StandardSclaler\n",
    "\n",
    "d = str(\"Data/Wine/\")\n",
    "X_train = pd.read_csv(d+\"X_train.csv\") \n",
    "y_train = pd.read_csv(d+\"y_train.csv\") \n",
    "X_test = pd.read_csv(d+\"X_test.csv\") \n",
    "y_test = pd.read_csv(d+\"y_test.csv\")\n",
    "col_namesX = X_train.columns \n",
    "col_namesy = y_train.columns \n",
    "(n_cur,p) = X_train.shape \n",
    "X_train = X_train.values \n",
    "y_train = y_train.values\n",
    "y_train = y_train.reshape((n_cur,)) \n",
    "X_test = X_test.values \n",
    "y_test = y_test.values \n",
    "y_test = y_test.reshape((len(y_test),))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# the minority class is 0 or 1\n",
    "min_class=1\n",
    "\n",
    "# how many new points to create\n",
    "points=300\n",
    "\n",
    "# categorical features\n",
    "# cat is the simplest apporach\n",
    "# cat_tab uses a neural network to add new catergorical points\n",
    "method=\"cat\"\n",
    "\n",
    "# no need to use this\n",
    "clf = None\n",
    "\n",
    "# three options for creating new points: lr, svm or tree\n",
    "# if you don't have a gurobi license use lr or tree\n",
    "ovs_m=\"lr\"\n",
    "\n",
    "# epochs for neural network in cat_tab method is 20\n",
    "# the number 10 is not important here, tune only the 20 if you want\n",
    "eps=[10, 20]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(299, 11)\n"
     ]
    }
   ],
   "source": [
    "X_train_new, y_train_new = o2.ovs(X_train, \n",
    "                            y_train, min_class, points, method=method, clf=None, ovs_m=ovs_m, \n",
    "                               eps=[10, 10], l1=0.5, l2=0.5, l3=0.5, optimizer=\"lbfgs\", init_loras=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Baseline AUC 0.6535502533129799\n",
      "O2 AUC 0.6811409039623018\n",
      "Baseline F1 0.4397163120567376\n",
      "O2 F1 0.5227722772277228\n"
     ]
    }
   ],
   "source": [
    "# fit a decision tree model using grid seach\n",
    "tree_para = {'criterion':['gini','entropy'],'max_depth':[3, 6, 10, 20, 50, 100]}\n",
    "clf = GridSearchCV(DecisionTreeClassifier(random_state=36), tree_para, cv=3)\n",
    "clf.fit(X_train_new, y_train_new)\n",
    "preds_ovs = clf.predict(X_test)\n",
    "proba_ovs = clf.predict_proba(X_test)[:, 1]\n",
    "clf.fit(X_train, y_train)\n",
    "preds = clf.predict(X_test)\n",
    "proba = clf.predict_proba(X_test)[:, 1]\n",
    "auc_ovs = roc_auc_score(y_test, proba_ovs)\n",
    "f1_ovs = f1_score(y_test, preds_ovs, average='binary')\n",
    "auc = roc_auc_score(y_test, proba)\n",
    "f1 = f1_score(y_test, preds, average='binary')\n",
    "print(\"Baseline AUC \" + str(auc))\n",
    "print(\"O2 AUC \" + str(auc_ovs))\n",
    "print(\"Baseline F1 \" + str(f1))\n",
    "print(\"O2 F1 \" + str(f1_ovs))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Baseline AUC 0.7331917435454969\n",
      "O2 AUC 0.7309794031105507\n",
      "Baseline F1 0.30625\n",
      "O2 F1 0.37500000000000006\n"
     ]
    }
   ],
   "source": [
    "# fit a logistic regression model\n",
    "clf = LogisticRegression()\n",
    "clf.fit(X_train_new, y_train_new)\n",
    "preds_ovs = clf.predict(X_test)\n",
    "proba_ovs = clf.predict_proba(X_test)[:, 1]\n",
    "clf.fit(X_train, y_train)\n",
    "preds = clf.predict(X_test)\n",
    "proba = clf.predict_proba(X_test)[:, 1]\n",
    "auc_ovs = roc_auc_score(y_test, proba_ovs)\n",
    "f1_ovs = f1_score(y_test, preds_ovs, average='binary')\n",
    "auc = roc_auc_score(y_test, proba)\n",
    "f1 = f1_score(y_test, preds, average='binary')\n",
    "print(\"Baseline AUC \" + str(auc))\n",
    "print(\"O2 AUC \" + str(auc_ovs))\n",
    "print(\"Baseline F1 \" + str(f1))\n",
    "print(\"O2 F1 \" + str(f1_ovs))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
