{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "086ba327-a8ed-4130-875c-48667bc468de",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Standard library\n",
    "import random\n",
    "import sys\n",
    "import time\n",
    "\n",
    "# Third-party\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from scipy.spatial.distance import cdist\n",
    "from sklearn.cluster import KMeans\n",
    "\n",
    "# Project-local modules live one directory up.\n",
    "sys.path.append('../')\n",
    "from icfesl import *\n",
    "from utility_functions import *\n",
    "\n",
    "# Kept after the star imports so these names win over anything the star imports export.\n",
    "from sklearn.feature_selection import VarianceThreshold\n",
    "from astropy.io import ascii"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "a006f718-df78-4046-bc96-2c1ce6ad84ff",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Location of the GISETTE training feature file, relative to the notebook's working directory.\n",
    "train_data = \"../../../writing/UCI datasets/gisette/GISETTE/gisette_train.data\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "77ba850a-4e2a-4622-a7d3-1f9a3a319fb3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Location of the GISETTE training label file, relative to the notebook's working directory.\n",
    "train_labels = \"../../../writing/UCI datasets/gisette/GISETTE/gisette_train.labels\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "25fef8fa-599f-4b91-87de-054986494bd2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parse the training feature file with astropy's ASCII reader, then convert the table to a pandas DataFrame.\n",
    "X = ascii.read(train_data).to_pandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "2f90e5c0-4f40-4648-afec-48ae36741e77",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parse the training label file with astropy's ASCII reader, then convert the table to a pandas DataFrame.\n",
    "# NOTE(review): the GISETTE labels are presumably encoded as -1/+1 — confirm before fitting models that expect 0/1.\n",
    "y = ascii.read(train_labels).to_pandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b631f767-5419-4800-9f8b-eb445ef91e54",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run the project helper `f_get_dummies` over every column of X\n",
    "# (presumably dummy/one-hot encoding, judging by the helper's name — confirm).\n",
    "# NOTE(review): `from icfesl import *` only binds the name `icfesl` if the module itself\n",
    "# exports an attribute with that name — verify this call resolves on a fresh kernel.\n",
    "X2 = icfesl.f_get_dummies(\n",
    "    X,\n",
    "    X.columns,\n",
    "    drop_first=True,\n",
    "    dummy_na=False,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8d127d4a-4251-4603-ad24-a35aa8058c53",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Names of the columns that exist only in the encoded frame X2, i.e. are absent from X.\n",
    "cat_vars = [\n",
    "    column\n",
    "    for column in X2.columns\n",
    "    if column not in X.columns\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c18d31b9-70ca-4a9e-b3e1-6f54ab89e62d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Low-variance filter: fit a VarianceThreshold(0.05) on X2 and keep only the columns it supports.\n",
    "selector = VarianceThreshold(threshold=0.05).fit(X2)\n",
    "\n",
    "# Boolean mask over X2's columns, then the matching column labels.\n",
    "selected_features_mask = selector.get_support()\n",
    "selected_column_names = X2.columns[selected_features_mask]\n",
    "\n",
    "# Rebind X2 to the surviving columns only.\n",
    "X2 = X2[selected_column_names]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cbe1ea5b-6513-44e0-afe2-3c3ea00a7a3c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cast every column of X2 to integer dtype in a single vectorised call.\n",
    "# Building a new frame (rather than assigning column-by-column into a frame that was\n",
    "# obtained by column selection) avoids pandas' SettingWithCopyWarning and one write per column.\n",
    "X2 = X2.astype('int')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "004920ca-da28-4341-831b-2cf2c7560f03",
   "metadata": {},
   "source": [
    "### One-hot encoding"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "253b68a2-5d67-480b-ac09-afd4640b479b",
   "metadata": {},
   "source": [
    "#### 1. Logit (binomial GLM)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "adf35692-a04f-471d-8033-ba6e4e83aba4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fit a binomial GLM on the encoded features and report the elapsed fit time in seconds.\n",
    "# NOTE(review): `sm` is never imported explicitly in this notebook; presumably it is\n",
    "# statsmodels.api re-exported by one of the star imports — confirm, or import it directly.\n",
    "\n",
    "# The Binomial family needs a 0/1 response. Map label 1 -> 1 and every other value\n",
    "# (e.g. -1 in a -1/+1 encoding) -> 0; labels that are already 0/1 pass through unchanged.\n",
    "y_binary = (y.iloc[:, 0] == 1).astype(int)\n",
    "\n",
    "# perf_counter is monotonic and high-resolution, so it is the appropriate clock for elapsed time.\n",
    "start = time.perf_counter()\n",
    "model = sm.GLM(y_binary, sm.add_constant(X2, has_constant='skip'), family=sm.families.Binomial()).fit(disp=False)\n",
    "end = time.perf_counter()\n",
    "print(round(end - start, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3021b066-dbb8-43f0-9f49-8a252638228f",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "acb40c09-3c12-4fe6-9e2d-705b4742601a",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5c8a91eb-9a53-43b2-a370-9ebea3817df2",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2dd941b4-2e09-4e24-a0e7-c661da325919",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "88d357be-d8e9-4d69-a682-7172116b4cfe",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "03c7fdca-df61-4309-9c50-7413b02b0797",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2350c64c-3f2a-4341-9010-daaf80986979",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6f86496f-6f1e-4748-81c3-74e697b8f9f6",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3f07e4ee-1ee3-4e02-9c09-e04048040ffa",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2dbf8e7b-80ff-49af-b55e-1d673608e016",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
