{
 "cells": [
  {
   "cell_type": "code",
   "id": "initial_id",
   "metadata": {
    "collapsed": true,
    "ExecuteTime": {
     "end_time": "2025-03-27T18:43:07.993300Z",
     "start_time": "2025-03-27T18:43:02.959188Z"
    }
   },
   "source": [
    "import pandas as pd\n",
    "from ucimlrepo import fetch_ucirepo \n",
    "  \n",
    "# fetch dataset \n",
    "census_income = fetch_ucirepo(id=20) \n",
    "  \n",
    "# data (as pandas dataframes) \n",
    "X = census_income.data.features \n",
    "y = census_income.data.targets \n",
    "  \n",
    "# variable information \n",
    "print(census_income.variables) \n"
   ],
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              name     role         type      demographic  \\\n",
      "0              age  Feature      Integer              Age   \n",
      "1        workclass  Feature  Categorical           Income   \n",
      "2           fnlwgt  Feature      Integer             None   \n",
      "3        education  Feature  Categorical  Education Level   \n",
      "4    education-num  Feature      Integer  Education Level   \n",
      "5   marital-status  Feature  Categorical            Other   \n",
      "6       occupation  Feature  Categorical            Other   \n",
      "7     relationship  Feature  Categorical            Other   \n",
      "8             race  Feature  Categorical             Race   \n",
      "9              sex  Feature       Binary              Sex   \n",
      "10    capital-gain  Feature      Integer             None   \n",
      "11    capital-loss  Feature      Integer             None   \n",
      "12  hours-per-week  Feature      Integer             None   \n",
      "13  native-country  Feature  Categorical            Other   \n",
      "14          income   Target       Binary           Income   \n",
      "\n",
      "                                          description units missing_values  \n",
      "0                                                 N/A  None             no  \n",
      "1   Private, Self-emp-not-inc, Self-emp-inc, Feder...  None            yes  \n",
      "2                                                None  None             no  \n",
      "3    Bachelors, Some-college, 11th, HS-grad, Prof-...  None             no  \n",
      "4                                                None  None             no  \n",
      "5   Married-civ-spouse, Divorced, Never-married, S...  None             no  \n",
      "6   Tech-support, Craft-repair, Other-service, Sal...  None            yes  \n",
      "7   Wife, Own-child, Husband, Not-in-family, Other...  None             no  \n",
      "8   White, Asian-Pac-Islander, Amer-Indian-Eskimo,...  None             no  \n",
      "9                                       Female, Male.  None             no  \n",
      "10                                               None  None             no  \n",
      "11                                               None  None             no  \n",
      "12                                               None  None             no  \n",
      "13  United-States, Cambodia, England, Puerto-Rico,...  None            yes  \n",
      "14                                       >50K, <=50K.  None             no  \n"
     ]
    }
   ],
   "execution_count": 1
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-03-27T21:22:50.348277Z",
     "start_time": "2025-03-27T21:22:49.617416Z"
    }
   },
   "cell_type": "code",
   "source": [
    "import polars as pl\n",
    "import numpy as np\n",
    "\n",
    "def encode(df: pl.DataFrame, columns) -> pl.DataFrame:\n",
    "    df_encoded = []\n",
    "    \n",
    "    for col in columns:\n",
    "        dummies = df.select(col).to_dummies()\n",
    "        if dummies.shape[1] > 1:\n",
    "            dummies = dummies[:, 1:]\n",
    "        df_encoded.append(dummies)\n",
    "    for col in df.columns:\n",
    "        if col not in columns:\n",
    "            df_encoded.append(df.select(col))\n",
    "    \n",
    "    df_encoded = pl.concat(df_encoded, how=\"horizontal\")\n",
    "    return df_encoded\n",
    "# Define the predictors\n",
    "X.loc[:,\"occupation\"] = X[[\"occupation\"]].astype(str)\n",
    "df = pl.from_pandas(X.filter([\"sex\", \"age\", \"education\", \"race\", \"hours-per-week\", \"relationship\", \"marital-status\", \"occupation\"]))\n",
    "encoded_df = encode(df, [\"sex\", \"education\", \"race\", \"relationship\", \"marital-status\", \"occupation\"])\n",
    "\n",
    "# There are 4 distinct values for y, but two of them can be merged\n",
    "y_pl = pl.from_pandas(y['income'].replace(\"<=50K\", \"<=50K.\").replace(\">50K\", \">50K.\"))\n",
    "y_pl = y_pl.to_dummies()[:, 0].to_frame()  # 0, 1\n",
    "\n",
    "y_pl_np = np.array(y_pl, dtype=bool) * 2 - 1\n",
    "encoded_df = y_pl_np * encoded_df # flipping\n",
    "flipped_predictors = np.array(encoded_df)"
   ],
   "id": "7c1622b9a5751a03",
   "outputs": [],
   "execution_count": 39
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-03-27T21:22:51.122741Z",
     "start_time": "2025-03-27T21:22:51.116927Z"
    }
   },
   "cell_type": "code",
   "source": "encoded_df",
   "id": "7802a8e452bc157",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[  1,   0,   0, ...,   0,  39,  40],\n",
       "       [  1,   0,   0, ...,   0,  50,  13],\n",
       "       [  1,   0,   0, ...,   0,  38,  40],\n",
       "       ...,\n",
       "       [  1,   0,   0, ...,   0,  38,  50],\n",
       "       [  1,   0,   0, ...,   0,  44,  40],\n",
       "       [ -1,   0,   0, ...,   0, -35, -60]])"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 40
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": "",
   "id": "585520c23a9c6ab"
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
