{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "f7f7df75",
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "e03ee1b2",
   "metadata": {},
   "outputs": [],
   "source": [
    "ratings_df = pd.read_csv('u.data', \n",
    "                        names=['userid', 'movieid', 'rating', 'timestamp'],\n",
    "                        sep='\\t', encoding='latin-1')\n",
    "ratings_df = ratings_df.drop(['timestamp'], axis = 1)\n",
    "\n",
    "users_df = pd.read_csv('u.user', \n",
    "                      names=['userid', 'age', 'gender', 'occupation', 'zip_code'],\n",
    "                      sep='|', encoding='latin-1')\n",
    "users_df = users_df.drop(['zip_code'],axis = 1)\n",
    "users_df = users_df.drop(['occupation'],axis = 1)\n",
    "users_df = users_df.drop(['age'],axis = 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "3517ae4e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>userid</th>\n",
       "      <th>gender</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>M</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>F</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   userid gender\n",
       "0       1      M\n",
       "1       2      F"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\n",
    "users_df.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "8f3af391",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(943, 2)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "users_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "155d18b1",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = (ratings_df.merge(users_df, left_on='userid', right_on='userid')\n",
    "          .reindex(columns=[\"userid\", \"movieid\", \"rating\", \"gender\"]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "39abee26",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>userid</th>\n",
       "      <th>movieid</th>\n",
       "      <th>rating</th>\n",
       "      <th>gender</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>196</td>\n",
       "      <td>242</td>\n",
       "      <td>3</td>\n",
       "      <td>M</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>196</td>\n",
       "      <td>393</td>\n",
       "      <td>4</td>\n",
       "      <td>M</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>196</td>\n",
       "      <td>381</td>\n",
       "      <td>4</td>\n",
       "      <td>M</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>196</td>\n",
       "      <td>251</td>\n",
       "      <td>3</td>\n",
       "      <td>M</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>196</td>\n",
       "      <td>655</td>\n",
       "      <td>5</td>\n",
       "      <td>M</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>99995</th>\n",
       "      <td>941</td>\n",
       "      <td>919</td>\n",
       "      <td>5</td>\n",
       "      <td>M</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>99996</th>\n",
       "      <td>941</td>\n",
       "      <td>273</td>\n",
       "      <td>3</td>\n",
       "      <td>M</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>99997</th>\n",
       "      <td>941</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>M</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>99998</th>\n",
       "      <td>941</td>\n",
       "      <td>294</td>\n",
       "      <td>4</td>\n",
       "      <td>M</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>99999</th>\n",
       "      <td>941</td>\n",
       "      <td>1007</td>\n",
       "      <td>4</td>\n",
       "      <td>M</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>100000 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       userid  movieid  rating gender\n",
       "0         196      242       3      M\n",
       "1         196      393       4      M\n",
       "2         196      381       4      M\n",
       "3         196      251       3      M\n",
       "4         196      655       5      M\n",
       "...       ...      ...     ...    ...\n",
       "99995     941      919       5      M\n",
       "99996     941      273       3      M\n",
       "99997     941        1       5      M\n",
       "99998     941      294       4      M\n",
       "99999     941     1007       4      M\n",
       "\n",
       "[100000 rows x 4 columns]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "6750af29",
   "metadata": {},
   "outputs": [],
   "source": [
    "df.to_csv('ml100k_gender.csv', header=True, index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "05b25c7b",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
