{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Rent-the-Runway Data Specs \n",
      "\n",
      "Rows (unique transactions): 192,544\n",
      "Columns: 15\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>fit</th>\n",
       "      <th>user_id</th>\n",
       "      <th>bust size</th>\n",
       "      <th>item_id</th>\n",
       "      <th>weight</th>\n",
       "      <th>rating</th>\n",
       "      <th>rented for</th>\n",
       "      <th>review_text</th>\n",
       "      <th>body type</th>\n",
       "      <th>review_summary</th>\n",
       "      <th>category</th>\n",
       "      <th>height</th>\n",
       "      <th>size</th>\n",
       "      <th>age</th>\n",
       "      <th>review_date</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>fit</td>\n",
       "      <td>420272</td>\n",
       "      <td>34d</td>\n",
       "      <td>2260466</td>\n",
       "      <td>137lbs</td>\n",
       "      <td>10</td>\n",
       "      <td>vacation</td>\n",
       "      <td>An adorable romper! Belt and zipper were a lit...</td>\n",
       "      <td>hourglass</td>\n",
       "      <td>So many compliments!</td>\n",
       "      <td>romper</td>\n",
       "      <td>5' 8\"</td>\n",
       "      <td>14</td>\n",
       "      <td>28</td>\n",
       "      <td>April 20, 2016</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>fit</td>\n",
       "      <td>273551</td>\n",
       "      <td>34b</td>\n",
       "      <td>153475</td>\n",
       "      <td>132lbs</td>\n",
       "      <td>10</td>\n",
       "      <td>other</td>\n",
       "      <td>I rented this dress for a photo shoot. The the...</td>\n",
       "      <td>straight &amp; narrow</td>\n",
       "      <td>I felt so glamourous!!!</td>\n",
       "      <td>gown</td>\n",
       "      <td>5' 6\"</td>\n",
       "      <td>12</td>\n",
       "      <td>36</td>\n",
       "      <td>June 18, 2013</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>fit</td>\n",
       "      <td>360448</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1063761</td>\n",
       "      <td>NaN</td>\n",
       "      <td>10</td>\n",
       "      <td>party</td>\n",
       "      <td>This hugged in all the right places! It was a ...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>It was a great time to celebrate the (almost) ...</td>\n",
       "      <td>sheath</td>\n",
       "      <td>5' 4\"</td>\n",
       "      <td>4</td>\n",
       "      <td>116</td>\n",
       "      <td>December 14, 2015</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>fit</td>\n",
       "      <td>909926</td>\n",
       "      <td>34c</td>\n",
       "      <td>126335</td>\n",
       "      <td>135lbs</td>\n",
       "      <td>8</td>\n",
       "      <td>formal affair</td>\n",
       "      <td>I rented this for my company's black tie award...</td>\n",
       "      <td>pear</td>\n",
       "      <td>Dress arrived on time and in perfect condition.</td>\n",
       "      <td>dress</td>\n",
       "      <td>5' 5\"</td>\n",
       "      <td>8</td>\n",
       "      <td>34</td>\n",
       "      <td>February 12, 2014</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>fit</td>\n",
       "      <td>151944</td>\n",
       "      <td>34b</td>\n",
       "      <td>616682</td>\n",
       "      <td>145lbs</td>\n",
       "      <td>10</td>\n",
       "      <td>wedding</td>\n",
       "      <td>I have always been petite in my upper body and...</td>\n",
       "      <td>athletic</td>\n",
       "      <td>Was in love with this dress !!!</td>\n",
       "      <td>gown</td>\n",
       "      <td>5' 9\"</td>\n",
       "      <td>12</td>\n",
       "      <td>27</td>\n",
       "      <td>September 26, 2016</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   fit user_id bust size  item_id  weight rating     rented for  \\\n",
       "0  fit  420272       34d  2260466  137lbs     10       vacation   \n",
       "1  fit  273551       34b   153475  132lbs     10          other   \n",
       "2  fit  360448       NaN  1063761     NaN     10          party   \n",
       "3  fit  909926       34c   126335  135lbs      8  formal affair   \n",
       "4  fit  151944       34b   616682  145lbs     10        wedding   \n",
       "\n",
       "                                         review_text          body type  \\\n",
       "0  An adorable romper! Belt and zipper were a lit...          hourglass   \n",
       "1  I rented this dress for a photo shoot. The the...  straight & narrow   \n",
       "2  This hugged in all the right places! It was a ...                NaN   \n",
       "3  I rented this for my company's black tie award...               pear   \n",
       "4  I have always been petite in my upper body and...           athletic   \n",
       "\n",
       "                                      review_summary category height  size  \\\n",
       "0                               So many compliments!   romper  5' 8\"    14   \n",
       "1                            I felt so glamourous!!!     gown  5' 6\"    12   \n",
       "2  It was a great time to celebrate the (almost) ...   sheath  5' 4\"     4   \n",
       "3   Dress arrived on time and in perfect condition.     dress  5' 5\"     8   \n",
       "4                    Was in love with this dress !!!     gown  5' 9\"    12   \n",
       "\n",
       "   age         review_date  \n",
       "0   28      April 20, 2016  \n",
       "1   36       June 18, 2013  \n",
       "2  116   December 14, 2015  \n",
       "3   34   February 12, 2014  \n",
       "4   27  September 26, 2016  "
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import urllib.request \n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import gzip\n",
    "import json\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "pd.options.mode.chained_assignment = None\n",
    "\n",
    "file_name = \"renttherunway_final_data.json.gz\"\n",
    "urllib.request.urlretrieve(\"http://jmcauley.ucsd.edu/data/renttherunway/renttherunway_final_data.json.gz\", file_name)\n",
    "\n",
    "# function to parse file from original site listed above\n",
    "\n",
    "def parse(file):\n",
    "  file = gzip.open(file, \"r\")\n",
    "  for entry in file:\n",
    "    yield eval(entry)\n",
    "    \n",
    "# Setting null name for generator to list conversion- throws error otherwise\n",
    "\n",
    "null = \"Unknown\"\n",
    "fashion_dict_list = list(parse(file_name))\n",
    "fashion_data = pd.DataFrame(fashion_dict_list).replace(\"Unknown\", np.nan)\n",
    "\n",
    "print(\"Rent-the-Runway Data Specs \\n\")\n",
    "print(\"Rows (unique transactions): {row:,}\".format(row=fashion_data.shape[0]))\n",
    "print(\"Columns: {0}\".format(fashion_data.shape[1]))\n",
    "\n",
    "fashion_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['fit', 'user_id', 'bust size', 'item_id', 'weight', 'rating',\n",
       "       'rented for', 'review_text', 'body type', 'review_summary', 'category',\n",
       "       'height', 'size', 'age', 'review_date'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fashion_data.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of unique users: 105,571\n",
      "Number of unique items: 5,850\n"
     ]
    }
   ],
   "source": [
    "print(\"Number of unique users: {user:,}\".format(user=fashion_data[\"user_id\"].nunique()))\n",
    "print(\"Number of unique items: {item:,}\".format(item=fashion_data[\"item_id\"].nunique()))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "After seeing the first few rows and reviewing the missing data here, it's clear some variables require cleaning."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>is_null_count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>fit</th>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>user_id</th>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>bust size</th>\n",
       "      <td>18411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>item_id</th>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>weight</th>\n",
       "      <td>29982</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>rating</th>\n",
       "      <td>82</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>rented for</th>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>review_text</th>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>body type</th>\n",
       "      <td>14637</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>review_summary</th>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>category</th>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>height</th>\n",
       "      <td>677</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>size</th>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>age</th>\n",
       "      <td>960</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>review_date</th>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                is_null_count\n",
       "fit                         0\n",
       "user_id                     0\n",
       "bust size               18411\n",
       "item_id                     0\n",
       "weight                  29982\n",
       "rating                     82\n",
       "rented for                 10\n",
       "review_text                 0\n",
       "body type               14637\n",
       "review_summary              0\n",
       "category                    0\n",
       "height                    677\n",
       "size                        0\n",
       "age                       960\n",
       "review_date                 0"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.DataFrame({\"is_null_count\": fashion_data.isna().sum()}) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(192544, 15)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fashion_data.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Data Cleaning and Exploration\n",
    "\n",
    "### Cleaning approach\n",
    "\n",
    "Cleaning steps here include:\n",
    "- Adding underscores where white space exists in column names\n",
    "- Dropping records without a rating since these are required for the analysis\n",
    "- Converting height column from string to numeric in inches\n",
    "- Variable type conversions (e.g. to numeric)\n",
    "- Imputations (often with a median, or grouped median)\n",
    "- Age values: Age showed values over 100 and 0- assumption that these are errors and median is imputed\n",
    "- Cleaning text fields by removing non-alphanumeric characters\n",
    "- Converting rating scale to 1-5: There were only even numbers (i.e. 2, 4, 6, 8, 10) so condensing it to this scale seems reasonable\n",
    "\n",
    "The final logic shows that following these steps, all the missing values have been addressed."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# fashion_data.columns = fashion_data.columns.str.replace(\" \", \"_\")\n",
    "\n",
    "fashion_data.dropna(subset = [\"rating\"], inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset = fashion_data[['user_id', 'item_id', 'rating']]\n",
    "dataset['rating'] = pd.to_numeric(dataset['rating'])\n",
    "dataset['rating'] = dataset['rating'] / 2 # 1-5 scale"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(105508, 5850)"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(dataset['user_id'].unique()), len(dataset['item_id'].unique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset.to_csv('rentrunway_preproc.csv',index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "recsys3.10",
   "language": "python",
   "name": "recsys3.10"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  },
  "vscode": {
   "interpreter": {
    "hash": "5c7b89af1651d0b8571dde13640ecdccf7d5a6204171d6ab33e7c296e100e08a"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
