{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import six\n",
    "import sys\n",
    "import codecs\n",
    "\n",
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n",
    "\n",
    "import torch\n",
    "import torch.nn as nn\n",
    "import torch.optim as optim\n",
    "from torch.autograd import Variable\n",
    "import torch.nn.functional as F\n",
    "\n",
    "import MeCab"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "5501\n",
      "5118\n",
      "5901\n",
      "5149\n"
     ]
    }
   ],
   "source": [
    "for folder_name in ['books', 'dvd', 'electronics', 'kitchen']:\n",
    "\n",
    "    text_list = []\n",
    "\n",
    "    with codecs.open('dataset/%s/all_balanced.review'%folder_name, 'r', 'UTF-8', 'ignore') as file:    \n",
    "        line = file.readline()\n",
    "\n",
    "        text_temp = ''\n",
    "\n",
    "        while line:\n",
    "            text_list.append(line)\n",
    "            line = file.readline()\n",
    "\n",
    "        file.close()\n",
    "        \n",
    "    labels = []\n",
    "    texts = []\n",
    "\n",
    "    for text_idx in range(len(text_list)):\n",
    "        text_temp = text_list[text_idx].split(' ')\n",
    "\n",
    "        text_temp_main = text_temp[:-1]\n",
    "        \n",
    "        if text_temp[-1] == '#label#:5.0\\n':\n",
    "            label = 5\n",
    "        elif text_temp[-1] == '#label#:4.0\\n':\n",
    "            label = 4\n",
    "        elif text_temp[-1] == '#label#:2.0\\n':\n",
    "            label = 2\n",
    "        elif text_temp[-1] == '#label#:1.0\\n':\n",
    "            label = 1\n",
    "            \n",
    "        #label = 0 if text_temp[-1] == '#label#:negative\\n' else 1\n",
    "\n",
    "        temp_sentence = ''\n",
    "        for i in range(len(text_temp_main)):\n",
    "            word_num = text_temp_main[i].split(':')\n",
    "            word = word_num[0]\n",
    "            num = word_num[1]\n",
    "            for n in range(np.int(num)):\n",
    "                temp_sentence = temp_sentence + word\n",
    "                temp_sentence = temp_sentence + ' '\n",
    "\n",
    "        texts.append(temp_sentence)\n",
    "        labels.append(label)\n",
    "\n",
    "    df =pd.DataFrame([labels, texts]).T\n",
    "    df = df.dropna()\n",
    "    df.columns = ['rating', 'text']\n",
    "    df['item'] = folder_name\n",
    "    \n",
    "    print(len(df))\n",
    "    if len(df) >= 2000:\n",
    "        df.to_csv('dataset/%s_processed_balanced.csv'%folder_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
