{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Notebook"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import pandas as pd\n",
    "import random\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# function that imports data from csv using pandas\n",
    "def import_data(fname):\n",
    "    df = pd.read_csv(fname, sep=';')\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_red = import_data('data/winery/winequality-red.csv')\n",
    "data_white = import_data('data/winery/winequality-white.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_red"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_white"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# merge two dataframes data_red and data_white into one dataframe data\n",
    "data = pd.concat([data_red, data_white], axis=0)\n",
    "# remove the quality column from data\n",
    "data = data.drop('quality', axis=1)\n",
    "# fix the ids of the concatenated dataframe\n",
    "data = data.reset_index(drop=True)\n",
    "# add dash between all words in the column names\n",
    "data.columns = data.columns.str.replace(' ', '-')\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# normalise every column in data based on the min and max values of each column\n",
    "def normalise_data(data):\n",
    "    for col in data.columns:\n",
    "        data[col] = (data[col] - data[col].min()) / (data[col].max() - data[col].min())\n",
    "    return data\n",
    "\n",
    "data = normalise_data(data)\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def select_random_rows_using_id(df, n):\n",
    "    \"\"\"Given a dataframe selects randomly n rows.\"\"\"\n",
    "    # get the number of rows in the dataframe\n",
    "    n_rows = df.shape[0]\n",
    "    # generate a list of n random numbers between 0 and n_rows\n",
    "    random_ids = random.sample(range(n_rows), n)\n",
    "    # select the rows with the ids in random_ids\n",
    "    df_random = df.iloc[random_ids]\n",
    "    return df_random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# given an index return a string with the corresponding number where each number has 6 digits\n",
    "# for example 3000 becomes 003000\n",
    "def get_index_string(index: int, leading_zeros: int = 6):\n",
    "    index_string: str = str(index)\n",
    "    while len(index_string) < leading_zeros:\n",
    "        index_string = '0' + index_string\n",
    "    return index_string\n",
    "\n",
    "def create_filename(split: str, index: int):\n",
    "    return f\"winery_{split}_{get_index_string(index)}.json\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def train_test_split(df, split_size=0.9):\n",
    "    dataset_size = len(df)\n",
    "    train_size = round(len(df) * split_size)\n",
    "\n",
    "    train_ids = random.sample(range(dataset_size), train_size)\n",
    "    train_ids = sorted(train_ids)\n",
    "    test_ids = sorted(list(set([i for i in range(dataset_size)]).difference(set(train_ids))))\n",
    "\n",
    "    train_df = df.iloc[train_ids]\n",
    "    test_df = df.iloc[test_ids]\n",
    "    return train_df, test_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_dataset(data, split, scenes):\n",
    "    dataset = []\n",
    "    for i in range(scenes):\n",
    "        scene_size: int = random.randint(3, 10)\n",
    "        new_scene = {\"image_index\": i, \"image_filename\": \"nil\", \"objects\": []}\n",
    "        for _, row in select_random_rows_using_id(data, scene_size).iterrows():\n",
    "            new_object = {\"attributes\": row.to_dict(), \"description\": {}}\n",
    "            for key, val in new_object[\"attributes\"].items():\n",
    "                new_object[\"attributes\"][key] = round(val, 5)\n",
    "            new_scene[\"objects\"].append(new_object)\n",
    "        dataset.append(new_scene)\n",
    "\n",
    "    # save every scene in a separate file\n",
    "    out_dir = os.path.join(\"out\", split)\n",
    "    os.makedirs(out_dir, exist_ok=True)\n",
    "    for scene in dataset:\n",
    "        with open(os.path.join(out_dir, create_filename(split, scene['image_index'])), \"w\") as f:\n",
    "            json.dump(scene, f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train, test = train_test_split(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "create_dataset(train, split=\"train\", scenes=20000)\n",
    "create_dataset(test, split=\"test\", scenes=1000)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "winery-dataset-gen",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.3"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
