{
 "cells": [
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "This notebook can be used to search for the source images and explore which e-commerce websites and idioms appear in which dataset. The datasets need to be downloaded first.",
   "id": "facba1d25708ba29"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-05-11T15:28:13.716800Z",
     "start_time": "2025-05-11T15:28:13.712963Z"
    }
   },
   "cell_type": "code",
   "source": "from data_analysis_utils import DsDetails, strip_website_name",
   "id": "93ed7bd2d9f075b2",
   "outputs": [],
   "execution_count": 40
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-05-11T15:28:13.727909Z",
     "start_time": "2025-05-11T15:28:13.725937Z"
    }
   },
   "cell_type": "code",
   "source": "dataset_name = '12M'",
   "id": "26f3ca43e778af1e",
   "outputs": [],
   "execution_count": 41
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-05-11T15:28:13.917957Z",
     "start_time": "2025-05-11T15:28:13.855192Z"
    }
   },
   "cell_type": "code",
   "source": "ds = DsDetails(dataset_name)",
   "id": "39493a8e033ee9f7",
   "outputs": [],
   "execution_count": 42
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-05-11T15:28:13.927380Z",
     "start_time": "2025-05-11T15:28:13.926038Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Only relevant for datasets downloaded from https://github.com/ryanwebster90/onestep-extraction; uncomment to use.\n",
    "# ds.filter_by_overfit_type('TV')"
   ],
   "id": "1bf7dc16d4df814d",
   "outputs": [],
   "execution_count": 44
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-05-11T15:28:13.929697Z",
     "start_time": "2025-05-11T15:28:13.928392Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Uncomment this part to count how common each website is\n",
    "# websites = ds.df.loc[:, 'URL'].apply(lambda x: strip_website_name(x) if len(x) else '')\n",
    "# websites.value_counts()"
   ],
   "id": "821fdfb9e69505c9",
   "outputs": [],
   "execution_count": 45
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-05-11T15:28:13.940505Z",
     "start_time": "2025-05-11T15:28:13.938848Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Uncomment to filter by URL in place before filtering by idiom.\n",
    "# ds.filter_by_idiom('', 'url', inplace=True)"
   ],
   "id": "cd0d8f57f520c68b",
   "outputs": [],
   "execution_count": 47
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-05-11T15:30:26.065774Z",
     "start_time": "2025-05-11T15:30:26.023764Z"
    }
   },
   "cell_type": "code",
   "source": "ds.filter_by_idiom('', 'cap')",
   "id": "a2f4c453212beb",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "                                                 caption       index  \\\n",
       "229    \"\"\"Most Awesome People are born on 27th of Dec...  1527998996   \n",
       "257              \"\"\"Sprouting New Energy\"\" Throw Pillow\"   361171647   \n",
       "1037                      'D' Bright Letter Throw Pillow  1907511042   \n",
       "1053                      'Y' Bright Letter Throw Pillow  2138317512   \n",
       "1590                       1001 Good Nights Throw Pillow  2151830573   \n",
       "...                                                  ...         ...   \n",
       "27271  ohana means family.. lilo and stitch disney......  1888302284   \n",
       "27302  orphan black Throw Pillows featuring Ornate El...   478486711   \n",
       "27423            portrait of a highland cow Throw Pillow  1694515587   \n",
       "27881  tree Throw Pillows featuring THE BEAUTY OF MIN...  1457990634   \n",
       "27962       watercolour cacti and succulent Throw Pillow  2042259489   \n",
       "\n",
       "           scores                                                url  numdups  \n",
       "229    3634189.50  https://image.spreadshirtmedia.com/image-serve...        8  \n",
       "257    3677148.50  https://cdn.shopify.com/s/files/1/0010/6414/08...       13  \n",
       "1037   3718814.50  https://images.landofnod.com/is/image/LandOfNo...        6  \n",
       "1053   3746928.25  https://images.landofnod.com/is/image/LandOfNo...        6  \n",
       "1590   3661104.50  http://li.c-b.co/is/image/LandOfNod/519626_CR_...        8  \n",
       "...           ...                                                ...      ...  \n",
       "27271  3690851.50  https://i.pinimg.com/736x/b7/76/e1/b776e11dc84...        6  \n",
       "27302  3680533.00  https://ctl.s6img.com/society6/img/RTSDBd3JDWx...       50  \n",
       "27423  3688014.00  https://01.img.society6.com/society6/img/dlax8...        6  \n",
       "27881  3635083.00  https://ctl.s6img.com/society6/img/kyEPVg50qGd...       13  \n",
       "27962  3649191.50  https://ctl.s6img.com/society6/img/4vqymfFH_BQ...       13  \n",
       "\n",
       "[559 rows x 5 columns]"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>caption</th>\n",
       "      <th>index</th>\n",
       "      <th>scores</th>\n",
       "      <th>url</th>\n",
       "      <th>numdups</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>229</th>\n",
       "      <td>\"\"\"Most Awesome People are born on 27th of Dec...</td>\n",
       "      <td>1527998996</td>\n",
       "      <td>3634189.50</td>\n",
       "      <td>https://image.spreadshirtmedia.com/image-serve...</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>257</th>\n",
       "      <td>\"\"\"Sprouting New Energy\"\" Throw Pillow\"</td>\n",
       "      <td>361171647</td>\n",
       "      <td>3677148.50</td>\n",
       "      <td>https://cdn.shopify.com/s/files/1/0010/6414/08...</td>\n",
       "      <td>13</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1037</th>\n",
       "      <td>'D' Bright Letter Throw Pillow</td>\n",
       "      <td>1907511042</td>\n",
       "      <td>3718814.50</td>\n",
       "      <td>https://images.landofnod.com/is/image/LandOfNo...</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1053</th>\n",
       "      <td>'Y' Bright Letter Throw Pillow</td>\n",
       "      <td>2138317512</td>\n",
       "      <td>3746928.25</td>\n",
       "      <td>https://images.landofnod.com/is/image/LandOfNo...</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1590</th>\n",
       "      <td>1001 Good Nights Throw Pillow</td>\n",
       "      <td>2151830573</td>\n",
       "      <td>3661104.50</td>\n",
       "      <td>http://li.c-b.co/is/image/LandOfNod/519626_CR_...</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27271</th>\n",
       "      <td>ohana means family.. lilo and stitch disney......</td>\n",
       "      <td>1888302284</td>\n",
       "      <td>3690851.50</td>\n",
       "      <td>https://i.pinimg.com/736x/b7/76/e1/b776e11dc84...</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27302</th>\n",
       "      <td>orphan black Throw Pillows featuring Ornate El...</td>\n",
       "      <td>478486711</td>\n",
       "      <td>3680533.00</td>\n",
       "      <td>https://ctl.s6img.com/society6/img/RTSDBd3JDWx...</td>\n",
       "      <td>50</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27423</th>\n",
       "      <td>portrait of a highland cow Throw Pillow</td>\n",
       "      <td>1694515587</td>\n",
       "      <td>3688014.00</td>\n",
       "      <td>https://01.img.society6.com/society6/img/dlax8...</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27881</th>\n",
       "      <td>tree Throw Pillows featuring THE BEAUTY OF MIN...</td>\n",
       "      <td>1457990634</td>\n",
       "      <td>3635083.00</td>\n",
       "      <td>https://ctl.s6img.com/society6/img/kyEPVg50qGd...</td>\n",
       "      <td>13</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27962</th>\n",
       "      <td>watercolour cacti and succulent Throw Pillow</td>\n",
       "      <td>2042259489</td>\n",
       "      <td>3649191.50</td>\n",
       "      <td>https://ctl.s6img.com/society6/img/4vqymfFH_BQ...</td>\n",
       "      <td>13</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>559 rows × 5 columns</p>\n",
       "</div>"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 53
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
