{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import json\n",
    "\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "from src.common.utils import get_abs_path, get_cache_path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "cc12m = pd.read_csv(\"downloaded_cc12m_report.tsv.gz\", compression=\"gzip\", sep=\"\\t\", names=[\"caption\", \"path\", \"dataset\", \"mimetype\", \"size\", \"status\", \"url\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "caption                            a very typical bus station\n",
       "path        /export/home/.cache/src/conceptual_caption/i...\n",
       "dataset                                                  cc3m\n",
       "mimetype                                           image/jpeg\n",
       "size                                                    36078\n",
       "status                                                    200\n",
       "url         http://lh6.ggpht.com/-IvRtNLNcG8o/TpFyrudaT6I/...\n",
       "Name: 0, dtype: object"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cc12m.iloc[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3318333"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(cc12m)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 3130587/3130587 [17:28<00:00, 2986.08it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found 2759017 valid records\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "cnt = 0\n",
    "\n",
    "valid_records = []\n",
    "\n",
    "for i, path in tqdm(enumerate(cc12m.path.unique()), total=len(cc12m.path.unique())):\n",
    "    path = str(path)\n",
    "    if os.path.exists(path):\n",
    "        record = cc12m.iloc[i]\n",
    "        valid_records.append({\"image\": record[\"path\"], \"caption\": record[\"caption\"]})\n",
    "\n",
    "        cnt += 1\n",
    "\n",
    "print(\"Found {} valid records\".format(cnt))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2759017"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(valid_records)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'image': '/export/home/.cache/src/conceptual_caption/images/1_3239086386.jpg',\n",
       " 'caption': 'sierra looked stunning in this top and this skirt while performing with person at their former university'}"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "valid_records[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/export/home/.cache/src/conceptual_caption/annotations/cc3m.json already exists\n"
     ]
    },
    {
     "ename": "",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31mThe Kernel crashed while executing code in the the current cell or a previous cell. Please review the code in the cell(s) to identify a possible cause of the failure. Click <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. View Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
     ]
    }
   ],
   "source": [
    "from omegaconf import OmegaConf\n",
    "\n",
    "\n",
    "config_path = get_abs_path(\"configs/datasets/conceptual_caption/defaults_12m.yaml\")\n",
    "\n",
    "ann_path = OmegaConf.load(\n",
    "    config_path\n",
    ").datasets.conceptual_caption_12m.build_info.annotations.train.storage[0]\n",
    "\n",
    "ann_path = get_cache_path(ann_path)\n",
    "\n",
    "if os.path.exists(ann_path):\n",
    "    # abort\n",
    "    print(\"{} already exists\".format(ann_path))\n",
    "else:\n",
    "    # Save the valid records to a json file\n",
    "    with open(ann_path, \"w\") as f:\n",
    "        f.write(json.dumps(valid_records))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.8.10 ('base')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "d4d1e4263499bec80672ea0156c357c1ee493ec2b1c70f0acce89fc37c4a6abe"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
