{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "a9e1d302",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from pathlib import Path\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "## Pick the dataset by changing DATASET, then re-run the notebook.\n",
    "# DATASETS maps a dataset key to (image sub-directory, annotation file name),\n",
    "# both located directly under DATA_ROOT.\n",
    "DATA_ROOT = Path('/data/FOLDER/FOLDER/data/ImageNetRed/dataset_no_images')\n",
    "DATASETS = {\n",
    "    'mini-imagenet': ('mini-imagenet', 'mini-imagenet-annotations.json'),\n",
    "    'stanford-cars': ('stanford_cars', 'stanford-cars-annotations.json'),\n",
    "}\n",
    "DATASET = 'stanford-cars'\n",
    "\n",
    "image_subdir, annotation_file = DATASETS[DATASET]\n",
    "img_dir = DATA_ROOT / image_subdir\n",
    "annotations_raw = pd.read_json(str(DATA_ROOT / annotation_file))\n",
    "\n",
    "# Only the first element of each entry in the 'data' column is used; the list of\n",
    "# those elements is turned into a frame (presumably one record per image - TODO confirm).\n",
    "df = pd.DataFrame([entry[0] for entry in annotations_raw['data']])\n",
    "# Columns are renamed purely by position, so this assumes each record exposes exactly\n",
    "# these four fields in this order - NOTE(review): verify against the annotation schema.\n",
    "df.columns = ['url', 'id', 'label', 'is_clean']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "fad4757c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Walk img_dir recursively and collect every path matching '*.jpg'.\n",
    "all_jpgs = [*img_dir.rglob('*.jpg')]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "9af89fb4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Look-up table: integer file stem -> path string of that file.\n",
    "# int() raises ValueError for any collected file whose stem is not a plain integer.\n",
    "path_by_stem = {int(jpg.stem): str(jpg) for jpg in all_jpgs}\n",
    "all_jpgs_mapping = pd.Series(path_by_stem)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "4c92510a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The Series is matched to df by index label, not by the 'id' column: the row\n",
    "# labelled k receives the path stored under key k, and rows without a matching\n",
    "# key get NaN.\n",
    "# NOTE(review): presumably the image files are named after the row position in the\n",
    "# annotation file - confirm; if they are named after 'id', a map on df['id'] is needed.\n",
    "df = df.assign(path=all_jpgs_mapping)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "0192cbc5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the rows whose 'path' is not missing.\n",
    "has_image = df['path'].notna()\n",
    "# reset_index() is called without drop=True, so the previous index labels are kept\n",
    "# as an extra column of the result.\n",
    "df = df[has_image].reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ae3f24bd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# (rows, columns) of df at the point this cell is run.\n",
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "deaa2347",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display df for a visual check before it is written out.\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "a883e4dc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the table into img_dir; index=False leaves the row index out of the file.\n",
    "output_csv = img_dir / 'multimodal_mislabel_split.csv'\n",
    "df.to_csv(output_csv, index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
