{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "44c62a07-e026-4dce-80d7-e265240dcc05",
   "metadata": {},
   "source": [
    "# LivingThings Demo\n",
    "\n",
    "To start this notebook:\n",
    "\n",
    "```bash\n",
    "conda create -n lt python=3.10 -y\n",
    "conda activate lt\n",
    "pip install jupyterlab pillow requests nltk\n",
    "python -m nltk.downloader wordnet  # corpus needed by the WordNet section\n",
    "jupyter lab\n",
    "```\n",
    "\n",
    "Due to licensing constraints, the images are downloaded on the fly."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4c27a840-f149-420b-9cf9-3333c6b3e9e1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import io\n",
    "import math\n",
    "from copy import deepcopy\n",
    "from pathlib import Path\n",
    "import json\n",
    "from pprint import pprint\n",
    "\n",
    "from PIL import Image\n",
    "import requests\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3e11d145-7328-4f74-b387-5686e9678eef",
   "metadata": {},
   "outputs": [],
   "source": [
    "# image downloader code\n",
    "\n",
    "def download_data(url, timeout=10):\n",
    "    \"\"\"Fetch ``url`` with a browser-like User-Agent (adapted from img2dataset).\n",
    "\n",
    "    Returns the ``requests`` response when the status code is 200,\n",
    "    otherwise ``None``.\n",
    "    \"\"\"\n",
    "    request_headers = {\n",
    "        \"User-Agent\": \"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0\"\n",
    "    }\n",
    "    response = requests.get(url, params=None, data=None, headers=request_headers, timeout=timeout)\n",
    "    return response if response.status_code == 200 else None\n",
    "\n",
    "def download_image(url: str, target_file: Path, timeout=10):\n",
    "    \"\"\"Download ``url``, convert the image to RGB and save it to ``target_file``.\n",
    "\n",
    "    Returns the converted PIL image, or ``None`` when ``download_data``\n",
    "    yields no response.\n",
    "    \"\"\"\n",
    "    destination = Path(target_file)\n",
    "    response = download_data(url, timeout)\n",
    "    if response is None:\n",
    "        return None\n",
    "    payload = response.content\n",
    "    response.close()\n",
    "    # Decode from memory, then persist the RGB version at the destination path.\n",
    "    converted = Image.open(io.BytesIO(payload)).convert(\"RGB\")\n",
    "    converted.save(destination.as_posix())\n",
    "    return converted\n",
    "\n",
    "def download_or_load_image(url: str, target_file: Path):\n",
    "    \"\"\"Return the image stored at ``target_file``, downloading it from ``url`` if missing.\n",
    "\n",
    "    If the download raises, the error message is printed and written to a sibling\n",
    "    ``<name>.err`` file; while that file exists, later calls skip the download\n",
    "    and return ``None``.\n",
    "    \"\"\"\n",
    "    image_path = Path(target_file)\n",
    "    if image_path.is_file():\n",
    "        return Image.open(image_path.as_posix())\n",
    "\n",
    "    error_marker = image_path.parent / (image_path.name + \".err\")\n",
    "    if error_marker.is_file():\n",
    "        previous_error = error_marker.read_text(encoding=\"utf-8\")\n",
    "        print(f\"Image already failed: {previous_error} Delete error file to retry: {error_marker}\")\n",
    "        return None\n",
    "\n",
    "    try:\n",
    "        image = download_image(url, image_path)\n",
    "    except Exception as exc:\n",
    "        # Remember the failure on disk so the URL is not retried on every run.\n",
    "        message = f\"URL {url} failed due to {type(exc)}: {exc}\"\n",
    "        print(message)\n",
    "        error_marker.write_text(message, encoding=\"utf-8\")\n",
    "        image = None\n",
    "    return image\n",
    "\n",
    "def scale_longer_side(image, target_size):\n",
    "    w, h = image.size\n",
    "    if h > w:\n",
    "        target_h = target_size\n",
    "        target_w = int(w * target_size / h)\n",
    "    else:\n",
    "        target_w = target_size\n",
    "        target_h = int(h * target_size / w)    \n",
    "    return image.resize((target_w, target_h))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d632a1eb-39db-4a5f-84f0-03d5c72f325c",
   "metadata": {},
   "source": [
    "## Load dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ba20da97-4070-4ca1-8c7a-8d144acf6644",
   "metadata": {},
   "outputs": [],
   "source": [
    "source_data = \"data.json\"\n",
    "# Open via a context manager so the file handle is closed right after parsing.\n",
    "with Path(source_data).open(\"r\", encoding=\"utf-8\") as metadata_file:\n",
    "    metadata = json.load(metadata_file)\n",
    "pprint(metadata[0])\n",
    "print(len(metadata))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fc5b988e-4609-4518-9788-c4287b9ff369",
   "metadata": {},
   "outputs": [],
   "source": [
    "temp_dir = Path(\"temp_livingthings_images\")\n",
    "os.makedirs(temp_dir, exist_ok=True)\n",
    "min_size, max_size = 256, 512\n",
    "\n",
    "def display_sample(data):\n",
    "    \"\"\"Print one metadata record and show its image inline.\n",
    "\n",
    "    ``data`` is a dict that must contain the keys \"alttexts\", \"i\" and \"imgurl\".\n",
    "    It is deep-copied first, so the caller's record is not modified. The image is\n",
    "    stored as ``temp_dir / f\"{i}.jpg\"`` and rescaled so that its longer side lies\n",
    "    within the module-level ``min_size`` / ``max_size`` bounds before display.\n",
    "    \"\"\"\n",
    "    data = deepcopy(data)\n",
    "    alttexts = data.pop(\"alttexts\")\n",
    "    i = data.pop(\"i\")\n",
    "    target_file = temp_dir / f\"{i}.jpg\"\n",
    "\n",
    "    print(\"-\" * 50)\n",
    "    for k, v in data.items():\n",
    "        if isinstance(v, list):\n",
    "            # str() each item so lists with non-string entries can be joined too.\n",
    "            v = ' | '.join(str(item) for item in v)\n",
    "        print(f\"{k:12s} {v}\")\n",
    "    for j, alttext in enumerate(alttexts):\n",
    "        print(f\"alttext  {j+1:02d}  {alttext}\")\n",
    "\n",
    "    os.makedirs(target_file.parent, exist_ok=True)\n",
    "    print(target_file)\n",
    "    pil_image = download_or_load_image(data[\"imgurl\"], target_file)\n",
    "    if pil_image is None:\n",
    "        print(\"Could not display image.\")\n",
    "        return\n",
    "\n",
    "    # Both bounds come from the module-level settings; no local re-definition.\n",
    "    if max(pil_image.size) > max_size:\n",
    "        pil_image = scale_longer_side(pil_image, max_size)\n",
    "    if max(pil_image.size) < min_size:\n",
    "        pil_image = scale_longer_side(pil_image, min_size)\n",
    "    display(pil_image)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6df9efff-4c57-430a-aaaf-0227dcf00525",
   "metadata": {},
   "source": [
    "## View Bing API results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3b3facac-c5d6-4946-9b1c-41361cef3cfa",
   "metadata": {},
   "outputs": [],
   "source": [
    "metadata_bing = [data for data in metadata if data[\"api\"] == \"bing\"]\n",
    "# Slice instead of indexing so fewer than 20 matches does not raise IndexError.\n",
    "for sample in metadata_bing[:20]:\n",
    "    display_sample(sample)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6cf204a6-a481-4986-bf21-28938c92f9dd",
   "metadata": {},
   "source": [
    "## View Google API results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "baaca0f1-155c-4037-8a5d-72638cf56caf",
   "metadata": {},
   "outputs": [],
   "source": [
    "metadata_google = [data for data in metadata if data[\"api\"] == \"google\"]\n",
    "# Slice instead of indexing so fewer than 20 matches does not raise IndexError.\n",
    "for sample in metadata_google[:20]:\n",
    "    display_sample(sample)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "15609917-0291-4f2c-8c4f-f42b085bfcac",
   "metadata": {},
   "source": [
    "## View WordNet entity"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bd10098d-0c63-47ab-b934-78e3826f5625",
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.corpus import wordnet as wn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "49665ed2-9a57-41c8-9c06-0ed074bd8c7f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Example identifier: wordnet:yellowtail.n.01\n",
    "def display_wordnet_synset(synsetname):\n",
    "    \"\"\"Print the name, definition and lemma names of a WordNet synset.\n",
    "\n",
    "    A leading \"wordnet:\" prefix on ``synsetname`` is stripped before the lookup.\n",
    "    \"\"\"\n",
    "    synsetname = synsetname.removeprefix(\"wordnet:\")\n",
    "    print(synsetname)\n",
    "    synset = wn.synset(synsetname)\n",
    "    print(synset.definition())\n",
    "    lemma_names = [str(lemma.name()) for lemma in synset.lemmas()]\n",
    "    print(\" | \".join(lemma_names))\n",
    "    print()\n",
    "\n",
    "display_wordnet_synset(\"wordnet:yellowtail.n.01\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c1a78d30-a105-40e1-906e-e27aaf036126",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
