{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# PerceptionLM: Open-Access Data and Models for Detailed Visual Understanding\n",
    "Perception Language Model (PLM) is a state-of-the-art, fully open and reproducible MLLM for transparent research in image and video understanding.\n",
    "\n",
    "[![Hugging Face Collection](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face%20Collection-Models,%20Data,%20and%20Benchmarks-blue)](https://huggingface.co/collections/facebook/perception-lm-67f9783f171948c383ee7498)\n",
    "[![Paper](https://img.shields.io/badge/Technical%20Report-PerceptionLM-b31b1b.svg)](https://ai.meta.com/research/publications/perceptionlm-open-access-data-and-models-for-detailed-visual-understanding)\n",
    "[![Paper](https://img.shields.io/badge/arXiv-2504.13180-brightgreen.svg?style=flat-square)](https://arxiv.org/abs/2504.13180)\n",
    "[![ModelLicense](https://img.shields.io/badge/Model_License-FAIR_Research_License-lightgrey)](../../LICENSE.PLM)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Log in HF hub"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "from huggingface_hub.hf_api import HfFolder\n",
    "# get your token here https://huggingface.co/settings/tokens\n",
    "HfFolder.save_token('YOUR_HF_TOKEN') "
   ],
   "outputs": []
  },
  {
   "cell_type": "code",
   "execution_count": 157,
   "metadata": {},
   "source": [
    "import os\n",
    "import torch\n",
    "from PIL import Image\n",
    "import time\n",
    "from IPython.display import HTML\n",
    "from base64 import b64encode\n",
    "import textwrap\n",
    "import requests\n",
    "import urllib.request\n",
    "\n",
    "from core.args import dataclass_from_dict\n",
    "from core.transforms.image_transform import get_image_transform\n",
    "from core.transforms.video_transform import get_video_transform\n",
    "from apps.plm.generate import PackedCausalTransformerGeneratorArgs, PackedCausalTransformerGenerator, load_consolidated_model_and_tokenizer"
   ],
   "outputs": []
  },
  {
   "cell_type": "code",
   "execution_count": 158,
   "metadata": {},
   "source": [
    "# ckpt = \"facebook/Perception-LM-1B\"\n",
    "# ckpt = \"facebook/Perception-LM-3B\"\n",
    "ckpt = \"facebook/Perception-LM-8B\" \n",
    "model, tokenizer, config = load_consolidated_model_and_tokenizer(ckpt)"
   ],
   "outputs": []
  },
  {
   "cell_type": "code",
   "execution_count": 162,
   "metadata": {},
   "source": [
    "def generate(\n",
    "    media_path=\"\",\n",
    "    question=\"Describe the image in details.\",\n",
    "    media_type=\"image\",\n",
    "    number_of_frames=4,\n",
    "    number_of_tiles=1,\n",
    "    temperature=0.0,\n",
    "    top_p=None,\n",
    "    top_k=None,\n",
    "):\n",
    "    prompts = []\n",
    "    if media_type == \"image\":\n",
    "        transform = get_image_transform(\n",
    "            vision_input_type=(\n",
    "                \"vanilla\" if number_of_tiles == 1 else config.data.vision_input_type\n",
    "            ),\n",
    "            image_res=model.vision_model.image_size,\n",
    "            max_num_tiles=number_of_tiles,\n",
    "        )\n",
    "        image = Image.open(media_path).convert(\"RGB\")\n",
    "        image, _ = transform(image)\n",
    "        prompts.append((question, image))\n",
    "    elif media_type == \"video\":\n",
    "        transform = get_video_transform(\n",
    "            image_res=model.vision_model.image_size,\n",
    "        )\n",
    "        video_info = (media_path, number_of_frames, None, None, None)\n",
    "        frames, _ = transform(video_info)\n",
    "        prompts.append((question, frames))\n",
    "    else:\n",
    "        raise NotImplementedError(\n",
    "            f\"The provided generate function only supports image and video.\"\n",
    "        )\n",
    "    # Create generator\n",
    "    gen_cfg = dataclass_from_dict(\n",
    "        PackedCausalTransformerGeneratorArgs,\n",
    "        {\"temperature\": temperature, \"top_p\": top_p, \"top_k\": top_k},\n",
    "        strict=False,\n",
    "    )\n",
    "    generator = PackedCausalTransformerGenerator(gen_cfg, model, tokenizer)\n",
    "    # Run generation\n",
    "    start_time = time.time()\n",
    "    generation, loglikelihood, greedy = generator.generate(prompts)\n",
    "    end_time = time.time()\n",
    "    for i, gen in enumerate(generation):\n",
    "        # Calculate tokens per second\n",
    "        total_tokens = sum(\n",
    "            len(tokenizer.encode(gen, False, False)) for gen in generation\n",
    "        )\n",
    "        tokens_per_second = total_tokens / (end_time - start_time)\n",
    "        print(\"=================================================\")\n",
    "        print(textwrap.fill(gen, width=75))\n",
    "        print(f\"Tokens per second: {tokens_per_second:.2f}\")\n",
    "        print(\"=================================================\")"
   ],
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Run Inference on Image"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 181,
   "metadata": {},
   "source": [
    "imgURL = \"http://images.cocodataset.org/val2017/000000281759.jpg\"\n",
    "\n",
    "urllib.request.urlretrieve(imgURL, \"000000281759.jpg\")\n",
    "\n",
    "media_path = \"000000281759.jpg\"\n",
    "question = \"Describe the image in details.\"\n",
    "display(Image.open(media_path))\n",
    "print(\"Generating...\")\n",
    "# with basic colab we can only run with with 1 to 4 tiles, instead of full 36 tiles.\n",
    "# generate(media_path=media_path, question=question, media_type=\"image\")\n",
    "print(\"Generating with 4 tiles + 1 tumb...\")\n",
    "generate(media_path=media_path, question=question, number_of_tiles=4, media_type=\"image\")"
   ],
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Download PVD dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 168,
   "metadata": {},
   "source": [
    "# download one shard of our PVD dataset from HF\n",
    "!wget https://huggingface.co/datasets/facebook/PE-Video/resolve/main/test/000000.tar\n",
    "\n",
    "!mkdir -p shard_0 && tar -xf 000000.tar -C shard_0 > /dev/null"
   ],
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Run Inference on Video"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 183,
   "metadata": {},
   "source": [
    "media_path = \"shard_0/97490574.mp4\"\n",
    "question = \"What is happening in the video?\"\n",
    "\n",
    "mp4 = open(media_path,'rb').read()\n",
    "data_url = \"data:video/mp4;base64,\" + b64encode(mp4).decode()\n",
    "HTML(\"\"\"\n",
    "<video width=400 controls>\n",
    "      <source src=\"%s\" type=\"video/mp4\">\n",
    "</video>\n",
    "\"\"\" % data_url)"
   ],
   "outputs": []
  },
  {
   "cell_type": "code",
   "execution_count": 184,
   "metadata": {},
   "source": [
    "print(\"Generating with 4 frames ...\")\n",
    "# with basic colab we can only run with 4 frames\n",
    "generate(media_path=media_path, question=question, media_type=\"video\")"
   ],
   "outputs": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
