{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# PerceptionLM: Open-Access Data and Models for Detailed Visual Understanding\n",
    "Perception Language Model (PLM) is a state-of-the-art, fully open and reproducible MLLM for transparent research in image and video understanding.\n",
    "\n",
    "[![Hugging Face Collection](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face%20Collection-Models,%20Data,%20and%20Benchmarks-blue)](https://huggingface.co/collections/facebook/perception-lm-67f9783f171948c383ee7498)\n",
    "[![Paper](https://img.shields.io/badge/Technical%20Report-PerceptionLM-b31b1b.svg)](https://ai.meta.com/research/publications/perceptionlm-open-access-data-and-models-for-detailed-visual-understanding)\n",
    "[![Paper](https://img.shields.io/badge/arXiv-2504.13180-brightgreen.svg?style=flat-square)](https://arxiv.org/abs/2504.13180)\n",
    "[![ModelLicense](https://img.shields.io/badge/Model_License-FAIR_Research_License-lightgrey)](../../LICENSE.PLM)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Log in HF hub"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "from huggingface_hub.hf_api import HfFolder\n",
    "# get your token here https://huggingface.co/settings/tokens\n",
    "HfFolder.save_token('<Replace with your HF Token>')"
   ],
   "outputs": []
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "source": [
    "from PIL import Image\n",
    "from PIL import Image, ImageDraw\n",
    "from IPython.display import display, HTML\n",
    "import textwrap\n",
    "import time\n",
    "import urllib\n",
    "\n",
    "from core.args import dataclass_from_dict\n",
    "from core.transforms.image_transform import get_image_transform\n",
    "from core.transforms.video_transform import get_video_transform\n",
    "from apps.plm.generate import PackedCausalTransformerGeneratorArgs, PackedCausalTransformerGenerator, load_consolidated_model_and_tokenizer"
   ],
   "outputs": []
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "source": [
    "ckpt = \"facebook/Perception-LM-1B\"\n",
    "# ckpt = \"facebook/Perception-LM-3B\"\n",
    "# ckpt = \"facebook/Perception-LM-8B\"\n",
    "model, tokenizer, config = load_consolidated_model_and_tokenizer(ckpt)"
   ],
   "outputs": []
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "source": [
    "def generate(\n",
    "    media_path=\"\",\n",
    "    question=\"Describe the image in details.\",\n",
    "    media_type=\"image\",\n",
    "    number_of_tiles=1,\n",
    "    temperature=0.0,\n",
    "    top_p=None,\n",
    "    top_k=None,\n",
    "):\n",
    "    prompts = []\n",
    "    if media_type == \"image\":\n",
    "        transform = get_image_transform(\n",
    "            vision_input_type=(\n",
    "                \"vanilla\" if number_of_tiles == 1 else config.data.vision_input_type\n",
    "            ),\n",
    "            image_res=model.vision_model.image_size,\n",
    "            max_num_tiles=number_of_tiles,\n",
    "        )\n",
    "        image = Image.open(media_path).convert(\"RGB\")\n",
    "        image, _ = transform(image)\n",
    "        prompts.append((question, image))\n",
    "    elif media_type == \"multi_image\":\n",
    "        assert len(media_path) > 1, f\"The 'multi_image' is specified as media type, however the media contains only one image. Provide at least two images.\"\n",
    "        transform = get_video_transform(\n",
    "            image_res=model.vision_model.image_size,\n",
    "        )\n",
    "        images = [Image.open(path).convert(\"RGB\") for path in media_path]\n",
    "        processed_images, _ = transform._process_multiple_images_pil(images)\n",
    "        prompts.append((question, processed_images))\n",
    "    else:\n",
    "        raise NotImplementedError(\n",
    "            f\"The provided generate function only supports image and video.\"\n",
    "        )\n",
    "    # Create generator\n",
    "    gen_cfg = dataclass_from_dict(\n",
    "        PackedCausalTransformerGeneratorArgs,\n",
    "        {\"temperature\": temperature, \"top_p\": top_p, \"top_k\": top_k},\n",
    "        strict=False,\n",
    "    )\n",
    "    generator = PackedCausalTransformerGenerator(gen_cfg, model, tokenizer)\n",
    "    # Run generation\n",
    "    start_time = time.time()\n",
    "    generation, loglikelihood, greedy = generator.generate(prompts)\n",
    "    end_time = time.time()\n",
    "    for i, gen in enumerate(generation):\n",
    "        # Calculate tokens per second\n",
    "        total_tokens = sum(\n",
    "            len(tokenizer.encode(gen, False, False)) for gen in generation\n",
    "        )\n",
    "        tokens_per_second = total_tokens / (end_time - start_time)\n",
    "        print(\"=================================================\")\n",
    "        print(f\"Question: {question}\")\n",
    "        print(f\"Generation: {textwrap.fill(gen, width=75)}\")\n",
    "        print(f\"Tokens per second: {tokens_per_second:.2f}\")\n",
    "        print(\"=================================================\")"
   ],
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Download and Preprocess the Image for Multi-Image Inference"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "source": [
    "imgURL = \"http://images.cocodataset.org/val2017/000000281759.jpg\"\n",
    "\n",
    "urllib.request.urlretrieve(imgURL, \"000000281759.jpg\")\n",
    "\n",
    "# We modify the image to place a red-dot on top right.\n",
    "media_path = \"000000281759.jpg\"\n",
    "media_path_modified = \"000000281759_modified.jpg\"\n",
    "\n",
    "def add_red_dot(image_path, output_path, position=(10, 10), radius=5):\n",
    "    # Open the image\n",
    "    image = Image.open(image_path).convert(\"RGB\")\n",
    "    draw = ImageDraw.Draw(image)\n",
    "\n",
    "    # Define coordinates for the red dot\n",
    "    x, y = position\n",
    "    draw.ellipse((x - radius, y - radius, x + radius, y + radius), fill=\"red\")\n",
    "\n",
    "    # Save the modified image\n",
    "    image.save(output_path)\n",
    "    print(f\"Red dot added and saved to {output_path}\")\n",
    "\n",
    "# Add red dot on top-left of the image\n",
    "add_red_dot(media_path, media_path_modified, position=(50, 50))"
   ],
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Run Inference on Multiple Images"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "source": [
    "def display_images_side_by_side(image_path1, image_path2):\n",
    "    \"\"\"\n",
    "    Display two images side by side in a Jupyter Notebook.\n",
    "\n",
    "    Args:\n",
    "        image_path1 (str): Path to the first image.\n",
    "        image_path2 (str): Path to the second image.\n",
    "    \"\"\"\n",
    "    # Create HTML to display images side by side\n",
    "    html = f\"\"\"\n",
    "    <div style=\"display: flex; justify-content: space-around;\">\n",
    "        <div>\n",
    "            <img src=\"{image_path1}\" style=\"max-width: 100%; height: auto;\">\n",
    "        </div>\n",
    "        <div>\n",
    "            <img src=\"{image_path2}\" style=\"max-width: 100%; height: auto;\">\n",
    "        </div>\n",
    "    </div>\n",
    "    \"\"\"\n",
    "\n",
    "    # Display the images\n",
    "    display(HTML(html))"
   ],
   "outputs": []
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "source": [
    "display_images_side_by_side(media_path_modified, media_path)\n",
    "\n",
    "question = \"<image><image>What is the difference between two images.\"\n",
    "print(\"Generating for multi-image input (each image uses a sinlge tile)...\")\n",
    "generate(media_path=[media_path_modified, media_path], question=question, media_type=\"multi_image\")"
   ],
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Concatenate Images Horizontally and Run Inference as a Single Image\n",
    "\n",
    "An alternative approach to processing multiple images is to concatenate them horizontally. This allows you to treat them as a single image and use standard image inference with tiling."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "source": [
    "def concatenate_images_horizontally(image_path1, image_path2, output_path):\n",
    "    \"\"\"\n",
    "    Concatenate two images horizontally and save the result.\n",
    "    Args:\n",
    "        image_path1 (str): Path to the first image.\n",
    "        image_path2 (str): Path to the second image.\n",
    "        output_path (str): Path to save the concatenated image.\n",
    "    \"\"\"\n",
    "    # Open the images\n",
    "    image1 = Image.open(image_path1)\n",
    "    image2 = Image.open(image_path2)\n",
    "    # Get dimensions\n",
    "    width1, height1 = image1.size\n",
    "    width2, height2 = image2.size\n",
    "    # Create a new image with a width equal to the sum of both images' widths\n",
    "    # and a height equal to the maximum height of the two images\n",
    "    new_width = width1 + width2\n",
    "    new_height = max(height1, height2)\n",
    "    # Create a new blank image with the calculated dimensions\n",
    "    new_image = Image.new('RGB', (new_width, new_height))\n",
    "    # Paste the first image at the left\n",
    "    new_image.paste(image1, (0, 0))\n",
    "    # Paste the second image to the right of the first image\n",
    "    new_image.paste(image2, (width1, 0))\n",
    "    # Save the concatenated image\n",
    "    new_image.save(output_path)\n",
    "\n",
    "media_path_combined = \"000000281759_combined.jpg\"\n",
    "concatenate_images_horizontally(media_path_modified, media_path, media_path_combined)"
   ],
   "outputs": []
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "source": [
    "display(Image.open(media_path_combined))\n",
    "\n",
    "question = \"The image contains two images concatenated horizontly. What is the difference between the image on the left and the image on the right?\"\n",
    "print(\"Generating for multi-image input (images are concatenated horizonly and treated as a sinlge image with tiling)...\")\n",
    "# with basic colab we can only run with with 1 to 4 tiles, instead of full 36 tiles.\n",
    "# generate(media_path=media_path, question=question, media_type=\"image\")\n",
    "print(\"Generating with 4 tiles + 1 tumb...\")\n",
    "generate(media_path=media_path_combined, question=question, number_of_tiles=4, media_type=\"image\")"
   ],
   "outputs": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "perception_models",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
