{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The autoreload extension is already loaded. To reload it, use:\n",
      "  %reload_ext autoreload\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/root/miniconda/envs/densecaption/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "import torch\n",
    "from diffusers import StableDiffusionGLIGENTextImagePipeline, StableDiffusionGLIGENPipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import diffusers\n",
    "from diffusers import (\n",
    "    AutoencoderKL,\n",
    "    DDPMScheduler,\n",
    "    UNet2DConditionModel,\n",
    "    UniPCMultistepScheduler,\n",
    "    EulerDiscreteScheduler,\n",
    ")\n",
    "from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer\n",
    "# pretrained_model_name_or_path = 'masterful/gligen-1-4-generation-text-box'\n",
    "\n",
    "pretrained_model_name_or_path = '/root/data/zhizhonghuang/checkpoints/models--masterful--gligen-1-4-generation-text-box/snapshots/d2820dc1e9ba6ca082051ce79cfd3eb468ae2c83'\n",
    "\n",
    "tokenizer = CLIPTokenizer.from_pretrained(pretrained_model_name_or_path, subfolder=\"tokenizer\")\n",
    "noise_scheduler = DDPMScheduler.from_pretrained(pretrained_model_name_or_path, subfolder=\"scheduler\")\n",
    "text_encoder = CLIPTextModel.from_pretrained(\n",
    "    pretrained_model_name_or_path, subfolder=\"text_encoder\"\n",
    ")\n",
    "vae = AutoencoderKL.from_pretrained(\n",
    "    pretrained_model_name_or_path, subfolder=\"vae\"\n",
    ")\n",
    "# unet = UNet2DConditionModel.from_pretrained(\n",
    "#     pretrained_model_name_or_path, subfolder=\"unet\"\n",
    "# )\n",
    "\n",
    "noise_scheduler = EulerDiscreteScheduler.from_config(noise_scheduler.config)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "unet = UNet2DConditionModel.from_pretrained(\n",
    "    '/root/data/zhizhonghuang/ckpt/GLIGEN_Text_Retrain_COCO'\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "You have disabled the safety checker for <class 'diffusers.pipelines.stable_diffusion_gligen.pipeline_stable_diffusion_gligen.StableDiffusionGLIGENPipeline'> by passing `safety_checker=None`. Ensure that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered results in services or applications open to the public. Both the diffusers team and Hugging Face strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling it only for use-cases that involve analyzing network behavior or auditing its results. For more information, please have a look at https://github.com/huggingface/diffusers/pull/254 .\n"
     ]
    }
   ],
   "source": [
    "pipe = StableDiffusionGLIGENPipeline(\n",
    "    vae,\n",
    "    text_encoder,\n",
    "    tokenizer,\n",
    "    unet,\n",
    "    noise_scheduler,\n",
    "    safety_checker=None,\n",
    "    feature_extractor=None,\n",
    ")\n",
    "pipe = pipe.to(\"cuda\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# prompt = 'A realistic image of landscape scene depicting a green car parking on the left of a blue truck, with a red air balloon and a bird in the sky'\n",
    "# gen_boxes = [('a green car', [21, 281, 211, 159]), ('a blue truck', [269, 283, 209, 160]), ('a red air balloon', [66, 8, 145, 135]), ('a bird', [296, 42, 143, 100])]\n",
    "\n",
    "# prompt = 'A realistic top-down view of a wooden table with two apples on it'\n",
    "# gen_boxes = [('a wooden table', [20, 148, 472, 216]), ('an apple', [150, 226, 100, 100]), ('an apple', [280, 226, 100, 100])]\n",
    "\n",
    "# prompt = 'A realistic scene of three skiers standing in a line on the snow near a palm tree'\n",
    "# gen_boxes = [('a skier', [5, 152, 139, 168]), ('a skier', [278, 192, 121, 158]), ('a skier', [148, 173, 124, 155]), ('a palm tree', [404, 105, 103, 251])]\n",
    "\n",
    "prompt = 'An oil painting of a pink dolphin jumping on the left of a steam boat on the sea'\n",
    "gen_boxes = [('a steam boat', [232, 225, 257, 149]), ('a jumping pink dolphin', [21, 249, 189, 123])]\n",
    "\n",
    "import numpy as np\n",
    "\n",
    "boxes = np.array([x[1] for x in gen_boxes])\n",
    "boxes = boxes / 512\n",
    "boxes[:, 2] = boxes[:, 0] + boxes[:, 2]\n",
    "boxes[:, 3] = boxes[:, 1] + boxes[:, 3]\n",
    "boxes = boxes.tolist()\n",
    "gligen_phrases = [x[0] for x in gen_boxes]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/root/miniconda/envs/densecaption/lib/python3.11/site-packages/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py:683: FutureWarning: Accessing config attribute `in_channels` directly via 'UNet2DConditionModel' object attribute is deprecated. Please access 'in_channels' over 'UNet2DConditionModel's config object instead, e.g. 'unet.config.in_channels'.\n",
      "  num_channels_latents = self.unet.in_channels\n",
      "/root/miniconda/envs/densecaption/lib/python3.11/site-packages/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py:716: FutureWarning: Accessing config attribute `cross_attention_dim` directly via 'UNet2DConditionModel' object attribute is deprecated. Please access 'cross_attention_dim' over 'UNet2DConditionModel's config object instead, e.g. 'unet.config.cross_attention_dim'.\n",
      "  max_objs, self.unet.cross_attention_dim, device=device, dtype=self.text_encoder.dtype\n",
      "100%|██████████| 50/50 [01:21<00:00,  1.64s/it]\n"
     ]
    }
   ],
   "source": [
    "images = pipe(\n",
    "    prompt=prompt,\n",
    "    gligen_phrases=gligen_phrases,\n",
    "    gligen_boxes=boxes,\n",
    "    gligen_scheduled_sampling_beta=1.0,\n",
    "    output_type=\"pil\",\n",
    "    num_inference_steps=50,\n",
    "    negative_prompt=\"artifacts, blurry, smooth texture, bad quality, distortions, unrealistic, distorted image, bad proportions, duplicate\",\n",
    "    num_images_per_prompt=16,\n",
    ").images"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "diffusers.utils.make_image_grid(images, 4, len(images)//4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "densecaption",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
