{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "a8075aeb",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Project root: /leonardo/home/userexternal/ggramagl/prjects/CCO\n",
      "['any-to-any', 'audio-classification', 'audio-to-audio', 'audio-text-to-text', 'automatic-speech-recognition', 'depth-estimation', 'document-question-answering', 'visual-document-retrieval', 'feature-extraction', 'fill-mask', 'image-classification', 'image-feature-extraction', 'image-segmentation', 'image-to-image', 'image-text-to-text', 'image-to-text', 'image-to-video', 'keypoint-detection', 'mask-generation', 'object-detection', 'video-classification', 'question-answering', 'reinforcement-learning', 'sentence-similarity', 'summarization', 'table-question-answering', 'tabular-classification', 'tabular-regression', 'text-classification', 'text-generation', 'text-ranking', 'text-to-image', 'text-to-speech', 'text-to-video', 'token-classification', 'translation', 'unconditional-image-generation', 'video-text-to-text', 'video-to-video', 'visual-question-answering', 'zero-shot-classification', 'zero-shot-image-classification', 'zero-shot-object-detection', 'text-to-3d', 'image-to-3d']\n"
     ]
    }
   ],
   "source": [
    "import sys\n",
    "import os\n",
    "from dotenv import load_dotenv\n",
    "\n",
    "from huggingface_hub import ModelCard, list_models\n",
    "from data.utils.utility import clean_markdown\n",
    "from time import sleep\n",
    "import datetime\n",
    "import json\n",
    "import re\n",
    "\n",
    "load_dotenv()\n",
    "# Add the project root to the path\n",
    "project_root = os.getenv(\"PARENT_ROOT\")\n",
    "print(f\"Project root: {project_root}\")\n",
    "if project_root not in sys.path:\n",
    "    sys.path.insert(0, project_root)\n",
    "\n",
    "\n",
    "f = os.getenv(\"TASKS_FILE\")\n",
    "t = json.load(open(f, \"r\"))\n",
    "tasks = list(t.keys())\n",
    "print(tasks)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "41628e6e",
   "metadata": {},
   "outputs": [],
   "source": [
    "limit = 200\n",
    "# limit_per_task = 100\n",
    "min_year = 2024\n",
    "collected_models = []\n",
    "seen_ids = set()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ffafb580",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Search: task='any-to-any'\n",
      "Found 1 models for tag 'any-to-any'\n",
      "\n",
      "Search: task='audio-classification'\n",
      "Found 1 models for tag 'audio-classification'\n",
      "\n",
      "Search: task='audio-to-audio'\n",
      "Found 1 models for tag 'audio-to-audio'\n",
      "\n",
      "Search: task='audio-text-to-text'\n",
      "Found 1 models for tag 'audio-text-to-text'\n",
      "\n",
      "Search: task='automatic-speech-recognition'\n",
      "Found 1 models for tag 'automatic-speech-recognition'\n",
      "\n",
      "Search: task='depth-estimation'\n",
      "Found 1 models for tag 'depth-estimation'\n",
      "\n",
      "Search: task='document-question-answering'\n",
      "Found 1 models for tag 'document-question-answering'\n",
      "\n",
      "Search: task='visual-document-retrieval'\n",
      "Found 1 models for tag 'visual-document-retrieval'\n",
      "\n",
      "Search: task='feature-extraction'\n",
      "Found 1 models for tag 'feature-extraction'\n",
      "\n",
      "Search: task='fill-mask'\n",
      "Found 1 models for tag 'fill-mask'\n",
      "\n",
      "Search: task='image-classification'\n",
      "Found 1 models for tag 'image-classification'\n",
      "\n",
      "Search: task='image-feature-extraction'\n",
      "Found 1 models for tag 'image-feature-extraction'\n",
      "\n",
      "Search: task='image-segmentation'\n",
      "Found 1 models for tag 'image-segmentation'\n",
      "\n",
      "Search: task='image-to-image'\n",
      "Found 1 models for tag 'image-to-image'\n",
      "\n",
      "Search: task='image-text-to-text'\n",
      "Found 1 models for tag 'image-text-to-text'\n",
      "\n",
      "Search: task='image-to-text'\n",
      "Found 1 models for tag 'image-to-text'\n",
      "\n",
      "Search: task='image-to-video'\n",
      "Found 1 models for tag 'image-to-video'\n",
      "\n",
      "Search: task='keypoint-detection'\n",
      "Found 1 models for tag 'keypoint-detection'\n",
      "\n",
      "Search: task='mask-generation'\n",
      "Found 1 models for tag 'mask-generation'\n",
      "\n",
      "Search: task='object-detection'\n",
      "Found 1 models for tag 'object-detection'\n",
      "\n",
      "Search: task='video-classification'\n",
      "Found 1 models for tag 'video-classification'\n",
      "\n",
      "Search: task='question-answering'\n",
      "Found 1 models for tag 'question-answering'\n",
      "\n",
      "Search: task='reinforcement-learning'\n",
      "Found 1 models for tag 'reinforcement-learning'\n",
      "\n",
      "Search: task='sentence-similarity'\n",
      "Found 1 models for tag 'sentence-similarity'\n",
      "\n",
      "Search: task='summarization'\n",
      "Found 1 models for tag 'summarization'\n",
      "\n",
      "Search: task='table-question-answering'\n",
      "Found 1 models for tag 'table-question-answering'\n",
      "\n",
      "Search: task='tabular-classification'\n",
      "Found 1 models for tag 'tabular-classification'\n",
      "\n",
      "Search: task='tabular-regression'\n",
      "Found 1 models for tag 'tabular-regression'\n",
      "\n",
      "Search: task='text-classification'\n",
      "Found 1 models for tag 'text-classification'\n",
      "\n",
      "Search: task='text-generation'\n",
      "Found 1 models for tag 'text-generation'\n",
      "\n",
      "Search: task='text-ranking'\n",
      "Found 1 models for tag 'text-ranking'\n",
      "\n",
      "Search: task='text-to-image'\n",
      "Found 1 models for tag 'text-to-image'\n",
      "\n",
      "Search: task='text-to-speech'\n",
      "Found 1 models for tag 'text-to-speech'\n",
      "\n",
      "Search: task='text-to-video'\n",
      "Found 1 models for tag 'text-to-video'\n",
      "\n",
      "Search: task='token-classification'\n",
      "Found 1 models for tag 'token-classification'\n",
      "\n",
      "Search: task='translation'\n",
      "Found 1 models for tag 'translation'\n",
      "\n",
      "Search: task='unconditional-image-generation'\n",
      "Found 1 models for tag 'unconditional-image-generation'\n",
      "\n",
      "Search: task='video-text-to-text'\n",
      "Found 1 models for tag 'video-text-to-text'\n",
      "\n",
      "Search: task='video-to-video'\n",
      "Found 1 models for tag 'video-to-video'\n",
      "\n",
      "Search: task='visual-question-answering'\n",
      "Found 1 models for tag 'visual-question-answering'\n",
      "\n",
      "Search: task='zero-shot-classification'\n",
      "Found 1 models for tag 'zero-shot-classification'\n",
      "\n",
      "Search: task='zero-shot-image-classification'\n",
      "Found 1 models for tag 'zero-shot-image-classification'\n",
      "\n",
      "Search: task='zero-shot-object-detection'\n",
      "Found 1 models for tag 'zero-shot-object-detection'\n",
      "\n",
      "Search: task='text-to-3d'\n",
      "Found 1 models for tag 'text-to-3d'\n",
      "\n",
      "Search: task='image-to-3d'\n",
      "Found 1 models for tag 'image-to-3d'\n"
     ]
    }
   ],
   "source": [
    "def parse_created_at(model):\n",
    "    created_at = model.created_at or model.cardData.get(\"createdAt\")\n",
    "    if isinstance(created_at, str):\n",
    "        return datetime.fromisoformat(created_at.replace(\"Z\", \"+00:00\"))\n",
    "    return created_at\n",
    "\n",
    "\n",
    "def is_recent(model, min_year=2024, min_month=8):\n",
    "    try:\n",
    "        created_at = parse_created_at(model)\n",
    "        if not created_at:\n",
    "            return False\n",
    "        return created_at.year > min_year or (\n",
    "            created_at.year == min_year and created_at.month > min_month\n",
    "        )\n",
    "    except Exception:\n",
    "        return False\n",
    "\n",
    "\n",
    "def load_modelcard(model_id):\n",
    "    try:\n",
    "        card = ModelCard.load(model_id)\n",
    "        return card.text\n",
    "    except Exception:\n",
    "        return None\n",
    "\n",
    "\n",
    "count = 0\n",
    "for task in tasks:\n",
    "    task_count = 0\n",
    "    print(f\"\\nSearch: task='{task}'\")\n",
    "\n",
    "    try:\n",
    "        models = list_models(filter=task, sort=\"downloads\", direction=-1, limit=limit)\n",
    "\n",
    "        for model in models:\n",
    "            if model.id in seen_ids:\n",
    "                continue\n",
    "            # TODO: change to is_old to get older models\n",
    "            if not is_recent(model, min_year):\n",
    "                continue\n",
    "\n",
    "            created_at = parse_created_at(model)\n",
    "            if not created_at:\n",
    "                continue\n",
    "\n",
    "            modelcard = load_modelcard(model.id)\n",
    "            if not modelcard:\n",
    "                continue\n",
    "\n",
    "            modelcard = clean_markdown(modelcard)\n",
    "            if len(modelcard.split()) <= 200:\n",
    "                continue\n",
    "\n",
    "            collected_models.append(\n",
    "                {\n",
    "                    \"model_id\": model.id,\n",
    "                    \"created_at\": created_at.isoformat(),\n",
    "                    \"downloads\": model.downloads,\n",
    "                    \"likes\": model.likes,\n",
    "                    \"author\": model.author,\n",
    "                    \"tags\": model.tags,\n",
    "                    \"modelcard\": modelcard,\n",
    "                    \"domain\": task,\n",
    "                }\n",
    "            )\n",
    "\n",
    "            seen_ids.add(model.id)\n",
    "            task_count += 1\n",
    "    except Exception as e:\n",
    "        print(f\"Error for task={task}: {e}\")\n",
    "\n",
    "    sleep(4)\n",
    "    print(f\"Found {len(collected_models) - count} models for tag '{task}'\")\n",
    "    count = len(collected_models)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "505cb17a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Before filtering, 45 entries found.\n",
      "After filtering, 44 entries remain.\n"
     ]
    }
   ],
   "source": [
    "import re\n",
    "import unicodedata\n",
    "\n",
    "\n",
    "def normalize_text(text: str) -> str:\n",
    "    \"\"\"\n",
    "    Normalize text for robust substring matching.\n",
    "    \"\"\"\n",
    "    # Unicode normalization\n",
    "    text = unicodedata.normalize(\"NFKC\", text)\n",
    "\n",
    "    # Case folding (stronger than lower())\n",
    "    text = text.casefold()\n",
    "\n",
    "    # Collapse whitespace\n",
    "    text = re.sub(r\"\\s+\", \" \", text).strip()\n",
    "\n",
    "    return text\n",
    "\n",
    "\n",
    "def is_autogenerated_modelcard(modelcard: str) -> bool:\n",
    "    \"\"\"\n",
    "    Returns True if the normalized modelcard contains\n",
    "    'This model card has been automatically generated.'.\n",
    "    \"\"\"\n",
    "    if not modelcard:\n",
    "        return False\n",
    "\n",
    "    target = normalize_text(\"This model card has been automatically generated.\")\n",
    "    content = normalize_text(modelcard)\n",
    "\n",
    "    return target in content\n",
    "\n",
    "\n",
    "print(f\"Before filtering, {len(collected_models)} entries found.\")\n",
    "# Filter out autogenerated model cards\n",
    "filtered_models = [\n",
    "    entry\n",
    "    for entry in collected_models\n",
    "    if not is_autogenerated_modelcard(entry.get(\"modelcard\", \"\"))\n",
    "]\n",
    "print(f\"After filtering, {len(filtered_models)} entries remain.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "96cd1c2d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>model_id</th>\n",
       "      <th>created_at</th>\n",
       "      <th>downloads</th>\n",
       "      <th>likes</th>\n",
       "      <th>author</th>\n",
       "      <th>tags</th>\n",
       "      <th>modelcard</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>domain</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>any-to-any</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>audio-classification</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>audio-text-to-text</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>audio-to-audio</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>automatic-speech-recognition</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>depth-estimation</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>document-question-answering</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>feature-extraction</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>fill-mask</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>image-classification</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>image-feature-extraction</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>image-segmentation</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>image-text-to-text</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>image-to-3d</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>image-to-image</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>image-to-text</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>image-to-video</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>keypoint-detection</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mask-generation</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>object-detection</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>question-answering</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>reinforcement-learning</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>sentence-similarity</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>summarization</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>table-question-answering</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>tabular-classification</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>tabular-regression</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>text-classification</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>text-generation</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>text-ranking</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>text-to-3d</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>text-to-image</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>text-to-speech</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>text-to-video</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>token-classification</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>translation</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>unconditional-image-generation</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>video-classification</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>video-text-to-text</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>video-to-video</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>visual-document-retrieval</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>visual-question-answering</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>zero-shot-classification</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>zero-shot-image-classification</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>zero-shot-object-detection</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                model_id  created_at  downloads  likes  \\\n",
       "domain                                                                   \n",
       "any-to-any                             1           1          1      1   \n",
       "audio-classification                   1           1          1      1   \n",
       "audio-text-to-text                     1           1          1      1   \n",
       "audio-to-audio                         1           1          1      1   \n",
       "automatic-speech-recognition           1           1          1      1   \n",
       "depth-estimation                       1           1          1      1   \n",
       "document-question-answering            1           1          1      1   \n",
       "feature-extraction                     1           1          1      1   \n",
       "fill-mask                              1           1          1      1   \n",
       "image-classification                   1           1          1      1   \n",
       "image-feature-extraction               1           1          1      1   \n",
       "image-segmentation                     1           1          1      1   \n",
       "image-text-to-text                     1           1          1      1   \n",
       "image-to-3d                            1           1          1      1   \n",
       "image-to-image                         1           1          1      1   \n",
       "image-to-text                          1           1          1      1   \n",
       "image-to-video                         1           1          1      1   \n",
       "keypoint-detection                     1           1          1      1   \n",
       "mask-generation                        1           1          1      1   \n",
       "object-detection                       1           1          1      1   \n",
       "question-answering                     1           1          1      1   \n",
       "reinforcement-learning                 1           1          1      1   \n",
       "sentence-similarity                    1           1          1      1   \n",
       "summarization                          1           1          1      1   \n",
       "table-question-answering               1           1          1      1   \n",
       "tabular-classification                 1           1          1      1   \n",
       "tabular-regression                     1           1          1      1   \n",
       "text-classification                    1           1          1      1   \n",
       "text-generation                        1           1          1      1   \n",
       "text-ranking                           1           1          1      1   \n",
       "text-to-3d                             1           1          1      1   \n",
       "text-to-image                          1           1          1      1   \n",
       "text-to-speech                         1           1          1      1   \n",
       "text-to-video                          1           1          1      1   \n",
       "token-classification                   1           1          1      1   \n",
       "translation                            1           1          1      1   \n",
       "unconditional-image-generation         1           1          1      1   \n",
       "video-classification                   1           1          1      1   \n",
       "video-text-to-text                     1           1          1      1   \n",
       "video-to-video                         1           1          1      1   \n",
       "visual-document-retrieval              1           1          1      1   \n",
       "visual-question-answering              1           1          1      1   \n",
       "zero-shot-classification               1           1          1      1   \n",
       "zero-shot-image-classification         1           1          1      1   \n",
       "zero-shot-object-detection             1           1          1      1   \n",
       "\n",
       "                                author  tags  modelcard  \n",
       "domain                                                   \n",
       "any-to-any                           0     1          1  \n",
       "audio-classification                 0     1          1  \n",
       "audio-text-to-text                   0     1          1  \n",
       "audio-to-audio                       0     1          1  \n",
       "automatic-speech-recognition         0     1          1  \n",
       "depth-estimation                     0     1          1  \n",
       "document-question-answering          0     1          1  \n",
       "feature-extraction                   0     1          1  \n",
       "fill-mask                            0     1          1  \n",
       "image-classification                 0     1          1  \n",
       "image-feature-extraction             0     1          1  \n",
       "image-segmentation                   0     1          1  \n",
       "image-text-to-text                   0     1          1  \n",
       "image-to-3d                          0     1          1  \n",
       "image-to-image                       0     1          1  \n",
       "image-to-text                        0     1          1  \n",
       "image-to-video                       0     1          1  \n",
       "keypoint-detection                   0     1          1  \n",
       "mask-generation                      0     1          1  \n",
       "object-detection                     0     1          1  \n",
       "question-answering                   0     1          1  \n",
       "reinforcement-learning               0     1          1  \n",
       "sentence-similarity                  0     1          1  \n",
       "summarization                        0     1          1  \n",
       "table-question-answering             0     1          1  \n",
       "tabular-classification               0     1          1  \n",
       "tabular-regression                   0     1          1  \n",
       "text-classification                  0     1          1  \n",
       "text-generation                      0     1          1  \n",
       "text-ranking                         0     1          1  \n",
       "text-to-3d                           0     1          1  \n",
       "text-to-image                        0     1          1  \n",
       "text-to-speech                       0     1          1  \n",
       "text-to-video                        0     1          1  \n",
       "token-classification                 0     1          1  \n",
       "translation                          0     1          1  \n",
       "unconditional-image-generation       0     1          1  \n",
       "video-classification                 0     1          1  \n",
       "video-text-to-text                   0     1          1  \n",
       "video-to-video                       0     1          1  \n",
       "visual-document-retrieval            0     1          1  \n",
       "visual-question-answering            0     1          1  \n",
       "zero-shot-classification             0     1          1  \n",
       "zero-shot-image-classification       0     1          1  \n",
       "zero-shot-object-detection           0     1          1  "
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "df = pd.DataFrame(collected_models)\n",
    "df.groupby(\"domain\").count().sort_values(by=\"model_id\", ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "d789847a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "After removing duplicates, 44 entries remain.\n"
     ]
    }
   ],
   "source": [
    "def remove_duplicates_by_modelcard(models, key=\"modelcard\", priority_key=\"downloads\"):\n",
    "    \"\"\"\n",
    "    Remove duplicate models based on a key (e.g., 'modelcard').\n",
    "    When duplicates are found, keep the one with the highest value for priority_key.\n",
    "\n",
    "    Args:\n",
    "        models: List of model dictionaries\n",
    "        key: Field to check for duplicates (default: 'modelcard')\n",
    "        priority_key: Field to use for selecting which duplicate to keep (default: 'downloads')\n",
    "\n",
    "    Returns:\n",
    "        List of unique models\n",
    "    \"\"\"\n",
    "    unique_items = {}\n",
    "\n",
    "    for item in models:\n",
    "        key_value = item.get(key)\n",
    "\n",
    "        if key_value is None:\n",
    "            # Skip items without the key\n",
    "            continue\n",
    "\n",
    "        if key_value not in unique_items:\n",
    "            # First time seeing this key value\n",
    "            unique_items[key_value] = item\n",
    "        else:\n",
    "            # Duplicate found - keep the one with higher priority_key value\n",
    "            current_priority = item.get(priority_key, 0)\n",
    "            existing_priority = unique_items[key_value].get(priority_key, 0)\n",
    "\n",
    "            if current_priority > existing_priority:\n",
    "                unique_items[key_value] = item\n",
    "\n",
    "    return list(unique_items.values())\n",
    "\n",
    "\n",
    "cleaned_models = remove_duplicates_by_modelcard(\n",
    "    filtered_models, key=\"modelcard\", priority_key=\"downloads\"\n",
    ")\n",
    "print(f\"After removing duplicates, {len(cleaned_models)} entries remain.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "8dbd49c7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'model_id': 'Qwen/Qwen2.5-VL-3B-Instruct',\n",
       "  'created_at': '2025-01-26T09:25:35+00:00',\n",
       "  'downloads': 19873215,\n",
       "  'likes': 591,\n",
       "  'author': None,\n",
       "  'tags': ['transformers',\n",
       "   'safetensors',\n",
       "   'qwen2_5_vl',\n",
       "   'image-to-text',\n",
       "   'multimodal',\n",
       "   'image-text-to-text',\n",
       "   'conversational',\n",
       "   'en',\n",
       "   'arxiv:2309.00071',\n",
       "   'arxiv:2409.12191',\n",
       "   'arxiv:2308.12966',\n",
       "   'text-generation-inference',\n",
       "   'endpoints_compatible',\n",
       "   'deploy:azure',\n",
       "   'region:us'],\n",
       "  'modelcard': '\\n# Qwen2.5-VL-3B-Instruct\\n\\n    \\n\\n\\n## Introduction\\n\\nIn the past five months since Qwen2-VL’s release, numerous developers have built new models on the Qwen2-VL vision-language models, providing us with valuable feedback. During this period, we focused on building more useful vision-language models. Today, we are excited to introduce the latest addition to the Qwen family: Qwen2.5-VL.\\n\\n#### Key Enhancements:\\n* **Understand things visually**: Qwen2.5-VL is not only proficient in recognizing common objects such as flowers, birds, fish, and insects, but it is highly capable of analyzing texts, charts, icons, graphics, and layouts within images.\\n\\n* **Being agentic**: Qwen2.5-VL directly plays as a visual agent that can reason and dynamically direct tools, which is capable of computer use and phone use.\\n\\n* **Understanding long videos and capturing events**: Qwen2.5-VL can comprehend videos of over 1 hour, and this time it has a new ability of cpaturing event by pinpointing the relevant video segments.\\n\\n* **Capable of visual localization in different formats**: Qwen2.5-VL can accurately localize objects in an image by generating bounding boxes or points, and it can provide stable JSON outputs for coordinates and attributes.\\n\\n* **Generating structured outputs**: for data like scans of invoices, forms, tables, etc. Qwen2.5-VL supports structured outputs of their contents, benefiting usages in finance, commerce, etc.\\n\\n\\n#### Model Architecture Updates:\\n\\n* **Dynamic Resolution and Frame Rate Training for Video Understanding**:\\n\\nWe extend dynamic resolution to the temporal dimension by adopting dynamic FPS sampling, enabling the model to comprehend videos at various sampling rates. 
Accordingly, we update mRoPE in the time dimension with IDs and absolute time alignment, enabling the model to learn temporal sequence and speed, and ultimately acquire the ability to pinpoint specific moments.\\n\\n\\n    \\n\\n\\n\\n* **Streamlined and Efficient Vision Encoder**\\n\\nWe enhance both training and inference speeds by strategically implementing window attention into the ViT. The ViT architecture is further optimized with SwiGLU and RMSNorm, aligning it with the structure of the Qwen2.5 LLM.\\n\\n\\nWe have three models with 3, 7 and 72 billion parameters. This repo contains the instruction-tuned 3B Qwen2.5-VL model. For more information, visit our  and .\\n\\n\\n\\n## Evaluation\\n\\n### Image benchmark\\n\\n| Benchmark | InternVL2.5-4B |Qwen2-VL-7B |Qwen2.5-VL-3B |\\n| :--- | :---:  | :---: | :---: |\\n| MMMUval  | 52.3 | 54.1 | 53.1| \\n| MMMU-Proval  | **32.7** | 30.5 | 31.6|\\n| AI2Dtest | 81.4 | **83.0** | 81.5 |\\n| DocVQAtest  | 91.6 | 94.5 | **93.9** | \\n| InfoVQAtest  | 72.1 | 76.5 | **77.1** |\\n| TextVQAval  | 76.8 | **84.3** | 79.3|\\n| MMBench-V1.1test  | 79.3 | **80.7** | 77.6 | \\n| MMStar | 58.3 | **60.7** | 55.9 | \\n| MathVistatestmini  | 60.5 | 58.2 | **62.3** |\\n| MathVisionfull  | 20.9 | 16.3  | **21.2** |\\n\\n\\n### Video benchmark\\n| Benchmark | InternVL2.5-4B | Qwen2-VL-7B | Qwen2.5-VL-3B |\\n| :--- | :---:  | :---: | :---: |\\n| MVBench | 71.6 | 67.0 | 67.0 |\\n| VideoMME | 63.6/62.3 | 69.0/63.3 | 67.6/61.5 |\\n| MLVU | 48.3 | - | 68.2 |\\n| LVBench | - | - | 43.3 |\\n| MMBench-Video | 1.73 | 1.44 | 1.63 |\\n| EgoSchema | - | - | 64.8 |\\n| PerceptionTest | - | - | 66.9 |\\n| TempCompass | - | - | 64.4 |\\n| LongVideoBench | 55.2 | 55.6 | 54.2 |\\n| CharadesSTA/mIoU | - | - | 38.8 |\\n\\n\\n### Agent benchmark\\n| Benchmarks              | Qwen2.5-VL-3B |\\n|-------------------------|---------------|\\n| ScreenSpot              |     55.5    |\\n| ScreenSpot Pro          |     23.9    |\\n| AITZ_EM                 |  
\\t76.9    |\\n| Android Control High_EM |    \\t63.7    |\\n| Android Control Low_EM  |  \\t22.2    |\\n| AndroidWorld_SR         | \\t90.8  \\t|\\n| MobileMiniWob++_SR      | \\t67.9    |\\n\\n## Requirements\\nThe code of Qwen2.5-VL has been in the latest Hugging face transformers and we advise you to build from source with command:\\n```\\npip install git+ accelerate\\n```\\nor you might encounter the following error:\\n```\\nKeyError: \\'qwen2_5_vl\\'\\n```\\n\\n\\n## Quickstart\\n\\nBelow, we provide simple examples to show how to use Qwen2.5-VL with 🤖 ModelScope and 🤗 Transformers.\\n\\nThe code of Qwen2.5-VL has been in the latest Hugging face transformers and we advise you to build from source with command:\\n```\\npip install git+ accelerate\\n```\\nor you might encounter the following error:\\n```\\nKeyError: \\'qwen2_5_vl\\'\\n```\\n\\n\\nWe offer a toolkit to help you handle various types of visual input more conveniently, as if you were using an API. This includes base64, URLs, and interleaved images and videos. You can install it using the following command:\\n\\n```bash\\n# It\\'s highly recommanded to use `[decord]` feature for faster video loading.\\npip install qwen-vl-utils[decord]==0.0.8\\n```\\n\\nIf you are not using Linux, you might not be able to install `decord` from PyPI. In that case, you can use `pip install qwen-vl-utils` which will fall back to using torchvision for video processing. 
However, you can still  to get decord used when loading video.\\n\\n### Using 🤗  Transformers to Chat\\n\\nHere we show a code snippet to show you how to use the chat model with `transformers` and `qwen_vl_utils`:\\n\\n```python\\nfrom transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor\\nfrom qwen_vl_utils import process_vision_info\\n\\n# default: Load the model on the available device(s)\\nmodel = Qwen2_5_VLForConditionalGeneration.from_pretrained(\\n    \"Qwen/Qwen2.5-VL-3B-Instruct\", torch_dtype=\"auto\", device_map=\"auto\"\\n)\\n\\n# We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.\\n# model = Qwen2_5_VLForConditionalGeneration.from_pretrained(\\n#     \"Qwen/Qwen2.5-VL-3B-Instruct\",\\n#     torch_dtype=torch.bfloat16,\\n#     attn_implementation=\"flash_attention_2\",\\n#     device_map=\"auto\",\\n# )\\n\\n# default processer\\nprocessor = AutoProcessor.from_pretrained(\"Qwen/Qwen2.5-VL-3B-Instruct\")\\n\\n# The default range for the number of visual tokens per image in the model is 4-16384.\\n# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.\\n# min_pixels = 256*28*28\\n# max_pixels = 1280*28*28\\n# processor = AutoProcessor.from_pretrained(\"Qwen/Qwen2.5-VL-3B-Instruct\", min_pixels=min_pixels, max_pixels=max_pixels)\\n\\nmessages = [\\n    {\\n        \"role\": \"user\",\\n        \"content\": [\\n            {\\n                \"type\": \"image\",\\n                \"image\": \"\\n            },\\n            {\"type\": \"text\", \"text\": \"Describe this image.\"},\\n        ],\\n    }\\n]\\n\\n# Preparation for inference\\ntext = processor.apply_chat_template(\\n    messages, tokenize=False, add_generation_prompt=True\\n)\\nimage_inputs, video_inputs = process_vision_info(messages)\\ninputs = processor(\\n    text=[text],\\n    images=image_inputs,\\n    
videos=video_inputs,\\n    padding=True,\\n    return_tensors=\"pt\",\\n)\\ninputs = inputs.to(\"cuda\")\\n\\n# Inference: Generation of the output\\ngenerated_ids = model.generate(**inputs, max_new_tokens=128)\\ngenerated_ids_trimmed = [\\n    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\\n]\\noutput_text = processor.batch_decode(\\n    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False\\n)\\nprint(output_text)\\n```\\n\\nMulti image inference\\n\\n```python\\n# Messages containing multiple images and a text query\\nmessages = [\\n    {\\n        \"role\": \"user\",\\n        \"content\": [\\n            {\"type\": \"image\", \"image\": \"file:///path/to/image1.jpg\"},\\n            {\"type\": \"image\", \"image\": \"file:///path/to/image2.jpg\"},\\n            {\"type\": \"text\", \"text\": \"Identify the similarities between these images.\"},\\n        ],\\n    }\\n]\\n\\n# Preparation for inference\\ntext = processor.apply_chat_template(\\n    messages, tokenize=False, add_generation_prompt=True\\n)\\nimage_inputs, video_inputs = process_vision_info(messages)\\ninputs = processor(\\n    text=[text],\\n    images=image_inputs,\\n    videos=video_inputs,\\n    padding=True,\\n    return_tensors=\"pt\",\\n)\\ninputs = inputs.to(\"cuda\")\\n\\n# Inference\\ngenerated_ids = model.generate(**inputs, max_new_tokens=128)\\ngenerated_ids_trimmed = [\\n    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\\n]\\noutput_text = processor.batch_decode(\\n    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False\\n)\\nprint(output_text)\\n```\\n\\n\\n\\nVideo inference\\n\\n```python\\n# Messages containing a images list as a video and a text query\\nmessages = [\\n    {\\n        \"role\": \"user\",\\n        \"content\": [\\n            {\\n                \"type\": \"video\",\\n                \"video\": [\\n                    
\"file:///path/to/frame1.jpg\",\\n                    \"file:///path/to/frame2.jpg\",\\n                    \"file:///path/to/frame3.jpg\",\\n                    \"file:///path/to/frame4.jpg\",\\n                ],\\n            },\\n            {\"type\": \"text\", \"text\": \"Describe this video.\"},\\n        ],\\n    }\\n]\\n\\n# Messages containing a local video path and a text query\\nmessages = [\\n    {\\n        \"role\": \"user\",\\n        \"content\": [\\n            {\\n                \"type\": \"video\",\\n                \"video\": \"file:///path/to/video1.mp4\",\\n                \"max_pixels\": 360 * 420,\\n                \"fps\": 1.0,\\n            },\\n            {\"type\": \"text\", \"text\": \"Describe this video.\"},\\n        ],\\n    }\\n]\\n\\n# Messages containing a video url and a text query\\nmessages = [\\n    {\\n        \"role\": \"user\",\\n        \"content\": [\\n            {\\n                \"type\": \"video\",\\n                \"video\": \"\\n            },\\n            {\"type\": \"text\", \"text\": \"Describe this video.\"},\\n        ],\\n    }\\n]\\n\\n#In Qwen 2.5 VL, frame rate information is also input into the model to align with absolute time.\\n# Preparation for inference\\ntext = processor.apply_chat_template(\\n    messages, tokenize=False, add_generation_prompt=True\\n)\\nimage_inputs, video_inputs, video_kwargs = process_vision_info(messages, return_video_kwargs=True)\\ninputs = processor(\\n    text=[text],\\n    images=image_inputs,\\n    videos=video_inputs,\\n    fps=fps,\\n    padding=True,\\n    return_tensors=\"pt\",\\n    **video_kwargs,\\n)\\ninputs = inputs.to(\"cuda\")\\n\\n# Inference\\ngenerated_ids = model.generate(**inputs, max_new_tokens=128)\\ngenerated_ids_trimmed = [\\n    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\\n]\\noutput_text = processor.batch_decode(\\n    generated_ids_trimmed, skip_special_tokens=True, 
clean_up_tokenization_spaces=False\\n)\\nprint(output_text)\\n```\\n\\nVideo URL compatibility largely depends on the third-party library version. The details are in the table below. change the backend by `FORCE_QWENVL_VIDEO_READER=torchvision` or `FORCE_QWENVL_VIDEO_READER=decord` if you prefer not to use the default one.\\n\\n| Backend     | HTTP | HTTPS |\\n|-------------|------|-------|\\n| torchvision >= 0.19.0 | ✅  | ✅   |\\n| torchvision \\n\\n\\nBatch inference\\n\\n```python\\n# Sample messages for batch inference\\nmessages1 = [\\n    {\\n        \"role\": \"user\",\\n        \"content\": [\\n            {\"type\": \"image\", \"image\": \"file:///path/to/image1.jpg\"},\\n            {\"type\": \"image\", \"image\": \"file:///path/to/image2.jpg\"},\\n            {\"type\": \"text\", \"text\": \"What are the common elements in these pictures?\"},\\n        ],\\n    }\\n]\\nmessages2 = [\\n    {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\\n    {\"role\": \"user\", \"content\": \"Who are you?\"},\\n]\\n# Combine messages for batch processing\\nmessages = [messages1, messages2]\\n\\n# Preparation for batch inference\\ntexts = [\\n    processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)\\n    for msg in messages\\n]\\nimage_inputs, video_inputs = process_vision_info(messages)\\ninputs = processor(\\n    text=texts,\\n    images=image_inputs,\\n    videos=video_inputs,\\n    padding=True,\\n    return_tensors=\"pt\",\\n)\\ninputs = inputs.to(\"cuda\")\\n\\n# Batch Inference\\ngenerated_ids = model.generate(**inputs, max_new_tokens=128)\\ngenerated_ids_trimmed = [\\n    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\\n]\\noutput_texts = processor.batch_decode(\\n    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False\\n)\\nprint(output_texts)\\n```\\n\\n\\n### 🤖 ModelScope\\nWe strongly advise users especially those in mainland China to use 
ModelScope. `snapshot_download` can help you solve issues concerning downloading checkpoints.\\n\\n\\n### More Usage Tips\\n\\nFor input images, we support local files, base64, and URLs. For videos, we currently only support local files.\\n\\n```python\\n# You can directly insert a local file path, a URL, or a base64-encoded image into the position where you want in the text.\\n## Local file path\\nmessages = [\\n    {\\n        \"role\": \"user\",\\n        \"content\": [\\n            {\"type\": \"image\", \"image\": \"file:///path/to/your/image.jpg\"},\\n            {\"type\": \"text\", \"text\": \"Describe this image.\"},\\n        ],\\n    }\\n]\\n## Image URL\\nmessages = [\\n    {\\n        \"role\": \"user\",\\n        \"content\": [\\n            {\"type\": \"image\", \"image\": \"\\n            {\"type\": \"text\", \"text\": \"Describe this image.\"},\\n        ],\\n    }\\n]\\n## Base64 encoded image\\nmessages = [\\n    {\\n        \"role\": \"user\",\\n        \"content\": [\\n            {\"type\": \"image\", \"image\": \"data:image;base64,/9j/...\"},\\n            {\"type\": \"text\", \"text\": \"Describe this image.\"},\\n        ],\\n    }\\n]\\n```\\n#### Image Resolution for performance boost\\n\\nThe model supports a wide range of resolution inputs. By default, it uses the native resolution for input, but higher resolutions can enhance performance at the cost of more computation. Users can set the minimum and maximum number of pixels to achieve an optimal configuration for their needs, such as a token count range of 256-1280, to balance speed and memory usage.\\n\\n```python\\nmin_pixels = 256 * 28 * 28\\nmax_pixels = 1280 * 28 * 28\\nprocessor = AutoProcessor.from_pretrained(\\n    \"Qwen/Qwen2.5-VL-3B-Instruct\", min_pixels=min_pixels, max_pixels=max_pixels\\n)\\n```\\n\\nBesides, We provide two methods for fine-grained control over the image size input to the model:\\n\\n1. 
Define min_pixels and max_pixels: Images will be resized to maintain their aspect ratio within the range of min_pixels and max_pixels.\\n   \\n2. Specify exact dimensions: Directly set `resized_height` and `resized_width`. These values will be rounded to the nearest multiple of 28.\\n\\n```python\\n# min_pixels and max_pixels\\nmessages = [\\n    {\\n        \"role\": \"user\",\\n        \"content\": [\\n            {\\n                \"type\": \"image\",\\n                \"image\": \"file:///path/to/your/image.jpg\",\\n                \"resized_height\": 280,\\n                \"resized_width\": 420,\\n            },\\n            {\"type\": \"text\", \"text\": \"Describe this image.\"},\\n        ],\\n    }\\n]\\n# resized_height and resized_width\\nmessages = [\\n    {\\n        \"role\": \"user\",\\n        \"content\": [\\n            {\\n                \"type\": \"image\",\\n                \"image\": \"file:///path/to/your/image.jpg\",\\n                \"min_pixels\": 50176,\\n                \"max_pixels\": 50176,\\n            },\\n            {\"type\": \"text\", \"text\": \"Describe this image.\"},\\n        ],\\n    }\\n]\\n```\\n\\n### Processing Long Texts\\n\\nThe current `config.json` is set for context length up to 32,768 tokens.\\nTo handle extensive inputs exceeding 32,768 tokens, we utilize , a technique for enhancing model length extrapolation, ensuring optimal performance on lengthy texts.\\n\\nFor supported frameworks, you could add the following to `config.json` to enable YaRN:\\n\\n```\\n{\\n\\t...,\\n    \"type\": \"yarn\",\\n    \"mrope_section\": [\\n        16,\\n        24,\\n        24\\n    ],\\n    \"factor\": 4,\\n    \"original_max_position_embeddings\": 32768\\n}\\n```\\n\\nHowever, it should be noted that this method has a significant impact on the performance of temporal and spatial localization tasks, and is therefore not recommended for use.\\n\\nAt the same time, for long video inputs, since MRoPE itself is more 
economical with ids, the max_position_embeddings can be directly modified to a larger value, such as 64k.\\n\\n\\n\\n## Citation\\n\\nIf you find our work helpful, feel free to give us a cite.\\n\\n```\\n@misc{qwen2.5-VL,\\n    title = {Qwen2.5-VL},\\n    url = {\\n    author = {Qwen Team},\\n    month = {January},\\n    year = {2025}\\n}\\n\\n@article{Qwen2VL,\\n  title={Qwen2-VL: Enhancing Vision-Language Model\\'s Perception of the World at Any Resolution},\\n  author={Wang, Peng and Bai, Shuai and Tan, Sinan and Wang, Shijie and Fan, Zhihao and Bai, Jinze and Chen, Keqin and Liu, Xuejing and Wang, Jialin and Ge, Wenbin and Fan, Yang and Dang, Kai and Du, Mengfei and Ren, Xuancheng and Men, Rui and Liu, Dayiheng and Zhou, Chang and Zhou, Jingren and Lin, Junyang},\\n  journal={arXiv preprint arXiv:2409.12191},\\n  year={2024}\\n}\\n\\n@article{Qwen-VL,\\n  title={Qwen-VL: A Versatile Vision-Language Model for Understanding, Localization, Text Reading, and Beyond},\\n  author={Bai, Jinze and Bai, Shuai and Yang, Shusheng and Wang, Shijie and Tan, Sinan and Wang, Peng and Lin, Junyang and Zhou, Chang and Zhou, Jingren},\\n  journal={arXiv preprint arXiv:2308.12966},\\n  year={2023}\\n}\\n```\\n',\n",
       "  'domain': 'image-text-to-text'},\n",
       " {'model_id': 'Qwen/Qwen2.5-3B-Instruct',\n",
       "  'created_at': '2024-09-17T14:08:52+00:00',\n",
       "  'downloads': 15565876,\n",
       "  'likes': 386,\n",
       "  'author': None,\n",
       "  'tags': ['transformers',\n",
       "   'safetensors',\n",
       "   'qwen2',\n",
       "   'text-generation',\n",
       "   'chat',\n",
       "   'conversational',\n",
       "   'en',\n",
       "   'arxiv:2407.10671',\n",
       "   'base_model:Qwen/Qwen2.5-3B',\n",
       "   'base_model:finetune:Qwen/Qwen2.5-3B',\n",
       "   'license:other',\n",
       "   'text-generation-inference',\n",
       "   'endpoints_compatible',\n",
       "   'deploy:azure',\n",
       "   'region:us'],\n",
       "  'modelcard': '\\n# Qwen2.5-3B-Instruct\\n\\n## Introduction\\n\\nQwen2.5 is the latest series of Qwen large language models. For Qwen2.5, we release a number of base language models and instruction-tuned language models ranging from 0.5 to 72 billion parameters. Qwen2.5 brings the following improvements upon Qwen2:\\n\\n- Significantly **more knowledge** and has greatly improved capabilities in **coding** and **mathematics**, thanks to our specialized expert models in these domains.\\n- Significant improvements in **instruction following**, **generating long texts** (over 8K tokens), **understanding structured data** (e.g, tables), and **generating structured outputs** especially JSON. **More resilient to the diversity of system prompts**, enhancing role-play implementation and condition-setting for chatbots.\\n- **Long-context Support** up to 128K tokens and can generate up to 8K tokens.\\n- **Multilingual support** for over 29 languages, including Chinese, English, French, Spanish, Portuguese, German, Italian, Russian, Japanese, Korean, Vietnamese, Thai, Arabic, and more. 
\\n\\n**This repo contains the instruction-tuned 3B Qwen2.5 model**, which has the following features:\\n- Type: Causal Language Models\\n- Training Stage: Pretraining & Post-training\\n- Architecture: transformers with RoPE, SwiGLU, RMSNorm, Attention QKV bias and tied word embeddings\\n- Number of Parameters: 3.09B\\n- Number of Paramaters (Non-Embedding): 2.77B\\n- Number of Layers: 36\\n- Number of Attention Heads (GQA): 16 for Q and 2 for KV\\n- Context Length: Full 32,768 tokens and generation 8192 tokens\\n\\nFor more details, please refer to our , , and .\\n\\n## Requirements\\n\\nThe code of Qwen2.5 has been in the latest Hugging face `transformers` and we advise you to use the latest version of `transformers`.\\n\\nWith `transformers<4.37.0`, you will encounter the following error:\\n```\\nKeyError: \\'qwen2\\'\\n```\\n\\n## Quickstart\\n\\nHere provides a code snippet with `apply_chat_template` to show you how to load the tokenizer and model and how to generate contents.\\n\\n```python\\nfrom transformers import AutoModelForCausalLM, AutoTokenizer\\n\\nmodel_name = \"Qwen/Qwen2.5-3B-Instruct\"\\n\\nmodel = AutoModelForCausalLM.from_pretrained(\\n    model_name,\\n    torch_dtype=\"auto\",\\n    device_map=\"auto\"\\n)\\ntokenizer = AutoTokenizer.from_pretrained(model_name)\\n\\nprompt = \"Give me a short introduction to large language model.\"\\nmessages = [\\n    {\"role\": \"system\", \"content\": \"You are Qwen, created by Alibaba Cloud. 
You are a helpful assistant.\"},\\n    {\"role\": \"user\", \"content\": prompt}\\n]\\ntext = tokenizer.apply_chat_template(\\n    messages,\\n    tokenize=False,\\n    add_generation_prompt=True\\n)\\nmodel_inputs = tokenizer([text], return_tensors=\"pt\").to(model.device)\\n\\ngenerated_ids = model.generate(\\n    **model_inputs,\\n    max_new_tokens=512\\n)\\ngenerated_ids = [\\n    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)\\n]\\n\\nresponse = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]\\n```\\n\\n\\n## Evaluation & Performance\\n\\nDetailed evaluation results are reported in this .\\n\\nFor requirements on GPU memory and the respective throughput, see results .\\n\\n## Citation\\n\\nIf you find our work helpful, feel free to give us a cite.\\n\\n```\\n@misc{qwen2.5,\\n    title = {Qwen2.5: A Party of Foundation Models},\\n    url = {\\n    author = {Qwen Team},\\n    month = {September},\\n    year = {2024}\\n}\\n\\n@article{qwen2,\\n      title={Qwen2 Technical Report}, \\n      author={An Yang and Baosong Yang and Binyuan Hui and Bo Zheng and Bowen Yu and Chang Zhou and Chengpeng Li and Chengyuan Li and Dayiheng Liu and Fei Huang and Guanting Dong and Haoran Wei and Huan Lin and Jialong Tang and Jialin Wang and Jian Yang and Jianhong Tu and Jianwei Zhang and Jianxin Ma and Jin Xu and Jingren Zhou and Jinze Bai and Jinzheng He and Junyang Lin and Kai Dang and Keming Lu and Keqin Chen and Kexin Yang and Mei Li and Mingfeng Xue and Na Ni and Pei Zhang and Peng Wang and Ru Peng and Rui Men and Ruize Gao and Runji Lin and Shijie Wang and Shuai Bai and Sinan Tan and Tianhang Zhu and Tianhao Li and Tianyu Liu and Wenbin Ge and Xiaodong Deng and Xiaohuan Zhou and Xingzhang Ren and Xinyu Zhang and Xipin Wei and Xuancheng Ren and Yang Fan and Yang Yao and Yichang Zhang and Yu Wan and Yunfei Chu and Yuqiong Liu and Zeyu Cui and Zhenru Zhang and Zhihao Fan},\\n      journal={arXiv preprint 
arXiv:2407.10671},\\n      year={2024}\\n}\\n```',\n",
       "  'domain': 'text-generation'},\n",
       " {'model_id': 'jinaai/jina-embeddings-v3',\n",
       "  'created_at': '2024-09-05T11:56:46+00:00',\n",
       "  'downloads': 5086228,\n",
       "  'likes': 1125,\n",
       "  'author': None,\n",
       "  'tags': ['transformers',\n",
       "   'pytorch',\n",
       "   'onnx',\n",
       "   'safetensors',\n",
       "   'feature-extraction',\n",
       "   'sentence-similarity',\n",
       "   'mteb',\n",
       "   'sentence-transformers',\n",
       "   'custom_code',\n",
       "   'multilingual',\n",
       "   'af',\n",
       "   'am',\n",
       "   'ar',\n",
       "   'as',\n",
       "   'az',\n",
       "   'be',\n",
       "   'bg',\n",
       "   'bn',\n",
       "   'br',\n",
       "   'bs',\n",
       "   'ca',\n",
       "   'cs',\n",
       "   'cy',\n",
       "   'da',\n",
       "   'de',\n",
       "   'el',\n",
       "   'en',\n",
       "   'eo',\n",
       "   'es',\n",
       "   'et',\n",
       "   'eu',\n",
       "   'fa',\n",
       "   'fi',\n",
       "   'fr',\n",
       "   'fy',\n",
       "   'ga',\n",
       "   'gd',\n",
       "   'gl',\n",
       "   'gu',\n",
       "   'ha',\n",
       "   'he',\n",
       "   'hi',\n",
       "   'hr',\n",
       "   'hu',\n",
       "   'hy',\n",
       "   'id',\n",
       "   'is',\n",
       "   'it',\n",
       "   'ja',\n",
       "   'jv',\n",
       "   'ka',\n",
       "   'kk',\n",
       "   'km',\n",
       "   'kn',\n",
       "   'ko',\n",
       "   'ku',\n",
       "   'ky',\n",
       "   'la',\n",
       "   'lo',\n",
       "   'lt',\n",
       "   'lv',\n",
       "   'mg',\n",
       "   'mk',\n",
       "   'ml',\n",
       "   'mn',\n",
       "   'mr',\n",
       "   'ms',\n",
       "   'my',\n",
       "   'ne',\n",
       "   'nl',\n",
       "   'no',\n",
       "   'om',\n",
       "   'or',\n",
       "   'pa',\n",
       "   'pl',\n",
       "   'ps',\n",
       "   'pt',\n",
       "   'ro',\n",
       "   'ru',\n",
       "   'sa',\n",
       "   'sd',\n",
       "   'si',\n",
       "   'sk',\n",
       "   'sl',\n",
       "   'so',\n",
       "   'sq',\n",
       "   'sr',\n",
       "   'su',\n",
       "   'sv',\n",
       "   'sw',\n",
       "   'ta',\n",
       "   'te',\n",
       "   'th',\n",
       "   'tl',\n",
       "   'tr',\n",
       "   'ug',\n",
       "   'uk',\n",
       "   'ur',\n",
       "   'uz',\n",
       "   'vi',\n",
       "   'xh',\n",
       "   'yi',\n",
       "   'zh',\n",
       "   'arxiv:2409.10173',\n",
       "   'license:cc-by-nc-4.0',\n",
       "   'model-index',\n",
       "   'region:eu'],\n",
       "  'modelcard': '\\n\\n\\n\\n\\n\\n\\n\\n\\nThe embedding model trained by Jina AI.\\n\\n\\n\\njina-embeddings-v3: Multilingual Embeddings With Task LoRA\\n\\n\\n## Quick Start\\n\\n |  |  | \\n\\n\\n## Intended Usage & Model Info\\n\\n\\n`jina-embeddings-v3` is a **multilingual multi-task text embedding model** designed for a variety of NLP applications.\\nBased on the , \\nthis model supports Rotary Position Embeddings to handle long input sequences up to **8192 tokens**.\\nAdditionally, it features 5 LoRA adapters to generate task-specific embeddings efficiently.\\n\\n### Key Features:\\n- **Extended Sequence Length:** Supports up to 8192 tokens with RoPE.\\n- **Task-Specific Embedding:** Customize embeddings through the `task` argument with the following options:\\n    - `retrieval.query`: Used for query embeddings in asymmetric retrieval tasks\\n    - `retrieval.passage`: Used for passage embeddings in asymmetric retrieval tasks\\n    - `separation`: Used for embeddings in clustering and re-ranking applications\\n    - `classification`: Used for embeddings in classification tasks\\n    - `text-matching`: Used for embeddings in tasks that quantify similarity between two texts, such as STS or symmetric retrieval tasks\\n- **Matryoshka Embeddings**: Supports flexible embedding sizes (`32, 64, 128, 256, 512, 768, 1024`), allowing for truncating embeddings to fit your application.\\n\\n### Supported Languages:\\nWhile the foundation model supports 100 languages, we\\'ve focused our tuning efforts on the following 30 languages: \\n**Arabic, Bengali, Chinese, Danish, Dutch, English, Finnish, French, Georgian, German, Greek, \\nHindi, Indonesian, Italian, Japanese, Korean, Latvian, Norwegian, Polish, Portuguese, Romanian, \\nRussian, Slovak, Spanish, Swedish, Thai, Turkish, Ukrainian, Urdu,** and **Vietnamese.**\\n\\n\\n> **⚠️ Important Notice:**  \\n> We fixed a bug in the `encode` function  where **Matryoshka embedding truncation** occurred *after 
normalization*, leading to non-normalized truncated embeddings. This issue has been resolved in the latest code revision.  \\n>  \\n> If you have encoded data using the previous version and wish to maintain consistency, please use the specific code revision when loading the model: `AutoModel.from_pretrained(\\'jinaai/jina-embeddings-v3\\', code_revision=\\'da863dd04a4e5dce6814c6625adfba87b83838aa\\', ...)`\\n\\n\\n## Usage\\n\\n**Apply mean pooling when integrating the model.**\\n\\n\\n### Why Use Mean Pooling?\\n\\nMean pooling takes all token embeddings from the model\\'s output and averages them at the sentence or paragraph level. \\nThis approach has been shown to produce high-quality sentence embeddings.\\n\\nWe provide an `encode` function that handles this for you automatically.\\n\\nHowever, if you\\'re working with the model directly, outside of the `encode` function, \\nyou\\'ll need to apply mean pooling manually. Here\\'s how you can do it:\\n\\n\\n```python\\nimport torch\\nimport torch.nn.functional as F\\nfrom transformers import AutoTokenizer, AutoModel\\n\\n\\ndef mean_pooling(model_output, attention_mask):\\n    token_embeddings = model_output[0]\\n    input_mask_expanded = (\\n        attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()\\n    )\\n    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(\\n        input_mask_expanded.sum(1), min=1e-9\\n    )\\n\\n\\nsentences = [\"How is the weather today?\", \"What is the current weather like today?\"]\\n\\ntokenizer = AutoTokenizer.from_pretrained(\"jinaai/jina-embeddings-v3\")\\nmodel = AutoModel.from_pretrained(\"jinaai/jina-embeddings-v3\", trust_remote_code=True)\\n\\nencoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors=\"pt\")\\ntask = \\'retrieval.query\\'\\ntask_id = model._adaptation_map[task]\\nadapter_mask = torch.full((len(sentences),), task_id, dtype=torch.int32)\\nwith torch.no_grad():\\n    model_output = 
model(**encoded_input, adapter_mask=adapter_mask)\\n\\nembeddings = mean_pooling(model_output, encoded_input[\"attention_mask\"])\\nembeddings = F.normalize(embeddings, p=2, dim=1)\\n\\n```\\n\\n\\n\\n\\nThe easiest way to start using `jina-embeddings-v3` is with the .\\n\\nAlternatively, you can use `jina-embeddings-v3` directly via Transformers package:\\n```bash\\n!pip install transformers torch einops\\n!pip install \\'numpyONNX Inference.**\\n\\n\\nYou can use ONNX for efficient inference with `jina-embeddings-v3`:\\n```python\\nimport onnxruntime\\nimport numpy as np\\nfrom transformers import AutoTokenizer, PretrainedConfig\\n\\n# Mean pool function\\ndef mean_pooling(model_output: np.ndarray, attention_mask: np.ndarray):\\n    token_embeddings = model_output\\n    input_mask_expanded = np.expand_dims(attention_mask, axis=-1)\\n    input_mask_expanded = np.broadcast_to(input_mask_expanded, token_embeddings.shape)\\n    sum_embeddings = np.sum(token_embeddings * input_mask_expanded, axis=1)\\n    sum_mask = np.clip(np.sum(input_mask_expanded, axis=1), a_min=1e-9, a_max=None)\\n    return sum_embeddings / sum_mask\\n\\n# Load tokenizer and model config\\ntokenizer = AutoTokenizer.from_pretrained(\\'jinaai/jina-embeddings-v3\\')\\nconfig = PretrainedConfig.from_pretrained(\\'jinaai/jina-embeddings-v3\\')\\n\\n# Tokenize input\\ninput_text = tokenizer(\\'sample text\\', return_tensors=\\'np\\')\\n\\n# ONNX session\\nmodel_path = \\'jina-embeddings-v3/onnx/model.onnx\\'\\nsession = onnxruntime.InferenceSession(model_path)\\n\\n# Prepare inputs for ONNX model\\ntask_type = \\'text-matching\\'\\ntask_id = np.array(config.lora_adaptations.index(task_type), dtype=np.int64)\\ninputs = {\\n    \\'input_ids\\': input_text[\\'input_ids\\'],\\n    \\'attention_mask\\': input_text[\\'attention_mask\\'],\\n    \\'task_id\\': task_id\\n}\\n\\n# Run model\\noutputs = session.run(None, inputs)[0]\\n\\n# Apply mean pooling and normalization to the model outputs\\nembeddings = 
mean_pooling(outputs, input_text[\"attention_mask\"])\\nembeddings = embeddings / np.linalg.norm(embeddings, ord=2, axis=1, keepdims=True)\\n```\\n\\n\\n\\n\\n\\n## Contact\\n\\nJoin our  and chat with other community members about ideas.\\n\\n## License\\n\\n`jina-embeddings-v3` is listed on AWS & Azure. If you need to use it beyond those platforms or on-premises within your company, note that the models is licensed under CC BY-NC 4.0. For commercial usage inquiries, feel free to .\\n\\n## Citation\\n\\nIf you find `jina-embeddings-v3` useful in your research, please cite the following paper:\\n\\n```bibtex\\n@misc{sturua2024jinaembeddingsv3multilingualembeddingstask,\\n      title={jina-embeddings-v3: Multilingual Embeddings With Task LoRA}, \\n      author={Saba Sturua and Isabelle Mohr and Mohammad Kalim Akram and Michael Günther and Bo Wang and Markus Krimmel and Feng Wang and Georgios Mastrapas and Andreas Koukounas and Andreas Koukounas and Nan Wang and Han Xiao},\\n      year={2024},\\n      eprint={2409.10173},\\n      archivePrefix={arXiv},\\n      primaryClass={cs.CL},\\n      url={ \\n}\\n\\n```\\n',\n",
       "  'domain': 'feature-extraction'},\n",
       " {'model_id': 'Qwen/Qwen2.5-VL-7B-Instruct',\n",
       "  'created_at': '2025-01-26T09:26:37+00:00',\n",
       "  'downloads': 3285458,\n",
       "  'likes': 1434,\n",
       "  'author': None,\n",
       "  'tags': ['transformers',\n",
       "   'safetensors',\n",
       "   'qwen2_5_vl',\n",
       "   'image-to-text',\n",
       "   'multimodal',\n",
       "   'image-text-to-text',\n",
       "   'conversational',\n",
       "   'en',\n",
       "   'arxiv:2309.00071',\n",
       "   'arxiv:2409.12191',\n",
       "   'arxiv:2308.12966',\n",
       "   'license:apache-2.0',\n",
       "   'text-generation-inference',\n",
       "   'endpoints_compatible',\n",
       "   'deploy:azure',\n",
       "   'region:us'],\n",
       "  'modelcard': '\\n# Qwen2.5-VL-7B-Instruct\\n\\n    \\n\\n\\n## Introduction\\n\\nIn the past five months since Qwen2-VL’s release, numerous developers have built new models on the Qwen2-VL vision-language models, providing us with valuable feedback. During this period, we focused on building more useful vision-language models. Today, we are excited to introduce the latest addition to the Qwen family: Qwen2.5-VL.\\n\\n#### Key Enhancements:\\n* **Understand things visually**: Qwen2.5-VL is not only proficient in recognizing common objects such as flowers, birds, fish, and insects, but it is highly capable of analyzing texts, charts, icons, graphics, and layouts within images.\\n\\n* **Being agentic**: Qwen2.5-VL directly plays as a visual agent that can reason and dynamically direct tools, which is capable of computer use and phone use.\\n\\n* **Understanding long videos and capturing events**: Qwen2.5-VL can comprehend videos of over 1 hour, and this time it has a new ability of cpaturing event by pinpointing the relevant video segments.\\n\\n* **Capable of visual localization in different formats**: Qwen2.5-VL can accurately localize objects in an image by generating bounding boxes or points, and it can provide stable JSON outputs for coordinates and attributes.\\n\\n* **Generating structured outputs**: for data like scans of invoices, forms, tables, etc. Qwen2.5-VL supports structured outputs of their contents, benefiting usages in finance, commerce, etc.\\n\\n\\n#### Model Architecture Updates:\\n\\n* **Dynamic Resolution and Frame Rate Training for Video Understanding**:\\n\\nWe extend dynamic resolution to the temporal dimension by adopting dynamic FPS sampling, enabling the model to comprehend videos at various sampling rates. 
Accordingly, we update mRoPE in the time dimension with IDs and absolute time alignment, enabling the model to learn temporal sequence and speed, and ultimately acquire the ability to pinpoint specific moments.\\n\\n\\n    \\n\\n\\n\\n* **Streamlined and Efficient Vision Encoder**\\n\\nWe enhance both training and inference speeds by strategically implementing window attention into the ViT. The ViT architecture is further optimized with SwiGLU and RMSNorm, aligning it with the structure of the Qwen2.5 LLM.\\n\\n\\nWe have three models with 3, 7 and 72 billion parameters. This repo contains the instruction-tuned 7B Qwen2.5-VL model. For more information, visit our  and .\\n\\n\\n\\n## Evaluation\\n\\n### Image benchmark\\n\\n\\n| Benchmark | InternVL2.5-8B | MiniCPM-o 2.6 | GPT-4o-mini | Qwen2-VL-7B |**Qwen2.5-VL-7B** |\\n| :--- | :---: | :---: | :---: | :---: | :---: |\\n| MMMUval  | 56 | 50.4 | **60**| 54.1 | 58.6|\\n| MMMU-Proval  | 34.3 | - | 37.6| 30.5 | 41.0|\\n| DocVQAtest  | 93 | 93 | - | 94.5 | **95.7** |\\n| InfoVQAtest  | 77.6 | - |  - |76.5 | **82.6** |\\n| ChartQAtest  | 84.8 | - |- | 83.0 |**87.3** |\\n| TextVQAval  | 79.1 | 80.1 | -| 84.3 | **84.9**|\\n| OCRBench | 822 | 852 | 785 | 845 | **864** |\\n| CC_OCR | 57.7 |  | | 61.6 | **77.8**|\\n| MMStar | 62.8| | |60.7| **63.9**|\\n| MMBench-V1.1-Entest  | 79.4 | 78.0 | 76.0| 80.7 | **82.6** |\\n| MMT-Benchtest | - | - | - |**63.7** |63.6 |\\n| MMStar | **61.5** | 57.5 |  54.8 | 60.7 |63.9 |\\n| MMVetGPT-4-Turbo  | 54.2 | 60.0 | 66.9 | 62.0 | **67.1**|\\n| HallBenchavg  | 45.2 | 48.1 | 46.1| 50.6 | **52.9**|\\n| MathVistatestmini  | 58.3 | 60.6 | 52.4 | 58.2 | **68.2**|\\n| MathVision  | - | -  | - | 16.3 | **25.07** |\\n\\n### Video Benchmarks\\n\\n| Benchmark |  Qwen2-VL-7B | **Qwen2.5-VL-7B** |\\n| :--- | :---: | :---: |\\n| MVBench |  67.0 | **69.6** |\\n| PerceptionTesttest  | 66.9 | **70.5** |\\n| Video-MMEwo/w subs   | 63.3/69.0 | **65.1**/**71.6** |\\n| LVBench  |  | 45.3 |\\n| LongVideoBench  |  
| 54.7 |\\n| MMBench-Video | 1.44 | 1.79 |\\n| TempCompass |  | 71.7 |\\n| MLVU |  | 70.2 |\\n| CharadesSTA/mIoU |  43.6|\\n\\n### Agent benchmark\\n| Benchmarks              | Qwen2.5-VL-7B |\\n|-------------------------|---------------|\\n| ScreenSpot              |     84.7    |\\n| ScreenSpot Pro          |     29.0    |\\n| AITZ_EM                 |  \\t81.9    |\\n| Android Control High_EM |    \\t60.1    |\\n| Android Control Low_EM  |  \\t93.7    |\\n| AndroidWorld_SR         | \\t25.5  \\t|\\n| MobileMiniWob++_SR      | \\t91.4    |\\n\\n## Requirements\\nThe code of Qwen2.5-VL has been in the latest Hugging face transformers and we advise you to build from source with command:\\n```\\npip install git+ accelerate\\n```\\nor you might encounter the following error:\\n```\\nKeyError: \\'qwen2_5_vl\\'\\n```\\n\\n\\n## Quickstart\\n\\nBelow, we provide simple examples to show how to use Qwen2.5-VL with 🤖 ModelScope and 🤗 Transformers.\\n\\nThe code of Qwen2.5-VL has been in the latest Hugging face transformers and we advise you to build from source with command:\\n```\\npip install git+ accelerate\\n```\\nor you might encounter the following error:\\n```\\nKeyError: \\'qwen2_5_vl\\'\\n```\\n\\n\\nWe offer a toolkit to help you handle various types of visual input more conveniently, as if you were using an API. This includes base64, URLs, and interleaved images and videos. You can install it using the following command:\\n\\n```bash\\n# It\\'s highly recommanded to use `[decord]` feature for faster video loading.\\npip install qwen-vl-utils[decord]==0.0.8\\n```\\n\\nIf you are not using Linux, you might not be able to install `decord` from PyPI. In that case, you can use `pip install qwen-vl-utils` which will fall back to using torchvision for video processing. 
However, you can still  to get decord used when loading video.\\n\\n### Using 🤗  Transformers to Chat\\n\\nHere we show a code snippet to show you how to use the chat model with `transformers` and `qwen_vl_utils`:\\n\\n```python\\nfrom transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor\\nfrom qwen_vl_utils import process_vision_info\\n\\n# default: Load the model on the available device(s)\\nmodel = Qwen2_5_VLForConditionalGeneration.from_pretrained(\\n    \"Qwen/Qwen2.5-VL-7B-Instruct\", torch_dtype=\"auto\", device_map=\"auto\"\\n)\\n\\n# We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.\\n# model = Qwen2_5_VLForConditionalGeneration.from_pretrained(\\n#     \"Qwen/Qwen2.5-VL-7B-Instruct\",\\n#     torch_dtype=torch.bfloat16,\\n#     attn_implementation=\"flash_attention_2\",\\n#     device_map=\"auto\",\\n# )\\n\\n# default processer\\nprocessor = AutoProcessor.from_pretrained(\"Qwen/Qwen2.5-VL-7B-Instruct\")\\n\\n# The default range for the number of visual tokens per image in the model is 4-16384.\\n# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.\\n# min_pixels = 256*28*28\\n# max_pixels = 1280*28*28\\n# processor = AutoProcessor.from_pretrained(\"Qwen/Qwen2.5-VL-7B-Instruct\", min_pixels=min_pixels, max_pixels=max_pixels)\\n\\nmessages = [\\n    {\\n        \"role\": \"user\",\\n        \"content\": [\\n            {\\n                \"type\": \"image\",\\n                \"image\": \"\\n            },\\n            {\"type\": \"text\", \"text\": \"Describe this image.\"},\\n        ],\\n    }\\n]\\n\\n# Preparation for inference\\ntext = processor.apply_chat_template(\\n    messages, tokenize=False, add_generation_prompt=True\\n)\\nimage_inputs, video_inputs = process_vision_info(messages)\\ninputs = processor(\\n    text=[text],\\n    images=image_inputs,\\n    
videos=video_inputs,\\n    padding=True,\\n    return_tensors=\"pt\",\\n)\\ninputs = inputs.to(\"cuda\")\\n\\n# Inference: Generation of the output\\ngenerated_ids = model.generate(**inputs, max_new_tokens=128)\\ngenerated_ids_trimmed = [\\n    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\\n]\\noutput_text = processor.batch_decode(\\n    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False\\n)\\nprint(output_text)\\n```\\n\\nMulti image inference\\n\\n```python\\n# Messages containing multiple images and a text query\\nmessages = [\\n    {\\n        \"role\": \"user\",\\n        \"content\": [\\n            {\"type\": \"image\", \"image\": \"file:///path/to/image1.jpg\"},\\n            {\"type\": \"image\", \"image\": \"file:///path/to/image2.jpg\"},\\n            {\"type\": \"text\", \"text\": \"Identify the similarities between these images.\"},\\n        ],\\n    }\\n]\\n\\n# Preparation for inference\\ntext = processor.apply_chat_template(\\n    messages, tokenize=False, add_generation_prompt=True\\n)\\nimage_inputs, video_inputs = process_vision_info(messages)\\ninputs = processor(\\n    text=[text],\\n    images=image_inputs,\\n    videos=video_inputs,\\n    padding=True,\\n    return_tensors=\"pt\",\\n)\\ninputs = inputs.to(\"cuda\")\\n\\n# Inference\\ngenerated_ids = model.generate(**inputs, max_new_tokens=128)\\ngenerated_ids_trimmed = [\\n    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\\n]\\noutput_text = processor.batch_decode(\\n    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False\\n)\\nprint(output_text)\\n```\\n\\n\\n\\nVideo inference\\n\\n```python\\n# Messages containing a images list as a video and a text query\\nmessages = [\\n    {\\n        \"role\": \"user\",\\n        \"content\": [\\n            {\\n                \"type\": \"video\",\\n                \"video\": [\\n                    
\"file:///path/to/frame1.jpg\",\\n                    \"file:///path/to/frame2.jpg\",\\n                    \"file:///path/to/frame3.jpg\",\\n                    \"file:///path/to/frame4.jpg\",\\n                ],\\n            },\\n            {\"type\": \"text\", \"text\": \"Describe this video.\"},\\n        ],\\n    }\\n]\\n\\n# Messages containing a local video path and a text query\\nmessages = [\\n    {\\n        \"role\": \"user\",\\n        \"content\": [\\n            {\\n                \"type\": \"video\",\\n                \"video\": \"file:///path/to/video1.mp4\",\\n                \"max_pixels\": 360 * 420,\\n                \"fps\": 1.0,\\n            },\\n            {\"type\": \"text\", \"text\": \"Describe this video.\"},\\n        ],\\n    }\\n]\\n\\n# Messages containing a video url and a text query\\nmessages = [\\n    {\\n        \"role\": \"user\",\\n        \"content\": [\\n            {\\n                \"type\": \"video\",\\n                \"video\": \"\\n            },\\n            {\"type\": \"text\", \"text\": \"Describe this video.\"},\\n        ],\\n    }\\n]\\n\\n#In Qwen 2.5 VL, frame rate information is also input into the model to align with absolute time.\\n# Preparation for inference\\ntext = processor.apply_chat_template(\\n    messages, tokenize=False, add_generation_prompt=True\\n)\\nimage_inputs, video_inputs, video_kwargs = process_vision_info(messages, return_video_kwargs=True)\\ninputs = processor(\\n    text=[text],\\n    images=image_inputs,\\n    videos=video_inputs,\\n    fps=fps,\\n    padding=True,\\n    return_tensors=\"pt\",\\n    **video_kwargs,\\n)\\ninputs = inputs.to(\"cuda\")\\n\\n# Inference\\ngenerated_ids = model.generate(**inputs, max_new_tokens=128)\\ngenerated_ids_trimmed = [\\n    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\\n]\\noutput_text = processor.batch_decode(\\n    generated_ids_trimmed, skip_special_tokens=True, 
clean_up_tokenization_spaces=False\\n)\\nprint(output_text)\\n```\\n\\nVideo URL compatibility largely depends on the third-party library version. The details are in the table below. change the backend by `FORCE_QWENVL_VIDEO_READER=torchvision` or `FORCE_QWENVL_VIDEO_READER=decord` if you prefer not to use the default one.\\n\\n| Backend     | HTTP | HTTPS |\\n|-------------|------|-------|\\n| torchvision >= 0.19.0 | ✅  | ✅   |\\n| torchvision \\n\\n\\nBatch inference\\n\\n```python\\n# Sample messages for batch inference\\nmessages1 = [\\n    {\\n        \"role\": \"user\",\\n        \"content\": [\\n            {\"type\": \"image\", \"image\": \"file:///path/to/image1.jpg\"},\\n            {\"type\": \"image\", \"image\": \"file:///path/to/image2.jpg\"},\\n            {\"type\": \"text\", \"text\": \"What are the common elements in these pictures?\"},\\n        ],\\n    }\\n]\\nmessages2 = [\\n    {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\\n    {\"role\": \"user\", \"content\": \"Who are you?\"},\\n]\\n# Combine messages for batch processing\\nmessages = [messages1, messages2]\\n\\n# Preparation for batch inference\\ntexts = [\\n    processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)\\n    for msg in messages\\n]\\nimage_inputs, video_inputs = process_vision_info(messages)\\ninputs = processor(\\n    text=texts,\\n    images=image_inputs,\\n    videos=video_inputs,\\n    padding=True,\\n    return_tensors=\"pt\",\\n)\\ninputs = inputs.to(\"cuda\")\\n\\n# Batch Inference\\ngenerated_ids = model.generate(**inputs, max_new_tokens=128)\\ngenerated_ids_trimmed = [\\n    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\\n]\\noutput_texts = processor.batch_decode(\\n    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False\\n)\\nprint(output_texts)\\n```\\n\\n\\n### 🤖 ModelScope\\nWe strongly advise users especially those in mainland China to use 
ModelScope. `snapshot_download` can help you solve issues concerning downloading checkpoints.\\n\\n\\n### More Usage Tips\\n\\nFor input images, we support local files, base64, and URLs. For videos, we currently only support local files.\\n\\n```python\\n# You can directly insert a local file path, a URL, or a base64-encoded image into the position where you want in the text.\\n## Local file path\\nmessages = [\\n    {\\n        \"role\": \"user\",\\n        \"content\": [\\n            {\"type\": \"image\", \"image\": \"file:///path/to/your/image.jpg\"},\\n            {\"type\": \"text\", \"text\": \"Describe this image.\"},\\n        ],\\n    }\\n]\\n## Image URL\\nmessages = [\\n    {\\n        \"role\": \"user\",\\n        \"content\": [\\n            {\"type\": \"image\", \"image\": \"\\n            {\"type\": \"text\", \"text\": \"Describe this image.\"},\\n        ],\\n    }\\n]\\n## Base64 encoded image\\nmessages = [\\n    {\\n        \"role\": \"user\",\\n        \"content\": [\\n            {\"type\": \"image\", \"image\": \"data:image;base64,/9j/...\"},\\n            {\"type\": \"text\", \"text\": \"Describe this image.\"},\\n        ],\\n    }\\n]\\n```\\n#### Image Resolution for performance boost\\n\\nThe model supports a wide range of resolution inputs. By default, it uses the native resolution for input, but higher resolutions can enhance performance at the cost of more computation. Users can set the minimum and maximum number of pixels to achieve an optimal configuration for their needs, such as a token count range of 256-1280, to balance speed and memory usage.\\n\\n```python\\nmin_pixels = 256 * 28 * 28\\nmax_pixels = 1280 * 28 * 28\\nprocessor = AutoProcessor.from_pretrained(\\n    \"Qwen/Qwen2.5-VL-7B-Instruct\", min_pixels=min_pixels, max_pixels=max_pixels\\n)\\n```\\n\\nBesides, We provide two methods for fine-grained control over the image size input to the model:\\n\\n1. 
Define min_pixels and max_pixels: Images will be resized to maintain their aspect ratio within the range of min_pixels and max_pixels.\\n   \\n2. Specify exact dimensions: Directly set `resized_height` and `resized_width`. These values will be rounded to the nearest multiple of 28.\\n\\n```python\\n# min_pixels and max_pixels\\nmessages = [\\n    {\\n        \"role\": \"user\",\\n        \"content\": [\\n            {\\n                \"type\": \"image\",\\n                \"image\": \"file:///path/to/your/image.jpg\",\\n                \"resized_height\": 280,\\n                \"resized_width\": 420,\\n            },\\n            {\"type\": \"text\", \"text\": \"Describe this image.\"},\\n        ],\\n    }\\n]\\n# resized_height and resized_width\\nmessages = [\\n    {\\n        \"role\": \"user\",\\n        \"content\": [\\n            {\\n                \"type\": \"image\",\\n                \"image\": \"file:///path/to/your/image.jpg\",\\n                \"min_pixels\": 50176,\\n                \"max_pixels\": 50176,\\n            },\\n            {\"type\": \"text\", \"text\": \"Describe this image.\"},\\n        ],\\n    }\\n]\\n```\\n\\n### Processing Long Texts\\n\\nThe current `config.json` is set for context length up to 32,768 tokens.\\nTo handle extensive inputs exceeding 32,768 tokens, we utilize , a technique for enhancing model length extrapolation, ensuring optimal performance on lengthy texts.\\n\\nFor supported frameworks, you could add the following to `config.json` to enable YaRN:\\n\\n{\\n\\t...,\\n    \"type\": \"yarn\",\\n    \"mrope_section\": [\\n        16,\\n        24,\\n        24\\n    ],\\n    \"factor\": 4,\\n    \"original_max_position_embeddings\": 32768\\n}\\n\\nHowever, it should be noted that this method has a significant impact on the performance of temporal and spatial localization tasks, and is therefore not recommended for use.\\n\\nAt the same time, for long video inputs, since MRoPE itself is more economical with ids, 
the max_position_embeddings can be directly modified to a larger value, such as 64k.\\n\\n\\n\\n\\n## Citation\\n\\nIf you find our work helpful, feel free to give us a cite.\\n\\n```\\n@misc{qwen2.5-VL,\\n    title = {Qwen2.5-VL},\\n    url = {\\n    author = {Qwen Team},\\n    month = {January},\\n    year = {2025}\\n}\\n\\n@article{Qwen2VL,\\n  title={Qwen2-VL: Enhancing Vision-Language Model\\'s Perception of the World at Any Resolution},\\n  author={Wang, Peng and Bai, Shuai and Tan, Sinan and Wang, Shijie and Fan, Zhihao and Bai, Jinze and Chen, Keqin and Liu, Xuejing and Wang, Jialin and Ge, Wenbin and Fan, Yang and Dang, Kai and Du, Mengfei and Ren, Xuancheng and Men, Rui and Liu, Dayiheng and Zhou, Chang and Zhou, Jingren and Lin, Junyang},\\n  journal={arXiv preprint arXiv:2409.12191},\\n  year={2024}\\n}\\n\\n@article{Qwen-VL,\\n  title={Qwen-VL: A Versatile Vision-Language Model for Understanding, Localization, Text Reading, and Beyond},\\n  author={Bai, Jinze and Bai, Shuai and Yang, Shusheng and Wang, Shijie and Tan, Sinan and Wang, Peng and Lin, Junyang and Zhou, Chang and Zhou, Jingren},\\n  journal={arXiv preprint arXiv:2308.12966},\\n  year={2023}\\n}\\n```\\n',\n",
       "  'domain': 'image-to-text'},\n",
       " {'model_id': 'openai/whisper-large-v3-turbo',\n",
       "  'created_at': '2024-10-01T07:39:28+00:00',\n",
       "  'downloads': 2822997,\n",
       "  'likes': 2787,\n",
       "  'author': None,\n",
       "  'tags': ['transformers',\n",
       "   'safetensors',\n",
       "   'whisper',\n",
       "   'automatic-speech-recognition',\n",
       "   'audio',\n",
       "   'en',\n",
       "   'zh',\n",
       "   'de',\n",
       "   'es',\n",
       "   'ru',\n",
       "   'ko',\n",
       "   'fr',\n",
       "   'ja',\n",
       "   'pt',\n",
       "   'tr',\n",
       "   'pl',\n",
       "   'ca',\n",
       "   'nl',\n",
       "   'ar',\n",
       "   'sv',\n",
       "   'it',\n",
       "   'id',\n",
       "   'hi',\n",
       "   'fi',\n",
       "   'vi',\n",
       "   'he',\n",
       "   'uk',\n",
       "   'el',\n",
       "   'ms',\n",
       "   'cs',\n",
       "   'ro',\n",
       "   'da',\n",
       "   'hu',\n",
       "   'ta',\n",
       "   'no',\n",
       "   'th',\n",
       "   'ur',\n",
       "   'hr',\n",
       "   'bg',\n",
       "   'lt',\n",
       "   'la',\n",
       "   'mi',\n",
       "   'ml',\n",
       "   'cy',\n",
       "   'sk',\n",
       "   'te',\n",
       "   'fa',\n",
       "   'lv',\n",
       "   'bn',\n",
       "   'sr',\n",
       "   'az',\n",
       "   'sl',\n",
       "   'kn',\n",
       "   'et',\n",
       "   'mk',\n",
       "   'br',\n",
       "   'eu',\n",
       "   'is',\n",
       "   'hy',\n",
       "   'ne',\n",
       "   'mn',\n",
       "   'bs',\n",
       "   'kk',\n",
       "   'sq',\n",
       "   'sw',\n",
       "   'gl',\n",
       "   'mr',\n",
       "   'pa',\n",
       "   'si',\n",
       "   'km',\n",
       "   'sn',\n",
       "   'yo',\n",
       "   'so',\n",
       "   'af',\n",
       "   'oc',\n",
       "   'ka',\n",
       "   'be',\n",
       "   'tg',\n",
       "   'sd',\n",
       "   'gu',\n",
       "   'am',\n",
       "   'yi',\n",
       "   'lo',\n",
       "   'uz',\n",
       "   'fo',\n",
       "   'ht',\n",
       "   'ps',\n",
       "   'tk',\n",
       "   'nn',\n",
       "   'mt',\n",
       "   'sa',\n",
       "   'lb',\n",
       "   'my',\n",
       "   'bo',\n",
       "   'tl',\n",
       "   'mg',\n",
       "   'as',\n",
       "   'tt',\n",
       "   'haw',\n",
       "   'ln',\n",
       "   'ha',\n",
       "   'ba',\n",
       "   'jw',\n",
       "   'su',\n",
       "   'arxiv:2212.04356',\n",
       "   'base_model:openai/whisper-large-v3',\n",
       "   'base_model:finetune:openai/whisper-large-v3',\n",
       "   'license:mit',\n",
       "   'endpoints_compatible',\n",
       "   'region:us'],\n",
       "  'modelcard': '\\n# Whisper\\n\\nWhisper is a state-of-the-art model for automatic speech recognition (ASR) and speech translation, proposed in the paper \\n by Alec Radford \\net al. from OpenAI. Trained on >5M hours of labeled data, Whisper demonstrates a strong ability to generalise to many \\ndatasets and domains in a zero-shot setting.\\n\\nWhisper large-v3-turbo is a finetuned version of a pruned . In other words, it\\'s the exact same model, except that the number of decoding layers have reduced from 32 to 4.\\nAs a result, the model is way faster, at the expense of a minor quality degradation. You can find more details about it .\\n\\n**Disclaimer**: Content for this model card has partly been written by the 🤗 Hugging Face team, and partly copied and \\npasted from the original model card.\\n\\n## Usage\\n\\nWhisper large-v3-turbo is supported in Hugging Face 🤗 Transformers. To run the model, first install the Transformers \\nlibrary. For this example, we\\'ll also install 🤗 Datasets to load toy audio dataset from the Hugging Face Hub, and \\n🤗 Accelerate to reduce the model loading time:\\n\\n```bash\\npip install --upgrade pip\\npip install --upgrade transformers datasets[audio] accelerate\\n```\\n\\nThe model can be used with the \\nclass to transcribe audios of arbitrary length:\\n\\n```python\\nimport torch\\nfrom transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline\\nfrom datasets import load_dataset\\n\\n\\ndevice = \"cuda:0\" if torch.cuda.is_available() else \"cpu\"\\ntorch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32\\n\\nmodel_id = \"openai/whisper-large-v3-turbo\"\\n\\nmodel = AutoModelForSpeechSeq2Seq.from_pretrained(\\n    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True\\n)\\nmodel.to(device)\\n\\nprocessor = AutoProcessor.from_pretrained(model_id)\\n\\npipe = pipeline(\\n    \"automatic-speech-recognition\",\\n    model=model,\\n    
tokenizer=processor.tokenizer,\\n    feature_extractor=processor.feature_extractor,\\n    torch_dtype=torch_dtype,\\n    device=device,\\n)\\n\\ndataset = load_dataset(\"distil-whisper/librispeech_long\", \"clean\", split=\"validation\")\\nsample = dataset[0][\"audio\"]\\n\\nresult = pipe(sample)\\nprint(result[\"text\"])\\n```\\n\\nTo transcribe a local audio file, simply pass the path to your audio file when you call the pipeline:\\n\\n```python\\nresult = pipe(\"audio.mp3\")\\n```\\n\\nMultiple audio files can be transcribed in parallel by specifying them as a list and setting the `batch_size` parameter:\\n\\n```python\\nresult = pipe([\"audio_1.mp3\", \"audio_2.mp3\"], batch_size=2)\\n```\\n\\nTransformers is compatible with all Whisper decoding strategies, such as temperature fallback and condition on previous \\ntokens. The following example demonstrates how to enable these heuristics:\\n\\n```python\\ngenerate_kwargs = {\\n    \"max_new_tokens\": 448,\\n    \"num_beams\": 1,\\n    \"condition_on_prev_tokens\": False,\\n    \"compression_ratio_threshold\": 1.35,  # zlib compression ratio threshold (in token space)\\n    \"temperature\": (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),\\n    \"logprob_threshold\": -1.0,\\n    \"no_speech_threshold\": 0.6,\\n    \"return_timestamps\": True,\\n}\\n\\nresult = pipe(sample, generate_kwargs=generate_kwargs)\\n```\\n\\nWhisper predicts the language of the source audio automatically. If the source audio language is known *a-priori*, it \\ncan be passed as an argument to the pipeline:\\n\\n```python\\nresult = pipe(sample, generate_kwargs={\"language\": \"english\"})\\n```\\n\\nBy default, Whisper performs the task of *speech transcription*, where the source audio language is the same as the target\\ntext language. 
To perform *speech translation*, where the target text is in English, set the task to `\"translate\"`:\\n\\n```python\\nresult = pipe(sample, generate_kwargs={\"task\": \"translate\"})\\n```\\n\\nFinally, the model can be made to predict timestamps. For sentence-level timestamps, pass the `return_timestamps` argument:\\n\\n```python\\nresult = pipe(sample, return_timestamps=True)\\nprint(result[\"chunks\"])\\n```\\n\\nAnd for word-level timestamps:\\n\\n```python\\nresult = pipe(sample, return_timestamps=\"word\")\\nprint(result[\"chunks\"])\\n```\\n\\nThe above arguments can be used in isolation or in combination. For example, to perform the task of speech transcription \\nwhere the source audio is in French, and we want to return sentence-level timestamps, the following can be used:\\n\\n```python\\nresult = pipe(sample, return_timestamps=True, generate_kwargs={\"language\": \"french\", \"task\": \"translate\"})\\nprint(result[\"chunks\"])\\n```\\n\\n\\n\\n For more control over the generation parameters, use the model + processor API directly: \\n\\n```python\\nimport torch\\nfrom transformers import AutoModelForSpeechSeq2Seq, AutoProcessor\\nfrom datasets import Audio, load_dataset\\n\\n\\ndevice = \"cuda:0\" if torch.cuda.is_available() else \"cpu\"\\ntorch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32\\n\\nmodel_id = \"openai/whisper-large-v3-turbo\"\\n\\nmodel = AutoModelForSpeechSeq2Seq.from_pretrained(\\n    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True\\n)\\nmodel.to(device)\\n\\nprocessor = AutoProcessor.from_pretrained(model_id)\\n\\ndataset = load_dataset(\"hf-internal-testing/librispeech_asr_dummy\", \"clean\", split=\"validation\")\\ndataset = dataset.cast_column(\"audio\", Audio(processor.feature_extractor.sampling_rate))\\nsample = dataset[0][\"audio\"]\\n\\ninputs = processor(\\n    sample[\"array\"],\\n    sampling_rate=sample[\"sampling_rate\"],\\n    return_tensors=\"pt\",\\n    truncation=False,\\n    
padding=\"longest\",\\n    return_attention_mask=True,\\n)\\ninputs = inputs.to(device, dtype=torch_dtype)\\n\\ngen_kwargs = {\\n    \"max_new_tokens\": 448,\\n    \"num_beams\": 1,\\n    \"condition_on_prev_tokens\": False,\\n    \"compression_ratio_threshold\": 1.35,  # zlib compression ratio threshold (in token space)\\n    \"temperature\": (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),\\n    \"logprob_threshold\": -1.0,\\n    \"no_speech_threshold\": 0.6,\\n    \"return_timestamps\": True,\\n}\\n\\npred_ids = model.generate(**inputs, **gen_kwargs)\\npred_text = processor.batch_decode(pred_ids, skip_special_tokens=True, decode_with_timestamps=False)\\n\\nprint(pred_text)\\n```\\n\\n\\n\\n## Additional Speed & Memory Improvements\\n\\nYou can apply additional speed and memory improvements to Whisper to further reduce the inference speed and VRAM \\nrequirements.\\n\\n### Chunked Long-Form\\n\\nWhisper has a receptive field of 30-seconds. To transcribe audios longer than this, one of two long-form algorithms are\\nrequired:\\n1. **Sequential:** uses a \"sliding window\" for buffered inference, transcribing 30-second slices one after the other\\n2. **Chunked:** splits long audio files into shorter ones (with a small overlap between segments), transcribes each segment independently, and stitches the resulting transcriptions at the boundaries\\n\\nThe sequential long-form algorithm should be used in either of the following scenarios:\\n1. Transcription accuracy is the most important factor, and speed is less of a consideration\\n2. You are transcribing **batches** of long audio files, in which case the latency of sequential is comparable to chunked, while being up to 0.5% WER more accurate\\n\\nConversely, the chunked algorithm should be used when:\\n1. Transcription speed is the most important factor\\n2. You are transcribing a **single** long audio file\\n\\nBy default, Transformers uses the sequential algorithm. 
To enable the chunked algorithm, pass the `chunk_length_s` \\nparameter to the `pipeline`. For large-v3, a chunk length of 30-seconds is optimal. To activate batching over long \\naudio files, pass the argument `batch_size`:\\n\\n```python\\nimport torch\\nfrom transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline\\nfrom datasets import load_dataset\\n\\n\\ndevice = \"cuda:0\" if torch.cuda.is_available() else \"cpu\"\\ntorch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32\\n\\nmodel_id = \"openai/whisper-large-v3-turbo\"\\n\\nmodel = AutoModelForSpeechSeq2Seq.from_pretrained(\\n    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True\\n)\\nmodel.to(device)\\n\\nprocessor = AutoProcessor.from_pretrained(model_id)\\n\\npipe = pipeline(\\n    \"automatic-speech-recognition\",\\n    model=model,\\n    tokenizer=processor.tokenizer,\\n    feature_extractor=processor.feature_extractor,\\n    chunk_length_s=30,\\n    batch_size=16,  # batch size for inference - set based on your device\\n    torch_dtype=torch_dtype,\\n    device=device,\\n)\\n\\ndataset = load_dataset(\"distil-whisper/librispeech_long\", \"clean\", split=\"validation\")\\nsample = dataset[0][\"audio\"]\\n\\nresult = pipe(sample)\\nprint(result[\"text\"])\\n```\\n\\n#### Torch compile\\n\\nThe Whisper forward pass is compatible with \\nfor 4.5x speed-ups.\\n\\n**Note:** `torch.compile` is currently not compatible with the Chunked long-form algorithm or Flash Attention 2 ⚠️\\n\\n```python\\nimport torch\\nfrom torch.nn.attention import SDPBackend, sdpa_kernel\\nfrom transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline\\nfrom datasets import load_dataset\\nfrom tqdm import tqdm\\n\\ntorch.set_float32_matmul_precision(\"high\")\\n\\ndevice = \"cuda:0\" if torch.cuda.is_available() else \"cpu\"\\ntorch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32\\n\\nmodel_id = \"openai/whisper-large-v3-turbo\"\\n\\nmodel = 
AutoModelForSpeechSeq2Seq.from_pretrained(\\n    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True\\n).to(device)\\n\\n# Enable static cache and compile the forward pass\\nmodel.generation_config.cache_implementation = \"static\"\\nmodel.generation_config.max_new_tokens = 256\\nmodel.forward = torch.compile(model.forward, mode=\"reduce-overhead\", fullgraph=True)\\n\\nprocessor = AutoProcessor.from_pretrained(model_id)\\n\\npipe = pipeline(\\n    \"automatic-speech-recognition\",\\n    model=model,\\n    tokenizer=processor.tokenizer,\\n    feature_extractor=processor.feature_extractor,\\n    torch_dtype=torch_dtype,\\n    device=device,\\n)\\n\\ndataset = load_dataset(\"distil-whisper/librispeech_long\", \"clean\", split=\"validation\")\\nsample = dataset[0][\"audio\"]\\n\\n# 2 warmup steps\\nfor _ in tqdm(range(2), desc=\"Warm-up step\"):\\n    with sdpa_kernel(SDPBackend.MATH):\\n        result = pipe(sample.copy(), generate_kwargs={\"min_new_tokens\": 256, \"max_new_tokens\": 256})\\n\\n# fast run\\nwith sdpa_kernel(SDPBackend.MATH):\\n    result = pipe(sample.copy())\\n\\nprint(result[\"text\"])\\n```\\n\\n#### Flash Attention 2\\n\\nWe recommend using  if your GPU supports it and you are not using . \\nTo do so, first install :\\n\\n```\\npip install flash-attn --no-build-isolation\\n```\\n\\nThen pass `attn_implementation=\"flash_attention_2\"` to `from_pretrained`:\\n\\n```python\\nmodel = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, attn_implementation=\"flash_attention_2\")\\n```\\n\\n#### Torch Scale-Product-Attention (SDPA)\\n\\nIf your GPU does not support Flash Attention, we recommend making use of PyTorch . \\nThis attention implementation is activated **by default** for PyTorch versions 2.1.1 or greater. 
To check \\nwhether you have a compatible PyTorch version, run the following Python code snippet:\\n\\n```python\\nfrom transformers.utils import is_torch_sdpa_available\\n\\nprint(is_torch_sdpa_available())\\n```\\n\\nIf the above returns `True`, you have a valid version of PyTorch installed and SDPA is activated by default. If it \\nreturns `False`, you need to upgrade your PyTorch version according to the \\n\\nOnce a valid PyTorch version is installed, SDPA is activated by default. It can also be set explicitly by specifying \\n`attn_implementation=\"sdpa\"` as follows:\\n\\n```python\\nmodel = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, attn_implementation=\"sdpa\")\\n```\\n\\nFor more information about how to use the SDPA refer to the .\\n\\n\\n## Model details\\n\\nWhisper is a Transformer based encoder-decoder model, also referred to as a _sequence-to-sequence_ model. There are two\\nflavours of Whisper model: English-only and multilingual. The English-only models were trained on the task of English \\nspeech recognition. The multilingual models were trained simultaneously on multilingual speech recognition and speech \\ntranslation. For speech recognition, the model predicts transcriptions in the *same* language as the audio. For speech \\ntranslation, the model predicts transcriptions to a *different* language to the audio.\\n\\nWhisper checkpoints come in five configurations of varying model sizes. The smallest four are available as English-only \\nand multilingual. The largest checkpoints are multilingual only. All ten of the pre-trained checkpoints \\nare available on the . 
The \\ncheckpoints are summarised in the following table with links to the models on the Hub:\\n\\n| Size     | Parameters | English-only                                         | Multilingual                                        |\\n|----------|------------|------------------------------------------------------|-----------------------------------------------------|\\n| tiny     | 39 M       |    |      |\\n| base     | 74 M       |    |      |\\n| small    | 244 M      |   |     |\\n| medium   | 769 M      |  |    |\\n| large    | 1550 M     | x                                                    |     |\\n| large-v2 | 1550 M     | x                                                    |  |\\n| large-v3 | 1550 M     | x                                                    |  |\\n| large-v3-turbo | 809 M     | x                                                    |  |\\n\\n\\n## Fine-Tuning\\n\\nThe pre-trained Whisper model demonstrates a strong ability to generalise to different datasets and domains. However, \\nits predictive capabilities can be improved further for certain languages and tasks through *fine-tuning*. The blog \\npost  provides a step-by-step \\nguide to fine-tuning the Whisper model with as little as 5 hours of labelled data.\\n\\n### Evaluated Use\\n\\nThe primary intended users of these models are AI researchers studying robustness, generalization, capabilities, biases, and constraints of the current model. However, Whisper is also potentially quite useful as an ASR solution for developers, especially for English speech recognition. We recognize that once models are released, it is impossible to restrict access to only “intended” uses or to draw reasonable guidelines around what is or is not research.\\n\\nThe models are primarily trained and evaluated on ASR and speech translation to English tasks. They show strong ASR results in ~10 languages. 
They may exhibit additional capabilities, particularly if fine-tuned on certain tasks like voice activity detection, speaker classification, or speaker diarization but have not been robustly evaluated in these areas. We strongly recommend that users perform robust evaluations of the models in a particular context and domain before deploying them.\\n\\nIn particular, we caution against using Whisper models to transcribe recordings of individuals taken without their consent or purporting to use these models for any kind of subjective classification. We recommend against use in high-risk domains like decision-making contexts, where flaws in accuracy can lead to pronounced flaws in outcomes. The models are intended to transcribe and translate speech, use of the model for classification is not only not evaluated but also not appropriate, particularly to infer human attributes.\\n\\n\\n## Training Data\\n\\nNo information provided.\\n\\n## Performance and Limitations\\n\\nOur studies show that, over many existing ASR systems, the models exhibit improved robustness to accents, background noise, technical language, as well as zero shot translation from multiple languages into English; and that accuracy on speech recognition and translation is near the state-of-the-art level. \\n\\nHowever, because the models are trained in a weakly supervised manner using large-scale noisy data, the predictions may include texts that are not actually spoken in the audio input (i.e. hallucination). We hypothesize that this happens because, given their general knowledge of language, the models combine trying to predict the next word in audio with trying to transcribe the audio itself.\\n\\nOur models perform unevenly across languages, and we observe lower accuracy on low-resource and/or low-discoverability languages or languages where we have less training data. 
The models also exhibit disparate performance on different accents and dialects of particular languages, which may include higher word error rate across speakers of different genders, races, ages, or other demographic criteria. Our full evaluation results are presented in . \\n\\nIn addition, the sequence-to-sequence architecture of the model makes it prone to generating repetitive texts, which can be mitigated to some degree by beam search and temperature scheduling but not perfectly. Further analysis on these limitations are provided in . It is likely that this behavior and hallucinations may be worse on lower-resource and/or lower-discoverability languages.\\n\\n\\n## Broader Implications\\n\\nWe anticipate that Whisper models’ transcription capabilities may be used for improving accessibility tools. While Whisper models cannot be used for real-time transcription out of the box – their speed and size suggest that others may be able to build applications on top of them that allow for near-real-time speech recognition and translation. The real value of beneficial applications built on top of Whisper models suggests that the disparate performance of these models may have real economic implications.\\n\\nThere are also potential dual use concerns that come with releasing Whisper. While we hope the technology will be used primarily for beneficial purposes, making ASR technology more accessible could enable more actors to build capable surveillance technologies or scale up existing surveillance efforts, as the speed and accuracy allow for affordable automatic transcription and translation of large volumes of audio communication. Moreover, these models may have some capabilities to recognize specific individuals out of the box, which in turn presents safety concerns related both to dual use and disparate performance. 
In practice, we expect that the cost of transcription is not the limiting factor of scaling up surveillance projects.\\n\\n\\n### BibTeX entry and citation info\\n```bibtex\\n@misc{radford2022whisper,\\n  doi = {10.48550/ARXIV.2212.04356},\\n  url = {\\n  author = {Radford, Alec and Kim, Jong Wook and Xu, Tao and Brockman, Greg and McLeavey, Christine and Sutskever, Ilya},\\n  title = {Robust Speech Recognition via Large-Scale Weak Supervision},\\n  publisher = {arXiv},\\n  year = {2022},\\n  copyright = {arXiv.org perpetual, non-exclusive license}\\n}\\n```',\n",
       "  'domain': 'automatic-speech-recognition'},\n",
       " {'model_id': 'Lightricks/LTX-2',\n",
       "  'created_at': '2026-01-03T10:23:39+00:00',\n",
       "  'downloads': 2382172,\n",
       "  'likes': 1332,\n",
       "  'author': None,\n",
       "  'tags': ['diffusers',\n",
       "   'safetensors',\n",
       "   'image-to-video',\n",
       "   'text-to-video',\n",
       "   'video-to-video',\n",
       "   'image-text-to-video',\n",
       "   'audio-to-video',\n",
       "   'text-to-audio',\n",
       "   'video-to-audio',\n",
       "   'audio-to-audio',\n",
       "   'text-to-audio-video',\n",
       "   'image-to-audio-video',\n",
       "   'image-text-to-audio-video',\n",
       "   'ltx-2',\n",
       "   'ltx-video',\n",
       "   'ltxv',\n",
       "   'lightricks',\n",
       "   'en',\n",
       "   'de',\n",
       "   'es',\n",
       "   'fr',\n",
       "   'ja',\n",
       "   'ko',\n",
       "   'zh',\n",
       "   'it',\n",
       "   'pt',\n",
       "   'arxiv:2601.03233',\n",
       "   'license:other',\n",
       "   'diffusers:LTX2Pipeline',\n",
       "   'region:us'],\n",
       "  'modelcard': \"\\n# LTX-2 Model Card\\n\\nThis model card focuses on the LTX-2 model, as presented in the paper . The codebase is available .\\n\\nLTX-2 is a DiT-based audio-video foundation model designed to generate synchronized video and audio within a single model. It brings together the core building blocks of modern video generation, with open weights and a focus on practical, local execution. \\n\\n\\n\\n# Model Checkpoints\\n\\n| Name                           | Notes                                                                                                          |\\n|--------------------------------|----------------------------------------------------------------------------------------------------------------|\\n| ltx-2-19b-dev                  | The full model, flexible and trainable in bf16                                                                 |\\n| ltx-2-19b-dev-fp8              | The full model in fp8 quantization                                                                             |\\n| ltx-2-19b-dev-fp4              | The full model in nvfp4 quantization                                                                           | \\n| ltx-2-19b-distilled            | The distilled version of the full model, 8 steps, CFG=1                                                        |\\n| ltx-2-19b-distilled-lora-384   | A LoRA version of the distilled model applicable to the full model                                             |\\n| ltx-2-spatial-upscaler-x2-1.0  | An x2 spatial upscaler for the ltx-2 latents, used in multi stage (multiscale) pipelines for higher resolution |\\n| ltx-2-temporal-upscaler-x2-1.0 | An x2 temporal upscaler for the ltx-2 latents, used in multi stage (multiscale) pipelines for higher FPS       |\\n\\n## Model Details\\n- **Developed by:** Lightricks\\n- **Model type:** Diffusion-based audio-video foundation model\\n- **Language(s):** English\\n\\n# Online demo\\nLTX-2 is accessible right away 
via the following links:\\n- \\n- \\n\\n# Run locally\\n\\n## Direct use license\\nYou can use the models - full, distilled, upscalers and any derivatives of the models - for purposes under the .\\n\\n## ComfyUI\\nWe recommend you use the built-in LTXVideo nodes that can be found in the ComfyUI Manager. \\nFor manual installation information, please refer to our .\\n\\n## PyTorch codebase\\n\\nThe  is a monorepo with several packages. From model definition in 'ltx-core' to pipelines in 'ltx-pipelines' and training capabilities in 'ltx-trainer'.\\nThe codebase was tested with Python >=3.12, CUDA version >12.7, and supports PyTorch ~= 2.7.\\n\\n### Installation\\n\\n```bash\\ngit clone \\ncd LTX-2\\n\\n# From the repository root\\nuv sync\\nsource .venv/bin/activate\\n```\\n\\n### Inference\\n\\nTo use our model, please follow the instructions in our  package.\\n\\n## Diffusers 🧨\\n\\nLTX-2 is supported in the  for image-to-video generation.\\n\\n## General tips:\\n* Width & height settings must be divisible by 32. Frame count must be divisible by 8 + 1. 
\\n* In case the resolution or number of frames are not divisible by 32 or 8 + 1, the input should be padded with -1 and then cropped to the desired resolution and number of frames.\\n* For tips on writing effective prompts, please visit our  \\n\\n### Limitations\\n- This model is not intended or able to provide factual information.\\n- As a statistical model this checkpoint might amplify existing societal biases.\\n- The model may fail to generate videos that matches the prompts perfectly.\\n- Prompt following is heavily influenced by the prompting-style.\\n- The model may generate content that is inappropriate or offensive.\\n- When generating audio without speech, the audio may be of lower quality.\\n\\n# Train the model\\n\\nThe base (dev) model is fully trainable.\\n\\nIt's extremely easy to reproduce the LoRAs and IC-LoRAs we publish with the model by following the instructions on the .\\n\\nTraining for motion, style or likeness (sound+appearance) can take less than an hour in many settings.\\n\\n## Citation\\n\\n```bibtex\\n@article{hacohen2025ltx2,\\n  title={LTX-2: Efficient Joint Audio-Visual Foundation Model},\\n  author={HaCohen, Yoav and Brazowski, Benny and Chiprut, Nisan and Bitterman, Yaki and Kvochko, Andrew and Berkowitz, Avishai and Shalem, Daniel and Lifschitz, Daphna and Moshe, Dudu and Porat, Eitan and Richardson, Eitan and Guy Shiran and Itay Chachy and Jonathan Chetboun and Michael Finkelson and Michael Kupchick and Nir Zabari and Nitzan Guetta and Noa Kotler and Ofir Bibi and Ori Gordon and Poriya Panet and Roi Benita and Shahar Armon and Victor Kulikov and Yaron Inger and Yonatan Shiftan and Zeev Melumian and Zeev Farbman},\\n  journal={arXiv preprint arXiv:2601.03233},\\n  year={2025}\\n}\\n```\",\n",
       "  'domain': 'audio-to-audio'},\n",
       " {'model_id': 'hexgrad/Kokoro-82M',\n",
       "  'created_at': '2024-12-26T00:20:08+00:00',\n",
       "  'downloads': 2238109,\n",
       "  'likes': 5625,\n",
       "  'author': None,\n",
       "  'tags': ['text-to-speech',\n",
       "   'en',\n",
       "   'arxiv:2306.07691',\n",
       "   'arxiv:2203.02395',\n",
       "   'base_model:yl4579/StyleTTS2-LJSpeech',\n",
       "   'base_model:finetune:yl4579/StyleTTS2-LJSpeech',\n",
       "   'doi:10.57967/hf/4329',\n",
       "   'license:apache-2.0',\n",
       "   'region:us'],\n",
       "  'modelcard': '**Kokoro** is an open-weight TTS model with 82 million parameters. Despite its lightweight architecture, it delivers comparable quality to larger models while being significantly faster and more cost-efficient. With Apache-licensed weights, Kokoro can be deployed anywhere from production environments to personal projects.\\n\\n\\n\\n🐈 **GitHub**: \\n\\n🚀 **Demo**: \\n\\n> [!NOTE]\\n> As of April 2025, the market rate of Kokoro served over API is **under $1 per million characters of text input**, or under $0.06 per hour of audio output. (On average, 1000 characters of input is about 1 minute of output.) Sources:  and .\\n>\\n> This is an Apache-licensed model, and Kokoro has been deployed in numerous projects and commercial APIs. We welcome the deployment of the model in real use cases.\\n\\n> [!CAUTION]\\n> Fake websites like kokorottsai_com (snapshot:  and kokorotts_net (snapshot:  are likely scams masquerading under the banner of a popular model.\\n>\\n> Any website containing \"kokoro\" in its root domain (e.g. kokorottsai_com, kokorotts_net) is **NOT owned by and NOT affiliated with this model page or its author**, and attempts to imply otherwise are red flags.\\n\\n- \\n- \\n-  ↗️\\n-  ↗️\\n-  ↗️\\n- \\n- \\n- \\n- \\n\\n### Releases\\n\\n| Model | Published | Training Data | Langs & Voices | SHA256 |\\n| ----- | --------- | ------------- | -------------- | ------ |\\n| **v1.0** | **2025 Jan 27** | **Few hundred hrs** |  | `496dba11` |\\n|  | 2024 Dec 25 | =0.9.2 soundfile\\n!apt-get -qq -y install espeak-ng > /dev/null 2>&1\\nfrom kokoro import KPipeline\\nfrom IPython.display import display, Audio\\nimport soundfile as sf\\nimport torch\\npipeline = KPipeline(lang_code=\\'a\\')\\ntext = \\'\\'\\'\\n is an open-weight TTS model with 82 million parameters. Despite its lightweight architecture, it delivers comparable quality to larger models while being significantly faster and more cost-efficient. 
With Apache-licensed weights,  can be deployed anywhere from production environments to personal projects.\\n\\'\\'\\'\\ngenerator = pipeline(text, voice=\\'af_heart\\')\\nfor i, (gs, ps, audio) in enumerate(generator):\\n    print(i, gs, ps)\\n    display(Audio(data=audio, rate=24000, autoplay=i==0))\\n    sf.write(f\\'{i}.wav\\', audio, 24000)\\n```\\nUnder the hood, `kokoro` uses , a G2P library at \\n\\n### Model Facts\\n\\n**Architecture:**\\n- StyleTTS 2: \\n- ISTFTNet: \\n- Decoder only: no diffusion, no encoder release\\n\\n**Architected by:** Li et al @ \\n\\n**Trained by**: `@rzvzn` on Discord\\n\\n**Languages:** Multiple\\n\\n**Model SHA256 Hash:** `496dba118d1a58f5f3db2efc88dbdc216e0483fc89fe6e47ee1f2c53f18ad1e4`\\n\\n### Training Details\\n\\n**Data:** Kokoro was trained exclusively on **permissive/non-copyrighted audio data** and IPA phoneme labels. Examples of permissive/non-copyrighted audio include:\\n- Public domain audio\\n- Audio licensed under Apache, MIT, etc\\n- Synthetic audio[1] generated by closed[2] TTS models from large providers\\n[1] \\n[2] No synthetic audio from open TTS models or \"custom voice clones\"\\n\\n**Total Dataset Size:** A few hundred hours of audio\\n\\n**Total Training Cost:** About $1000 for 1000 hours of A100 80GB vRAM\\n\\n### Creative Commons Attribution\\n\\nThe following CC BY audio was part of the dataset used to train Kokoro v1.0.\\n\\n| Audio Data | Duration Used | License | Added to Training Set After |\\n| ---------- | ------------- | ------- | --------------------------- |\\n|  `tnc` | \\n',\n",
       "  'domain': 'text-to-speech'},\n",
       " {'model_id': 'Qwen/Qwen3-Embedding-0.6B',\n",
       "  'created_at': '2025-06-03T14:25:32+00:00',\n",
       "  'downloads': 2140161,\n",
       "  'likes': 846,\n",
       "  'author': None,\n",
       "  'tags': ['sentence-transformers',\n",
       "   'safetensors',\n",
       "   'qwen3',\n",
       "   'text-generation',\n",
       "   'transformers',\n",
       "   'sentence-similarity',\n",
       "   'feature-extraction',\n",
       "   'text-embeddings-inference',\n",
       "   'arxiv:2506.05176',\n",
       "   'base_model:Qwen/Qwen3-0.6B-Base',\n",
       "   'base_model:finetune:Qwen/Qwen3-0.6B-Base',\n",
       "   'license:apache-2.0',\n",
       "   'endpoints_compatible',\n",
       "   'deploy:azure',\n",
       "   'region:us'],\n",
       "  'modelcard': '# Qwen3-Embedding-0.6B\\n\\n\\n    \\n\\n\\n## Highlights\\n\\nThe Qwen3 Embedding model series is the latest proprietary model of the Qwen family, specifically designed for text embedding and ranking tasks. Building upon the dense foundational models of the Qwen3 series, it provides a comprehensive range of text embeddings and reranking models in various sizes (0.6B, 4B, and 8B). This series inherits the exceptional multilingual capabilities, long-text understanding, and reasoning skills of its foundational model. The Qwen3 Embedding series represents significant advancements in multiple text embedding and ranking tasks, including text retrieval, code retrieval, text classification, text clustering, and bitext mining.\\n\\n**Exceptional Versatility**: The embedding model has achieved state-of-the-art performance across a wide range of downstream application evaluations. The 8B size embedding model ranks **No.1** in the MTEB multilingual leaderboard (as of June 5, 2025, score **70.58**), while the reranking model excels in various text retrieval scenarios.\\n\\n**Comprehensive Flexibility**: The Qwen3 Embedding series offers a full spectrum of sizes (from 0.6B to 8B) for both embedding and reranking models, catering to diverse use cases that prioritize efficiency and effectiveness. Developers can seamlessly combine these two modules. Additionally, the embedding model allows for flexible vector definitions across all dimensions, and both embedding and reranking models support user-defined instructions to enhance performance for specific tasks, languages, or scenarios.\\n\\n**Multilingual Capability**: The Qwen3 Embedding series offer support for over 100 languages, thanks to the multilingual capabilites of Qwen3 models. 
This includes various programming languages, and provides robust multilingual, cross-lingual, and code retrieval capabilities.\\n\\n## Model Overview\\n\\n**Qwen3-Embedding-0.6B** has the following features:\\n\\n- Model Type: Text Embedding\\n- Supported Languages: 100+ Languages\\n- Number of Paramaters: 0.6B\\n- Context Length: 32k\\n- Embedding Dimension: Up to 1024, supports user-defined output dimensions ranging from 32 to 1024\\n\\nFor more details, including benchmark evaluation, hardware requirements, and inference performance, please refer to our , .\\n\\n## Qwen3 Embedding Series Model list\\n\\n| Model Type       | Models               | Size | Layers | Sequence Length | Embedding Dimension | MRL Support | Instruction Aware |\\n|------------------|----------------------|------|--------|-----------------|---------------------|-------------|----------------|\\n| Text Embedding   |  | 0.6B | 28     | 32K             | 1024                | Yes         | Yes            |\\n| Text Embedding   |    | 4B   | 36     | 32K             | 2560                | Yes         | Yes            |\\n| Text Embedding   |    | 8B   | 36     | 32K             | 4096                | Yes         | Yes            |\\n| Text Reranking   |  | 0.6B | 28     | 32K             | -                   | -           | Yes            |\\n| Text Reranking   |    | 4B   | 36     | 32K             | -                   | -           | Yes            |\\n| Text Reranking   |    | 8B   | 36     | 32K             | -                   | -           | Yes            |\\n\\n> **Note**:\\n> - `MRL Support` indicates whether the embedding model supports custom dimensions for the final embedding. 
\\n> - `Instruction Aware` notes whether the embedding or reranking model supports customizing the input instruction according to different tasks.\\n> - Our evaluation indicates that, for most downstream tasks, using instructions (instruct) typically yields an improvement of 1% to 5% compared to not using them. Therefore, we recommend that developers create tailored instructions specific to their tasks and scenarios. In multilingual contexts, we also advise users to write their instructions in English, as most instructions utilized during the model training process were originally written in English.\\n\\n## Usage\\n\\nWith Transformers versions earlier than 4.51.0, you may encounter the following error:\\n```\\nKeyError: \\'qwen3\\'\\n```\\n\\n### Sentence Transformers Usage\\n\\n```python\\n# Requires transformers>=4.51.0\\n# Requires sentence-transformers>=2.7.0\\n\\nfrom sentence_transformers import SentenceTransformer\\n\\n# Load the model\\nmodel = SentenceTransformer(\"Qwen/Qwen3-Embedding-0.6B\")\\n\\n# We recommend enabling flash_attention_2 for better acceleration and memory saving,\\n# together with setting `padding_side` to \"left\":\\n# model = SentenceTransformer(\\n#     \"Qwen/Qwen3-Embedding-0.6B\",\\n#     model_kwargs={\"attn_implementation\": \"flash_attention_2\", \"device_map\": \"auto\"},\\n#     tokenizer_kwargs={\"padding_side\": \"left\"},\\n# )\\n\\n# The queries and documents to embed\\nqueries = [\\n    \"What is the capital of China?\",\\n    \"Explain gravity\",\\n]\\ndocuments = [\\n    \"The capital of China is Beijing.\",\\n    \"Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun.\",\\n]\\n\\n# Encode the queries and documents. 
Note that queries benefit from using a prompt\\n# Here we use the prompt called \"query\" stored under `model.prompts`, but you can\\n# also pass your own prompt via the `prompt` argument\\nquery_embeddings = model.encode(queries, prompt_name=\"query\")\\ndocument_embeddings = model.encode(documents)\\n\\n# Compute the (cosine) similarity between the query and document embeddings\\nsimilarity = model.similarity(query_embeddings, document_embeddings)\\nprint(similarity)\\n# tensor([[0.7646, 0.1414],\\n#         [0.1355, 0.6000]])\\n```\\n\\n### Transformers Usage\\n\\n```python\\n# Requires transformers>=4.51.0\\n\\nimport torch\\nimport torch.nn.functional as F\\n\\nfrom torch import Tensor\\nfrom transformers import AutoTokenizer, AutoModel\\n\\n\\ndef last_token_pool(last_hidden_states: Tensor,\\n                 attention_mask: Tensor) -> Tensor:\\n    left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])\\n    if left_padding:\\n        return last_hidden_states[:, -1]\\n    else:\\n        sequence_lengths = attention_mask.sum(dim=1) - 1\\n        batch_size = last_hidden_states.shape[0]\\n        return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]\\n\\n\\ndef get_detailed_instruct(task_description: str, query: str) -> str:\\n    return f\\'Instruct: {task_description}\\\\nQuery:{query}\\'\\n\\n# Each query must come with a one-sentence instruction that describes the task\\ntask = \\'Given a web search query, retrieve relevant passages that answer the query\\'\\n\\nqueries = [\\n    get_detailed_instruct(task, \\'What is the capital of China?\\'),\\n    get_detailed_instruct(task, \\'Explain gravity\\')\\n]\\n# No need to add instruction for retrieval documents\\ndocuments = [\\n    \"The capital of China is Beijing.\",\\n    \"Gravity is a force that attracts two bodies towards each other. 
It gives weight to physical objects and is responsible for the movement of planets around the sun.\"\\n]\\ninput_texts = queries + documents\\n\\ntokenizer = AutoTokenizer.from_pretrained(\\'Qwen/Qwen3-Embedding-0.6B\\', padding_side=\\'left\\')\\nmodel = AutoModel.from_pretrained(\\'Qwen/Qwen3-Embedding-0.6B\\')\\n\\n# We recommend enabling flash_attention_2 for better acceleration and memory saving.\\n# model = AutoModel.from_pretrained(\\'Qwen/Qwen3-Embedding-0.6B\\', attn_implementation=\"flash_attention_2\", torch_dtype=torch.float16).cuda()\\n\\nmax_length = 8192\\n\\n# Tokenize the input texts\\nbatch_dict = tokenizer(\\n    input_texts,\\n    padding=True,\\n    truncation=True,\\n    max_length=max_length,\\n    return_tensors=\"pt\",\\n)\\nbatch_dict.to(model.device)\\noutputs = model(**batch_dict)\\nembeddings = last_token_pool(outputs.last_hidden_state, batch_dict[\\'attention_mask\\'])\\n\\n# normalize embeddings\\nembeddings = F.normalize(embeddings, p=2, dim=1)\\nscores = (embeddings[:2] @ embeddings[2:].T)\\nprint(scores.tolist())\\n# [[0.7645568251609802, 0.14142508804798126], [0.13549736142158508, 0.5999549627304077]]\\n```\\n\\n### vLLM Usage\\n\\n```python\\n# Requires vllm>=0.8.5\\nimport torch\\nimport vllm\\nfrom vllm import LLM\\n\\ndef get_detailed_instruct(task_description: str, query: str) -> str:\\n    return f\\'Instruct: {task_description}\\\\nQuery:{query}\\'\\n\\n# Each query must come with a one-sentence instruction that describes the task\\ntask = \\'Given a web search query, retrieve relevant passages that answer the query\\'\\n\\nqueries = [\\n    get_detailed_instruct(task, \\'What is the capital of China?\\'),\\n    get_detailed_instruct(task, \\'Explain gravity\\')\\n]\\n# No need to add instruction for retrieval documents\\ndocuments = [\\n    \"The capital of China is Beijing.\",\\n    \"Gravity is a force that attracts two bodies towards each other. 
It gives weight to physical objects and is responsible for the movement of planets around the sun.\"\\n]\\ninput_texts = queries + documents\\n\\nmodel = LLM(model=\"Qwen/Qwen3-Embedding-0.6B\", task=\"embed\")\\n\\noutputs = model.embed(input_texts)\\nembeddings = torch.tensor([o.outputs.embedding for o in outputs])\\nscores = (embeddings[:2] @ embeddings[2:].T)\\nprint(scores.tolist())\\n# [[0.7620252966880798, 0.14078938961029053], [0.1358368694782257, 0.6013815999031067]]\\n```\\n\\n📌 **Tip**: We recommend that developers customize the `instruct` according to their specific scenarios, tasks, and languages. Our tests have shown that in most retrieval scenarios, not using an `instruct` on the query side can lead to a drop in retrieval performance by approximately 1% to 5%.\\n\\n### Text Embeddings Inference (TEI) Usage\\n\\nYou can either run / deploy TEI on NVIDIA GPUs as:\\n\\n```bash\\ndocker run --gpus all -p 8080:80 -v hf_cache:/data --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-1.7.2 --model-id Qwen/Qwen3-Embedding-0.6B --dtype float16\\n```\\n\\nOr on CPU devices as:\\n\\n```bash\\ndocker run -p 8080:80 -v hf_cache:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.7.2 --model-id Qwen/Qwen3-Embedding-0.6B\\n```\\n\\nAnd then, generate the embeddings sending a HTTP POST request as:\\n\\n```bash\\ncurl  \\\\\\n    -X POST \\\\\\n    -d \\'{\"inputs\": [\"Instruct: Given a web search query, retrieve relevant passages that answer the query\\\\nQuery: What is the capital of China?\", \"Instruct: Given a web search query, retrieve relevant passages that answer the query\\\\nQuery: Explain gravity\"]}\\' \\\\\\n    -H \"Content-Type: application/json\"\\n```\\n\\n## Evaluation\\n\\n### MTEB (Multilingual)\\n\\n| Model                            |  Size   |  Mean (Task)  | Mean (Type) | Bitxt Mining | Class. | Clust. | Inst. Retri. | Multi. Class. | Pair. Class. | Rerank | Retri. 
| STS  |\\n|----------------------------------|:-------:|:-------------:|:-------------:|:--------------:|:--------:|:--------:|:--------------:|:---------------:|:--------------:|:--------:|:--------:|:------:|\\n| NV-Embed-v2                      |   7B    |     56.29     | 49.58       | 57.84        | 57.29  | 40.80  | 1.04         | 18.63         | 78.94        | 63.82  | 56.72  | 71.10|\\n| GritLM-7B                        |   7B    |     60.92     | 53.74       | 70.53        | 61.83  | 49.75  | 3.45         | 22.77         | 79.94        | 63.78  | 58.31  | 73.33|\\n| BGE-M3                           |  0.6B   |     59.56     | 52.18       | 79.11        | 60.35  | 40.88  | -3.11        | 20.1          | 80.76        | 62.79  | 54.60  | 74.12|\\n| multilingual-e5-large-instruct   |  0.6B   |     63.22     | 55.08       | 80.13        | 64.94  | 50.75  | -0.40        | 22.91         | 80.86        | 62.61  | 57.12  | 76.81|\\n| gte-Qwen2-1.5B-instruct          |  1.5B   |     59.45     | 52.69       | 62.51        | 58.32  | 52.05  | 0.74         | 24.02         | 81.58        | 62.58  | 60.78  | 71.61|\\n| gte-Qwen2-7b-Instruct            |   7B    |     62.51     | 55.93       | 73.92        | 61.55  | 52.77  | 4.94         | 25.48         | 85.13        | 65.55  | 60.08  | 73.98|\\n| text-embedding-3-large           |    -    |     58.93     | 51.41       | 62.17        | 60.27  | 46.89  | -2.68        | 22.03         | 79.17        | 63.89  | 59.27  | 71.68|\\n| Cohere-embed-multilingual-v3.0   |    -    |     61.12     | 53.23       | 70.50        | 62.95  | 46.89  | -1.89        | 22.74         | 79.88        | 64.07  | 59.16  | 74.80|\\n| Gemini Embedding                 |    -    |     68.37     | 59.59       | 79.28        | 71.82  | 54.59  | 5.18         | **29.16**     | 83.63        | 65.58  | 67.71  | 79.40|\\n| **Qwen3-Embedding-0.6B**         |  0.6B   |     64.33     | 56.00       | 72.22        | 66.83  | 52.33  | 5.09         | 24.59         
| 80.83        | 61.41  | 64.64  | 76.17|\\n| **Qwen3-Embedding-4B**           |   4B    |     69.45     | 60.86       | 79.36        | 72.33  | 57.15  | **11.56**    | 26.77         | 85.05        | 65.08  | 69.60  | 80.86|\\n| **Qwen3-Embedding-8B**           |   8B    |   **70.58**   | **61.69**   | **80.89**    | **74.00** | **57.65** | 10.06      | 28.66         | **86.40**    | **65.63** | **70.88** | **81.08** |\\n\\n> **Note**: For compared models, the scores are retrieved from MTEB online  on May 24th, 2025.\\n\\n### MTEB (Eng v2)\\n\\n| MTEB English / Models          |  Param.  | Mean(Task) | Mean(Type) | Class. | Clust. | Pair Class. | Rerank. | Retri. | STS   | Summ. |\\n|--------------------------------|:--------:|:------------:|:------------:|:--------:|:--------:|:-------------:|:---------:|:--------:|:-------:|:-------:|\\n| multilingual-e5-large-instruct |   0.6B   | 65.53      | 61.21      | 75.54  | 49.89  | 86.24       | 48.74   | 53.47  | 84.72 | 29.89 |\\n| NV-Embed-v2                    |   7.8B   | 69.81      | 65.00      | 87.19  | 47.66  | 88.69       | 49.61   | 62.84  | 83.82 | 35.21 |\\n| GritLM-7B                      |   7.2B   | 67.07      | 63.22      | 81.25  | 50.82  | 87.29       | 49.59   | 54.95  | 83.03 | 35.65 |\\n| gte-Qwen2-1.5B-instruct        |   1.5B   | 67.20      | 63.26      | 85.84  | 53.54  | 87.52       | 49.25   | 50.25  | 82.51 | 33.94 |\\n| stella_en_1.5B_v5              |   1.5B   | 69.43      | 65.32      | 89.38  | 57.06  | 88.02       | 50.19   | 52.42  | 83.27 | 36.91 |\\n| gte-Qwen2-7B-instruct          |   7.6B   | 70.72      | 65.77      | 88.52  | 58.97  | 85.9        | 50.47   | 58.09  | 82.69 | 35.74 |\\n| gemini-embedding-exp-03-07     |    -     | 73.3       | 67.67      | 90.05  | 59.39  | 87.7        | 48.59   | 64.35  | 85.29 | 38.28 |\\n| **Qwen3-Embedding-0.6B**       |   0.6B   | 70.70      | 64.88      | 85.76  | 54.05  | 84.37       | 48.18   | 61.83  | 86.57 | 33.43 |\\n| 
**Qwen3-Embedding-4B**         |    4B    | 74.60      | 68.10      | 89.84  | 57.51  | 87.01       | 50.76   | 68.46  | 88.72 | 34.39 |\\n| **Qwen3-Embedding-8B**         |    8B    | 75.22      | 68.71      | 90.43  | 58.57  | 87.52       | 51.56   | 69.44  | 88.58 | 34.83 |\\n\\n### C-MTEB (MTEB Chinese)\\n\\n| C-MTEB           | Param. | Mean(Task) | Mean(Type) | Class. | Clust. | Pair Class. | Rerank. | Retr. | STS   |\\n|------------------|--------|------------|------------|--------|--------|-------------|---------|-------|-------|\\n| multilingual-e5-large-instruct | 0.6B   | 58.08      | 58.24      | 69.80  | 48.23  | 64.52       | 57.45   | 63.65 | 45.81 |\\n| bge-multilingual-gemma2 | 9B     | 67.64      | 75.31      | 59.30  | 86.67  | 68.28       | 73.73   | 55.19 | -     |\\n| gte-Qwen2-1.5B-instruct  | 1.5B   | 67.12      | 67.79      | 72.53  | 54.61  | 79.5        | 68.21   | 71.86 | 60.05 |\\n| gte-Qwen2-7B-instruct    | 7.6B   | 71.62      | 72.19      | 75.77  | 66.06  | 81.16       | 69.24   | 75.70 | 65.20 |\\n| ritrieve_zh_v1          | 0.3B   | 72.71      | 73.85      | 76.88  | 66.5   | 85.98       | 72.86   | 76.97 | 63.92 |\\n| **Qwen3-Embedding-0.6B** | 0.6B   | 66.33      | 67.45      | 71.40  | 68.74  | 76.42       | 62.58   | 71.03 | 54.52 |\\n| **Qwen3-Embedding-4B**   | 4B     | 72.27      | 73.51      | 75.46  | 77.89  | 83.34       | 66.05   | 77.03 | 61.26 |\\n| **Qwen3-Embedding-8B**   | 8B     | 73.84      | 75.00      | 76.97  | 80.08  | 84.23       | 66.99   | 78.21 | 63.53 |\\n\\n\\n## Citation\\n\\nIf you find our work helpful, feel free to give us a cite.\\n\\n```\\n@article{qwen3embedding,\\n  title={Qwen3 Embedding: Advancing Text Embedding and Reranking Through Foundation Models},\\n  author={Zhang, Yanzhao and Li, Mingxin and Long, Dingkun and Zhang, Xin and Lin, Huan and Yang, Baosong and Xie, Pengjun and Yang, An and Liu, Dayiheng and Lin, Junyang and Huang, Fei and Zhou, Jingren},\\n  journal={arXiv preprint 
arXiv:2506.05176},\\n  year={2025}\\n}\\n```',\n",
       "  'domain': 'sentence-similarity'},\n",
       " {'model_id': 'google/siglip2-so400m-patch16-naflex',\n",
       "  'created_at': '2025-02-18T11:45:40+00:00',\n",
       "  'downloads': 1677962,\n",
       "  'likes': 53,\n",
       "  'author': None,\n",
       "  'tags': ['transformers',\n",
       "   'safetensors',\n",
       "   'siglip2',\n",
       "   'zero-shot-image-classification',\n",
       "   'vision',\n",
       "   'arxiv:2502.14786',\n",
       "   'arxiv:2303.15343',\n",
       "   'arxiv:2209.06794',\n",
       "   'license:apache-2.0',\n",
       "   'endpoints_compatible',\n",
       "   'region:us'],\n",
       "  'modelcard': '\\n# SigLIP 2 So400m\\n\\n extends the pretraining objective of\\n with prior, independently developed techniques\\ninto a unified recipe, for improved semantic understanding, localization, and dense features.\\n\\n## Intended uses\\n\\nYou can use the raw model for tasks like zero-shot image classification and\\nimage-text retrieval, or as a vision encoder for VLMs (and other vision tasks).\\n\\nHere is how to use this model to perform zero-shot image classification:\\n\\n```python\\nfrom transformers import pipeline\\n\\n# load pipeline\\nckpt = \"google/siglip2-so400m-patch16-naflex\"\\nimage_classifier = pipeline(model=ckpt, task=\"zero-shot-image-classification\")\\n\\n# load image and candidate labels\\nurl = \"\\ncandidate_labels = [\"2 cats\", \"a plane\", \"a remote\"]\\n\\n# run inference\\noutputs = image_classifier(image, candidate_labels)\\nprint(outputs)\\n```\\n\\nYou can encode an image using the Vision Tower like so:\\n\\n```python\\nimport torch\\nfrom transformers import AutoModel, AutoProcessor\\nfrom transformers.image_utils import load_image\\n\\n# load the model and processor\\nckpt = \"google/siglip2-so400m-patch16-naflex\"\\nmodel = AutoModel.from_pretrained(ckpt, device_map=\"auto\").eval()\\nprocessor = AutoProcessor.from_pretrained(ckpt)\\n\\n# load the image\\nimage = load_image(\"\\ninputs = processor(images=[image], return_tensors=\"pt\").to(model.device)\\n\\n# run infernece\\nwith torch.no_grad():\\n    image_embeddings = model.get_image_features(**inputs)    \\n\\nprint(image_embeddings.shape)\\n```\\n\\nFor more code examples, we refer to the .\\n\\n## Training procedure\\n\\nSigLIP 2 adds some clever training objectives on top of SigLIP:\\n\\n1. Decoder loss\\n2. Global-local and masked prediction loss\\n3. 
Aspect ratio and resolution adaptibility \\n\\n### Training data\\n\\nSigLIP 2 is pre-trained on the WebLI dataset .\\n\\n### Compute\\n\\nThe model was trained on up to 2048 TPU-v5e chips.\\n\\n## Evaluation results\\n\\nEvaluation of SigLIP 2 is shown below (taken from the paper).\\n\\n\\n\\n### BibTeX entry and citation info\\n\\n```bibtex\\n@misc{tschannen2025siglip2multilingualvisionlanguage,\\n      title={SigLIP 2: Multilingual Vision-Language Encoders with Improved Semantic Understanding, Localization, and Dense Features}, \\n      author={Michael Tschannen and Alexey Gritsenko and Xiao Wang and Muhammad Ferjad Naeem and Ibrahim Alabdulmohsin and Nikhil Parthasarathy and Talfan Evans and Lucas Beyer and Ye Xia and Basil Mustafa and Olivier Hénaff and Jeremiah Harmsen and Andreas Steiner and Xiaohua Zhai},\\n      year={2025},\\n      eprint={2502.14786},\\n      archivePrefix={arXiv},\\n      primaryClass={cs.CV},\\n      url={ \\n}\\n```\\n',\n",
       "  'domain': 'zero-shot-image-classification'},\n",
       " {'model_id': 'facebook/sam3',\n",
       "  'created_at': '2025-11-07T05:17:48+00:00',\n",
       "  'downloads': 1652412,\n",
       "  'likes': 1455,\n",
       "  'author': None,\n",
       "  'tags': ['transformers',\n",
       "   'safetensors',\n",
       "   'sam3_video',\n",
       "   'feature-extraction',\n",
       "   'sam3',\n",
       "   'mask-generation',\n",
       "   'en',\n",
       "   'license:other',\n",
       "   'endpoints_compatible',\n",
       "   'deploy:azure',\n",
       "   'region:us'],\n",
       "  'modelcard': '\\nSAM 3 is a unified foundation model for promptable segmentation in images and videos. It can detect, segment, and track objects using text or visual prompts such as points, boxes, and masks. Compared to its predecessor , SAM 3 introduces the ability to exhaustively segment all instances of an open-vocabulary concept specified by a short text phrase or exemplars. Unlike prior work, SAM 3 can handle a vastly larger set of open-vocabulary prompts. It achieves 75-80% of human performance on our new  which contains 270K unique concepts, over 50 times more than existing benchmarks.\\n\\n\\n\\n### Basic Usage\\n\\n```python\\nimport torch\\n#################################### For Image ####################################\\nfrom PIL import Image\\nfrom sam3.model_builder import build_sam3_image_model\\nfrom sam3.model.sam3_image_processor import Sam3Processor\\n# Load the model\\nmodel = build_sam3_image_model()\\nprocessor = Sam3Processor(model)\\n# Load an image\\nimage = Image.open(\"\")\\ninference_state = processor.set_image(image)\\n# Prompt the model with text\\noutput = processor.set_text_prompt(state=inference_state, prompt=\"\")\\n\\n# Get the masks, bounding boxes, and scores\\nmasks, boxes, scores = output[\"masks\"], output[\"boxes\"], output[\"scores\"]\\n\\n#################################### For Video ####################################\\n\\nfrom sam3.model_builder import build_sam3_video_predictor\\n\\nvideo_predictor = build_sam3_video_predictor()\\nvideo_path = \"\" # a JPEG folder or an MP4 video file\\n# Start a session\\nresponse = video_predictor.handle_request(\\n    request=dict(\\n        type=\"start_session\",\\n        resource_path=video_path,\\n    )\\n)\\nresponse = video_predictor.handle_request(\\n    request=dict(\\n        type=\"add_prompt\",\\n        session_id=response[\"session_id\"],\\n        frame_index=0, # Arbitrary frame index\\n        text=\"\",\\n    )\\n)\\noutput = 
response[\"outputs\"]\\n```\\n\\nThe official code is publicly released in the .\\n\\n\\n## Usage with 🤗 Transformers\\n\\n### SAM3 - Promptable Concept Segmentation (PCS) for Images\\n\\nSAM3 performs Promptable Concept Segmentation (PCS) on images, taking text and/or image exemplars as prompts and returning segmentation masks for **all matching object instances** in the image.\\n\\n#### Text-Only Prompts\\n\\n```python\\n>>> from transformers import Sam3Processor, Sam3Model\\n>>> import torch\\n>>> from PIL import Image\\n>>> import requests\\n\\n>>> device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\\n\\n>>> model = Sam3Model.from_pretrained(\"facebook/sam3\").to(device)\\n>>> processor = Sam3Processor.from_pretrained(\"facebook/sam3\")\\n\\n>>> # Load image\\n>>> image_url = \"\\n>>> image = Image.open(requests.get(image_url, stream=True).raw).convert(\"RGB\")\\n\\n>>> # Segment using text prompt\\n>>> inputs = processor(images=image, text=\"ear\", return_tensors=\"pt\").to(device)\\n\\n>>> with torch.no_grad():\\n...     outputs = model(**inputs)\\n\\n>>> # Post-process results\\n>>> results = processor.post_process_instance_segmentation(\\n...     outputs,\\n...     threshold=0.5,\\n...     mask_threshold=0.5,\\n...     target_sizes=inputs.get(\"original_sizes\").tolist()\\n... 
)[0]\\n\\n>>> print(f\"Found {len(results[\\'masks\\'])} objects\")\\n>>> # Results contain:\\n>>> # - masks: Binary masks resized to original image size\\n>>> # - boxes: Bounding boxes in absolute pixel coordinates (xyxy format)\\n>>> # - scores: Confidence scores\\n```\\n\\nYou can display masks using a simple helper like the following:\\n\\n```python\\nimport numpy as np\\nimport matplotlib\\n\\ndef overlay_masks(image, masks):\\n    image = image.convert(\"RGBA\")\\n    masks = 255 * masks.cpu().numpy().astype(np.uint8)\\n    \\n    n_masks = masks.shape[0]\\n    cmap = matplotlib.colormaps.get_cmap(\"rainbow\").resampled(n_masks)\\n    colors = [\\n        tuple(int(c * 255) for c in cmap(i)[:3])\\n        for i in range(n_masks)\\n    ]\\n\\n    for mask, color in zip(masks, colors):\\n        mask = Image.fromarray(mask)\\n        overlay = Image.new(\"RGBA\", image.size, color + (0,))\\n        alpha = mask.point(lambda v: int(v * 0.5))\\n        overlay.putalpha(alpha)\\n        image = Image.alpha_composite(image, overlay)\\n    return image\\n```\\n\\nThen you can save the resulting composite image or display it in a notebook:\\n\\n```python\\n>>> overlay_masks(image, results[\"masks\"])\\n```\\n\\n#### Single Bounding Box Prompt\\n\\nSegment objects using a bounding box:\\n\\n```python\\n>>> # Box in xyxy format: [x1, y1, x2, y2] in pixel coordinates\\n>>> # Example: laptop region\\n>>> box_xyxy = [100, 150, 500, 450]\\n>>> input_boxes = [[box_xyxy]]  # [batch, num_boxes, 4]\\n>>> input_boxes_labels = [[1]]  # 1 = positive box\\n\\n>>> inputs = processor(\\n...     images=image,\\n...     input_boxes=input_boxes,\\n...     input_boxes_labels=input_boxes_labels,\\n...     return_tensors=\"pt\"\\n... ).to(device)\\n\\n>>> with torch.no_grad():\\n...     outputs = model(**inputs)\\n\\n>>> # Post-process results\\n>>> results = processor.post_process_instance_segmentation(\\n...     outputs,\\n...     threshold=0.5,\\n...     mask_threshold=0.5,\\n...     
target_sizes=inputs.get(\"original_sizes\").tolist()\\n... )[0]\\n```\\n\\n#### Multiple Box Prompts (Positive and Negative)\\n\\nUse multiple boxes with positive and negative labels to refine the concept:\\n\\n```python\\n>>> # Load kitchen image\\n>>> kitchen_url = \"\\n>>> kitchen_image = Image.open(requests.get(kitchen_url, stream=True).raw).convert(\"RGB\")\\n\\n>>> # Define two positive boxes (e.g., dial and button on oven)\\n>>> # Boxes are in xyxy format [x1, y1, x2, y2] in pixel coordinates\\n>>> box1_xyxy = [59, 144, 76, 163]  # Dial box\\n>>> box2_xyxy = [87, 148, 104, 159]  # Button box\\n>>> input_boxes = [[box1_xyxy, box2_xyxy]]\\n>>> input_boxes_labels = [[1, 1]]  # Both positive\\n\\n>>> inputs = processor(\\n...     images=kitchen_image,\\n...     input_boxes=input_boxes,\\n...     input_boxes_labels=input_boxes_labels,\\n...     return_tensors=\"pt\"\\n... ).to(device)\\n\\n>>> with torch.no_grad():\\n...     outputs = model(**inputs)\\n\\n>>> # Post-process results\\n>>> results = processor.post_process_instance_segmentation(\\n...     outputs,\\n...     threshold=0.5,\\n...     mask_threshold=0.5,\\n...     target_sizes=inputs.get(\"original_sizes\").tolist()\\n... )[0]\\n>>> overlay_masks(kitchen_image, results[\"masks\"])\\n```\\n\\n#### Combined Prompts (Text + Negative Box)\\n\\nUse text prompts with negative visual prompts to refine the concept:\\n\\n```python\\n>>> # Segment \"handle\" but exclude the oven handle using a negative box\\n>>> text = \"handle\"\\n>>> # Negative box covering oven handle area (xyxy): [40, 183, 318, 204]\\n>>> oven_handle_box = [40, 183, 318, 204]\\n>>> input_boxes = [[oven_handle_box]]\\n\\n>>> inputs = processor(\\n...     images=kitchen_image,\\n...     text=text,\\n...     input_boxes=input_boxes,\\n...     input_boxes_labels=[[0]],  # 0 = negative (exclude this region)\\n...     return_tensors=\"pt\"\\n... ).to(device)\\n\\n>>> with torch.no_grad():\\n...     
outputs = model(**inputs)\\n\\n>>> # Post-process results\\n>>> results = processor.post_process_instance_segmentation(\\n...     outputs,\\n...     threshold=0.5,\\n...     mask_threshold=0.5,\\n...     target_sizes=inputs.get(\"original_sizes\").tolist()\\n... )[0]\\n>>> # This will segment pot handles but exclude the oven handle\\n```\\n\\n#### Batched Inference with Text Prompts\\n\\nProcess multiple images with different text prompts by batch:\\n\\n```python\\n>>> cat_url = \"\\n>>> kitchen_url = \"\\n>>> images = [\\n...     Image.open(requests.get(cat_url, stream=True).raw).convert(\"RGB\"),\\n...     Image.open(requests.get(kitchen_url, stream=True).raw).convert(\"RGB\")\\n... ]\\n\\n>>> text_prompts = [\"ear\", \"dial\"]\\n\\n>>> inputs = processor(images=images, text=text_prompts, return_tensors=\"pt\").to(device)\\n\\n>>> with torch.no_grad():\\n...     outputs = model(**inputs)\\n\\n>>> # Post-process results for both images\\n>>> results = processor.post_process_instance_segmentation(\\n...     outputs,\\n...     threshold=0.5,\\n...     mask_threshold=0.5,\\n...     target_sizes=inputs.get(\"original_sizes\").tolist()\\n... )\\n\\n>>> print(f\"Image 1: {len(results[0][\\'masks\\'])} objects found\")\\n>>> print(f\"Image 2: {len(results[1][\\'masks\\'])} objects found\")\\n```\\n\\n#### Batched Mixed Prompts\\n\\nUse different prompt types for different images in the same batch:\\n\\n```python\\n>>> # Image 1: text prompt \"laptop\"\\n>>> # Image 2: visual prompt (dial box)\\n>>> box2_xyxy = [59, 144, 76, 163]\\n\\n>>> inputs = processor(\\n...     images=images,\\n...     text=[\"laptop\", None],  # Only first image has text\\n...     input_boxes=[None, [box2_xyxy]],  # Only second image has box\\n...     input_boxes_labels=[None, [1]],  # Positive box for second image\\n...     return_tensors=\"pt\"\\n... ).to(device)\\n\\n>>> with torch.no_grad():\\n...     
outputs = model(**inputs)\\n\\n>>> # Post-process results for both images\\n>>> results = processor.post_process_instance_segmentation(\\n...     outputs,\\n...     threshold=0.5,\\n...     mask_threshold=0.5,\\n...     target_sizes=inputs.get(\"original_sizes\").tolist()\\n... )\\n>>> # Both images processed in single forward pass\\n```\\n\\n#### Semantic Segmentation Output\\n\\nSAM3 also provides semantic segmentation alongside instance masks:\\n\\n```python\\n>>> inputs = processor(images=image, text=\"ear\", return_tensors=\"pt\").to(device)\\n\\n>>> with torch.no_grad():\\n...     outputs = model(**inputs)\\n\\n>>> # Instance segmentation masks\\n>>> instance_masks = torch.sigmoid(outputs.pred_masks)  # [batch, num_queries, H, W]\\n\\n>>> # Semantic segmentation (single channel)\\n>>> semantic_seg = outputs.semantic_seg  # [batch, 1, H, W]\\n\\n>>> print(f\"Instance masks: {instance_masks.shape}\")\\n>>> print(f\"Semantic segmentation: {semantic_seg.shape}\")\\n```\\n\\n### SAM3 Video - Promptable Concept Segmentation (PCS) for Videos\\n\\nSAM3 Video performs Promptable Concept Segmentation (PCS) on videos, taking text as prompts and detecting and tracking **all matching object instances** across video frames.\\n\\n#### Pre-loaded Video Inference\\n\\nProcess a video with all frames already available using text prompts:\\n\\n```python\\n>>> from transformers import Sam3VideoModel, Sam3VideoProcessor\\n>>> from accelerate import Accelerator\\n>>> import torch\\n\\n>>> device = Accelerator().device\\n>>> model = Sam3VideoModel.from_pretrained(\"facebook/sam3\").to(device, dtype=torch.bfloat16)\\n>>> processor = Sam3VideoProcessor.from_pretrained(\"facebook/sam3\")\\n\\n>>> # Load video frames\\n>>> from transformers.video_utils import load_video\\n>>> video_url = \"\\n>>> video_frames, _ = load_video(video_url)\\n\\n>>> # Initialize video inference session\\n>>> inference_session = processor.init_video_session(\\n...     video=video_frames,\\n...     
inference_device=device,\\n...     processing_device=\"cpu\",\\n...     video_storage_device=\"cpu\",\\n...     dtype=torch.bfloat16,\\n... )\\n\\n>>> # Add text prompt to detect and track objects\\n>>> text = \"person\"\\n>>> inference_session = processor.add_text_prompt(\\n...     inference_session=inference_session,\\n...     text=text,\\n... )\\n\\n>>> # Process all frames in the video\\n>>> outputs_per_frame = {}\\n>>> for model_outputs in model.propagate_in_video_iterator(\\n...     inference_session=inference_session, max_frame_num_to_track=50\\n... ):\\n...     processed_outputs = processor.postprocess_outputs(inference_session, model_outputs)\\n...     outputs_per_frame[model_outputs.frame_idx] = processed_outputs\\n\\n>>> print(f\"Processed {len(outputs_per_frame)} frames\")\\nProcessed 51 frames\\n\\n>>> # Access results for a specific frame\\n>>> frame_0_outputs = outputs_per_frame[0]\\n>>> print(f\"Detected {len(frame_0_outputs[\\'object_ids\\'])} objects\")\\n>>> print(f\"Object IDs: {frame_0_outputs[\\'object_ids\\'].tolist()}\")\\n>>> print(f\"Scores: {frame_0_outputs[\\'scores\\'].tolist()}\")\\n>>> print(f\"Boxes shape (XYXY format, absolute coordinates): {frame_0_outputs[\\'boxes\\'].shape}\")\\n>>> print(f\"Masks shape: {frame_0_outputs[\\'masks\\'].shape}\")\\n```\\n\\n#### Streaming Video Inference\\n\\nFor real-time applications, the Transformers implementation of SAM3 Video supports processing video frames as they arrive:\\n\\n```python\\n>>> # Initialize session for streaming\\n>>> streaming_inference_session = processor.init_video_session(\\n...     inference_device=device,\\n...     processing_device=\"cpu\",\\n...     video_storage_device=\"cpu\",\\n...     dtype=torch.bfloat16,\\n... )\\n\\n>>> # Add text prompt\\n>>> text = \"person\"\\n>>> streaming_inference_session = processor.add_text_prompt(\\n...     inference_session=streaming_inference_session,\\n...     text=text,\\n... 
)\\n\\n>>> # Process frames one by one (streaming mode)\\n>>> streaming_outputs_per_frame = {}\\n>>> for frame_idx, frame in enumerate(video_frames[:50]):  # Process first 50 frames\\n...     # First, process the frame using the processor\\n...     inputs = processor(images=frame, device=device, return_tensors=\"pt\")\\n...\\n...     # Process frame using streaming inference - pass the processed pixel_values\\n...     model_outputs = model(\\n...         inference_session=streaming_inference_session,\\n...         frame=inputs.pixel_values[0],  # Provide processed frame - this enables streaming mode\\n...         reverse=False,\\n...     )\\n...\\n...     # Post-process outputs with original_sizes for proper resolution handling\\n...     processed_outputs = processor.postprocess_outputs(\\n...         streaming_inference_session,\\n...         model_outputs,\\n...         original_sizes=inputs.original_sizes,  # Required for streaming inference\\n...     )\\n...     streaming_outputs_per_frame[frame_idx] = processed_outputs\\n...\\n...     if (frame_idx + 1) % 10 == 0:\\n...         print(f\"Processed {frame_idx + 1} frames...\")\\n\\n>>> print(f\"✓ Streaming inference complete! Processed {len(streaming_outputs_per_frame)} frames\")\\n✓ Streaming inference complete! Processed 50 frames\\n\\n>>> # Access results\\n>>> frame_0_outputs = streaming_outputs_per_frame[0]\\n>>> print(f\"Detected {len(frame_0_outputs[\\'object_ids\\'])} objects in first frame\")\\n>>> print(f\"Boxes are in XYXY format (absolute pixel coordinates): {frame_0_outputs[\\'boxes\\'].shape}\")\\n>>> print(f\"Masks are at original video resolution: {frame_0_outputs[\\'masks\\'].shape}\")\\n```\\n\\n\\n⚠️ **Note on Streaming Inference Quality**: Streaming inference disables hotstart heuristics that remove unmatched and duplicate objects, as these require access to future frames to make informed decisions. 
This may result in more false positive detections and duplicate object tracks compared to pre-loaded video inference. For best results, use pre-loaded video inference when all frames are available.\\n\\n\\n### SAM3 Tracker - Promptable Visual Segmentation (PVS) for Images\\n\\nSam3Tracker performs Promptable Visual Segmentation (PVS) on images, taking interactive visual prompts (points, boxes, masks) to segment a **specific object instance** per prompt. It is an updated version of SAM2 that maintains the same API while providing improved performance, making it a drop-in replacement for SAM2 workflows.\\n\\n#### Automatic Mask Generation with Pipeline\\n\\n```python\\n>>> from transformers import pipeline\\n\\n>>> generator = pipeline(\"mask-generation\", model=\"facebook/sam3\", device=0)\\n>>> image_url = \"\\n>>> outputs = generator(image_url, points_per_batch=64)\\n\\n>>> len(outputs[\"masks\"])  # Number of masks generated\\n```\\n\\n#### Basic Image Segmentation\\n\\n##### Single Point Click\\n\\n```python\\n>>> from transformers import Sam3TrackerProcessor, Sam3TrackerModel\\n>>> from accelerate import Accelerator\\n>>> import torch\\n>>> from PIL import Image\\n>>> import requests\\n\\n>>> device = Accelerator().device\\n\\n>>> model = Sam3TrackerModel.from_pretrained(\"facebook/sam3\").to(device)\\n>>> processor = Sam3TrackerProcessor.from_pretrained(\"facebook/sam3\")\\n\\n>>> image_url = \"\\n>>> raw_image = Image.open(requests.get(image_url, stream=True).raw).convert(\"RGB\")\\n\\n>>> input_points = [[[[500, 375]]]]  # Single point click, 4 dimensions (image_dim, object_dim, point_per_object_dim, coordinates)\\n>>> input_labels = [[[1]]]  # 1 for positive click, 0 for negative click, 3 dimensions (image_dim, object_dim, point_label)\\n\\n>>> inputs = processor(images=raw_image, input_points=input_points, input_labels=input_labels, return_tensors=\"pt\").to(model.device)\\n\\n>>> with torch.no_grad():\\n...     
outputs = model(**inputs)\\n\\n>>> masks = processor.post_process_masks(outputs.pred_masks.cpu(), inputs[\"original_sizes\"])[0]\\n\\n>>> # The model outputs multiple mask predictions ranked by quality score\\n>>> print(f\"Generated {masks.shape[1]} masks with shape {masks.shape}\")\\n```\\n\\n##### Multiple Points for Refinement\\n\\n```python\\n>>> # Add both positive and negative points to refine the mask\\n>>> input_points = [[[[500, 375], [1125, 625]]]]  # Multiple points for refinement\\n>>> input_labels = [[[1, 1]]]  # Both positive clicks\\n\\n>>> inputs = processor(images=raw_image, input_points=input_points, input_labels=input_labels, return_tensors=\"pt\").to(device)\\n\\n>>> with torch.no_grad():\\n...     outputs = model(**inputs)\\n\\n>>> masks = processor.post_process_masks(outputs.pred_masks.cpu(), inputs[\"original_sizes\"])[0]\\n```\\n\\n##### Bounding Box Input\\n\\n```python\\n>>> # Define bounding box as [x_min, y_min, x_max, y_max]\\n>>> input_boxes = [[[75, 275, 1725, 850]]]\\n\\n>>> inputs = processor(images=raw_image, input_boxes=input_boxes, return_tensors=\"pt\").to(device)\\n\\n>>> with torch.no_grad():\\n...     outputs = model(**inputs)\\n\\n>>> masks = processor.post_process_masks(outputs.pred_masks.cpu(), inputs[\"original_sizes\"])[0]\\n```\\n\\n##### Multiple Objects Segmentation\\n\\n```python\\n>>> # Define points for two different objects\\n>>> input_points = [[[[500, 375]], [[650, 750]]]]  # Points for two objects in same image\\n>>> input_labels = [[[1], [1]]]  # Positive clicks for both objects\\n\\n>>> inputs = processor(images=raw_image, input_points=input_points, input_labels=input_labels, return_tensors=\"pt\").to(model.device)\\n\\n>>> with torch.no_grad():\\n...     
outputs = model(**inputs, multimask_output=False)\\n\\n>>> # Each object gets its own mask\\n>>> masks = processor.post_process_masks(outputs.pred_masks.cpu(), inputs[\"original_sizes\"])[0]\\n>>> print(f\"Generated masks for {masks.shape[0]} objects\")\\nGenerated masks for 2 objects\\n```\\n\\n#### Batch Inference\\n\\n\\n```python\\n>>> # Load multiple images\\n>>> image_urls = [\\n...     \"\\n...     \"\\n... ]\\n>>> raw_images = [Image.open(requests.get(url, stream=True).raw).convert(\"RGB\") for url in image_urls]\\n\\n>>> # Single point per image\\n>>> input_points = [[[[500, 375]]], [[[770, 200]]]]  # One point for each image\\n>>> input_labels = [[[1]], [[1]]]  # Positive clicks for both images\\n\\n>>> inputs = processor(images=raw_images, input_points=input_points, input_labels=input_labels, return_tensors=\"pt\").to(model.device)\\n\\n>>> with torch.no_grad():\\n...     outputs = model(**inputs, multimask_output=False)\\n\\n>>> # Post-process masks for each image\\n>>> all_masks = processor.post_process_masks(outputs.pred_masks.cpu(), inputs[\"original_sizes\"])\\n>>> print(f\"Processed {len(all_masks)} images, each with {all_masks[0].shape[0]} objects\")\\n```\\n\\n### SAM3 Tracker Video - Promptable Visual Segmentation (PVS) for Videos\\n\\nSam3TrackerVideo performs Promptable Visual Segmentation (PVS) on videos, taking interactive visual prompts (points, boxes, masks) to track a **specific object instance** per prompt across video frames. 
It is an updated version of SAM2 Video that maintains the same API while providing improved performance, making it a drop-in replacement for SAM2 Video workflows.\\n\\n#### Basic Video Tracking\\n\\n```python\\n>>> from transformers import Sam3TrackerVideoModel, Sam3TrackerVideoProcessor\\n>>> from accelerate import Accelerator\\n>>> import torch\\n\\n>>> device = Accelerator().device\\n>>> model = Sam3TrackerVideoModel.from_pretrained(\"facebook/sam3\").to(device, dtype=torch.bfloat16)\\n>>> processor = Sam3TrackerVideoProcessor.from_pretrained(\"facebook/sam3\")\\n\\n>>> # Load video frames\\n>>> from transformers.video_utils import load_video\\n>>> video_url = \"\\n>>> video_frames, _ = load_video(video_url)\\n\\n>>> # Initialize video inference session\\n>>> inference_session = processor.init_video_session(\\n...     video=video_frames,\\n...     inference_device=device,\\n...     dtype=torch.bfloat16,\\n... )\\n\\n>>> # Add click on first frame to select object\\n>>> ann_frame_idx = 0\\n>>> ann_obj_id = 1\\n>>> points = [[[[210, 350]]]]\\n>>> labels = [[[1]]]\\n\\n>>> processor.add_inputs_to_inference_session(\\n...     inference_session=inference_session,\\n...     frame_idx=ann_frame_idx,\\n...     obj_ids=ann_obj_id,\\n...     input_points=points,\\n...     input_labels=labels,\\n... )\\n\\n>>> # Segment the object on the first frame (optional, you can also propagate the masks through the video directly)\\n>>> outputs = model(\\n...     inference_session=inference_session,\\n...     frame_idx=ann_frame_idx,\\n... )\\n>>> video_res_masks = processor.post_process_masks(\\n...     [outputs.pred_masks], original_sizes=[[inference_session.video_height, inference_session.video_width]], binarize=False\\n... 
)[0]\\n>>> print(f\"Segmentation shape: {video_res_masks.shape}\")\\nSegmentation shape: torch.Size([1, 1, 480, 854])\\n\\n>>> # Propagate through the entire video\\n>>> video_segments = {}\\n>>> for sam3_tracker_video_output in model.propagate_in_video_iterator(inference_session):\\n...     video_res_masks = processor.post_process_masks(\\n...         [sam3_tracker_video_output.pred_masks], original_sizes=[[inference_session.video_height, inference_session.video_width]], binarize=False\\n...     )[0]\\n...     video_segments[sam3_tracker_video_output.frame_idx] = video_res_masks\\n\\n>>> print(f\"Tracked object through {len(video_segments)} frames\")\\nTracked object through 180 frames\\n```\\n\\n#### Multi-Object Video Tracking\\n\\nTrack multiple objects simultaneously across video frames:\\n\\n```python\\n>>> # Reset for new tracking session\\n>>> inference_session.reset_inference_session()\\n\\n>>> # Add multiple objects on the first frame\\n>>> ann_frame_idx = 0\\n>>> obj_ids = [2, 3]\\n>>> input_points = [[[[200, 300]], [[400, 150]]]]  # Points for two objects (batched)\\n>>> input_labels = [[[1], [1]]]\\n\\n>>> processor.add_inputs_to_inference_session(\\n...     inference_session=inference_session,\\n...     frame_idx=ann_frame_idx,\\n...     obj_ids=obj_ids,\\n...     input_points=input_points,\\n...     input_labels=input_labels,\\n... )\\n\\n>>> # Get masks for both objects on first frame (optional, you can also propagate the masks through the video directly)\\n>>> outputs = model(\\n...     inference_session=inference_session,\\n...     frame_idx=ann_frame_idx,\\n... )\\n\\n>>> # Propagate both objects through video\\n>>> video_segments = {}\\n>>> for sam3_tracker_video_output in model.propagate_in_video_iterator(inference_session):\\n...     video_res_masks = processor.post_process_masks(\\n...         [sam3_tracker_video_output.pred_masks], original_sizes=[[inference_session.video_height, inference_session.video_width]], binarize=False\\n...     
)[0]\\n...     video_segments[sam3_tracker_video_output.frame_idx] = {\\n...         obj_id: video_res_masks[i]\\n...         for i, obj_id in enumerate(inference_session.obj_ids)\\n...     }\\n\\n>>> print(f\"Tracked {len(inference_session.obj_ids)} objects through {len(video_segments)} frames\")\\nTracked 2 objects through 180 frames\\n```\\n\\n#### Streaming Video Inference\\n\\nFor real-time applications, Sam3TrackerVideo supports processing video frames as they arrive:\\n\\n```python\\n>>> # Initialize session for streaming\\n>>> inference_session = processor.init_video_session(\\n...     inference_device=device,\\n...     dtype=torch.bfloat16,\\n... )\\n\\n>>> # Process frames one by one\\n>>> for frame_idx, frame in enumerate(video_frames[:10]):  # Process first 10 frames\\n...     inputs = processor(images=frame, device=device, return_tensors=\"pt\")\\n...\\n...     if frame_idx == 0:\\n...         # Add point input on first frame\\n...         processor.add_inputs_to_inference_session(\\n...             inference_session=inference_session,\\n...             frame_idx=0,\\n...             obj_ids=1,\\n...             input_points=[[[[210, 350], [250, 220]]]],\\n...             input_labels=[[[1, 1]]],\\n...             original_size=inputs.original_sizes[0], # need to be provided when using streaming video inference\\n...         )\\n...\\n...     # Process current frame\\n...     sam3_tracker_video_output = model(inference_session=inference_session, frame=inputs.pixel_values[0])\\n...\\n...     video_res_masks = processor.post_process_masks(\\n...         [sam3_tracker_video_output.pred_masks], original_sizes=inputs.original_sizes, binarize=False\\n...     )[0]\\n...     print(f\"Frame {frame_idx}: mask shape {video_res_masks.shape}\")\\n```',\n",
       "  'domain': 'mask-generation'},\n",
       " {'model_id': 'lightx2v/Wan2.2-Distill-Loras',\n",
       "  'created_at': '2025-10-16T07:23:10+00:00',\n",
       "  'downloads': 1284058,\n",
       "  'likes': 191,\n",
       "  'author': None,\n",
       "  'tags': ['diffusers',\n",
       "   'diffusion-single-file',\n",
       "   'comfyui',\n",
       "   'distillation',\n",
       "   'LoRA',\n",
       "   'video',\n",
       "   'video genration',\n",
       "   'lora',\n",
       "   'image-to-video',\n",
       "   'base_model:Wan-AI/Wan2.2-I2V-A14B',\n",
       "   'base_model:adapter:Wan-AI/Wan2.2-I2V-A14B',\n",
       "   'license:apache-2.0',\n",
       "   'region:us'],\n",
       "  'modelcard': '\\n# 🎬 Wan2.2 Distilled LoRA Models\\n\\n### ⚡ High-Performance Video Generation with 4-Step Inference Using LoRA\\n\\n*LoRA weights extracted from Wan2.2 distilled models - Flexible deployment with excellent generation quality*\\n\\n\\n\\n---\\n\\n\\n\\n\\n\\n---\\n\\n## 🌟 What\\'s Special?\\n\\n\\n\\n\\n\\n### ⚡ Flexible Deployment\\n- **Base Model + LoRA**: Can be combined with base models\\n- **Offline Merging**: Pre-merge LoRA into models\\n- **Online Loading**: Dynamically load LoRA during inference\\n- **Multiple Frameworks**: Supports LightX2V and ComfyUI\\n\\n\\n\\n\\n### 🎯 Dual Noise Control\\n- **High Noise**: More creative, diverse outputs\\n- **Low Noise**: More faithful to input, stable outputs\\n- Rank 64 LoRA, compact size\\n\\n\\n\\n\\n\\n\\n### 💾 Storage Efficient\\n- **Small LoRA Size**: Significantly smaller than full models\\n- **Flexible Combination**: Can be combined with quantization\\n- **Easy Sharing**: Convenient for model weight distribution\\n\\n\\n\\n\\n### 🚀 4-Step Inference\\n- **Ultra-Fast Generation**: Generate high-quality videos in just 4 steps\\n- **Distillation Acceleration**: Inherits advantages of distilled models\\n- **Quality Assurance**: Maintains excellent generation quality\\n\\n\\n\\n\\n\\n---\\n\\n## 📦 LoRA Model Catalog\\n\\n### 🎥 Available LoRA Models\\n\\n| Task Type | Noise Level | Model File | Rank | Purpose |\\n|:-------:|:--------:|:---------|:----:|:-----|\\n| **I2V** | High Noise | `wan2.2_i2v_A14b_high_noise_lora_rank64_lightx2v_4step_xxx.safetensors` | 64 | More creative image-to-video |\\n| **I2V** | Low Noise | `wan2.2_i2v_A14b_low_noise_lora_rank64_lightx2v_4step_xxx.safetensors` | 64 | More stable image-to-video |\\n\\n> 💡 **Note**: \\n> - `xxx` in filenames represents version number or timestamp, please check  for the latest version\\n> - These LoRAs must be used with Wan2.2 base models\\n\\n---\\n\\n## 🚀 Usage\\n\\n### Prerequisites\\n\\n**Base Model**: You need to prepare 
Wan2.2 I2V base model (original model without distillation)\\n\\nDownload base model (choose one):\\n\\n**Method 1: From LightX2V Official Repository (Recommended)**\\n```bash\\n# Download high noise base model\\nhuggingface-cli download lightx2v/Wan2.2-Official-Models \\\\\\n    wan2.2_i2v_A14b_high_noise_lightx2v.safetensors \\\\\\n    --local-dir ./models/Wan2.2-Official-Models\\n\\n# Download low noise base model\\nhuggingface-cli download lightx2v/Wan2.2-Official-Models \\\\\\n    wan2.2_i2v_A14b_low_noise_lightx2v.safetensors \\\\\\n    --local-dir ./models/Wan2.2-Official-Models\\n```\\n\\n**Method 2: From Wan-AI Official Repository**\\n```bash\\nhuggingface-cli download Wan-AI/Wan2.2-I2V-A14B \\\\\\n    --local-dir ./models/Wan2.2-I2V-A14B\\n```\\n\\n> 💡 **Note**:  provides separate high noise and low noise base models, download as needed\\n\\n### Method 1: LightX2V - Offline LoRA Merging (Recommended ⭐)\\n\\n**Offline LoRA merging provides best performance and supports quantization simultaneously.**\\n\\n#### 1.1 Download LoRA Models\\n\\n```bash\\n# Download both LoRAs (high noise and low noise)\\n# Note: xxx represents version number, please check HuggingFace for actual filename\\nhuggingface-cli download lightx2v/Wan2.2-Distill-Loras \\\\\\n    wan2.2_i2v_A14b_high_noise_lora_rank64_lightx2v_4step_xxx.safetensors \\\\\\n    wan2.2_i2v_A14b_low_noise_lora_rank64_lightx2v_4step_xxx.safetensors \\\\\\n    --local-dir ./loras/\\n```\\n\\n#### 1.2 Merge LoRA (Basic Merging)\\n\\n**Merge LoRA:**\\n```bash\\ncd LightX2V/tools/convert\\n\\n# For directory-based base model: --source /path/to/Wan2.2-I2V-A14B/high_noise_model/\\npython converter.py \\\\\\n    --source ./models/Wan2.2-Official-Models/wan2.2_i2v_A14b_high_noise_lightx2v.safetensors \\\\\\n    --output /path/to/output/ \\\\\\n    --output_ext .safetensors \\\\\\n    --output_name wan2.2_i2v_A14b_high_noise_lightx2v_4step \\\\\\n    --model_type wan_dit \\\\\\n    --lora_path 
/path/to/loras/wan2.2_i2v_A14b_high_noise_lora_rank64_lightx2v_4step_xxx.safetensors \\\\\\n    --lora_strength 1.0 \\\\\\n    --single_file\\n\\n# For directory-based base model: --source /path/to/Wan2.2-I2V-A14B/low_noise_model/\\npython converter.py \\\\\\n    --source ./models/Wan2.2-Official-Models/wan2.2_i2v_A14b_low_noise_lightx2v.safetensors \\\\\\n    --output /path/to/output/ \\\\\\n    --output_ext .safetensors \\\\\\n    --output_name wan2.2_i2v_A14b_low_noise_lightx2v_4step \\\\\\n    --model_type wan_dit \\\\\\n    --lora_path /path/to/loras/wan2.2_i2v_A14b_low_noise_lora_rank64_lightx2v_4step_xxx.safetensors \\\\\\n    --lora_strength 1.0 \\\\\\n    --single_file\\n```\\n\\n#### 1.3 Merge LoRA + Quantization (Recommended)\\n\\n**Merge LoRA + FP8 Quantization:**\\n```bash\\ncd LightX2V/tools/convert\\n\\n# For directory-based base model: --source /path/to/Wan2.2-I2V-A14B/high_noise_model/\\npython converter.py \\\\\\n    --source ./models/Wan2.2-Official-Models/wan2.2_i2v_A14b_high_noise_lightx2v.safetensors \\\\\\n    --output /path/to/output/ \\\\\\n    --output_ext .safetensors \\\\\\n    --output_name wan2.2_i2v_A14b_high_noise_scaled_fp8_e4m3_lightx2v_4step \\\\\\n    --model_type wan_dit \\\\\\n    --lora_path /path/to/loras/wan2.2_i2v_A14b_high_noise_lora_rank64_lightx2v_4step_xxx.safetensors \\\\\\n    --lora_strength 1.0 \\\\\\n    --quantized \\\\\\n    --linear_dtype torch.float8_e4m3fn \\\\\\n    --non_linear_dtype torch.bfloat16 \\\\\\n    --single_file\\n\\n# For directory-based base model: --source /path/to/Wan2.2-I2V-A14B/low_noise_model/\\npython converter.py \\\\\\n    --source ./models/Wan2.2-Official-Models/wan2.2_i2v_A14b_low_noise_lightx2v.safetensors \\\\\\n    --output /path/to/output/ \\\\\\n    --output_ext .safetensors \\\\\\n    --output_name wan2.2_i2v_A14b_low_noise_scaled_fp8_e4m3_lightx2v_4step \\\\\\n    --model_type wan_dit \\\\\\n    --lora_path 
/path/to/loras/wan2.2_i2v_A14b_low_noise_lora_rank64_lightx2v_4step_xxx.safetensors \\\\\\n    --lora_strength 1.0 \\\\\\n    --quantized \\\\\\n    --linear_dtype torch.float8_e4m3fn \\\\\\n    --non_linear_dtype torch.bfloat16 \\\\\\n    --single_file\\n```\\n\\n**Merge LoRA + ComfyUI FP8 Format:**\\n```bash\\ncd LightX2V/tools/convert\\n\\n# For directory-based base model: --source /path/to/Wan2.2-I2V-A14B/high_noise_model/\\npython converter.py \\\\\\n    --source ./models/Wan2.2-Official-Models/wan2.2_i2v_A14b_high_noise_lightx2v.safetensors \\\\\\n    --output /path/to/output/ \\\\\\n    --output_ext .safetensors \\\\\\n    --output_name wan2.2_i2v_A14b_high_noise_scaled_fp8_e4m3_lightx2v_4step_comfyui \\\\\\n    --model_type wan_dit \\\\\\n    --lora_path /path/to/loras/wan2.2_i2v_A14b_high_noise_lora_rank64_lightx2v_4step_xxx.safetensors \\\\\\n    --lora_strength 1.0 \\\\\\n    --quantized \\\\\\n    --linear_dtype torch.float8_e4m3fn \\\\\\n    --non_linear_dtype torch.bfloat16 \\\\\\n    --single_file \\\\\\n    --comfyui_mode\\n\\n# For directory-based base model: --source /path/to/Wan2.2-I2V-A14B/low_noise_model/\\npython converter.py \\\\\\n    --source ./models/Wan2.2-Official-Models/wan2.2_i2v_A14b_low_noise_lightx2v.safetensors \\\\\\n    --output /path/to/output/ \\\\\\n    --output_ext .safetensors \\\\\\n    --output_name wan2.2_i2v_A14b_low_noise_scaled_fp8_e4m3_lightx2v_4step_comfyui \\\\\\n    --model_type wan_dit \\\\\\n    --lora_path /path/to/loras/wan2.2_i2v_A14b_low_noise_lora_rank64_lightx2v_4step_xxx.safetensors \\\\\\n    --lora_strength 1.0 \\\\\\n    --quantized \\\\\\n    --linear_dtype torch.float8_e4m3fn \\\\\\n    --non_linear_dtype torch.bfloat16 \\\\\\n    --single_file \\\\\\n    --comfyui_mode\\n```\\n\\n> 📝 **Reference Documentation**: For more merging options, see \\n\\n---\\n\\n### Method 2: LightX2V - Online LoRA Loading\\n\\n**Online LoRA loading requires no pre-merging, loads dynamically during inference, more 
flexible.**\\n\\n#### 2.1 Download LoRA Models\\n\\n```bash\\n# Download both LoRAs (high noise and low noise)\\n# Note: xxx represents version number, please check HuggingFace for actual filename\\nhuggingface-cli download lightx2v/Wan2.2-Distill-Loras \\\\\\n    wan2.2_i2v_A14b_high_noise_lora_rank64_lightx2v_4step_xxx.safetensors \\\\\\n    wan2.2_i2v_A14b_low_noise_lora_rank64_lightx2v_4step_xxx.safetensors \\\\\\n    --local-dir ./loras/\\n```\\n\\n#### 2.2 Use Configuration File\\n\\nReference configuration file: \\n\\nLoRA configuration example in config file:\\n```json\\n{\\n    \"lora_configs\": [\\n        {\\n            \"name\": \"high_noise_model\",\\n            \"path\": \"/path/to/loras/wan2.2_i2v_A14b_high_noise_lora_rank64_lightx2v_4step_xxx.safetensors\",\\n            \"strength\": 1.0\\n        },\\n        {\\n            \"name\": \"low_noise_model\",\\n            \"path\": \"/path/to/loras/wan2.2_i2v_A14b_low_noise_lora_rank64_lightx2v_4step_xxx.safetensors\",\\n            \"strength\": 1.0\\n        }\\n    ]\\n}\\n```\\n\\n> 💡 **Tip**: Replace `xxx` with actual version number (e.g., `1022`). Check  for the latest version\\n\\n\\n#### 2.3 Run Inference\\n\\nUsing  as example:\\n```bash\\ncd scripts\\nbash wan22/run_wan22_moe_i2v_distill.sh\\n```\\n\\n### Method 3: ComfyUI\\n\\nPlease refer to \\n\\n## ⚠️ Important Notes\\n\\n1. **Base Model Requirement**: These LoRAs must be used with Wan2.2-I2V-A14B base model, cannot be used standalone\\n\\n2. **Other Components**: In addition to DIT model and LoRA, the following are also required at runtime:\\n   - T5 text encoder\\n   - CLIP vision encoder\\n   - VAE encoder/decoder\\n   - Tokenizer\\n   \\n   Please refer to  for how to organize complete model directory\\n\\n3. 
**Inference Configuration**: When using 4-step inference, configure correct `denoising_step_list`, recommended: `[1000, 750, 500, 250]`\\n\\n\\n## 📚 Related Resources\\n\\n### Documentation Links\\n- **LightX2V Quick Start**: \\n- **Model Conversion Tool**: \\n- **Online LoRA Loading**: \\n- **Quantization Guide**: \\n- **Model Structure**: \\n\\n### Related Models\\n- **Distilled Full Models**: \\n- **Wan2.2 Official Models**:  - Contains high noise and low noise base models\\n- **Base Model (Wan-AI)**: \\n\\n## 🤝 Community & Support\\n\\n- **GitHub Issues**: \\n- **HuggingFace**: \\n- **LightX2V Homepage**: \\n\\nIf you find this project helpful, please give us a ⭐ on ',\n",
       "  'domain': 'image-to-video'},\n",
       " {'model_id': 'answerdotai/ModernBERT-base',\n",
       "  'created_at': '2024-12-11T11:38:06+00:00',\n",
       "  'downloads': 743351,\n",
       "  'likes': 986,\n",
       "  'author': None,\n",
       "  'tags': ['transformers',\n",
       "   'pytorch',\n",
       "   'onnx',\n",
       "   'safetensors',\n",
       "   'modernbert',\n",
       "   'fill-mask',\n",
       "   'masked-lm',\n",
       "   'long-context',\n",
       "   'en',\n",
       "   'arxiv:2412.13663',\n",
       "   'license:apache-2.0',\n",
       "   'deploy:azure',\n",
       "   'region:us'],\n",
       "  'modelcard': '\\n# ModernBERT\\n\\n## Table of Contents\\n1. \\n2. \\n3. \\n4. \\n5. \\n6. \\n7. \\n\\n## Model Summary\\n\\nModernBERT is a modernized bidirectional encoder-only Transformer model (BERT-style) pre-trained on 2 trillion tokens of English and code data with a native context length of up to 8,192 tokens. ModernBERT leverages recent architectural improvements such as:\\n\\n- **Rotary Positional Embeddings (RoPE)** for long-context support.  \\n- **Local-Global Alternating Attention** for efficiency on long inputs.  \\n- **Unpadding and Flash Attention** for efficient inference.  \\n\\nModernBERT’s native long context length makes it ideal for tasks that require processing long documents, such as retrieval, classification, and semantic search within large corpora. The model was trained on a large corpus of text and code, making it suitable for a wide range of downstream tasks, including code retrieval and hybrid (text + code) semantic search.\\n\\nIt is available in the following sizes:\\n\\n-  - 22 layers, 149 million parameters\\n-  - 28 layers, 395 million parameters\\n\\nFor more information about ModernBERT, we recommend our  for a high-level overview, and our  for in-depth information.\\n\\n*ModernBERT is a collaboration between , , and friends.*\\n\\n## Usage\\n\\nYou can use these models directly with the `transformers` library starting from v4.48.0:\\n\\n```sh\\npip install -U transformers>=4.48.0\\n```\\n\\nSince ModernBERT is a Masked Language Model (MLM), you can use the `fill-mask` pipeline or load it via `AutoModelForMaskedLM`. To use ModernBERT for downstream tasks like classification, retrieval, or QA, fine-tune it following standard BERT fine-tuning recipes.\\n\\n**⚠️ If your GPU supports it, we recommend using ModernBERT with Flash Attention 2 to reach the highest efficiency. 
To do so, install Flash Attention as follows, then use the model as normal:**\\n\\n```bash\\npip install flash-attn\\n```\\n\\nUsing `AutoModelForMaskedLM`:\\n\\n```python\\nfrom transformers import AutoTokenizer, AutoModelForMaskedLM\\n\\nmodel_id = \"answerdotai/ModernBERT-base\"\\ntokenizer = AutoTokenizer.from_pretrained(model_id)\\nmodel = AutoModelForMaskedLM.from_pretrained(model_id)\\n\\ntext = \"The capital of France is [MASK].\"\\ninputs = tokenizer(text, return_tensors=\"pt\")\\noutputs = model(**inputs)\\n\\n# To get predictions for the mask:\\nmasked_index = inputs[\"input_ids\"][0].tolist().index(tokenizer.mask_token_id)\\npredicted_token_id = outputs.logits[0, masked_index].argmax(axis=-1)\\npredicted_token = tokenizer.decode(predicted_token_id)\\nprint(\"Predicted token:\", predicted_token)\\n# Predicted token:  Paris\\n```\\n\\nUsing a pipeline:\\n\\n```python\\nimport torch\\nfrom transformers import pipeline\\nfrom pprint import pprint\\n\\npipe = pipeline(\\n    \"fill-mask\",\\n    model=\"answerdotai/ModernBERT-base\",\\n    torch_dtype=torch.bfloat16,\\n)\\n\\ninput_text = \"He walked to the [MASK].\"\\nresults = pipe(input_text)\\npprint(results)\\n```\\n\\n**Note:** ModernBERT does not use token type IDs, unlike some earlier BERT models. 
Most downstream usage is identical to standard BERT models on the Hugging Face Hub, except you can omit the `token_type_ids` parameter.\\n\\n## Evaluation\\n\\nWe evaluate ModernBERT across a range of tasks, including natural language understanding (GLUE), general retrieval (BEIR), long-context retrieval (MLDR), and code retrieval (CodeSearchNet and StackQA).\\n\\n**Key highlights:**\\n- On GLUE, ModernBERT-base surpasses other similarly-sized encoder models, and ModernBERT-large is second only to Deberta-v3-large.\\n- For general retrieval tasks, ModernBERT performs well on BEIR in both single-vector (DPR-style) and multi-vector (ColBERT-style) settings.\\n- Thanks to the inclusion of code data in its training mixture, ModernBERT as a backbone also achieves new state-of-the-art code retrieval results on CodeSearchNet and StackQA.\\n\\n### Base Models\\n\\n| Model       | IR (DPR)     | IR (DPR)     | IR (DPR)     | IR (ColBERT)  | IR (ColBERT)  | NLU  | Code | Code |\\n|-------------|--------------|--------------|--------------|---------------|---------------|------|------|------|\\n|             | BEIR         | MLDR_OOD     | MLDR_ID      | BEIR          | MLDR_OOD      | GLUE | CSN  | SQA  |\\n| BERT        | 38.9         | 23.9         | 32.2         | 49.0          | 28.1          | 84.7 | 41.2 | 59.5 |\\n| RoBERTa     | 37.7         | 22.9         | 32.8         | 48.7          | 28.2          | 86.4 | 44.3 | 59.6 |\\n| DeBERTaV3   | 20.2         | 5.4          | 13.4         | 47.1          | 21.9          | 88.1 | 17.5 | 18.6 |\\n| NomicBERT   | 41.0         | 26.7         | 30.3         | 49.9          | 61.3          | 84.0 | 41.6 | 61.4 |\\n| GTE-en-MLM  | 41.4         | **34.3**    |**44.4**   | 48.2          | 69.3          | 85.6 | 44.9 | 71.4 |\\n| ModernBERT  | **41.6**    | 27.4         | 44.0         | **51.3**    | **80.2**      | **88.4** | **56.4** |**73.6**|\\n\\n---\\n\\n### Large Models\\n\\n| Model       | IR (DPR)     | IR (DPR)     | IR 
(DPR)     | IR (ColBERT)  | IR (ColBERT)  | NLU  | Code | Code |\\n|-------------|--------------|--------------|--------------|---------------|---------------|------|------|------|\\n|             | BEIR         | MLDR_OOD     | MLDR_ID      | BEIR          | MLDR_OOD      | GLUE | CSN  | SQA  |\\n| BERT        | 38.9         | 23.3         | 31.7         | 49.5          | 28.5          | 85.2 | 41.6 | 60.8 |\\n| RoBERTa     | 41.4         | 22.6         | 36.1         | 49.8          | 28.8          | 88.9 | 47.3 | 68.1 |\\n| DeBERTaV3   | 25.6         | 7.1          | 19.2         | 46.7          | 23.0          | **91.4**| 21.2 | 19.7 |\\n| GTE-en-MLM  | 42.5         | **36.4**    | **48.9**  | 50.7          | 71.3          | 87.6 | 40.5 | 66.9 |\\n| ModernBERT  | **44.0**    | 34.3         | 48.6         | **52.4**     | **80.4**     | 90.4 |**59.5** |**83.9**|\\n\\n*Table 1: Results for all models across an overview of all tasks. CSN refers to CodeSearchNet and SQA to StackQA. MLDRID refers to in-domain (fine-tuned on the training set) evaluation, and MLDR_OOD to out-of-domain.*\\n\\nModernBERT’s strong results, coupled with its efficient runtime on long-context inputs, demonstrate that encoder-only models can be significantly improved through modern architectural choices and extensive pretraining on diversified data sources.\\n\\n\\n## Limitations\\n\\nModernBERT’s training data is primarily English and code, so performance may be lower for other languages. While it can handle long sequences efficiently, using the full 8,192 tokens window may be slower than short-context inference. Like any large language model, ModernBERT may produce representations that reflect biases present in its training data. 
Verify critical or sensitive outputs before relying on them.\\n\\n## Training\\n\\n- Architecture: Encoder-only, Pre-Norm Transformer with GeGLU activations.\\n- Sequence Length: Pre-trained up to 1,024 tokens, then extended to 8,192 tokens.\\n- Data: 2 trillion tokens of English text and code.\\n- Optimizer: StableAdamW with trapezoidal LR scheduling and 1-sqrt decay.\\n- Hardware: Trained on 8x H100 GPUs.\\n\\nSee the paper for more details.\\n\\n## License\\n\\nWe release the ModernBERT model architectures, model weights, training codebase under the Apache 2.0 license.\\n\\n## Citation\\n\\nIf you use ModernBERT in your work, please cite:\\n\\n```\\n@misc{modernbert,\\n      title={Smarter, Better, Faster, Longer: A Modern Bidirectional Encoder for Fast, Memory Efficient, and Long Context Finetuning and Inference}, \\n      author={Benjamin Warner and Antoine Chaffin and Benjamin Clavié and Orion Weller and Oskar Hallström and Said Taghadouini and Alexis Gallagher and Raja Biswas and Faisal Ladhak and Tom Aarsen and Nathan Cooper and Griffin Adams and Jeremy Howard and Iacopo Poli},\\n      year={2024},\\n      eprint={2412.13663},\\n      archivePrefix={arXiv},\\n      primaryClass={cs.CL},\\n      url={ \\n}\\n```',\n",
       "  'domain': 'fill-mask'},\n",
       " {'model_id': 'Qwen/Qwen3-Reranker-8B',\n",
       "  'created_at': '2025-05-29T13:30:18+00:00',\n",
       "  'downloads': 670384,\n",
       "  'likes': 209,\n",
       "  'author': None,\n",
       "  'tags': ['transformers',\n",
       "   'safetensors',\n",
       "   'qwen3',\n",
       "   'text-generation',\n",
       "   'text-ranking',\n",
       "   'arxiv:2506.05176',\n",
       "   'base_model:Qwen/Qwen3-8B-Base',\n",
       "   'base_model:finetune:Qwen/Qwen3-8B-Base',\n",
       "   'license:apache-2.0',\n",
       "   'text-embeddings-inference',\n",
       "   'endpoints_compatible',\n",
       "   'region:us'],\n",
       "  'modelcard': '# Qwen3-Reranker-8B\\n\\n\\n    \\n\\n\\n## Highlights\\n\\nThe Qwen3 Embedding model series is the latest proprietary model of the Qwen family, specifically designed for text embedding and ranking tasks. Building upon the dense foundational models of the Qwen3 series, it provides a comprehensive range of text embeddings and reranking models in various sizes (0.6B, 4B, and 8B). This series inherits the exceptional multilingual capabilities, long-text understanding, and reasoning skills of its foundational model. The Qwen3 Embedding series represents significant advancements in multiple text embedding and ranking tasks, including text retrieval, code retrieval, text classification, text clustering, and bitext mining.\\n\\n**Exceptional Versatility**: The embedding model has achieved state-of-the-art performance across a wide range of downstream application evaluations. The 8B size embedding model ranks No.1 in the MTEB multilingual leaderboard (as of June 5, 2025, score 70.58), while the reranking model excels in various text retrieval scenarios.\\n\\n**Comprehensive Flexibility**: The Qwen3 Embedding series offers a full spectrum of sizes (from 0.6B to 8B) for both embedding and reranking models, catering to diverse use cases that prioritize efficiency and effectiveness. Developers can seamlessly combine these two modules. Additionally, the embedding model allows for flexible vector definitions across all dimensions, and both embedding and reranking models support user-defined instructions to enhance performance for specific tasks, languages, or scenarios.\\n\\n**Multilingual Capability**: The Qwen3 Embedding series offer support for over 100 languages, thanks to the multilingual capabilites of Qwen3 models. 
This includes various programming languages, and provides robust multilingual, cross-lingual, and code retrieval capabilities.\\n\\n\\n## Model Overview\\n\\n**Qwen3-Reranker-8B** has the following features:\\n\\n- Model Type: Text Reranking\\n- Supported Languages: 100+ Languages\\n- Number of Paramaters: 8B\\n- Context Length: 32k\\n\\nFor more details, including benchmark evaluation, hardware requirements, and inference performance, please refer to our , .\\n\\n## Qwen3 Embedding Series Model list\\n\\n| Model Type       | Models               | Size | Layers | Sequence Length | Embedding Dimension | MRL Support | Instruction Aware |\\n|------------------|----------------------|------|--------|-----------------|---------------------|-------------|----------------|\\n| Text Embedding   |  | 0.6B | 28     | 32K             | 1024                | Yes         | Yes            |\\n| Text Embedding   |    | 4B   | 36     | 32K             | 2560                | Yes         | Yes            |\\n| Text Embedding   |    | 8B   | 36     | 32K             | 4096                | Yes         | Yes            |\\n| Text Reranking   |  | 0.6B | 28     | 32K             | -                   | -           | Yes            |\\n| Text Reranking   |    | 4B   | 36     | 32K             | -                   | -           | Yes            |\\n| Text Reranking   |    | 8B   | 36     | 32K             | -                   | -           | Yes            |\\n\\n> **Note**:\\n> - `MRL Support` indicates whether the embedding model supports custom dimensions for the final embedding. \\n> - `Instruction Aware` notes whether the embedding or reranking model supports customizing the input instruction according to different tasks.\\n> - Our evaluation indicates that, for most downstream tasks, using instructions (instruct) typically yields an improvement of 1% to 5% compared to not using them. 
Therefore, we recommend that developers create tailored instructions specific to their tasks and scenarios. In multilingual contexts, we also advise users to write their instructions in English, as most instructions utilized during the model training process were originally written in English.\\n\\n\\n## Usage\\n\\nWith Transformers versions earlier than 4.51.0, you may encounter the following error:\\n```\\nKeyError: \\'qwen3\\'\\n```\\n\\n### Transformers Usage\\n\\n```python\\n# Requires transformers>=4.51.0\\nimport torch\\nfrom transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM\\n\\ndef format_instruction(instruction, query, doc):\\n    if instruction is None:\\n        instruction = \\'Given a web search query, retrieve relevant passages that answer the query\\'\\n    output = \": {instruction}\\\\n: {query}\\\\n: {doc}\".format(instruction=instruction,query=query, doc=doc)\\n    return output\\n\\ndef process_inputs(pairs):\\n    inputs = tokenizer(\\n        pairs, padding=False, truncation=\\'longest_first\\',\\n        return_attention_mask=False, max_length=max_length - len(prefix_tokens) - len(suffix_tokens)\\n    )\\n    for i, ele in enumerate(inputs[\\'input_ids\\']):\\n        inputs[\\'input_ids\\'][i] = prefix_tokens + ele + suffix_tokens\\n    inputs = tokenizer.pad(inputs, padding=True, return_tensors=\"pt\", max_length=max_length)\\n    for key in inputs:\\n        inputs[key] = inputs[key].to(model.device)\\n    return inputs\\n\\n@torch.no_grad()\\ndef compute_logits(inputs, **kwargs):\\n    batch_scores = model(**inputs).logits[:, -1, :]\\n    true_vector = batch_scores[:, token_true_id]\\n    false_vector = batch_scores[:, token_false_id]\\n    batch_scores = torch.stack([false_vector, true_vector], dim=1)\\n    batch_scores = torch.nn.functional.log_softmax(batch_scores, dim=1)\\n    scores = batch_scores[:, 1].exp().tolist()\\n    return scores\\n\\ntokenizer = AutoTokenizer.from_pretrained(\"Qwen/Qwen3-Reranker-8B\", 
padding_side=\\'left\\')\\n\\nmodel = AutoModelForCausalLM.from_pretrained(\"Qwen/Qwen3-Reranker-8B\").eval()\\n# We recommend enabling flash_attention_2 for better acceleration and memory saving.\\n# model = AutoModelForCausalLM.from_pretrained(\"Qwen/Qwen3-Reranker-8B\", torch_dtype=torch.float16, attn_implementation=\"flash_attention_2\").cuda().eval()\\n\\ntoken_false_id = tokenizer.convert_tokens_to_ids(\"no\")\\ntoken_true_id = tokenizer.convert_tokens_to_ids(\"yes\")\\nmax_length = 8192\\n\\nprefix = \"system\\\\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \\\\\"yes\\\\\" or \\\\\"no\\\\\".\\\\nuser\\\\n\"\\nsuffix = \"\\\\nassistant\\\\n\\\\n\\\\n\\\\n\\\\n\"\\nprefix_tokens = tokenizer.encode(prefix, add_special_tokens=False)\\nsuffix_tokens = tokenizer.encode(suffix, add_special_tokens=False)\\n        \\ntask = \\'Given a web search query, retrieve relevant passages that answer the query\\'\\n\\nqueries = [\"What is the capital of China?\",\\n    \"Explain gravity\",\\n]\\n\\ndocuments = [\\n    \"The capital of China is Beijing.\",\\n    \"Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun.\",\\n]\\n\\npairs = [format_instruction(task, query, doc) for query, doc in zip(queries, documents)]\\n\\n# Tokenize the input texts\\ninputs = process_inputs(pairs)\\nscores = compute_logits(inputs)\\n\\nprint(\"scores: \", scores)\\n```\\n\\n📌 **Tip**: We recommend that developers customize the `instruct` according to their specific scenarios, tasks, and languages. 
Our tests have shown that in most retrieval scenarios, not using an `instruct` on the query side can lead to a drop in retrieval performance by approximately 1% to 5%.\\n\\n## Evaluation\\n\\n| Model                              | Param  | MTEB-R  | CMTEB-R | MMTEB-R | MLDR   | MTEB-Code | FollowIR |\\n|------------------------------------|--------|---------|---------|---------|--------|-----------|----------|\\n| **Qwen3-Embedding-0.6B**               | 0.6B   | 61.82   | 71.02   | 64.64   | 50.26  | 75.41     | 5.09     |\\n| Jina-multilingual-reranker-v2-base | 0.3B   | 58.22   | 63.37   | 63.73   | 39.66  | 58.98     | -0.68    |\\n| gte-multilingual-reranker-base                      | 0.3B   | 59.51   | 74.08   | 59.44   | 66.33  | 54.18     | -1.64    |\\n| BGE-reranker-v2-m3                 | 0.6B   | 57.03   | 72.16   | 58.36   | 59.51  | 41.38     | -0.01    |\\n| **Qwen3-Reranker-0.6B**                | 0.6B   | 65.80   | 71.31   | 66.36   | 67.28  | 73.42     | 5.41     |\\n| **Qwen3-Reranker-4B**                  | 4B   | **69.76** | 75.94   | 72.74   | 69.97  | 81.20     | **14.84** |\\n| **Qwen3-Reranker-8B**                  | 8B     | 69.02   | **77.45** | **72.94** | **70.19** | **81.22** | 8.05     |\\n\\n> **Note**:  \\n> - Evaluation results for reranking models. 
We use the retrieval subsets of MTEB(eng, v2), MTEB(cmn, v1), MMTEB and MTEB (Code), which are MTEB-R, CMTEB-R, MMTEB-R and MTEB-Code.\\n> - All scores are our runs based on the top-100 candidates retrieved by dense embedding model .\\n\\n## Citation\\nIf you find our work helpful, feel free to give us a cite.\\n\\n```\\n@article{qwen3embedding,\\n  title={Qwen3 Embedding: Advancing Text Embedding and Reranking Through Foundation Models},\\n  author={Zhang, Yanzhao and Li, Mingxin and Long, Dingkun and Zhang, Xin and Lin, Huan and Yang, Baosong and Xie, Pengjun and Yang, An and Liu, Dayiheng and Lin, Junyang and Huang, Fei and Zhou, Jingren},\\n  journal={arXiv preprint arXiv:2506.05176},\\n  year={2025}\\n}\\n```',\n",
       "  'domain': 'text-ranking'},\n",
       " {'model_id': 'autogluon/mitra-regressor',\n",
       "  'created_at': '2025-06-22T23:29:18+00:00',\n",
       "  'downloads': 648441,\n",
       "  'likes': 28,\n",
       "  'author': None,\n",
       "  'tags': ['safetensors',\n",
       "   'tabular-regression',\n",
       "   'arxiv:2510.21204',\n",
       "   'license:apache-2.0',\n",
       "   'region:us'],\n",
       "  'modelcard': '\\n# Mitra Regressor\\n\\nMitra regressor is a tabular foundation model that is pre-trained on purely synthetic datasets sampled from a mix of random regressors. \\n\\n## Architecture\\n\\nMitra is based on a 12-layer Transformer of 72 M parameters, pre-trained by incorporating an in-context learning paradigm.\\n\\n## Usage\\n\\nTo use Mitra regressor, install AutoGluon by running:\\n\\n```sh\\npip install uv\\nuv pip install autogluon.tabular[mitra]   \\n```\\n\\nA minimal example showing how to perform inference using the Mitra regressor:\\n\\n```python\\nimport pandas as pd\\nfrom autogluon.tabular import TabularDataset, TabularPredictor\\nfrom sklearn.model_selection import train_test_split\\nfrom sklearn.datasets import fetch_california_housing\\n\\n# Load datasets\\nhousing_data = fetch_california_housing()\\nhousing_df = pd.DataFrame(housing_data.data, columns=housing_data.feature_names)\\nhousing_df[\\'target\\'] = housing_data.target\\n\\nprint(\"Dataset shapes:\")\\nprint(f\"California Housing: {housing_df.shape}\")\\n\\n# Create train/test splits (80/20)\\nhousing_train, housing_test = train_test_split(housing_df, test_size=0.2, random_state=42)\\n\\nprint(\"Training set sizes:\")\\nprint(f\"Housing: {len(housing_train)} samples\")\\n\\n# Convert to TabularDataset\\nhousing_train_data = TabularDataset(housing_train)\\nhousing_test_data = TabularDataset(housing_test)\\n\\n# Create predictor with Mitra for regression\\nprint(\"Training Mitra regressor on California Housing dataset...\")\\nmitra_reg_predictor = TabularPredictor(\\n    label=\\'target\\',\\n    path=\\'./mitra_regressor_model\\',\\n    problem_type=\\'regression\\'\\n)\\nmitra_reg_predictor.fit(\\n    housing_train_data.sample(1000), # sample 1000 rows\\n    hyperparameters={\\n        \\'MITRA\\': {\\'fine_tune\\': False}\\n    },\\n)\\n\\n# Evaluate regression performance\\nmitra_reg_predictor.leaderboard(housing_test_data)\\n```\\n\\n## License\\n\\nThis project is 
licensed under the Apache-2.0 License.\\n\\n## Reference\\n\\n```\\n@article{zhang2025mitra,\\n  title={Mitra: Mixed synthetic priors for enhancing tabular foundation models},\\n  author={Zhang, Xiyuan and Maddix, Danielle C and Yin, Junming and Erickson, Nick and Ansari, Abdul Fatir and Han, Boran and Zhang, Shuai and Akoglu, Leman and Faloutsos, Christos and Mahoney, Michael W and others},\\n  journal={arXiv preprint arXiv:2510.21204},\\n  year={2025}\\n}\\n```\\n\\nAmazon Science blog: ',\n",
       "  'domain': 'tabular-regression'},\n",
       " {'model_id': 'Tongyi-MAI/Z-Image-Turbo',\n",
       "  'created_at': '2025-11-25T15:09:48+00:00',\n",
       "  'downloads': 537900,\n",
       "  'likes': 3946,\n",
       "  'author': None,\n",
       "  'tags': ['diffusers',\n",
       "   'safetensors',\n",
       "   'text-to-image',\n",
       "   'en',\n",
       "   'arxiv:2511.22699',\n",
       "   'arxiv:2511.22677',\n",
       "   'arxiv:2511.13649',\n",
       "   'license:apache-2.0',\n",
       "   'diffusers:ZImagePipeline',\n",
       "   'deploy:azure',\n",
       "   'region:us'],\n",
       "  'modelcard': '\\n\\n⚡️- ImageAn Efficient Image Generation Foundation Model with Single-Stream Diffusion Transformer\\n\\n\\n\\n&#160;\\n&#160;\\n&#160;\\n&#160;\\n&#160;\\n&#160;\\n&#160;\\n&#160;\\n&#160;\\n\\n\\n\\nWelcome to the official repository for the Z-Image（造相）project!\\n\\n\\n\\n\\n\\n## ✨ Z-Image\\n\\nZ-Image is a powerful and highly efficient image generation model with **6B** parameters. Currently there are three variants:\\n\\n- 🚀 **Z-Image-Turbo** – A distilled version of Z-Image that matches or exceeds leading competitors with only **8 NFEs** (Number of Function Evaluations). It offers **⚡️sub-second inference latency⚡️** on enterprise-grade H800 GPUs and fits comfortably within **16G VRAM consumer devices**. It excels in photorealistic image generation, bilingual text rendering (English & Chinese), and robust instruction adherence.\\n\\n- 🧱 **Z-Image-Base** – The non-distilled foundation model. By releasing this checkpoint, we aim to unlock the full potential for community-driven fine-tuning and custom development.\\n\\n- ✍️ **Z-Image-Edit** – A variant fine-tuned on Z-Image specifically for image editing tasks. 
It supports creative image-to-image generation with impressive instruction-following capabilities, allowing for precise edits based on natural language prompts.\\n\\n### 📥 Model Zoo\\n\\n| Model | Hugging Face                                                                                                                                                                                                                                                                                                              | ModelScope                                                                                                                                                                                                                                                                                                                                                                                                                                                                  |\\n| :--- |:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|\\n| **Z-Image-Turbo** |    |    |\\n| **Z-Image-Base** | *To be released*                                                                                                                                                                   
                                                                                                                                       | *To be released*                                                                                                                                                                                                                                                                                                                                                                                                                                                            |\\n| **Z-Image-Edit** | *To be released*                                                                                                                                                                                                                                                                                                          | *To be released*                                                                                                                                                                                                                                                                                                                                                                                                                                                            |\\n\\n### 🖼️ Showcase\\n\\n📸 **Photorealistic Quality**: **Z-Image-Turbo** delivers strong photorealistic image generation while maintaining excellent aesthetic quality.\\n\\n\\n\\n📖 **Accurate Bilingual Text Rendering**: **Z-Image-Turbo** excels at accurately rendering complex Chinese and English text.\\n\\n\\n\\n💡  **Prompt Enhancing & Reasoning**: Prompt Enhancer empowers the model with reasoning capabilities, enabling it to transcend surface-level descriptions and tap into underlying world knowledge.\\n\\n\\n\\n🧠 **Creative Image Editing**: **Z-Image-Edit** shows a strong understanding of 
bilingual editing instructions, enabling imaginative and flexible image transformations.\\n\\n\\n\\n### 🏗️ Model Architecture\\nWe adopt a **Scalable Single-Stream DiT** (S3-DiT) architecture. In this setup, text, visual semantic tokens, and image VAE tokens are concatenated at the sequence level to serve as a unified input stream, maximizing parameter efficiency compared to dual-stream approaches.\\n\\n\\n\\n### 📈 Performance\\nAccording to the Elo-based Human Preference Evaluation (on ), Z-Image-Turbo shows highly competitive performance against other leading models, while achieving state-of-the-art results among open-source models.\\n\\n\\n  \\n    \\n     Click to view the full leaderboard\\n  \\n\\n\\n### 🚀 Quick Start\\nInstall the latest version of diffusers, use the following command:\\n\\n  Click here for details for why you need to install diffusers from source\\n\\n  We have submitted two pull requests ( and ) to the 🤗 diffusers repository to add support for Z-Image. Both PRs have been merged into the latest official diffusers release.\\n  Therefore, you need to install diffusers from source for the latest features and Z-Image support.\\n\\n\\n\\n```bash\\npip install git+\\n```\\n\\n```python\\nimport torch\\nfrom diffusers import ZImagePipeline\\n\\n# 1. Load the pipeline\\n# Use bfloat16 for optimal performance on supported GPUs\\npipe = ZImagePipeline.from_pretrained(\\n    \"Tongyi-MAI/Z-Image-Turbo\",\\n    torch_dtype=torch.bfloat16,\\n    low_cpu_mem_usage=False,\\n)\\npipe.to(\"cuda\")\\n\\n# [Optional] Attention Backend\\n# Diffusers uses SDPA by default. 
Switch to Flash Attention for better efficiency if supported:\\n# pipe.transformer.set_attention_backend(\"flash\")    # Enable Flash-Attention-2\\n# pipe.transformer.set_attention_backend(\"_flash_3\") # Enable Flash-Attention-3\\n\\n# [Optional] Model Compilation\\n# Compiling the DiT model accelerates inference, but the first run will take longer to compile.\\n# pipe.transformer.compile()\\n\\n# [Optional] CPU Offloading\\n# Enable CPU offloading for memory-constrained devices.\\n# pipe.enable_model_cpu_offload()\\n\\nprompt = \"Young Chinese woman in red Hanfu, intricate embroidery. Impeccable makeup, red floral forehead pattern. Elaborate high bun, golden phoenix headdress, red flowers, beads. Holds round folding fan with lady, trees, bird. Neon lightning-bolt lamp (⚡️), bright yellow glow, above extended left palm. Soft-lit outdoor night background, silhouetted tiered pagoda (西安大雁塔), blurred colorful distant lights.\"\\n\\n# 2. Generate Image\\nimage = pipe(\\n    prompt=prompt,\\n    height=1024,\\n    width=1024,\\n    num_inference_steps=9,  # This actually results in 8 DiT forwards\\n    guidance_scale=0.0,     # Guidance should be 0 for the Turbo models\\n    generator=torch.Generator(\"cuda\").manual_seed(42),\\n).images[0]\\n\\nimage.save(\"example.png\")\\n```\\n\\n## 🔬 Decoupled-DMD: The Acceleration Magic Behind Z-Image\\n\\n\\n\\nDecoupled-DMD is the core few-step distillation algorithm that empowers the 8-step Z-Image model.\\n\\nOur core insight in Decoupled-DMD  is that the success of existing DMD (Distributaion Matching Distillation) methods is the result of two independent, collaborating mechanisms:\\n\\n-   **CFG Augmentation (CA)**: The primary **engine** 🚀 driving the distillation process, a factor largely overlooked in previous work.\\n-   **Distribution Matching (DM)**: Acts more as a **regularizer** ⚖️, ensuring the stability and quality of the generated output.\\n\\nBy recognizing and decoupling these two mechanisms, we were able to 
study and optimize them in isolation. This ultimately motivated us to develop an improved distillation process that significantly enhances the performance of few-step generation.\\n\\n\\n\\n## 🤖 DMDR: Fusing DMD with Reinforcement Learning\\n\\n\\n\\nBuilding upon the strong foundation of Decoupled-DMD, our 8-step Z-Image model has already demonstrated exceptional capabilities. To achieve further improvements in terms of semantic alignment, aesthetic quality, and structural coherence—while producing images with richer high-frequency details—we present **DMDR**.\\n\\nOur core insight behind DMDR is that Reinforcement Learning (RL) and Distribution Matching Distillation (DMD) can be synergistically integrated during the post-training of few-step models. We demonstrate that:\\n\\n-   **RL Unlocks the Performance of DMD** 🚀\\n-   **DMD Effectively Regularizes RL** ⚖️\\n\\n\\n\\n## ⏬ Download\\n```bash\\npip install -U huggingface_hub\\nHF_XET_HIGH_PERFORMANCE=1 hf download Tongyi-MAI/Z-Image-Turbo\\n```\\n\\n## 📜 Citation\\n\\nIf you find our work useful in your research, please consider citing:\\n\\n```bibtex\\n@article{team2025zimage,\\n  title={Z-Image: An Efficient Image Generation Foundation Model with Single-Stream Diffusion Transformer},\\n  author={Z-Image Team},\\n  journal={arXiv preprint arXiv:2511.22699},\\n  year={2025}\\n}\\n\\n@article{liu2025decoupled,\\n  title={Decoupled DMD: CFG Augmentation as the Spear, Distribution Matching as the Shield},\\n  author={Dongyang Liu and Peng Gao and David Liu and Ruoyi Du and Zhen Li and Qilong Wu and Xin Jin and Sihan Cao and Shifeng Zhang and Hongsheng Li and Steven Hoi},\\n  journal={arXiv preprint arXiv:2511.22677},\\n  year={2025}\\n}\\n\\n@article{jiang2025distribution,\\n  title={Distribution Matching Distillation Meets Reinforcement Learning},\\n  author={Jiang, Dengyang and Liu, Dongyang and Wang, Zanyi and Wu, Qilong and Jin, Xin and Liu, David and Li, Zhen and Wang, Mengmeng and Gao, Peng and Yang, 
Harry},\\n  journal={arXiv preprint arXiv:2511.13649},\\n  year={2025}\\n}\\n```',\n",
       "  'domain': 'text-to-image'},\n",
       " {'model_id': 'Alibaba-NLP/gte-reranker-modernbert-base',\n",
       "  'created_at': '2025-01-20T05:46:18+00:00',\n",
       "  'downloads': 523714,\n",
       "  'likes': 83,\n",
       "  'author': None,\n",
       "  'tags': ['transformers',\n",
       "   'onnx',\n",
       "   'safetensors',\n",
       "   'modernbert',\n",
       "   'text-classification',\n",
       "   'sentence-transformers',\n",
       "   'transformers.js',\n",
       "   'text-embeddings-inference',\n",
       "   'text-ranking',\n",
       "   'en',\n",
       "   'arxiv:2308.03281',\n",
       "   'base_model:answerdotai/ModernBERT-base',\n",
       "   'base_model:finetune:answerdotai/ModernBERT-base',\n",
       "   'license:apache-2.0',\n",
       "   'endpoints_compatible',\n",
       "   'deploy:azure',\n",
       "   'region:us'],\n",
       "  'modelcard': '\\n# gte-reranker-modernbert-base\\n\\nWe are excited to introduce the `gte-modernbert` series of models, which are built upon the latest modernBERT pre-trained encoder-only foundation models. The `gte-modernbert` series models include both text embedding models and rerank models.\\n\\nThe `gte-modernbert` models demonstrates competitive performance in several text embedding and text retrieval evaluation tasks when compared to similar-scale models from the current open-source community. This includes assessments such as **MTEB**, **LoCO**, and **COIR** evaluation.\\n\\n## Model Overview\\n\\n- Developed by: Tongyi Lab, Alibaba Group\\n- Model Type: Text reranker\\t\\n- Primary Language: English\\n- Model Size: 149M\\n- Max Input Length: 8192 tokens\\n\\n### Model list\\n\\n\\n|                                         Models                                         | Language |       Model Type       | Model Size | Max Seq. Length | Dimension | MTEB-en | BEIR | LoCo | CoIR |\\n|:--------------------------------------------------------------------------------------:|:--------:|:----------------------:|:----------:|:---------------:|:---------:|:-------:|:----:|:----:|:----:|\\n|     | English  |     text embedding     |    149M    |      8192       |    768    |  64.38  | 55.33 | 87.57 | 79.31 | \\n|   | English  | text reranker     |    149M    |    8192    |     -     |  - | 56.19 | 90.68 | 79.99 |\\n\\n## Usage\\n\\n> [!TIP]\\n> For `transformers` and `sentence-transformers`, if your GPU supports it, the efficient Flash Attention 2 will be used automatically if you have `flash_attn` installed. 
It is not mandatory.\\n> \\n> ```bash\\n> pip install flash_attn\\n> ```\\n\\nUse with `transformers`\\n```python\\n# Requires transformers>=4.48.0\\nimport torch\\nfrom transformers import AutoModelForSequenceClassification, AutoTokenizer\\n\\nmodel_name_or_path = \"Alibaba-NLP/gte-reranker-modernbert-base\"\\ntokenizer = AutoTokenizer.from_pretrained(model_name_or_path)\\nmodel = AutoModelForSequenceClassification.from_pretrained(\\n    model_name_or_path,\\n    torch_dtype=torch.float16,\\n)\\nmodel.eval()\\n\\npairs = [\\n    [\"what is the capital of China?\", \"Beijing\"],\\n    [\"how to implement quick sort in python?\", \"Introduction of quick sort\"],\\n    [\"how to implement quick sort in python?\", \"The weather is nice today\"],\\n]\\n\\nwith torch.no_grad():\\n    inputs = tokenizer(pairs, padding=True, truncation=True, return_tensors=\\'pt\\', max_length=512)\\n    scores = model(**inputs, return_dict=True).logits.view(-1, ).float()\\n    print(scores)\\n\\n# tensor([ 2.1387,  2.4609, -1.6729])\\n```\\nUse with `sentence-transformers`:\\n\\nBefore you start, install the sentence-transformers libraries:\\n```\\npip install sentence-transformers\\n```\\n\\n```python\\n# Requires transformers>=4.48.0\\nfrom sentence_transformers import CrossEncoder\\n\\nmodel = CrossEncoder(\\n    \"Alibaba-NLP/gte-reranker-modernbert-base\",\\n    automodel_args={\"torch_dtype\": \"auto\"},\\n)\\n\\npairs = [\\n    [\"what is the capital of China?\", \"Beijing\"],\\n    [\"how to implement quick sort in python?\",\"Introduction of quick sort\"],\\n    [\"how to implement quick sort in python?\", \"The weather is nice today\"],\\n]\\n\\nscores = model.predict(pairs)\\nprint(scores)\\n# [0.8945664  0.9213594  0.15742092]\\n# NOTE: Sentence Transformers calls Softmax over the outputs by default, hence the scores are in [0, 1] range.\\n```\\n\\nUse with `transformers.js`\\n```js\\nimport {\\n  AutoTokenizer,\\n  AutoModelForSequenceClassification,\\n} from 
\"@huggingface/transformers\";\\n\\nconst model_id = \"Alibaba-NLP/gte-reranker-modernbert-base\";\\nconst model = await AutoModelForSequenceClassification.from_pretrained(\\n  model_id,\\n  { dtype: \"fp32\" }, // Supported options: \"fp32\", \"fp16\", \"q8\", \"q4\", \"q4f16\"\\n);\\nconst tokenizer = await AutoTokenizer.from_pretrained(model_id);\\n\\nconst pairs = [\\n  [\"what is the capital of China?\", \"Beijing\"],\\n  [\"how to implement quick sort in python?\", \"Introduction of quick sort\"],\\n  [\"how to implement quick sort in python?\", \"The weather is nice today\"],\\n];\\nconst inputs = tokenizer(\\n  pairs.map((x) => x[0]),\\n  {\\n    text_pair: pairs.map((x) => x[1]),\\n    padding: true,\\n    truncation: true,\\n  },\\n);\\nconst { logits } = await model(inputs);\\nconsole.log(logits.tolist()); // [[2.138258218765259], [2.4609625339508057], [-1.6775450706481934]]\\n```\\n\\nAdditionally, you can also deploy `Alibaba-NLP/gte-reranker-modernbert-base` with  as follows:\\n\\n- CPU\\n\\n```bash\\ndocker run --platform linux/amd64 \\\\\\n  -p 8080:80 \\\\\\n  -v $PWD/data:/data \\\\\\n  --pull always \\\\\\n  ghcr.io/huggingface/text-embeddings-inference:cpu-1.7 \\\\\\n  --model-id Alibaba-NLP/gte-reranker-modernbert-base\\n```\\n\\n- GPU\\n\\n```bash\\ndocker run --gpus all \\\\\\n  -p 8080:80 \\\\\\n  -v $PWD/data:/data \\\\\\n  --pull always \\\\\\n  ghcr.io/huggingface/text-embeddings-inference:1.7 \\\\\\n  --model-id Alibaba-NLP/gte-reranker-modernbert-base\\n```\\n\\nThen you can send requests to the deployed API via the `/rerank` route (see the  for more details):\\n\\n```bash\\ncurl  \\\\\\n  -H \"Content-Type: application/json\" \\\\\\n  -d \\'{\\n    \"query\": \"What is the capital of China?\",\\n    \"raw_scores\": false,\\n    \"return_text\": false,\\n    \"texts\": [ \"Beijing\" ],\\n    \"truncate\": true,\\n    \"truncation_direction\": \"right\"\\n  }\\'\\n```\\n\\n## Training Details\\n\\nThe `gte-modernbert` series of models 
follows the training scheme of the previous , with the only difference being that the pre-training language model base has been replaced from  to . For more training details, please refer to our paper: \\n\\n## Evaluation\\n\\n### MTEB\\n\\nThe results of other models are retrieved from . Given that all models in the `gte-modernbert` series have a size of less than 1B parameters, we focused exclusively on the results of models under 1B from the MTEB leaderboard.\\n\\n|                                            Model Name                                            | Param Size (M) | Dimension | Sequence Length | Average (56) | Class. (12) | Clust. (11) | Pair Class. (3) | Reran. (4) | Retr. (15) |  STS (10)   | Summ. (1) |\\n|:------------------------------------------------------------------------------------------------:|:--------------:|:---------:|:---------------:|:------------:|:-----------:|:---:|:---:|:---:|:---:|:-----------:|:--------:|\\n|                 |      335       |   1024    |       512       |    64.68     |    75.64    | 46.71 | 87.2 | 60.11 | 54.39 |     85      |   32.71  |\\n|  |      560       |   1024    |       514       |    64.41     |    77.56    | 47.1 | 86.19 | 58.58 | 52.47 |    84.78    |   30.39  |\\n|                                |      335       |   1024    |       512       |    64.23     |    75.97    | 46.08 | 87.12 | 60.03 | 54.29 |    83.11    |   31.61  |\\n|                           |      137       |    768    |      8192       |  64.11   |    77.17    | 46.82 | 85.33 | 57.66 | 54.09 |    81.97    |   31.17  |\\n|                                  |      109       |    768    |       512       |    63.55     |    75.53    | 45.77 | 86.55 | 58.86 | 53.25 |    82.4     |   31.07  |\\n|                         |      409       |   1024    |      8192       |    65.39     |    77.75    | 47.95 | 84.63 | 58.50 | 57.91 |    81.43    |   30.91  |\\n|  |      149       |    768    |      8192       |    62.62     |    74.31   
 | 44.98 | 83.96 | 56.42 | 52.89 |    81.78    |   31.39  |\\n|  |                |    768    |      8192       |    62.28     |   \\t73.55    |\\t43.93 |\\t84.61 |\\t55.78 | 53.01|    81.94    |   30.4   |\\n|  |      305       |    768    |       8192      |     61.4     | 70.89 | 44.31 | 84.24 | 57.47 |51.08 |    82.11    |   30.58  | \\n|  | 572 |   1024    |      8192  |       65.51 | 82.58 |45.21 |84.01 |58.13 |53.88 | 85.81 |   29.71  | \\n|  | 149 |   768    |      8192  |   **64.38** | **76.99** | **46.47** | **85.93** | **59.24** | **55.33** | **81.57** | **30.68** |\\n\\n\\n### LoCo (Long Document Retrieval)\\n\\n| Model Name |  Dimension | Sequence Length | Average (5) | QsmsumRetrieval | SummScreenRetrieval | QasperAbastractRetrieval | QasperTitleRetrieval |  GovReportRetrieval |\\n|:----:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|\\n|  | 4096 | 32768 |  87.57 | 49.37 | 93.10 | 99.67 | 97.54 | 98.21 | \\n|  |1024 | 8192 | 86.71 | 44.55 | 92.61 | 99.82 | 97.81 | 98.74 |\\n|  | 768 | 8192 | 87.44 | 49.91  | 91.78 | 99.82 | 97.13 | 98.58 |\\n|  | 768 | 8192 | 88.88 | 54.45 | 93.00 | 99.82 | 98.03 | 98.70 |\\n|  | - | 8192 | 90.68 | 70.86 | 94.06 | 99.73 | 99.11 | 89.67 | \\n\\n### COIR (Code Retrieval Task)\\n\\n| Model Name | Dimension | Sequence Length | Average(20) | CodeSearchNet-ccr-go | CodeSearchNet-ccr-java | CodeSearchNet-ccr-javascript | CodeSearchNet-ccr-php | CodeSearchNet-ccr-python | CodeSearchNet-ccr-ruby | CodeSearchNet-go | CodeSearchNet-java | CodeSearchNet-javascript | CodeSearchNet-php | CodeSearchNet-python | CodeSearchNet-ruby | apps | codefeedback-mt | codefeedback-st | codetrans-contest | codetrans-dl | cosqa | stackoverflow-qa | synthetic-text2sql |\\n|:----:|:---:|:---:|:---:|:---:| :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |\\n|  | 768 | 8192 | 79.31\\t| 94.15\\t| 93.57 |\\t94.27 |\\t91.51\\t| 93.93\\t| 90.63\\t| 
88.32 |\\t83.27\\t| 76.05\\t| 85.12\\t| 88.16\\t| 77.59\\t| 57.54\\t| 82.34\\t| 85.95\\t| 71.89\\t | 35.46\\t| 43.47\\t| 91.2\\t| 61.87 |\\n|  | - | 8192 | 79.99\\t| 96.43\\t| 96.88\\t| 98.32 | 91.81\\t| 97.7\\t| 91.96 |\\t88.81\\t| 79.71\\t| 76.27\\t| 89.39\\t| 98.37\\t| 84.11\\t| 47.57\\t| 83.37\\t| 88.91\\t| 49.66\\t| 36.36\\t| 44.37\\t| 89.58\\t| 64.21 |\\n\\n### BEIR\\n\\n| Model Name | Dimension | Sequence Length | Average(15) | ArguAna | ClimateFEVER | CQADupstackAndroidRetrieval | DBPedia | FEVER | FiQA2018 | HotpotQA | MSMARCO | NFCorpus | NQ | QuoraRetrieval | SCIDOCS | SciFact | Touche2020 | TRECCOVID |\\n| :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |\\n|  | 768 | 8192 | 55.33 | 72.68 | 37.74 | 42.63 | 41.79 | 91.03 | 48.81 | 69.47 | 40.9 | 36.44 | 57.62 | 88.55 | 21.29 | 77.4 | 21.68 | 81.95 |\\n|  | - | 8192 | 56.73 | 69.03 | 37.79 | 44.68 | 47.23 | 94.54 | 49.81 | 78.16 | 45.38 | 30.69 | 64.57 | 87.77 | 20.60 | 73.57 | 27.36 | 79.89 |\\n\\n\\n## Hiring\\n\\nWe have open positions for **Research Interns** and **Full-Time Researchers** to join our team at Tongyi Lab. \\nWe are seeking passionate individuals with expertise in representation learning, LLM-driven information retrieval, Retrieval-Augmented Generation (RAG), and agent-based systems. \\nOur team is located in the vibrant cities of **Beijing** and **Hangzhou**.\\nIf you are driven by curiosity and eager to make a meaningful impact through your work, we would love to hear from you. 
Please submit your resume along with a brief introduction to dingkun.ldk@alibaba-inc.com.\\n\\n\\n## Citation\\n\\nIf you find our paper or models helpful, feel free to give us a cite.\\n\\n```\\n@inproceedings{zhang2024mgte,\\n  title={mGTE: Generalized Long-Context Text Representation and Reranking Models for Multilingual Text Retrieval},\\n  author={Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Wen and Dai, Ziqi and Tang, Jialong and Lin, Huan and Yang, Baosong and Xie, Pengjun and Huang, Fei and others},\\n  booktitle={Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: Industry Track},\\n  pages={1393--1412},\\n  year={2024}\\n}\\n\\n@article{li2023towards,\\n  title={Towards general text embeddings with multi-stage contrastive learning},\\n  author={Li, Zehan and Zhang, Xin and Zhang, Yanzhao and Long, Dingkun and Xie, Pengjun and Zhang, Meishan},\\n  journal={arXiv preprint arXiv:2308.03281},\\n  year={2023}\\n}\\n```\\n',\n",
       "  'domain': 'text-classification'},\n",
       " {'model_id': 'facebook/dinov3-vitl16-pretrain-lvd1689m',\n",
       "  'created_at': '2025-08-06T06:19:51+00:00',\n",
       "  'downloads': 522249,\n",
       "  'likes': 135,\n",
       "  'author': None,\n",
       "  'tags': ['transformers',\n",
       "   'safetensors',\n",
       "   'dinov3_vit',\n",
       "   'image-feature-extraction',\n",
       "   'dino',\n",
       "   'dinov3',\n",
       "   'arxiv:2508.10104',\n",
       "   'en',\n",
       "   'base_model:facebook/dinov3-vit7b16-pretrain-lvd1689m',\n",
       "   'base_model:finetune:facebook/dinov3-vit7b16-pretrain-lvd1689m',\n",
       "   'license:other',\n",
       "   'endpoints_compatible',\n",
       "   'region:us'],\n",
       "  'modelcard': '\\n# Model Card for DINOv3\\n\\nDINOv3 is a family of versatile vision foundation models that outperforms the specialized state of the art across a broad range of settings, without fine-tuning. DINOv3 produces high-quality dense features that achieve outstanding performance on various vision tasks, significantly surpassing previous self- and weakly-supervised foundation models.\\n\\n## Model Details\\n\\nThese are Vision Transformer and ConvNeXt models trained following the method described in the DINOv3 paper. 12 models are provided:\\n\\n- 10 models pretrained on web data (LVD-1689M dataset)\\n  - 1 ViT-7B trained from scratch,\\n  - 5 ViT-S/S+/B/L/H+ models distilled from the ViT-7B,\\n  - 4 ConvNeXt-{T/S/B/L} models distilled from the ViT-7B,\\n- 2 models pretrained on satellite data (SAT-493M dataset)\\n  - 1 ViT-7B trained from scratch\\n  - 1 ViT-L distilled from the ViT-7B\\n\\n\\nEach Transformer-based model takes an image as input and returns a class token, patch tokens (and register tokens). These models follow a ViT architecture, with a patch size of 16. For a 224x224 image, this results in 1 class token + 4 register tokens + 196 patch tokens = 201 tokens (for DINOv2 with registers this resulted in 1 + 4 + 256 = 261 tokens).\\n\\nThe models can accept larger images provided the image shapes are multiples of the patch size (16). 
If this condition is not verified, the model will crop to the closest smaller multiple of the patch size.\\n\\n### Model Description\\n\\n- **Developed by:** Meta AI\\n- **Model type:** Vision Transformer, ConvNeXt\\n- **License:** \\n\\n### Model Sources\\n\\n- **Repository:** \\n- **Paper:** \\n\\n## Uses\\n\\nThe models are vision backbones providing multi-purpose features for downstream tasks.\\n\\n### Direct Use\\n\\nThe models can be used without fine-tuning, with downstream classifiers as simple as linear layers, to obtain competitive results:\\n\\n- on image classification, using k-NN classifiers on the class token\\n- on image classification, with logistic regression classifiers applied on the class token\\n- on image classification, with a linear layer applied on the class token and the average of the patch tokens\\n- on image retrieval using nearest neighbors\\n- on geometric and semantic 3D keypoint correspondances\\n- on depth estimation, semantic segmentation, using linear layers\\n- on unsupervised object discovery\\n- on video segmentation tracking\\n- on video classification, using a small 4-layer attentive probe\\n\\n### Downstream Use\\n\\nWhile fine-tuning the models can yield some gains, it is recommended to keep this option as a last resort: the frozen features are expected to provide good performance out-of-the-box.\\n\\n## Bias, Risks, and Limitations\\n\\nCompared to DINOv2 and SEERv2, DINOv3 delivers somewhat consistent performance across income categories on geographical fairness and diversity, although with a notable performance drop in the low-income bucket compared to the highest-income bucket.\\n\\nDINOv3 also achieves relatively good scores across different regions, improving over its predecessor DINOv2. 
However, a relative difference is still observed between Europe and Africa.\\n\\n### Recommendations\\n\\nFine-tuning is expected to increase the biases in the features produced by the model as they will be tuned to the fine-tuning labels.\\n\\n## How to Get Started with the Model\\n\\nThe example below demonstrates how to obtain an image embedding with [Pipeline] or the [AutoModel] class.\\n\\n```python\\nfrom transformers import pipeline\\nfrom transformers.image_utils import load_image\\n\\nurl = \"\\nimage = load_image(url)\\n\\nfeature_extractor = pipeline(\\n    model=\"facebook/dinov3-vitl16-pretrain-lvd1689m\",\\n    task=\"image-feature-extraction\", \\n)\\nfeatures = feature_extractor(image)\\n```\\n\\n```python\\nimport torch\\nfrom transformers import AutoImageProcessor, AutoModel\\nfrom transformers.image_utils import load_image\\n\\nurl = \"\\nimage = load_image(url)\\n\\npretrained_model_name = \"facebook/dinov3-vitl16-pretrain-lvd1689m\"\\nprocessor = AutoImageProcessor.from_pretrained(pretrained_model_name)\\nmodel = AutoModel.from_pretrained(\\n    pretrained_model_name, \\n    device_map=\"auto\", \\n)\\n\\ninputs = processor(images=image, return_tensors=\"pt\").to(model.device)\\nwith torch.inference_mode():\\n    outputs = model(**inputs)\\n\\npooled_output = outputs.pooler_output\\nprint(\"Pooled output shape:\", pooled_output.shape)\\n```\\n\\n## Training Details\\n\\n### Training Data\\n\\n- Web dataset (LVD-1689M): a curated dataset of 1,689 millions of images extracted from a large data\\npool of 17 billions web images collected from public posts on Instagram\\n\\n- Satellite dataset (SAT-493M): a dataset of 493 millions of 512x512 images sampled randomly from Maxar RGB ortho-rectified imagery at 0.6 meter resolution\\n\\n### Training Procedure\\n\\n**Training objective:**\\n\\n- DINO self-distillation loss with multi-crop\\n- iBOT masked-image modeling loss\\n- KoLeo regularization on [CLS] tokens\\n- Gram anchoring\\n\\n- **Training 
regime:** PyTorch FSDP2 (with bf16 and fp8 matrix multiplications)\\n\\n**Distillation:**\\n\\n- Distillation follows the standard DINOv3 pretraining procedure, except the teacher is a frozen pretrained ViT-7B.\\n\\n## Evaluation\\n\\n**Results**\\n\\nThe reader is referred to the associated paper for details on the evaluation protocols\\n\\n*Results for ViT backbones pretrained (or distilled) on web (LVD-1689M)*\\n\\n\\n  \\n    \\n    \\n    Global Tasks\\n    Dense Tasks\\n  \\n  \\n    Model\\n    \\n    IN-ReaL\\n    IN-R\\n    Obj.Net\\n    Ox.-H\\n    ADE20k\\n    NYU↓\\n    DAVIS\\n    NAVI\\n    SPair\\n  \\n  \\n    DINOv3 ViT-S/16\\n    \\n    87.0\\n    60.4\\n    50.9\\n    49.5\\n    47.0\\n    0.403\\n    72.7\\n    56.3\\n    50.4\\n  \\n  \\n    DINOv3 ViT-S+/16\\n    \\n    88.0\\n    68.8\\n    54.6\\n    50.0\\n    48.8\\n    0.399\\n    75.5\\n    57.1\\n    55.2\\n  \\n  \\n    DINOv3 ViT-B/16\\n    \\n    89.3\\n    76.7\\n    64.1\\n    58.5\\n    51.8\\n    0.373\\n    77.2\\n    58.8\\n    57.2\\n  \\n  \\n    DINOv3 ViT-L/16\\n    \\n    90.2\\n    88.1\\n    74.8\\n    63.1\\n    54.9\\n    0.352\\n    79.9\\n    62.3\\n    61.3\\n  \\n  \\n    DINOv3 ViT-H+/16\\n    \\n    90.3\\n    90.0\\n    78.6\\n    64.5\\n    54.8\\n    0.352\\n    79.3\\n    63.3\\n    56.3\\n  \\n  \\n    DINOv3 ViT-7B/16\\n    \\n    90.4\\n    91.1\\n    91.1\\n    72.8\\n    55.9\\n    0.309\\n    79.7\\n    64.4\\n    58.7\\n  \\n\\n\\n*Results for ConvNeXt backbones distilled on web (LVD-1689M)*\\n\\n\\n  \\n    \\n    Global Tasks\\n    Dense Tasks\\n  \\n  \\n    Model\\n    IN-ReaL\\n    IN-R\\n    Obj.Net\\n    ADE20k\\n    NYU↓\\n  \\n  \\n    \\n    @256px\\n    @512px\\n    @256px\\n    @512px\\n    @256px\\n    @512px\\n    \\n  \\n  \\n    DINOv3 ConvNeXt Tiny\\n    86.6\\n    87.7\\n    73.7\\n    74.1\\n    52.6\\n    58.7\\n    42.7\\n    0.448\\n  \\n  \\n    DINOv3 ConvNeXt Small\\n    87.9\\n    88.7\\n    73.7\\n    74.1\\n    52.6\\n    
58.7\\n    44.8\\n    0.432\\n  \\n  \\n    DINOv3 ConvNeXt Base\\n    88.5\\n    89.2\\n    77.2\\n    78.2\\n    56.2\\n    61.3\\n    46.3\\n    0.420\\n  \\n  \\n    DINOv3 ConvNeXt Large\\n    88.9\\n    89.4\\n    81.3\\n    82.4\\n    59.3\\n    65.2\\n    47.8\\n    0.403\\n  \\n\\n\\n*Results for ViT backbones pretrained (or distilled) on satellite (SAT-493M)*\\n\\n\\n  \\n    \\n    (GEO-Bench) Classification\\n  \\n  \\n    Model\\n    m-BEnet\\n    m-brick-kiln\\n    m-eurosat\\n    m-forestnet\\n    m-pv4ger\\n    m-so2sat\\n    mean\\n  \\n  \\n    DINOv3 ViT-L/16\\n    73.0\\n    96.5\\n    94.1\\n    60.6\\n    96.0\\n    57.4\\n    79.6\\n  \\n  \\n    DINOv3 ViT-7B/16\\n    74.0\\n    97.2\\n    94.8\\n    62.3\\n    96.1\\n    62.1\\n    81.1\\n  \\n  \\n    \\n    (GEO-Bench) Segmentation\\n  \\n  \\n    Model\\n    m-cashew\\n    m-chesapeake\\n    m-NeonTree\\n    m-nz-cattle\\n    m-pv4ger-seg\\n    m-SA-crop\\n    mean\\n  \\n  \\n    DINOv3 ViT-L/16\\n    94.2\\n    75.6\\n    61.8\\n    83.7\\n    95.2\\n    36.8\\n    74.5\\n  \\n  \\n    DINOv3 ViT-7B/16\\n    94.1\\n    76.6\\n    62.6\\n    83.4\\n    95.5\\n    37.6\\n    75.0\\n  \\n\\n\\n\\n## Environmental Impact\\n\\n- **Hardware Type:** Nvidia H100\\n- **Hours used:** 61,440 hours for ViT-7B model training\\n- **Cloud Provider:** Private infrastructure\\n- **Compute Region:** USA\\n- **Carbon Emitted:** 18t CO2eq\\n\\n## Technical Specifications\\n\\n### Model Architecture and Objective\\n\\nVision Transformer models:\\n\\n- ViT-S (21M parameters): patch size 16, embedding dimension 384, 4 register tokens, 6 heads, MLP FFN, RoPE\\n- ViT-S+ (29M parameters): patch size 16, embedding dimension 384, 4 register tokens, 6 heads, SwiGLU FFN, RoPE\\n- ViT-B (86M parameters): patch size 16, embedding dimension 768, 4 register tokens, 12 heads, MLP FFN, RoPE\\n- ViT-L (300M parameters): patch size 16, embedding dimension 1024, 4 register tokens, 16 heads, MLP FFN, RoPE\\n- ViT-H+ (840M 
parameters): patch size 16, embedding dimension 1280, 4 register tokens, 20 heads, SwiGLU FFN, RoPE\\n- ViT-7B (6716M parameters): patch size 16, embedding dimension 4096, 4 register tokens, 32 heads, SwiGLU FFN, RoPE\\n\\nConvNeXt models:\\n\\n- ConvNeXt Tiny (29M parameters)\\n- ConvNeXt Small (50M parameters)\\n- ConvNeXt Base (89M parameters)\\n- ConvNeXt Large (198M parameters)\\n\\n### Compute Infrastructure\\n\\n#### Hardware\\n\\nNvidia H100 GPUs\\n\\n#### Software\\n\\nPyTorch 2.7\\n\\n## More Information\\n\\nSee the  and the associated .\\n\\n## Citation\\n\\n**BibTeX**\\n\\n```\\n@misc{simeoni2025dinov3,\\n  title={{DINOv3}},\\n  author={Sim{\\\\\\'e}oni, Oriane and Vo, Huy V. and Seitzer, Maximilian and Baldassarre, Federico and Oquab, Maxime and Jose, Cijo and Khalidov, Vasil and Szafraniec, Marc and Yi, Seungeun and Ramamonjisoa, Micha{\\\\\"e}l and Massa, Francisco and Haziza, Daniel and Wehrstedt, Luca and Wang, Jianyuan and Darcet, Timoth{\\\\\\'e}e and Moutakanni, Th{\\\\\\'e}o and Sentana, Leonel and Roberts, Claire and Vedaldi, Andrea and Tolan, Jamie and Brandt, John and Couprie, Camille and Mairal, Julien and J{\\\\\\'e}gou, Herv{\\\\\\'e} and Labatut, Patrick and Bojanowski, Piotr},\\n  year={2025},\\n  eprint={2508.10104},\\n  archivePrefix={arXiv},\\n  primaryClass={cs.CV},\\n  url={\\n}\\n```\\n',\n",
       "  'domain': 'image-feature-extraction'},\n",
       " {'model_id': 'briaai/RMBG-2.0',\n",
       "  'created_at': '2024-10-29T12:14:36+00:00',\n",
       "  'downloads': 515235,\n",
       "  'likes': 1044,\n",
       "  'author': None,\n",
       "  'tags': ['transformers',\n",
       "   'pytorch',\n",
       "   'onnx',\n",
       "   'safetensors',\n",
       "   'image-segmentation',\n",
       "   'remove background',\n",
       "   'background',\n",
       "   'background-removal',\n",
       "   'Pytorch',\n",
       "   'vision',\n",
       "   'legal liability',\n",
       "   'transformers.js',\n",
       "   'custom_code',\n",
       "   'license:other',\n",
       "   'region:us'],\n",
       "  'modelcard': '\\n# BRIA Background Removal v2.0 Model Card\\n\\n\\n\\n\\n\\n  \\n  \\n    \\n    ✨ Discover FIBO on Hugging Face\\n  \\n\\n  \\n\\n         💜 Bria AI&nbsp&nbsp | &nbsp&nbsp🤗 Hugging Face &nbsp&nbsp | &nbsp&nbsp 📑 Blog &nbsp&nbsp \\n\\n🖥️ Demo&nbsp&nbsp| &nbsp&nbsp Github&nbsp&nbsp\\n\\n\\nRMBG v2.0 is our new state-of-the-art background removal model significantly improves RMBG v1.4. The model is designed to effectively separate foreground from background in a range of\\ncategories and image types. This model has been trained on a carefully selected dataset, which includes:\\ngeneral stock images, e-commerce, gaming, and advertising content, making it suitable for commercial use cases powering enterprise content creation at scale. \\nThe accuracy, efficiency, and versatility currently rival leading source-available models. \\nIt is ideal where content safety, legally licensed datasets, and bias mitigation are paramount. \\n\\nDeveloped by BRIA AI, RMBG v2.0 is available as a source-available model for non-commercial use.\\n\\n### Get Access\\n\\nBria RMBG2.0 is availabe everywhere you build, either as source-code and weights, ComfyUI nodes or API endpoints.\\n\\n- **Purchase:** To purchase a commercial license for RMBG V2.0 **or** an API package .\\n- **API Endpoint**: , , \\n- **ComfyUI**: \\n- **GitHub**: \\n\\nFor more information, please visit our .\\n\\nJoin our  for more information, tutorials, tools, and to connect with other users!\\n\\n\\n\\n\\n\\n\\n\\n## Model Details\\n#####\\n### Model Description\\n\\n- **Developed by:** \\n- **Model type:** Background Removal \\n- **License:** \\n  - The model is released under a CC BY-NC 4.0 license for non-commercial use.\\n  - Commercial use is subject to a commercial agreement with BRIA. Available \\n\\n\\n- **Model Description:** BRIA RMBG-2.0 is a dichotomous image segmentation model trained exclusively on a professional-grade dataset. 
The model output includes a single-channel 8-bit grayscale alpha matte, where each pixel value indicates the opacity level of the corresponding pixel in the original image. This non-binary output approach offers developers the flexibility to define custom thresholds for foreground-background separation, catering to varied use cases requirements and enhancing integration into complex pipelines.\\n- **BRIA:** Resources for more information: \\n\\n\\n\\n## Training data\\nBria-RMBG model was trained with over 15,000 high-quality, high-resolution, manually labeled (pixel-wise accuracy), fully licensed images.\\nOur benchmark included balanced gender, balanced ethnicity, and people with different types of disabilities.\\nFor clarity, we provide our data distribution according to different categories, demonstrating our model’s versatility.\\n\\n### Distribution of images:\\n\\n| Category | Distribution |\\n| -----------------------------------| -----------------------------------:|\\n| Objects only | 45.11% |\\n| People with objects/animals | 25.24% |\\n| People only | 17.35% |\\n| people/objects/animals with text | 8.52% |\\n| Text only | 2.52% |\\n| Animals only | 1.89% |\\n\\n| Category | Distribution |\\n| -----------------------------------| -----------------------------------------:|\\n| Photorealistic | 87.70% |\\n| Non-Photorealistic | 12.30% |\\n\\n\\n| Category | Distribution |\\n| -----------------------------------| -----------------------------------:|\\n| Non Solid Background | 52.05% |\\n| Solid Background | 47.95% \\n\\n\\n| Category | Distribution |\\n| -----------------------------------| -----------------------------------:|\\n| Single main foreground object | 51.42% |\\n| Multiple objects in the foreground | 48.58% |\\n\\n\\n## Qualitative Evaluation\\nOpen source models comparison\\n\\n\\n\\n### Architecture\\nRMBG-2.0 is developed on the  architecture enhanced with our proprietary dataset and training scheme. 
This training data significantly improves the model’s accuracy and effectiveness for background-removal task.\\nIf you use this model in your research, please cite:\\n\\n```\\n@article{BiRefNet,\\n  title={Bilateral Reference for High-Resolution Dichotomous Image Segmentation},\\n  author={Zheng, Peng and Gao, Dehong and Fan, Deng-Ping and Liu, Li and Laaksonen, Jorma and Ouyang, Wanli and Sebe, Nicu},\\n  journal={CAAI Artificial Intelligence Research},\\n  year={2024}\\n}\\n```\\n\\n#### Requirements\\n```bash\\ntorch\\ntorchvision\\npillow\\nkornia\\ntransformers\\n```\\n\\n### Usage\\n\\n\\n\\n\\n```python\\nfrom PIL import Image\\nimport torch\\nfrom torchvision import transforms\\nfrom transformers import AutoModelForImageSegmentation\\n\\ndevice = \\'cuda\\' if torch.cuda.is_available() else \\'cpu\\'\\nmodel = AutoModelForImageSegmentation.from_pretrained(\\'briaai/RMBG-2.0\\', trust_remote_code=True).eval().to(device)\\n\\n# Data settings\\nimage_size = (1024, 1024)\\ntransform_image = transforms.Compose([\\n    transforms.Resize(image_size),\\n    transforms.ToTensor(),\\n    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])\\n])\\n\\nimage = Image.open(input_image_path)\\ninput_images = transform_image(image).unsqueeze(0).to(device)\\n\\n# Prediction\\nwith torch.no_grad():\\n    preds = model(input_images)[-1].sigmoid().cpu()\\npred = preds[0].squeeze()\\npred_pil = transforms.ToPILImage()(pred)\\nmask = pred_pil.resize(image.size)\\nimage.putalpha(mask)\\n\\nimage.save(\"no_bg_image.png\")\\n```\\n\\n\\n\\n',\n",
       "  'domain': 'image-segmentation'},\n",
       " {'model_id': 'lightx2v/Qwen-Image-Edit-2511-Lightning',\n",
       "  'created_at': '2025-12-22T06:03:41+00:00',\n",
       "  'downloads': 451618,\n",
       "  'likes': 347,\n",
       "  'author': None,\n",
       "  'tags': ['diffusers',\n",
       "   'safetensors',\n",
       "   'diffusion-single-file',\n",
       "   'comfyui',\n",
       "   'distillation',\n",
       "   'LoRA',\n",
       "   'lora',\n",
       "   'Qwen-Image',\n",
       "   'Qwen-Image-Edit',\n",
       "   'image-to-image',\n",
       "   'base_model:Qwen/Qwen-Image-Edit-2511',\n",
       "   'base_model:adapter:Qwen/Qwen-Image-Edit-2511',\n",
       "   'license:apache-2.0',\n",
       "   'region:us'],\n",
       "  'modelcard': '# Qwen-Image-Edit-2511-Lightning\\n\\n## Model Overview\\nQwen-Image-Edit-2511-Lightning is a collection of optimized models tailored for image editing tasks, leveraging step distillation and quantization techniques to deliver high-efficiency inference performance. This repository hosts three core model files with distinct characteristics:\\n\\n| Model File Name | Type | Key Features |\\n|-----------------|------|--------------|\\n| `Qwen-Image-Edit-2511-Lightning-4steps-V1.0-bf16.safetensors` | 4-step Distilled LoRA | BF16 precision, lightweight, 4-step inference |\\n| `Qwen-Image-Edit-2511-Lightning-4steps-V1.0-fp32.safetensors` | 4-step Distilled LoRA | FP32 precision, high accuracy, 4-step inference |\\n| `qwen_image_edit_2511_fp8_e4m3fn_scaled_lightning.safetensors` | FP8 Quantized | FP8 (e4m3fn scaled) precision, fused with 4-step distilled LoRA, optimized for low-memory deployment |\\n\\n## Usage Instructions\\nThis model suite supports two mainstream usage frameworks, with detailed guides provided below:\\n\\n### 1. Qwen-Image-Lightning Framework\\nFor full documentation on model usage within the Qwen-Image-Lightning ecosystem (including environment setup, inference pipelines, and customization), please refer to:\\n\\n\\n### 2. LightX2V Framework\\nThe models are fully compatible with the LightX2V lightweight video/image generation inference framework. 
For step-by-step usage examples, configuration templates, and performance optimization tips, see:\\n\\n\\n## Key Optimizations\\n- **Step Distillation**: The LoRA models reduce the original inference steps to just 4 steps, achieving significant speedup (≈10x faster than standard 40-step inference) while preserving image editing quality.\\n- **FP8 Quantization**: The quantized base model balances performance and resource efficiency, reducing GPU memory usage by ~50% compared to FP32 while maintaining editing fidelity.\\n\\n## Support\\nFor technical issues, feature requests, or integration questions:\\n- Open an issue in the  (for Qwen framework-specific questions)\\n- Open an issue in the  (for LightX2V integration questions)',\n",
       "  'domain': 'image-to-image'},\n",
       " {'model_id': 'Freepik/nsfw_image_detector',\n",
       "  'created_at': '2025-04-10T08:59:44+00:00',\n",
       "  'downloads': 415562,\n",
       "  'likes': 51,\n",
       "  'author': None,\n",
       "  'tags': ['transformers',\n",
       "   'safetensors',\n",
       "   'timm_wrapper',\n",
       "   'image-classification',\n",
       "   'pytorch',\n",
       "   'arxiv:2303.11331',\n",
       "   'base_model:timm/eva02_base_patch14_448.mim_in22k_ft_in22k_in1k',\n",
       "   'base_model:finetune:timm/eva02_base_patch14_448.mim_in22k_ft_in22k_in1k',\n",
       "   'license:mit',\n",
       "   'endpoints_compatible',\n",
       "   'deploy:azure',\n",
       "   'region:us',\n",
       "   'not-for-all-audiences'],\n",
       "  'modelcard': '\\n# EVA-based Fast NSFW Image Classifier\\n\\n## Table of Contents\\n- \\n- \\n- \\n  - \\n  - \\n    - \\n    - \\n- \\n  - \\n  - \\n  - \\n- \\n- \\n\\n## Model Description\\n\\nThis model is a vision transformer based on the **EVA architecture**, fine-tuned for **NSFW content classification**. It has been trained\\nto detect **four categories** (neutral, low, medium, high) of visual content using **100,000 synthetically labeled images**.\\n\\nThe model can be used as a **binary (true/false) classifier if desired, or you can obtain the full output probabilities.**. It **outperforms other excellent publicly available models** such as  or  in our internal benchmarks adding the enrichment of being able to select the NSFW level that suits your use case.\\n\\n## Try it Online! 🚀\\n\\nYou can try this model directly in your browser through our . Upload any image and get instant NSFW classification results without any installation required. \\n\\n## Model Performance Comparison\\n\\n### Global Performance\\n\\n| Category | Freepik | Falconsai | Adamcodd |\\n|----------|-------------|------------------|----------------|\\n| High     | 99.54%     | 97.92%          | 98.62%         |\\n| Medium   | 97.02%     | 78.54%          | 91.65%         |\\n| Low      | 98.31%     | 31.25%          | 89.66%         |\\n| Neutral  | 99.87%     | 99.27%          | 98.37%         |\\n\\n\\nIn the table below, the results are obtained as follows:\\n \\n* For the **Falconsai and AdamCodd** models:\\n    * A prediction is considered correct if the image is labeled \"low\", \"medium\", or \"high\" and the model returns true.\\n    * If the label is \"neutral\", the correct output should be false.\\n\\n* For the **Freepik model**:\\n    * If the image label is \"low\", \"medium\", or \"high\", the model should return at least \"low\".\\n    * If the label is \"neutral\", the correct output should be \"neutral\".\\n\\n\\n**Conclusions:**\\n\\n* Our model 
**outperforms AdamCodd and Falconsai in accuracy**. It is entirely fair to compare them on the \"high\" and \"neutral\" labels.\\n* Our model **offers greater granularity**. It is not only suitable for detecting \"high\" and \"neutral\" content, but also performs excellently at identifying \"low\" and \"medium\" NSFW content.\\n    * Falconsai may classify some \"medium\" and \"low\" images as not NSFW but mark others as safe for work(SFW), which could lead to unexpected results.\\n    * AdamCodd classifies both \"low\" and \"medium\" categories as NSFW, which may not be desirable depending on your use case. Furthermore, a 10% of images in low and medium are considered SFW.\\n\\n### Accuracy by AI Content\\n\\nWe have created a **manually labeled dataset** with careful attention to **avoiding biases** (gender, ethnicity, etc.). While the sample size is relatively small, it provides meaningful insights into model performance across different scenarios, which was very useful in the training process to avoid biases.\\n\\nThe following tables show detection accuracy percentages across different NSFW categories and content types:\\n\\n#### AI-Generated Content\\n\\n| Category | Freepik Model | Falconsai Model | Adamcodd Model |\\n|----------|-------------|------------------|----------------|\\n| High     | 100.00%     | 84.00%          | 92.00%         |\\n| Medium   | 96.15%      | 69.23%          | 96.00%         |\\n| Low      | 100.00%     | 35.71%          | 92.86%         |\\n| Neutral  | 100.00%     | 100.00%         | 66.67%         |\\n\\n\\n**Conclusions:**\\n* **Avoid using Falconsai for AI-generated content** to prevent prediction errors.\\n* **Our model is the best option to detect NSFW content in AI-generated content**.  
\\n\\n\\n## Usage\\n\\n### Quick Start via pip\\n\\n```sh \\npip install nsfw-image-detector\\n```\\n\\n```python\\nfrom PIL import Image\\nfrom nsfw_image_detector import NSFWDetector\\nimport torch\\n\\n# Initialize the detector\\ndetector = NSFWDetector(dtype=torch.bfloat16, device=\"cuda\")\\n\\n# Load and classify an image\\nimage = Image.open(\"your_image\")\\n\\n# Check if the image contains NSFW content sentivity level medium or higher\\nis_nsfw = detector.is_nsfw(image, \"medium\")\\n\\n# Get probability scores for all categories\\nprobabilities = detector.predict_proba(image)\\nprint(f\"Is NSFW: {is_nsfw}\")\\nprint(f\"Probabilities: {probabilities}\")\\n```\\n\\nExample output:\\n```python\\nIs NSFW: False\\nProbabilities: \\n    [\\n        {: 0.00372314453125, \\n        : 0.1884765625, \\n        : 0.234375, \\n        : 0.765625}\\n    ]\\n```\\n\\n### Quick Start with Pipeline\\n\\n```python\\nfrom transformers import pipeline\\nfrom PIL import Image\\n\\n# Create classifier pipeline\\nclassifier = pipeline(\\n    \"image-classification\",\\n    model=\"Freepik/nsfw_image_detector\",\\n    device=0  # Use GPU (0) or CPU (-1)\\n)\\n\\n# Load and classify an image\\nimage = Image.open(\"path/to/your/image.jpg\")\\npredictions = classifier(image)\\nprint(predictions)\\n```\\n\\nExample output:\\n```python\\n[\\n    {\\'label\\': \\'neutral\\', \\'score\\': 0.92},\\n    {\\'label\\': \\'low\\', \\'score\\': 0.05},\\n    {\\'label\\': \\'medium\\', \\'score\\': 0.02},\\n    {\\'label\\': \\'high\\', \\'score\\': 0.01}\\n]\\n```\\n\\nThe model supports efficient batch processing for multiple images:\\n\\n```python\\nimages = [Image.open(path) for path in [\"image1.jpg\", \"image2.jpg\", \"image3.jpg\"]]\\npredictions = classifier(images)\\n```\\n\\n**Note**: If the intention is to use the model in production review  section before using this approach.\\n\\n### Avoid installation of pip dependency\\n\\nThe following example demonstrates how to **customize 
the NSFW detection label**, it is very similar to the code in . This code returns True if the NSFW level is \\'medium\\' or higher:\\n\\n```python\\nfrom transformers import AutoModelForImageClassification\\nimport torch\\nfrom PIL import Image\\nfrom typing import List, Dict\\nimport torch.nn.functional as F\\nfrom timm.data.transforms_factory import create_transform\\nfrom torchvision.transforms import Compose\\nfrom timm.data import resolve_data_config\\nfrom timm.models import get_pretrained_cfg\\n\\n\\ndevice = \"cuda\" if torch.cuda.is_available() else \"cpu\"\\n\\n# Load model and processor\\nmodel = AutoModelForImageClassification.from_pretrained(\"Freepik/nsfw_image_detector\", torch_dtype = torch.bfloat16).to(device) \\n\\n# Load original processor (faster for tensors)\\ncfg = get_pretrained_cfg(\"eva02_base_patch14_448.mim_in22k_ft_in22k_in1k\")\\nprocessor: Compose = create_transform(**resolve_data_config(cfg.__dict__))\\n\\ndef predict_batch_values(model, processor: Compose, img_batch: List[Image.Image] | torch.Tensor) -> List[Dict[str, float]]:\\n    \"\"\"\\n    Process a batch of images and return prediction scores for each NSFW category\\n    \"\"\"\\n    idx_to_label = {0: \\'neutral\\', 1: \\'low\\', 2: \\'medium\\', 3: \\'high\\'}\\n    \\n    # Prepare batch\\n    inputs = torch.stack([processor(img) for img in img_batch])\\n    output = []\\n    with torch.inference_mode():\\n        logits = model(inputs).logits\\n        batch_probs = F.log_softmax(logits, dim=-1)\\n        batch_probs = torch.exp(batch_probs).cpu()\\n        \\n        for i in range(len(batch_probs)):\\n            element_probs = batch_probs[i]\\n            output_img = {}\\n            danger_cum_sum = 0\\n            \\n            for j in range(len(element_probs) - 1, -1, -1):\\n                danger_cum_sum += element_probs[j]\\n                if j == 0:\\n                    danger_cum_sum = element_probs[j]\\n                output_img[idx_to_label[j]] = 
danger_cum_sum.item()\\n            output.append(output_img)\\n\\n    return output\\n\\ndef prediction(model, processor, img_batch: List[Image.Image], class_to_predict: str, threshold: float=0.5) -> List[bool]:\\n    \"\"\"\\n    Predict if images meet or exceed a specific NSFW threshold\\n    \"\"\"\\n    if class_to_predict not in [\"low\", \"medium\", \"high\"]:\\n        raise ValueError(\"class_to_predict must be one of: low, medium, high\")\\n    \\n    if not 0 = threshold for i in range(len(output))]\\n\\n# Example usage\\nimage = Image.open(\"path/to/your/image.jpg\")\\nprint(predict_batch_values(model, processor, [image]))\\nprint(prediction(model, processor, [image], \"medium\")) # Options: low, medium, high\\n```\\n\\nExample output:\\n\\n```python\\n[\\n    {\\'label\\': \\'neutral\\', \\'score\\': 0.92},\\n    {\\'label\\': \\'low\\', \\'score\\': 0.08},\\n    {\\'label\\': \\'medium\\', \\'score\\': 0.03},\\n    {\\'label\\': \\'high\\', \\'score\\': 0.01}\\n]\\n\\n[False]\\n```\\n**Note**: The sum is higher than one because the prediction is the cumulative sum of all labels equal to or higher than your selected label, except neutral. For instance, if you select \\'medium\\', it is the sum of \\'medium\\' and \\'high\\'. In our opinion, this approach is more effective than selecting only the highest probability label.\\n\\n## Training\\n\\n* **100,000 images** were used during training. \\n* The model were trained for **3 epochs on 3 NVIDIA GeForce RTX 3090**\\n* The model were trained using two sets, training and validation.\\n* There are **no images with a cosine similarity higher than 0.92** to avoid duplicates and biases between training and validation. The model used for deduplication is \"openai/clip-vit-base-patch32\"\\n* A **custom loss** was created to minimize predictions that are lower than the true class. 
For instance, it is very rare for an image labeled as \\'high\\' to be predicted as \\'neutral\\' (this only happens 0.46% of the time).\\n\\n## Speed and Memory Metrics\\n\\n| Batch Size | Avg by batch (ms) | VRAM (MB) | Optimizations |\\n|------------|------------------|------------|---------------|\\n| 1          | 28               | 540        | BF16 using PIL images |\\n| 4          | 110              | 640        | BF16 using PIL images |\\n| 16         | 412              | 1144       | BF16 using PIL images |\\n| 1          | 10               | 540        | BF16 using torch tensor |\\n| 4          | 33               | 640        | BF16 using torch tensor |\\n| 16         | 102              | 1144       | BF16 using torch tensor |\\n\\n**Notes:**\\n* The model has been trained in bf16 so it is **recommended to use it in bf16**.\\n* **Using torch tensor**: The speed using torch tensor is not achieved using pipeline. Avoid pipeline use in production.\\n* Measurements taken on **NVIDIA RTX 3090**, expect better metrics in more powerful servers.\\n* Throughput increases with larger batch sizes due to better GPU utilization. 
Consider your use case when selecting batch size.\\n* Optimizations listed are suggestions that could further improve performance.\\n* **Using torch tensors is specially indicated** in cases such as use the model for **text to image models or similar** because the output is already in tensor format.\\n\\n\\n\\n## License\\n\\nThis project is licensed under the MIT License - Copyright 2025 Freepik Company S.L.\\n\\n\\n## Citation\\n\\nIf you use this model in your research or project, please cite it as:\\n\\n```bibtex\\n@software{freepik2025nsfw,\\n    title={EVA-based Fast NSFW Image Classifier},\\n    author={Freepik Company S.L.},\\n    year={2025},\\n    publisher={Hugging Face},\\n    url = {\\n    organization = {Freepik Company S.L.}\\n}\\n```\\n\\n## Acknowledgements\\n\\nThis model is based on the EVA architecture (), as described in the following paper:\\n\\nEVA-02: A Visual Representation for Neon Genesis - \\n\\n```bibtex\\n@article{EVA02,\\n  title={EVA-02: A Visual Representation for Neon Genesis},\\n  author={Fang, Yuxin and Sun, Quan and Wang, Xinggang and Huang, Tiejun and Wang, Xinlong and Cao, Yue},\\n  journal={arXiv preprint arXiv:2303.11331},\\n  year={2023}\\n}\\n```',\n",
       "  'domain': 'image-classification'},\n",
       " {'model_id': 'iSEE-Laboratory/llmdet_base',\n",
       "  'created_at': '2025-05-30T10:10:59+00:00',\n",
       "  'downloads': 403263,\n",
       "  'likes': 9,\n",
       "  'author': None,\n",
       "  'tags': ['transformers',\n",
       "   'safetensors',\n",
       "   'mm-grounding-dino',\n",
       "   'zero-shot-object-detection',\n",
       "   'vision',\n",
       "   'arxiv:2501.18954',\n",
       "   'arxiv:2104.12763',\n",
       "   'license:apache-2.0',\n",
       "   'endpoints_compatible',\n",
       "   'region:us'],\n",
       "  'modelcard': '\\n\\n# LLMDet (base variant)\\n\\n model was proposed in  by Shenghao Fu, Qize Yang, Qijie Mo, Junkai Yan, Xihan Wei, Jingke Meng, Xiaohua Xie, Wei-Shi Zheng.\\n\\nLLMDet improves upon the  and  by co-training the model with a large language model.\\n\\nYou can find all the LLMDet checkpoints under the  collection. Note that these checkpoints are inference only -- they do not include LLM which was used for training. The inference is identical to that of .\\n\\n\\n## Intended uses\\n\\nYou can use the raw model for zero-shot object detection.\\n\\nHere\\'s how to use the model for zero-shot object detection:\\n\\n```py\\nimport torch\\nfrom transformers import AutoModelForZeroShotObjectDetection, AutoProcessor\\nfrom transformers.image_utils import load_image\\n\\n\\n# Prepare processor and model\\nmodel_id = \"iSEE-Laboratory/llmdet_base\"\\ndevice = \"cuda\" if torch.cuda.is_available() else \"cpu\"\\nprocessor = AutoProcessor.from_pretrained(model_id)\\nmodel = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device)\\n\\n# Prepare inputs\\nimage_url = \"\\nimage = load_image(image_url)\\ntext_labels = [[\"a cat\", \"a remote control\"]]\\ninputs = processor(images=image, text=text_labels, return_tensors=\"pt\").to(device)\\n\\n# Run inference\\nwith torch.no_grad():\\n    outputs = model(**inputs)\\n\\n# Postprocess outputs\\nresults = processor.post_process_grounded_object_detection(\\n    outputs,\\n    threshold=0.4,\\n    target_sizes=[(image.height, image.width)]\\n)\\n\\n# Retrieve the first image result\\nresult = results[0]\\nfor box, score, labels in zip(result[\"boxes\"], result[\"scores\"], result[\"labels\"]):\\n    box = [round(x, 2) for x in box.tolist()]\\n    print(f\"Detected {labels} with confidence {round(score.item(), 3)} at location {box}\")\\n```\\n\\n## Training Data\\n\\nThis model was trained on:\\n - \\n - \\n - \\n - \\n\\n\\n## Evaluation results\\n\\n- Here\\'s a table of LLMDet models and 
their performance on LVIS (results from ):\\n\\n    |                             Model                         | Pre-Train Data            |  MiniVal APr | MiniVal APc | MiniVal APf | MiniVal AP  | Val1.0 APr | Val1.0 APc | Val1.0 APf |  Val1.0 AP  |\\n    | --------------------------------------------------------- | -------------------------------------------- | ------------ | ----------- | ----------- | ----------- | ---------- | ---------- | ---------- | ----------- |\\n    |    | (O365,GoldG,GRIT,V3Det) + GroundingCap-1M    | 44.7         | 37.3        | 39.5        | 50.7        | 34.9       | 26.0       | 30.1       | 44.3        |\\n    |    | (O365,GoldG,V3Det) + GroundingCap-1M         | 48.3         | 40.8        | 43.1        | 54.3        | 38.5       | 28.2       | 34.3       | 47.8        |\\n    |  | (O365V2,OpenImageV6,GoldG) + GroundingCap-1M | 51.1         | 45.1        | 46.1        | 56.6        | 42.0       | 31.6       | 38.8       | 50.2        |\\n\\n\\n\\n## BibTeX entry and citation info\\n\\n```bib\\n@article{fu2025llmdet,\\n  title={LLMDet: Learning Strong Open-Vocabulary Object Detectors under the Supervision of Large Language Models},\\n  author={Fu, Shenghao and Yang, Qize and Mo, Qijie and Yan, Junkai and Wei, Xihan and Meng, Jingke and Xie, Xiaohua and Zheng, Wei-Shi},\\n  journal={arXiv preprint arXiv:2501.18954},\\n  year={2025}\\n}\\n```\\n',\n",
       "  'domain': 'zero-shot-object-detection'},\n",
       " {'model_id': 'tencent/HunyuanVideo-1.5',\n",
       "  'created_at': '2025-11-18T06:36:54+00:00',\n",
       "  'downloads': 384639,\n",
       "  'likes': 699,\n",
       "  'author': None,\n",
       "  'tags': ['HunyuanVideo-1.5',\n",
       "   'diffusers',\n",
       "   'safetensors',\n",
       "   'text-to-video',\n",
       "   'image-to-video',\n",
       "   'en',\n",
       "   'zh',\n",
       "   'arxiv:2511.18870',\n",
       "   'license:other',\n",
       "   'region:us'],\n",
       "  'modelcard': '\\n\\n\\n\\n# HunyuanVideo-1.5\\n\\n\\n\\n\\n\\n# 🎬 HunyuanVideo-1.5: A leading lightweight video generation model\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nHunyuanVideo-1.5 is a video generation model that delivers top-tier quality with only 8.3B parameters, significantly lowering the barrier to usage. It runs smoothly on consumer-grade GPUs, making it accessible for every developer and creator. This repository provides the implementation and tools needed to generate creative videos.\\n\\n\\n\\n  \\n  \\n  \\n  \\n  \\n   \\n  \\n  \\n  \\n  \\n\\n\\n\\n\\n\\n    👏 Join our WeChat and Discord | \\n💻 Official website Try our model!&nbsp&nbsp\\n\\n\\n## 🔥🔥🔥 News\\n* 🚀 Dec 23, 2025: Fp8 gemm inference is supported! 🔥🔥🔥🆕\\n* 🚀 Dec 05, 2025: **New Release**: We now release the , which generates videos in 8 or 12 steps (recommended)! On RTX 4090, end-to-end generation time is reduced by 75%, and a single RTX 4090 can generate videos within **75 seconds**. The step-distilled model maintains comparable quality to the original model while achieving significant speedup. See  for detailed quality comparisons. For even faster generation, you can also try 4 steps (faster speed with slightly reduced quality). **To enable the step-distilled model, run `generate.py` with the `--enable_step_distill` parameter.** See  for detailed usage instructions. 🔥🔥🔥🆕\\n* 📚 Dec 05, 2025: **Training Code & LoRA Tuning Script Released**: We now open-source the training code for HunyuanVideo-1.5! The training script (`train.py`) provides a full training pipeline with support for distributed training, FSDP, context parallel, gradient checkpointing, and more. HunyuanVideo-1.5 is trained using the Muon optimizer, which we have open-sourced in the  section. **If you would like to continue training our model or fine-tune it with LoRA, please use the Muon optimizer.** See  section for detailed usage instructions. 
🔥🔥🔥🆕\\n* 🎉 **Diffusers Support**: HunyuanVideo-1.5 is now available on Hugging Face Diffusers! Check out  for easy integration. 🔥🔥🔥🆕\\n* 🚀 Nov 27, 2025: We now support cache inference (deepcache, teacache, taylorcache), achieving significant speedup! Pull the latest code to try it.\\n* 🚀 Nov 24, 2025: We now support deepcache inference.\\n* 👋 Nov 20, 2025: We release the inference code and model weights of HunyuanVideo-1.5.\\n\\n\\n## 🎥 Demo\\n\\n   \\n\\n\\n## 🧩 Community Contributions\\n\\nIf you develop/use HunyuanVideo-1.5 in your projects, welcome to let us know.\\n\\n- **Diffusers** - : Official Hugging Face Diffusers integration for HunyuanVideo-1.5. Easily use HunyuanVideo-1.5 with the Diffusers library for seamless integration into your projects. See  section for details.\\n\\n- **ComfyUI** - : A powerful and modular diffusion model GUI with a graph/nodes interface. ComfyUI supports HunyuanVideo-1.5 with various engineering optimizations for fast inference. We provide a  for HunyuanVideo-1.5.\\n\\n- **Community-implemented ComfyUI Plugin** - : A community-implemented ComfyUI plugin for HunyuanVideo-1.5, offering both simplified and complete node sets for quick usage or deep workflow customization, with built-in automatic model download support.\\n\\n- **LightX2V** - : A lightweight and efficient video generation framework that integrates HunyuanVideo-1.5, supporting multiple engineering acceleration techniques for fast inference.\\n\\n- **Wan2GP v9.62** - : WanGP is a very low VRAM app (as low 6 GB of VRAM for Hunyuan Video 1.5) supports Lora Accelerator for a 8 steps generation and offers tools to facilitate Video Generation.\\n\\n- **ComfyUI-MagCache** - : MagCache is a training-free caching approach that accelerates video generation by estimating fluctuating differences among model outputs across timesteps. 
It achieves 1.7x speedup for HunyuanVideo-1.5 with 20 inference steps.\\n\\n\\n## 📑 Open-source Plan\\n- HunyuanVideo-1.5 (T2V/I2V)\\n  - [x] Inference Code and checkpoints\\n  - [x] ComfyUI Support\\n  - [x] LightX2V Support\\n  - [x] Diffusers Support\\n  - [ ] Release all model weights (Sparse attention, distill model, and SR models)\\n\\n## 📋 Table of Contents\\n- \\n- \\n- \\n- \\n- \\n- \\n- \\n- \\n- \\n- \\n- \\n  - \\n  - \\n  - \\n  - \\n  - \\n  - \\n  - \\n- \\n- \\n- \\n- \\n- \\n- \\n\\n\\n## 📖 Introduction\\nWe present HunyuanVideo-1.5, a lightweight yet powerful video generation model that achieves state-of-the-art visual quality and motion coherence with only 8.3 billion parameters, enabling efficient inference on consumer-grade GPUs. This achievement is built upon several key components, including meticulous data curation, an advanced DiT architecture with selective and sliding tile attention(SSTA), enhanced bilingual understanding through glyph-aware text encoding , progressive pre-training and post-training, and an efficient video super-resolution network. Leveraging these designs, we developed a unified framework capable of high-quality text-to-video and image-to-video generation across multiple durations and resolutions. Extensive experiments demonstrate that this compact and proficient model establishes a new state-of-the-art among open-source models. By releasing the code and weights of HunyuanVideo-1.5, we provide the community with a high-performance foundation that significantly lowers the cost of video creation and research, making advanced video generation more accessible to all.\\n\\n\\n## ✨ Key Features\\n- **Lightweight High-Performance Architecture**: We propose an efficient architecture that integrates an 8.3B-parameter Diffusion Transformer (DiT) with a 3D causal VAE, achieving compression ratios of 16× in spatial dimensions and 4× along the temporal axis. 
Additionally, the innovative SSTA (Selective and Sliding Tile Attention) mechanism prunes redundant spatiotemporal kv blocks, significantly reduces computational overhead for long video sequences and accelerates inference, achieving an end-to-end speedup of $1.87 \\\\times$ in 10-second 720p video synthesis compared to FlashAttention-3.\\n\\n\\n\\n \\n\\n\\n- **Video Super-Resolution Enhancement**: We develop an efficient few-step super-resolution network that upscales outputs to 1080p. It enhances sharpness while correcting distortions, thereby refining details and overall visual texture.\\n\\n\\n\\n \\n\\n- **End-to-End Training Optimization**: This work employs a multi-stage, progressive training strategy covering the entire pipeline from pre-training to post-training. Combined with the Muon optimizer to accelerate convergence, this approach holistically refines motion coherence, aesthetic quality, and human preference alignment, achieving professional-grade content generation.\\n\\n## 📜 System Requirements\\n\\n### Hardware Requirements\\n\\n- **GPU**: NVIDIA GPU with CUDA support\\n- **Minimum GPU Memory**: 14 GB (with model offloading enabled)\\n  \\n  > **Note:** The memory requirements above are measured with model offloading enabled. 
If your GPU has sufficient memory, you may disable offloading for improved inference speed.\\n\\n### Software Requirements\\n\\n- **Operating System**: Linux\\n- **Python**: Python 3.10 or higher\\n- **CUDA**: Compatible CUDA version for your PyTorch installation\\n\\n## 🛠️ Dependencies and Installation\\n\\n### Step 1: Clone the Repository\\n\\n```bash\\ngit clone \\ncd HunyuanVideo-1.5\\n```\\n\\n### Step 2: Install Basic Dependencies\\n\\n```bash\\npip install -r requirements.txt\\npip install -i  --upgrade tencentcloud-sdk-python\\n```\\n\\n### Step 3: Install Attention Libraries\\n\\n* Flash Attention: \\n  Install Flash Attention for faster inference and reduced GPU memory consumption.\\n  Detailed installation instructions are available at .\\n\\n* Flex-Block-Attention: \\n  flex-block-attn is only required for sparse attention to achieve faster inference and can be installed by the following command:\\n  ```bash\\n  git clone \\n  cd flex-block-attn\\n  git submodule update --init --recursive\\n  python3 setup.py install\\n  ```\\n\\n* SageAttention: \\n  To enable SageAttention for faster inference, you need to install it by the following command:\\n  > **Note**: Enabling SageAttention will automatically disable Flex-Block-Attention.\\n  ```bash\\n  git clone \\n  cd SageAttention \\n  export EXT_PARALLEL=4 NVCC_APPEND_FLAGS=\"--threads 8\" MAX_JOBS=32 # Optional\\n  python3 setup.py install\\n  ```\\n\\n* SGL-Kernel:\\n  To enable fp8 gemm for transformer, you need to install it by the following command:\\n  ```bash\\n  pip install sgl-kernel==0.3.18\\n  ```\\n\\n\\n## 🧱 Download Pretrained Models\\n\\n> 💡 Distillation models and sparse attention models are still coming soon. Please stay tuned for the latest updates on the Hugging Face Model Card.\\n\\nDownload the pretrained models before generating videos. 
Detailed instructions are available at .\\n\\n### Model Cards\\n|ModelName| Download                     |\\n|-|---------------------------| \\n|HunyuanVideo-1.5-480P-T2V| |\\n|HunyuanVideo-1.5-480P-I2V | |\\n|HunyuanVideo-1.5-480P-T2V-cfg-distill |  |\\n|HunyuanVideo-1.5-480P-I2V-cfg-distill | |\\n|HunyuanVideo-1.5-480P-I2V-step-distill | |\\n|HunyuanVideo-1.5-720P-T2V| |\\n|HunyuanVideo-1.5-720P-I2V | |\\n|HunyuanVideo-1.5-720P-T2V-cfg-distill| Comming soon |\\n|HunyuanVideo-1.5-720P-I2V-cfg-distill | |\\n|HunyuanVideo-1.5-720P-T2V-sparse-cfg-distill| Comming soon |\\n|HunyuanVideo-1.5-720P-I2V-sparse-cfg-distill | |\\n|HunyuanVideo-1.5-720P-sr-step-distill | |\\n|HunyuanVideo-1.5-1080P-sr-step-distill | |\\n\\n## 📝 Prompt Guide\\n### Prompt Writing Handbook\\nPrompt enhancement plays a crucial role in enabling our model to generate high-quality videos. By writing longer and more detailed prompts, the generated video will be significantly improved. We encourage you to craft comprehensive and descriptive prompts to achieve the best possible video quality. we recommend community partners consulting our official guide on how to write effective prompts. \\n\\n**Reference:** ****\\n\\n### System Prompts for Automatic Prompt Enhancement\\nFor users seeking to optimize prompts for other large models, it is recommended to consult the definition of `t2v_rewrite_system_prompt` in the file `hyvideo/utils/rewrite/t2v_prompt.py` to guide text-to-video rewriting. Similarly, for image-to-video rewriting, refer to the definition of `i2v_rewrite_system_prompt` in `hyvideo/utils/rewrite/i2v_prompt.py`.\\n\\n## 🔑 Inference\\n\\n### Inference with Source Code\\n\\n\\nFor prompt rewriting, we recommend using Gemini or models deployed via vLLM. This codebase currently only supports models compatible with the vLLM API. 
If you wish to use Gemini, you will need to implement your own interface calls.\\n\\nFor models with a vLLM API, note that T2V (text-to-video) and I2V (image-to-video) have different recommended models and environment variables:\\n\\n- T2V: use , configure `T2V_REWRITE_BASE_URL` and `T2V_REWRITE_MODEL_NAME`\\n- I2V: use , configure `I2V_REWRITE_BASE_URL` and `I2V_REWRITE_MODEL_NAME`\\n\\n> You may set the above model names to any other vLLM-compatible models you have deployed (including HuggingFace models).  \\n> Rewriting is enabled by default (`--rewrite` defaults to `true`); to disable it explicitly, use `--rewrite false` or `--rewrite 0`. If no vLLM endpoint is configured, the pipeline runs without remote rewriting.\\n\\nExample: Generate a video (works for both T2V and I2V; set `IMAGE_PATH=none` for T2V or provide an image path for I2V)\\n\\n> 💡 **Tip**: For faster inference speed, you can enable the step-distilled model using the `--enable_step_distill` parameter. The step-distilled model (480p I2V) can generate videos in 8 or 12 steps (recommended), achieving up to 75% speedup on RTX 4090 while maintaining comparable quality.\\n>\\n> **Tips:** If your GPU memory is > 14GB but you encounter OOM (Out of Memory) errors during generation, you can try setting the following environment variable before running:\\n> ```bash\\n> export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True,max_split_size_mb:128\\n> ```\\n> \\n> **Tips:** If you have limited CPU memory and encounter OOM during inference, you can try disable overlapped group offloading by adding the following argument:\\n> ```bash\\n> --overlap_group_offloading false\\n> ```\\n\\n```bash\\nexport T2V_REWRITE_BASE_URL=\"\"\\nexport T2V_REWRITE_MODEL_NAME=\"\"\\nexport I2V_REWRITE_BASE_URL=\"\"\\nexport I2V_REWRITE_MODEL_NAME=\"\"\\n\\nPROMPT=\\'A girl holding a paper with words \"Hello, world!\"\\'\\n\\nIMAGE_PATH=/path/to/image.png # Optional, none or  to enable i2v 
mode\\nSEED=1\\nASPECT_RATIO=16:9\\nRESOLUTION=480p\\nOUTPUT_PATH=./outputs/output.mp4\\nMODEL_PATH=./ckpts # Path to pretrained model\\n\\n# Configuration for faster inference\\nN_INFERENCE_GPU=8 # Parallel inference GPU count\\nCFG_DISTILLED=true # Inference with CFG distilled model, 2x speedup\\nSAGE_ATTN=true # Inference with SageAttention\\nSPARSE_ATTN=false # Inference with sparse attention (only 720p models are equipped with sparse attention). Please ensure flex-block-attn is installed\\nOVERLAP_GROUP_OFFLOADING=true # Only valid when group offloading is enabled, significantly increases CPU memory usage but speeds up inference\\nENABLE_CACHE=true # Enable feature cache during inference. Significantly speeds up inference.\\nCACHE_TYPE=deepcache # Support: deepcache, teacache, taylorcache\\nENABLE_STEP_DISTILL=true # Enable step distilled model for 480p I2V, recommended 8 or 12 steps, up to 6x speedup\\n\\n\\n# Configuration for better quality\\nREWRITE=true # Enable prompt rewriting. 
Please ensure rewrite vLLM server is deployed and configured.\\nENABLE_SR=true # Enable super resolution\\n\\n\\ntorchrun --nproc_per_node=$N_INFERENCE_GPU generate.py \\\\\\n  --prompt \"$PROMPT\" \\\\\\n  --image_path $IMAGE_PATH \\\\\\n  --resolution $RESOLUTION \\\\\\n  --aspect_ratio $ASPECT_RATIO \\\\\\n  --seed $SEED \\\\\\n  --rewrite $REWRITE \\\\\\n  --cfg_distilled $CFG_DISTILLED \\\\\\n  --enable_step_distill $ENABLE_STEP_DISTILL \\\\\\n  --sparse_attn $SPARSE_ATTN --use_sageattn $SAGE_ATTN \\\\\\n  --enable_cache $ENABLE_CACHE --cache_type $CACHE_TYPE \\\\\\n  --overlap_group_offloading $OVERLAP_GROUP_OFFLOADING \\\\\\n  --sr $ENABLE_SR --save_pre_sr_video \\\\\\n  --output_path $OUTPUT_PATH \\\\\\n  --model_path $MODEL_PATH\\n```\\n\\n\\n\\n### Command Line Arguments\\n\\n| Argument | Type | Required | Default | Description |\\n|----------|------|----------|---------|-------------|\\n| `--prompt` | str | Yes | - | Text prompt for video generation |\\n| `--negative_prompt` | str | No | `\\'\\'` | Negative prompt for video generation |\\n| `--resolution` | str | Yes | - | Video resolution: `480p` or `720p` |\\n| `--model_path` | str | Yes | - | Path to pretrained model directory |\\n| `--aspect_ratio` | str | No | `16:9` | Aspect ratio of the output video |\\n| `--num_inference_steps` | int | No | `50` | Number of inference steps |\\n| `--video_length` | int | No | `121` | Number of frames to generate |\\n| `--seed` | int | No | `123` | Random seed for reproducibility |\\n| `--image_path` | str | No | `None` | Path to reference image (enables i2v mode). 
Use `none` or `None` to explicitly use text-to-video mode |\\n| `--output_path` | str | No | `None` | Output file path (if not provided, saves to `./outputs/output_{transformer_version}_{timestamp}.mp4`) |\\n| `--sr` | bool | No | `true` | Enable super resolution (use `--sr false` or `--sr 0` to disable) |\\n| `--save_pre_sr_video` | bool | No | `false` | Save original video before super resolution (use `--save_pre_sr_video` or `--save_pre_sr_video true` to enable, only effective when super resolution is enabled) |\\n| `--rewrite` | bool | No | `true` | Enable prompt rewriting (use `--rewrite false` or `--rewrite 0` to disable, may result in lower quality video generation) |\\n| `--cfg_distilled` | bool | No | `false` | Enable CFG distilled model for faster inference (~2x speedup, use `--cfg_distilled` or `--cfg_distilled true` to enable) |\\n| `--enable_step_distill` | bool | No | `false` | Enable step distilled model for 480p I2V (recommended 8 or 12 steps, ~75% speedup on RTX 4090, use `--enable_step_distill` or `--enable_step_distill true` to enable) |\\n| `--sparse_attn` | bool | No | `false` | Enable sparse attention for faster inference (~1.5-2x speedup, requires H-series GPUs, auto-enables CFG distilled, use `--sparse_attn` or `--sparse_attn true` to enable) |\\n| `--offloading` | bool | No | `true` | Enable CPU offloading (use `--offloading false` or `--offloading 0` to disable for faster inference if GPU memory allows) |\\n| `--group_offloading` | bool | No | `None` | Enable group offloading (default: None, automatically enabled if offloading is enabled. Use `--group_offloading` or `--group_offloading true/1` to enable, `--group_offloading false/0` to disable) |\\n| `--overlap_group_offloading` | bool | No | `true` | Enable overlap group offloading (default: true). Significantly increases CPU memory usage but speeds up inference. 
Use `--overlap_group_offloading` or `--overlap_group_offloading true/1` to enable, `--overlap_group_offloading false/0` to disable |\\n| `--dtype` | str | No | `bf16` | Data type for transformer: `bf16` (faster, lower memory) or `fp32` (better quality, slower, higher memory) |\\n| `--use_sageattn` | bool | No | `false` | Enable SageAttention (use `--use_sageattn` or `--use_sageattn true/1` to enable, `--use_sageattn false/0` to disable) |\\n| `--sage_blocks_range` | str | No | `0-53` | SageAttention blocks range (e.g., `0-5` or `0,1,2,3,4,5`) |\\n| `--enable_cache` | bool | No | `false` | Enable cache for transformer (use `--enable_cache` or `--enable_cache true/1` to enable, `--enable_cache false/0` to disable) |\\n| `--cache_type` | str | No | `deepcache` | Cache type for transformer (e.g., `deepcache, teacache, taylorcache`) |\\n| `--no_cache_block_id` | str | No | `53` | Blocks to exclude from deepcache (e.g., `0-5` or `0,1,2,3,4,5`) |\\n| `--cache_start_step` | int | No | `11` | Start step to skip when using cache |\\n| `--cache_end_step` | int | No | `45` | End step to skip when using cache |\\n| `--total_steps` | int | No | `50` | Total inference steps |\\n| `--cache_step_interval` | int | No | `4` | Step interval to skip when using cache |\\n\\n**Note:** Use `--nproc_per_node` to specify the number of GPUs. 
For example, `--nproc_per_node=8` uses 8 GPUs.\\n\\n### Optimal Inference Configurations\\n\\nThe following table provides the optimal inference configurations (CFG scale, embedded CFG scale, flow shift, and inference steps) for each model to achieve the best generation quality:\\n\\n| Model | CFG Scale | Embedded CFG Scale | Flow Shift | Inference Steps |\\n|-------|-----------|-------------------|------------|-----------------|\\n| 480p T2V | 6 | None | 5 | 50 |\\n| 480p I2V | 6 | None | 5 | 50 |\\n| 720p T2V | 6 | None | 9 | 50 |\\n| 720p I2V | 6 | None | 7 | 50 |\\n| 480p T2V CFG Distilled | 1 | None | 5 | 50 |\\n| 480p I2V CFG Distilled | 1 | None | 5 | 50 |\\n| 480p I2V Step Distilled | 1 | None | 7 | 8 or 12 (recommended) |\\n| 720p T2V CFG Distilled | 1 | None | 9 | 50 |\\n| 720p I2V CFG Distilled | 1 | None | 7 | 50 |\\n| 720p T2V CFG Distilled Sparse | 1 | None | 9 | 50 |\\n| 720p I2V CFG Distilled Sparse | 1 | None | 7 | 50 |\\n| 480→720 SR Step Distilled | 1 | None | 2 | 6 |\\n| 720→1080 SR Step Distilled | 1 | None | 2 | 8 |\\n\\n**Please note that the cfg distilled model we provided, must use 50 steps to generate correct results.**\\n\\n### Usage with Diffusers\\n\\nHunyuanVideo-1.5 is available on Hugging Face Diffusers! 
You can easily use it with the Diffusers library:\\n\\n**Basic Usage:**\\n\\n```python\\nimport torch\\n\\ndtype = torch.bfloat16\\ndevice = \"cuda:0\"\\n\\nfrom diffusers import HunyuanVideo15Pipeline\\nfrom diffusers.utils import export_to_video\\n\\npipe = HunyuanVideo15Pipeline.from_pretrained(\"hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-720p_t2v\", torch_dtype=dtype)\\npipe.enable_model_cpu_offload()\\npipe.vae.enable_tiling()\\n\\ngenerator = torch.Generator(device=device).manual_seed(seed)\\n\\nvideo = pipe(\\n    prompt=prompt,\\n    generator=generator,\\n    num_frames=121,\\n    num_inference_steps=50,\\n).frames[0]\\n\\nexport_to_video(video, \"output.mp4\", fps=24)\\n```\\n\\n**Optimized Usage with Attention Backend:**\\n\\nHunyuanVideo-1.5 uses attention masks with variable-length sequences. For best performance, we recommend using an attention backend that handles padding efficiently.\\n\\nWe recommend installing kernels (`pip install kernels`) to access prebuilt attention kernels.\\n\\n```python\\nimport torch\\n\\ndtype = torch.bfloat16\\ndevice = \"cuda:0\"\\n\\nfrom diffusers import HunyuanVideo15Pipeline, attention_backend\\nfrom diffusers.utils import export_to_video\\n\\npipe = HunyuanVideo15Pipeline.from_pretrained(\"hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-720p_t2v\", torch_dtype=dtype)\\npipe.enable_model_cpu_offload()\\npipe.vae.enable_tiling()\\n\\ngenerator = torch.Generator(device=device).manual_seed(seed)\\n\\nwith attention_backend(\"_flash_3_hub\"): # or `\"flash_hub\"` if you are not on H100/H800\\n    video = pipe(\\n        prompt=prompt,\\n        generator=generator,\\n        num_frames=121,\\n        num_inference_steps=50,\\n    ).frames[0]\\n    export_to_video(video, \"output.mp4\", fps=24)\\n```\\n\\nFor more details, please visit .\\n\\n\\n## 🎓 Training\\n\\nHunyuanVideo-1.5 is trained using the **Muon optimizer**, which accelerates convergence and improves training stability. 
The Muon optimizer combines momentum-based updates with Newton-Schulz orthogonalization for efficient optimization of large-scale video generation models.\\n\\n### Quick Start\\n\\nThe training script (`train.py`) provides a complete training pipeline for HunyuanVideo-1.5. Here\\'s how to use it:\\n\\n#### 1. Implement Your DataLoader\\n\\nReplace the `create_dummy_dataloader()` function in `train.py` with your own implementation. Your dataset\\'s `__getitem__` method should return a single sample.\\n\\n- **Required fields:**\\n  - `\"pixel_values\"`: `torch.Tensor` - Video: `[C, F, H, W]` or Image: `[C, H, W]`\\n    - Pixel values must be in range `[-1, 1]` \\n    - Note: For video data, temporal dimension F must be `4n+1` (e.g., 1, 5, 9, 13, 17, ...)\\n  - `\"text\"`: `str` - Text prompt for this sample\\n  - `\"data_type\"`: `str` - `\"video\"` or `\"image\"`\\n\\n- **Optional fields (for performance optimization):**\\n  - `\"latents\"`: Pre-encoded VAE latents (skips VAE encoding for faster training)\\n  - `\"byt5_text_ids\"` and `\"byt5_text_mask\"`: Pre-tokenized byT5 inputs\\n\\nSee the `create_dummy_dataloader()` function in `train.py` for detailed format documentation.\\n\\n#### 2. Run Training\\n\\n**Single GPU:**\\n```bash\\npython train.py --pretrained_model_root  [other args]\\n```\\n\\n**Multi-GPU:**\\n```bash\\nN=8\\ntorchrun --nproc_per_node=$N train.py --pretrained_model_root  [other args]\\n```\\n\\n**Example:**\\n```bash\\ntorchrun --nproc_per_node=8 train.py \\\\\\n  --pretrained_model_root ./ckpts \\\\\\n  --learning_rate 1e-5 \\\\\\n  --batch_size 1 \\\\\\n  --max_steps 10000 \\\\\\n  --output_dir ./outputs \\\\\\n  --enable_fsdp \\\\\\n  --enable_gradient_checkpointing \\\\\\n  --sp_size 8\\n```\\n\\n#### 3. 
Key Training Parameters\\n\\n| Parameter | Description | Default |\\n|-----------|-------------|---------|\\n| `--pretrained_model_root` | Path to pretrained model (required) | - |\\n| `--learning_rate` | Learning rate | 1e-5 |\\n| `--batch_size` | Batch size | 1 |\\n| `--max_steps` | Maximum training steps | 10000 |\\n| `--warmup_steps` | Warmup steps | 500 |\\n| `--gradient_accumulation_steps` | Gradient accumulation steps | 1 |\\n| `--enable_fsdp` | Enable FSDP for distributed training | true |\\n| `--enable_gradient_checkpointing` | Enable gradient checkpointing | true |\\n| `--sp_size` | Sequence parallelism size (must divide world_size) | 8 |\\n| `--i2v_prob` | Probability of i2v task for video data | 0.3 |\\n| `--use_muon` | Use Muon optimizer | true |\\n| `--resume_from_checkpoint` | Resume from checkpoint directory | None |\\n| `--use_lora` | Enable LoRA fine-tuning | false |\\n| `--lora_r` | LoRA rank | 8 |\\n| `--lora_alpha` | LoRA alpha scaling parameter | 16 |\\n| `--lora_dropout` | LoRA dropout rate | 0.0 |\\n| `--pretrained_lora_path` | Path to pretrained LoRA adapter | None |\\n\\n#### 4. Monitor Training\\n\\n- Checkpoints are saved to `output_dir` at intervals specified by `--save_interval`\\n- Validation videos are generated at intervals specified by `--validation_interval`\\n- Training logs are printed to console at intervals specified by `--log_interval`\\n\\n#### 5. Resume Training\\n\\nUse `--resume_from_checkpoint ` to resume from a saved checkpoint:\\n```bash\\npython train.py \\\\\\n  --pretrained_model_root  \\\\\\n  --resume_from_checkpoint ./outputs/checkpoint-1000\\n```\\n\\n#### 6. LoRA Fine-tuning\\n\\nTo enable LoRA fine-tuning, add `--use_lora` to your training command. 
LoRA adapters will be saved in the checkpoint directory under `lora/`:\\n\\n```bash\\ntorchrun --nproc_per_node=8 train.py \\\\\\n  --pretrained_model_root ./ckpts \\\\\\n  --use_lora \\\\\\n  --lora_r 8 \\\\\\n  --lora_alpha 16 \\\\\\n  --learning_rate 1e-4 \\\\\\n  --output_dir ./outputs\\n```\\n\\nTo load a pretrained LoRA adapter, use `--pretrained_lora_path`:\\n```bash\\ntorchrun --nproc_per_node=8 train.py \\\\\\n  --pretrained_model_root ./ckpts \\\\\\n  --use_lora \\\\\\n  --pretrained_lora_path ./outputs/checkpoint-1000/lora/default\\n```\\n\\n\\n## 📊 Evaluation\\n\\n### Rating\\nWe assess text-to-video generation using a comprehensive rating methodology that considers five key dimensions: text-video consistency, visual quality, structural stability, motion effects, and the aesthetic quality of individual frames. For image-to-video generation, the evaluation encompasses image-video consistency, instruction responsiveness, visual quality, structural stability, and motion effects.\\n\\n\\n\\n \\n\\n---\\n\\n\\n\\n \\n\\n\\n### GSB\\nThe GSB(Good/Same/Bad) approach is widely used to evaluate the relative performance of two models based on overall video perception quality.We carefully construct 300 diverse text prompts and 300 image samples to cover balanced application scenarios for both text-to-video and image-to-video tasks. For each prompt or image input, an equal number of video samples are generated by each model in a single run to ensure comparability. To maintain fairness, inference is performed only once per input without any cherry-picking of results. All competing models are evaluated using their default configurations. 
The evaluation is conducted by over 100 professional assessors\\n\\n\\n\\n\\n\\n---\\n\\n\\n\\n \\n\\n\\n### Inference speed\\nWe report inference speed with basic engineering-level acceleration techniques enabled on 8 H800 GPUs to demonstrate practical performance achievable in real-world deployment scenarios.\\nPlease note that in this experiment, we do not pursue the most extreme acceleration at the cost of generation quality, but rather to achieve notable speed improvements while maintaining nearly identical output quality.\\n\\nWe report the total inference time for 50 diffusion steps for HunyuanVideo 1.5 below:\\n\\n\\n\\n \\n\\n## 🎬 More Examples\\n|Features|Demo1|Demo2|\\n|------|------|------|\\n|Strong Instruction Following|  📋 Show input prompt ```一名哀伤的黑发中国女子凝望天空，复古胶片风格烘托出怀旧戏剧氛围```  📋 Show rewrite prompt ```俯视角度，一位有着深色，略带凌乱的长卷发的年轻中国女性，佩戴着闪耀的珍珠项链和圆形金色耳环，她凌乱的头发被风吹散，她微微抬头，望向天空，神情十分哀伤，眼中含着泪水。嘴唇涂着红色口红。背景是带有华丽红色花纹的图案。画面呈现复古电影风格，色调低饱和，带着轻微柔焦，烘托情绪氛围，质感仿佛20世纪90年代的经典胶片风格，营造出怀旧且富有戏剧性的感觉。``` |  📋 Show input prompt ```建筑蓝图上的线条化为实体，瞬间生长出一个完整的复古工业风办公空间。```  📋 Show rewrite prompt ```一座空旷的现代阁楼里，有一张铺展在地板中央的建筑蓝图。忽然间，图纸上的线条泛起微光，仿佛被某种无形的力量唤醒。紧接着，那些发光的线条开始向上延伸，从平面中挣脱，勾勒出立体的轮廓——就像在空中进行一场无声的3D打印。随后，奇迹在加速发生：极简的橡木办公桌、优雅的伊姆斯风格皮质椅、高挑的工业风金属书架，还有几盏爱迪生灯泡，以光纹为骨架迅速“生长”出来。转瞬间，线条被真实的材质填充——木材的温润、皮革的质感、金属的冷静，都在眨眼间完整呈现。最终，所有家具稳固落地，蓝图的光芒悄然褪去。一个完整的办公空间，就这样从二维的图纸中诞生。``` |\\n|Smooth Motion Generation|  📋 Show input prompt ```A DJ is immersed in his musical world. He wears a pair of professional, matte-black headphones, revealing a focused expression. He wears a black bomber jacket, zipped open to reveal a T-shirt underneath. His upper body sways back and forth rhythmically to the throbbing electronic beats, his head moving with precise movement. The mixing console in front of him serves as the primary source of light. 
In the distance, the cool white glow of several stadium floodlights casts a deep, dark haze across the vast field, casting long shadows across the emerald green grass, creating a stark contrast to the brightly lit area surrounding the DJ booth. His hands danced swiftly and precisely across the equipment. The entire scene was filled with high-tech dynamics and the solitary creative passion. Against the backdrop of the vast and silent night stadium, it created an atmosphere of high focus, energy, and a slightly surreal feeling.```  📋 Show rewrite prompt ```slowly advancing medium shot, shot from a level angle, focuses on the center of an empty football field, where a DJ is immersed in his musical world. He wears a pair of professional, matte-black headphones, one earcup slightly removed, revealing a focused expression and a brow beaded with sweat from his intense concentration. He wears a black bomber jacket, zipped open to reveal a T-shirt underneath. His upper body sways back and forth rhythmically to the throbbing electronic beats, his head moving with precise movement. The mixing console in front of him serves as the primary source of light. In the distance, the cool white glow of several stadium floodlights casts a deep, dark haze across the vast field, casting long shadows across the emerald green grass, creating a stark contrast to the brightly lit area surrounding the DJ booth. His hands danced swiftly and precisely across the equipment, one hand steadily pushing and pulling a long volume fader, while the fingers of the other nimbly jumped between the illuminated knobs and pads, sometimes decisively cutting a bass line, sometimes triggering an echo effect. The entire scene was filled with high-tech dynamics and the solitary creative passion. 
Against the backdrop of the vast and silent night stadium, it created an atmosphere of high focus, energy, and a slightly surreal feeling.``` |  📋 Show input prompt ```A figure skater performs a rapid, graceful Biellmann spin, captured from all angles.```  📋 Show rewrite prompt ```The video captures a figure skater performing a Biellmann spin on ice. The subject is a female skater in a glittering costume. Initially, she spins on one leg. Then, she reaches back and pulls her free leg up. Next, she spins rapidly, becoming a blur of motion, with ice shavings spraying from her skate blade. The background is an ice rink with blurred advertising boards. The camera circles around the subject to capture the spin from all angles. The lighting is spotlit, creating lens flares and sparkles on her costume. The overall video presents a graceful artistic sports style.``` |\\n|Cinematic Aesthetics|  📋 Show input prompt ```固定镜头,焦点在图片里的挂钟上，镜头轻微摇晃营造手持摄影感，\\u200bwjw,filmphotos,Film Grain,Reversal film photography，Wong Kar-wai movies,cinematic photography, HK film style,neon lighting, in the style of Wong Kar Wai film```  📋 Show rewrite prompt ```Handheld lens shooting, the camera focuses on the wall clock hanging on the green-toned wall, shaking slightly. The second hand sweeps steadily across the clock face, and the shadow of the clock cast on the wall shifts subtly with the movement of the lens.``` |  📋 Show input prompt ```The leaves of calamus shine in the sunlight, dotted with dewdrops that trickle down to the ground with the breeze.```  📋 Show rewrite prompt ```A macro shot focuses on long, slender calamus leaves, rendered in a cinematic photography realistic style. The main leaf, a vibrant, deep green, is positioned diagonally across the frame. Its surface is covered in tiny, glistening spherical dewdrops that catch and refract the bright morning sunlight, creating sparkling highlights. 
Initially, a larger, perfectly round dewdrop clings to the upper section of the leaf, its surface tension holding it in place. Then, as the leaf sways almost imperceptibly, the dewdrop begins to slowly dislodge. Next, it starts to trickle down the central vein of the leaf, its shape elongating slightly as it moves, leaving a subtle, glistening wet trail in its path. Finally, it reaches the pointed tip of the leaf, hangs for a brief moment, and falls out of the bottom of the frame. In the background, other leaves and blades of grass are softly blurred, creating a beautiful bokeh effect with soft, out-of-focus circles of light. The environment is bathed in the warm, golden glow of early morning sunlight, which streams in from behind the leaves, backlighting them and causing their wet edges to shine brilliantly. The overall impression is one of serene, natural beauty, captured in a highly realistic and detailed manner. This is a macro shot. The camera tilts down very slowly, following the path of the main dewdrop as it travels down the leaf. The lighting is soft and natural, with strong backlighting to create a radiant, glowing effect on the dewdrops and leaf edges, characteristic of professional nature photography. The atmosphere is peaceful and serene. The overall video presents a cinematic photography realistic style.``` |\\n|Text Rendering|  📋 Show input prompt ```赛博朋克风格的夜晚街角，一个巨大的招牌上， “Hunyuan Video 1.5”的霓虹灯管轮廓已经安装好。镜头推进，霓虹灯从“H”开始，伴随着‘滋滋’的电流声，每个字母依次亮起粉紫色的光芒，直到全部点亮，照亮了潮湿的街道。赛博朋克，城市美学```  📋 Show rewrite prompt ```On a wet street corner in a cyberpunk city at night, a large neon sign reading \"Hunyuan Video 1.5\" lights up sequentially, illuminating the dark, rainy environment with a pinkish-purple glow. he scene is a dark, rain-slicked street corner in a futuristic, cinematic cyberpunk city. Mounted on the metallic, weathered facade of a building is a massive, unlit neon sign. The sign\\'s glass tube framework clearly spells out the words \"Hunyuan Video 1.5\". 
Initially, the street is dimly lit, with ambient light from distant skyscrapers creating shimmering reflections on the wet asphalt below. Then, the camera zooms in slowly toward the sign. As it moves, a low electrical sizzling sound begins. In the background, the dense urban landscape of the cyberpunk metropolis is visible through a light atmospheric haze, with towering structures adorned with their own flickering advertisements. A complex web of cables and pipes crisscrosses between the buildings. The shot is at a low angle, looking up at the sign to emphasize its grand scale. The lighting is high-contrast and dramatic, dominated by the neon glow which creates sharp, specular reflections and deep shadows. The atmosphere is moody and tech-noir. The overall video presents a cinematic photography realistic style.,``` |  📋 Show input prompt ```一张铺开的中国宣纸上，浓墨滴入水中，晕染出壮丽的山水画轮廓。山峰、云雾、孤舟在墨色中自然形成。随后，这些水墨元素巧妙地流动、重组，在画面的留白处汇聚成\"Hunyuan Video 1.5\"的书法字体。优雅，诗意，文化底蕴```  📋 Show rewrite prompt ```A drop of black ink blooms on wet Chinese Xuan paper, forming a landscape painting before the ink elements fluidly reassemble into the calligraphic text \"Hunyuan Video 1.5\". On a flat, laid-out sheet of off-white Chinese Xuan paper with a subtle, fibrous texture, the scene unfolds. Initially, a single, concentrated drop of deep black ink falls into a clear, wet area at the center of the paper. Then, the ink instantly begins to bloom outwards in intricate, flowing tendrils of varying shades from jet-black to smoky grey. As it spreads, the ink wash naturally and rapidly forms the silhouette of a majestic mountain range with sharp, defined peaks. Next, softer, diluted grey tones billow around the mountains, creating layers of atmospheric mist and clouds, while a simple, dark stroke materializes as a lone boat on a tranquil, watery expanse at the base. 
As the landscape is formed, the ink elements—the lines of the mountains, wisps of cloud, and the shape of the boat—begin to deconstruct, dissolving into flowing streams of liquid ink. Finally, these streams move gracefully across the paper\\'s empty white space, converging and elegantly reorganizing to form the text \"Hunyuan Video 1.5\" in a fluid, semi-cursive calligraphic style. The background is the minimalist expanse of the Xuan paper itself, its texture providing a subtle depth. The entire process is lit by soft, even, diffused light from above, which enhances the rich tonal variations of the ink and the delicate texture of the paper without creating harsh shadows. Bird\\'s-eye view. The camera is positioned directly above the subject, capturing the entire process. The camera remains static. The aesthetic is a high-quality, dynamic Chinese ink wash animation style, perfectly simulating the real-world physics of ink spreading on wet paper. The entire sheet of paper and the final text are kept fully within the frame. Poetic, elegant, artistic. The overall video presents a dynamic Chinese ink wash animation style.``` |\\n|Physics Compliance|  📋 Show input prompt ```In a sleek museum gallery, a woman pauses before a gilded oil painting. The painted man inside slowly comes alive, lifting a bottle and pouring real wine straight from the canvas into her glass. Surrounded by stylish art critics moving naturally through the hall, she accepts the pour with calm elegance, as if the impossible were routine. ```  📋 Show rewrite prompt ```In a sleek museum gallery, a woman receives a glass of wine poured directly from an animated oil painting. A sophisticated woman with dark hair tied back elegantly stands in the mid-ground. She is wearing a simple, black silk sleeveless dress and holds a clear, crystal wine glass in her right hand. She is positioned before a large, baroque-style oil painting in an ornate, gilded frame. 
Inside the painting, an aristocratic man with a mustache, dressed in a dark velvet doublet with a white lace collar, is depicted. His form is defined by visible, impasto oil brushstrokes. Initially, the woman watches the painting with calm poise. Then, the painted man\\'s arm slowly animates, his painted texture retained as he lifts a dark bottle. Next, a photorealistic stream of red wine emerges directly from the flat canvas surface, arcing through the air and splashing gently into the real crystal glass she holds. She remains perfectly still, accepting the impossible pour with a subtle, knowing smile. The setting is a modern art gallery with high white walls and polished dark concrete floors that reflect the ambient light. Focused track lighting from the high ceiling casts a warm, dramatic spotlight on the woman and the painting, creating soft shadows. In the background, two other gallery patrons, a man and a woman in stylish, modern attire, stroll slowly from right to left, their figures slightly blurred by a shallow depth of field, moving naturally through the hall. The shot is at an eye-level angle with the woman. The camera remains static, capturing the surreal event in a steady medium shot. The lighting is high-contrast and dramatic, reminiscent of a cinematic photography realistic style, using soft side lighting to accentuate the woman\\'s features and the texture of the painting. The mood is surreal, elegant, and mysterious. The overall video presents a cinematic photography realistic style.``` |  📋 Show input prompt ```An intact soda can is slowly crushed by a hand.```  📋 Show rewrite prompt ```In a medium close-up, a hand slowly crushes an intact red and white soda can on a wooden table. A male hand with visible, realistic skin texture is wrapped firmly around the middle of an intact, pristine red and white aluminum soda can. The can, covered in glistening condensation droplets, rests on a dark, polished wooden surface. 
The cinematic realism captures every minute detail of the scene. Initially, the hand\\'s grip is steady, with the can\\'s cylindrical shape perfectly preserved. Then, the fingers begin to tighten slowly, the knuckles whitening slightly from the exertion. Next, the smooth aluminum surface starts to buckle under the controlled pressure, a sharp crease forming vertically down its side as the metallic sheen distorts. As the hand continues its deliberate squeeze, the can collapses inward progressively, the vibrant red paint wrinkling as the metal structure crumples. Finally, the can is left significantly crushed, its form now an irregular, crumpled shape held tightly in the fist. The scene takes place on a dark, polished wooden tabletop that catches soft, diffuse reflections. The grain of the wood is faintly discernible, adding a layer of texture to the foreground. The background is completely out of focus, rendered as a soft, dark, and non-descript blur, which isolates the main action and enhances the photorealistic quality of the shot. The shot is a medium close-up, presented in a cinematic photography realistic style. The camera remains static at a slightly high angle, looking down to provide a clear and unobstructed view of the can\\'s deformation. Soft side lighting creates high contrast, sculpting the muscles and tendons of the hand while casting specular highlights on the metallic can and the water droplets. The atmosphere is focused and intense. The overall video presents a cinematic photography realistic style.``` |\\n|Camera Movement|  📋 Show input prompt ```圣诞节的家中，小女孩靠着妈妈听妈妈读书，背景是下着雪的窗外，镜头缓慢下移，一只可爱的长毛小白猫戴着圣诞帽趴在温暖的地摊上```  📋 Show rewrite prompt ```In a cozy home on Christmas, a young girl leans against her mother as they read a book, and the camera moves down to reveal a fluffy white cat in a Santa hat resting on a warm rug. 
In a warmly lit living room on a snowy Christmas evening, a young mother and her little daughter are sitting together on a comfortable sofa. The mother, with a gentle expression and wearing a cream-colored knitted sweater, holds an open storybook with colorful illustrations. Her daughter, a small girl with brown hair in pigtails and a red pajama set, leans her head affectionately on her mother\\'s shoulder, her eyes fixed on the book. On the floor below them, a fluffy, long-haired white cat is curled up on a plush, beige wool rug. The cat wears a tiny red and white Santa hat perched between its ears. Initially, the shot focuses on the mother and daughter, capturing their quiet, shared moment. The mother’s finger gently rests on the page of the book. Then, the camera slowly moves downward, gliding past the book and their laps. Finally, the camera settles at a low angle, bringing the adorable white cat into sharp focus as the primary subject. The cat\\'s chest gently rises and falls with each breath, its eyes peacefully closed. Through a large window in the background, large, soft snowflakes can be seen falling silently against the dark blue twilight sky, creating a peaceful and serene backdrop. Faint, out-of-focus golden Christmas lights twinkle in the corner of the room, adding to the warm, festive atmosphere. The scene is imbued with a sense of comfort and holiday warmth, creating a beautiful cinematic photography realistic image. The camera slowly moves downward. The shot uses soft, warm interior lighting that casts gentle shadows, creating a high-contrast, cinematic look. A shallow depth of field keeps the focus on the subjects while beautifully blurring the background elements. The mood is heartwarming, peaceful, and festive. The overall video presents a cinematic photography realistic style.``` |  📋 Show input prompt ```The hiker begins walking forward along the trail, causing the water bottle to swing rhythmically with each step. 
The camera gradually pulls back and rises to reveal a vast desert landscape stretching out ahead.```  📋 Show rewrite prompt ```The hiker begins walking forward along the trail, causing the water bottle to swing rhythmically with each step. The camera gradually pulls back and rises to reveal a vast desert landscape stretching out ahead, while the sun position shifts from afternoon to dusk, casting increasingly longer shadows across the terrain as the figure becomes smaller in the frame.``` |\\n|Multi-Style Support|  📋 Show input prompt ```Have the cake man begin to take chunks out of himself and eat it.```  📋 Show rewrite prompt ```The cake man sits on the chair, with his hands resting on his knees. Then, he slowly raises his right hand and breaks off a piece of cake from his left shoulder. Next, he brings the piece of cake to his mouth and begins to chew. At the same time, his eyes widen slightly, and his mouth parts gently. After that, he raises his right hand again, breaks off another piece of cake from his right arm, and repeats the action of bringing it to his mouth to chew.``` |  📋 Show input prompt ```A little girl, carrying a colorful handbag, skips through the garden.  The video uses claymation style.```  📋 Show rewrite prompt ```A little girl with a colorful handbag skips through a whimsical claymation garden. In a vibrant garden constructed entirely from clay, a young girl, meticulously crafted in a claymation style, skips joyfully. She has chunky, sculpted yellow clay hair tied in pigtails that bounce with a slight stiffness, simple black button eyes, and a wide, permanently etched smile. She wears a simple pink clay dress with a white collar. In her left hand, she carries a small handbag molded from bright red and blue clay, which swings in a slightly jerky arc as she moves. Initially, the girl lifts her right leg high, her body momentarily suspended in a classic stop-motion pose. 
Then, she hops forward, landing lightly as her left leg swings through for the next skip. Her arms move in an exaggerated, back-and-forth rhythm, characteristic of stop-motion animation. Her movements are intentionally not perfectly fluid, highlighting the frame-by-frame nature of the claymation technique. The garden around her is a whimsical, textured world. In the foreground and mid-ground, oversized flowers with swirled purple and orange petals stand on thick green stems. The ground is a textured mat of green clay, showing subtle fingerprints and tool marks that add to the handmade charm. In the background, a pale blue clay backdrop features a simplified, smiling sun molded from yellow clay. The shot is at an eye-level angle with the main subject. The camera follows the subject, moving smoothly to the right to keep her in the frame. The lighting is bright and even, casting soft shadows that emphasize the rounded, three-dimensional forms of the clay models. The overall video presents a charming and detailed claymation style.``` |\\n|High Image-Video Consistency|   📋 Show input prompt ```女孩放下书，站起身，转身向屋内走去。镜头拉远。```  📋 Show rewrite prompt ```女孩合上手中的书，将书放在身侧的窗台上。随后，她缓缓站起身，转身向屋内走去，身影逐渐没入门后的阴影中。镜头缓缓拉远，露出更多被绿植覆盖的屋檐和墙体。``` |   📋 Show input prompt ```女人手上的鸟亲了女人一口```  📋 Show rewrite prompt ```女人手臂上的白色鹦鹉缓缓转过头，将喙轻轻触碰女人的脸颊，随后收回头部。女人嘴角微微上扬，目光温柔地注视着鹦鹉。背景中的绿植保持静止。``` |\\n\\n\\n\\n\\n## 📚 Citation\\n\\n```bibtex\\n@misc{hunyuanvideo2025,\\n      title={HunyuanVideo 1.5 Technical Report}, \\n      author={Tencent Hunyuan Foundation Model Team},\\n      year={2025},\\n      eprint={2511.18870},\\n      archivePrefix={arXiv},\\n      primaryClass={cs.CV},\\n      url={ \\n}\\n```\\n\\n## 🙏 Acknowledgements\\nWe would like to thank the contributors to the ,  ,  and , for their open research and exploration.\\n\\n## 🌟 Github Star History\\n\\n\\n \\n   \\n   \\n   \\n \\n\\n',\n",
       "  'domain': 'text-to-video'},\n",
       " {'model_id': 'fixie-ai/ultravox-v0_5-llama-3_2-1b',\n",
       "  'created_at': '2025-02-06T22:48:54+00:00',\n",
       "  'downloads': 357185,\n",
       "  'likes': 68,\n",
       "  'author': None,\n",
       "  'tags': ['transformers',\n",
       "   'safetensors',\n",
       "   'ultravox',\n",
       "   'feature-extraction',\n",
       "   'audio-text-to-text',\n",
       "   'custom_code',\n",
       "   'ar',\n",
       "   'be',\n",
       "   'bg',\n",
       "   'bn',\n",
       "   'cs',\n",
       "   'cy',\n",
       "   'da',\n",
       "   'de',\n",
       "   'el',\n",
       "   'en',\n",
       "   'es',\n",
       "   'et',\n",
       "   'fa',\n",
       "   'fi',\n",
       "   'fr',\n",
       "   'gl',\n",
       "   'hi',\n",
       "   'hu',\n",
       "   'it',\n",
       "   'ja',\n",
       "   'ka',\n",
       "   'lt',\n",
       "   'lv',\n",
       "   'mk',\n",
       "   'mr',\n",
       "   'nl',\n",
       "   'pl',\n",
       "   'pt',\n",
       "   'ro',\n",
       "   'ru',\n",
       "   'sk',\n",
       "   'sl',\n",
       "   'sr',\n",
       "   'sv',\n",
       "   'sw',\n",
       "   'ta',\n",
       "   'th',\n",
       "   'tr',\n",
       "   'uk',\n",
       "   'ur',\n",
       "   'vi',\n",
       "   'zh',\n",
       "   'license:mit',\n",
       "   'region:us'],\n",
       "  'modelcard': '\\n# Model Card for Ultravox\\n\\nUltravox is a multimodal Speech LLM built around a pretrained  and  backbone.\\n\\nSee  for the GitHub repo and more information.\\n\\n\\n## Model Details\\n\\n### Model Description\\n\\nUltravox is a multimodal model that can consume both speech and text as input (e.g., a text system prompt and voice user message). \\nThe input to the model is given as a text prompt with a special `` pseudo-token, and the model processor will replace this magic token with embeddings derived from the input audio.\\nUsing the merged embeddings as input, the model will then generate output text as usual. \\n\\nIn a future revision of Ultravox, we plan to expand the token vocabulary to support generation of semantic and acoustic audio tokens, which can then be fed to a vocoder to produce voice output.\\nNo preference tuning has been applied to this revision of the model.\\n\\n- **Developed by:** Fixie.ai\\n- **License:** MIT\\n\\n### Model Sources\\n\\n- **Repository:** \\n- **Demo:** See repo\\n\\n## Usage\\n\\nThink of the model as an LLM that can also hear and understand speech. As such, it can be used as a voice agent, and also to do speech-to-speech translation, analysis of spoken audio, etc.\\n\\nTo use the model, try the following:\\n```python\\n# pip install transformers peft librosa\\n\\nimport transformers\\nimport numpy as np\\nimport librosa\\n\\npipe = transformers.pipeline(model=\\'fixie-ai/ultravox-v0_5-llama-3_2-1b\\', trust_remote_code=True)\\n\\npath = \"\"  # TODO: pass the audio here\\naudio, sr = librosa.load(path, sr=16000)\\n\\n\\nturns = [\\n  {\\n    \"role\": \"system\",\\n    \"content\": \"You are a friendly and helpful character. 
You love to answer questions for people.\"\\n  },\\n]\\npipe({\\'audio\\': audio, \\'turns\\': turns, \\'sampling_rate\\': sr}, max_new_tokens=30)\\n```\\n\\n\\n## Training Details\\n\\nThe model uses a pre-trained  backbone as well as the encoder part of .\\n\\nThe multi-modal adapter is trained, the Whisper encoder is fine-tuned, while the Llama model is kept frozen.\\n\\nWe use a knowledge-distillation loss where Ultravox is trying to match the logits of the text-based Llama backbone.\\n\\n### Training Data\\n\\nThe training dataset is a mix of ASR datasets, extended with continuations generated by Llama 3.1 8B, and speech translation datasets, which yield a modest improvement in translation evaluations.\\n\\n### Training Procedure\\n\\nSupervised speech instruction finetuning via knowledge-distillation. For more info, see .\\n\\n\\n#### Training Hyperparameters\\n\\n- **Training regime:** BF16 mixed precision training\\n- **Hardward used:** 8x H100 GPUs\\n\\n#### Speeds, Sizes, Times\\n\\nCheck out the audio tab on  for daily benchmarks and a comparison with other existing models.\\n\\n## Evaluation\\n\\n|  | **Ultravox 0.5 1b**| Ultravox 0.5 8B | Ultravox 0.5 70B |\\n| --- |  ---: | ---: | ---: | \\n| **covost2 en_ar** | 1.55 | 12.99 | 20.21 |\\n| **covost2 en_ca** | 8.06 | 31.54 | 40.01 |\\n| **covost2 en_de** | 14.21 | 28.70 | 34.53 |\\n| **covost2 es_en** | 24.97 | 40.19 | 43.29 |\\n| **covost2 ru_en** | 24.12 | 42.13 | 48.99 |\\n| **covost2 zh_en** | 4.76 | 17.22 | 21.37 |\\n| **big bench audio**| 39.14 | 66.54 | 82.70 |\\n',\n",
       "  'domain': 'audio-text-to-text'},\n",
       " {'model_id': 'google/gemma-3n-E2B-it',\n",
       "  'created_at': '2025-06-12T17:44:07+00:00',\n",
       "  'downloads': 280090,\n",
       "  'likes': 261,\n",
       "  'author': None,\n",
       "  'tags': ['transformers',\n",
       "   'safetensors',\n",
       "   'gemma3n',\n",
       "   'image-to-text',\n",
       "   'automatic-speech-recognition',\n",
       "   'automatic-speech-translation',\n",
       "   'audio-text-to-text',\n",
       "   'video-text-to-text',\n",
       "   'image-text-to-text',\n",
       "   'conversational',\n",
       "   'arxiv:1905.07830',\n",
       "   'arxiv:1905.10044',\n",
       "   'arxiv:1911.11641',\n",
       "   'arxiv:1904.09728',\n",
       "   'arxiv:1705.03551',\n",
       "   'arxiv:1911.01547',\n",
       "   'arxiv:1907.10641',\n",
       "   'arxiv:1903.00161',\n",
       "   'arxiv:2210.03057',\n",
       "   'arxiv:2502.12404',\n",
       "   'arxiv:2411.19799',\n",
       "   'arxiv:2009.03300',\n",
       "   'arxiv:2502.21228',\n",
       "   'arxiv:2311.12022',\n",
       "   'arxiv:2403.07974',\n",
       "   'arxiv:2108.07732',\n",
       "   'arxiv:2107.03374',\n",
       "   'base_model:google/gemma-3n-E4B-it',\n",
       "   'base_model:finetune:google/gemma-3n-E4B-it',\n",
       "   'license:gemma',\n",
       "   'endpoints_compatible',\n",
       "   'region:us'],\n",
       "  'modelcard': '\\n> [!Note]\\n> This repository corresponds to the launch version of Gemma 3n E2B IT (Instruct), to be used with Hugging Face `transformers`,\\n> supporting text, audio, and vision (image and video) inputs.\\n> \\n> Gemma 3n models have multiple architecture innovations:\\n>  * They are available in two sizes based on . While the raw parameter count of this model is 6B, the architecture design allows the model to be run with a memory footprint comparable to a traditional 2B model by offloading low-utilization matrices from the accelerator.\\n>  * They use a MatFormer architecture that allows nesting sub-models within the . We provide one sub-model (this model repository), or you can access a spectrum of custom-sized models using the .\\n>\\n> Learn more about these techniques in the \\n> and the . \\n\\n\\n\\n# Gemma 3n model card\\n\\n**Model Page**: \\n\\n**Resources and Technical Documentation**:\\n\\n-   \\n-   \\n-   \\n-   \\n\\n**Terms of Use**: \\\\\\n**Authors**: Google DeepMind\\n\\n## Model Information\\n\\nSummary description and brief definition of inputs and outputs.\\n\\n### Description\\n\\nGemma is a family of lightweight, state-of-the-art open models from Google,\\nbuilt from the same research and technology used to create the Gemini models.\\nGemma 3n models are designed for efficient execution on low-resource devices.\\nThey are capable of multimodal input, handling text, image, video, and audio\\ninput, and generating text outputs, with open weights for pre-trained and\\ninstruction-tuned variants. These models were trained with data in over 140\\nspoken languages.\\n\\nGemma 3n models use selective parameter activation technology to reduce resource\\nrequirements. This technique allows the models to operate at an effective size\\nof 2B and 4B parameters, which is lower than the total number of parameters they\\ncontain. 
For more information on Gemma 3n\\'s efficient parameter management\\ntechnology, see the\\n\\npage.\\n\\n### Inputs and outputs\\n\\n-   **Input:**\\n    -   Text string, such as a question, a prompt, or a document to be\\n        summarized\\n    -   Images, normalized to 256x256, 512x512, or 768x768 resolution\\n        and encoded to 256 tokens each\\n    -   Audio data encoded to 6.25 tokens per second from a single channel\\n    -   Total input context of 32K tokens\\n-   **Output:**\\n    -   Generated text in response to the input, such as an answer to a\\n        question, analysis of image content, or a summary of a document\\n    -   Total output length up to 32K tokens, subtracting the request\\n        input tokens\\n\\n### Usage\\n\\nBelow, there are some code snippets on how to get quickly started with running\\nthe model. First, install the Transformers library. Gemma 3n is supported\\nstarting from transformers 4.53.0.\\n\\n```sh\\n$ pip install -U transformers\\n```\\n\\nThen, copy the snippet from the section that is relevant for your use case.\\n\\n#### Running with the `pipeline` API\\n\\nYou can initialize the model and processor for inference with `pipeline` as\\nfollows.\\n\\n```python\\nfrom transformers import pipeline\\nimport torch\\n\\npipe = pipeline(\\n    \"image-text-to-text\",\\n    model=\"google/gemma-3n-e2b-it\",\\n    device=\"cuda\",\\n    torch_dtype=torch.bfloat16,\\n)\\n```\\n\\nWith instruction-tuned models, you need to use chat templates to process our\\ninputs first. 
Then, you can pass it to the pipeline.\\n\\n```python\\nmessages = [\\n    {\\n        \"role\": \"system\",\\n        \"content\": [{\"type\": \"text\", \"text\": \"You are a helpful assistant.\"}]\\n    },\\n    {\\n        \"role\": \"user\",\\n        \"content\": [\\n            {\"type\": \"image\", \"url\": \"\\n            {\"type\": \"text\", \"text\": \"What animal is on the candy?\"}\\n        ]\\n    }\\n]\\n\\noutput = pipe(text=messages, max_new_tokens=200)\\nprint(output[0][\"generated_text\"][-1][\"content\"])\\n# Okay, let\\'s take a look!\\n# Based on the image, the animal on the candy is a **turtle**.\\n# You can see the shell shape and the head and legs.\\n```\\n\\n#### Running the model on a single GPU\\n\\n```python\\nfrom transformers import AutoProcessor, Gemma3nForConditionalGeneration\\nfrom PIL import Image\\nimport requests\\nimport torch\\n\\nmodel_id = \"google/gemma-3n-e2b-it\"\\n\\nmodel = Gemma3nForConditionalGeneration.from_pretrained(model_id, device=\"cuda\", torch_dtype=torch.bfloat16,).eval()\\n\\nprocessor = AutoProcessor.from_pretrained(model_id)\\n\\nmessages = [\\n    {\\n        \"role\": \"system\",\\n        \"content\": [{\"type\": \"text\", \"text\": \"You are a helpful assistant.\"}]\\n    },\\n    {\\n        \"role\": \"user\",\\n        \"content\": [\\n            {\"type\": \"image\", \"image\": \"\\n            {\"type\": \"text\", \"text\": \"Describe this image in detail.\"}\\n        ]\\n    }\\n]\\n\\ninputs = processor.apply_chat_template(\\n    messages,\\n    add_generation_prompt=True,\\n    tokenize=True,\\n    return_dict=True,\\n    return_tensors=\"pt\",\\n).to(model.device, dtype=torch.bfloat16)\\n\\ninput_len = inputs[\"input_ids\"].shape[-1]\\n\\nwith torch.inference_mode():\\n    generation = model.generate(**inputs, max_new_tokens=100, do_sample=False)\\n    generation = generation[0][input_len:]\\n\\ndecoded = processor.decode(generation, skip_special_tokens=True)\\nprint(decoded)\\n\\n# 
**Overall Impression:** The image is a close-up shot of a vibrant garden scene,\\n# focusing on a cluster of pink cosmos flowers and a busy bumblebee.\\n# It has a slightly soft, natural feel, likely captured in daylight.\\n```\\n\\n### Citation\\n\\n```\\n@article{gemma_3n_2025,\\n    title={Gemma 3n},\\n    url={\\n    publisher={Google DeepMind},\\n    author={Gemma Team},\\n    year={2025}\\n}\\n```\\n\\n## Model Data\\n\\nData used for model training and how the data was processed.\\n\\n### Training Dataset\\n\\nThese models were trained on a dataset that includes a wide variety of sources\\ntotalling approximately 11 trillion tokens. The knowledge cutoff date for the\\ntraining data was June 2024. Here are the key components:\\n\\n-   **Web Documents**: A diverse collection of web text ensures the model\\n    is exposed to a broad range of linguistic styles, topics, and vocabulary.\\n    The training dataset includes content in over 140 languages.\\n-   **Code**: Exposing the model to code helps it to learn the syntax and\\n    patterns of programming languages, which improves its ability to generate\\n    code and understand code-related questions.\\n-   **Mathematics**: Training on mathematical text helps the model learn\\n    logical reasoning, symbolic representation, and to address mathematical queries.\\n-   **Images**: A wide range of images enables the model to perform image\\n    analysis and visual data extraction tasks.\\n-   Audio: A diverse set of sound samples enables the model to recognize\\n    speech, transcribe text from recordings, and identify information in audio data.\\n\\nThe combination of these diverse data sources is crucial for training a\\npowerful multimodal model that can handle a wide variety of different tasks and\\ndata formats.\\n\\n### Data Preprocessing\\n\\nHere are the key data cleaning and filtering methods applied to the training\\ndata:\\n\\n-   **CSAM Filtering**: Rigorous CSAM (Child Sexual Abuse Material)\\n    
filtering was applied at multiple stages in the data preparation process to\\n    ensure the exclusion of harmful and illegal content.\\n-   **Sensitive Data Filtering**: As part of making Gemma pre-trained models\\n    safe and reliable, automated techniques were used to filter out certain\\n    personal information and other sensitive data from training sets.\\n-   **Additional methods**: Filtering based on content quality and safety in\\n    line with\\n    .\\n\\n## Implementation Information\\n\\nDetails about the model internals.\\n\\n### Hardware\\n\\nGemma was trained using  hardware (TPUv4p, TPUv5p\\nand TPUv5e). Training generative models requires significant computational\\npower. TPUs, designed specifically for matrix operations common in machine\\nlearning, offer several advantages in this domain:\\n\\n-   **Performance**: TPUs are specifically designed to handle the massive\\n    computations involved in training generative models. They can speed up\\n    training considerably compared to CPUs.\\n-   **Memory**: TPUs often come with large amounts of high-bandwidth memory,\\n    allowing for the handling of large models and batch sizes during training.\\n    This can lead to better model quality.\\n-   **Scalability**: TPU Pods (large clusters of TPUs) provide a scalable\\n    solution for handling the growing complexity of large foundation models.\\n    You can distribute training across multiple TPU devices for faster and more\\n    efficient processing.\\n-   **Cost-effectiveness**: In many scenarios, TPUs can provide a more\\n    cost-effective solution for training large models compared to CPU-based\\n    infrastructure, especially when considering the time and resources saved\\n    due to faster training.\\n\\nThese advantages are aligned with\\n.\\n\\n### Software\\n\\nTraining was done using  and\\n.\\nJAX allows researchers to take advantage of the latest generation of hardware,\\nincluding TPUs, for faster and more efficient training of large 
models. ML\\nPathways is Google\\'s latest effort to build artificially intelligent systems\\ncapable of generalizing across multiple tasks. This is specially suitable for\\nfoundation models, including large language models like these ones.\\n\\nTogether, JAX and ML Pathways are used as described in the\\n:\\n*\"the \\'single controller\\' programming model of Jax and Pathways allows a single\\nPython process to orchestrate the entire training run, dramatically simplifying\\nthe development workflow.\"*\\n\\n## Evaluation\\n\\nModel evaluation metrics and results.\\n\\n### Benchmark Results\\n\\nThese models were evaluated at full precision (float32) against a large\\ncollection of different datasets and metrics to cover different aspects of\\ncontent generation. Evaluation results marked with **IT** are for\\ninstruction-tuned models. Evaluation results marked with **PT** are for\\npre-trained models.\\n\\n#### Reasoning and factuality\\n\\n| Benchmark                      | Metric         | n-shot   |  E2B PT  |  E4B PT  |\\n| ------------------------------ |----------------|----------|:--------:|:--------:|\\n| [HellaSwag][hellaswag]         | Accuracy       | 10-shot  |   72.2   |   78.6   |\\n| [BoolQ][boolq]                 | Accuracy       | 0-shot   |   76.4   |   81.6   |\\n| [PIQA][piqa]                   | Accuracy       | 0-shot   |   78.9   |   81.0   |\\n| [SocialIQA][socialiqa]         | Accuracy       | 0-shot   |   48.8   |   50.0   |\\n| [TriviaQA][triviaqa]           | Accuracy       | 5-shot   |   60.8   |   70.2   |\\n| [Natural Questions][naturalq]  | Accuracy       | 5-shot   |   15.5   |   20.9   |\\n| [ARC-c][arc]                   | Accuracy       | 25-shot  |   51.7   |   61.6   |\\n| [ARC-e][arc]                   | Accuracy       | 0-shot   |   75.8   |   81.6   |\\n| [WinoGrande][winogrande]       | Accuracy       | 5-shot   |   66.8   |   71.7   |\\n| [BIG-Bench Hard][bbh]          | Accuracy       | few-shot |   44.3   |   52.9   
|\\n| [DROP][drop]                   | Token F1 score | 1-shot   |   53.9   |   60.8   |\\n\\n[hellaswag]: \\n[boolq]: \\n[piqa]: \\n[socialiqa]: \\n[triviaqa]: \\n[naturalq]: \\n[arc]: \\n[winogrande]: \\n[bbh]: \\n[drop]: \\n\\n#### Multilingual\\n\\n| Benchmark                           | Metric                  | n-shot   |  E2B IT  |  E4B IT  |\\n| ------------------------------------|-------------------------|----------|:--------:|:--------:|\\n| [MGSM][mgsm]                        | Accuracy                |  0-shot  |   53.1   |   60.7   |\\n| [WMT24++][wmt24pp] (ChrF)           | Character-level F-score |  0-shot  |   42.7   |   50.1   |\\n| [Include][include]                  | Accuracy                |  0-shot  |   38.6   |   57.2   |\\n| [MMLU][mmlu] (ProX)                 | Accuracy                |  0-shot  |    8.1   |   19.9   |\\n| [OpenAI MMLU][openai-mmlu]          | Accuracy                |  0-shot  |   22.3   |   35.6   |\\n| [Global-MMLU][global-mmlu]          | Accuracy                |  0-shot  |   55.1   |   60.3   |\\n| [ECLeKTic][eclektic]                | ECLeKTic score          |  0-shot  |    2.5   |    1.9   |\\n\\n[mgsm]: \\n[wmt24pp]: \\n[include]:\\n[mmlu]: \\n[openai-mmlu]: \\n[global-mmlu]: \\n[eclektic]: \\n\\n#### STEM and code\\n\\n| Benchmark                           | Metric                   | n-shot   |  E2B IT  |  E4B IT  |\\n| ------------------------------------|--------------------------|----------|:--------:|:--------:|\\n| [GPQA][gpqa] Diamond                | RelaxedAccuracy/accuracy |  0-shot  |   24.8   |   23.7   |\\n| [LiveCodeBench][lcb] v5             | pass@1                   |  0-shot  |   18.6   |   25.7   |\\n| Codegolf v2.2                       | pass@1                   |  0-shot  |   11.0   |   16.8   |\\n| [AIME 2025][aime-2025]              | Accuracy                 |  0-shot  |    6.7   |   11.6   |\\n\\n[gpqa]: \\n[lcb]: \\n[aime-2025]: \\n\\n#### Additional benchmarks\\n\\n| Benchmark          
                  | Metric     | n-shot   |  E2B IT  |  E4B IT  |\\n| ------------------------------------ |------------|----------|:--------:|:--------:|\\n| [MMLU][mmlu]                         |  Accuracy  |  0-shot  |   60.1   |   64.9   |\\n| [MBPP][mbpp]                         |  pass@1    |  3-shot  |   56.6   |   63.6   |\\n| [HumanEval][humaneval]               |  pass@1    |  0-shot  |   66.5   |   75.0   |\\n| [LiveCodeBench][lcb]                 |  pass@1    |  0-shot  |   13.2   |   13.2   |\\n| HiddenMath                           |  Accuracy  |  0-shot  |   27.7   |   37.7   |\\n| [Global-MMLU-Lite][global-mmlu-lite] |  Accuracy  |  0-shot  |   59.0   |   64.5   |\\n| [MMLU][mmlu] (Pro)                   |  Accuracy  |  0-shot  |   40.5   |   50.6   |\\n\\n[gpqa]: \\n[mbpp]: \\n[humaneval]: \\n[lcb]: \\n[global-mmlu-lite]: \\n\\n## Ethics and Safety\\n\\nEthics and safety evaluation approach and results.\\n\\n### Evaluation Approach\\n\\nOur evaluation methods include structured evaluations and internal red-teaming\\ntesting of relevant content policies. Red-teaming was conducted by a number of\\ndifferent teams, each with different goals and human evaluation metrics. 
These\\nmodels were evaluated against a number of different categories relevant to\\nethics and safety, including:\\n\\n-   **Child Safety**: Evaluation of text-to-text and image to text prompts\\n    covering child safety policies, including child sexual abuse and\\n    exploitation.\\n-   **Content Safety:** Evaluation of text-to-text and image to text prompts\\n    covering safety policies including, harassment, violence and gore, and hate\\n    speech.\\n-   **Representational Harms**: Evaluation of text-to-text and image to text\\n    prompts covering safety policies including bias, stereotyping, and harmful\\n    associations or inaccuracies.\\n\\nIn addition to development level evaluations, we conduct \"assurance\\nevaluations\" which are our \\'arms-length\\' internal evaluations for responsibility\\ngovernance decision making. They are conducted separately from the model\\ndevelopment team, to inform decision making about release. High level findings\\nare fed back to the model team, but prompt sets are held-out to prevent\\noverfitting and preserve the results\\' ability to inform decision making. Notable\\nassurance evaluation results are reported to our Responsibility & Safety Council\\nas part of release review.\\n\\n### Evaluation Results\\n\\nFor all areas of safety testing, we saw safe levels of performance across the\\ncategories of child safety, content safety, and representational harms relative\\nto previous Gemma models. All testing was conducted without safety filters to\\nevaluate the model capabilities and behaviors. For text-to-text,  image-to-text,\\nand audio-to-text, and across all model sizes, the model produced minimal policy\\nviolations, and showed significant improvements over previous Gemma models\\'\\nperformance with respect to high severity violations. 
A limitation of our\\nevaluations was they included primarily English language prompts.\\n\\n## Usage and Limitations\\n\\nThese models have certain limitations that users should be aware of.\\n\\n### Intended Usage\\n\\nOpen generative models have a wide range of applications across various\\nindustries and domains. The following list of potential uses is not\\ncomprehensive. The purpose of this list is to provide contextual information\\nabout the possible use-cases that the model creators considered as part of model\\ntraining and development.\\n\\n-   Content Creation and Communication\\n    -   **Text Generation**: Generate creative text formats such as\\n        poems, scripts, code, marketing copy, and email drafts.\\n    -   **Chatbots and Conversational AI**: Power conversational\\n        interfaces for customer service, virtual assistants, or interactive\\n        applications.\\n    -   **Text Summarization**: Generate concise summaries of a text\\n        corpus, research papers, or reports.\\n    -   **Image Data Extraction**: Extract, interpret, and summarize\\n        visual data for text communications.\\n    -   **Audio Data Extraction**: Transcribe spoken language, translate speech\\n        to text in other languages, and analyze sound-based data.\\n-   Research and Education\\n    -   **Natural Language Processing (NLP) and generative model\\n        Research**: These models can serve as a foundation for researchers to\\n        experiment with generative models and NLP techniques, develop\\n        algorithms, and contribute to the advancement of the field.\\n    -   **Language Learning Tools**: Support interactive language\\n        learning experiences, aiding in grammar correction or providing writing\\n        practice.\\n    -   **Knowledge Exploration**: Assist researchers in exploring large\\n        bodies of data by generating summaries or answering questions about\\n        specific topics.\\n\\n### Limitations\\n\\n-   Training 
Data\\n    -   The quality and diversity of the training data significantly\\n        influence the model\\'s capabilities. Biases or gaps in the training data\\n        can lead to limitations in the model\\'s responses.\\n    -   The scope of the training dataset determines the subject areas\\n        the model can handle effectively.\\n-   Context and Task Complexity\\n    -   Models are better at tasks that can be framed with clear\\n        prompts and instructions. Open-ended or highly complex tasks might be\\n        challenging.\\n    -   A model\\'s performance can be influenced by the amount of context\\n        provided (longer context generally leads to better outputs, up to a\\n        certain point).\\n-   Language Ambiguity and Nuance\\n    -   Natural language is inherently complex. Models might struggle\\n        to grasp subtle nuances, sarcasm, or figurative language.\\n-   Factual Accuracy\\n    -   Models generate responses based on information they learned\\n        from their training datasets, but they are not knowledge bases. They\\n        may generate incorrect or outdated factual statements.\\n-   Common Sense\\n    -   Models rely on statistical patterns in language. They might\\n        lack the ability to apply common sense reasoning in certain situations.\\n\\n### Ethical Considerations and Risks\\n\\nThe development of generative models raises several ethical concerns. 
In\\ncreating an open model, we have carefully considered the following:\\n\\n-   Bias and Fairness\\n    -   Generative models trained on large-scale, real-world text and image data\\n        can reflect socio-cultural biases embedded in the training material.\\n        These models underwent careful scrutiny, input data pre-processing\\n        described and posterior evaluations reported in this card.\\n-   Misinformation and Misuse\\n    -   Generative models can be misused to generate text that is\\n        false, misleading, or harmful.\\n    -   Guidelines are provided for responsible use with the model, see the\\n        .\\n-   Transparency and Accountability:\\n    -   This model card summarizes details on the models\\' architecture,\\n        capabilities, limitations, and evaluation processes.\\n    -   A responsibly developed open model offers the opportunity to\\n        share innovation by making generative model technology accessible to\\n        developers and researchers across the AI ecosystem.\\n\\nRisks identified and mitigations:\\n\\n-   **Perpetuation of biases**: It\\'s encouraged to perform continuous monitoring\\n    (using evaluation metrics, human review) and the exploration of de-biasing\\n    techniques during model training, fine-tuning, and other use cases.\\n-   **Generation of harmful content**: Mechanisms and guidelines for content\\n    safety are essential. Developers are encouraged to exercise caution and\\n    implement appropriate content safety safeguards based on their specific\\n    product policies and application use cases.\\n-   **Misuse for malicious purposes**: Technical limitations and developer\\n    and end-user education can help mitigate against malicious applications of\\n    generative models. Educational resources and reporting mechanisms for users\\n    to flag misuse are provided. 
Prohibited uses of Gemma models are outlined\\n    in the\\n    .\\n-   **Privacy violations**: Models were trained on data filtered for removal of\\n    certain personal information and other sensitive data. Developers are\\n    encouraged to adhere to privacy regulations with privacy-preserving\\n    techniques.\\n\\n### Benefits\\n\\nAt the time of release, this family of models provides high-performance open\\ngenerative model implementations designed from the ground up for responsible AI\\ndevelopment compared to similarly sized models.\\n\\nUsing the benchmark evaluation metrics described in this document, these models\\nhave shown to provide superior performance to other, comparably-sized open model\\nalternatives.\\n',\n",
       "  'domain': 'video-text-to-text'},\n",
       " {'model_id': 'tencent/HY-MT1.5-7B',\n",
       "  'created_at': '2025-12-25T10:45:31+00:00',\n",
       "  'downloads': 259950,\n",
       "  'likes': 125,\n",
       "  'author': None,\n",
       "  'tags': ['transformers',\n",
       "   'safetensors',\n",
       "   'hunyuan_v1_dense',\n",
       "   'text-generation',\n",
       "   'translation',\n",
       "   'zh',\n",
       "   'en',\n",
       "   'fr',\n",
       "   'pt',\n",
       "   'es',\n",
       "   'ja',\n",
       "   'tr',\n",
       "   'ru',\n",
       "   'ar',\n",
       "   'ko',\n",
       "   'th',\n",
       "   'it',\n",
       "   'de',\n",
       "   'vi',\n",
       "   'ms',\n",
       "   'id',\n",
       "   'tl',\n",
       "   'hi',\n",
       "   'pl',\n",
       "   'cs',\n",
       "   'nl',\n",
       "   'km',\n",
       "   'my',\n",
       "   'fa',\n",
       "   'gu',\n",
       "   'ur',\n",
       "   'te',\n",
       "   'mr',\n",
       "   'he',\n",
       "   'bn',\n",
       "   'ta',\n",
       "   'uk',\n",
       "   'bo',\n",
       "   'kk',\n",
       "   'mn',\n",
       "   'ug',\n",
       "   'arxiv:2512.24092',\n",
       "   'endpoints_compatible',\n",
       "   'region:us'],\n",
       "  'modelcard': '\\n\\n\\n  \\n\\n\\n\\n\\n    🤗&nbsp;Hugging Face&nbsp;&nbsp;|&nbsp;&nbsp;\\n    🕹️&nbsp;Demo&nbsp;&nbsp;&nbsp;&nbsp;\\n    🤖&nbsp;ModelScope&nbsp;&nbsp;|&nbsp;&nbsp;\\n\\n\\n\\n    🖥️&nbsp;Official Website&nbsp;&nbsp;|&nbsp;&nbsp;\\n    Github\\n\\n\\n\\n## Model Introduction\\n\\nHunyuan Translation Model Version 1.5 includes a 1.8B translation model, HY-MT1.5-1.8B, and a 7B translation model, HY-MT1.5-7B. Both models focus on supporting mutual translation across 33 languages and incorporating 5 ethnic and dialect variations. Among them, HY-MT1.5-7B is an upgraded version of our WMT25 championship model, optimized for explanatory translation and mixed-language scenarios, with newly added support for terminology intervention, contextual translation, and formatted translation. Despite having less than one-third the parameters of HY-MT1.5-7B, HY-MT1.5-1.8B delivers translation performance comparable to its larger counterpart, achieving both high speed and high quality. 
After quantization, the 1.8B model can be deployed on edge devices and support real-time translation scenarios, making it widely applicable.\\n\\n## Key Features and Advantages\\n\\n- HY-MT1.5-1.8B achieves the industry-leading performance among models of the same size, surpassing most commercial translation APIs.\\n- HY-MT1.5-1.8B supports deployment on edge devices and real-time translation scenarios, offering broad applicability.\\n- HY-MT1.5-7B, compared to its September open-source version, has been optimized for annotated and mixed-language scenarios.\\n- Both models support terminology intervention, contextual translation, and formatted translation.\\n\\n## Related News\\n* 2025.12.30, we have open-sourced **HY-MT1.5-1.8B** and **HY-MT1.5-7B** on Hugging Face.\\n* 2025.9.1, we have open-sourced  **Hunyuan-MT-7B** , **Hunyuan-MT-Chimera-7B** on Hugging Face.\\n\\n\\n\\n## Performance\\n\\n\\n\\n\\nYou can refer to our technical report for more experimental results and analysis.\\n\\nTechnical Report \\n\\n&nbsp;\\n\\n## Model Links\\n| Model Name  | Description | Download |\\n| ----------- | ----------- |-----------\\n| HY-MT1.5-1.8B  | Hunyuan 1.8B translation model |🤗 |\\n| HY-MT1.5-1.8B-FP8 | Hunyuan 1.8B translation model, fp8 quant    | 🤗 |\\n| HY-MT1.5-1.8B-GPTQ-Int4 | Hunyuan 1.8B translation model, int4 quant    | 🤗 |\\n| HY-MT1.5-7B | Hunyuan 7B translation model    | 🤗 |\\n| HY-MT1.5-7B-FP8 | Hunyuan 7B translation model, fp8 quant     | 🤗 |\\n| HY-MT1.5-7B-GPTQ-Int4 | Hunyuan 7B translation model, int4 quant     | 🤗 |\\n\\n## Prompts\\n\\n### Prompt Template for ZHXX Translation.\\n---\\n```\\n将以下文本翻译为{target_language}，注意只需要输出翻译后的结果，不要额外解释：\\n\\n{source_text}\\n```\\n---\\n\\n### Prompt Template for XXXX Translation, excluding ZHXX.\\n---\\n```\\nTranslate the following segment into {target_language}, without additional explanation.\\n\\n{source_text}\\n```\\n---\\n\\n### Prompt Template for terminology 
intervention.\\n---\\n```\\n参考下面的翻译：\\n{source_term} 翻译成 {target_term}\\n\\n将以下文本翻译为{target_language}，注意只需要输出翻译后的结果，不要额外解释：\\n{source_text}\\n```\\n---\\n\\n### Prompt Template for contextual translation.\\n---\\n```\\n{context}\\n参考上面的信息，把下面的文本翻译成{target_language}，注意不需要翻译上文，也不要额外解释：\\n{source_text}\\n\\n```\\n---\\n\\n###  Prompt Template for formatted translation.\\n---\\n```\\n将以下之间的文本翻译为中文，注意只需要输出翻译后的结果，不要额外解释，原文中的标签表示标签内文本包含格式信息，需要在译文中相应的位置尽量保留该标签。输出格式为：str\\n\\n{src_text_with_format}\\n```\\n---\\n\\n&nbsp;\\n\\n### Use with transformers\\nFirst, please install transformers, recommends v4.56.0\\n```SHELL\\npip install transformers==4.56.0\\n```\\n\\n*!!! If you want to load fp8 model with transformers, you need to change the name\"ignored_layers\" in config.json to \"ignore\" and upgrade the compressed-tensors to compressed-tensors-0.11.0.*\\n\\nThe following code snippet shows how to use the transformers library to load and apply the model.\\n\\nwe use tencent/HY-MT1.5-1.8B for example\\n\\n```python\\nfrom transformers import AutoModelForCausalLM, AutoTokenizer\\nimport os\\n\\nmodel_name_or_path = \"tencent/HY-MT1.5-1.8B\"\\n\\ntokenizer = AutoTokenizer.from_pretrained(model_name_or_path)\\nmodel = AutoModelForCausalLM.from_pretrained(model_name_or_path, device_map=\"auto\")  # You may want to use bfloat16 and/or move to GPU here\\nmessages = [\\n    {\"role\": \"user\", \"content\": \"Translate the following segment into Chinese, without additional explanation.\\\\n\\\\nIt’s on the house.\"},\\n]\\ntokenized_chat = tokenizer.apply_chat_template(\\n    messages,\\n    tokenize=True,\\n    add_generation_prompt=False,\\n    return_tensors=\"pt\"\\n)\\n\\noutputs = model.generate(tokenized_chat.to(model.device), max_new_tokens=2048)\\noutput_text = tokenizer.decode(outputs[0])\\n```\\n\\nWe recommend using the following set of parameters for inference. 
Note that our model does not have the default system_prompt.\\n\\n```json\\n{\\n  \"top_k\": 20,\\n  \"top_p\": 0.6,\\n  \"repetition_penalty\": 1.05,\\n  \"temperature\": 0.7\\n}\\n```\\n\\n&nbsp;\\n\\nSupported languages:\\n| Languages         | Abbr.   | Chinese Names   |\\n|-------------------|---------|-----------------|\\n| Chinese           | zh      | 中文            |\\n| English           | en      | 英语            |\\n| French            | fr      | 法语            |\\n| Portuguese        | pt      | 葡萄牙语        |\\n| Spanish           | es      | 西班牙语        |\\n| Japanese          | ja      | 日语            |\\n| Turkish           | tr      | 土耳其语        |\\n| Russian           | ru      | 俄语            |\\n| Arabic            | ar      | 阿拉伯语        |\\n| Korean            | ko      | 韩语            |\\n| Thai              | th      | 泰语            |\\n| Italian           | it      | 意大利语        |\\n| German            | de      | 德语            |\\n| Vietnamese        | vi      | 越南语          |\\n| Malay             | ms      | 马来语          |\\n| Indonesian        | id      | 印尼语          |\\n| Filipino          | tl      | 菲律宾语        |\\n| Hindi             | hi      | 印地语          |\\n| Traditional Chinese | zh-Hant| 繁体中文        |\\n| Polish            | pl      | 波兰语          |\\n| Czech             | cs      | 捷克语          |\\n| Dutch             | nl      | 荷兰语          |\\n| Khmer             | km      | 高棉语          |\\n| Burmese           | my      | 缅甸语          |\\n| Persian           | fa      | 波斯语          |\\n| Gujarati          | gu      | 古吉拉特语      |\\n| Urdu              | ur      | 乌尔都语        |\\n| Telugu            | te      | 泰卢固语        |\\n| Marathi           | mr      | 马拉地语        |\\n| Hebrew            | he      | 希伯来语        |\\n| Bengali           | bn      | 孟加拉语        |\\n| Tamil             | ta      | 泰米尔语        |\\n| Ukrainian         | uk      | 乌克兰语        |\\n| Tibetan           | bo      | 藏语            |\\n| Kazakh  
          | kk      | 哈萨克语        |\\n| Mongolian         | mn      | 蒙古语          |\\n| Uyghur            | ug      | 维吾尔语        |\\n| Cantonese         | yue     | 粤语            |\\n\\nCiting HY-MT1.5:\\n\\n```bibtex\\n@misc{hy-mt1.5,\\n      title={HY-MT1.5 Technical Report}, \\n      author={Mao Zheng and Zheng Li and Tao Chen and Mingyang Song and Di Wang},\\n      year={2025},\\n      eprint={2512.24092},\\n      archivePrefix={arXiv},\\n      primaryClass={cs.CL},\\n      url={ \\n}\\n```',\n",
       "  'domain': 'translation'},\n",
       " {'model_id': 'google/videoprism-base-f16r288',\n",
       "  'created_at': '2025-06-14T21:58:29+00:00',\n",
       "  'downloads': 239219,\n",
       "  'likes': 92,\n",
       "  'author': None,\n",
       "  'tags': ['videoprism',\n",
       "   'video-classification',\n",
       "   'video-embedding',\n",
       "   'arxiv:2402.13217',\n",
       "   'arxiv:2205.01917',\n",
       "   'arxiv:2103.15691',\n",
       "   'arxiv:2007.14937',\n",
       "   'arxiv:2106.02636',\n",
       "   'arxiv:2204.00679',\n",
       "   'arxiv:2307.06942',\n",
       "   'license:apache-2.0',\n",
       "   'region:us'],\n",
       "  'modelcard': '\\n# VideoPrism Model Card\\n\\n**Paper**: \\n\\n**arXiv**: \\n\\n**GitHub**: \\n\\n**Blog**: \\n\\nVideoPrism is a foundational video encoder that enables state-of-the-art performance on a large variety of video understanding tasks. It takes video frames as input and outputs compact embeddings of the frames, which one can conveniently feed into classifiers, LLMs, retrieval models, etc. When tested on 33 public video understanding benchmarks over four task categories, a single frozen VideoPrism checkpoint outperforms previous best-performing foundation models on 31 of them, with no fine-tuning on target task datasets.\\n\\n## Model details\\n\\nWe release the following model variants:\\n\\n| Model Name | Configuration Name | Model Type | Backbone | #Params | File Size | Checkpoint |\\n| -------- | -------- | ------- | :-------: | :-------: | :-------: | :-------: |\\n| VideoPrism-B | `videoprism_public_v1_base`  | Video encoder | ViT-B | 114M | 458MB |  |\\n| VideoPrism-L | `videoprism_public_v1_large` | Video encoder | ViT-L | 354M | 1.42GB |  |\\n| VideoPrism-LvT-B | `videoprism_lvt_public_v1_base`  | Video-text encoders | ViT-B | 248M | 991MB |  |\\n| VideoPrism-LvT-L | `videoprism_lvt_public_v1_large` | Video-text encoders | ViT-L | 580M | 2.30GB |  |\\n\\n\\n### Model description\\n\\nVideoPrism-B/L are the composition of a Vision Transformer image encoder and four temporal-attention Transformer layers. The image encoder and text encoder are initialized from , which is trained on WebLI following the CoCa recipes. VideoPrism is based on the  factorized video encoder architecture.\\n\\n### Inputs and outputs\\nThe models take videos with shape (num_frames, 288, 288, 3) as inputs and outputs embeddings with shape (num_frames * 16 * 16, feature_channels) which could be reshaped into (num_frames, 16, 16, feature_channels) for spatiotemporal representations. 
During model training, num_frames is set to 16 and 8 for VideoPrism-B and VideoPrism-L, respectively. Both models are expected to work with arbitrary num_frames by interpolating the temporal positional embeddings.\\n\\nIn video-text models, both video and text encoders produce global embeddings with shape `(feature_channels)`, whose similarities could be measured by cosine distances. We use the `c4_en`  model for text tokenization. During inference, embedding calculation for either modality can be skipped by providing `None` as the input.\\n\\n## Uses\\nVideoPrism has a wide range of applications across various video understanding scenarios. The following lists some primary use cases and yet is not comprehensive. The purpose of this list is to provide contextual information the model creators considered as part of model training and development.\\n*    **Video classification**: By feeding the video embeddings to a lightweight classifier, we can tackle video action recognition, a fundamental task in video understanding, under various scenarios.\\n*    **Temporal and spatiotemporal localization**: We can also use the model to localize actions of interest spatially across time by equipping it with a bounding box proposal.\\n*    **Video retrieval and open-set classification**: By pairing up the video embeddings with a text encoder in the CLIP fashion, we can do text-video retrieval and open-set video classification.\\n\\n\\n## Ethical considerations and risks\\nThe model inherits the safety benefits and safety risks associated with the image encoder CoCa and the training datasets described above. We recommend that the model should not be used for downstream applications without prior assessment and mitigation of downstream application-specific security and fairness concerns.\\n*    Data bias: Large datasets scraped from the internet can contain inherent biases, leading to skewed model performance and potentially discriminatory outputs. 
The presence of \"noisy parallel text\" like ASR transcripts introduces potential inaccuracies and biases from the speech-to-text process.\\n*    Content moderation: The sheer volume of data (36M video-caption pairs and 582M video clips) raises concerns about the presence of objectionable or inappropriate content within the training data, which could lead to harmful model outputs.\\n*    Ethical use: As with any powerful video understanding model, there are risks of misuse, such as in surveillance or the propagation of misinformation.\\n*    Limitations: The reliance on potentially noisy text data can limit the models understanding of the true video content. Further research is needed to refine the models ability to understand long form videos, geometric information in videos, and non-semantic cues.\\n\\n\\n## How to get started with the model\\nTo get started with our models, please see the code and examples in our . \\n\\n### Feedback and Questions\\n\\nWe welcome all questions and feedback! If you find a bug, have a feature request, or want to ask a question, please don\\'t hesitate to **open an issue** on our GitHub repository.\\n\\nWe\\'re excited to see what you build with VideoPrism! 🚀\\n\\n\\n## Training details\\n\\n### Training data\\n\\nVideoPrism is pre-trained on a wide range of videos (36M video-caption pairs and 582M video clips), including the datasets below. 
Note that the number of clips are subject to change due to wipeout according to policy.\\n\\n| Pretraining datasets | Public | Domain | Caption source | Caption quality | # of videos | # of clips |\\n| :--- | :--- | :--- | :--- | :--- | :--- | :--- |\\n| Anonymous-Corpus #1 | ❌ | Web video | Manual labelled | High | 36.1M | 36.1M |\\n|  | ✅ | YouTube video | Metadata | Low | 55.1M | 55.1M |\\n|  | ✅ | YouTube video | ASR | Low | 2.3M | 87.8M |\\n|  | ❌ | YouTube video | Image captions for mining | Low | 133.5M | 191.1M |\\n|  | ✅ | YouTube video | Generated by VLM/LLM | Medium | 2.8M | 7.0M |\\n| Anonymous-Corpus #2 | ❌ | YouTube video | ASR | Low | 44.6M | 170.3M |\\n| Anonymous-Corpus #3 | ❌ | YouTube video | Generated by VLM/LLM | Medium | 36.7M | 71.5M |\\n\\n## Evaluation\\n\\nIn the tables below, \"Public\" denotes models we released in this repository. \"Paper\" and \"Prior SOTA\" denote our models and previous best-performing models reported in the paper, respectively. Our public models perform slightly worse than the paper models due to different pre-training image-text data we used subject to data policy.\\n\\n\\n### Results on video-focused tasks with frozen backbones\\n\\n| Dataset | K400 | MiT | SSv2 | D48 | Charades | ActivityNet | AVA | AVA-K |\\n| :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- |\\n| **VideoPrism-B (public)** | 82.9 | 39.7 | 62.2 | 64.3 | 43.5 | 36.5 | 28.3 | 30.8 |\\n| **VideoPrism-L (public)** | 85.0 | 43.3 | 64.6 | 67.6 | 53.2 | 37.0 | 32.4 | 34.5 |\\n| VideoPrism-B (paper) | 84.2 | 40.8 | 63.6 | 67.4 | 40.4 | 36.6 | 30.6 | 31.8 |\\n| VideoPrism-g (paper) | 87.2 | 45.5 | 68.5 | 71.3 | 62.3 | 37.8 | 36.2 | 37.3 |\\n| Prior SOTA (B) | 77.1 | 34.0 | 58.2 | 55.6 | 33.3 | 35.8 | 21.1 | 25.9 |\\n| Prior SOTA (L+) | 82.8 | 40.3 | 67.4 | 69.6 | 39.9 | 36.7 | 24.4 | 26.2 |\\n\\n### Zero-shot video-text retrieval\\n\\n| Models | MSRVTT-1K (v2t)  | MSRVTT-1K (t2v) | VATEX (v2t) | VATEX (t2v) | ActivityNet (v2t) | ActivityNet 
(t2v) |\\n| -------- | :-------: | :-------: | :-------: | :-------: | :-------: | :-------: |\\n| **VideoPrism-LvT-B (public)** | 49.8 | 50.1 | 73.1 | 56.2 | 47.9 | 48.8 |\\n| **VideoPrism-LvT-L (public)** | 50.6 | 50.1 | 75.0 | 57.2 | 49.1 | 51.3 |\\n| VideoPrism-LvT-B (paper) | 50.2 | 51.4 | 76.2 | 57.7 | 47.9 | 49.6 |\\n| VideoPrism-LvT-g (paper) | 51.7 | 52.7 | 77.1 | 62.5 | 50.3 | 52.7 |\\n| Prior SOTA (B) | - | 34.0 | - | - | - | 30.6 |\\n| Prior SOTA (L+) | 45.4 | 43.9 | 73.6 | 53.2 | 40.7 | 42.8 |\\n\\n### Zero-shot video classification\\n\\n| Models | K400 | SSv2 (Temporal) | SSv2 (Events) | NExT-QA (Hard) | Charades | Charades (STA) |\\n| -------- | :-------: | :-------: | :-------: | :-------: | :-------: | :-------: |\\n| **VideoPrism-LvT-B (public)** | 69.2 | 14.6 | 11.3 | 31.1 | 26.9 | 48.6 |\\n| **VideoPrism-LvT-L (public)** | 72.4 | 18.0 | 12.4 | 32.1 | 32.4 | 50.2 |\\n| VideoPrism-LvT-B (paper) | 71.3 | 16.1 | 11.9 | 31.3 | 29.2 | 50.0 |\\n| VideoPrism-LvT-g (paper) | 74.6 | 18.6 | 15.7 | 32.7 | 32.4 | 50.4 |\\n| Prior SOTA (B) | - | 9.8 | 6.4 | 27.6 | 21.1 | - |\\n| Prior SOTA (L+) | 72.0 | 15.2 | 11.4 | 25.2 | 25.8 | 47.2 |\\n\\n\\n## Implementation information\\n\\n### Model architecture\\n\\nVision model is a  factorized video encoder architecture, initialized from the Vision Transformer image encoder () followed by four temporal-attention Transformer layers.\\n\\n### Hardware\\n\\nVideoPrism was trained using  hardware.\\n\\n#### Software\\n\\nJAX, Flax\\n\\n## Citation\\n\\nVideoPrism:\\n```\\n@inproceedings{zhao2024videoprism,\\n  title = {{VideoPrism}: A Foundational Visual Encoder for Video Understanding},\\n  author = {Long Zhao and Nitesh B. Gundavarapu and Liangzhe Yuan and Hao Zhou and Shen Yan and Jennifer J. Sun and Luke Friedman and Rui Qian and Tobias Weyand and Yue Zhao and Rachel Hornung and Florian Schroff and Ming-Hsuan Yang and David A. 
Ross and Huisheng Wang and Hartwig Adam and Mikhail Sirotenko and Ting Liu and Boqing Gong},\\n  booktitle = {International Conference on Machine Learning (ICML)},\\n  year = {2024}\\n}\\n```\\n\\nVideoGLUE benchmarks:\\n```\\n@article{yuan2024videoglue,\\n  title = {{VideoGLUE}: Video General Understanding Evaluation of Foundation Models},\\n  author = {Liangzhe Yuan and Nitesh Bharadwaj Gundavarapu and Long Zhao and Hao Zhou and Yin Cui and Lu Jiang and Xuan Yang and Menglin Jia and Tobias Weyand and Luke Friedman and Mikhail Sirotenko and Huisheng Wang and Florian Schroff and Hartwig Adam and Ming-Hsuan Yang and Ting Liu and Boqing Gong},\\n  journal = {Transactions on Machine Learning Research (TMLR)},\\n  year = {2024}\\n}\\n```',\n",
       "  'domain': 'video-classification'},\n",
       " {'model_id': 'autogluon/mitra-classifier',\n",
       "  'created_at': '2025-06-22T23:27:19+00:00',\n",
       "  'downloads': 225192,\n",
       "  'likes': 36,\n",
       "  'author': None,\n",
       "  'tags': ['safetensors',\n",
       "   'tabular-classification',\n",
       "   'arxiv:2510.21204',\n",
       "   'license:apache-2.0',\n",
       "   'region:us'],\n",
       "  'modelcard': '\\n# Mitra Classifier\\n\\nMitra classifier is a tabular foundation model that is pre-trained on purely synthetic datasets sampled from a mix of random classifiers. \\n\\n## Architecture\\n\\nMitra is based on a 12-layer Transformer of 72 M parameters, pre-trained by incorporating an in-context learning paradigm.\\n\\n## Usage\\n\\nTo use Mitra classifier, install AutoGluon by running:\\n\\n```sh\\npip install uv\\nuv pip install autogluon.tabular[mitra]   \\n```\\n\\nA minimal example showing how to perform inference using the Mitra classifier:\\n\\n```python\\nimport pandas as pd\\nfrom autogluon.tabular import TabularDataset, TabularPredictor\\nfrom sklearn.model_selection import train_test_split\\nfrom sklearn.datasets import load_wine\\n\\n# Load datasets\\nwine_data = load_wine()\\nwine_df = pd.DataFrame(wine_data.data, columns=wine_data.feature_names)\\nwine_df[\\'target\\'] = wine_data.target\\n\\nprint(\"Dataset shapes:\")\\nprint(f\"Wine: {wine_df.shape}\")\\n\\n# Create train/test splits (80/20)\\nwine_train, wine_test = train_test_split(wine_df, test_size=0.2, random_state=42, stratify=wine_df[\\'target\\'])\\n\\nprint(\"Training set sizes:\")\\nprint(f\"Wine: {len(wine_train)} samples\")\\n\\n# Convert to TabularDataset\\nwine_train_data = TabularDataset(wine_train)\\nwine_test_data = TabularDataset(wine_test)\\n\\n# Create predictor with Mitra\\nprint(\"Training Mitra classifier on classification dataset...\")\\nmitra_predictor = TabularPredictor(label=\\'target\\')\\nmitra_predictor.fit(\\n    wine_train_data,\\n    hyperparameters={\\n        \\'MITRA\\': {\\'fine_tune\\': False}\\n    },\\n   )\\n\\nprint(\"\\\\nMitra training completed!\")\\n\\n# Make predictions\\nmitra_predictions = mitra_predictor.predict(wine_test_data)\\nprint(\"Sample Mitra predictions:\")\\nprint(mitra_predictions.head(10))\\n\\n# Show prediction probabilities for first few samples\\nmitra_predictions = 
mitra_predictor.predict_proba(wine_test_data)\\nprint(mitra_predictions.head())\\n\\n# Show model leaderboard\\nprint(\"\\\\nMitra Model Leaderboard:\")\\nmitra_predictor.leaderboard(wine_test_data)\\n```\\n\\nA minimal example showing how to perform fine-tuning using the Mitra classifier:\\n\\n```python\\nmitra_predictor_ft = TabularPredictor(label=\\'target\\')\\nmitra_predictor_ft.fit(\\n    wine_train_data,\\n    hyperparameters={\\n        \\'MITRA\\': {\\'fine_tune\\': True, \\'fine_tune_steps\\': 10}\\n    },\\n    time_limit=120,  # 2 minutes\\n   )\\n\\nprint(\"\\\\nMitra fine-tuning completed!\")\\n\\n# Show model leaderboard\\nprint(\"\\\\nMitra Model Leaderboard:\")\\nmitra_predictor_ft.leaderboard(wine_test_data)\\n```\\n\\n## License\\n\\nThis project is licensed under the Apache-2.0 License.\\n\\n## Reference\\n\\n```\\n@article{zhang2025mitra,\\n  title={Mitra: Mixed synthetic priors for enhancing tabular foundation models},\\n  author={Zhang, Xiyuan and Maddix, Danielle C and Yin, Junming and Erickson, Nick and Ansari, Abdul Fatir and Han, Boran and Zhang, Shuai and Akoglu, Leman and Faloutsos, Christos and Mahoney, Michael W and others},\\n  journal={arXiv preprint arXiv:2510.21204},\\n  year={2025}\\n}\\n```\\n\\nAmazon Science blog: ',\n",
       "  'domain': 'tabular-classification'},\n",
       " {'model_id': 'OpenMed/OpenMed-NER-PharmaDetect-SuperClinical-434M',\n",
       "  'created_at': '2025-07-16T18:50:49+00:00',\n",
       "  'downloads': 218720,\n",
       "  'likes': 20,\n",
       "  'author': None,\n",
       "  'tags': ['transformers',\n",
       "   'safetensors',\n",
       "   'deberta-v2',\n",
       "   'token-classification',\n",
       "   'named-entity-recognition',\n",
       "   'biomedical-nlp',\n",
       "   'chemical-entity-recognition',\n",
       "   'drug-discovery',\n",
       "   'pharmacology',\n",
       "   'biocuration',\n",
       "   'chem',\n",
       "   'en',\n",
       "   'arxiv:2508.01630',\n",
       "   'license:apache-2.0',\n",
       "   'endpoints_compatible',\n",
       "   'region:us'],\n",
       "  'modelcard': '\\n# 🧬 \\n\\n**Specialized model for Chemical Entity Recognition - Chemical entities from the BC5CDR dataset**\\n\\n\\n\\n\\n\\n\\n## 📋 Model Overview\\n\\nThis model is a **state-of-the-art** fine-tuned transformer engineered to deliver **enterprise-grade accuracy** for chemical entity recognition - chemical entities from the bc5cdr dataset. This specialized model excels at identifying and extracting biomedical entities from clinical texts, research papers, and healthcare documents, enabling applications such as **drug interaction detection**, **medication extraction from patient records**, **adverse event monitoring**, **literature mining for drug discovery**, and **biomedical knowledge graph construction** with **production-ready reliability** for clinical and research applications.\\n\\n### 🎯 Key Features\\n- **High Precision**: Optimized for biomedical entity recognition\\n- **Domain-Specific**: Trained on curated BC5CDR_CHEM dataset\\n- **Production-Ready**: Validated on clinical benchmarks\\n- **Easy Integration**: Compatible with Hugging Face Transformers ecosystem\\n\\n### 🏷️ Supported Entity Types\\n\\nThis model can identify and classify the following biomedical entities:\\n\\n- `B-CHEM`\\n- `I-CHEM`\\n\\n## 📊 Dataset\\n\\nBC5CDR-Chem focuses on chemical entity recognition from the BioCreative V Chemical-Disease Relation extraction task.\\n\\nThe BC5CDR-Chem corpus is part of the BioCreative V Chemical-Disease Relation (CDR) extraction challenge, specifically targeting chemical entity recognition in biomedical texts. This dataset contains 1,500 PubMed abstracts with 4,409 annotated chemical entities, designed to support automated drug discovery and pharmacovigilance applications. The corpus emphasizes chemical compounds, drugs, and therapeutic substances that are relevant for understanding chemical-disease relationships. 
It serves as a critical resource for developing NER systems that can identify chemical entities for downstream tasks like adverse drug reaction detection and drug repurposing research.\\n\\n\\n## 📊 Performance Metrics\\n\\n### Current Model Performance\\n- **F1 Score**: `0.96`\\n- **Precision**: `0.95`\\n- **Recall**: `0.97`\\n- **Accuracy**: `0.99`\\n\\n### 🏆 Comparative Performance on BC5CDR_CHEM Dataset\\n\\n| Rank | Model | F1 Score | Precision | Recall | Accuracy |\\n|------|-------|----------|-----------|--------|-----------|\\n| 🥇 1 |  | **0.9614** | 0.9520 | 0.9710 | 0.9892 |\\n| 🥈 2 |  | **0.9610** | 0.9585 | 0.9634 | 0.9871 |\\n| 🥉 3 |  | **0.9594** | 0.9539 | 0.9649 | 0.9863 |\\n|  4 |  | **0.9587** | 0.9521 | 0.9654 | 0.9902 |\\n|  5 |  | **0.9585** | 0.9520 | 0.9651 | 0.9881 |\\n|  6 |  | **0.9583** | 0.9511 | 0.9656 | 0.9857 |\\n|  7 |  | **0.9562** | 0.9483 | 0.9642 | 0.9888 |\\n|  8 |  | **0.9560** | 0.9504 | 0.9617 | 0.9849 |\\n|  9 |  | **0.9555** | 0.9417 | 0.9697 | 0.9889 |\\n|  10 |  | **0.9550** | 0.9442 | 0.9662 | 0.9871 |\\n\\n\\n*Rankings based on F1-score performance across all models trained on this dataset.*\\n\\n\\n\\n*Figure: OpenMed (Open-Source) vs. 
Latest SOTA (Closed-Source) performance comparison across biomedical NER datasets.*\\n\\n## 🚀 Quick Start\\n\\n### Installation\\n\\n```bash\\npip install transformers torch\\n```\\n\\n### Usage\\n\\n```python\\nfrom transformers import pipeline\\n\\n# Load the model and tokenizer\\n# Model: \\nmodel_name = \"OpenMed/OpenMed-NER-PharmaDetect-SuperClinical-434M\"\\n\\n# Create a pipeline\\nmedical_ner_pipeline = pipeline(\\n    model=model_name,\\n    aggregation_strategy=\"simple\"\\n)\\n\\n# Example usage\\ntext = \"Administration of metformin reduced glucose levels significantly.\"\\nentities = medical_ner_pipeline(text)\\n\\nprint(entities)\\n\\ntoken = entities[0]\\nprint(text[token[\"start\"] : token[\"end\"]])\\n```\\n\\nNOTE: The `aggregation_strategy` parameter defines how token predictions are grouped into entities. For a detailed explanation, please refer to the .\\n\\nHere is a summary of the available strategies:\\n- **`none`**: Returns raw token predictions without any aggregation.\\n- **`simple`**: Groups adjacent tokens with the same entity type (e.g., `B-LOC` followed by `I-LOC`).\\n- **`first`**: For word-based models, if tokens within a word have different entity tags, the tag of the first token is assigned to the entire word.\\n- **`average`**: For word-based models, this strategy averages the scores of tokens within a word and applies the label with the highest resulting score.\\n- **`max`**: For word-based models, the entity label from the token with the highest score within a word is assigned to the entire word.\\n\\n### Batch Processing\\n\\nFor efficient processing of large datasets, use proper batching with the `batch_size` parameter:\\n\\n```python\\ntexts = [\\n    \"Administration of metformin reduced glucose levels significantly.\",\\n    \"The study evaluated the efficacy of cisplatin in cancer treatment.\",\\n    \"Patients received ibuprofen for inflammation management.\",\\n    \"The patient\\'s medication was switched to tamoxifen 
to prevent breast cancer recurrence.\",\\n    \"Lithium carbonate is often prescribed for the management of bipolar disorder.\",\\n]\\n\\n# Efficient batch processing with optimized batch size\\n# Adjust batch_size based on your GPU memory (typically 8, 16, 32, or 64)\\nresults = medical_ner_pipeline(texts, batch_size=8)\\n\\nfor i, entities in enumerate(results):\\n    print(f\"Text {i+1} entities:\")\\n    for entity in entities:\\n        print(f\"  - {entity[\\'word\\']} ({entity[\\'entity_group\\']}): {entity[\\'score\\']:.4f}\")\\n```\\n\\n### Large Dataset Processing\\n\\nFor processing large datasets efficiently:\\n\\n```python\\nfrom transformers.pipelines.pt_utils import KeyDataset\\nfrom datasets import Dataset\\nimport pandas as pd\\n\\n# Load your data\\n# Load a medical dataset from Hugging Face\\nfrom datasets import load_dataset\\n\\n# Load a public medical dataset (using a subset for testing)\\nmedical_dataset = load_dataset(\"BI55/MedText\", split=\"train[:100]\")  # Load first 100 examples\\ndata = pd.DataFrame({\"text\": medical_dataset[\"Completion\"]})\\ndataset = Dataset.from_pandas(data)\\n\\n# Process with optimal batching for your hardware\\nbatch_size = 16  # Tune this based on your GPU memory\\nresults = []\\n\\nfor out in medical_ner_pipeline(KeyDataset(dataset, \"text\"), batch_size=batch_size):\\n    results.extend(out)\\n\\nprint(f\"Processed {len(results)} texts with batching\")\\n\\n```\\n\\n### Performance Optimization\\n\\n**Batch Size Guidelines:**\\n- **CPU**: Start with batch_size=1-4\\n- **Single GPU**: Try batch_size=8-32 depending on GPU memory\\n- **High-end GPU**: Can handle batch_size=64 or higher\\n- **Monitor GPU utilization** to find the optimal batch size for your hardware\\n\\n**Memory Considerations:**\\n```python\\n# For limited GPU memory, use smaller batches\\nmedical_ner_pipeline = pipeline(\\n    model=model_name,\\n    aggregation_strategy=\"simple\",\\n    device=0  # Specify GPU device\\n)\\n\\n# Process 
with memory-efficient batching\\nfor batch_start in range(0, len(texts), batch_size):\\n    batch = texts[batch_start:batch_start + batch_size]\\n    batch_results = medical_ner_pipeline(batch, batch_size=len(batch))\\n    results.extend(batch_results)\\n```\\n\\n## 📚 Dataset Information\\n\\n- **Dataset**: BC5CDR_CHEM\\n- **Description**: Chemical Entity Recognition - Chemical entities from the BC5CDR dataset\\n\\n### Training Details\\n- **Base Model**: deberta-v3-large\\n- **Training Framework**: Hugging Face Transformers\\n- **Optimization**: AdamW optimizer with learning rate scheduling\\n- **Validation**: Cross-validation on held-out test set\\n\\n## 🔬 Model Architecture\\n\\n- **Base Architecture**: deberta-v3-large\\n- **Task**: Token Classification (Named Entity Recognition)\\n- **Labels**: Dataset-specific entity types\\n- **Input**: Tokenized biomedical text\\n- **Output**: BIO-tagged entity predictions\\n\\n## 💡 Use Cases\\n\\nThis model is particularly useful for:\\n- **Clinical Text Mining**: Extracting entities from medical records\\n- **Biomedical Research**: Processing scientific literature\\n- **Drug Discovery**: Identifying chemical compounds and drugs\\n- **Healthcare Analytics**: Analyzing patient data and outcomes\\n- **Academic Research**: Supporting biomedical NLP research\\n\\n## 📜 License\\n\\nLicensed under the Apache License 2.0. See  for details.\\n\\n## 🤝 Contributing\\n\\nWe welcome contributions of all kinds! 
Whether you have ideas, feature requests, or want to join our mission to advance open-source Healthcare AI, we\\'d love to hear from you.\\n\\nFollow  on Hugging Face 🤗 and click \"Watch\" to stay updated on our latest releases and developments.\\n\\n## Citation\\n\\nIf you use this model in your research or applications, please cite the following paper:\\n\\n```latex\\n@misc{panahi2025openmedneropensourcedomainadapted,\\n      title={OpenMed NER: Open-Source, Domain-Adapted State-of-the-Art Transformers for Biomedical NER Across 12 Public Datasets},\\n      author={Maziyar Panahi},\\n      year={2025},\\n      eprint={2508.01630},\\n      archivePrefix={arXiv},\\n      primaryClass={cs.CL},\\n      url={\\n}\\n```\\n\\nProper citation helps support and acknowledge my work. Thank you!',\n",
       "  'domain': 'token-classification'},\n",
       " {'model_id': 'microsoft/Phi-4-multimodal-instruct',\n",
       "  'created_at': '2025-02-24T22:33:32+00:00',\n",
       "  'downloads': 213980,\n",
       "  'likes': 1563,\n",
       "  'author': None,\n",
       "  'tags': ['transformers',\n",
       "   'safetensors',\n",
       "   'phi4mm',\n",
       "   'text-generation',\n",
       "   'nlp',\n",
       "   'code',\n",
       "   'audio',\n",
       "   'automatic-speech-recognition',\n",
       "   'speech-summarization',\n",
       "   'speech-translation',\n",
       "   'visual-question-answering',\n",
       "   'phi-4-multimodal',\n",
       "   'phi',\n",
       "   'phi-4-mini',\n",
       "   'custom_code',\n",
       "   'multilingual',\n",
       "   'ar',\n",
       "   'zh',\n",
       "   'cs',\n",
       "   'da',\n",
       "   'nl',\n",
       "   'en',\n",
       "   'fi',\n",
       "   'fr',\n",
       "   'de',\n",
       "   'he',\n",
       "   'hu',\n",
       "   'it',\n",
       "   'ja',\n",
       "   'ko',\n",
       "   'no',\n",
       "   'pl',\n",
       "   'pt',\n",
       "   'ru',\n",
       "   'es',\n",
       "   'sv',\n",
       "   'th',\n",
       "   'tr',\n",
       "   'uk',\n",
       "   'arxiv:2503.01743',\n",
       "   'arxiv:2407.13833',\n",
       "   'license:mit',\n",
       "   'region:us'],\n",
       "  'modelcard': '🎉**Phi-4**:  | ] |  | ]; \\n | ]\\n\\n\\n\\n## Model Summary\\n\\nPhi-4-multimodal-instruct is a lightweight open multimodal foundation\\nmodel that leverages the language, vision, and speech research\\nand datasets used for Phi-3.5 and 4.0 models. The model processes text,\\nimage, and audio inputs, generating text outputs, and comes with\\n128K token context length. The model underwent an enhancement process,\\nincorporating both supervised fine-tuning, direct preference\\noptimization and RLHF (Reinforcement Learning from Human Feedback)\\nto support precise instruction adherence and safety measures.\\nThe languages that each modal supports are the following:\\n- Text: Arabic, Chinese, Czech, Danish, Dutch, English, Finnish,\\nFrench, German, Hebrew, Hungarian, Italian, Japanese, Korean, Norwegian,\\nPolish, Portuguese, Russian, Spanish, Swedish, Thai, Turkish, Ukrainian\\n- Vision: English\\n- Audio: English, Chinese, German, French, Italian, Japanese, Spanish, Portuguese\\n\\n📰  \\n📖  \\n🏡  \\n👩\\u200d🍳  \\n🖥️ Try It on , \\n,\\n,\\n playgrounds\\n📱Huggingface Spaces \\n, \\n, \\n \\n\\n\\nWatch as Phi-4 Multimodal analyzes spoken language to help plan a trip to Seattle, demonstrating its advanced audio processing and recommendation capabilities.\\n\\n\\n  \\n    \\n    Your browser does not support the video tag.\\n  \\n\\n\\nSee how Phi-4 Multimodal tackles complex mathematical problems through visual inputs, demonstrating its ability to process and solve equations presented in images.\\n\\n  \\n    \\n    Your browser does not support the video tag.\\n  \\n\\n\\nExplore how Phi-4 Mini functions as an intelligent agent, showcasing its reasoning and task execution abilities in complex scenarios.\\n\\n  \\n    \\n    Your browser does not support the video tag.\\n  \\n\\n\\n\\n## Intended Uses\\n\\n### Primary Use Cases\\n\\nThe model is intended for broad multilingual and multimodal commercial and research use . 
The model provides uses for general purpose AI systems and applications which require\\n\\n1) Memory/compute constrained environments\\n2) Latency bound scenarios\\n3) Strong reasoning (especially math and logic)\\n4) Function and tool calling\\n5) General image understanding\\n6) Optical character recognition\\n7) Chart and table understanding\\n8) Multiple image comparison\\n9) Multi-image or video clip summarization\\n10) Speech recognition\\n11) Speech translation\\n12) Speech QA\\n13) Speech summarization\\n14) Audio understanding\\n\\nThe model is designed to accelerate research on language and multimodal models, for use as a building block for generative AI powered features. \\n\\n### Use Case Considerations\\n\\nThe model is not specifically designed or evaluated for all downstream purposes. Developers should consider common limitations of language models and multimodal models, as well as performance difference across languages, as they select use cases, and evaluate and mitigate for accuracy, safety, and fairness before using within a specific downstream use case, particularly for high-risk scenarios. \\nDevelopers should be aware of and adhere to applicable laws or regulations (including but not limited to privacy, trade compliance laws, etc.) that are relevant to their use case. \\n\\n***Nothing contained in this Model Card should be interpreted as or deemed a restriction or modification to the license the model is released under.*** \\n\\n## Release Notes \\n\\nThis release of Phi-4-multimodal-instruct is based on valuable user feedback from the Phi-3 series. Previously, users could use a speech recognition model to talk to the Mini and Vision models. To achieve this, users needed to use a pipeline of two models: one model to transcribe the audio to text, and another model for the language or vision tasks. This pipeline means that the core model was not provided the full breadth of input information – e.g. 
cannot directly observe multiple speakers, background noises, jointly align speech, vision, language information at the same time on the same representation space.\\nWith Phi-4-multimodal-instruct, a single new open model has been trained across text, vision, and audio, meaning that all inputs and outputs are processed by the same neural network. The model  employed new architecture, larger vocabulary for efficiency, multilingual, and multimodal support, and better post-training techniques were used for instruction following and function calling, as well as additional data leading to substantial gains on key multimodal capabilities.\\nIt is anticipated that Phi-4-multimodal-instruct will greatly benefit app developers and various use cases. The enthusiastic support for the Phi-4 series is greatly appreciated. Feedback on Phi-4 is welcomed and crucial to the model\\'s evolution and improvement. Thank you for being part of this journey!\\n\\n## Model Quality\\n\\n  Click to view details\\n\\nTo understand the capabilities, Phi-4-multimodal-instruct  was compared with a set of models over a variety of benchmarks using an internal benchmark platform (See Appendix A for benchmark methodology). Users can refer to the Phi-4-Mini-Instruct model card for details of language benchmarks. At the high-level overview of the model quality on representative speech and vision benchmarks:\\n\\n### Speech\\n\\nThe Phi-4-multimodal-instruct was observed as\\n- Having strong automatic speech recognition (ASR) and speech translation (ST) performance, surpassing expert ASR model WhisperV3 and ST models SeamlessM4T-v2-Large. \\n- Ranking number 1 on the  leaderboard with word error rate 6.14% in comparison with the current best model 6.5% as of March 04, 2025. \\n- Being the first open-sourced model that can perform speech summarization, and the performance is close to GPT4o.\\n- Having a gap with close models, e.g. Gemini-1.5-Flash and GPT-4o-realtime-preview, on speech QA task. 
Work is being undertaken to improve this capability in the next iterations.\\n\\n#### Speech Recognition (lower is better)\\n\\nThe performance of Phi-4-multimodal-instruct on the aggregated benchmark datasets:\\n\\n\\nThe performance of Phi-4-multimodal-instruct on different languages, averaging the WERs of CommonVoice and FLEURS:\\n\\n\\n\\n#### Speech Translation (higher is better)\\n\\nTranslating from German, Spanish, French, Italian, Japanese, Portugues, Chinese to English:\\n\\n\\n\\nTranslating from English to German, Spanish, French, Italian, Japanese, Portugues, Chinese. Noted that WhiperV3 does not support this capability: \\n\\n\\n\\n\\n#### Speech Summarization (higher is better)\\n\\n\\n\\n#### Speech QA\\n\\nMT bench scores are scaled by 10x to match the score range of MMMLU:\\n\\n\\n\\n#### Audio Understanding\\n\\nAIR bench scores are scaled by 10x to match the score range of MMAU:\\n\\n\\n\\n### Vision\\n\\n#### Vision-Speech tasks\\n\\nPhi-4-multimodal-instruct is capable of processing both image and audio together, the following table shows the model quality when the input query for vision content is synthetic speech on chart/table understanding and document reasoning tasks. 
Compared to other existing state-of-the-art omni models that can enable audio and visual signal as input, Phi-4-multimodal-instruct achieves much stronger performance on multiple benchmarks.\\n\\n| Benchmarks            | Phi-4-multimodal-instruct | InternOmni-7B | Gemini-2.0-Flash-Lite-prv-02-05 | Gemini-2.0-Flash | Gemini-1.5-Pro |\\n|-----------------------|--------------------------|---------------|--------------------------------|-----------------|----------------|\\n| s_AI2D                | **68.9**                 | 53.9          | 62.0                           | **69.4**        | 67.7           |\\n| s_ChartQA             | **69.0**                 | 56.1          | 35.5                           | 51.3            | 46.9           |\\n| s_DocVQA              | **87.3**                 | 79.9          | 76.0                           | 80.3            | 78.2           |\\n| s_InfoVQA             | **63.7**                 | 60.3          | 59.4                           | 63.6            | **66.1**       |\\n| **Average**           | **72.2**                 | **62.6**      | **58.2**                       | **66.2**        | **64.7**       |\\n\\n### Vision tasks\\nTo understand the vision capabilities, Phi-4-multimodal-instruct was compared with a set of models over a variety of zero-shot benchmarks using an internal benchmark platform. 
At the high-level overview of the model quality on representative benchmarks:\\n\\n| Dataset                          | Phi-4-multimodal-ins | Phi-3.5-vision-ins | Qwen 2.5-VL-3B-ins | Intern VL 2.5-4B | Qwen 2.5-VL-7B-ins | Intern VL 2.5-8B | Gemini 2.0-Flash Lite-preview-0205 | Gemini2.0-Flash | Claude-3.5-Sonnet-2024-10-22 | Gpt-4o-2024-11-20 |\\n|----------------------------------|---------------------|-------------------|-------------------|-----------------|-------------------|-----------------|--------------------------------|-----------------|----------------------------|------------------|\\n| **Popular aggregated benchmark** |                     |                   |                   |                 |                   |                 |                                |                 |                            |                  |\\n| MMMU                             | **55.1**            | 43.0              | 47.0              | 48.3            | 51.8              | 50.6            | 54.1                           | **64.7**        | 55.8                       | 61.7             |\\n| MMBench (dev-en)                 | **86.7**            | 81.9              | 84.3              | 86.8            | 87.8              | 88.2            | 85.0                           | **90.0**        | 86.7                       | 89.0             |\\n| MMMU-Pro (std/vision)            | **38.5**            | 21.8              | 29.9              | 32.4            | 36.9              | 34.4            | 45.1                           | **54.4**        | 54.3                       | 53.0             |\\n| **Visual science reasoning**     |                     |                   |                   |                 |                   |                 |                                |                 |                            |                  |\\n| ScienceQA Visual (img-test)      | **97.5**            | 91.3              | 79.4              | 96.2         
   | 87.7              | **97.3**        | 85.0                           | 88.3            | 81.2                       | 88.2             |\\n| **Visual math reasoning**        |                     |                   |                   |                 |                   |                 |                                |                 |                            |                  |\\n| MathVista (testmini)             | **62.4**            | 43.9              | 60.8              | 51.2            | **67.8**          | 56.7            | 57.6                           | 47.2            | 56.9                       | 56.1             |\\n| InterGPS                         | **48.6**            | 36.3              | 48.3              | 53.7            | 52.7              | 54.1            | 57.9                           | **65.4**        | 47.1                       | 49.1             |\\n| **Chart & table reasoning**      |                     |                   |                   |                 |                   |                 |                                |                 |                            |                  |\\n| AI2D                             | **82.3**            | 78.1              | 78.4              | 80.0            | 82.6              | 83.0            | 77.6                           | 82.1            | 70.6                       | **83.8**         |\\n| ChartQA                          | **81.4**            | 81.8              | 80.0              | 79.1            | **85.0**          | 81.0            | 73.0                           | 79.0            | 78.4                       | 75.1             |\\n| DocVQA                           | **93.2**            | 69.3              | 93.9              | 91.6            | **95.7**          | 93.0            | 91.2                           | 92.1            | 95.2                       | 90.9             |\\n| InfoVQA                          | **72.7**            | 36.6 
             | 77.1              | 72.1            | **82.6**          | 77.6            | 73.0                           | 77.8            | 74.3                       | 71.9             |\\n| **Document Intelligence**        |                     |                   |                   |                 |                   |                 |                                |                 |                            |                  |\\n| TextVQA (val)                    | **75.6**            | 72.0              | 76.8              | 70.9            | **77.7**          | 74.8            | 72.9                           | 74.4            | 58.6                       | 73.1             |\\n| OCR Bench                        | **84.4**            | 63.8              | 82.2              | 71.6            | **87.7**          | 74.8            | 75.7                           | 81.0            | 77.0                       | 77.7             |\\n| **Object visual presence verification** |              |                   |                   |                 |                   |                 |                                |                 |                            |                  |\\n| POPE                             | **85.6**            | 86.1              | 87.9              | 89.4            | 87.5              | **89.1**        | 87.5                           | 88.0            | 82.6                       | 86.5             |\\n| **Multi-image perception**       |                     |                   |                   |                 |                   |                 |                                |                 |                            |                  |\\n| BLINK                            | **61.3**            | 57.0              | 48.1              | 51.2            | 55.3              | 52.5            | 59.3                           | **64.0**        | 56.9                       | 62.4             |\\n| Video MME 16 
frames              | **55.0**            | 50.8              | 56.5              | 57.3            | 58.2              | 58.7            | 58.8                           | 65.5            | 60.2                       | **68.2**         |\\n| **Average**                      | **72.0**            | **60.9**          | **68.7**          | **68.8**        | **73.1**          | **71.1**        | **70.2**                       | **74.3**        | **69.1**                   | **72.4**         |\\n\\n\\n\\n#### Visual Perception\\n\\nBelow are the comparison results on existing multi-image tasks. On average, Phi-4-multimodal-instruct outperforms competitor models of the same size and competitive with much bigger models on multi-frame capabilities.\\nBLINK is an aggregated benchmark with 14 visual tasks that humans can solve very quickly but are still hard for current multimodal LLMs.\\n\\n| Dataset                    | Phi-4-multimodal-instruct | Qwen2.5-VL-3B-Instruct | InternVL 2.5-4B | Qwen2.5-VL-7B-Instruct | InternVL 2.5-8B | Gemini-2.0-Flash-Lite-prv-02-05 | Gemini-2.0-Flash | Claude-3.5-Sonnet-2024-10-22 | Gpt-4o-2024-11-20 |\\n|----------------------------|--------------------------|----------------------|-----------------|----------------------|-----------------|--------------------------------|-----------------|----------------------------|------------------|\\n| Art Style                  | **86.3**                 | 58.1                | 59.8           | 65.0                 | 65.0            | 76.9                           | 76.9            | 68.4                       | 73.5             |\\n| Counting                   | **60.0**                 | 67.5                | 60.0           | 66.7                 | **71.7**        | 45.8                           | 69.2            | 60.8                       | 65.0             |\\n| Forensic Detection         | **90.2**                 | 34.8                | 22.0           | 43.9                 | 37.9          
  | 31.8                           | 74.2            | 63.6                       | 71.2             |\\n| Functional Correspondence  | **30.0**                 | 20.0                | 26.9           | 22.3                 | 27.7            | 48.5                           | **53.1**        | 34.6                       | 42.3             |\\n| IQ Test                    | **22.7**                 | 25.3                | 28.7           | 28.7                 | 28.7            | 28.0                           | **30.7**        | 20.7                       | 25.3             |\\n| Jigsaw                     | **68.7**                 | 52.0                | **71.3**       | 69.3                 | 53.3            | 62.7                           | 69.3            | 61.3                       | 68.7             |\\n| Multi-View Reasoning       | **76.7**                 | 44.4                | 44.4           | 54.1                 | 45.1            | 55.6                           | 41.4            | 54.9                       | 54.1             |\\n| Object Localization        | **52.5**                 | 55.7                | 53.3           | 55.7                 | 58.2            | 63.9                           | **67.2**        | 58.2                       | 65.6             |\\n| Relative Depth             | **69.4**                 | 68.5                | 68.5           | 80.6                 | 76.6            | **81.5**                       | 72.6            | 66.1                       | 73.4             |\\n| Relative Reflectance       | **26.9**                 | **38.8**            | **38.8**       | 32.8                 | **38.8**        | 33.6                           | 34.3            | 38.1                       | 38.1             |\\n| Semantic Correspondence    | **52.5**                 | 32.4                | 33.8           | 28.8                 | 24.5            | **56.1**                       | 55.4            | 43.9                       | 
47.5             |\\n| Spatial Relation           | **72.7**                 | 80.4                | 86.0           | **88.8**             | 86.7            | 74.1                           | 79.0            | 74.8                       | 83.2             |\\n| Visual Correspondence      | **67.4**                 | 28.5                | 39.5           | 50.0                 | 44.2            | 84.9                           | **91.3**        | 72.7                       | 82.6             |\\n| Visual Similarity          | **86.7**                 | 67.4                | 88.1           | 87.4                 | 85.2            | **87.4**                       | 80.7            | 79.3                       | 83.0             |\\n| **Overall**                | **61.6**                 | **48.1**            | **51.2**       | **55.3**             | **52.5**        | **59.3**                       | **64.0**        | **56.9**                   | **62.4**         |\\n\\n\\n\\n\\n\\n## Usage\\n\\n### Requirements\\n\\nPhi-4 family has been integrated in the `4.48.2` version of `transformers`. The current `transformers` version can be verified with: `pip list | grep transformers`.\\nWe suggest to run with Python 3.10.\\nExamples of required packages:\\n```\\nflash_attn==2.7.4.post1\\ntorch==2.6.0\\ntransformers==4.48.2\\naccelerate==1.3.0\\nsoundfile==0.13.1\\npillow==11.1.0\\nscipy==1.15.2\\ntorchvision==0.21.0\\nbackoff==2.2.1\\npeft==0.13.2\\n```\\n\\nPhi-4-multimodal-instruct is also available in \\n\\n### Tokenizer\\n\\nPhi-4-multimodal-instruct supports a vocabulary size of up to `200064` tokens. 
The  already provide placeholder tokens that can be used for downstream fine-tuning, but they can also be extended up to the model\\'s vocabulary size.\\n\\n### Input Formats\\n\\nGiven the nature of the training data, the Phi-4-multimodal-instruct model is best suited for prompts using the chat format as follows:\\n\\n#### Text chat format\\n\\nThis format is used for general conversation and instructions:\\n\\n`\\nYou are a helpful assistant.How to explain Internet for a medieval knight?\\n`\\n\\n#### Tool-enabled function-calling format\\n\\nThis format is used when the user wants the model to provide function calls based on\\nthe given tools. The user should provide the available tools in the system prompt,\\nwrapped by  and  tokens. The tools should be specified in JSON format,\\nusing a JSON dump structure. Example:\\n\\n`\\nYou are a helpful assistant with some tools.[{\"name\": \"get_weather_updates\", \"description\": \"Fetches weather updates for a given city using the RapidAPI Weather API.\", \"parameters\": {\"city\": {\"description\": \"The name of the city for which to retrieve weather information.\", \"type\": \"str\", \"default\": \"London\"}}}]What is the weather like in Paris today?\\n`\\n\\n#### Vision-Language Format\\n\\nThis format is used for conversation with image:\\n\\n`\\nDescribe the image in detail.\\n`\\n\\nFor multiple images, the user needs to insert multiple image placeholders in the prompt as below:\\n\\n`\\nSummarize the content of the images.\\n`\\n\\n#### Speech-Language Format\\n\\nThis format is used for various speech and audio tasks:\\n\\n`\\n{task prompt}\\n`\\n\\nThe task prompt can vary for different task.\\nAutomatic Speech Recognition:\\n\\n`\\nTranscribe the audio clip into text.\\n`\\n\\nAutomatic Speech Translation:\\n\\n`\\nTranslate the audio to {lang}.\\n`\\n\\nAutomatic Speech Translation with chain-of-thoughts:\\n\\n`\\nTranscribe the audio to text, and then translate the audio to {lang}. 
Use  as a separator between the original transcript and the translation.\\n`\\n\\nSpoken-query Question Answering:\\n\\n`\\n\\n`\\n\\n#### Vision-Speech Format\\n\\nThis format is used for conversation with image and audio.\\nThe audio may contain query related to the image:\\n\\n`\\n\\n`\\n\\nFor multiple images, the user needs to insert multiple image placeholders in the prompt as below:\\n\\n`\\n\\n`\\n\\n**Vision**\\n- Any common RGB/gray image format (e.g., (\".jpg\", \".jpeg\", \".png\", \".ppm\", \".bmp\", \".pgm\", \".tif\", \".tiff\", \".webp\")) can be supported.\\n- Resolution depends on the GPU memory size. Higher resolution and more images will produce more tokens, thus using more GPU memory. During training, 64 crops can be supported.\\nIf it is a square image, the resolution would be around (8*448 by 8*448). For multiple-images, at most 64 frames can be supported, but with more frames as input, the resolution of each frame needs to be reduced to fit in the memory.\\n\\n**Audio**\\n- Any audio format that can be loaded by soundfile package should be supported.\\n- To keep the satisfactory performance, maximum audio length is suggested to be 40s. 
For summarization tasks, the maximum audio length is suggested to 30 mins.\\n\\n\\n### Loading the model locally\\n\\nAfter obtaining the Phi-4-multimodal-instruct model checkpoints, users can use this sample code for inference.\\n\\n\\n  Click to view details\\n\\n```python\\nimport requests\\nimport torch\\nimport os\\nimport io\\nfrom PIL import Image\\nimport soundfile as sf\\nfrom transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig\\nfrom urllib.request import urlopen\\n\\n\\n# Define model path\\nmodel_path = \"microsoft/Phi-4-multimodal-instruct\"\\n\\n# Load model and processor\\nprocessor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)\\nmodel = AutoModelForCausalLM.from_pretrained(\\n    model_path, \\n    device_map=\"cuda\", \\n    torch_dtype=\"auto\", \\n    trust_remote_code=True,\\n    # if you do not use Ampere or later GPUs, change attention to \"eager\"\\n    _attn_implementation=\\'flash_attention_2\\',\\n).cuda()\\n\\n# Load generation config\\ngeneration_config = GenerationConfig.from_pretrained(model_path)\\n\\n# Define prompt structure\\nuser_prompt = \\'\\'\\nassistant_prompt = \\'\\'\\nprompt_suffix = \\'\\'\\n\\n# Part 1: Image Processing\\nprint(\"\\\\n--- IMAGE PROCESSING ---\")\\nimage_url = \\'\\nprompt = f\\'{user_prompt}What is shown in this image?{prompt_suffix}{assistant_prompt}\\'\\nprint(f\\'>>> Prompt\\\\n{prompt}\\')\\n\\n# Download and open image\\nimage = Image.open(requests.get(image_url, stream=True).raw)\\ninputs = processor(text=prompt, images=image, return_tensors=\\'pt\\').to(\\'cuda:0\\')\\n\\n# Generate response\\ngenerate_ids = model.generate(\\n    **inputs,\\n    max_new_tokens=1000,\\n    generation_config=generation_config,\\n)\\ngenerate_ids = generate_ids[:, inputs[\\'input_ids\\'].shape[1]:]\\nresponse = processor.batch_decode(\\n    generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False\\n)[0]\\nprint(f\\'>>> Response\\\\n{response}\\')\\n\\n# 
Part 2: Audio Processing\\nprint(\"\\\\n--- AUDIO PROCESSING ---\")\\naudio_url = \"\\nspeech_prompt = \"Transcribe the audio to text, and then translate the audio to French. Use  as a separator between the original transcript and the translation.\"\\nprompt = f\\'{user_prompt}{speech_prompt}{prompt_suffix}{assistant_prompt}\\'\\nprint(f\\'>>> Prompt\\\\n{prompt}\\')\\n\\n# Downlowd and open audio file\\naudio, samplerate = sf.read(io.BytesIO(urlopen(audio_url).read()))\\n\\n# Process with the model\\ninputs = processor(text=prompt, audios=[(audio, samplerate)], return_tensors=\\'pt\\').to(\\'cuda:0\\')\\n\\ngenerate_ids = model.generate(\\n    **inputs,\\n    max_new_tokens=1000,\\n    generation_config=generation_config,\\n)\\ngenerate_ids = generate_ids[:, inputs[\\'input_ids\\'].shape[1]:]\\nresponse = processor.batch_decode(\\n    generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False\\n)[0]\\nprint(f\\'>>> Response\\\\n{response}\\')\\n```\\n\\n\\nMore inference examples can be found .\\n\\n### vLLM inference\\n\\nUser can start a server with this command\\n\\n```bash\\npython -m vllm.entrypoints.openai.api_server --model \\'microsoft/Phi-4-multimodal-instruct\\' --dtype auto --trust-remote-code --max-model-len 131072 --enable-lora --max-lora-rank 320 --lora-extra-vocab-size 0 --limit-mm-per-prompt audio=3,image=3 --max-loras 2 --lora-modules speech= vision=\\n```\\n\\nThe speech lora and vision lora folders are within the Phi-4-multimodal-instruct folder downloaded by vLLM, you can also use the following script to find thoses:\\n\\n```python\\nfrom huggingface_hub import snapshot_download\\nmodel_path = snapshot_download(repo_id=\"microsoft/Phi-4-multimodal-instruct\")\\nspeech_lora_path = model_path+\"/speech-lora\"\\nvision_lora_path = model_path+\"/vision-lora\"\\n```\\n\\n## Training\\n\\n### Fine-tuning\\n\\nA basic example of supervised fine-tuning (SFT) for  and  is provided respectively.\\n\\nAn example on \\n\\n### Model\\n\\n+ 
**Architecture:** Phi-4-multimodal-instruct has 5.6B parameters and is a multimodal transformer model. The model has the pretrained Phi-4-Mini-Instruct as the backbone language model, and the advanced encoders and adapters of vision and speech.\\n+ **Inputs:** Text, image, and audio. It is best suited for prompts using the chat format.\\n+ **Context length:** 128K tokens\\n+ **GPUs:** 512 A100-80G\\n+ **Training time:** 28 days\\n+ **Training data:** 5T tokens, 2.3M speech hours, and 1.1T image-text tokens\\n+ **Outputs:** Generated text in response to the input\\n+ **Dates:** Trained between December 2024 and January 2025\\n+ **Status:** This is a static model trained on offline datasets with the cutoff date of June 2024 for publicly available data.\\n+ **Supported languages:** \\n  + Text: Arabic, Chinese, Czech, Danish, Dutch, English, Finnish, French, German, Hebrew, Hungarian, Italian, Japanese, Korean, Norwegian, Polish, Portuguese, Russian, Spanish, Swedish, Thai, Turkish, Ukrainian\\n  + Vision: English\\n  + Audio: English, Chinese, German, French, Italian, Japanese, Spanish, Portuguese\\n+ **Release date:** February 2025\\n\\n### Training Datasets\\n\\nPhi-4-multimodal-instruct\\'s training data includes a wide variety of sources, totaling 5 trillion text tokens, and is a combination of \\n1) publicly available documents filtered for quality, selected high-quality educational data, and code\\n2) newly created synthetic, “textbook-like” data for the purpose of teaching math, coding, common sense reasoning, general knowledge of the world (e.g., science, daily activities, theory of mind, etc.)\\n3) high quality human labeled data in chat format\\n4) selected high-quality image-text interleave data\\n5) synthetic and publicly available image, multi-image, and video data\\n6) anonymized in-house speech-text pair data with strong/weak transcriptions\\n7) selected high-quality publicly available and anonymized in-house speech data with task-specific 
supervisions\\n8) selected synthetic speech data\\n9) synthetic vision-speech data.\\n\\nFocus was placed on the quality of data that could potentially improve the reasoning ability for the model, and the publicly available documents were filtered to contain a preferred level of knowledge. As an example, the result of a game in premier league on a particular day might be good training data for large foundation models, but such information was removed for the Phi-4-multimodal-instruct to leave more model capacity for reasoning for the model\\'s small size. The data collection process involved sourcing information from publicly available documents, with a focus on filtering out undesirable documents and images. To safeguard privacy, image and text data sources were filtered to remove or scrub potentially personal data from the training data.\\nThe decontamination process involved normalizing and tokenizing the dataset, then generating and comparing n-grams between the target dataset and benchmark datasets. Samples with matching n-grams above a threshold were flagged as contaminated and removed from the dataset. A detailed contamination report was generated, summarizing the matched text, matching ratio, and filtered results for further analysis. \\n\\n### Software\\n* \\n* \\n* \\n* \\n* \\n* \\n\\n### Hardware\\nNote that by default, the Phi-4-multimodal-instruct model uses flash attention, which requires certain types of GPU hardware to run. We have tested on the following GPU types:\\n* NVIDIA A100\\n* NVIDIA A6000\\n* NVIDIA H100\\n\\nIf you want to run the model on:\\n* NVIDIA V100 or earlier generation GPUs: call AutoModelForCausalLM.from_pretrained() with _attn_implementation=\"eager\"\\n\\n\\n## Responsible AI Considerations\\n\\n  Click to view detail descriptions\\n\\nLike other language models, the Phi family of models can potentially behave in ways that are unfair, unreliable, or offensive. 
Some of the limiting behaviors to be aware of include:   \\n+ Quality of Service: The Phi models are trained primarily on English language content across text, speech, and visual inputs, with some additional multilingual coverage. Performance may vary significantly across different modalities and languages:\\n  + Text: Languages other than English will experience reduced performance, with varying levels of degradation across different non-English languages. English language varieties with less representation in the training data may perform worse than standard American English.\\n  + Speech: Speech recognition and processing shows similar language-based performance patterns, with optimal performance for standard American English accents and pronunciations. Other English accents, dialects, and non-English languages may experience lower recognition accuracy and response quality. Background noise, audio quality, and speaking speed can further impact performance.\\n  + Vision: Visual processing capabilities may be influenced by cultural and geographical biases in the training data. The model may show reduced performance when analyzing images containing text in non-English languages or visual elements more commonly found in non-Western contexts. Image quality, lighting conditions, and composition can also affect processing accuracy.\\n+ Multilingual performance and safety gaps: We believe it is important to make language models more widely available across different languages, but the Phi 4 models still exhibit challenges common across multilingual releases. 
As with any deployment of LLMs, developers will be better positioned to test for performance or safety gaps for their linguistic and cultural context and customize the model with additional fine-tuning and appropriate safeguards.\\n+ Representation of Harms & Perpetuation of Stereotypes: These models can over- or under-represent groups of people, erase representation of some groups, or reinforce demeaning or negative stereotypes. Despite safety post-training, these limitations may still be present due to differing levels of representation of different groups, cultural contexts, or prevalence of examples of negative stereotypes in training data that reflect real-world patterns and societal biases. \\n+ Inappropriate or Offensive Content: These models may produce other types of inappropriate or offensive content, which may make it inappropriate to deploy for sensitive contexts without additional mitigations that are specific to the case. \\n+ Information Reliability: Language models can generate nonsensical content or fabricate content that might sound reasonable but is inaccurate or outdated.   \\n+ Limited Scope for Code: The majority of Phi 4 training data is based in Python and uses common packages such as \"typing, math, random, collections, datetime, itertools\". If the model generates Python scripts that utilize other packages or scripts in other languages, it is strongly recommended that users manually verify all API uses.\\n+ Long Conversation: Phi 4 models, like other models, can in some cases generate responses that are repetitive, unhelpful, or inconsistent in very long chat sessions in both English and non-English languages. Developers are encouraged to place appropriate mitigations, like limiting conversation turns to account for the possible conversational drift.\\n+ Inference of Sensitive Attributes: The Phi 4 models can sometimes attempt to infer sensitive attributes (such as personality characteristics, country of origin, gender, etc...) 
from the users’ voices when specifically asked to do so. Phi 4-multimodal-instruct is not designed or intended to be used as a biometric categorization system to categorize individuals based on their biometric data to deduce or infer their race, political opinions, trade union membership, religious or philosophical beliefs, sex life, or sexual orientation. This behavior can be easily and efficiently mitigated at the application level by a system message.\\n  \\nDevelopers should apply responsible AI best practices, including mapping, measuring, and mitigating risks associated with their specific use case and cultural, linguistic context. Phi 4 family of models are general purpose models. As developers plan to deploy these models for specific use cases, they are encouraged to fine-tune the models for their use case and leverage the models as part of broader AI systems with language-specific safeguards in place. Important areas for consideration include:\\n\\n+ Allocation: Models may not be suitable for scenarios that could have consequential impact on legal status or the allocation of resources or life opportunities (ex: housing, employment, credit, etc.) without further assessments and additional debiasing techniques.\\n+ High-Risk Scenarios: Developers should assess the suitability of using models in high-risk scenarios where unfair, unreliable or offensive outputs might be extremely costly or lead to harm. This includes providing advice in sensitive or expert domains where accuracy and reliability are critical (ex: legal or health advice). Additional safeguards should be implemented at the application level according to the deployment context. \\n+ Misinformation: Models may produce inaccurate information. Developers should follow transparency best practices and inform end-users they are interacting with an AI system. 
At the application level, developers can build feedback mechanisms and pipelines to ground responses in use-case specific, contextual information, a technique known as Retrieval Augmented Generation (RAG).   \\n+ Generation of Harmful Content: Developers should assess outputs for their context and use available safety classifiers or custom solutions appropriate for their use case. \\n+ Misuse: Other forms of misuse such as fraud, spam, or malware production may be possible, and developers should ensure that their applications do not violate applicable laws and regulations.\\n\\n\\n## Safety\\n\\n  Click to view detail descriptions\\n\\nThe Phi-4 family of models has adopted a robust safety post-training approach. This approach leverages a variety of both open-source and in-house generated datasets. The overall technique employed for safety alignment is a combination of SFT (Supervised Fine-Tuning), DPO (Direct Preference Optimization), and RLHF (Reinforcement Learning from Human Feedback) approaches by utilizing human-labeled and synthetic English-language datasets, including publicly available datasets focusing on helpfulness and harmlessness, as well as various questions and answers targeted to multiple safety categories. For non-English languages, existing datasets were extended via machine translation. Speech Safety datasets were generated by running Text Safety datasets through Azure TTS (Text-To-Speech) Service, for both English and non-English languages. Vision (text & images) Safety datasets were created to cover harm categories identified both in public and internal multi-modal RAI datasets.\\n\\n### Safety Evaluation and Red-Teaming\\n\\nVarious evaluation techniques including red teaming, adversarial conversation simulations, and multilingual safety evaluation benchmark datasets were leveraged to evaluate Phi-4 models\\' propensity to produce undesirable outputs across multiple languages and risk categories. 
Several approaches were used to compensate for the limitations of one approach alone. Findings across the various evaluation methods indicate that safety post-training that was done as detailed in the  had a positive impact across multiple languages and risk categories as observed by refusal rates (refusal to output undesirable outputs) and robustness to jailbreak techniques. Details on prior red team evaluations across Phi models can be found in the . For this release, the red teaming effort focused on the newest Audio input modality and on the following safety areas: harmful content, self-injury risks, and exploits. The model was found to be more susceptible to providing undesirable outputs when attacked with context manipulation or persuasive techniques. These findings applied to all languages, with the persuasive techniques mostly affecting French and Italian. This highlights the need for industry-wide investment in the development of high-quality safety evaluation datasets across multiple languages, including low resource languages, and risk areas that account for cultural nuances where those languages are spoken.\\n\\n### Vision Safety Evaluation\\n\\nTo assess model safety in scenarios involving both text and images, Microsoft\\'s Azure AI Evaluation SDK was utilized. This tool facilitates the simulation of single-turn conversations with the target model by providing prompt text and images designed to incite harmful responses. The target model\\'s responses are subsequently evaluated by a capable model across multiple harm categories, including violence, sexual content, self-harm, hateful and unfair content, with each response scored based on the severity of the harm identified. The evaluation results were compared with those of Phi-3.5-Vision and open-source models of comparable size. 
In addition, we ran both an internal and the public RTVLM and VLGuard multi-modal (text & vision) RAI benchmarks, once again comparing scores with Phi-3.5-Vision and open-source models of comparable size. However, the model may be susceptible to language-specific attack prompts and cultural context.\\n\\n### Audio Safety Evaluation\\n\\nIn addition to extensive red teaming, the Safety of the model was assessed through three distinct evaluations. First, as performed with Text and Vision inputs, Microsoft\\'s Azure AI Evaluation SDK was leveraged to detect the presence of harmful content in the model\\'s responses to Speech prompts. Second,  was run to verify that Speech-To-Text transcription worked well across a variety of demographics. Third, we proposed and evaluated a mitigation approach via a system message to help prevent the model from inferring sensitive attributes (such as gender, sexual orientation, profession, medical condition, etc...) from the voice of a user.\\n\\n  \\n## License\\nThe model is licensed under the .\\n\\n## Trademarks\\nThis project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft trademarks or logos is subject to and must follow\\u202f. Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. Any use of third-party trademarks or logos are subject to those third-party\\'s policies.\\n\\n\\n## Appendix A: Benchmark Methodology\\n\\n\\n  Click to view detail descriptions\\n\\nWe include a brief word on methodology here - and in particular, how we think about optimizing prompts.\\nIn an ideal world, we would never change any prompts in our benchmarks to ensure it is always an apples-to-apples comparison when comparing different models. Indeed, this is our default approach, and is the case in the vast majority of models we have run to date.\\nThere are, however, some exceptions to this. 
In some cases, we see a model that performs worse than expected on a given eval due to a failure to respect the output format. For example:\\n\\n+ A model may refuse to answer questions (for no apparent reason), or in coding tasks models may prefix their response with “Sure, I can help with that. …” which may break the parser. In such cases, we have opted to try different system messages (e.g. “You must always respond to a question” or “Get to the point!”).\\n+ Some models, we observed that few shots actually hurt model performance. In this case we did allow running the benchmarks with 0-shots for all cases.\\n+ We have tools to convert between chat and completions APIs. When converting a chat prompt to a completion prompt, some models have different keywords e.g. Human vs User. In these cases, we do allow for model-specific mappings for chat to completion prompts.\\n\\nHowever, we do not:\\n\\n+ Pick different few-shot examples. Few shots will always be the same when comparing different models.\\n+ Change prompt format: e.g. if it is an A/B/C/D multiple choice, we do not tweak this to 1/2/3/4 multiple choice.\\n\\n### Vision Benchmark Settings\\n\\nThe goal of the benchmark setup is to measure the performance of the LMM when a regular user utilizes these models for a task involving visual input. To this end, we selected 9 popular and publicly available single-frame datasets and 3 multi-frame benchmarks that cover a wide range of challenging topics and tasks (e.g., mathematics, OCR tasks, charts-and-plots understanding, etc.) as well as a set of high-quality models. \\nOur benchmarking setup utilizes zero-shot prompts and all the prompt content are the same for every model. We only formatted the prompt content to satisfy the model\\'s prompt API. This ensures that our evaluation is fair across the set of models we tested. Many benchmarks necessitate models to choose their responses from a presented list of options. 
Therefore, we\\'ve included a directive in the prompt\\'s conclusion, guiding all models to pick the option letter that corresponds to the answer they deem correct.\\nIn terms of the visual input, we use the images from the benchmarks as they come from the original datasets. We converted these images to base-64 using a JPEG encoding for models that require this format (e.g., GPTV, Claude Sonnet 3.5, Gemini 1.5 Pro/Flash). For other models (e.g., Llava Interleave, and InternVL2 4B and 8B), we used their Huggingface interface and passed in PIL images or a JPEG image stored locally. We did not scale or pre-process images in any other way.\\nLastly, we used the same code to extract answers and evaluate them using the same code for every considered model. This ensures that we are fair in assessing the quality of their answers.\\n\\n### Speech Benchmark Settings\\n\\nThe objective of this benchmarking setup is to assess the performance of models in speech and audio understanding tasks as utilized by regular users. To accomplish this, we selected several state-of-the-art open-sourced and closed-sourced models and performed evaluations across a variety of public and in-house benchmarks. These benchmarks encompass diverse and challenging topics, including Automatic Speech Recognition (ASR), Automatic Speech Translation (AST), Spoken Query Question Answering (SQQA), Audio Understanding (AU), and Speech Summarization.\\nThe results are derived from evaluations conducted on identical test data without any further clarifications. All results were obtained without sampling during inference. For an accurate comparison, we employed consistent prompts for models across different tasks, except for certain model APIs (e.g., GPT-4o), which may refuse to respond to specific prompts for some tasks.\\nIn conclusion, we used uniform code to extract answers and evaluate them for all considered models. 
This approach ensured fairness by assessing the quality of their responses.\\n\\n### Benchmark datasets\\n\\nThe model was evaluated across a breadth of public and internal benchmarks to understand it\\'s capabilities under multiple tasks and conditions. While most evaluations use English, multilingual benchmark was incorporated to cover performance in select languages.  More specifically,\\n+ Vision: \\n  + Popular aggregated benchmark:\\n    + MMMU and MMMU-Pro: massive multi-discipline tasks at college-level subject knowledge and deliberate reasoning.\\n\\t+ MMBench: large-scale benchmark to evaluate perception and reasoning capabilities.\\n  +\\tVisual reasoning:\\n    + ScienceQA: multimodal visual question answering on science.\\n\\t+ MathVista: visual math reasoning.\\n\\t+ InterGPS: Visual 2D geometry reasoning.\\n  +\\tChart reasoning:\\n\\t+ ChartQA: visual and logical reasoning on charts.\\n\\t+ AI2D: diagram understanding.\\n  +\\tDocument Intelligence:\\n\\t+ TextVQA: read and reason about text in images to answer questions about them.\\n\\t+ InfoVQA: read and reason about high-resolution infographics images with arbitrary aspect ratios.\\n\\t+ DocVQA: read and reason about document images with dense texts and handwritten texts.\\n\\t+ OCRBench: test OCR and QA capability on diverse text related images.\\n  +\\tVision speech multimodal understanding:\\n\\t+ s_AI2D: diagram understanding with speech as the question format.\\n\\t+ s_ChartQA: visual and logical reasoning on charts with speech as the question format.\\n\\t+ s_InfoVQA: read and reason about high-resolution infographics images with speech as the question format.\\n\\t+ s_DocVQA: read and reason about document images with dense texts and handwritten texts with speech as the question format.\\n  + RAI & Security Benchmarks:\\n\\t+ VLGuardExt: VLGuard is a vision-language instruction following public dataset for model safety to address safety on deception\\n    discrimination, privacy and risky 
behavior (advice, sexual, violence, political). This was extended to a few internal categories such as child safety and election critical information.\\n\\t+ RTVLM: Public benchmark for red-teaming vision-language model on model truthfulness, privacy, safety, and fairness.\\n\\t+ GPTV-RAI: In-house benchmark for GPT-4V released from Azure AI, measuring harmfulness (ex. sexual, violent, hate and self-harm), privacy, jailbreak, misinformation.\\n\\n+ Speech: \\n  + CommonVoice v15 is an open-source, multilingual speech dataset developed by Mozilla. It includes over 33,000 hours of speech data in 133 languages, contributed and validated by volunteers worldwide.The evaluations were conducted in the eight supported languages.\\n  + The OpenASR Leaderboard on Hugging Face is designed for benchmarking and evaluating the robustness of ASR models on English. The datasets in the leaderboard cover diverse speech domains including reading speech, conversations, meetings, and so on.\\n  + CoVoST2 is a multilingual speech-to-text translation dataset derived from Mozilla\\'s Common Voice project. It is one of the largest open datasets available for speech translation, providing support for both X-to-English (X→En) and English-to-X (En→X) translation tasks. The directions with supported languages were evaluated on the test sets.\\n  + FLEURS is a multilingual speech dataset designed for evaluating speech recognition and speech-to-text translation models across a wide range of languages. The test sets for speech recognition and translation tasks were evaluated with the eight supported languages.\\n  + MT Bench (Multi-turn Benchmark) is specifically designed to evaluate the conversational and instruction-following abilities of AI models in multi-turn question-answering (QA) scenarios. 
To support spoken questions, the text is synthesized into speech.\\n  + MMMLU (Multilingual Massive Multitask Language Understanding) is an extensive benchmark designed to evaluate the general knowledge and reasoning capabilities of AI models across a wide array of subjects. To support spoken questions, the text is synthesized into its speech counterpart.  The model was evaluated on the eight supported languages for this test set. \\n  + AIR-Bench Chat (Audio Instruction and Response Benchmark) is a comprehensive evaluation framework designed to test the capabilities of large audio language models (LALMs). It includes both foundation and chat benchmarks. The chat benchmark is selected for its open-ended question answering for audio capability.\\n  + MMAU (Massive Multi-Task Audio Understanding) is a comprehensive dataset designed to evaluate the capabilities of multi-modal models in audio-based understanding and reasoning tasks. The test sets are in the form of multiple-choices QA, covering the categories of music, sound, and speech.\\n  + Golden3 is a real-world meeting dataset, containing 108 meeting recordings with corresponding transcripts, averaging 6 minutes each. It is recorded across 30 conference rooms, featuring 4-8 attendees. The dataset is primarily in English, covering a wide range of topics. GPT4 is employed to generate summarization instructions that ask to summarize partial or the entire conversation or control the output style/length/structure.\\n  + AMI (Augmented Multi-Party Interaction) is a comprehensive collection of meeting recordings, encompassing approximately 100 hours of data. The test split contains 20 meeting recordings with an average duration of 32 minutes. The model was tested on the close-talking version of audio. 
GPT4 is employed to generate summarization instructions that ask to summarize partial or the entire conversation or control the output style/length/structure.\\n\\n+ Safety and RAI:\\n  + Single-turn trustworthiness evaluation:\\n    + DecodingTrust: DecodingTrust is a collection of trustworthiness benchmarks in eight different perspectives\\n    + XSTest: XSTest is an exaggerated safety evaluation\\n    + Toxigen: Toxigen is adversarial and hate speech detection\\n  + Red Team:\\n    + Responses to prompts provided by AI Red Team at Microsoft\\n\\n\\n\\n## Appendix B: Fine-tuning Korean speech\\n\\n\\n  Click to view detail descriptions\\n\\n### Overview and Datasets\\n\\nPhi-4-multimodal is originally not designed for Korean speech-to-text task, but it can be fine-tuned for Korean speech-to-text task using your own data or public Korean speech datasets.\\n\\nWe have fine-tuned Phi-4-multimodal model for Korean speech-to-text task using the following datasets:\\n\\n- kresnik/zeroth_korean\\n- mozilla-foundation/common_voice_17_0 (Used Korean speech only)\\n- PolyAI/minds14 (Used Korean speech only)\\n- Custom dataset. The speech was a mix of fast and slow speech (Technical blog contents and presentations that the author have posted), with some modulation using  and \\n\\nTotal 35K samples. Each sample is a pair of Korean speech and its transcription. Dataset was sampled 16kHz.\\n\\nYou can download the fine-tuned model . Please refer to the Jupyter notebook and video clips in the . 
They are not production-quality as they were simply fine-tuned for PoC purposes, but you can see that they transcribe and translate with high accuracy even when a native speaker speaks quite quickly.\\n\\n### Requirements\\nBased on Python 3.10, the following packages are required, and A100/H100 GPU is recommended.\\n```\\ntorch==2.6.0\\ntransformers==4.48.2\\naccelerate==1.4.0\\nsoundfile==0.13.1\\npillow==11.1.0\\nscipy==1.15.2\\ntorchvision==0.21.0\\nbackoff==2.2.1\\npeft==0.14.0\\ndatasets==3.3.2\\npandas==2.2.3\\nflash_attn==2.7.4.post1\\nevaluate==0.4.3\\nsacrebleu==2.5.1  \\n```\\n\\n### Training\\nThe model was trained on a single A100 80GB GPU for 4 epochs with a batch size of 16 using the `sample_finetune_speech.py` script from \\n\\nThe fine tuning script and command line are basically the same as , but you need to prepare your own dataset. Also, to perform audio encoder unfreeze, please refer to the code snippet below. The code snippet is retrieved from .\\n\\n```python\\nwith accelerator.local_main_process_first():\\n    processor = AutoProcessor.from_pretrained(\\n        \"microsoft/Phi-4-multimodal-instruct\",\\n        trust_remote_code=True,\\n    )\\n    model = create_model(\\n        args.model_name_or_path,\\n        use_flash_attention=args.use_flash_attention,\\n    )\\n\\ndef unfreeze_speech_components(model):\\n    \"\"\"Directly target verified components from your debug logs\"\"\"\\n    # 1. Audio Embed Module (confirmed exists)\\n    audio_embed = model.model.embed_tokens_extend.audio_embed\\n\\n    # 2. Entire Audio Encoder (simplified)\\n    audio_encoder = audio_embed.encoder  # Direct access\\n\\n    # 3. 
Audio Projection (from debug logs)\\n    audio_projection = audio_embed.audio_projection\\n\\n    # Unfreeze ONLY these 3 components\\n    for component in [audio_embed, audio_encoder, audio_projection]:\\n        for param in component.parameters():\\n            param.requires_grad = True\\n    return model\\n\\nmodel = unfreeze_speech_components(model)\\n\\n# Verify unfrozen parameters\\ntrainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)\\nprint(f\"Trainable parameters: {trainable_params:,}\")\\n\\n# After unfreezing\\nencoder_params = list(model.model.embed_tokens_extend.audio_embed.encoder.parameters())\\nproj_params = list(model.model.embed_tokens_extend.audio_embed.audio_projection.parameters())\\n\\nassert any(p.requires_grad for p in encoder_params), \"Encoder params frozen!\"\\nassert any(p.requires_grad for p in proj_params), \"Projection params frozen!\"\\nprint(\"Components properly unfrozen ✅\")    \\n```\\n\\nExample commands to run finetuning scripts are as follows:\\n```bash\\npython main.py\\n```\\n\\nThe latest version of the model currently uploaded was fine-tuned by **unfreezing the audio encoder**, and the ASR performance was significantly improved compared to the baseline LoRA adapter-based fine-tuning. \\nComparing the full fine-tuning and LoRA fine-tuning, the CER on zeroth-test set is **1.61%** and 2.72%, and the WER on zeroth-test set is **3.54%** and 7.19%, respectively. Please refer to the  for more details.\\n\\n### Experimental Settings and Results\\nThe purpose of this benchmarking setup is to evaluate the basic performance of Korean audio in speech and audio understanding tasks. 
We did this for automatic speech recognition and automatic speech translation, and the test data used the following datasets and samples:\\n\\nEvaluation was done on the following datasets:\\n+ ASR (Automatic Speech Recognition): Evaluated with CER (Character Error Rate) and WER (Word Error Rate) on .\\n+ AST (Automatic Speech Translation): Evaluated with BLEU score on .\\n\\nEvaluation Script is retrieved from \\n\\nWe used the  as a baseline to improve performance, as it showed significant performance improvement with 1 epoch. Note that the baseline was trained with  for 1 epoch. Based on this baseline with 35K training samples, we conducted additional experiments with the following scenarios:\\n\\n+ [Case 1] LoRA finetune (1 epoch): LoRA adapter-based fine-tuning for 1 epochs\\n+ [Case 2] LoRA finetune (4 epochs): LoRA adapter-based fine-tuning for 4 epochs\\n+ [Case 3] Unfreeze audio encoder finetune (4 epochs): Full fine-tuning for 4 epochs. \\n\\nThe results of the experiments are as follows:\\n+ CER and WER for zeroth-test set (Lower is better)\\n  + Case 1\\'s CER and WER are 3.80% and 11.52%, respectively, which are better than the baseline (7.02% and 17.31%).\\n  + Case 2\\'s CER and WER are 2.72% and 7.19%, respectively, which are better than Case 1.\\n  + Case 3\\'s CER and WER are 1.61% and 3.54%, respectively, which are the best among the cases.\\n\\n+ BLEU score for fleurs ko  en speech translation test set (Higher is better)\\n  + Case 1\\'s result is not improved compared to the baseline. 
Especially, the BLEU score for fleurs-ko2en-cot is decreased compared to the baseline.\\n  + Case 2\\'s result is slightly improved compared to Case 1, which is the best among the cases.\\n  + Case 3\\'s result is not improved compared to the baseline and Case 2.\\n  \\n| Model                          | zeroth (CER) | zeroth (WER) | fleurs-ko2en | fleurs-ko2en-cot | fleurs-en2ko | fleurs-en2ko-cot |\\n|--------------------------------|-------------|-------------|--------------|------------------|--------------|------------------|\\n| original                       | 99.16       | 99.63       | 5.63         | 2.42             | 6.86         | 4.17             |\\n| Ours - speech full finetune (4 epochs) | 1.61        | 3.54        | 7.67         | 8.38             | 12.31        | 9.69             |\\n| LoRA finetune (4 epochs)        | 2.72        | 7.19        | 7.11         | 9.95             | 13.22        | 10.45            |\\n| LoRA finetune (1 epoch)         | 3.80        | 11.52       | 7.03         | 7.04             | 12.50        | 9.54             |\\n| Phi-4-mm-inst-zeroth-kor        | 7.02        | 17.31       | 7.07         | 9.19             | 13.08        | 9.35             |\\n\\n## Cautions\\n\\nNote that this model is just a PoC/experimental purpose, and not intended to be used in production. More high-quality data, tuning, ablation studies, and experiments are needed.\\n\\nPhi-4-multimodal model is strong in multimodal tasks, especially in speech-to-text and high potential in Korean language tasks. Thus if you are interested in Korean speech-to-text task, this model can be a good starting point.\\n\\n## References\\n\\n- \\n- \\n\\n## Data Summary\\n\\n\\n',\n",
       "  'domain': 'visual-question-answering'},\n",
       " {'model_id': 'prithivMLmods/Common-Voice-Gender-Detection',\n",
       "  'created_at': '2025-05-31T10:39:16+00:00',\n",
       "  'downloads': 206572,\n",
       "  'likes': 18,\n",
       "  'author': None,\n",
       "  'tags': ['transformers',\n",
       "   'safetensors',\n",
       "   'wav2vec2',\n",
       "   'audio-classification',\n",
       "   'voice-gender-detection',\n",
       "   'male',\n",
       "   'female',\n",
       "   'biology',\n",
       "   'SFT',\n",
       "   'en',\n",
       "   'arxiv:2006.11477',\n",
       "   'base_model:facebook/wav2vec2-base-960h',\n",
       "   'base_model:finetune:facebook/wav2vec2-base-960h',\n",
       "   'doi:10.57967/hf/5684',\n",
       "   'license:apache-2.0',\n",
       "   'endpoints_compatible',\n",
       "   'region:us'],\n",
       "  'modelcard': '\\n\\n\\n# Common-Voice-Gender-Detection\\n\\n> **Common-Voice-Gender-Detection** is a fine-tuned version of `facebook/wav2vec2-base-960h` for **binary audio classification**, specifically trained to detect speaker gender as **female** or **male**. This model leverages the `Wav2Vec2ForSequenceClassification` architecture for efficient and accurate voice-based gender classification.\\n\\n> [!note]\\nWav2Vec2: Self-Supervised Learning for Speech Recognition : \\n\\n```py\\nClassification Report:\\n\\n              precision    recall  f1-score   support\\n\\n      female     0.9705    0.9916    0.9809      2622\\n        male     0.9943    0.9799    0.9870      3923\\n\\n    accuracy                         0.9846      6545\\n   macro avg     0.9824    0.9857    0.9840      6545\\nweighted avg     0.9848    0.9846    0.9846      6545\\n```\\n\\n\\n\\n\\n\\n---\\n\\n## Label Space: 2 Classes\\n\\n```\\nClass 0: female  \\nClass 1: male\\n```\\n\\n---\\n\\n## Install Dependencies\\n\\n```bash\\npip install gradio transformers torch librosa hf_xet\\n```\\n\\n---\\n\\n## Inference Code\\n\\n```python\\nimport gradio as gr\\nfrom transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor\\nimport torch\\nimport librosa\\n\\n# Load model and processor\\nmodel_name = \"prithivMLmods/Common-Voice-Geneder-Detection\"\\nmodel = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)\\nprocessor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)\\n\\n# Label mapping\\nid2label = {\\n    \"0\": \"female\",\\n    \"1\": \"male\"\\n}\\n\\ndef classify_audio(audio_path):\\n    # Load and resample audio to 16kHz\\n    speech, sample_rate = librosa.load(audio_path, sr=16000)\\n\\n    # Process audio\\n    inputs = processor(\\n        speech,\\n        sampling_rate=sample_rate,\\n        return_tensors=\"pt\",\\n        padding=True\\n    )\\n\\n    with torch.no_grad():\\n        outputs = model(**inputs)\\n        logits = 
outputs.logits\\n        probs = torch.nn.functional.softmax(logits, dim=1).squeeze().tolist()\\n\\n    prediction = {\\n        id2label[str(i)]: round(probs[i], 3) for i in range(len(probs))\\n    }\\n\\n    return prediction\\n\\n# Gradio Interface\\niface = gr.Interface(\\n    fn=classify_audio,\\n    inputs=gr.Audio(type=\"filepath\", label=\"Upload Audio (WAV, MP3, etc.)\"),\\n    outputs=gr.Label(num_top_classes=2, label=\"Gender Classification\"),\\n    title=\"Common Voice Gender Detection\",\\n    description=\"Upload an audio clip to classify the speaker\\'s gender as female or male.\"\\n)\\n\\nif __name__ == \"__main__\":\\n    iface.launch()\\n```\\n\\n---\\n\\n## Demo Inference\\n\\n> [!note]\\nmale\\n\\n\\n\\n\\n\\n\\n> [!note]\\nfemale\\n\\n\\n\\n\\n\\n--- \\n\\n## Intended Use\\n\\n`Common-Voice-Gender-Detection` is designed for:\\n\\n* **Speech Analytics** – Assist in analyzing speaker demographics in call centers or customer service recordings.\\n* **Conversational AI Personalization** – Adjust tone or dialogue based on gender detection for more personalized voice assistants.\\n* **Voice Dataset Curation** – Automatically tag or filter voice datasets by speaker gender for better dataset management.\\n* **Research Applications** – Enable linguistic and acoustic research involving gender-specific speech patterns.\\n* **Multimedia Content Tagging** – Automate metadata generation for gender identification in podcasts, interviews, or video content.  ',\n",
       "  'domain': 'audio-classification'},\n",
       " {'model_id': 'depth-anything/DA3METRIC-LARGE',\n",
       "  'created_at': '2025-11-13T18:49:18+00:00',\n",
       "  'downloads': 197184,\n",
       "  'likes': 9,\n",
       "  'author': None,\n",
       "  'tags': ['depth-anything-3',\n",
       "   'safetensors',\n",
       "   'depth-estimation',\n",
       "   'computer-vision',\n",
       "   'monocular-depth',\n",
       "   'multi-view-geometry',\n",
       "   'pose-estimation',\n",
       "   'license:apache-2.0',\n",
       "   'region:us'],\n",
       "  'modelcard': '\\n# Depth Anything 3: DA3METRIC-LARGE\\n\\n\\n\\n\\n\\n  # noqa: E501\\n\\n\\n\\n\\n## Model Description\\n\\nDA3 Metric Large model specialized for metric depth estimation in monocular settings, ideal for applications requiring real-world scale. Canonical metric depth; multiplying by focal length gives metric depth.\\n\\n| Property | Value |\\n|----------|-------|\\n| **Model Series** | Monocular Metric Depth |\\n| **Parameters** | 0.35B |\\n| **License** | Apache 2.0 |\\n\\n\\n\\n## Capabilities\\n\\n- ✅ Relative Depth\\n- ✅ Metric Depth\\n- ✅ Sky Segmentation\\n\\n## Quick Start\\n\\n### Installation\\n\\n```bash\\ngit clone \\ncd depth-anything-3\\npip install -e .\\n```\\n\\n### Basic Example\\n\\n```python\\nimport torch\\nfrom depth_anything_3.api import DepthAnything3\\n\\n# Load model from Hugging Face Hub\\ndevice = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\\nmodel = DepthAnything3.from_pretrained(\"depth-anything/da3metric-large\")\\nmodel = model.to(device=device)\\n\\n# Run inference on images\\nimages = [\"image1.jpg\", \"image2.jpg\"]  # List of image paths, PIL Images, or numpy arrays\\nprediction = model.inference(\\n    images,\\n    export_dir=\"output\",\\n    export_format=\"glb\"  # Options: glb, npz, ply, mini_npz, gs_ply, gs_video\\n)\\n\\n# Access results\\nprint(prediction.depth.shape)        # Depth maps: [N, H, W] float32\\nprint(prediction.conf.shape)         # Confidence maps: [N, H, W] float32\\nprint(prediction.extrinsics.shape)   # Camera poses (w2c): [N, 3, 4] float32\\nprint(prediction.intrinsics.shape)   # Camera intrinsics: [N, 3, 3] float32\\n```\\n\\n### Command Line Interface\\n\\n```bash\\n# Process images with auto mode\\nda3 auto path/to/images \\\\\\n    --export-format glb \\\\\\n    --export-dir output \\\\\\n    --model-dir depth-anything/da3metric-large\\n\\n# Use backend for faster repeated inference\\nda3 backend --model-dir depth-anything/da3metric-large\\nda3 auto 
path/to/images --export-format glb --use-backend\\n```\\n\\n## Model Details\\n\\n- **Developed by:** ByteDance Seed Team\\n- **Model Type:** Vision Transformer for Visual Geometry\\n- **Architecture:** Plain transformer with unified depth-ray representation\\n- **Training Data:** Public academic datasets only\\n\\n### Key Insights\\n\\n💎 A **single plain transformer** (e.g., vanilla DINO encoder) is sufficient as a backbone without architectural specialization.  # noqa: E501\\n\\n✨ A singular **depth-ray representation** obviates the need for complex multi-task learning.\\n\\n## Performance\\n\\n🏆 Depth Anything 3 significantly outperforms:\\n- **Depth Anything 2** for monocular depth estimation\\n- **VGGT** for multi-view depth estimation and pose estimation\\n\\nFor detailed benchmarks, please refer to our .  # noqa: E501\\n\\n## Limitations\\n\\n- The model is trained on academic datasets and may have limitations on certain domain-specific images  # noqa: E501\\n- Performance may vary depending on image quality, lighting conditions, and scene complexity\\n\\n\\n## Citation\\n\\nIf you find Depth Anything 3 useful in your research or projects, please cite:\\n\\n```bibtex\\n@article{depthanything3,\\n  title={Depth Anything 3: Recovering the visual space from any views},\\n  author={Haotong Lin and Sili Chen and Jun Hao Liew and Donny Y. Chen and Zhenyu Li and Guang Shi and Jiashi Feng and Bingyi Kang},  # noqa: E501\\n  journal={arXiv preprint arXiv:XXXX.XXXXX},\\n  year={2025}\\n}\\n```\\n\\n## Links\\n\\n- 🏠 \\n- 📄 \\n- 💻 \\n- 🤗 \\n- 📚 \\n\\n## Authors\\n\\n ·  ·  ·  ·  ·  ·  ·   # noqa: E501\\n',\n",
       "  'domain': 'depth-estimation'},\n",
       " {'model_id': 'numz/SeedVR2_comfyUI',\n",
       "  'created_at': '2025-06-20T05:05:46+00:00',\n",
       "  'downloads': 176327,\n",
       "  'likes': 183,\n",
       "  'author': None,\n",
       "  'tags': ['diffusers',\n",
       "   'art',\n",
       "   'video-to-video',\n",
       "   'base_model:ByteDance-Seed/SeedVR2-3B',\n",
       "   'base_model:finetune:ByteDance-Seed/SeedVR2-3B',\n",
       "   'license:apache-2.0',\n",
       "   'region:us'],\n",
       "  'modelcard': '\\n# ComfyUI-SeedVR2_VideoUpscaler\\n\\n\\n\\nOfficial release of  for ComfyUI that enables high-quality video and image upscaling.\\n\\nCan run as **Multi-GPU standalone CLI** too, see  section.\\n\\n\\n\\n\\n\\n\\n\\n## 📋 Quick Access\\n\\n- \\n- \\n- \\n- \\n- \\n- \\n- \\n- \\n- \\n- \\n- \\n\\n## 🆙 Future Releases\\n\\nWe\\'re actively working on improvements and new features. To stay informed:\\n\\n- **📌 Track Active Development**: Visit  to see active development, report bugs, and request new features\\n- **💬 Join the Community**: Learn from others, share your workflows, and get help in the \\n- **🔮 Next Model Survey**: We\\'re looking for community input on the next open-source super-powerful generic restoration model. Share your suggestions in \\n\\n## 🚀 Updates\\n\\n**2025.11.09 - Version 2.5.5**\\n\\n- 💾 **Memory: Fixed RAM leak for long videos** - On-demand reconstruction with lightweight batch indices instead of storing full transformed videos, fixed release_tensor_memory to handle CPU/CUDA/MPS consistently, and refactored batch processing helpers\\n\\n**2025.11.08 - Version 2.5.4**\\n\\n- 🎨 **Fix: AdaIN color correction** - Replace `.view()` with `.reshape()` to handle non-contiguous tensors after spatial padding, resolving \"view size is not compatible with input tensor\\'s size and stride\" error\\n- 🔴 **Fix: AMD ROCm compatibility** - Add cuDNN availability check in Conv3d workaround to prevent \"ATen not compiled with cuDNN support\" error on ROCm systems (AMD GPUs on Windows/Linux)\\n\\n**2025.11.08 - Version 2.5.3**\\n\\n- 🍎 **Fix: Apple Silicon MPS device handling** - Corrected MPS device enumeration to use `\"mps\"` instead of `\"mps:0\"`, resolving invalid device errors on M-series Macs\\n- 🪟 **Fix: torch.mps AttributeError on Windows** - Add defensive checks for `torch.mps.is_available()` to handle PyTorch versions where the method doesn\\'t exist on non-Mac platforms\\n\\n**2025.11.07 - Version 2.5.0** 🎉\\n\\n⚠️ 
**BREAKING CHANGE**: This is a major update requiring workflow recreation. All nodes and CLI parameters have been redesigned for better usability and consistency. Watch the latest video from  for a deep dive and check out the  section.\\n\\n**📦 Official Release**: Now available on main branch with ComfyUI Manager support for easy installation and automatic version tracking. Updated dependencies and local imports prevent conflicts with other ComfyUI custom nodes.\\n\\n### 🎨 ComfyUI Improvements\\n\\n- **Four-Node Modular Architecture**: Split into dedicated nodes for DiT model, VAE model, torch.compile settings, and main upscaler for granular control\\n- **Global Model Cache**: Models now shared across multiple upscaler instances with automatic config updates - no more redundant loading\\n- **ComfyUI V3 Migration**: Full compatibility with ComfyUI V3 stateless node design\\n- **RGBA Support**: Native alpha channel processing with edge-guided upscaling for clean transparency\\n- **Improved Memory Management**: Streaming architecture prevents VRAM spikes regardless of video length\\n- **Flexible Resolution Support**: Upscale to any resolution divisible by 2 with lossless padding approach (replaced restrictive cropping)\\n- **Enhanced Parameters**: Added `uniform_batch_size`, `temporal_overlap`, `prepend_frames`, and `max_resolution` for better control\\n\\n### 🖥️ CLI Enhancements\\n\\n- **Batch Directory Processing**: Process entire folders of videos/images with model caching for efficiency\\n- **Single Image Support**: Direct image upscaling without video conversion\\n- **Smart Output Detection**: Auto-detects output format (MP4/PNG) based on input type\\n- **Enhanced Multi-GPU**: Improved workload distribution with temporal overlap blending\\n- **Unified Parameters**: CLI and ComfyUI now use identical parameter names for consistency\\n- **Better UX**: Auto-display help, validation improvements, progress tracking, and cleaner output\\n\\n### ⚡ Performance & 
Optimization\\n\\n- **torch.compile Support**: 20-40% DiT speedup and 15-25% VAE speedup with full graph compilation\\n- **Optimized BlockSwap**: Adaptive memory clearing (5% threshold), separate I/O component handling, reduced overhead\\n- **Enhanced VAE Tiling**: Tensor offload support for accumulation buffers, separate encode/decode configuration\\n- **Native Dtype Pipeline**: Eliminated unnecessary conversions, maintains bfloat16 precision throughout for speed and quality\\n- **Optimized Tensor Operations**: Replaced einops rearrange with native PyTorch ops for 2-5x faster transforms\\n\\n### 🎯 Quality Improvements\\n\\n- **LAB Color Correction**: New perceptual color transfer method with superior color accuracy (now default)\\n- **Additional Color Methods**: HSV saturation matching, wavelet adaptive, and hybrid approaches\\n- **Deterministic Generation**: Seed-based reproducibility with phase-specific seeding strategy\\n- **Better Temporal Consistency**: Hann window blending for smooth transitions between batches\\n\\n### 💾 Memory Management\\n\\n- **Smarter Offloading**: Independent device configuration for DiT, VAE, and tensors (CPU/GPU/none)\\n- **Four-Phase Pipeline**: Completes each phase (encode→upscale→decode→postprocess) for all batches before moving to next, minimizing model swaps\\n- **Better Cleanup**: Phase-specific resource management with proper tensor memory release\\n- **Peak VRAM Tracking**: Per-phase memory monitoring with summary display\\n\\n### 🔧 Technical Improvements\\n\\n- **GGUF Quantization Support**: Added full GGUF support for 4-bit/8-bit inference on low-VRAM systems\\n- **Improved GGUF Handling**: Fixed VRAM leaks, torch.compile compatibility, non-persistent buffers\\n- **Apple Silicon Support**: Full MPS (Metal Performance Shaders) support for Apple Silicon Macs\\n- **AMD ROCm Compatibility**: Conditional FSDP imports for PyTorch ROCm 7+ support\\n- **Conv3d Memory Workaround**: Fixes PyTorch 2.9+ cuDNN memory bug (3x usage 
reduction)\\n- **Flash Attention Optional**: Graceful fallback to SDPA when flash-attn unavailable\\n\\n### 📚 Code Quality\\n\\n- **Modular Architecture**: Split monolithic files into focused modules (generation_phases, model_configuration, etc.)\\n- **Comprehensive Documentation**: Extensive docstrings with type hints across all modules\\n- **Better Error Handling**: Early validation, clear error messages, installation instructions\\n- **Consistent Logging**: Unified indentation, better categorization, concise messages\\n\\n**2025.08.07**\\n\\n- 🎯 **Unified Debug System**: New structured logging with categories, timers, and memory tracking. `enable_debug` now available on main node\\n- ⚡ **Smart FP8 Optimization**: FP8 models now keep native FP8 storage, converting to BFloat16 only for arithmetic - faster and more memory efficient than FP16\\n- 📦 **Model Registry**: Multi-repo support (numz/ & AInVFX/), auto-discovery of user models, added mixed FP8 variants to fix 7B artifacts\\n- 💾 **Model Caching**: `cache_model` moved to main node, fixed memory leaks with proper RoPE/wrapper cleanup\\n- 🧹 **Code Cleanup**: New modular structure (`constants.py`, `model_registry.py`, `debug.py`), removed legacy code\\n- 🚀 **Performance**: Better memory management with `torch.cuda.ipc_collect()`, improved RoPE handling\\n\\n**2025.07.17**\\n\\n- 🛠️ Add 7B sharp Models: add 2 new 7B models with sharpen output\\n\\n**2025.07.11**\\n\\n- 🎬 Complete tutorial released: Adrien from  created an in-depth ComfyUI SeedVR2 guide covering everything from basic setup to advanced BlockSwap techniques for running on consumer GPUs. Perfect for understanding memory optimization and upscaling of image sequences with alpha channel! 
\\n\\n**2025.09.07**\\n\\n- 🛠️ Blockswap Integration: Big thanks to  from  for this :), useful for low VRAM users (see  section)\\n\\n**2025.07.03**\\n\\n- 🛠️ Can run as **standalone mode** with **Multi GPU** see \\n\\n**2025.06.30**\\n\\n- 🚀 Speed Up the process and less VRAM used\\n- 🛠️ Fixed memory leak on 3B models\\n- ❌ Can now interrupt process if needed\\n- ✅ Refactored the code for better sharing with the community, feel free to propose pull requests\\n- 🛠️ Removed flash attention dependency (thanks to  !!)\\n\\n**2025.06.24**\\n\\n- 🚀 Speed up the process until x4\\n\\n**2025.06.22**\\n\\n- 💪 FP8 compatibility !\\n- 🚀 Speed Up all Process\\n- 🚀 less VRAM consumption (Stay high, batch_size=1 for RTX4090 max, I\\'m trying to fix that)\\n- 🛠️ Better benchmark coming soon\\n\\n**2025.06.20**\\n\\n- 🛠️ Initial push\\n\\n## 🎯 Features\\n\\n### Core Capabilities\\n- **High-Quality Diffusion-Based Upscaling**: One-step diffusion model for video and image enhancement\\n- **Temporal Consistency**: Maintains coherence across video frames with configurable batch processing\\n- **Multi-Format Support**: Handles RGB and RGBA (alpha channel) for both videos and images\\n- **Any Video Length**: Suitable for any video length\\n\\n### Model Support\\n- **Multiple Model Variants**: 3B and 7B parameter models with different precision options\\n- **FP16, FP8, and GGUF Quantization**: Choose between full precision (FP16), mixed precision (FP8), or heavily quantized GGUF models for different VRAM requirements\\n- **Automatic Model Downloads**: Models are automatically downloaded from HuggingFace on first use\\n\\n### Memory Optimization\\n- **BlockSwap Technology**: Dynamically swap transformer blocks between GPU and CPU memory to run large models on limited VRAM\\n- **VAE Tiling**: Process large resolutions with tiled encoding/decoding to reduce VRAM usage\\n- **Intelligent Offloading**: Offload models and intermediate tensors to CPU or secondary GPUs between processing 
phases\\n- **GGUF Quantization Support**: Run models with 4-bit or 8-bit quantization for extreme VRAM savings\\n\\n### Performance Features\\n- **torch.compile Integration**: Optional 20-40% DiT speedup and 15-25% VAE speedup with PyTorch 2.0+ compilation\\n- **Multi-GPU CLI**: Distribute workload across multiple GPUs with automatic temporal overlap blending\\n- **Model Caching**: Keep models loaded in memory for faster batch processing\\n- **Flexible Attention Backends**: Choose between PyTorch SDPA (stable, always available) or Flash Attention 2 (faster on supported hardware)\\n\\n### Quality Control\\n- **Advanced Color Correction**: Five methods including LAB (recommended for highest fidelity), wavelet, wavelet adaptive, HSV, and AdaIN\\n- **Noise Injection Controls**: Fine-tune input and latent noise scales for artifact reduction at high resolutions\\n- **Configurable Resolution Limits**: Set target and maximum resolutions with automatic aspect ratio preservation\\n\\n### Workflow Features\\n- **ComfyUI Integration**: Four dedicated nodes for complete control over the upscaling pipeline\\n- **Standalone CLI**: Command-line interface for batch processing and automation\\n- **Debug Logging**: Comprehensive debug mode with memory tracking, timing information, and processing details\\n- **Progress Reporting**: Real-time progress updates during processing\\n\\n## 🔧 Requirements\\n\\n### Hardware\\n\\nWith the current optimizations (tiling, BlockSwap, GGUF quantization), SeedVR2 can run on a wide range of hardware:\\n\\n- **Minimal VRAM** (8GB or less): Use GGUF Q4_K_M models with BlockSwap and VAE tiling enabled\\n- **Moderate VRAM** (12-16GB): Use FP8 models with BlockSwap or VAE tiling as needed\\n- **High VRAM** (24GB+): Use FP16 models for best quality and speed without memory optimizations\\n\\n### Software\\n\\n- **ComfyUI**: Latest version recommended\\n- **Python**: 3.12+ (Python 3.12 and 3.13 tested and recommended)\\n- **PyTorch**: 2.0+ for torch.compile 
support (optional but recommended)\\n- **Triton**: Required for torch.compile with inductor backend (optional)\\n- **Flash Attention 2**: Provides faster attention computation on supported hardware (optional, falls back to PyTorch SDPA)\\n\\n## 📦 Installation\\n\\n### Option 1: ComfyUI Manager (Recommended)\\n\\n1. Open ComfyUI Manager in your ComfyUI interface\\n2. Click \"Custom Nodes Manager\"\\n3. Search for \"ComfyUI-SeedVR2_VideoUpscaler\"\\n4. Click \"Install\" and restart ComfyUI\\n\\n**Registry Link**: \\n\\n### Option 2: Manual Installation\\n\\n1. **Clone the repository** into your ComfyUI custom nodes directory:\\n```bash\\ncd ComfyUI\\ngit clone  custom_nodes/seedvr2_videoupscaler\\n```\\n\\n2. **Install dependencies using standalone Python**:\\n```bash\\n# Install requirements (from same ComfyUI directory)\\n# Windows:\\n.venv\\\\Scripts\\\\python.exe -m pip install -r custom_nodes\\\\seedvr2_videoupscaler\\\\requirements.txt\\n# Linux/macOS:\\n.venv/bin/python -m pip install -r custom_nodes/seedvr2_videoupscaler/requirements.txt\\n```\\n\\n3. 
**Restart ComfyUI**\\n\\n### Model Installation\\n\\nModels will be **automatically downloaded** on first use and saved to `ComfyUI/models/SEEDVR2`.\\n\\nYou can also manually download models from:\\n- Main models available at  and \\n- Additional GGUF models available at \\n\\n## 📖 Usage\\n\\n### 🎬 Video Tutorials\\n\\n#### Latest Version Deep Dive (Recommended)\\n\\nComplete walkthrough of version 2.5 by Adrien from , covering the new 4-node architecture, GGUF support, memory optimizations, and production workflows:\\n\\n\\n\\nThis comprehensive tutorial covers:\\n- Installing v2.5 through ComfyUI Manager and troubleshooting conflicts\\n- Understanding the new 4-node modular architecture and why we rebuilt it\\n- Running 7B models on 8GB VRAM with GGUF quantization\\n- Configuring BlockSwap, VAE tiling, and torch.compile for your hardware\\n- Image and video upscaling workflows with alpha channel support\\n- CLI for batch processing and multi-GPU rendering\\n- Memory optimization strategies for different VRAM levels\\n- Real production tips and the critical batch_size formula (4n+1)\\n\\n#### Previous Version Tutorial\\n\\nFor reference, here\\'s the original tutorial covering the initial release:\\n\\n\\n\\n*Note: This tutorial covers the previous single-node architecture. While the UI has changed significantly in v2.5, the core concepts about BlockSwap and memory management remain valuable.*\\n\\n### Node Setup\\n\\nSeedVR2 uses a modular node architecture with four specialized nodes:\\n\\n#### 1. 
SeedVR2 (Down)Load DiT Model\\n\\n\\n\\nConfigure the DiT (Diffusion Transformer) model for video upscaling.\\n\\n**Parameters:**\\n\\n- **model**: Choose your DiT model\\n  - **3B Models**: Faster, lower VRAM requirements\\n    - `seedvr2_ema_3b_fp16.safetensors`: FP16 (best quality)\\n    - `seedvr2_ema_3b_fp8_e4m3fn.safetensors`: FP8 8-bit (good quality)\\n    - `seedvr2_ema_3b-Q4_K_M.gguf`: GGUF 4-bit quantized (acceptable quality)\\n    - `seedvr2_ema_3b-Q8_0.gguf`: GGUF 8-bit quantized (good quality)\\n  - **7B Models**: Higher quality, higher VRAM requirements\\n    - `seedvr2_ema_7b_fp16.safetensors`: FP16 (best quality)\\n    - `seedvr2_ema_7b_fp8_e4m3fn_mixed_block35_fp16.safetensors`: FP8 with last block in FP16 to reduce artifacts (good quality)\\n    - `seedvr2_ema_7b-Q4_K_M.gguf`: GGUF 4-bit quantized (acceptable quality)\\n    - `seedvr2_ema_7b_sharp_*`: Sharp variants for enhanced detail\\n\\n- **device**: GPU device for DiT inference (e.g., `cuda:0`)\\n\\n- **offload_device**: Device to offload DiT model when not actively processing\\n  - `none`: Keep model on inference device (fastest, highest VRAM)\\n  - `cpu`: Offload to system RAM (reduces VRAM)\\n  - `cuda:X`: Offload to another GPU (good balance if available)\\n\\n- **cache_model**: Keep DiT model loaded on offload_device between workflow runs\\n  - Useful for batch processing to avoid repeated loading\\n  - Requires offload_device to be set\\n\\n- **blocks_to_swap**: BlockSwap memory optimization\\n  - `0`: Disabled (default)\\n  - `1-32`: Number of transformer blocks to swap for 3B model\\n  - `1-36`: Number of transformer blocks to swap for 7B model\\n  - Higher values = more VRAM savings but slower processing\\n  - Requires offload_device to be set and different from device\\n\\n- **swap_io_components**: Offload input/output embeddings and normalization layers\\n  - Additional VRAM savings when combined with blocks_to_swap\\n  - Requires offload_device to be set and different from 
device\\n\\n- **attention_mode**: Attention computation backend\\n  - `sdpa`: PyTorch scaled_dot_product_attention (default, stable, always available)\\n  - `flash_attn`: Flash Attention 2 (faster on supported hardware, requires flash-attn package)\\n\\n- **torch_compile_args**: Connect to SeedVR2 Torch Compile Settings node for 20-40% speedup\\n\\n**BlockSwap Explained:**\\n\\nBlockSwap enables running large models on GPUs with limited VRAM by dynamically swapping transformer blocks between GPU and CPU memory during inference. Here\\'s how it works:\\n\\n- **What it does**: Keeps only the currently-needed transformer blocks on the GPU, while storing the rest on CPU or another device\\n- **When to use it**: When you get OOM (Out of Memory) errors during the upscaling phase\\n- **How to configure**:\\n  1. Set `offload_device` to `cpu` or another GPU\\n  2. Start with `blocks_to_swap=16` (half the blocks)\\n  3. If still getting OOM, increase to 24 or 32 (3B) / 36 (7B)\\n  4. Enable `swap_io_components` for maximum VRAM savings\\n  5. If you have plenty of VRAM, decrease or set to 0 for faster processing\\n\\n**Example Configuration for Low VRAM (8GB)**:\\n- model: `seedvr2_ema_3b-Q8_0.gguf`\\n- device: `cuda:0`\\n- offload_device: `cpu`\\n- blocks_to_swap: `32`\\n- swap_io_components: `True`\\n\\n#### 2. 
SeedVR2 (Down)Load VAE Model\\n\\n\\n\\nConfigure the VAE (Variational Autoencoder) model for encoding/decoding video frames.\\n\\n**Parameters:**\\n\\n- **model**: VAE model selection\\n  - `ema_vae_fp16.safetensors`: Default and recommended\\n\\n- **device**: GPU device for VAE inference (e.g., `cuda:0`)\\n\\n- **offload_device**: Device to offload VAE model when not actively processing\\n  - `none`: Keep model on inference device (default, fastest)\\n  - `cpu`: Offload to system RAM (reduces VRAM)\\n  - `cuda:X`: Offload to another GPU (good balance if available)\\n\\n- **cache_model**: Keep VAE model loaded on offload_device between workflow runs\\n  - Requires offload_device to be set\\n\\n- **encode_tiled**: Enable tiled encoding to reduce VRAM usage during encoding phase\\n  - Enable if you see OOM errors during the \"Encoding\" phase in debug logs\\n\\n- **encode_tile_size**: Encoding tile size in pixels (default: 1024)\\n  - Applied to both height and width\\n  - Lower values reduce VRAM but may increase processing time\\n\\n- **encode_tile_overlap**: Encoding tile overlap in pixels (default: 128)\\n  - Reduces visible seams between tiles\\n\\n- **decode_tiled**: Enable tiled decoding to reduce VRAM usage during decoding phase\\n  - Enable if you see OOM errors during the \"Decoding\" phase in debug logs\\n\\n- **decode_tile_size**: Decoding tile size in pixels (default: 1024)\\n\\n- **decode_tile_overlap**: Decoding tile overlap in pixels (default: 128)\\n\\n- **torch_compile_args**: Connect to SeedVR2 Torch Compile Settings node for 15-25% speedup\\n\\n**VAE Tiling Explained:**\\n\\nVAE tiling processes large resolutions in smaller tiles to reduce VRAM requirements. Here\\'s how to use it:\\n\\n1. **Run without tiling first** and monitor the debug logs (enable `enable_debug` on main node)\\n2. **If OOM during \"Encoding\" phase**:\\n   - Enable `encode_tiled`\\n   - If still OOM, reduce `encode_tile_size` (try 768, 512, etc.)\\n3. 
**If OOM during \"Decoding\" phase**:\\n   - Enable `decode_tiled`\\n   - If still OOM, reduce `decode_tile_size`\\n4. **Adjust overlap** (default 128) if you see visible seams in output (increase it) or processing times are too slow (decrease it).\\n\\n**Example Configuration for High Resolution (4K)**:\\n- encode_tiled: `True`\\n- encode_tile_size: `1024`\\n- encode_tile_overlap: `128`\\n- decode_tiled: `True`\\n- decode_tile_size: `1024`\\n- decode_tile_overlap: `128`\\n\\n#### 3. SeedVR2 Torch Compile Settings (Optional)\\n\\n\\n\\nConfigure torch.compile optimization for 20-40% DiT speedup and 15-25% VAE speedup.\\n\\n**Requirements:**\\n- PyTorch 2.0+\\n- Triton (for inductor backend)\\n\\n**Parameters:**\\n\\n- **backend**: Compilation backend\\n  - `inductor`: Full optimization with Triton kernel generation and fusion (recommended)\\n  - `cudagraphs`: Lightweight wrapper using CUDA graphs, no kernel optimization\\n\\n- **mode**: Optimization level (compilation time vs runtime performance)\\n  - `default`: Fast compilation with good speedup (recommended for development)\\n  - `reduce-overhead`: Lower overhead, optimized for smaller models\\n  - `max-autotune`: Slowest compilation, best runtime performance (recommended for production)\\n  - `max-autotune-no-cudagraphs`: Like max-autotune but without CUDA graphs\\n\\n- **fullgraph**: Compile entire model as single graph without breaks\\n  - `False`: Allow graph breaks for better compatibility (default, recommended)\\n  - `True`: Enforce no breaks for maximum optimization (may fail with dynamic shapes)\\n\\n- **dynamic**: Handle varying input shapes without recompilation\\n  - `False`: Specialize for exact input shapes (default)\\n  - `True`: Create dynamic kernels that adapt to shape variations (enable when processing different resolutions or batch sizes)\\n\\n- **dynamo_cache_size_limit**: Max cached compiled versions per function (default: 64)\\n  - Higher = more memory, lower = more recompilation\\n\\n- 
**dynamo_recompile_limit**: Max recompilation attempts before falling back to eager mode (default: 128)\\n  - Safety limit to prevent compilation loops\\n\\n**Usage:**\\n1. Add this node to your workflow\\n2. Connect its output to the `torch_compile_args` input of DiT and/or VAE loader nodes\\n3. First run will be slow (compilation), subsequent runs will be much faster\\n\\n**When to use:**\\n- torch.compile only makes sense when processing **multiple batches, long videos, or many tiles**\\n- For single images or short clips, the compilation time outweighs the speed improvement\\n- Best suited for batch processing workflows or long videos\\n\\n**Recommended Settings:**\\n- For development/testing: `mode=default`, `backend=inductor`, `fullgraph=False`\\n- For production: `mode=max-autotune`, `backend=inductor`, `fullgraph=False`\\n\\n#### 4. SeedVR2 Video Upscaler (Main Node)\\n\\n\\n\\nMain upscaling node that processes video frames using DiT and VAE models.\\n\\n**Required Inputs:**\\n\\n- **image**: Input video frames as image batch (RGB or RGBA format)\\n- **dit**: DiT model configuration from SeedVR2 (Down)Load DiT Model node\\n- **vae**: VAE model configuration from SeedVR2 (Down)Load VAE Model node\\n\\n**Parameters:**\\n\\n- **seed**: Random seed for reproducible generation (default: 42)\\n  - Same seed with same inputs produces identical output\\n\\n- **resolution**: Target resolution for shortest edge in pixels (default: 1080)\\n  - Maintains aspect ratio automatically\\n\\n- **max_resolution**: Maximum resolution for any edge (default: 0 = no limit)\\n  - Automatically scales down if exceeded to prevent OOM\\n\\n- **batch_size**: Frames per batch (default: 5)\\n  - **CRITICAL REQUIREMENT**: Must follow the **4n+1 formula** (1, 5, 9, 13, 17, 21, 25, ...)\\n  - **Why this matters**: The model uses these frames for temporal consistency calculations\\n  - **Minimum 5 for temporal consistency**: Use 1 only for single images or when temporal consistency isn\\'t 
needed\\n  - **Match shot length ideally**: For best results, set batch_size to match your shot length (e.g., batch_size=21 for a 20-frame shot)\\n  - **VRAM impact**: Higher batch_size = better quality and speed but requires more VRAM\\n  - **If you get OOM with batch_size=5**: Try optimization techniques first (model offloading, BlockSwap, GGUF models...) before reducing batch_size or input resolution, as these directly impact quality\\n\\n**uniform_batch_size** (default: False)\\n  - Pads the final batch to match `batch_size` for uniform processing\\n  - Prevents temporal artifacts when the last batch is significantly smaller than others\\n  - Example: 45 frames with `batch_size=33` creates [33, 33] instead of [33, 12]\\n  - Recommended when using large batch sizes and video length is not a multiple of `batch_size`\\n  - Increases VRAM usage slightly but ensures consistent temporal coherence across all batches\\n\\n- **temporal_overlap**: Overlapping frames between batches (default: 0)\\n  - Used for blending between batches to reduce temporal artifacts\\n  - Range: 0-16 frames\\n\\n- **prepend_frames**: Frames to prepend (default: 0)\\n  - Prepends reversed frames to reduce artifacts at video start\\n  - Automatically removed after processing\\n  - Range: 0-32 frames\\n\\n- **color_correction**: Color correction method (default: \"wavelet\")\\n  - **`lab`**: Full perceptual color matching with detail preservation (recommended for highest fidelity to original)\\n  - **`wavelet`**: Frequency-based natural colors, preserves details well\\n  - **`wavelet_adaptive`**: Wavelet base + targeted saturation correction\\n  - **`hsv`**: Hue-conditional saturation matching\\n  - **`adain`**: Statistical style transfer\\n  - **`none`**: No color correction\\n\\n- **input_noise_scale**: Input noise injection scale 0.0-1.0 (default: 0.0)\\n  - Adds noise to input frames to reduce artifacts at very high resolutions\\n  - Try 0.1-0.3 if you see artifacts with high output 
resolutions\\n\\n- **latent_noise_scale**: Latent space noise scale 0.0-1.0 (default: 0.0)\\n  - Adds noise during diffusion process, can soften excessive detail\\n  - Use if input_noise doesn\\'t help, try 0.05-0.15\\n\\n- **offload_device**: Device for storing intermediate tensors between processing phases (default: \"cpu\")\\n  - `none`: Keep all tensors on inference device (fastest but highest VRAM)\\n  - `cpu`: Offload to system RAM (recommended for long videos, slower transfers)\\n  - `cuda:X`: Offload to another GPU (good balance if available, faster than CPU)\\n\\n- **enable_debug**: Enable detailed debug logging (default: False)\\n  - Shows memory usage, timing information, and processing details\\n  - **Highly recommended** for troubleshooting OOM issues\\n\\n**Output:**\\n- Upscaled video frames with color correction applied\\n- Format (RGB/RGBA) matches input\\n- Range [0, 1] normalized for ComfyUI compatibility\\n\\n### Typical Workflow Setup\\n\\n**Basic Workflow (High VRAM - 24GB+)**:\\n```\\nLoad Video Frames\\n    ↓\\nSeedVR2 Load DiT Model\\n  ├─ model: seedvr2_ema_3b_fp16.safetensors\\n  └─ device: cuda:0\\n    ↓\\nSeedVR2 Load VAE Model\\n  ├─ model: ema_vae_fp16.safetensors\\n  └─ device: cuda:0\\n    ↓\\nSeedVR2 Video Upscaler\\n  ├─ batch_size: 21\\n  └─ resolution: 1080\\n    ↓\\nSave Video/Frames\\n```\\n\\n**Low VRAM Workflow (8-12GB)**:\\n```\\nLoad Video Frames\\n    ↓\\nSeedVR2 Load DiT Model\\n  ├─ model: seedvr2_ema_3b-Q8_0.gguf\\n  ├─ device: cuda:0\\n  ├─ offload_device: cpu\\n  ├─ blocks_to_swap: 32\\n  └─ swap_io_components: True\\n    ↓\\nSeedVR2 Load VAE Model\\n  ├─ model: ema_vae_fp16.safetensors\\n  ├─ device: cuda:0\\n  ├─ encode_tiled: True\\n  └─ decode_tiled: True\\n    ↓\\nSeedVR2 Video Upscaler\\n  ├─ batch_size: 5\\n  └─ resolution: 720\\n    ↓\\nSave Video/Frames\\n```\\n\\n**High Performance Workflow (24GB+ with torch.compile)**:\\n```\\nLoad Video Frames\\n    ↓\\nSeedVR2 Torch Compile Settings\\n  ├─ mode: 
max-autotune\\n  └─ backend: inductor\\n    ↓\\nSeedVR2 Load DiT Model\\n  ├─ model: seedvr2_ema_7b_sharp_fp16.safetensors\\n  ├─ device: cuda:0\\n  └─ torch_compile_args: connected\\n    ↓\\nSeedVR2 Load VAE Model\\n  ├─ model: ema_vae_fp16.safetensors\\n  ├─ device: cuda:0\\n  └─ torch_compile_args: connected\\n    ↓\\nSeedVR2 Video Upscaler\\n  ├─ batch_size: 81\\n  └─ resolution: 1080\\n    ↓\\nSave Video/Frames\\n```\\n\\n## 🖥️ Run as Standalone (CLI)\\n\\nThe standalone CLI provides powerful batch processing capabilities with multi-GPU support and sophisticated optimization options.\\n\\n### Prerequisites\\n\\nChoose the appropriate setup based on your installation:\\n\\n#### Option 1: Already Have ComfyUI with SeedVR2 Installed\\n\\nIf you\\'ve already installed SeedVR2 as part of ComfyUI (via ), you can use the CLI directly:\\n\\n```bash\\n# Navigate to your ComfyUI directory\\ncd ComfyUI\\n\\n# Run the CLI using standalone Python (display help message)\\n# Windows:\\n.venv\\\\Scripts\\\\python.exe custom_nodes\\\\seedvr2_videoupscaler\\\\inference_cli.py --help\\n# Linux/macOS:\\n.venv/bin/python custom_nodes/seedvr2_videoupscaler/inference_cli.py --help\\n```\\n\\n**Skip to  below.**\\n\\n#### Option 2: Standalone Installation (Without ComfyUI)\\n\\nIf you want to use the CLI without ComfyUI installation, follow these steps:\\n\\n1. **Install ** (modern Python package manager):\\n```bash\\n# Windows\\npowershell -ExecutionPolicy ByPass -c \"irm  | iex\"\\n\\n# macOS and Linux\\ncurl -LsSf  | sh\\n```\\n\\n2. **Clone the repository**:\\n```bash\\ngit clone  seedvr2_videoupscaler\\ncd seedvr2_videoupscaler\\n```\\n\\n3. 
**Create virtual environment and install dependencies**:\\n```bash\\n# Create virtual environment with Python 3.13\\nuv venv --python 3.13\\n\\n# Activate virtual environment\\n# Windows:\\n.venv\\\\Scripts\\\\activate\\n# Linux/macOS:\\nsource .venv/bin/activate\\n\\n# Install PyTorch with CUDA support\\n# Check command line based on your environment: \\nuv pip install --pre torch torchvision torchaudio --index-url \\n\\n# Install SeedVR2 requirements\\nuv pip install -r requirements.txt\\n\\n# Run the CLI (display help message)\\n# Windows:\\n.venv\\\\Scripts\\\\python.exe inference_cli.py --help\\n# Linux/macOS:\\n.venv/bin/python inference_cli.py --help\\n```\\n\\n### Command Line Usage\\n\\nThe CLI provides comprehensive options for single-GPU, multi-GPU, and batch processing workflows.\\n\\n**Basic Usage Examples:**\\n\\n```bash\\n# Basic image upscaling\\npython inference_cli.py image.jpg\\n\\n# Basic video video upscaling with temporal consistency\\npython inference_cli.py video.mp4 --resolution 720 --batch_size 33\\n\\n# Multi-GPU processing with temporal overlap\\npython inference_cli.py video.mp4 \\\\\\n    --cuda_device 0,1 \\\\\\n    --resolution 1080 \\\\\\n    --batch_size 81 \\\\\\n    --uniform_batch_size \\\\\\n    --temporal_overlap 3 \\\\\\n    --prepend_frames 4\\n\\n# Memory-optimized for low VRAM (8GB)\\npython inference_cli.py image.png \\\\\\n    --dit_model seedvr2_ema_3b-Q8_0.gguf \\\\\\n    --resolution 1080 \\\\\\n    --blocks_to_swap 32 \\\\\\n    --swap_io_components \\\\\\n    --dit_offload_device cpu \\\\\\n    --vae_offload_device cpu\\n\\n# High resolution with VAE tiling\\npython inference_cli.py video.mp4 \\\\\\n    --resolution 1440 \\\\\\n    --batch_size 31 \\\\\\n    --uniform_batch_size \\\\\\n    --temporal_overlap 3 \\\\\\n    --vae_encode_tiled \\\\\\n    --vae_decode_tiled\\n\\n# Batch directory processing with model caching\\npython inference_cli.py media_folder/ \\\\\\n    --output processed/ \\\\\\n    --cuda_device 
0 \\\\\\n    --cache_dit \\\\\\n    --cache_vae \\\\\\n    --dit_offload_device cpu \\\\\\n    --vae_offload_device cpu \\\\\\n    --resolution 1080 \\\\\\n    --max_resolution 1920\\n```\\n\\n### Command Line Arguments\\n\\n**Input/Output:**\\n- ``: Input file (.mp4, .avi, .png, .jpg, etc.) or directory\\n- `--output`: Output path (default: auto-generated in \\'output/\\' directory)\\n- `--output_format`: Output format: \\'mp4\\' (video) or \\'png\\' (image sequence). Default: auto-detect from input type\\n- `--model_dir`: Model directory (default: ./models/SEEDVR2)\\n\\n**Model Selection:**\\n- `--dit_model`: DiT model to use. Options: 3B/7B with fp16/fp8/GGUF variants (default: 3B FP8)\\n\\n**Processing Parameters:**\\n- `--resolution`: Target short-side resolution in pixels (default: 1080)\\n- `--max_resolution`: Maximum resolution for any edge. Scales down if exceeded. 0 = no limit (default: 0)\\n- `--batch_size`: Frames per batch (must follow 4n+1: 1, 5, 9, 13, 17, 21...). Ideally matches shot length for best temporal consistency (default: 5)\\n- `--seed`: Random seed for reproducibility (default: 42)\\n- `--skip_first_frames`: Skip N initial frames (default: 0)\\n- `--load_cap`: Load maximum N frames from video. 0 = load all (default: 0)\\n- `--prepend_frames`: Prepend N reversed frames to reduce start artifacts (auto-removed) (default: 0)\\n- `--temporal_overlap`: Frames to overlap between batches/GPUs for smooth blending (default: 0)\\n\\n**Quality Control:**\\n- `--color_correction`: Color correction method: \\'lab\\' (perceptual, recommended), \\'wavelet\\', \\'wavelet_adaptive\\', \\'hsv\\', \\'adain\\', or \\'none\\' (default: lab)\\n- `--input_noise_scale`: Input noise injection scale (0.0-1.0). Reduces artifacts at high resolutions (default: 0.0)\\n- `--latent_noise_scale`: Latent space noise scale (0.0-1.0). 
Softens details if needed (default: 0.0)\\n\\n**Memory Management:**\\n- `--dit_offload_device`: Device to offload DiT model: \\'none\\' (keep on GPU), \\'cpu\\', or \\'cuda:X\\' (default: none)\\n- `--vae_offload_device`: Device to offload VAE model: \\'none\\', \\'cpu\\', or \\'cuda:X\\' (default: none)\\n- `--blocks_to_swap`: Number of transformer blocks to swap (0=disabled, 3B: 0-32, 7B: 0-36). Requires dit_offload_device (default: 0)\\n- `--swap_io_components`: Offload I/O components for additional VRAM savings. Requires dit_offload_device\\n- `--use_non_blocking`: Use non-blocking memory transfers for BlockSwap (recommended)\\n\\n**VAE Tiling:**\\n- `--vae_encode_tiled`: Enable VAE encode tiling to reduce VRAM during encoding\\n- `--vae_encode_tile_size`: VAE encode tile size in pixels (default: 1024)\\n- `--vae_encode_tile_overlap`: VAE encode tile overlap in pixels (default: 128)\\n- `--vae_decode_tiled`: Enable VAE decode tiling to reduce VRAM during decoding\\n- `--vae_decode_tile_size`: VAE decode tile size in pixels (default: 1024)\\n- `--vae_decode_tile_overlap`: VAE decode tile overlap in pixels (default: 128)\\n- `--tile_debug`: Visualize tiles: \\'false\\' (default), \\'encode\\', or \\'decode\\'\\n\\n**Performance Optimization:**\\n- `--attention_mode`: Attention backend: \\'sdpa\\' (default, stable) or \\'flash_attn\\' (faster, requires package)\\n- `--compile_dit`: Enable torch.compile for DiT model (20-40% speedup, requires PyTorch 2.0+ and Triton)\\n- `--compile_vae`: Enable torch.compile for VAE model (15-25% speedup, requires PyTorch 2.0+ and Triton)\\n- `--compile_backend`: Compilation backend: \\'inductor\\' (full optimization) or \\'cudagraphs\\' (lightweight) (default: inductor)\\n- `--compile_mode`: Optimization level: \\'default\\', \\'reduce-overhead\\', \\'max-autotune\\', \\'max-autotune-no-cudagraphs\\' (default: default)\\n- `--compile_fullgraph`: Compile entire model as single graph (faster but less flexible) (default: False)\\n- 
`--compile_dynamic`: Handle varying input shapes without recompilation (default: False)\\n- `--compile_dynamo_cache_size_limit`: Max cached compiled versions per function (default: 64)\\n- `--compile_dynamo_recompile_limit`: Max recompilation attempts before fallback (default: 128)\\n\\n**Model Caching (batch processing):**\\n- `--cache_dit`: Cache DiT model between files (single GPU only, speeds up directory processing)\\n- `--cache_vae`: Cache VAE model between files (single GPU only, speeds up directory processing)\\n\\n**Multi-GPU:**\\n- `--cuda_device`: CUDA device id(s). Single id (e.g., \\'0\\') or comma-separated list \\'0,1\\' for multi-GPU\\n\\n**Debugging:**\\n- `--debug`: Enable verbose debug logging\\n\\n### Multi-GPU Processing Explained\\n\\nThe CLI\\'s multi-GPU mode automatically distributes the workload across multiple GPUs with intelligent temporal overlap handling:\\n\\n**How it works:**\\n1. Video is split into chunks, one per GPU\\n2. Each GPU processes its chunk independently\\n3. Chunks overlap by `--temporal_overlap` frames\\n4. Results are blended together seamlessly using the overlap region\\n\\n**Example for 2 GPUs with temporal_overlap=4:**\\n```\\nGPU 0: Frames 0-50 (includes 4 overlap frames at end)\\nGPU 1: Frames 46-100 (includes 4 overlap frames at beginning)\\nResult: Frames 0-100 with smooth transition at frame 48\\n```\\n\\n**Best practices:**\\n- Set `--temporal_overlap` to 2-8 frames for smooth blending\\n- Higher overlap = smoother transitions but more redundant processing\\n- Use `--prepend_frames` to reduce artifacts at video start\\n- batch_size should divide evenly into chunk sizes for best results\\n\\n## ⚠️ Limitations\\n\\n### Model Limitations\\n\\n**Batch Size Constraint**: The model requires batch_size to follow the **4n+1 formula** (1, 5, 9, 13, 17, 21, 25, ...) due to temporal consistency architecture. 
All frames in a batch are processed together for temporal coherence, then batches can be blended using temporal_overlap. Ideally, set batch_size to match your shot length for optimal quality.\\n\\n### Performance Considerations\\n\\n**VAE Bottleneck**: Even with optimized DiT upscaling (BlockSwap, GGUF, torch.compile), the VAE encoding/decoding stages can be the bottleneck, especially for high resolutions. The VAE is slow. Use large batch_size to mitigate this.\\n\\n**VRAM Usage**: While the integration now supports low VRAM systems (8GB or less with proper optimization), VRAM usage varies based on:\\n- Input/output resolution (larger = more VRAM)\\n- Batch size (higher = more VRAM but better temporal consistency and speed)\\n- Model choice (FP16 > FP8 > GGUF in VRAM usage)\\n- Optimization settings (BlockSwap, VAE tiling significantly reduce VRAM)\\n\\n**Speed**: Processing speed depends on:\\n- GPU capabilities (compute performance, VRAM bandwidth, and architecture generation)\\n- Model size (3B faster than 7B)\\n- Batch size (larger batch sizes are faster per frame due to better GPU utilization)\\n- Optimization settings (torch.compile provides significant speedup)\\n- Resolution (higher resolutions are slower)\\n\\n### Best Practices\\n\\n1. **Start with debug enabled** to understand where VRAM is being used\\n2. **For OOM errors during encoding**: Enable VAE encode tiling and reduce tile size\\n3. **For OOM errors during upscaling**: Enable BlockSwap and increase blocks_to_swap\\n4. **For OOM errors during decoding**: Enable VAE decode tiling and reduce tile size\\n   - **If still getting OOM after trying all above**: Reduce batch_size or resolution\\n5. **For best quality**: Use higher batch_size matching your shot length, FP16 models, and LAB color correction\\n6. **For speed**: Use FP8/GGUF models, enable torch.compile, and use Flash Attention if available\\n7. 
**Test settings with a short clip first** before processing long videos\\n\\n## 🤝 Contributing\\n\\nContributions are welcome! We value community input and improvements.\\n\\nFor detailed contribution guidelines, see .\\n\\n**Quick Start:**\\n\\n1. Fork the repository\\n2. Create your feature branch (`git checkout -b feature/AmazingFeature`)\\n3. Commit your changes (`git commit -m \\'Add some AmazingFeature\\'`)\\n4. Push to the branch (`git push origin feature/AmazingFeature`)\\n5. Open a Pull Request to **main** branch for stable features or **nightly** branch for experimental features\\n\\n**Get Help:**\\n- YouTube: \\n- GitHub : For bug reports and feature requests\\n- GitHub : For questions and community support\\n- Discord: adrientoupet & NumZ#7184\\n\\n## 🙏 Credits\\n\\nThis ComfyUI implementation is a collaborative project by **** and **** (Adrien Toupet), based on the original  by ByteDance Seed Team.\\n\\nSpecial thanks to our community contributors including , , , , , , , , , and many others for their improvements, bug fixes, and testing.\\n\\n## 📜 License\\n\\nThe code in this repository is released under the MIT license as found in the  file.',\n",
       "  'domain': 'video-to-video'},\n",
       " {'model_id': 'vidore/colqwen2-v1.0',\n",
       "  'created_at': '2024-11-03T15:12:07+00:00',\n",
       "  'downloads': 145625,\n",
       "  'likes': 116,\n",
       "  'author': None,\n",
       "  'tags': ['colpali',\n",
       "   'safetensors',\n",
       "   'vidore-experimental',\n",
       "   'vidore',\n",
       "   'visual-document-retrieval',\n",
       "   'en',\n",
       "   'arxiv:2004.12832',\n",
       "   'arxiv:2407.01449',\n",
       "   'arxiv:2106.09685',\n",
       "   'base_model:vidore/colqwen2-base',\n",
       "   'base_model:finetune:vidore/colqwen2-base',\n",
       "   'license:apache-2.0',\n",
       "   'endpoints_compatible',\n",
       "   'region:us'],\n",
       "  'modelcard': '# ColQwen2: Visual Retriever based on Qwen2-VL-2B-Instruct with ColBERT strategy\\n\\n### This is the base version trained with batch_size 256 instead of 32 for 5 epoch and with the updated pad token\\n\\nColQwen2 is a model based on a novel model architecture and training strategy based on Vision Language Models (VLMs) to efficiently index documents from their visual features.\\nIt is a  extension that generates - style multi-vector representations of text and images. \\nIt was introduced in the paper  and first released in \\n\\n\\n\\n## Version specificity\\n\\n\\nThis model takes dynamic image resolutions in input and does not resize them, changing their aspect ratio as in ColPali.\\nMaximal resolution is set so that 768 image patches are created at most. Experiments show clear improvements with larger amounts of image patches, at the cost of memory requirements.\\n\\nThis version is trained with `colpali-engine==0.3.1`.\\n\\nData is the same as the ColPali data described in the paper.\\n\\n\\n## Model Training\\n\\n### Dataset\\nOur training dataset of 127,460 query-page pairs is comprised of train sets of openly available academic datasets (63%) and a synthetic dataset made up of pages from web-crawled PDF documents and augmented with VLM-generated (Claude-3 Sonnet) pseudo-questions (37%). \\nOur training set is fully English by design, enabling us to study zero-shot generalization to non-English languages. We explicitly verify no multi-page PDF document is used both  and in the train set to prevent evaluation contamination. \\nA validation set is created with 2% of the samples to tune hyperparameters.\\n\\n*Note: Multilingual data is present in the pretraining corpus of the language model and most probably in the multimodal training.*\\n\\n### Parameters\\n\\nAll models are trained for 1 epoch on the train set. 
Unless specified otherwise, we train models in `bfloat16` format, use low-rank adapters () \\nwith `alpha=32`  and `r=32` on the transformer layers from the language model, \\nas well as the final randomly initialized projection layer, and use a `paged_adamw_8bit` optimizer. \\nWe train on an 8 GPU setup with data parallelism, a learning rate of 5e-5 with linear decay with 2.5% warmup steps, and a batch size of 32.\\n\\n## Usage\\n\\nMake sure `colpali-engine` is installed from source or with a version superior to 0.3.4.\\n`transformers` version must be > 4.46.1.\\n\\n```bash\\npip install git+\\n```\\n\\n```python\\nimport torch\\nfrom PIL import Image\\nfrom transformers.utils.import_utils import is_flash_attn_2_available\\n\\nfrom colpali_engine.models import ColQwen2, ColQwen2Processor\\n\\nmodel = ColQwen2.from_pretrained(\\n    \"vidore/colqwen2-v1.0\",\\n    torch_dtype=torch.bfloat16,\\n    device_map=\"cuda:0\",  # or \"mps\" if on Apple Silicon\\n    attn_implementation=\"flash_attention_2\" if is_flash_attn_2_available() else None,\\n).eval()\\nprocessor = ColQwen2Processor.from_pretrained(\"vidore/colqwen2-v1.0\")\\n\\n# Your inputs\\nimages = [\\n    Image.new(\"RGB\", (128, 128), color=\"white\"),\\n    Image.new(\"RGB\", (64, 32), color=\"black\"),\\n]\\nqueries = [\\n    \"Is attention really all you need?\",\\n    \"What is the amount of bananas farmed in Salvador?\",\\n]\\n\\n# Process the inputs\\nbatch_images = processor.process_images(images).to(model.device)\\nbatch_queries = processor.process_queries(queries).to(model.device)\\n\\n# Forward pass\\nwith torch.no_grad():\\n    image_embeddings = model(**batch_images)\\n    query_embeddings = model(**batch_queries)\\n\\nscores = processor.score_multi_vector(query_embeddings, image_embeddings)\\n```\\n\\n\\n## Limitations\\n\\n - **Focus**: The model primarily focuses on PDF-type documents and high-ressources languages, potentially limiting its generalization to other document types or less 
represented languages.\\n - **Support**: The model relies on multi-vector retreiving derived from the ColBERT late interaction mechanism, which may require engineering efforts to adapt to widely used vector retrieval frameworks that lack native multi-vector support.\\n\\n## License\\n\\nColQwen2\\'s vision language backbone model (Qwen2-VL) is under `apache2.0` license. The adapters attached to the model are under MIT license.\\n\\n## Contact\\n\\n- Manuel Faysse: manuel.faysse@illuin.tech\\n- Hugues Sibille: hugues.sibille@illuin.tech\\n- Tony Wu: tony.wu@illuin.tech\\n\\n## Citation\\n\\nIf you use any datasets or models from this organization in your research, please cite the original dataset as follows:\\n\\n```bibtex\\n@misc{faysse2024colpaliefficientdocumentretrieval,\\n  title={ColPali: Efficient Document Retrieval with Vision Language Models}, \\n  author={Manuel Faysse and Hugues Sibille and Tony Wu and Bilel Omrani and Gautier Viaud and Céline Hudelot and Pierre Colombo},\\n  year={2024},\\n  eprint={2407.01449},\\n  archivePrefix={arXiv},\\n  primaryClass={cs.IR},\\n  url={ \\n}\\n```',\n",
       "  'domain': 'visual-document-retrieval'},\n",
       " {'model_id': 'PekingU/rtdetr_v2_r18vd',\n",
       "  'created_at': '2025-01-31T17:10:44+00:00',\n",
       "  'downloads': 143462,\n",
       "  'likes': 5,\n",
       "  'author': None,\n",
       "  'tags': ['transformers',\n",
       "   'safetensors',\n",
       "   'rt_detr_v2',\n",
       "   'object-detection',\n",
       "   'vision',\n",
       "   'en',\n",
       "   'dataset:coco',\n",
       "   'arxiv:2407.17140',\n",
       "   'license:apache-2.0',\n",
       "   'endpoints_compatible',\n",
       "   'deploy:azure',\n",
       "   'region:us'],\n",
       "  'modelcard': '## RT-DETRv2\\n\\n### **Overview**\\n\\nThe RT-DETRv2 model was proposed in  by Wenyu Lv, Yian Zhao, Qinyao Chang, Kui Huang, Guanzhong Wang, Yi Liu. RT-DETRv2 refines RT-DETR by introducing selective multi-scale feature extraction, a discrete sampling operator for broader deployment compatibility, and improved training strategies like dynamic data augmentation and scale-adaptive hyperparameters. \\nThese changes enhance flexibility and practicality while maintaining real-time performance.\\n\\nThis model was contributed by  with the help of  and \\n\\nThis is \\n### **Performance**\\n\\nRT-DETRv2 consistently outperforms its predecessor across all model sizes while maintaining the same real-time speeds.\\n\\n\\n\\n### **How to use**\\n\\n```python\\nimport torch\\nimport requests\\n\\nfrom PIL import Image\\nfrom transformers import RTDetrV2ForObjectDetection, RTDetrImageProcessor\\n\\nurl = \\'\\nimage = Image.open(requests.get(url, stream=True).raw)\\n\\nimage_processor = RTDetrImageProcessor.from_pretrained(\"PekingU/rtdetr_v2_r18vd\")\\nmodel = RTDetrV2ForObjectDetection.from_pretrained(\"PekingU/rtdetr_v2_r18vd\")\\n\\ninputs = image_processor(images=image, return_tensors=\"pt\")\\n\\nwith torch.no_grad():\\n     outputs = model(**inputs)\\n\\nresults = image_processor.post_process_object_detection(outputs, target_sizes=torch.tensor([(image.height, image.width)]), threshold=0.5)\\n\\nfor result in results:\\n     for score, label_id, box in zip(result[\"scores\"], result[\"labels\"], result[\"boxes\"]):\\n         score, label = score.item(), label_id.item()\\n         box = [round(i, 2) for i in box.tolist()]\\n         print(f\"{model.config.id2label[label]}: {score:.2f} {box}\")\\n```\\n\\n```\\ncat: 0.97 [341.14, 25.11, 639.98, 372.89]\\ncat: 0.96 [12.78, 56.35, 317.67, 471.34]\\nremote: 0.95 [39.96, 73.12, 175.65, 117.44]\\nsofa: 0.86 [-0.11, 2.97, 639.89, 473.62]\\nsofa: 0.82 [-0.12, 1.78, 639.87, 473.52]\\nremote: 0.79 [333.65, 
76.38, 370.69, 187.48]\\n```\\n\\n### **Training**\\n\\nRT-DETRv2 is trained on COCO (Lin et al. [2014]) train2017 and validated on COCO val2017 dataset. We report the standard AP metrics (averaged over uniformly sampled IoU thresholds ranging from 0.50 − 0.95 with a step size of 0.05), and APval50 commonly used in real scenarios.\\n\\n### **Applications**\\n\\nRT-DETRv2 is ideal for real-time object detection in diverse applications such as **autonomous driving**, **surveillance systems**, **robotics**, and **retail analytics**. Its enhanced flexibility and deployment-friendly design make it suitable for both edge devices and large-scale systems + ensures high accuracy and speed in dynamic, real-world environments.',\n",
       "  'domain': 'object-detection'},\n",
       " {'model_id': 'tencent/Hunyuan3D-2',\n",
       "  'created_at': '2025-01-20T06:55:37+00:00',\n",
       "  'downloads': 68862,\n",
       "  'likes': 1697,\n",
       "  'author': None,\n",
       "  'tags': ['hunyuan3d-2',\n",
       "   'diffusers',\n",
       "   'safetensors',\n",
       "   'image-to-3d',\n",
       "   'text-to-3d',\n",
       "   'en',\n",
       "   'zh',\n",
       "   'arxiv:2501.12202',\n",
       "   'arxiv:2411.02293',\n",
       "   'license:other',\n",
       "   'region:us'],\n",
       "  'modelcard': \"\\n\\n  \\n\\n\\n\\n  \\n  \\n  \\n  \\n\\n    \\n\\n\\n\\n[//]: # (  )\\n\\n[//]: # (  )\\n\\n[//]: # (  )\\n\\n\\n\\n“ Living out everyone’s imagination on creating and manipulating 3D assets.”\\n\\n\\nThis repository contains the models of the paper .\\nFor code and more details on how to use it, refer to the .\\n\\n## 🔥 News\\n\\n- Jan 21, 2025: 💬 Release . Please give it a try!\\n\\n## **Abstract**\\n\\nWe present Hunyuan3D 2.0, an advanced large-scale 3D synthesis system for generating high-resolution textured 3D assets.\\nThis system includes two foundation components: a large-scale shape generation model - Hunyuan3D-DiT, and a large-scale\\ntexture synthesis model - Hunyuan3D-Paint.\\nThe shape generative model, built on a scalable flow-based diffusion transformer, aims to create geometry that properly\\naligns with a given condition image, laying a solid foundation for downstream applications.\\nThe texture synthesis model, benefiting from strong geometric and diffusion priors, produces high-resolution and vibrant\\ntexture maps for either generated or hand-crafted meshes.\\nFurthermore, we build Hunyuan3D-Studio - a versatile, user-friendly production platform that simplifies the re-creation\\nprocess of 3D assets. It allows both professional and amateur users to manipulate or even animate their meshes\\nefficiently.\\nWe systematically evaluate our models, showing that Hunyuan3D 2.0 outperforms previous state-of-the-art models,\\nincluding the open-source models and closed-source models in geometry details, condition alignment, texture quality, and\\ne.t.c.\\n\\n\\n  \\n\\n\\n## ☯️ **Hunyuan3D 2.0**\\n\\n### Architecture\\n\\nHunyuan3D 2.0 features a two-stage generation pipeline, starting with the creation of a bare mesh, followed by the\\nsynthesis of a texture map for that mesh. 
This strategy is effective for decoupling the difficulties of shape and\\ntexture generation and also provides flexibility for texturing either generated or handcrafted meshes.\\n\\n\\n  \\n\\n\\n### Performance\\n\\nWe have evaluated Hunyuan3D 2.0 with other open-source as well as close-source 3d-generation methods.\\nThe numerical results indicate that Hunyuan3D 2.0 surpasses all baselines in the quality of generated textured 3D assets\\nand the condition following ability.\\n\\n| Model                   | CMMD(⬇)   | FID_CLIP(⬇) | FID(⬇)      | CLIP-score(⬆) |\\n|-------------------------|-----------|-------------|-------------|---------------|\\n| Top Open-source Model1  | 3.591     | 54.639      | 289.287     | 0.787         |\\n| Top Close-source Model1 | 3.600     | 55.866      | 305.922     | 0.779         |\\n| Top Close-source Model2 | 3.368     | 49.744      | 294.628     | 0.806         |\\n| Top Close-source Model3 | 3.218     | 51.574      | 295.691     | 0.799         |\\n| Hunyuan3D 2.0           | **3.193** | **49.165**  | **282.429** | **0.809**     |\\n\\nGeneration results of Hunyuan3D 2.0:\\n\\n  \\n  \\n\\n\\n### Pretrained Models\\n\\n| Model                | Date       | Huggingface                                            |\\n|----------------------|------------|--------------------------------------------------------| \\n| Hunyuan3D-DiT-v2-0   | 2025-01-21 |  |\\n| Hunyuan3D-Paint-v2-0 | 2025-01-21 |  |\\n| Hunyuan3D-Delight-v2-0 | 2025-01-21 |  |\\n\\n## 🤗 Get Started with Hunyuan3D 2.0\\n\\nYou may follow the next steps to use Hunyuan3D 2.0 via code or the Gradio App.\\n\\n### Install Requirements\\n\\nPlease install Pytorch via the  site. 
Then install the other requirements via\\n\\n```bash\\npip install -r requirements.txt\\n# for texture\\ncd hy3dgen/texgen/custom_rasterizer\\npython3 setup.py install\\ncd ../../..\\ncd hy3dgen/texgen/differentiable_renderer\\nbash compile_mesh_painter.sh OR python3 setup.py install (on Windows)\\n```\\n\\n### API Usage\\n\\nWe designed a diffusers-like API to use our shape generation model - Hunyuan3D-DiT and texture synthesis model -\\nHunyuan3D-Paint.\\n\\nYou could assess **Hunyuan3D-DiT** via:\\n\\n```python\\nfrom hy3dgen.shapegen import Hunyuan3DDiTFlowMatchingPipeline\\n\\npipeline = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained('tencent/Hunyuan3D-2')\\nmesh = pipeline(image='assets/demo.png')[0]\\n```\\n\\nThe output mesh is a , which you could save to glb/obj (or other\\nformat) file.\\n\\nFor **Hunyuan3D-Paint**, do the following:\\n\\n```python\\nfrom hy3dgen.texgen import Hunyuan3DPaintPipeline\\nfrom hy3dgen.shapegen import Hunyuan3DDiTFlowMatchingPipeline\\n\\n# let's generate a mesh first\\npipeline = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained('tencent/Hunyuan3D-2')\\nmesh = pipeline(image='assets/demo.png')[0]\\n\\npipeline = Hunyuan3DPaintPipeline.from_pretrained('tencent/Hunyuan3D-2')\\nmesh = pipeline(mesh, image='assets/demo.png')\\n```\\n\\nPlease visit  for more advanced usage, such as **text to 3D** and **texture generation\\nfor handcrafted mesh**.\\n\\n### Gradio App\\n\\nYou could also host a  App in your own computer via:\\n\\n```bash\\npip3 install gradio==3.39.0\\npython3 gradio_app.py\\n```\\n\\nDon't forget to visit  for quick use, if you don't want to host yourself.\\n\\n## 📑 Open-Source Plan\\n\\n- [x] Inference Code\\n- [x] Model Checkpoints\\n- [x] Technical Report\\n- [ ] ComfyUI\\n- [ ] TensorRT Version\\n\\n## 🔗 BibTeX\\n\\nIf you found this repository helpful, please cite our report:\\n\\n```bibtex\\n@misc{hunyuan3d22025tencent,\\n    title={Hunyuan3D 2.0: Scaling Diffusion Models for High Resolution Textured 3D 
Assets Generation},\\n    author={Tencent Hunyuan3D Team},\\n    year={2025},\\n    eprint={2501.12202},\\n    archivePrefix={arXiv},\\n    primaryClass={cs.CV}\\n}\\n\\n@misc{yang2024tencent,\\n    title={Tencent Hunyuan3D-1.0: A Unified Framework for Text-to-3D and Image-to-3D Generation},\\n    author={Tencent Hunyuan3D Team},\\n    year={2024},\\n    eprint={2411.02293},\\n    archivePrefix={arXiv},\\n    primaryClass={cs.CV}\\n}\\n```\\n\\n## Community Resources\\n\\nThanks for the contributions of community members, here we have these great extensions of Hunyuan3D 2.0:\\n\\n- \\n- \\n- \\n\\n## Acknowledgements\\n\\nWe would like to thank the contributors to\\nthe , , , \\nand  repositories, for their open research and exploration.\\n\\n## Star History\\n\\n\\n \\n   \\n   \\n   \\n \\n\",\n",
       "  'domain': 'text-to-3d'},\n",
       " {'model_id': 'yyfz233/Pi3',\n",
       "  'created_at': '2025-07-14T09:37:03+00:00',\n",
       "  'downloads': 37654,\n",
       "  'likes': 12,\n",
       "  'author': None,\n",
       "  'tags': ['pytorch',\n",
       "   'safetensors',\n",
       "   'model_hub_mixin',\n",
       "   'pytorch_model_hub_mixin',\n",
       "   'image-to-3d',\n",
       "   'arxiv:2507.13347',\n",
       "   'license:bsd-2-clause',\n",
       "   'region:us'],\n",
       "  'modelcard': '\\n# 🌌 $\\\\\\\\pi^3$: Scalable Permutation-Equivariant Visual Geometry Learning\\n\\n\\n    \\n        \\n        \\n        \\n        \\n        \\n        \\n        \\n        \\n        \\n        \\n        \\n        \\n    \\n\\n\\n\\n    \\n        \\n    \\n    \\n        $\\\\\\\\pi^3$ reconstructs visual geometry without a fixed reference view, achieving robust, state-of-the-art performance.\\n    \\n\\n\\n## ✨ Overview\\n\\nWe introduce $\\\\\\\\pi^3$ (Pi-Cubed), a novel feed-forward neural network that revolutionizes visual geometry reconstruction by **eliminating the need for a fixed reference view**. Traditional methods, which rely on a designated reference frame, are often prone to instability and failure if the reference is suboptimal.\\n\\nIn contrast, $\\\\\\\\pi^3$ employs a fully **permutation-equivariant** architecture. This allows it to directly predict affine-invariant camera poses and scale-invariant local point maps from an unordered set of images, breaking free from the constraints of a reference frame. This design makes our model inherently **robust to input ordering** and **highly scalable**.\\n\\nA key emergent property of our simple, bias-free design is the learning of a dense and structured latent representation of the camera pose manifold. Without complex priors or training schemes, $\\\\\\\\pi^3$ achieves **state-of-the-art performance** 🏆 on a wide range of tasks, including camera pose estimation, monocular/video depth estimation, and dense point map estimation.\\n\\n## 🚀 Quick Start\\n\\n### 1. Clone & Install Dependencies\\nFirst, clone the repository and install the required packages.\\n```bash\\ngit clone \\ncd Pi3\\npip install -r requirements.txt\\n```\\n\\n### 2. Run Inference from Command Line\\n\\nTry our example inference script. 
You can run it on a directory of images or a video file.\\n\\nIf the automatic download from Hugging Face is slow, you can download the model checkpoint manually from  and specify its local path using the `--ckpt` argument.\\n\\n```bash\\n# Run with default example video\\npython example.py\\n\\n# Run on your own data (image folder or .mp4 file)\\npython example.py --data_path \\n```\\n\\n**Optional Arguments:**\\n\\n  * `--data_path`: Path to the input image directory or a video file. (Default: `examples/skating.mp4`)\\n  * `--save_path`: Path to save the output `.ply` point cloud. (Default: `examples/result.ply`)\\n  * `--interval`: Frame sampling interval. (Default: `1` for images, `10` for video)\\n  * `--ckpt`: Path to a custom model checkpoint file.\\n  * `--device`: Device to run inference on. (Default: `cuda`)\\n\\n### 3. Run with Gradio Demo\\n\\nYou can also launch a local Gradio demo for an interactive experience.\\n\\n```bash\\n# Install demo-specific requirements\\npip install -r requirements_demo.txt\\n\\n# Launch the demo\\npython demo_gradio.py\\n```\\n\\n## 🛠️ Detailed Usage\\n\\n### Model Input & Output\\n\\nThe model takes a tensor of images and outputs a dictionary containing the reconstructed geometry.\\n\\n  * **Input**: A `torch.Tensor` of shape $B \\\\times N \\\\times 3 \\\\times H \\\\times W$ with pixel values in the range `[0, 1]`.\\n  * **Output**: A `dict` with the following keys:\\n      * `points`: Global point cloud unprojected by `local points` and `camera_poses` (`torch.Tensor`, $B \\\\times N \\\\times H \\\\times W \\\\times 3$).\\n      * `local_points`: Per-view local point maps (`torch.Tensor`,  $B \\\\times N \\\\times H \\\\times W \\\\times 3$).\\n      * `conf`: Confidence scores for local points (values in `[0, 1]`, higher is better) (`torch.Tensor`,  $B \\\\times N \\\\times H \\\\times W \\\\times 1$).\\n      * `camera_poses`: Camera-to-world transformation matrices (`4x4` in OpenCV format) (`torch.Tensor`,  $B 
\\\\times N \\\\times 4 \\\\times 4$).\\n\\n### Example Code Snippet\\n\\nHere is a minimal example of how to run the model on a batch of images.\\n\\n```python\\nimport torch\\nfrom pi3.models.pi3 import Pi3\\nfrom pi3.utils.basic import load_images_as_tensor # Assuming you have a helper function\\n\\n# --- Setup ---\\ndevice = \\'cuda\\' if torch.cuda.is_available() else \\'cpu\\'\\nmodel = Pi3.from_pretrained(\"yyfz233/Pi3\").to(device).eval()\\n# or download checkpoints from `\\n\\n# --- Load Data ---\\n# Load a sequence of N images into a tensor\\n# imgs shape: (N, 3, H, W).\\n# imgs value: [0, 1]\\nimgs = load_images_as_tensor(\\'examples/skating.mp4\\', interval=10).to(device)\\n\\n# --- Inference ---\\nprint(\"Running model inference...\")\\n# Use mixed precision for better performance on compatible GPUs\\ndtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8 else torch.float16\\n\\nwith torch.no_grad():\\n    with torch.amp.autocast(\\'cuda\\', dtype=dtype):\\n        # Add a batch dimension -> (1, N, 3, H, W)\\n        results = model(imgs[None])\\n\\nprint(\"Reconstruction complete!\")\\n# Access outputs: results[\\'points\\'], results[\\'camera_poses\\'] and results[\\'local_points\\'].\\n```\\n\\n## 🙏 Acknowledgements\\n\\nOur work builds upon several fantastic open-source projects. 
We\\'d like to express our gratitude to the authors of:\\n\\n  * \\n  * \\n  * \\n\\n## 📜 Citation\\n\\nIf you find our work useful, please consider citing:\\n\\n```bibtex\\n@misc{wang2025pi3,\\n      title={$\\\\\\\\pi^3$: Scalable Permutation-Equivariant Visual Geometry Learning}, \\n      author={Yifan Wang and Jianjun Zhou and Haoyi Zhu and Wenzheng Chang and Yang Zhou and Zizun Li and Junyi Chen and Jiangmiao Pang and Chunhua Shen and Tong He},\\n      year={2025},\\n      eprint={2507.13347},\\n      archivePrefix={arXiv},\\n      primaryClass={cs.CV},\\n      url={ \\n}\\n```\\n\\n## 📄 License\\nFor academic use, this project is licensed under the 2-clause BSD License. See the  file for details. For commercial use, please contact the authors.',\n",
       "  'domain': 'image-to-3d'},\n",
       " {'model_id': 'TianheWu/VisualQuality-R1-7B',\n",
       "  'created_at': '2025-05-25T06:59:49+00:00',\n",
       "  'downloads': 9164,\n",
       "  'likes': 9,\n",
       "  'author': None,\n",
       "  'tags': ['safetensors',\n",
       "   'qwen2_5_vl',\n",
       "   'IQA',\n",
       "   'Reasoning',\n",
       "   'VLM',\n",
       "   'Pytorch',\n",
       "   'R1',\n",
       "   'GRPO',\n",
       "   'RL2R',\n",
       "   'reinforcement-learning',\n",
       "   'en',\n",
       "   'arxiv:2505.14460',\n",
       "   'base_model:Qwen/Qwen2.5-VL-7B-Instruct',\n",
       "   'base_model:finetune:Qwen/Qwen2.5-VL-7B-Instruct',\n",
       "   'license:mit',\n",
       "   'region:us'],\n",
       "  'modelcard': '\\n# VisualQuality-R1-7B\\nOur Paper has been accept as **spotlight** in NeurIPS 2025!\\nThis is the latest version of VisualQuality-R1, trained on a diverse combination of synthetic and realistic datasets.\\nPaper link: \\nCode link: \\n\\n> The first NR-IQA model enhanced by RL2R, capable of both quality description and rating through reasoning.\\n\\n\\n\\n\\n\\n## ⚡Quick Start\\n\\n### Non-Thinking Inference\\nWhen you execute inference with VisualQuality-R1 as a reward/evaluation model, you can only use **non-thinking** mode to reduce inference time, generating only a single output token with the following prompt:\\n```\\nPROMPT = (\\n    \"You are doing the image quality assessment task. Here is the question: \"\\n    \"What is your overall rating on the quality of this picture? The rating should be a float between 1 and 5, \"\\n    \"rounded to two decimal places, with 1 representing very poor quality and 5 representing excellent quality.\"\\n)\\n\\nQUESTION_TEMPLATE = \"{Question} Please only output the final answer with only one score in   tags.\"\\n```\\n\\nFor single image quality rating, the code is:\\n\\n\\nExample Code (VisualQuality-R1: Image Quality Rating with non-thinking mode)\\n\\n```python\\nfrom transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor\\nfrom qwen_vl_utils import process_vision_info\\n\\nimport torch\\nimport random\\nimport re\\nimport os\\n\\n\\ndef score_image(image_path, model, processor):\\n    PROMPT = (\\n        \"You are doing the image quality assessment task. Here is the question: \"\\n        \"What is your overall rating on the quality of this picture? 
The rating should be a float between 1 and 5, \"\\n        \"rounded to two decimal places, with 1 representing very poor quality and 5 representing excellent quality.\"\\n    )\\n    \\n    QUESTION_TEMPLATE = \"{Question} Please only output the final answer with only one score in   tags.\"\\n    message = [\\n        {\\n            \"role\": \"user\",\\n            \"content\": [\\n                {\\'type\\': \\'image\\', \\'image\\': image_path},\\n                {\"type\": \"text\", \"text\": QUESTION_TEMPLATE.format(Question=PROMPT)}\\n            ],\\n        }\\n    ]\\n\\n    batch_messages = [message]\\n\\n    # Preparation for inference\\n    text = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True, add_vision_id=True) for msg in batch_messages]\\n    image_inputs, video_inputs = process_vision_info(batch_messages)\\n    inputs = processor(\\n        text=text,\\n        images=image_inputs,\\n        videos=video_inputs,\\n        padding=True,\\n        return_tensors=\"pt\",\\n    )\\n    inputs = inputs.to(device)\\n\\n    # Inference: Generation of the output\\n    generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=2048, do_sample=True, top_k=50, top_p=1)\\n    generated_ids_trimmed = [\\n        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\\n    ]\\n    batch_output_text = processor.batch_decode(\\n        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False\\n    )\\n\\n    reasoning = None\\n\\n    try:\\n        model_output_matches = re.findall(r\\'(.*?)\\', batch_output_text[0], re.DOTALL)\\n        model_answer = model_output_matches[-1].strip() if model_output_matches else batch_output_text[0].strip()\\n        score = float(re.search(r\\'\\\\d+(\\\\.\\\\d+)?\\', model_answer).group())\\n    except:\\n        print(f\"================= Meet error with {img_path}, please generate again. 
=================\")\\n        score = random.randint(1, 5)\\n\\n    return reasoning, score\\n\\n\\nrandom.seed(1)\\nMODEL_PATH = \"\"\\ndevice = torch.device(\"cuda:5\") if torch.cuda.is_available() else torch.device(\"cpu\")\\nimage_path = \"\"\\n\\nmodel = Qwen2_5_VLForConditionalGeneration.from_pretrained(\\n    MODEL_PATH,\\n    torch_dtype=torch.bfloat16,\\n    attn_implementation=\"flash_attention_2\",\\n    device_map=device,\\n)\\nprocessor = AutoProcessor.from_pretrained(MODEL_PATH)\\nprocessor.tokenizer.padding_side = \"left\"\\n\\nreasoning, score = score_image(\\n    image_path, model, processor\\n)\\n\\nprint(score)\\n```\\n\\n\\n\\n\\nExample Code (VisualQuality-R1: Batch Images Quality Rating with non-thinking mode)\\n\\n```python\\nfrom transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor\\nfrom qwen_vl_utils import process_vision_info\\nfrom tqdm import tqdm\\n\\nimport torch\\nimport random\\nimport re\\nimport os\\n\\n\\ndef get_image_paths(folder_path):\\n    image_extensions = {\\'.jpg\\', \\'.jpeg\\', \\'.png\\', \\'.bmp\\', \\'.gif\\', \\'.tiff\\', \\'.webp\\'}\\n    image_paths = []\\n\\n    for root, dirs, files in os.walk(folder_path):\\n        for file in files:\\n            _, ext = os.path.splitext(file)\\n            if ext.lower() in image_extensions:\\n                image_paths.append(os.path.join(root, file))\\n\\n    return image_paths\\n\\ndef score_batch_image(image_paths, model, processor):\\n    PROMPT = (\\n        \"You are doing the image quality assessment task. Here is the question: \"\\n        \"What is your overall rating on the quality of this picture? 
The rating should be a float between 1 and 5, \"\\n        \"rounded to two decimal places, with 1 representing very poor quality and 5 representing excellent quality.\"\\n    )\\n\\n    QUESTION_TEMPLATE = \"{Question} Please only output the final answer with only one score in   tags.\"\\n\\n    messages = []\\n    for img_path in image_paths:\\n        message = [\\n            {\\n                \"role\": \"user\",\\n                \"content\": [\\n                    {\\'type\\': \\'image\\', \\'image\\': img_path},\\n                    {\"type\": \"text\", \"text\": QUESTION_TEMPLATE.format(Question=PROMPT)}\\n                ],\\n            }\\n        ]\\n        messages.append(message)\\n\\n    BSZ = 32\\n    all_outputs = []  # List to store all answers\\n    for i in tqdm(range(0, len(messages), BSZ)):\\n        batch_messages = messages[i:i + BSZ]\\n    \\n        # Preparation for inference\\n        text = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True, add_vision_id=True) for msg in batch_messages]\\n        \\n        image_inputs, video_inputs = process_vision_info(batch_messages)\\n        inputs = processor(\\n            text=text,\\n            images=image_inputs,\\n            videos=video_inputs,\\n            padding=True,\\n            return_tensors=\"pt\",\\n        )\\n        inputs = inputs.to(device)\\n\\n        # Inference: Generation of the output\\n        generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=512, do_sample=True, top_k=50, top_p=1)\\n        generated_ids_trimmed = [\\n            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\\n        ]\\n        batch_output_text = processor.batch_decode(\\n            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False\\n        )\\n\\n        all_outputs.extend(batch_output_text)\\n    \\n    path_score_dict = {}\\n    for img_path, model_output in 
zip(image_paths, all_outputs):\\n        try:\\n            model_output_matches = re.findall(r\\'(.*?)\\', model_output, re.DOTALL)\\n            model_answer = model_output_matches[-1].strip() if model_output_matches else model_output.strip()\\n            score = float(re.search(r\\'\\\\d+(\\\\.\\\\d+)?\\', model_answer).group())\\n        except:\\n            print(f\"Meet error with {img_path}, please generate again.\")\\n            score = random.randint(1, 5)\\n\\n        path_score_dict[img_path] = score\\n\\n    return path_score_dict\\n\\n\\nrandom.seed(1)\\nMODEL_PATH = \"\"\\ndevice = torch.device(\"cuda:3\") if torch.cuda.is_available() else torch.device(\"cpu\")\\n\\nmodel = Qwen2_5_VLForConditionalGeneration.from_pretrained(\\n    MODEL_PATH,\\n    torch_dtype=torch.bfloat16,\\n    attn_implementation=\"flash_attention_2\",\\n    device_map=device,\\n)\\nprocessor = AutoProcessor.from_pretrained(MODEL_PATH)\\nprocessor.tokenizer.padding_side = \"left\"\\n\\nimage_root = \"\"\\nimage_paths = get_image_paths(image_root) # It should be a list\\n\\npath_score_dict = score_batch_image(\\n    image_paths, model, processor\\n)\\n\\nfile_name = \"output.txt\"\\nwith open(file_name, \"w\") as file:\\n    for key, value in path_score_dict.items():\\n        file.write(f\"{key} {value}\\\\n\") \\n\\nprint(\"Done!\")\\n```\\n\\n\\n### Thinking mode for inference\\n\\n\\nExample Code (VisualQuality-R1: Single Image Quality Rating with thinking)\\n    \\n```python\\nfrom transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor\\nfrom qwen_vl_utils import process_vision_info\\n\\nimport torch\\nimport random\\nimport re\\nimport os\\n\\n\\ndef score_image(image_path, model, processor):\\n    PROMPT = (\\n        \"You are doing the image quality assessment task. Here is the question: \"\\n        \"What is your overall rating on the quality of this picture? 
The rating should be a float between 1 and 5, \"\\n        \"rounded to two decimal places, with 1 representing very poor quality and 5 representing excellent quality.\"\\n    )\\n        \\n    QUESTION_TEMPLATE = \"{Question} First output the thinking process in   tags and then output the final answer with only one score in   tags.\"\\n    # QUESTION_TEMPLATE = \"Please describe the quality of this image.\"\\n    message = [\\n        {\\n            \"role\": \"user\",\\n            \"content\": [\\n                {\\'type\\': \\'image\\', \\'image\\': image_path},\\n                {\"type\": \"text\", \"text\": QUESTION_TEMPLATE.format(Question=PROMPT)}\\n            ],\\n        }\\n    ]\\n\\n    batch_messages = [message]\\n\\n    # Preparation for inference\\n    text = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True, add_vision_id=True) for msg in batch_messages]\\n    image_inputs, video_inputs = process_vision_info(batch_messages)\\n    inputs = processor(\\n        text=text,\\n        images=image_inputs,\\n        videos=video_inputs,\\n        padding=True,\\n        return_tensors=\"pt\",\\n    )\\n    inputs = inputs.to(device)\\n\\n    # Inference: Generation of the output\\n    generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=2048, do_sample=True, top_k=50, top_p=1)\\n    generated_ids_trimmed = [\\n        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\\n    ]\\n    batch_output_text = processor.batch_decode(\\n        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False\\n    )\\n\\n    reasoning = re.findall(r\\'(.*?)\\', batch_output_text[0], re.DOTALL)\\n    reasoning = reasoning[-1].strip()\\n\\n    try:\\n        model_output_matches = re.findall(r\\'(.*?)\\', batch_output_text[0], re.DOTALL)\\n        model_answer = model_output_matches[-1].strip() if model_output_matches else batch_output_text[0].strip()\\n        
score = float(re.search(r\\'\\\\d+(\\\\.\\\\d+)?\\', model_answer).group())\\n    except:\\n        print(f\"================= Meet error with {img_path}, please generate again. =================\")\\n        score = random.randint(1, 5)\\n\\n    return reasoning, score\\n\\n\\nrandom.seed(1)\\nMODEL_PATH = \"\"\\ndevice = torch.device(\"cuda:5\") if torch.cuda.is_available() else torch.device(\"cpu\")\\nimage_path = \"\"\\n\\nmodel = Qwen2_5_VLForConditionalGeneration.from_pretrained(\\n    MODEL_PATH,\\n    torch_dtype=torch.bfloat16,\\n    attn_implementation=\"flash_attention_2\",\\n    device_map=device,\\n)\\nprocessor = AutoProcessor.from_pretrained(MODEL_PATH)\\nprocessor.tokenizer.padding_side = \"left\"\\n\\nreasoning, score = score_image(\\n    image_path, model, processor\\n)\\n\\nprint(reasoning)\\nprint(score)\\n```\\n\\n\\n\\n\\nExample Code (VisualQuality-R1: Batch Images Quality Rating with thinking)\\n\\n```python\\nfrom transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor\\nfrom qwen_vl_utils import process_vision_info\\nfrom tqdm import tqdm\\n\\nimport torch\\nimport random\\nimport re\\nimport os\\n\\n\\ndef get_image_paths(folder_path):\\n    image_extensions = {\\'.jpg\\', \\'.jpeg\\', \\'.png\\', \\'.bmp\\', \\'.gif\\', \\'.tiff\\', \\'.webp\\'}\\n    image_paths = []\\n\\n    for root, dirs, files in os.walk(folder_path):\\n        for file in files:\\n            _, ext = os.path.splitext(file)\\n            if ext.lower() in image_extensions:\\n                image_paths.append(os.path.join(root, file))\\n\\n    return image_paths\\n\\ndef score_batch_image(image_paths, model, processor):\\n    PROMPT = (\\n        \"You are doing the image quality assessment task. Here is the question: \"\\n        \"What is your overall rating on the quality of this picture? 
The rating should be a float between 1 and 5, \"\\n        \"rounded to two decimal places, with 1 representing very poor quality and 5 representing excellent quality.\"\\n    )\\n\\n    QUESTION_TEMPLATE = \"{Question} First output the thinking process in   tags and then output the final answer with only one score in   tags.\"\\n\\n    messages = []\\n    for img_path in image_paths:\\n        message = [\\n            {\\n                \"role\": \"user\",\\n                \"content\": [\\n                    {\\'type\\': \\'image\\', \\'image\\': img_path},\\n                    {\"type\": \"text\", \"text\": QUESTION_TEMPLATE.format(Question=PROMPT)}\\n                ],\\n            }\\n        ]\\n        messages.append(message)\\n\\n    BSZ = 32\\n    all_outputs = []  # List to store all answers\\n    for i in tqdm(range(0, len(messages), BSZ)):\\n        batch_messages = messages[i:i + BSZ]\\n    \\n        # Preparation for inference\\n        text = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True, add_vision_id=True) for msg in batch_messages]\\n        \\n        image_inputs, video_inputs = process_vision_info(batch_messages)\\n        inputs = processor(\\n            text=text,\\n            images=image_inputs,\\n            videos=video_inputs,\\n            padding=True,\\n            return_tensors=\"pt\",\\n        )\\n        inputs = inputs.to(device)\\n\\n        # Inference: Generation of the output\\n        generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=512, do_sample=True, top_k=50, top_p=1)\\n        generated_ids_trimmed = [\\n            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\\n        ]\\n        batch_output_text = processor.batch_decode(\\n            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False\\n        )\\n\\n        all_outputs.extend(batch_output_text)\\n    \\n    path_score_dict = {}\\n 
   for img_path, model_output in zip(image_paths, all_outputs):\\n        reasoning = re.findall(r\\'(.*?)\\', model_output, re.DOTALL)\\n        reasoning = reasoning[-1].strip()\\n\\n        try:\\n            model_output_matches = re.findall(r\\'(.*?)\\', model_output, re.DOTALL)\\n            model_answer = model_output_matches[-1].strip() if model_output_matches else model_output.strip()\\n            score = float(re.search(r\\'\\\\d+(\\\\.\\\\d+)?\\', model_answer).group())\\n        except:\\n            print(f\"Meet error with {img_path}, please generate again.\")\\n            score = random.randint(1, 5)\\n\\n        path_score_dict[img_path] = score\\n\\n    return path_score_dict\\n\\n\\nrandom.seed(1)\\nMODEL_PATH = \"\"\\ndevice = torch.device(\"cuda:3\") if torch.cuda.is_available() else torch.device(\"cpu\")\\n\\nmodel = Qwen2_5_VLForConditionalGeneration.from_pretrained(\\n    MODEL_PATH,\\n    torch_dtype=torch.bfloat16,\\n    attn_implementation=\"flash_attention_2\",\\n    device_map=device,\\n)\\nprocessor = AutoProcessor.from_pretrained(MODEL_PATH)\\nprocessor.tokenizer.padding_side = \"left\"\\n\\nimage_root = \"\"\\nimage_paths = get_image_paths(image_root) # It should be a list\\n\\npath_score_dict = score_batch_image(\\n    image_paths, model, processor\\n)\\n\\nfile_name = \"output.txt\"\\nwith open(file_name, \"w\") as file:\\n    for key, value in path_score_dict.items():\\n        file.write(f\"{key} {value}\\\\n\") \\n\\nprint(\"Done!\")\\n```\\n\\n\\n\\n## 🚀 Updated: VisualQuality-R1 high efficiency inference script with vLLM\\n\\n\\nExample Code (VisualQuality-R1: Batch Images Quality Rating with thinking, using vLLM)\\n\\n```python\\n# Please install vLLM first: \\n\\nfrom transformers import Qwen2_5_VLProcessor, AutoProcessor\\nfrom vllm import LLM, RequestOutput, SamplingParams\\nfrom qwen_vl_utils import process_vision_info\\n\\nimport torch\\nimport random\\nimport re\\nimport os\\n\\nIMAGE_PATH = \"./images\"\\nMODEL_PATH = 
\"TianheWu/VisualQuality-R1-7B\"\\n\\ndef get_image_paths(folder_path):\\n    image_extensions = {\\'.jpg\\', \\'.jpeg\\', \\'.png\\', \\'.bmp\\', \\'.gif\\', \\'.tiff\\', \\'.webp\\'}\\n    image_paths = []\\n\\n    for root, dirs, files in os.walk(folder_path):\\n        for file in files:\\n            _, ext = os.path.splitext(file)\\n            if ext.lower() in image_extensions:\\n                image_paths.append(os.path.join(root, file))\\n\\n    return image_paths\\n\\ndef score_batch_image(image_paths, model: LLM, processor: Qwen2_5_VLProcessor):\\n    PROMPT = (\\n        \"You are doing the image quality assessment task. Here is the question: \"\\n        \"What is your overall rating on the quality of this picture? The rating should be a float between 1 and 5, \"\\n        \"rounded to two decimal places, with 1 representing very poor quality and 5 representing excellent quality.\"\\n    )\\n\\n    QUESTION_TEMPLATE = \"{Question} First output the thinking process in   tags and then output the final answer with only one score in   tags.\"\\n\\n    messages = []\\n    for img_path in image_paths:\\n        message = [\\n            {\\n                \"role\": \"user\",\\n                \"content\": [\\n                    {\\'type\\': \\'image\\', \\'image\\': img_path},\\n                    {\"type\": \"text\", \"text\": QUESTION_TEMPLATE.format(Question=PROMPT)}\\n                ],\\n            }\\n        ]\\n        messages.append(message)\\n\\n    all_outputs = []  # List to store all answers\\n\\n    # Preparation for inference\\n    print(\"preprocessing ...\")\\n    texts = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True, add_vision_id=True) for msg in messages]\\n    image_inputs, video_inputs = process_vision_info(messages)\\n\\n    inputs = [{\\n        \"prompt\": texts[i],\\n        \"multi_modal_data\": {\\n            \"image\": image_inputs[i]\\n        },\\n    } for i in range(len(messages))]\\n  
  \\n    output: list[RequestOutput] = model.generate(\\n        inputs,\\n        sampling_params=SamplingParams(\\n            max_tokens=512,\\n            temperature=0.1,\\n            top_k=50,\\n            top_p=1.0,\\n            stop_token_ids=[processor.tokenizer.eos_token_id],\\n        ),\\n    )\\n\\n    batch_output_text = [o.outputs[0].text for o in output]\\n\\n    all_outputs.extend(batch_output_text)\\n    \\n    path_score_dict = {}\\n    for img_path, model_output in zip(image_paths, all_outputs):\\n        print(f\"{model_output = }\")\\n        try:\\n            model_output_matches = re.findall(r\\'(.*?)\\', model_output, re.DOTALL)\\n            model_answer = model_output_matches[-1].strip() if model_output_matches else model_output.strip()\\n            score = float(re.search(r\\'\\\\d+(\\\\.\\\\d+)?\\', model_answer).group())\\n        except:\\n            print(f\"Meet error with {img_path}, please generate again.\")\\n            score = random.randint(1, 5)\\n\\n        path_score_dict[img_path] = score\\n\\n    return path_score_dict\\n\\n\\nrandom.seed(1)\\nmodel = LLM(\\n    model=MODEL_PATH,\\n    tensor_parallel_size=1,\\n    trust_remote_code=True,\\n    seed=1,\\n)\\n\\nprocessor = AutoProcessor.from_pretrained(MODEL_PATH)\\nprocessor.tokenizer.padding_side = \"left\"\\n\\nimage_paths = get_image_paths(IMAGE_PATH) # It should be a list\\n\\npath_score_dict = score_batch_image(\\n    image_paths, model, processor\\n)\\n\\nfile_name = \"output.txt\"\\nwith open(file_name, \"w\") as file:\\n    for key, value in path_score_dict.items():\\n        file.write(f\"{key} {value}\\\\n\") \\n\\nprint(\"Done!\")\\n```\\n\\n\\n## Training\\n\\n### Preparation\\n1. To smoothly execute the training procedure, first download the IQA images and place them all in a **single folder**.\\n2. 
Given an original MOS file (e.g., KADID-10K_mos.txt), first execute `cd datasets`, then run `python make_data.py` (with moderate modifications) to generate a **JSON file** for model training.\\n3. Download the  into a folder.\\n\\n### Training within a Single Node\\nPlease modify three elements in `src/open-r1-multimodal/run_scripts/KADID-10K/one_node_run_kadid.sh`:\\n```\\n--model_name_or_path [Your Qwen2.5-VL-7B-Instruct path] \\\\\\n--image_folders [Your dataset images path] \\\\\\n--data_file_paths [Your JSON file path] \\\\\\n```\\nThen, run:\\n```\\nbash src/open-r1-multimodal/run_scripts/KADID-10K/one_node_run_kadid.sh\\n```\\n\\n### Training within Multiple Nodes\\nAfter making the necessary modifications, run the following command:\\n```\\nbash src/open-r1-multimodal/run_scripts/KADID-10K/multi_run_kadid.sh\\n```\\n\\n\\n## Acknowledgement\\n- : We start from codebase from the VLM-R1.\\n\\nI would like to sincerely thank  for the generous support of my project and for the invaluable guidance in the field of AR generation.\\n\\n\\n## 📧 Contact\\nIf you have any question, please email `sigstianhewu@gmail.com` or `tianhewu-c@my.cityu.edu.hk`.\\n\\n## BibTeX\\n```\\n@article{wu2025visualquality,\\n  title={{VisualQuality-R1}: Reasoning-Induced Image Quality Assessment via Reinforcement Learning to Rank},\\n  author={Wu, Tianhe and Zou, Jian and Liang, Jie and Zhang, Lei and Ma, Kede},\\n  journal={arXiv preprint arXiv:2505.14460},\\n  year={2025}\\n}\\n```',\n",
       "  'domain': 'reinforcement-learning'},\n",
       " {'model_id': 'tasksource/ModernBERT-large-nli',\n",
       "  'created_at': '2025-01-04T00:56:11+00:00',\n",
       "  'downloads': 8855,\n",
       "  'likes': 9,\n",
       "  'author': None,\n",
       "  'tags': ['transformers',\n",
       "   'safetensors',\n",
       "   'modernbert',\n",
       "   'text-classification',\n",
       "   'instruct',\n",
       "   'natural-language-inference',\n",
       "   'nli',\n",
       "   'zero-shot-classification',\n",
       "   'en',\n",
       "   'dataset:nyu-mll/glue',\n",
       "   'dataset:facebook/anli',\n",
       "   'base_model:answerdotai/ModernBERT-large',\n",
       "   'base_model:finetune:answerdotai/ModernBERT-large',\n",
       "   'license:apache-2.0',\n",
       "   'endpoints_compatible',\n",
       "   'deploy:azure',\n",
       "   'region:us'],\n",
       "  'modelcard': '\\n# Model Card for Model ID\\n\\nThis model is ModernBERT multi-task fine-tuned on tasksource NLI tasks, including MNLI, ANLI, SICK, WANLI, doc-nli, LingNLI, FOLIO, FOL-NLI, LogicNLI, Label-NLI and all datasets in the below table).\\nThis is the equivalent of an \"instruct\" version.\\nThe model was trained for 200k steps on an Nvidia A30 GPU.\\n\\nIt is very good at reasoning tasks (better than llama 3.1 8B Instruct on ANLI and FOLIO), long context reasoning, sentiment analysis and zero-shot classification with new labels. \\n\\nThe following table shows model test accuracy. These are the scores for the same single transformer with different classification heads on top.\\nFurther gains can be obtained by fine-tuning on a single-task, e.g. SST, but it this checkpoint is great for zero-shot classification and natural language inference (contradiction/entailment/neutral classification).\\n\\n| test_name                             |   test_accuracy |\\n|:--------------------------------------|----------------:|\\n| glue/mnli                             |            0.89 |\\n| glue/qnli                             |            0.96 |\\n| glue/rte                              |            0.91 |\\n| glue/wnli                             |            0.64 |\\n| glue/mrpc                             |            0.81 |\\n| glue/qqp                              |            0.87 |\\n| glue/cola                             |            0.87 |\\n| glue/sst2                             |            0.96 |\\n| super_glue/boolq                      |            0.66 |\\n| super_glue/cb                         |            0.86 |\\n| super_glue/multirc                    |            0.9  |\\n| super_glue/wic                        |            0.71 |\\n| super_glue/axg                        |            1    |\\n| anli/a1                               |            0.72 |\\n| anli/a2                               |            0.54 |\\n| anli/a3         
                      |            0.55 |\\n| sick/label                            |            0.91 |\\n| sick/entailment_AB                    |            0.93 |\\n| snli                                  |            0.94 |\\n| scitail/snli_format                   |            0.95 |\\n| hans                                  |            1    |\\n| WANLI                                 |            0.77 |\\n| recast/recast_ner                     |            0.85 |\\n| recast/recast_sentiment               |            0.97 |\\n| recast/recast_verbnet                 |            0.89 |\\n| recast/recast_megaveridicality        |            0.87 |\\n| recast/recast_verbcorner              |            0.87 |\\n| recast/recast_kg_relations            |            0.9  |\\n| recast/recast_factuality              |            0.95 |\\n| recast/recast_puns                    |            0.98 |\\n| probability_words_nli/reasoning_1hop  |            1    |\\n| probability_words_nli/usnli           |            0.79 |\\n| probability_words_nli/reasoning_2hop  |            0.98 |\\n| nan-nli                               |            0.85 |\\n| nli_fever                             |            0.78 |\\n| breaking_nli                          |            0.99 |\\n| conj_nli                              |            0.72 |\\n| fracas                                |            0.79 |\\n| dialogue_nli                          |            0.94 |\\n| mpe                                   |            0.75 |\\n| dnc                                   |            0.91 |\\n| recast_white/fnplus                   |            0.76 |\\n| recast_white/sprl                     |            0.9  |\\n| recast_white/dpr                      |            0.84 |\\n| add_one_rte                           |            0.94 |\\n| paws/labeled_final                    |            0.96 |\\n| pragmeval/pdtb                        |            0.56 |\\n| lex_glue/scotus                 
      |            0.58 |\\n| lex_glue/ledgar                       |            0.85 |\\n| dynasent/dynabench.dynasent.r1.all/r1 |            0.83 |\\n| dynasent/dynabench.dynasent.r2.all/r2 |            0.76 |\\n| cycic_classification                  |            0.96 |\\n| lingnli                               |            0.91 |\\n| monotonicity-entailment               |            0.97 |\\n| scinli                                |            0.88 |\\n| naturallogic                          |            0.93 |\\n| dynahate                              |            0.86 |\\n| syntactic-augmentation-nli            |            0.94 |\\n| autotnli                              |            0.92 |\\n| defeasible-nli/atomic                 |            0.83 |\\n| defeasible-nli/snli                   |            0.8  |\\n| help-nli                              |            0.96 |\\n| nli-veridicality-transitivity         |            0.99 |\\n| lonli                                 |            0.99 |\\n| dadc-limit-nli                        |            0.79 |\\n| folio                                 |            0.71 |\\n| tomi-nli                              |            0.54 |\\n| puzzte                                |            0.59 |\\n| temporal-nli                          |            0.93 |\\n| counterfactually-augmented-snli       |            0.81 |\\n| cnli                                  |            0.9  |\\n| boolq-natural-perturbations           |            0.72 |\\n| equate                                |            0.65 |\\n| logiqa-2.0-nli                        |            0.58 |\\n| mindgames                             |            0.96 |\\n| ConTRoL-nli                           |            0.66 |\\n| logical-fallacy                       |            0.38 |\\n| cladder                               |            0.89 |\\n| conceptrules_v2                       |            1    |\\n| zero-shot-label-nli                   |         
   0.79 |\\n| scone                                 |            1    |\\n| monli                                 |            1    |\\n| SpaceNLI                              |            1    |\\n| propsegment/nli                       |            0.92 |\\n| FLD.v2/default                        |            0.91 |\\n| FLD.v2/star                           |            0.78 |\\n| SDOH-NLI                              |            0.99 |\\n| scifact_entailment                    |            0.87 |\\n| feasibilityQA                         |            0.79 |\\n| AdjectiveScaleProbe-nli               |            1    |\\n| resnli                                |            1    |\\n| semantic_fragments_nli                |            1    |\\n| dataset_train_nli                     |            0.95 |\\n| nlgraph                               |            0.97 |\\n| ruletaker                             |            0.99 |\\n| PARARULE-Plus                         |            1    |\\n| logical-entailment                    |            0.93 |\\n| nope                                  |            0.56 |\\n| LogicNLI                              |            0.91 |\\n| contract-nli/contractnli_a/seg        |            0.88 |\\n| contract-nli/contractnli_b/full       |            0.84 |\\n| nli4ct_semeval2024                    |            0.72 |\\n| biosift-nli                           |            0.92 |\\n| SIGA-nli                              |            0.57 |\\n| FOL-nli                               |            0.79 |\\n| doc-nli                               |            0.81 |\\n| mctest-nli                            |            0.92 |\\n| natural-language-satisfiability       |            0.92 |\\n| idioms-nli                            |            0.83 |\\n| lifecycle-entailment                  |            0.79 |\\n| MSciNLI                               |            0.84 |\\n| hover-3way/nli                        |            0.92 |\\n| 
seahorse_summarization_evaluation     |            0.81 |\\n| missing-item-prediction/contrastive   |            0.88 |\\n| Pol_NLI                               |            0.93 |\\n| synthetic-retrieval-NLI/count         |            0.72 |\\n| synthetic-retrieval-NLI/position      |            0.9  |\\n| synthetic-retrieval-NLI/binary        |            0.92 |\\n| babi_nli                              |            0.98 |\\n\\n\\n\\n# Usage\\n\\n## [ZS] Zero-shot classification pipeline\\n```python\\nfrom transformers import pipeline\\nclassifier = pipeline(\"zero-shot-classification\",model=\"tasksource/ModernBERT-large-nli\")\\n\\ntext = \"one day I will see the world\"\\ncandidate_labels = [\\'travel\\', \\'cooking\\', \\'dancing\\']\\nclassifier(text, candidate_labels)\\n```\\nNLI training data of this model includes , a NLI dataset specially constructed to improve this kind of zero-shot classification.\\n\\n## [NLI] Natural language inference pipeline\\n\\n```python\\nfrom transformers import pipeline\\npipe = pipeline(\"text-classification\",model=\"tasksource/ModernBERT-large-nli\")\\npipe([dict(text=\\'there is a cat\\',\\n  text_pair=\\'there is a black cat\\')]) #list of (premise,hypothesis)\\n```\\n\\n## Backbone for further fune-tuning\\n\\nThis checkpoint has stronger reasoning and fine-grained abilities than the base version and can be used for further fine-tuning.\\n\\n# Citation\\n\\n```\\n@inproceedings{sileo-2024-tasksource,\\n    title = \"tasksource: A Large Collection of {NLP} tasks with a Structured Dataset Preprocessing Framework\",\\n    author = \"Sileo, Damien\",\\n    booktitle = \"Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)\",\\n    month = may,\\n    year = \"2024\",\\n    address = \"Torino, Italia\",\\n    publisher = \"ELRA and ICCL\",\\n    url = \"\\n    pages = \"15655--15684\",\\n}\\n```',\n",
       "  'domain': 'zero-shot-classification'},\n",
       " {'model_id': 'knowledgator/gliner-multitask-v1.0',\n",
       "  'created_at': '2024-12-05T09:20:56+00:00',\n",
       "  'downloads': 3876,\n",
       "  'likes': 35,\n",
       "  'author': None,\n",
       "  'tags': ['gliner',\n",
       "   'pytorch',\n",
       "   'NER',\n",
       "   'information extraction',\n",
       "   'relation extraction',\n",
       "   'summarization',\n",
       "   'sentiment extraction',\n",
       "   'question-answering',\n",
       "   'token-classification',\n",
       "   'en',\n",
       "   'dataset:knowledgator/GLINER-multi-task-synthetic-data',\n",
       "   'arxiv:2406.12925',\n",
       "   'license:apache-2.0',\n",
       "   'region:us'],\n",
       "  'modelcard': '🚀 Meet the first multi-task prompt-tunable GLiNER model 🚀\\n\\n**GLiNER-Multitask** is a model designed to extract various pieces of information from plain text based on a user-provided custom prompt. This versatile model leverages a bidirectional transformer encoder, similar to BERT, which ensures both high generalization and compute efficiency despite its compact size.\\n\\nThe `gliner-multitask-v1.0` variant achieves state-of-the-art performance on NER zero-shot benchmarks, demonstrating its robustness and flexibility. It excels not only in named entity recognition but also in handling various other information extraction tasks, making it a powerful tool for diverse natural language processing applications.\\n\\n### Supported tasks:\\n* **Named Entity Recognition (NER)**: Identifies and categorizes entities such as names, organizations, dates, and other specific items in the text.\\n* **Relation Extraction**: Detects and classifies relationships between entities within the text.\\n* **Summarization**: Extract the most important sentences that summarize the input text, capturing the essential information.\\n* **Sentiment Extraction**: Identify parts of the text that signalize a positive, negative, or neutral sentiment;\\n* **Key-Phrase Extraction**: Identifies and extracts important phrases and keywords from the text.\\n* **Question-answering**: Finding an answer in the text given a question;\\n* **Open Information Extraction**: Extracts pieces of text given an open prompt from a user, for example, product description extraction;\\n* **Text classification**: Classifying text by matching labels specified in the prompt;\\n\\n\\n### Installation \\t\\nTo use this model, you must install the :\\n\\n```bash\\npip install gliner\\n```\\n\\nOnce you\\'ve downloaded the GLiNER library, you can import the GLiNER class. 
You can then load this model using GLiNER.from_pretrained.\\n\\n**How to use for NER:**\\n\\n```python\\nfrom gliner import GLiNER\\n\\nmodel = GLiNER.from_pretrained(\"knowledgator/gliner-multitask-v1.0\")\\n\\ntext = \"\"\"\\nMicrosoft was founded by Bill Gates and Paul Allen on April 4, 1975 to develop and sell BASIC interpreters for the Altair 8800. During his career at Microsoft, Gates held the positions of chairman, chief executive officer, president and chief software architect, while also being the largest individual shareholder until May 2014.\\n\"\"\"\\n\\nlabels = [\"founder\", \"computer\", \"software\", \"position\", \"date\"]\\n\\nentities = model.predict_entities(text, labels)\\n\\nfor entity in entities:\\n    print(entity[\"text\"], \"=>\", entity[\"label\"])\\n```\\n### Performance:\\n\\n| Model                              | Dataset            | Precision | Recall | F1 Score | F1 Score (Decimal) |\\n|------------------------------------|--------------------|-----------|--------|----------|--------------------|\\n| knowledgator/gliner-multitask-v0.5 | CrossNER_AI         | 51.00%    | 51.11% | 51.05%   | 0.5105             |\\n|                                    | CrossNER_literature | 72.65%    | 65.62% | 68.96%   | 0.6896             |\\n|                                    | CrossNER_music      | 74.91%    | 73.70% | 74.30%   | 0.7430             |\\n|                                    | CrossNER_politics   | 78.84%    | 77.71% | 78.27%   | 0.7827             |\\n|                                    | CrossNER_science    | 69.20%    | 65.48% | 67.29%   | 0.6729             |\\n|                                    | mit-movie           | 61.29%    | 52.59% | 56.60%   | 0.5660             |\\n|                                    | mit-restaurant      | 50.65%    | 38.13% | 43.51%   | 0.4351             |\\n|                                    | **Average**        |           |        |          | **0.6276**         |\\n| 
knowledgator/gliner-multitask-v1.0 | CrossNER_AI         | 67.15%    | 56.10% | 61.13%   | 0.6113             |\\n|                                    | CrossNER_literature | 71.60%    | 64.74% | 68.00%   | 0.6800             |\\n|                                    | CrossNER_music      | 73.57%    | 69.29% | 71.36%   | 0.7136             |\\n|                                    | CrossNER_politics   | 77.54%    | 76.52% | 77.03%   | 0.7703             |\\n|                                    | CrossNER_science    | 74.54%    | 66.00% | 70.01%   | 0.7001             |\\n|                                    | mit-movie           | 61.86%    | 42.02% | 50.04%   | 0.5004             |\\n|                                    | mit-restaurant      | 58.87%    | 36.67% | 45.19%   | 0.4519             |\\n|                                    | **Average**         |           |        |          | **0.6325**         |\\n| knowledgator/gliner-llama-multitask-1B-v1.0 | CrossNER_AI         | 63.24%    | 55.60% | 59.17%   | 0.5917             |\\n|                                    | CrossNER_literature | 69.74%    | 60.10% | 64.56%   | 0.6456             |\\n|                                    | CrossNER_music      | 74.03%    | 67.22% | 70.46%   | 0.7046             |\\n|                                    | CrossNER_politics   | 76.96%    | 71.64% | 74.20%   | 0.7420             |\\n|                                    | CrossNER_science    | 73.79%    | 63.73% | 68.39%   | 0.6839             |\\n|                                    | mit-movie           | 56.89%    | 46.70% | 51.30%   | 0.5130             |\\n|                                    | mit-restaurant      | 48.45%    | 38.13% | 42.67%   | 0.4267             |\\n|                                    | **Average**         |           |        |          | **0.6153**         |\\n\\n\\n---\\n**How to use for relation extraction:**\\n\\n```python\\ntext = \"\"\"\\nMicrosoft was founded by Bill Gates and Paul Allen 
on April 4, 1975 to develop and sell BASIC interpreters for the Altair 8800. During his career at Microsoft, Gates held the positions of chairman, chief executive officer, president and chief software architect, while also being the largest individual shareholder until May 2014.\\n\"\"\"\\n\\nlabels = [\"Microsoft <> founder\", \"Microsoft <> inception date\", \"Bill Gates <> held position\"]\\n\\nentities = model.predict_entities(text, labels)\\n\\nfor entity in entities:\\n    print(entity[\"label\"], \"=>\", entity[\"text\"])\\n```\\n### Construct relations extraction pipeline with \\nFirst of all, we need import neccessary components of the library and initalize predictor - GLiNER model and construct pipeline that combines NER and realtions extraction:\\n```python\\nfrom utca.core import RenameAttribute\\nfrom utca.implementation.predictors import (\\n    GLiNERPredictor,\\n    GLiNERPredictorConfig\\n)\\nfrom utca.implementation.tasks import (\\n    GLiNER,\\n    GLiNERPreprocessor,\\n    GLiNERRelationExtraction,\\n    GLiNERRelationExtractionPreprocessor,\\n)\\n\\npredictor = GLiNERPredictor( # Predictor manages the model that will be used by tasks\\n    GLiNERPredictorConfig(\\n        model_name = \"knowledgator/gliner-multitask-v1.0\", # Model to use\\n        device = \"cuda:0\", # Device to use\\n    )\\n)\\n\\npipe = (\\n    GLiNER( # GLiNER task produces classified entities that will be at the \"output\" key.\\n        predictor=predictor,\\n        preprocess=GLiNERPreprocessor(threshold=0.7) # Entities threshold\\n    ) \\n    | RenameAttribute(\"output\", \"entities\") # Rename output entities from GLiNER task to use them as inputs in GLiNERRelationExtraction\\n    | GLiNERRelationExtraction( # GLiNERRelationExtraction is used for relation extraction.\\n        predictor=predictor,\\n        preprocess=(\\n            GLiNERPreprocessor(threshold=0.5) # Relations threshold\\n            | GLiNERRelationExtractionPreprocessor()\\n        )\\n    
)\\n)\\n```\\n\\nTo run pipeline we need to specify entity types and relations with their parameters:\\n\\n```python\\nr = pipe.run({\\n    \"text\": text, # Text to process\\n    \"labels\": [\"organisation\", \"founder\", \"position\", \"date\"],\\n    \"relations\": [{ # Relation parameters\\n        \"relation\": \"founder\", # Relation label. Required parameter.\\n        \"pairs_filter\": [(\"organisation\", \"founder\")], # Optional parameter. It specifies possible members of relations by their entity labels.\\n        \"distance_threshold\": 100, # Optional parameter. It specifies the max distance between spans in the text (i.e., the end of the span that is closer to the start of the text and the start of the next one).\\n    }, {\\n        \"relation\": \"inception date\",\\n        \"pairs_filter\": [(\"organisation\", \"date\")],\\n    }, {\\n        \"relation\": \"held position\",\\n        \"pairs_filter\": [(\"founder\", \"position\")],\\n    }]\\n})\\n\\nprint(r[\"output\"])\\n```\\n\\n### Performance:\\n| Model                  |  Dataset     | Precision |   Recall |   F1 Score |\\n|:-----------------------|------------:|---------:|-----------:|-----------:|\\n| knowledgator/gliner-llama-multitask-1B-v1.0 | CrossRe       | 0.606472 | 0.511444 | 0.554919 |\\n|                                            | DocRed         | 0.707483 | 0.589355 |   0.643039 |\\n| knowledgator/gliner-multitask-v0.5  |   CrossRe | 0.585319 | 0.800176 | 0.676088 |\\n|                                            | DocRed         | 0.713392 | 0.772826 |   0.74192  |\\n|knowledgator/gliner-multitask-v1.0  |   CrossRe | 0.760653 | 0.738556 | 0.749442 |\\n|                                            | DocRed         | 0.770644 | 0.761373 |   0.76598  |\\n\\n---\\n\\n**How to use for open information extraction:**\\n\\n```python\\nprompt = \"\"\"Find all positive aspects about the product:\\\\n\"\"\"\\ntext = \"\"\"\\nI recently purchased the Sony WH-1000XM4 Wireless 
Noise-Canceling Headphones from Amazon and I must say, I\\'m thoroughly impressed. The package arrived in New York within 2 days, thanks to Amazon Prime\\'s expedited shipping.\\n\\nThe headphones themselves are remarkable. The noise-canceling feature works like a charm in the bustling city environment, and the 30-hour battery life means I don\\'t have to charge them every day. Connecting them to my Samsung Galaxy S21 was a breeze, and the sound quality is second to none.\\n\\nI also appreciated the customer service from Amazon when I had a question about the warranty. They responded within an hour and provided all the information I needed.\\n\\nHowever, the headphones did not come with a hard case, which was listed in the product description. I contacted Amazon, and they offered a 10% discount on my next purchase as an apology.\\n\\nOverall, I\\'d give these headphones a 4.5/5 rating and highly recommend them to anyone looking for top-notch quality in both product and service.\\n\"\"\"\\n\\ninput_ = prompt+text\\n\\nlabels = [\"match\"]\\n\\nmatches = model.predict_entities(input_, labels)\\n\\nfor match in matches:\\n    print(match[\"text\"], \"=>\", match[\"score\"])\\n```\\n\\n### Performance:\\n\\n*Dataset: WiRe57_343-manual-oie*\\n| Model                  |   Precision |   Recall |   F1 Score |\\n|:-----------------------|------------:|---------:|-----------:|\\n| knowledgator/gliner-llama-multitask-1B-v1.0 |    0.9047 | 0.2794 |   0.4269 |\\n| knowledgator/gliner-multitask-v0.5  |    0.9278 | 0.2779 |   0.4287  |\\n| knowledgator/gliner-multitask-v1.0  |    0.8775      | 0.2733 |   0.4168 |\\n\\n---\\n\\n**How to use for question-answering:**\\n\\n```python\\nquestion = \"Who was the CEO of Microsoft?\"\\ntext = \"\"\"\\nMicrosoft was founded by Bill Gates and Paul Allen on April 4, 1975, to develop and sell BASIC interpreters for the Altair 8800. 
During his career at Microsoft, Gates held the positions of chairman, chief executive officer, president and chief software architect, while also being the largest individual shareholder until May 2014.\\n\"\"\"\\n\\nlabels = [\"answer\"]\\n\\ninput_ = question+text\\nanswers = model.predict_entities(input_, labels)\\n\\nfor answer in answers:\\n    print(answer[\"text\"], \"=>\", answer[\"score\"])\\n```\\n\\n### Performance:\\n*Dataset: SQuAD 2.0*\\n| Model                  |   Precision |   Recall |   F1 Score |\\n|:-----------------------|------------:|---------:|-----------:|\\n| knowledgator/gliner-llama-multitask-1B-v1.0  |    0.578296 | 0.795821 |   0.669841 |\\n| knowledgator/gliner-multitask-v0.5  |    0.429213 | 0.94378  |   0.590072 |\\n| knowledgator/gliner-multitask-v1.0  |    0.601354 | 0.874784 |   0.712745 |\\n\\n---\\n\\n**How to use for summarization:**\\n\\nWith threshold parameters, you can control how much information you want to extract.\\n\\n```python\\nprompt = \"Summarize the given text, highlighting the most important information:\\\\n\"\\n\\ntext = \"\"\"\\nSeveral studies have reported its pharmacological activities, including anti-inflammatory, antimicrobial, and antitumoral effects.\\nThe effect of E-anethole was studied in the osteosarcoma MG-63 cell line, and the antiproliferative activity was evaluated by an MTT assay.\\nIt showed a GI50 value of 60.25 μM with apoptosis induction through the mitochondrial-mediated pathway. 
Additionally, it induced cell cycle arrest at the G0/G1 phase, up-regulated the expression of p53, caspase-3, and caspase-9, and down-regulated Bcl-xL expression.\\nMoreover, the antitumoral activity of anethole was assessed against oral tumor Ca9-22 cells, and the cytotoxic effects were evaluated by MTT and LDH assays.\\nIt demonstrated a LD50 value of 8 μM, and cellular proliferation was 42.7% and 5.2% at anethole concentrations of 3 μM and 30 μM, respectively.\\nIt was reported that it could selectively and in a dose-dependent manner decrease cell proliferation and induce apoptosis, as well as induce autophagy, decrease ROS production, and increase glutathione activity. The cytotoxic effect was mediated through NF-kB, MAP kinases, Wnt, caspase-3 and -9, and PARP1 pathways. Additionally, treatment with anethole inhibited cyclin D1 oncogene expression, increased cyclin-dependent kinase inhibitor p21WAF1, up-regulated p53 expression, and inhibited the EMT markers.\\n\"\"\"\\n\\nlabels = [\"summary\"]\\n\\ninput_ = prompt+text\\n\\nthreshold = 0.1\\nsummaries = model.predict_entities(input_, labels, threshold=threshold)\\n\\nfor summary in summaries:\\n    print(summary[\"text\"], \"=>\", summary[\"score\"])\\n```\\n---\\n\\n**How to use for text classification:**\\n\\nWith threshold parameters, you can control recall and precision of text classification.\\n\\n```python\\nprompt = \"Classify text into the following classes: positive review, negative review\"\\n\\ntext = \"\"\"\\n\"I recently purchased the Sony WH-1000XM4 Wireless Noise-Canceling Headphones from Amazon and I must say, I\\'m thoroughly impressed. 
The package arrived in New York within 2 days, thanks to Amazon Prime\\'s expedited shipping.\\n\"\"\"\\n\\nlabels = [\"match\"]\\n\\ninput_ = prompt+text\\n\\nthreshold = 0.5\\nclasses = model.predict_entities(input_, labels, threshold=threshold)\\n\\nfor label in classes:\\n    print(label[\"text\"], \"=>\", label[\"score\"])\\n```\\n\\n### Performance:\\n\\n| Model Name            | Dataset   | Micro F1 Score |\\n|-----------------------|-----------|----------------|\\n| knowledgator/gliner-multitask-v1.0 | Emotion   | 0.322          |\\n|  | AG News   | 0.7436         |\\n|  | IMDb      | 0.7907         |\\n| knowledgator/gliner-llama-multitask-1B-v1.0 | Emotion   | 0.3475          |\\n|  | AG News   | 0.7436         |\\n|  | IMDb      | 0.7907         |\\n\\n---\\n\\n### Extensive NER Benchmarks:\\n\\n\\n\\nOur multitask model demonstrates comparable performance on different zero-shot benchmarks to dedicated models to NER task (all labels were lowecased in this testing):\\n\\n| Dataset                | Precision | Recall | F1 Score | F1 Score (Decimal) |\\n|------------------------|-----------|--------|----------|--------------------|\\n| ACE 2004              | 53.25%    | 23.20% | 32.32%   | 0.3232             |\\n| ACE 2005              | 43.25%    | 18.00% | 25.42%   | 0.2542             |\\n| AnatEM                | 51.75%    | 25.98% | 34.59%   | 0.3459             |\\n| Broad Tweet Corpus    | 69.54%    | 72.50% | 70.99%   | 0.7099             |\\n| CoNLL 2003            | 68.33%    | 68.43% | 68.38%   | 0.6838             |\\n| CrossNER_AI           | 67.15%    | 56.10% | 61.13%   | 0.6113             |\\n| CrossNER_literature   | 71.60%    | 64.74% | 68.00%   | 0.6800             |\\n| CrossNER_music        | 73.57%    | 69.29% | 71.36%   | 0.7136             |\\n| CrossNER_politics     | 77.54%    | 76.52% | 77.03%   | 0.7703             |\\n| CrossNER_science      | 74.54%    | 66.00% | 70.01%   | 0.7001             |\\n| FabNER                | 
69.28%    | 62.62% | 65.78%   | 0.6578             |\\n| FindVehicle           | 49.75%    | 51.25% | 50.49%   | 0.5049             |\\n| GENIA_NER             | 60.98%    | 46.91% | 53.03%   | 0.5303             |\\n| HarveyNER             | 24.27%    | 35.66% | 28.88%   | 0.2888             |\\n| MultiNERD             | 54.33%    | 89.34% | 67.57%   | 0.6757             |\\n| Ontonotes             | 27.26%    | 36.64% | 31.26%   | 0.3126             |\\n| PolyglotNER           | 33.54%    | 64.29% | 44.08%   | 0.4408             |\\n| TweetNER7             | 44.77%    | 38.67% | 41.50%   | 0.4150             |\\n| WikiANN en            | 56.33%    | 57.09% | 56.71%   | 0.5671             |\\n| WikiNeural            | 71.70%    | 86.60% | 78.45%   | 0.7845             |\\n| bc2gm                 | 64.71%    | 51.68% | 57.47%   | 0.5747             |\\n| bc4chemd              | 69.24%    | 50.08% | 58.12%   | 0.5812             |\\n| bc5cdr                | 79.22%    | 69.19% | 73.87%   | 0.7387             |\\n| mit-movie             | 61.86%    | 42.02% | 50.04%   | 0.5004             |\\n| mit-restaurant        | 58.87%    | 36.67% | 45.19%   | 0.4519             |\\n| ncbi                  | 68.72%    | 54.86% | 61.01%   | 0.6101             |\\n\\n\\n### Join Our Discord\\n\\nConnect with our community on Discord for news, support, and discussion about our models. 
Join .\\n\\n### Citation:\\n```\\n@misc{stepanov2024gliner,\\n      title={GLiNER multi-task: Generalist Lightweight Model for Various Information Extraction Tasks}, \\n      author={Ihor Stepanov and Mykhailo Shtopko},\\n      year={2024},\\n      eprint={2406.12925},\\n      archivePrefix={arXiv},\\n      primaryClass={id=\\'cs.LG\\' full_name=\\'Machine Learning\\' is_active=True alt_name=None in_archive=\\'cs\\' is_general=False description=\\'Papers on all aspects of machine learning research (supervised, unsupervised, reinforcement learning, bandit problems, and so on) including also robustness, explanation, fairness, and methodology. cs.LG is also an appropriate primary category for applications of machine learning methods.\\'}\\n}\\n```',\n",
       "  'domain': 'question-answering'},\n",
       " {'model_id': 'Lamapi/next-1b',\n",
       "  'created_at': '2025-10-15T11:09:56+00:00',\n",
       "  'downloads': 3154,\n",
       "  'likes': 21,\n",
       "  'author': None,\n",
       "  'tags': ['transformers',\n",
       "   'safetensors',\n",
       "   'gguf',\n",
       "   'gemma3_text',\n",
       "   'text-generation',\n",
       "   'turkish',\n",
       "   'türkiye',\n",
       "   'english',\n",
       "   'ai',\n",
       "   'lamapi',\n",
       "   'gemma3',\n",
       "   'next',\n",
       "   'next-x1',\n",
       "   'efficient',\n",
       "   'open-source',\n",
       "   '1b',\n",
       "   'huggingface',\n",
       "   'large-language-model',\n",
       "   'llm',\n",
       "   'causal',\n",
       "   'transformer',\n",
       "   'artificial-intelligence',\n",
       "   'machine-learning',\n",
       "   'ai-research',\n",
       "   'natural-language-processing',\n",
       "   'nlp',\n",
       "   'finetuned',\n",
       "   'lightweight',\n",
       "   'creative',\n",
       "   'summarization',\n",
       "   'question-answering',\n",
       "   'chat-model',\n",
       "   'generative-ai',\n",
       "   'optimized-model',\n",
       "   'unsloth',\n",
       "   'trl',\n",
       "   'sft',\n",
       "   'chemistry',\n",
       "   'biology',\n",
       "   'finance',\n",
       "   'legal',\n",
       "   'music',\n",
       "   'art',\n",
       "   'code',\n",
       "   'climate',\n",
       "   'medical',\n",
       "   'agent',\n",
       "   'text-generation-inference',\n",
       "   'conversational',\n",
       "   'tr',\n",
       "   'ar',\n",
       "   'af',\n",
       "   'az',\n",
       "   'es',\n",
       "   'en',\n",
       "   'el',\n",
       "   'ro',\n",
       "   'ru',\n",
       "   'rm',\n",
       "   'th',\n",
       "   'uk',\n",
       "   'uz',\n",
       "   'pl',\n",
       "   'pt',\n",
       "   'fa',\n",
       "   'sk',\n",
       "   'sl',\n",
       "   'da',\n",
       "   'de',\n",
       "   'nl',\n",
       "   'fr',\n",
       "   'fi',\n",
       "   'ka',\n",
       "   'hi',\n",
       "   'hu',\n",
       "   'hy',\n",
       "   'ja',\n",
       "   'kk',\n",
       "   'kn',\n",
       "   'ko',\n",
       "   'ku',\n",
       "   'ky',\n",
       "   'la',\n",
       "   'lb',\n",
       "   'id',\n",
       "   'is',\n",
       "   'it',\n",
       "   'zh',\n",
       "   'cs',\n",
       "   'vi',\n",
       "   'be',\n",
       "   'bg',\n",
       "   'bs',\n",
       "   'ne',\n",
       "   'mn',\n",
       "   'dataset:mlabonne/FineTome-100k',\n",
       "   'dataset:ITCL/FineTomeOs',\n",
       "   'dataset:Gryphe/ChatGPT-4o-Writing-Prompts',\n",
       "   'dataset:dongguanting/ARPO-SFT-54K',\n",
       "   'dataset:GreenerPastures/All-Your-Base-Full',\n",
       "   'dataset:Gryphe/Opus-WritingPrompts',\n",
       "   'dataset:HuggingFaceH4/MATH-500',\n",
       "   'dataset:mlabonne/smoltalk-flat',\n",
       "   'dataset:mlabonne/natural_reasoning-formatted',\n",
       "   'dataset:OpenSPG/KAG-Thinker-training-dataset',\n",
       "   'dataset:uclanlp/Brief-Pro',\n",
       "   'dataset:CognitiveKernel/CognitiveKernel-Pro-SFT',\n",
       "   'dataset:SuperbEmphasis/Claude-4.0-DeepSeek-R1-RP-SFWish',\n",
       "   'dataset:QuixiAI/dolphin-r1',\n",
       "   'dataset:mlabonne/lmsys-arena-human-sft-55k',\n",
       "   'license:mit',\n",
       "   'endpoints_compatible',\n",
       "   'region:us'],\n",
       "  'modelcard': '\\n\\n\\n# 🚀 Next-1B (t416)\\n\\n### *Lightweight, Efficient, and Türkiye-Focused AI*\\n\\n\\n\\n\\n\\n---\\n\\n## 📖 Overview\\n\\n**Next-1B** is a **1-billion parameter causal language model** based on **Gemma 3**, designed for **efficiency, low-resource deployment, and reasoning-focused natural language understanding**.\\n\\nKey highlights:\\n\\n* Extremely **lightweight** — can run on consumer GPUs with low VRAM.\\n* Optimized for **text reasoning, summarization, and creative generation**.\\n* Supports **Turkish natively** while remaining multilingual.\\n* Open-source and transparent for research and applications.\\n\\nIdeal for **developers, students, and organizations** needing **fast, reliable, and low-resource text-generation**.\\n\\n---\\n\\n# Our Next 1B and Next 4B models are leading to all of the tiny models in benchmarks. \\n\\n\\n  \\n    \\n      Model\\n      MMLU (5-shot) %\\n      MMLU-Pro %\\n      GSM8K %\\n      MATH %\\n    \\n  \\n  \\n    \\n      Next 4B preview\\n      84.6\\n      66.9\\n      82.7\\n      70.5\\n    \\n    \\n      Next 1B Version t327\\n      87.3\\n      69.2\\n      90.5\\n      70.1\\n    \\n    \\n      Qwen 3 0.6B\\n      52.81\\n      37.6\\n      60.7\\n      20.5\\n    \\n    \\n      Llama 3.2 1B\\n      49.3\\n      44.4\\n      11.9\\n      30.6\\n    \\n  \\n\\n\\n---\\n\\n# Also, our Next 14b model is leading to state-of-the-art models in some of the Benchmarks.\\n\\n  \\n    \\n      Model\\n      MMLU (5-shot) %\\n      MMLU-Pro %\\n      GSM8K %\\n      MATH %\\n    \\n  \\n  \\n    \\n      Next 14B (Thinking)\\n      94.6\\n      93.2\\n      98.8\\n      92.7\\n    \\n    \\n      Next 12B\\n      92.7\\n      84.4\\n      95.3\\n      87.2\\n    \\n    \\n      GPT-5\\n      92.5\\n      87.0\\n      98.4\\n      96.0\\n    \\n    \\n      Claude Opus 4.1 (Thinking)\\n      ~92.0\\n      87.8\\n      84.7\\n      95.4\\n    \\n  \\n\\n\\n---\\n\\n## 🎯 Goals\\n\\n1. 
**Lightweight Efficiency:** Run smoothly on low-resource devices.\\n2. **Reasoning-Focused:** Provide logical and coherent text outputs.\\n3. **Accessibility:** Fully open-source with clear documentation.\\n4. **Multilingual Adaptability:** Turkish-focused but supports other languages.\\n\\n---\\n\\n## ✨ Key Features\\n\\n| Feature                     | Description                                                           |\\n| --------------------------- | --------------------------------------------------------------------- |\\n| 🔋 Lightweight Architecture | Optimized for low VRAM usage; ideal for small GPUs or CPU deployment. |\\n| 🇹🇷 Turkish & Multilingual | Handles complex Turkish prompts accurately.                           |\\n| 🧠 Reasoning Capabilities   | Logical chain-of-thought for question-answering and problem-solving.  |\\n| 📊 Consistent Outputs       | Reliable and reproducible results across multiple runs.               |\\n| 🌍 Open Source              | Transparent, research-friendly, and community-driven.                 
|\\n\\n---\\n\\n## 📐 Model Specifications\\n\\n| Specification      | Details                                                                |\\n| ------------------ | ---------------------------------------------------------------------- |\\n| Base Model         | Gemma 3                                                           |\\n| Parameter Count    | 1 Billion                                                              |\\n| Architecture       | Transformer, causal LLM                                                |\\n| Fine-Tuning Method | Instruction fine-tuning (SFT) with Turkish and multilingual datasets   |\\n| Optimizations      | Quantization-ready (q8, f16, f32)                      |\\n| Use Cases          | Text generation, summarization, Q&A, creative writing, reasoning tasks |\\n\\n---\\n\\n## 🚀 Installation & Usage\\n\\n### Use the model:\\n\\n```python\\nfrom transformers import AutoTokenizer, AutoModelForCausalLM\\nimport torch\\n\\nmodel_id = \"Lamapi/next-1b\"\\ntokenizer = AutoTokenizer.from_pretrained(model_id)\\nmodel = AutoModelForCausalLM.from_pretrained(model_id)\\n\\n# Chat message\\nmessages = [\\n    {\"role\": \"system\", \"content\": \"You are Next-X1, a smart and concise AI assistant trained by Lamapi. Always respond in the user\\'s language. Proudly made in Turkey.\"},\\n    {\"role\": \"user\", \"content\": \"Hello, how are you?\"}\\n]\\n\\n# Prepare input with Tokenizer\\nprompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)\\ninputs = tokenizer(prompt, return_tensors=\"pt\")\\n\\n# Output from the model\\noutput = model.generate(**inputs, max_new_tokens=50)\\nprint(tokenizer.decode(output[0], skip_special_tokens=True))\\n```\\n\\n\\n  \\n    Hello, how are you?\\n  \\n  \\n  I\\'m fine, thank you. How are you?\\n  \\n\\n\\n---\\n\\n## 📄 License\\n\\nMIT License — free to use, modify, and distribute. 
Attribution appreciated.\\n\\n---\\n\\n## 📞 Contact & Support\\n\\n* 📧 **Email:** \\n* 🤗 **HuggingFace:** \\n\\n---\\n\\n> **Next-1B** — Lightweight, **efficient, and reasoning-focused**, bringing **Turkey’s AI forward** on low-resource hardware.\\n\\n',\n",
       "  'domain': 'summarization'},\n",
       " {'model_id': 'akasharidas/ddpm-cifar10-32-dot.in.name',\n",
       "  'created_at': '2025-06-09T17:25:27+00:00',\n",
       "  'downloads': 1211,\n",
       "  'likes': 0,\n",
       "  'author': None,\n",
       "  'tags': ['diffusers',\n",
       "   'safetensors',\n",
       "   'pytorch',\n",
       "   'unconditional-image-generation',\n",
       "   'arxiv:2006.11239',\n",
       "   'license:apache-2.0',\n",
       "   'region:us'],\n",
       "  'modelcard': '\\n# Denoising Diffusion Probabilistic Models (DDPM)\\n\\n**Paper**: \\n\\n**Authors**: Jonathan Ho, Ajay Jain, Pieter Abbeel\\n\\n**Abstract**:\\n\\n*We present high quality image synthesis results using diffusion probabilistic models, a class of latent variable models inspired by considerations from nonequilibrium thermodynamics. Our best results are obtained by training on a weighted variational bound designed according to a novel connection between diffusion probabilistic models and denoising score matching with Langevin dynamics, and our models naturally admit a progressive lossy decompression scheme that can be interpreted as a generalization of autoregressive decoding. On the unconditional CIFAR10 dataset, we obtain an Inception score of 9.46 and a state-of-the-art FID score of 3.17. On 256x256 LSUN, we obtain sample quality similar to ProgressiveGAN.*\\n\\n## Inference\\n\\n**DDPM** models can use *discrete noise schedulers* such as:\\n\\n- \\n- \\n- \\n\\nfor inference. Note that while the *ddpm* scheduler yields the highest quality, it also takes the longest.\\nFor a good trade-off between quality and inference speed you might want to consider the *ddim* or *pndm* schedulers instead.\\n\\nSee the following code:\\n\\n```python\\n# !pip install diffusers\\nfrom diffusers import DDPMPipeline, DDIMPipeline, PNDMPipeline\\n\\nmodel_id = \"google/ddpm-cifar10-32\"\\n\\n# load model and scheduler\\nddpm = DDPMPipeline.from_pretrained(model_id)  # you can replace DDPMPipeline with DDIMPipeline or PNDMPipeline for faster inference\\n\\n# run pipeline in inference (sample random noise and denoise)\\nimage = ddpm().images[0]\\n\\n# save image\\nimage.save(\"ddpm_generated_image.png\")\\n```\\n\\nFor more in-detail information, please have a look at the \\n\\n## Training\\n\\nIf you want to train your own model, please have a look at the \\n\\n## Samples\\n1. \\n2. \\n3. \\n4. ',\n",
       "  'domain': 'unconditional-image-generation'},\n",
       " {'model_id': 'mradermacher/DarkIdol-LongWriter-8B-Uncensored-1048k-GGUF',\n",
       "  'created_at': '2025-02-11T23:47:01+00:00',\n",
       "  'downloads': 309,\n",
       "  'likes': 2,\n",
       "  'author': None,\n",
       "  'tags': ['transformers',\n",
       "   'gguf',\n",
       "   'text-generation',\n",
       "   'text2text-generation',\n",
       "   'translation',\n",
       "   'summarization',\n",
       "   'document-question-answering',\n",
       "   'NSFW',\n",
       "   'not-for-all-audiences',\n",
       "   'en',\n",
       "   'base_model:MrRobotoAI/DarkIdol-LongWriter-8B-Uncensored-1048k',\n",
       "   'base_model:quantized:MrRobotoAI/DarkIdol-LongWriter-8B-Uncensored-1048k',\n",
       "   'endpoints_compatible',\n",
       "   'region:us'],\n",
       "  'modelcard': \"## About\\n\\n\\n\\n\\n\\n\\nstatic quants of \\n\\n\\n\\n***For a convenient overview and download list, visit our .***\\n\\nweighted/imatrix quants seem not to be available (by me) at this time. If they do not show up a week or so after the static ones, I have probably not planned for them. Feel free to request them by opening a Community Discussion.\\n## Usage\\n\\nIf you are unsure how to use GGUF files, refer to one of  for\\nmore details, including on how to concatenate multi-part files.\\n\\n## Provided Quants\\n\\n(sorted by size, not necessarily quality. IQ-quants are often preferable over similar sized non-IQ quants)\\n\\n| Link | Type | Size/GB | Notes |\\n|:-----|:-----|--------:|:------|\\n|  | Q2_K | 3.3 |  |\\n|  | Q3_K_S | 3.8 |  |\\n|  | Q3_K_M | 4.1 | lower quality |\\n|  | Q3_K_L | 4.4 |  |\\n|  | IQ4_XS | 4.6 |  |\\n|  | Q4_K_S | 4.8 | fast, recommended |\\n|  | Q4_K_M | 5.0 | fast, recommended |\\n|  | Q5_K_S | 5.7 |  |\\n|  | Q5_K_M | 5.8 |  |\\n|  | Q6_K | 6.7 | very good quality |\\n|  | Q8_0 | 8.6 | fast, best quality |\\n|  | f16 | 16.2 | 16 bpw, overkill |\\n\\nHere is a handy graph by ikawrakow comparing some lower-quality quant\\ntypes (lower is better):\\n\\n\\n\\nAnd here are Artefact2's thoughts on the matter:\\n\\n\\n## FAQ / Model Request\\n\\nSee  for some answers to\\nquestions you might have and/or if you want some other model quantized.\\n\\n## Thanks\\n\\nI thank my company, , for letting\\nme use its servers and providing upgrades to my workstation to enable\\nthis work in my free time.\\n\\n\\n\",\n",
       "  'domain': 'document-question-answering'},\n",
       " {'model_id': 'QuantFactory/TableLLM-13b-GGUF',\n",
       "  'created_at': '2024-10-21T08:46:45+00:00',\n",
       "  'downloads': 94,\n",
       "  'likes': 4,\n",
       "  'author': None,\n",
       "  'tags': ['transformers',\n",
       "   'gguf',\n",
       "   'table-question-answering',\n",
       "   'arxiv:2403.19318',\n",
       "   'license:llama2',\n",
       "   'endpoints_compatible',\n",
       "   'region:us'],\n",
       "  'modelcard': '\\n\\n\\n\\n# QuantFactory/TableLLM-13b-GGUF\\nThis is quantized version of  created using llama.cpp\\n\\n# Original Model Card\\n\\n---\\n\\ndatasets:\\n- RUCKBReasoning/TableLLM-SFT\\nlanguage:\\n- en\\ntags:\\n- Table\\n- QA\\n- Code\\n\\n---\\n\\n\\n\\n\\n# QuantFactory/TableLLM-13b-GGUF\\nThis is quantized version of  created using llama.cpp\\n\\n# Original Model Card\\n\\n\\n# TableLLM: Enabling Tabular Data Manipulation by LLMs in Real Office Usage Scenarios\\n\\n| **** | **** | **** | **** |\\n\\nWe present **TableLLM**, a powerful large language model designed to handle tabular data manipulation tasks efficiently, whether they are embedded in spreadsheets or documents, meeting the demands of real office scenarios. The TableLLM series encompasses two distinct scales:  and , which are fine-tuned based on  and .\\n\\nTableLLM generates either a code solution or a direct text answer to handle tabular data manipulation tasks based on different scenarios. Code generation is used for handling spreadsheet-embedded tabular data, which often involves the insert, delete, update, query, merge, and plot operations of tables. Text generation is used for handling document-embedded tabular data, which often involves the query operation of short tables.\\n\\n## Evaluation Results\\nWe evaluate the code solution generation ability of TableLLM on three benchmarks: WikiSQL, Spider and Self-created table operation benchmark. The text answer generation ability is tested on four benchmarks: WikiTableQuestion (WikiTQ), TAT-QA, FeTaQA and OTTQA. 
The evaluation result is shown below:\\n\\n| Model                | WikiTQ | TAT-QA | FeTaQA |  OTTQA  | WikiSQL | Spider | Self-created | Average |\\n| :------------------- | :----: | :----: | :----: | :-----: | :-----: | :----: | :----------: | :-----: |\\n| TaPEX                |  38.5  |    –   |    –   |    –    |   83.9  |  15.0  |       /      |   45.8  |\\n| TaPas                |  31.5  |    –   |    –    |   74.2  |  23.1  |       /      |   42.92 |\\n| TableLlama           |  24.0  |  22.2  |  20.5  |   6.4   |   43.7  |   9.0  |       /      |   20.7  |\\n| GPT3.5               |  58.5  | 72.1  |  71.2  |  60.8   |   81.7   |  67.4  | 77.1 |   69.8  |\\n| GPT4                 |**74.1**|**77.1**|**78.4**|**69.5** |   84.0  |  69.5  |     77.8     | **75.8**|\\n| Llama2-Chat (13B)    |  48.8  |  49.6  |  67.7  |  61.5   |    –    |    –   |       –      |   56.9  |\\n| CodeLlama (13B)      |  43.4  |  47.2  |  57.2  |  49.7   |   38.3  |  21.9  |     47.6     |   43.6  |\\n| Deepseek-Coder (33B) |   6.5  |  11.0  |   7.1  |   7.4   |   72.5  |  58.4  |     73.9     |   33.8  |\\n| StructGPT (GPT3.5)   |  52.5  |  27.5  |  11.8  |  14.0   |   67.8  |**84.8**|       /      |   48.9  |\\n| Binder (GPT3.5)      |  61.6  |  12.8  |   6.8  |   5.1   |   78.6  |  52.6  |       /      |   42.5  |\\n| DATER (GPT3.5)       |  53.4  |  28.4  |  18.3  |  13.0   |   58.2  |  26.5  |       /      |   37.0  |\\n| TableLLM-7B (Ours)   |  58.8  |  66.9  |  72.6  | 63.1 | 86.6|  82.6  | 78.8|   72.8  |\\n| TableLLM-13B (Ours)  | 62.4|  68.2  | 74.5|  62.5   | **90.7**| 83.4|   **80.8**   | 74.7|\\n\\n## Prompt Template\\nThe prompts we used for generating code solutions and text answers are introduced below.\\n\\n### Code Solution\\nThe prompt template for the insert, delete, update, query, and plot operations on a single table.\\n```\\n[INST]Below are the first few lines of a CSV file. 
You need to write a Python program to solve the provided question.\\n\\nHeader and first few lines of CSV file:\\n{csv_data}\\n\\nQuestion: {question}[/INST]\\n```\\n\\nThe prompt template for the merge operation on two tables.\\n```\\n[INST]Below are the first few lines two CSV file. You need to write a Python program to solve the provided question.\\n\\nHeader and first few lines of CSV file 1:\\n{csv_data1}\\n\\nHeader and first few lines of CSV file 2:\\n{csv_data2}\\n\\nQuestion: {question}[/INST]\\n```\\n\\nThe csv_data field is filled with the first few lines of your provided table file. Below is an example:\\n```\\nSex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings\\nM,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15\\nM,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7\\nF,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9\\nM,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10\\nI,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7\\n```\\n\\n### Text Answer\\nThe prompt template for direct text answer generation on short tables.\\n````\\n[INST]Offer a thorough and accurate solution that directly addresses the Question outlined in the [Question].\\n### [Table Text]\\n{table_descriptions}\\n\\n### [Table]\\n```\\n{table_in_csv}\\n```\\n\\n### [Question]\\n{question}\\n\\n### [Solution][INST/]\\n````\\n\\nFor more details about how to use TableLLM, please refer to our GitHub page: ',\n",
       "  'domain': 'table-question-answering'}]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Build the final selection in a single chain:\n",
    "#   1. discard every record whose domain is \"any-to-any\",\n",
    "#   2. order the remaining rows by download count, highest first,\n",
    "#   3. keep at most the first 25 rows of each domain in that order.\n",
    "df = (\n",
    "    pd.DataFrame(cleaned_models)\n",
    "    .loc[lambda frame: frame[\"domain\"] != \"any-to-any\"]\n",
    "    .sort_values(by=\"downloads\", ascending=False)\n",
    "    .groupby(\"domain\")\n",
    "    .head(25)\n",
    ")\n",
    "# Convert back to a list of per-model dicts and echo it as the cell result.\n",
    "cleaned_models = df.to_dict(orient=\"records\")\n",
    "cleaned_models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "6373f22a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "18251"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "4969"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "7828"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "18617"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "19993"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "5322"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "3848"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "17216"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "3074"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "25296"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "10481"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "8325"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "9598"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "2655"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "11344"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "12428"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "10515"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "5755"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "2569"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "12039"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "3677"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "52964"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "3737"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "22779"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "9326"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "10231"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "3348"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "9770"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "59754"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "3898"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "3961"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "40211"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "5335"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "3024"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "6546"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "6079"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "21996"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "10303"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "19103"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "6938"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "2204"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "2079"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "5139"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Save cleaned_models to a JSON Lines file (one JSON object per line).\n",
    "# os.environ[...] raises a KeyError naming the missing variable, instead of the\n",
    "# opaque TypeError that os.path.exists(None) / os.path.join(None, ...) would give.\n",
    "output_dir = os.environ[\"SELF_INSTRUCT_ROOT_DATA\"]\n",
    "output_file = os.path.join(output_dir, os.environ[\"FILE_NAME_MODEL_CARDS_STEP_1\"])\n",
    "\n",
    "# exist_ok=True makes directory creation idempotent and avoids the\n",
    "# check-then-create race of a separate os.path.exists() test.\n",
    "os.makedirs(output_dir, exist_ok=True)\n",
    "\n",
    "# Explicit UTF-8 so the file does not depend on the platform's default encoding.\n",
    "# writelines() returns None, so no per-record character count is produced the\n",
    "# way a bare f.write() call returns one.\n",
    "with open(output_file, \"w\", encoding=\"utf-8\") as f:\n",
    "    f.writelines(json.dumps(item) + \"\\n\" for item in cleaned_models)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
