{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Nombre de captions filtrées collectées : 6552\n",
      "\n",
      "Exemples de prompts générés:\n",
      "1. pink azaleas in the garden\n",
      "2. a white silverbush with a yellow flower in the middle of the flower\n",
      "3. a pink desert-rose in a garden\n",
      "4. a close up of some toad lilys in the garden\n",
      "5. a bee on a globe thistle\n",
      "6. a close up of a blue alpine sea holly\n",
      "7. a purple passion flower in the garden\n",
      "8. a pink and yellow hard-leaved pocket orchid on a black background\n",
      "9. a pink desert-rose in the garden\n",
      "10. a white tree mallow with green leaves on it\n",
      "Prompts sauvegardés dans ../prompts/flowers.json\n",
      "\n",
      "Génération terminée: 6552 prompts créés\n"
     ]
    }
   ],
   "source": [
    "import random\n",
    "import json\n",
    "from datasets import load_dataset\n",
    "\n",
    "def generate_prompt_list(num_prompts=10000, seed=None):\n",
    "    \"\"\"Return a random sample of captions from the flowers BLIP-captions dataset.\n",
    "\n",
    "    Loads the ``train`` split of ``pranked03/flowers-blip-captions``, reads\n",
    "    captions from its ``text`` column until ``min(num_prompts * 2, 50000)``\n",
    "    of them have been collected, keeps the string captions made of at least\n",
    "    three whitespace-separated words, and samples from those without\n",
    "    replacement.\n",
    "\n",
    "    Args:\n",
    "        num_prompts: Maximum number of prompts to return.\n",
    "        seed: Optional seed for the sampling step. With the default ``None``\n",
    "            the module-level ``random`` generator is used, so any earlier\n",
    "            ``random.seed(...)`` call still applies.\n",
    "\n",
    "    Returns:\n",
    "        A list of at most ``num_prompts`` captions; shorter when fewer\n",
    "        captions pass the length filter.\n",
    "    \"\"\"\n",
    "    ds = load_dataset(\"pranked03/flowers-blip-captions\", split=\"train\")\n",
    "\n",
    "    # Only the caption column is read, so restrict the dataset to it before\n",
    "    # iterating; this avoids loading the other columns of every row\n",
    "    # (presumably images -- TODO confirm against the dataset card).\n",
    "    captions_only = ds.select_columns(\"text\")\n",
    "\n",
    "    # Loop-invariant cap on how many captions are read: twice the requested\n",
    "    # amount, bounded by 50 000 to limit memory use.\n",
    "    max_captions = min(num_prompts * 2, 50000)\n",
    "\n",
    "    all_captions = []\n",
    "    for example in captions_only:\n",
    "        all_captions.append(example[\"text\"])\n",
    "        if len(all_captions) >= max_captions:\n",
    "            break\n",
    "\n",
    "    # Keep only string captions that are at least 3 words long.\n",
    "    filtered_captions = [\n",
    "        cap for cap in all_captions\n",
    "        if isinstance(cap, str) and len(cap.split()) >= 3\n",
    "    ]\n",
    "\n",
    "    # Reported after filtering so the number matches what the message says.\n",
    "    print(f\"Nombre de captions filtrées collectées : {len(filtered_captions)}\")\n",
    "\n",
    "    # A dedicated generator is used only when a seed is supplied, so the\n",
    "    # default call keeps drawing from the shared module-level generator.\n",
    "    sampler = random if seed is None else random.Random(seed)\n",
    "    selected_prompts = sampler.sample(\n",
    "        filtered_captions,\n",
    "        min(num_prompts, len(filtered_captions))\n",
    "    )\n",
    "\n",
    "    return selected_prompts\n",
    "\n",
    "def save_prompts(prompts, output_file=\"../prompts/flowers.json\"):\n",
    "    \"\"\"Write ``prompts`` to ``output_file`` as indented UTF-8 JSON.\n",
    "\n",
    "    Args:\n",
    "        prompts: JSON-serialisable object to dump.\n",
    "        output_file: Destination path. Missing parent directories are\n",
    "            created so a fresh checkout without the output folder works.\n",
    "    \"\"\"\n",
    "    import os  # local import keeps this function self-contained\n",
    "\n",
    "    # open(..., \"w\") fails with FileNotFoundError when the folder is absent.\n",
    "    parent_dir = os.path.dirname(output_file)\n",
    "    if parent_dir:\n",
    "        os.makedirs(parent_dir, exist_ok=True)\n",
    "\n",
    "    with open(output_file, \"w\", encoding=\"utf-8\") as f:\n",
    "        # ensure_ascii=False writes accented characters as-is, not as \\\\u escapes.\n",
    "        json.dump(prompts, f, ensure_ascii=False, indent=2)\n",
    "    print(f\"Prompts sauvegardés dans {output_file}\")\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    # Draw up to 10000 prompts from the dataset.\n",
    "    prompts = generate_prompt_list(10000)\n",
    "\n",
    "    # Preview the first ten prompts (fewer if the list is shorter), numbered from 1.\n",
    "    print(\"\\nExemples de prompts générés:\")\n",
    "    for rank, prompt in enumerate(prompts[:10], start=1):\n",
    "        print(f\"{rank}. {prompt}\")\n",
    "\n",
    "    # Write the full list to the default output file.\n",
    "    save_prompts(prompts)\n",
    "\n",
    "    print(f\"\\nGénération terminée: {len(prompts)} prompts créés\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "test",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
