{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "from collections import Counter\n",
    "from nltk.corpus import stopwords\n",
    "from tqdm import tqdm\n",
    "\n",
    "# (Make sure you've downloaded the stopwords if you haven't already.)\n",
    "# import nltk\n",
    "# nltk.download('stopwords')\n",
    "\n",
    "# Set of stopwords you might want to ignore (you can expand this list)\n",
    "stop_words = set(stopwords.words('english'))\n",
    "# Optionally, add any extra words you don't consider modifiers:\n",
    "ignore_words = {\"a\", \"an\", \"the\", \"of\", \"and\", \"in\", \"on\"}\n",
    "\n",
    "all_modifiers = []\n",
    "\n",
    "with open('train_for_ft.txt', 'r') as f: # can be found in promptist repo https://github.com/microsoft/LMOps/tree/main/promptist\n",
    "    lines = f.readlines()\n",
    "lines = [line.strip() for line in lines]\n",
    "lines = [line for line in lines if line]  # Remove empty lines\n",
    "\n",
    "source_prompts = []\n",
    "target_prompts = []\n",
    "for line in tqdm(lines, desc=\"Processing lines\"):\n",
    "    one_inst = eval(line)\n",
    "    source = one_inst['source'].replace(\"Rephrase: \", \"\") # main content\n",
    "    target = one_inst['target'] # human engineered\n",
    "    source_prompts.append(source)\n",
    "    target_prompts.append(target)\n",
    "\n",
    "# For each prompt, extract the comma-separated parts.\n",
    "# This regex looks for tokens that either start at the beginning (^) or after a comma,\n",
    "# then skips any whitespace, and captures any sequence of characters until the next comma or end-of-string.\n",
    "pattern = r'(?:^|,)\\s*([^,]+?)(?=,|$)'\n",
    "\n",
    "for prompt in tqdm(target_prompts):\n",
    "    # Find all comma-separated tokens in the prompt\n",
    "    tokens = re.findall(pattern, prompt)\n",
    "    # Clean and filter each token:\n",
    "    for token in tokens:\n",
    "        word = token.strip().lower()  # lower-case for case-insensitive counting\n",
    "        # (Optionally, you might split multiword tokens further if needed.)\n",
    "        if word and word not in stop_words and word not in ignore_words:\n",
    "            all_modifiers.append(word)\n",
    "\n",
    "# Count frequency of modifiers across all prompts and get top 15\n",
    "top15_modifiers = Counter(all_modifiers).most_common(15)\n",
    "print(\"Top 15 modifiers:\")\n",
    "for modifier, count in top15_modifiers:\n",
    "    print(f\"{modifier}: {count}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# TOP_MODIFIERS = [\n",
    "#     'concept art',\n",
    "#     'highly detailed',\n",
    "#     'sharp focus',\n",
    "#     'artstation',\n",
    "#     'digital painting',\n",
    "#     'intricate',\n",
    "#     'illustration',\n",
    "#     'trending on artstation',\n",
    "#     'smooth',\n",
    "#     'elegant',\n",
    "#     'octane render',\n",
    "#     'fantasy',\n",
    "#     'wlop',\n",
    "#     'digital art',\n",
    "#     '8 k',\n",
    "# ]"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
