{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "313443a7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import json\n",
    "import pandas as pd\n",
    "from dotenv import load_dotenv\n",
    "import os\n",
    "\n",
    "load_dotenv()\n",
    "\n",
    "\n",
    "def _require_env(var_name):\n",
    "    \"\"\"Return the value of a required environment variable.\n",
    "\n",
    "    Raises KeyError naming the variable when it is unset, instead of letting\n",
    "    os.path.join fail later with an opaque TypeError on None.\n",
    "    \"\"\"\n",
    "    value = os.getenv(var_name)\n",
    "    if value is None:\n",
    "        raise KeyError(f\"Required environment variable {var_name!r} is not set\")\n",
    "    return value\n",
    "\n",
    "\n",
    "def load_jsonl(path):\n",
    "    \"\"\"Read a JSON Lines file into a list, one decoded object per non-blank line.\"\"\"\n",
    "    with open(path, \"r\", encoding=\"utf-8\") as f:\n",
    "        # blank lines (e.g. a trailing newline at end of file) are skipped\n",
    "        return [json.loads(line) for line in f if line.strip()]\n",
    "\n",
    "\n",
    "self_instruct_root = _require_env(\"SELF_INSTRUCT_ROOT_DATA\")\n",
    "recent_prompts_path = os.path.join(\n",
    "    self_instruct_root,\n",
    "    _require_env(\"FILE_NAME_SELF_INSTRUCTED_MODELS_STEP_6\"),\n",
    ")\n",
    "old_prompts_path = os.path.join(\n",
    "    self_instruct_root,\n",
    "    _require_env(\"FILE_NAME_SELF_INSTRUCTED_LEGACY_MODELS_STEP_6\"),\n",
    ")\n",
    "# NOTE: despite its .json extension this file is read line by line (JSON Lines).\n",
    "corpus_path = os.path.join(_require_env(\"DATA_PATH\"), \"model_indices/e1_e2.json\")\n",
    "\n",
    "recent_prompts = load_jsonl(recent_prompts_path)\n",
    "old_prompts = load_jsonl(old_prompts_path)\n",
    "corpus = load_jsonl(corpus_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "551e1aac",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['text-to-image', 'translation', 'depth-estimation', 'text-to-3d',\n",
       "       'zero-shot-classification', 'question-answering', 'image-to-text',\n",
       "       'visual-question-answering', 'image-text-to-text',\n",
       "       'text-classification', 'text-to-video', 'object-detection',\n",
       "       'text-generation', 'summarization', 'token-classification',\n",
       "       'image-segmentation', 'mask-generation',\n",
       "       'zero-shot-object-detection', 'table-question-answering',\n",
       "       'image-classification', 'image-feature-extraction',\n",
       "       'automatic-speech-recognition', 'zero-shot-image-classification',\n",
       "       'image-to-image', 'audio-classification', 'text-to-speech',\n",
       "       'feature-extraction', 'video-classification', 'fill-mask',\n",
       "       'sentence-similarity', 'audio-to-audio'], dtype=object)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Tabulate the legacy prompts and show the distinct raw domain tags they carry.\n",
    "df_old = pd.DataFrame(old_prompts)\n",
    "pd.unique(df_old[\"domain\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "beeb5e95",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hand-written lookup from the raw task tag stored in the prompt files to the\n",
    "# \"<modality> <task>\" domain label used by the corpus, grouped by modality.\n",
    "mapping = {\n",
    "    # --- Natural Language Processing ---\n",
    "    \"text-classification\": \"Natural Language Processing Text Classification\",\n",
    "    \"token-classification\": \"Natural Language Processing Token Classification\",\n",
    "    \"zero-shot-classification\": \"Natural Language Processing Zero-Shot Classification\",\n",
    "    \"question-answering\": \"Natural Language Processing Question Answering\",\n",
    "    \"table-question-answering\": \"Natural Language Processing Table Question Answering\",\n",
    "    \"text-generation\": \"Natural Language Processing Text Generation\",\n",
    "    \"summarization\": \"Natural Language Processing Summarization\",\n",
    "    \"translation\": \"Natural Language Processing Translation\",\n",
    "    \"fill-mask\": \"Natural Language Processing Fill-Mask\",\n",
    "    \"feature-extraction\": \"Natural Language Processing Feature Extraction\",\n",
    "    \"sentence-similarity\": \"Natural Language Processing Sentence Similarity\",\n",
    "    \"text-ranking\": \"Natural Language Processing Text Ranking\",\n",
    "    # --- Computer Vision ---\n",
    "    \"image-classification\": \"Computer Vision Image Classification\",\n",
    "    \"zero-shot-image-classification\": \"Computer Vision Zero-Shot Image Classification\",\n",
    "    \"image-segmentation\": \"Computer Vision Image Segmentation\",\n",
    "    \"image-feature-extraction\": \"Computer Vision Image Feature Extraction\",\n",
    "    \"object-detection\": \"Computer Vision Object Detection\",\n",
    "    \"zero-shot-object-detection\": \"Computer Vision Zero-Shot Object Detection\",\n",
    "    \"keypoint-detection\": \"Computer Vision Keypoint Detection\",\n",
    "    \"mask-generation\": \"Computer Vision Mask Generation\",\n",
    "    \"depth-estimation\": \"Computer Vision Depth Estimation\",\n",
    "    \"unconditional-image-generation\": \"Computer Vision Unconditional Image Generation\",\n",
    "    \"text-to-image\": \"Computer Vision Text-to-Image\",\n",
    "    \"text-to-video\": \"Computer Vision Text-to-Video\",\n",
    "    \"text-to-3d\": \"Computer Vision Text-to-3D\",\n",
    "    \"image-to-text\": \"Computer Vision Image-to-Text\",\n",
    "    \"image-to-image\": \"Computer Vision Image-to-Image\",\n",
    "    \"image-to-video\": \"Computer Vision Image-to-Video\",\n",
    "    \"image-to-3d\": \"Computer Vision Image-to-3D\",\n",
    "    \"video-classification\": \"Computer Vision Video Classification\",\n",
    "    \"video-to-video\": \"Computer Vision Video-to-Video\",\n",
    "    # --- Multimodal ---\n",
    "    \"any-to-any\": \"Multimodal Any-to-Any\",\n",
    "    \"image-text-to-text\": \"Multimodal Image-Text-to-Text\",\n",
    "    \"image-text-to-image\": \"Multimodal Image-Text-to-Image\",\n",
    "    \"image-text-to-video\": \"Multimodal Image-Text-to-Video\",\n",
    "    \"video-text-to-text\": \"Multimodal Video-Text-to-Text\",\n",
    "    \"visual-question-answering\": \"Multimodal Visual Question Answering\",\n",
    "    \"document-question-answering\": \"Multimodal Document Question Answering\",\n",
    "    \"visual-document-retrieval\": \"Multimodal Visual Document Retrieval\",\n",
    "    # --- Audio ---\n",
    "    \"audio-classification\": \"Audio Audio Classification\",\n",
    "    \"audio-to-audio\": \"Audio Audio-to-Audio\",\n",
    "    \"audio-text-to-text\": \"Audio Audio Text-to-Text\",\n",
    "    \"automatic-speech-recognition\": \"Audio Automatic Speech Recognition\",\n",
    "    \"text-to-speech\": \"Audio Text-to-Speech\",\n",
    "    \"text-to-audio\": \"Audio Text-to-Audio\",\n",
    "    # --- Tabular ---\n",
    "    \"tabular-classification\": \"Tabular Tabular Classification\",\n",
    "    \"tabular-regression\": \"Tabular Tabular Regression\",\n",
    "    # --- Reinforcement Learning ---\n",
    "    \"reinforcement-learning\": \"Reinforcement Learning\",\n",
    "}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "c314562f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Harmonise the column names of both prompt sets (the \"domain\" column keeps its name;\n",
    "# only its values are replaced below).\n",
    "COLUMN_RENAMES = {\n",
    "    \"Instruction\": \"instruction\",\n",
    "    \"model_id\": \"model_name\",\n",
    "    \"modelcard\": \"description\",\n",
    "}\n",
    "df_new = pd.DataFrame(recent_prompts).rename(columns=COLUMN_RENAMES)\n",
    "df_old = pd.DataFrame(old_prompts).rename(columns=COLUMN_RENAMES)\n",
    "\n",
    "# Recent prompts carry raw task tags: translate them to the corpus domain labels.\n",
    "# Series.map silently yields NaN for tags that are missing from `mapping`, so report\n",
    "# them instead of letting rows lose their domain unnoticed.\n",
    "raw_domains = df_new[\"domain\"]\n",
    "unmapped_domains = sorted(set(raw_domains.dropna().unique()) - set(mapping), key=str)\n",
    "if unmapped_domains:\n",
    "    print(f\"WARNING: domains without a mapping (set to NaN): {unmapped_domains}\")\n",
    "df_new[\"domain\"] = raw_domains.map(mapping)\n",
    "\n",
    "# Legacy prompts take their domain from the corpus entry with the same model name.\n",
    "# Corpus entries without a \"domain\" key map to None; models absent from the corpus\n",
    "# end up as NaN.\n",
    "model_id_to_domain = {entry[\"model_name\"]: entry.get(\"domain\") for entry in corpus}\n",
    "df_old[\"domain\"] = df_old[\"model_name\"].map(model_id_to_domain)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "d2a0cb51",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of missing domains in old prompts: 0\n"
     ]
    }
   ],
   "source": [
    "# Count the legacy prompts that ended up without a domain after the corpus lookup.\n",
    "missing_domains = df_old[\"domain\"].isna().sum()\n",
    "print(f\"Number of missing domains in old prompts: {missing_domains}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "28762efc",
   "metadata": {},
   "source": [
    "# Handle created_at\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "7a6a167a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "\n",
    "def parse_created_at(s):\n",
    "    \"\"\"Parse a column of heterogeneous timestamps into UTC datetimes.\n",
    "\n",
    "    Values that convert to numbers are treated as Unix epochs: above 1e12 as\n",
    "    milliseconds, otherwise as seconds. Every other value is parsed as a date\n",
    "    string; anything unparseable becomes NaT.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    s : pandas.Series\n",
    "        Raw ``created_at`` values (numbers, numeric strings or date strings),\n",
    "        or an already parsed datetime Series.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.Series\n",
    "        Timezone-aware (UTC) datetimes sharing the index of ``s``.\n",
    "    \"\"\"\n",
    "    # Already-parsed input (e.g. the cell is re-run): pd.to_numeric would turn the\n",
    "    # datetimes into nanosecond integers, which would then be misread as\n",
    "    # millisecond epochs and overflow. Just normalise to UTC and return.\n",
    "    if pd.api.types.is_datetime64_any_dtype(s):\n",
    "        return pd.to_datetime(s, utc=True)\n",
    "\n",
    "    num = pd.to_numeric(s, errors=\"coerce\")\n",
    "\n",
    "    # Only the non-numeric entries go through string parsing, so raw epoch numbers\n",
    "    # are never interpreted as nanoseconds by to_datetime.\n",
    "    dt = pd.to_datetime(s.where(num.isna()), errors=\"coerce\", utc=True)\n",
    "\n",
    "    # 1e12 ms is September 2001, so anything larger cannot be a seconds epoch\n",
    "    ms = num.notna() & (num > 1e12)\n",
    "    sec = num.notna() & ~ms\n",
    "\n",
    "    dt.loc[ms] = pd.to_datetime(num[ms], unit=\"ms\", utc=True)\n",
    "    dt.loc[sec] = pd.to_datetime(num[sec], unit=\"s\", utc=True)\n",
    "\n",
    "    return dt\n",
    "\n",
    "\n",
    "# Normalise the created_at column of both prompt tables to UTC datetimes.\n",
    "for prompts_df in (df_new, df_old):\n",
    "    prompts_df[\"created_at\"] = parse_created_at(prompts_df[\"created_at\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "a2049cfc",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2024-09-02 06:36:42+00:00\n",
      "2025-05-31 09:02:35+00:00\n",
      "2025-05-31 10:39:16+00:00\n",
      "2026-01-15 17:15:23+00:00\n",
      "Number of recent models: 9290\n",
      "Number of old recent models: 9258\n",
      "Number of common models between old and old recent: 0\n",
      "Number of unique models in old recent: 465\n",
      "Number of unique models in recent: 466\n",
      "########################################################################\n",
      "New number of recent prompts: 9890\n",
      "New number of unique models in recent: 526\n",
      "New number of old recent models: 8658\n",
      "New number of unique models in old recent: 465\n"
     ]
    }
   ],
   "source": [
    "# Split df_new at the median created_at timestamp: rows at or after the median form\n",
    "# the \"recent\" half, earlier rows the \"old recent\" half.\n",
    "N_MODELS_TO_MOVE = 60  # old-recent models that donate prompts to the recent half\n",
    "N_PROMPTS_PER_MOVED_MODEL = 10  # prompts donated by each of those models\n",
    "SPLIT_SEED = 42\n",
    "\n",
    "df_new[\"created_at\"] = pd.to_datetime(df_new[\"created_at\"])\n",
    "median_date = df_new[\"created_at\"].median()\n",
    "df_recent = df_new[df_new[\"created_at\"] >= median_date]\n",
    "df_old_recent = df_new[df_new[\"created_at\"] < median_date]\n",
    "\n",
    "# date range covered by each half\n",
    "print(df_old_recent[\"created_at\"].min())\n",
    "print(df_old_recent[\"created_at\"].max())\n",
    "print(df_recent[\"created_at\"].min())\n",
    "print(df_recent[\"created_at\"].max())\n",
    "\n",
    "# sizes (len() counts prompt rows, nunique() counts models)\n",
    "print(f\"Number of recent models: {len(df_recent)}\")\n",
    "print(f\"Number of old recent models: {len(df_old_recent)}\")\n",
    "# overlap in model names between the legacy prompts and the old-recent half\n",
    "unique_old_models = set(df_old[\"model_name\"].unique())\n",
    "unique_old_recent_models = set(df_old_recent[\"model_name\"].unique())\n",
    "common_models = unique_old_models.intersection(unique_old_recent_models)\n",
    "print(f\"Number of common models between old and old recent: {len(common_models)}\")\n",
    "print(f\"Number of unique models in old recent: {df_old_recent['model_name'].nunique()}\")\n",
    "print(f\"Number of unique models in recent: {df_recent['model_name'].nunique()}\")\n",
    "\n",
    "\n",
    "# Move N_PROMPTS_PER_MOVED_MODEL prompts of N_MODELS_TO_MOVE randomly chosen old-recent\n",
    "# models into the recent half.\n",
    "old_recent_model_names = df_old_recent[\"model_name\"].drop_duplicates()\n",
    "selected_models = old_recent_model_names.sample(\n",
    "    min(N_MODELS_TO_MOVE, len(old_recent_model_names)), random_state=SPLIT_SEED\n",
    ")\n",
    "prompts_to_add = []\n",
    "for model in selected_models:\n",
    "    model_prompts = df_old_recent[df_old_recent[\"model_name\"] == model]\n",
    "    # a model with fewer prompts than requested donates all of them instead of\n",
    "    # making DataFrame.sample raise\n",
    "    n_prompts = min(N_PROMPTS_PER_MOVED_MODEL, len(model_prompts))\n",
    "    sampled_prompts = model_prompts.sample(n_prompts, random_state=SPLIT_SEED)\n",
    "    prompts_to_add.append(sampled_prompts)\n",
    "if prompts_to_add:\n",
    "    prompts_to_add_df = pd.concat(prompts_to_add)\n",
    "else:\n",
    "    # nothing to move: keep an empty frame with the right columns\n",
    "    prompts_to_add_df = df_old_recent.iloc[0:0]\n",
    "# The selected models are distinct, so removing all moved prompts in one drop is\n",
    "# equivalent to dropping them model by model inside the loop.\n",
    "df_old_recent = df_old_recent.drop(prompts_to_add_df.index)\n",
    "df_recent = pd.concat([df_recent, prompts_to_add_df])\n",
    "\n",
    "# sizes and stats after the move\n",
    "print(\"########################################################################\")\n",
    "\n",
    "print(f\"New number of recent prompts: {len(df_recent)}\")\n",
    "print(f\"New number of unique models in recent: {df_recent['model_name'].nunique()}\")\n",
    "print(f\"New number of old recent models: {len(df_old_recent)}\")\n",
    "print(\n",
    "    f\"New number of unique models in old recent: {df_old_recent['model_name'].nunique()}\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "e27700d0",
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "\n",
    "# Distribute the legacy (df_old) models over the two experiences: a fixed number of\n",
    "# them joins the old-recent prompts (experience 3), the rest joins the recent prompts\n",
    "# (experience 4).\n",
    "N_OLD_MODELS_FOR_EXPERIENCE_3 = 70  # deliberately not num_models_old // 2\n",
    "SPLIT_SEED = 42\n",
    "\n",
    "num_models_old = df_old[\"model_name\"].nunique()\n",
    "# capped so random.sample cannot be asked for more models than exist\n",
    "half_num_models_old = min(N_OLD_MODELS_FOR_EXPERIENCE_3, num_models_old)\n",
    "old_models_list = df_old[\"model_name\"].unique().tolist()\n",
    "\n",
    "# Seeded generator: an unseeded random.sample gives a different split on every run,\n",
    "# which makes the resulting datasets irreproducible.\n",
    "rng = random.Random(SPLIT_SEED)\n",
    "models_for_old_recent = rng.sample(old_models_list, half_num_models_old)\n",
    "old_recent_model_set = set(models_for_old_recent)  # O(1) membership tests\n",
    "models_for_recent = [\n",
    "    model for model in old_models_list if model not in old_recent_model_set\n",
    "]\n",
    "\n",
    "df_old_recent_additional = df_old[df_old[\"model_name\"].isin(models_for_old_recent)]\n",
    "df_recent_additional = df_old[df_old[\"model_name\"].isin(models_for_recent)]\n",
    "# concatenate dataframes\n",
    "experience_3 = pd.concat([df_old_recent, df_old_recent_additional], ignore_index=True)\n",
    "experience_4 = pd.concat([df_recent, df_recent_additional], ignore_index=True)\n",
    "\n",
    "# Shuffle the rows.\n",
    "# NOTE(review): reset_index() without drop=True keeps the pre-shuffle position as an\n",
    "# \"index\" column; left unchanged in case later cells rely on it - confirm.\n",
    "experience_3 = experience_3.sample(frac=1, random_state=SPLIT_SEED).reset_index()\n",
    "experience_4 = experience_4.sample(frac=1, random_state=SPLIT_SEED).reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "eb53fbb2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "########################################################################\n",
      "Final number of prompts in experience 3: 10011\n",
      "Final number of unique models in experience 3: 530\n",
      "Final number of prompts in experience 4: 10490\n",
      "Final number of unique models in experience 4: 556\n"
     ]
    }
   ],
   "source": [
    "# Drop models that appear with fewer than `min_prompts` prompts in experience_3 and\n",
    "# experience_4.\n",
    "min_prompts = 10\n",
    "\n",
    "\n",
    "def _models_with_enough_prompts(df, min_count):\n",
    "    \"\"\"Return (prompt count per model, index of models with at least min_count prompts).\"\"\"\n",
    "    counts = df[\"model_name\"].value_counts()\n",
    "    return counts, counts[counts >= min_count].index\n",
    "\n",
    "\n",
    "model_prompt_counts_3, models_to_keep_3 = _models_with_enough_prompts(\n",
    "    experience_3, min_prompts\n",
    ")\n",
    "experience_3 = experience_3[experience_3[\"model_name\"].isin(models_to_keep_3)]\n",
    "model_prompt_counts_4, models_to_keep_4 = _models_with_enough_prompts(\n",
    "    experience_4, min_prompts\n",
    ")\n",
    "experience_4 = experience_4[experience_4[\"model_name\"].isin(models_to_keep_4)]\n",
    "\n",
    "# print final sizes\n",
    "print(\"########################################################################\")\n",
    "print(f\"Final number of prompts in experience 3: {len(experience_3)}\")\n",
    "print(\n",
    "    f\"Final number of unique models in experience 3: {experience_3['model_name'].nunique()}\"\n",
    ")\n",
    "print(f\"Final number of prompts in experience 4: {len(experience_4)}\")\n",
    "print(\n",
    "    f\"Final number of unique models in experience 4: {experience_4['model_name'].nunique()}\"\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "f1f76a2b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Statistics for Experience 3:\n",
      "Number of prompts: 10011\n",
      "Number of unique models: 530\n",
      "Number of unique domains: 49\n",
      "Last date in created_at: 2025-05-31 09:02:35+00:00\n",
      "First date in created_at: 2022-03-02 23:29:04+00:00\n",
      "Number of models also in old prompts: 70\n",
      "Minimum number of prompts per model: 10\n",
      "\n",
      "Statistics for Experience 4:\n",
      "Number of prompts: 10490\n",
      "Number of unique models: 556\n",
      "Number of unique domains: 46\n",
      "Last date in created_at: 2026-01-15 17:15:23+00:00\n",
      "First date in created_at: 2022-03-02 23:29:04+00:00\n",
      "Number of models also in old prompts: 90\n",
      "Minimum number of prompts per model: 10\n",
      "\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "\n",
    "# statistics\n",
    "def print_stats(df, name, df_old):\n",
    "    \"\"\"Print a short summary of one experience split.\n",
    "\n",
    "    Reports the number of prompts, distinct models and distinct domains, the\n",
    "    newest and oldest ``created_at`` timestamps (when the column exists), how\n",
    "    many models also occur in ``df_old`` and the smallest per-model prompt count.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : pandas.DataFrame\n",
    "        Split to describe; its ``model_name`` and ``domain`` columns are read.\n",
    "    name : str\n",
    "        Label used in the header line.\n",
    "    df_old : pandas.DataFrame\n",
    "        Reference frame whose ``model_name`` values are intersected with ``df``.\n",
    "    \"\"\"\n",
    "    print(f\"Statistics for {name}:\")\n",
    "    print(f\"Number of prompts: {len(df)}\")\n",
    "    print(f\"Number of unique models: {df['model_name'].nunique()}\")\n",
    "    print(f\"Number of unique domains: {df['domain'].nunique()}\")\n",
    "\n",
    "    if \"created_at\" not in df.columns:\n",
    "        print(\"Column 'created_at' not found\")\n",
    "    else:\n",
    "        # coerce everything to tz-aware UTC so naive and aware values can be compared\n",
    "        timestamps = pd.to_datetime(df[\"created_at\"], errors=\"coerce\", utc=True)\n",
    "        newest, oldest = timestamps.max(), timestamps.min()\n",
    "        if pd.isna(newest):\n",
    "            print(\"Last date in created_at: N/A\")\n",
    "        else:\n",
    "            print(f\"Last date in created_at: {newest}\")\n",
    "            print(f\"First date in created_at: {oldest}\")\n",
    "\n",
    "    # overlap in model names with the reference frame\n",
    "    own_models = set(df[\"model_name\"].unique())\n",
    "    reference_models = set(df_old[\"model_name\"].unique())\n",
    "    print(f\"Number of models also in old prompts: {len(own_models & reference_models)}\")\n",
    "\n",
    "    # smallest number of prompts any single model has\n",
    "    fewest_prompts = df[\"model_name\"].value_counts().min()\n",
    "    print(f\"Minimum number of prompts per model: {fewest_prompts}\")\n",
    "    print()\n",
    "\n",
    "\n",
    "# Experience 3 is compared against the legacy prompts only; experience 4 against the\n",
    "# legacy prompts plus what is left of the old-recent half.\n",
    "print_stats(experience_3, \"Experience 3\", df_old)\n",
    "\n",
    "earlier_prompts = pd.concat([df_old, df_old_recent], ignore_index=True)\n",
    "print_stats(experience_4, \"Experience 4\", earlier_prompts)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "05b1d551",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\\begin{tabular}{lrr}\n",
      "\\toprule\n",
      "domain & counts_e3 & counts_e4 \\\\\n",
      "\\midrule\n",
      "Audio Audio Classification & 350 & 170 \\\\\n",
      "Audio Audio Text-to-Text & 60 & 428 \\\\\n",
      "Audio Audio-to-Audio & 140 & 380 \\\\\n",
      "Audio Automatic Speech Recognition & 360 & 260 \\\\\n",
      "Audio Classification & 20 & 0 \\\\\n",
      "Audio Text-to-Speech & 230 & 290 \\\\\n",
      "Audio Voice Activity Detection & 0 & 20 \\\\\n",
      "Computer Vision Depth Estimation & 209 & 269 \\\\\n",
      "Computer Vision Image Classification & 219 & 180 \\\\\n",
      "Computer Vision Image Feature Extraction & 120 & 377 \\\\\n",
      "Computer Vision Image Segmentation & 356 & 159 \\\\\n",
      "Computer Vision Image-to-3D & 160 & 280 \\\\\n",
      "Computer Vision Image-to-Image & 159 & 500 \\\\\n",
      "Computer Vision Image-to-Text & 120 & 379 \\\\\n",
      "Computer Vision Image-to-Video & 260 & 220 \\\\\n",
      "Computer Vision Keypoint Detection & 230 & 269 \\\\\n",
      "Computer Vision Mask Generation & 250 & 210 \\\\\n",
      "Computer Vision Object Detection & 390 & 190 \\\\\n",
      "Computer Vision Text-to-3D & 180 & 240 \\\\\n",
      "Computer Vision Text-to-Image & 179 & 320 \\\\\n",
      "Computer Vision Text-to-Video & 190 & 269 \\\\\n",
      "Computer Vision Unconditional Image Generation & 80 & 100 \\\\\n",
      "Computer Vision Video Classification & 209 & 240 \\\\\n",
      "Computer Vision Video-to-Video & 130 & 350 \\\\\n",
      "Computer Vision Zero-Shot Image Classification & 318 & 139 \\\\\n",
      "Computer Vision Zero-Shot Object Detection & 300 & 160 \\\\\n",
      "Multimodal Document Question Answering & 319 & 60 \\\\\n",
      "Multimodal Feature Extraction & 20 & 0 \\\\\n",
      "Multimodal Image-Text-to-Text & 189 & 305 \\\\\n",
      "Multimodal Image-to-Text & 76 & 20 \\\\\n",
      "Multimodal Text-to-Image & 20 & 0 \\\\\n",
      "Multimodal Video-Text-to-Text & 259 & 159 \\\\\n",
      "Multimodal Visual Document Retrieval & 289 & 210 \\\\\n",
      "Multimodal Visual Question Answering & 249 & 110 \\\\\n",
      "Multimodal Zero-Shot Image Classification & 20 & 0 \\\\\n",
      "Natural Language Processing Feature Extraction & 200 & 240 \\\\\n",
      "Natural Language Processing Fill-Mask & 200 & 160 \\\\\n",
      "Natural Language Processing Question Answering & 140 & 155 \\\\\n",
      "Natural Language Processing Sentence Similarity & 419 & 200 \\\\\n",
      "Natural Language Processing Summarization & 140 & 140 \\\\\n",
      "Natural Language Processing Table Question Answering & 469 & 90 \\\\\n",
      "Natural Language Processing Text Classification & 340 & 157 \\\\\n",
      "Natural Language Processing Text Generation & 364 & 138 \\\\\n",
      "Natural Language Processing Text Ranking & 140 & 320 \\\\\n",
      "Natural Language Processing Token Classification & 20 & 538 \\\\\n",
      "Natural Language Processing Translation & 219 & 240 \\\\\n",
      "Natural Language Processing Zero-Shot Classification & 360 & 230 \\\\\n",
      "Reinforcement Learning & 80 & 119 \\\\\n",
      "Tabular Tabular Classification & 100 & 200 \\\\\n",
      "Tabular Tabular Regression & 160 & 300 \\\\\n",
      "\\bottomrule\n",
      "\\end{tabular}\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>domain</th>\n",
       "      <th>counts_e3</th>\n",
       "      <th>counts_e4</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Audio Audio Classification</td>\n",
       "      <td>350</td>\n",
       "      <td>170</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Audio Audio Text-to-Text</td>\n",
       "      <td>60</td>\n",
       "      <td>428</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Audio Audio-to-Audio</td>\n",
       "      <td>140</td>\n",
       "      <td>380</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Audio Automatic Speech Recognition</td>\n",
       "      <td>360</td>\n",
       "      <td>260</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Audio Classification</td>\n",
       "      <td>20</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>Audio Text-to-Speech</td>\n",
       "      <td>230</td>\n",
       "      <td>290</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>Audio Voice Activity Detection</td>\n",
       "      <td>0</td>\n",
       "      <td>20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>Computer Vision Depth Estimation</td>\n",
       "      <td>209</td>\n",
       "      <td>269</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Computer Vision Image Classification</td>\n",
       "      <td>219</td>\n",
       "      <td>180</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>Computer Vision Image Feature Extraction</td>\n",
       "      <td>120</td>\n",
       "      <td>377</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>Computer Vision Image Segmentation</td>\n",
       "      <td>356</td>\n",
       "      <td>159</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>Computer Vision Image-to-3D</td>\n",
       "      <td>160</td>\n",
       "      <td>280</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>Computer Vision Image-to-Image</td>\n",
       "      <td>159</td>\n",
       "      <td>500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>Computer Vision Image-to-Text</td>\n",
       "      <td>120</td>\n",
       "      <td>379</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>Computer Vision Image-to-Video</td>\n",
       "      <td>260</td>\n",
       "      <td>220</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>Computer Vision Keypoint Detection</td>\n",
       "      <td>230</td>\n",
       "      <td>269</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>Computer Vision Mask Generation</td>\n",
       "      <td>250</td>\n",
       "      <td>210</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>Computer Vision Object Detection</td>\n",
       "      <td>390</td>\n",
       "      <td>190</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>Computer Vision Text-to-3D</td>\n",
       "      <td>180</td>\n",
       "      <td>240</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>Computer Vision Text-to-Image</td>\n",
       "      <td>179</td>\n",
       "      <td>320</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>Computer Vision Text-to-Video</td>\n",
       "      <td>190</td>\n",
       "      <td>269</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>Computer Vision Unconditional Image Generation</td>\n",
       "      <td>80</td>\n",
       "      <td>100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>Computer Vision Video Classification</td>\n",
       "      <td>209</td>\n",
       "      <td>240</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>Computer Vision Video-to-Video</td>\n",
       "      <td>130</td>\n",
       "      <td>350</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>Computer Vision Zero-Shot Image Classification</td>\n",
       "      <td>318</td>\n",
       "      <td>139</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>Computer Vision Zero-Shot Object Detection</td>\n",
       "      <td>300</td>\n",
       "      <td>160</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>Multimodal Document Question Answering</td>\n",
       "      <td>319</td>\n",
       "      <td>60</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>Multimodal Feature Extraction</td>\n",
       "      <td>20</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>Multimodal Image-Text-to-Text</td>\n",
       "      <td>189</td>\n",
       "      <td>305</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>Multimodal Image-to-Text</td>\n",
       "      <td>76</td>\n",
       "      <td>20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>Multimodal Text-to-Image</td>\n",
       "      <td>20</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31</th>\n",
       "      <td>Multimodal Video-Text-to-Text</td>\n",
       "      <td>259</td>\n",
       "      <td>159</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>Multimodal Visual Document Retrieval</td>\n",
       "      <td>289</td>\n",
       "      <td>210</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33</th>\n",
       "      <td>Multimodal Visual Question Answering</td>\n",
       "      <td>249</td>\n",
       "      <td>110</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34</th>\n",
       "      <td>Multimodal Zero-Shot Image Classification</td>\n",
       "      <td>20</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35</th>\n",
       "      <td>Natural Language Processing Feature Extraction</td>\n",
       "      <td>200</td>\n",
       "      <td>240</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36</th>\n",
       "      <td>Natural Language Processing Fill-Mask</td>\n",
       "      <td>200</td>\n",
       "      <td>160</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>37</th>\n",
       "      <td>Natural Language Processing Question Answering</td>\n",
       "      <td>140</td>\n",
       "      <td>155</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38</th>\n",
       "      <td>Natural Language Processing Sentence Similarity</td>\n",
       "      <td>419</td>\n",
       "      <td>200</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>39</th>\n",
       "      <td>Natural Language Processing Summarization</td>\n",
       "      <td>140</td>\n",
       "      <td>140</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>40</th>\n",
       "      <td>Natural Language Processing Table Question Ans...</td>\n",
       "      <td>469</td>\n",
       "      <td>90</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41</th>\n",
       "      <td>Natural Language Processing Text Classification</td>\n",
       "      <td>340</td>\n",
       "      <td>157</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42</th>\n",
       "      <td>Natural Language Processing Text Generation</td>\n",
       "      <td>364</td>\n",
       "      <td>138</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>43</th>\n",
       "      <td>Natural Language Processing Text Ranking</td>\n",
       "      <td>140</td>\n",
       "      <td>320</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>44</th>\n",
       "      <td>Natural Language Processing Token Classification</td>\n",
       "      <td>20</td>\n",
       "      <td>538</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>45</th>\n",
       "      <td>Natural Language Processing Translation</td>\n",
       "      <td>219</td>\n",
       "      <td>240</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>46</th>\n",
       "      <td>Natural Language Processing Zero-Shot Classifi...</td>\n",
       "      <td>360</td>\n",
       "      <td>230</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>47</th>\n",
       "      <td>Reinforcement Learning</td>\n",
       "      <td>80</td>\n",
       "      <td>119</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48</th>\n",
       "      <td>Tabular Tabular Classification</td>\n",
       "      <td>100</td>\n",
       "      <td>200</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49</th>\n",
       "      <td>Tabular Tabular Regression</td>\n",
       "      <td>160</td>\n",
       "      <td>300</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                               domain  counts_e3  counts_e4\n",
       "0                          Audio Audio Classification        350        170\n",
       "1                            Audio Audio Text-to-Text         60        428\n",
       "2                                Audio Audio-to-Audio        140        380\n",
       "3                  Audio Automatic Speech Recognition        360        260\n",
       "4                                Audio Classification         20          0\n",
       "5                                Audio Text-to-Speech        230        290\n",
       "6                      Audio Voice Activity Detection          0         20\n",
       "7                    Computer Vision Depth Estimation        209        269\n",
       "8                Computer Vision Image Classification        219        180\n",
       "9            Computer Vision Image Feature Extraction        120        377\n",
       "10                 Computer Vision Image Segmentation        356        159\n",
       "11                        Computer Vision Image-to-3D        160        280\n",
       "12                     Computer Vision Image-to-Image        159        500\n",
       "13                      Computer Vision Image-to-Text        120        379\n",
       "14                     Computer Vision Image-to-Video        260        220\n",
       "15                 Computer Vision Keypoint Detection        230        269\n",
       "16                    Computer Vision Mask Generation        250        210\n",
       "17                   Computer Vision Object Detection        390        190\n",
       "18                         Computer Vision Text-to-3D        180        240\n",
       "19                      Computer Vision Text-to-Image        179        320\n",
       "20                      Computer Vision Text-to-Video        190        269\n",
       "21     Computer Vision Unconditional Image Generation         80        100\n",
       "22               Computer Vision Video Classification        209        240\n",
       "23                     Computer Vision Video-to-Video        130        350\n",
       "24     Computer Vision Zero-Shot Image Classification        318        139\n",
       "25         Computer Vision Zero-Shot Object Detection        300        160\n",
       "26             Multimodal Document Question Answering        319         60\n",
       "27                      Multimodal Feature Extraction         20          0\n",
       "28                      Multimodal Image-Text-to-Text        189        305\n",
       "29                           Multimodal Image-to-Text         76         20\n",
       "30                           Multimodal Text-to-Image         20          0\n",
       "31                      Multimodal Video-Text-to-Text        259        159\n",
       "32               Multimodal Visual Document Retrieval        289        210\n",
       "33               Multimodal Visual Question Answering        249        110\n",
       "34          Multimodal Zero-Shot Image Classification         20          0\n",
       "35     Natural Language Processing Feature Extraction        200        240\n",
       "36              Natural Language Processing Fill-Mask        200        160\n",
       "37     Natural Language Processing Question Answering        140        155\n",
       "38    Natural Language Processing Sentence Similarity        419        200\n",
       "39          Natural Language Processing Summarization        140        140\n",
       "40  Natural Language Processing Table Question Ans...        469         90\n",
       "41    Natural Language Processing Text Classification        340        157\n",
       "42        Natural Language Processing Text Generation        364        138\n",
       "43           Natural Language Processing Text Ranking        140        320\n",
       "44   Natural Language Processing Token Classification         20        538\n",
       "45            Natural Language Processing Translation        219        240\n",
       "46  Natural Language Processing Zero-Shot Classifi...        360        230\n",
       "47                             Reinforcement Learning         80        119\n",
       "48                     Tabular Tabular Classification        100        200\n",
       "49                         Tabular Tabular Regression        160        300"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "e3 = experience_3.groupby(\"domain\").size().reset_index(name=\"counts\")\n",
    "e4 = experience_4.groupby(\"domain\").size().reset_index(name=\"counts\")\n",
    "# join e3 and e4 on domain mantaining all domains\n",
    "e3_e4 = pd.merge(e3, e4, on=\"domain\", how=\"outer\", suffixes=(\"_e3\", \"_e4\"))\n",
    "# convert to latex table transform all to integer without scientific notation\n",
    "e3_e4[\"counts_e3\"] = e3_e4[\"counts_e3\"].fillna(0).astype(int)\n",
    "e3_e4[\"counts_e4\"] = e3_e4[\"counts_e4\"].fillna(0).astype(int)\n",
    "latex_table = e3_e4.to_latex(index=False)\n",
    "print(latex_table)\n",
    "e3_e4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "40aaab4f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# split in train valdation and test (80/10/10) in stratified way according to model_name using sklearn\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "train_3, temp_3 = train_test_split(\n",
    "    experience_3, test_size=0.2, stratify=experience_3[\"model_name\"], random_state=42\n",
    ")\n",
    "val_3, test_3 = train_test_split(\n",
    "    temp_3, test_size=0.5, stratify=temp_3[\"model_name\"], random_state=42\n",
    ")\n",
    "train_4, temp_4 = train_test_split(\n",
    "    experience_4, test_size=0.2, stratify=experience_4[\"model_name\"], random_state=42\n",
    ")\n",
    "val_4, test_4 = train_test_split(\n",
    "    temp_4, test_size=0.5, stratify=temp_4[\"model_name\"], random_state=42\n",
    ")\n",
    "# save to jsonl files\n",
    "output_dir_exp3 = os.path.join(os.getenv(\"DATA_PATH\"), \"raw/exp3/\")\n",
    "output_dir_exp4 = os.path.join(os.getenv(\"DATA_PATH\"), \"raw/exp4/\")\n",
    "train_3.to_json(\n",
    "    output_dir_exp3 + \"exp3-train.jsonl\",\n",
    "    orient=\"records\",\n",
    "    lines=True,\n",
    "    force_ascii=False,\n",
    ")\n",
    "val_3.to_json(\n",
    "    output_dir_exp3 + \"exp3-val.jsonl\",\n",
    "    orient=\"records\",\n",
    "    lines=True,\n",
    "    force_ascii=False,\n",
    ")\n",
    "test_3.to_json(\n",
    "    output_dir_exp3 + \"exp3-eval.jsonl\",\n",
    "    orient=\"records\",\n",
    "    lines=True,\n",
    "    force_ascii=False,\n",
    ")\n",
    "train_4.to_json(\n",
    "    output_dir_exp4 + \"exp4-train.jsonl\",\n",
    "    orient=\"records\",\n",
    "    lines=True,\n",
    "    force_ascii=False,\n",
    ")\n",
    "val_4.to_json(\n",
    "    output_dir_exp4 + \"exp4-val.jsonl\",\n",
    "    orient=\"records\",\n",
    "    lines=True,\n",
    "    force_ascii=False,\n",
    ")\n",
    "test_4.to_json(\n",
    "    output_dir_exp4 + \"exp4-eval.jsonl\",\n",
    "    orient=\"records\",\n",
    "    lines=True,\n",
    "    force_ascii=False,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "e91238a4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Statistics for Experience 3 Train:\n",
      "Number of prompts: 8008\n",
      "Number of unique models: 530\n",
      "Number of unique domains: 49\n",
      "Last date in created_at: 2025-05-31 09:02:35+00:00\n",
      "First date in created_at: 2022-03-02 23:29:04+00:00\n",
      "Number of models also in old prompts: 70\n",
      "Minimum number of prompts per model: 8\n",
      "\n",
      "Statistics for Experience 3 Validation:\n",
      "Number of prompts: 1001\n",
      "Number of unique models: 530\n",
      "Number of unique domains: 49\n",
      "Last date in created_at: 2025-05-31 09:02:35+00:00\n",
      "First date in created_at: 2022-03-02 23:29:04+00:00\n",
      "Number of models also in old prompts: 70\n",
      "Minimum number of prompts per model: 1\n",
      "\n",
      "Statistics for Experience 3 Test:\n",
      "Number of prompts: 1002\n",
      "Number of unique models: 530\n",
      "Number of unique domains: 49\n",
      "Last date in created_at: 2025-05-31 09:02:35+00:00\n",
      "First date in created_at: 2022-03-02 23:29:04+00:00\n",
      "Number of models also in old prompts: 70\n",
      "Minimum number of prompts per model: 1\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print_stats(train_3, \"Experience 3 Train\", df_old)\n",
    "print_stats(val_3, \"Experience 3 Validation\", df_old)\n",
    "print_stats(test_3, \"Experience 3 Test\", df_old)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "c2a8dfff",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "All models in test are present in train for Experience 3.\n",
      "All models in test are present in train for Experience 4.\n",
      "All models in test are present in train for Experience 3.\n",
      "All models in test are present in train for Experience 4.\n"
     ]
    }
   ],
   "source": [
    "# check if all models in test are in train\n",
    "def check_models_in_train(train_df, test_df, experience_name):\n",
    "    train_models = set(train_df[\"model_name\"].unique())\n",
    "    test_models = set(test_df[\"model_name\"].unique())\n",
    "    missing_models = test_models - train_models\n",
    "    if len(missing_models) == 0:\n",
    "        print(f\"All models in test are present in train for {experience_name}.\")\n",
    "    else:\n",
    "        print(f\"Missing models in train for {experience_name}: {missing_models}\")\n",
    "\n",
    "\n",
    "check_models_in_train(train_3, test_3, \"Experience 3\")\n",
    "check_models_in_train(train_4, test_4, \"Experience 4\")\n",
    "\n",
    "check_models_in_train(train_3, val_3, \"Experience 3\")\n",
    "check_models_in_train(train_4, val_4, \"Experience 4\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f3663b14",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
