{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "cafc53e5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loaded 938 entries from /leonardo/home/userexternal/ggramagl/prjects/CCO/self-instruct/data/models_with_prompt_WITH_2024.jsonl\n",
      "Removed entries with 'None': 7\n",
      "Final cleaned models: 931\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "import re\n",
    "from dotenv import load_dotenv\n",
    "import os\n",
    "\n",
    "load_dotenv()\n",
    "\n",
    "# Fail fast, naming the missing variable, when the configuration is absent:\n",
    "# os.getenv would return None and os.path.join would raise an opaque TypeError.\n",
    "file = os.path.join(\n",
    "    os.environ[\"SELF_INSTRUCT_ROOT_DATA\"],\n",
    "    os.environ[\"FILE_NAME_SELF_INSTRUCTED_MODELS_STEP_5\"],\n",
    ")\n",
    "\n",
    "# The dataset contains non-ASCII text, so do not depend on the platform's\n",
    "# default encoding when reading it.\n",
    "with open(file, \"r\", encoding=\"utf-8\") as f:\n",
    "    data = json.load(f)\n",
    "print(f\"Loaded {len(data)} entries from {file}\")\n",
    "\n",
    "# Remove models whose modelcard or queries contain the string \"None\"\n",
    "final_cleaned_models = []\n",
    "for entry in data:\n",
    "    # str() turns a null modelcard into \"None\" (so it is dropped like the textual\n",
    "    # placeholder) and `or []` tolerates a null queries field.\n",
    "    modelcard = str(entry.get(\"modelcard\", \"\"))\n",
    "    queries = entry.get(\"queries\") or []\n",
    "\n",
    "    # Substring check: skip the entry if \"None\" appears anywhere in the\n",
    "    # modelcard or in any query\n",
    "    if \"None\" in modelcard or any(\"None\" in str(q) for q in queries):\n",
    "        continue\n",
    "\n",
    "    final_cleaned_models.append(entry)\n",
    "\n",
    "print(f\"Removed entries with 'None': {len(data) - len(final_cleaned_models)}\")\n",
    "print(f\"Final cleaned models: {len(final_cleaned_models)}\")\n",
    "data = final_cleaned_models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "c1db8759",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processing complete:\n",
      "  ✅ Valid entries (direct): 931\n",
      "  ✅ Valid entries (manual extraction): 0\n",
      "  ✅ Total valid: 931\n",
      "  ❌ Invalid JSON: 0\n",
      "  ❌ Invalid structure: 0\n",
      "  📊 Total processed: 931\n",
      "  📈 Success rate: 100.00%\n"
     ]
    }
   ],
   "source": [
    "# Extract and validate queries from all entries\n",
    "EXPECTED_NUM_QUERIES = 20  # each entry must provide exactly this many queries\n",
    "THINK_END_TAG = \"\u003c/think>\"  # text before this closing tag is discarded\n",
    "# Locates a JSON object that contains a \"queries\" key followed by an array\n",
    "JSON_WITH_QUERIES_RE = re.compile(r'\\{[^}]*\"queries\"[^}]*:.*?\\].*?\\}', re.DOTALL)\n",
    "# Locates the opening bracket of the \"queries\" array itself\n",
    "QUERIES_ARRAY_START_RE = re.compile(r'\"queries\"\\s*:\\s*\\[')\n",
    "JSON_DECODER = json.JSONDecoder()\n",
    "\n",
    "\n",
    "def is_valid_queries(queries):\n",
    "    \"\"\"Return True when `queries` is a list of exactly EXPECTED_NUM_QUERIES strings.\"\"\"\n",
    "    return (\n",
    "        isinstance(queries, list)\n",
    "        and len(queries) == EXPECTED_NUM_QUERIES\n",
    "        and all(isinstance(q, str) for q in queries)\n",
    "    )\n",
    "\n",
    "\n",
    "def extract_queries(generated_str):\n",
    "    \"\"\"Extract the `queries` value from one raw generation.\n",
    "\n",
    "    Returns a `(queries, method)` tuple. `method` is \"direct\" when the whole\n",
    "    JSON object was parsed, \"manual\" when only the \"queries\" array could be\n",
    "    parsed, and None (with `queries` None) when no JSON could be parsed.\n",
    "    \"\"\"\n",
    "    # Keep only the text after the closing think tag, if present\n",
    "    if THINK_END_TAG in generated_str:\n",
    "        answer = generated_str.split(THINK_END_TAG, 1)[-1].strip()\n",
    "    else:\n",
    "        answer = generated_str\n",
    "\n",
    "    object_match = JSON_WITH_QUERIES_RE.search(answer)\n",
    "    if object_match is None:\n",
    "        return None, None\n",
    "\n",
    "    try:\n",
    "        # raw_decode parses one JSON value and stops at its end; unlike cutting\n",
    "        # the text at the first \"]\" / \"}\", it is not confused by brackets or\n",
    "        # braces that occur inside a query string\n",
    "        generated_dict, _ = JSON_DECODER.raw_decode(answer[object_match.start() :])\n",
    "        return generated_dict.get(\"queries\", []), \"direct\"\n",
    "    except json.JSONDecodeError:\n",
    "        pass\n",
    "\n",
    "    # The object as a whole is malformed: try to parse just the \"queries\" array\n",
    "    array_match = QUERIES_ARRAY_START_RE.search(answer)\n",
    "    if array_match is None:\n",
    "        return None, None\n",
    "    try:\n",
    "        # array_match ends right after \"[\", so step back one character\n",
    "        queries, _ = JSON_DECODER.raw_decode(answer[array_match.end() - 1 :])\n",
    "    except json.JSONDecodeError:\n",
    "        return None, None\n",
    "    return queries, \"manual\"\n",
    "\n",
    "\n",
    "count_valid = 0\n",
    "count_invalid_json = 0\n",
    "count_invalid_structure = 0\n",
    "count_manual_extraction = 0\n",
    "failed_entries = []  # Store failed entries\n",
    "cleaned_models = []  # Store valid entries\n",
    "\n",
    "for entry in data:\n",
    "    # `or \"\"` also covers entries whose \"generated\" value is null\n",
    "    queries, method = extract_queries(entry.get(\"generated\") or \"\")\n",
    "\n",
    "    if method is None:\n",
    "        # No parseable JSON found\n",
    "        count_invalid_json += 1\n",
    "        failed_entries.append(entry)\n",
    "    elif not is_valid_queries(queries):\n",
    "        count_invalid_structure += 1\n",
    "        failed_entries.append(entry)\n",
    "    else:\n",
    "        if method == \"direct\":\n",
    "            count_valid += 1\n",
    "        else:\n",
    "            count_manual_extraction += 1\n",
    "        entry[\"queries\"] = queries\n",
    "        cleaned_models.append(entry)\n",
    "\n",
    "# Avoid a ZeroDivisionError when there is nothing to process\n",
    "success_rate = len(cleaned_models) / len(data) * 100 if data else 0.0\n",
    "\n",
    "print(\"Processing complete:\")\n",
    "print(f\"  ✅ Valid entries (direct): {count_valid}\")\n",
    "print(f\"  ✅ Valid entries (manual extraction): {count_manual_extraction}\")\n",
    "print(f\"  ✅ Total valid: {len(cleaned_models)}\")\n",
    "print(f\"  ❌ Invalid JSON: {count_invalid_json}\")\n",
    "print(f\"  ❌ Invalid structure: {count_invalid_structure}\")\n",
    "print(f\"  📊 Total processed: {len(data)}\")\n",
    "print(f\"  📈 Success rate: {success_rate:.2f}%\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "b94f3aa1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "============================================================\n",
      "FAILED ENTRIES DETAILS (0 total)\n",
      "============================================================\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Preview the entries that failed extraction or validation\n",
    "MAX_FAILED_PREVIEW = 5  # number of failed entries printed in detail\n",
    "separator = \"=\" * 60\n",
    "total_failed = len(failed_entries)\n",
    "\n",
    "print(f\"\\n{separator}\")\n",
    "print(f\"FAILED ENTRIES DETAILS ({total_failed} total)\")\n",
    "print(f\"{separator}\\n\")\n",
    "\n",
    "for position, failed in enumerate(failed_entries[:MAX_FAILED_PREVIEW], start=1):\n",
    "    print(f\"{position}. Model: {failed['model_id']}\")\n",
    "    if \"raw_text\" in failed:\n",
    "        print(f\"   Raw text: {failed['raw_text'][:150]}...\")\n",
    "    elif \"queries\" in failed:\n",
    "        print(f\"   Found {len(failed['queries'])} queries\")\n",
    "    print()\n",
    "\n",
    "hidden_failed = total_failed - MAX_FAILED_PREVIEW\n",
    "if hidden_failed > 0:\n",
    "    print(f\"... and {hidden_failed} more failed entries\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "2292656e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "failed_entries\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "26b330ae",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The failed entries shown above are considered correct, so add them to the\n",
    "# cleaned_models list as well.\n",
    "# Track object identity so that re-running this cell does not append the same\n",
    "# entry twice.\n",
    "already_cleaned = {id(model) for model in cleaned_models}\n",
    "for entry in failed_entries:\n",
    "    if id(entry) in already_cleaned:\n",
    "        continue\n",
    "    # Guarantee a list under \"queries\", even when the key is missing or null\n",
    "    entry[\"queries\"] = entry.get(\"queries\") or []\n",
    "    cleaned_models.append(entry)\n",
    "    already_cleaned.add(id(entry))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "13f7b274",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "============================================================\n",
      "VALID ENTRIES SAMPLES (931 total)\n",
      "============================================================\n",
      "\n",
      "1. Model: Qwen/Qwen2.5-3B-Instruct\n",
      "   Number of queries: 20\n",
      "   First query: Can you help me solve this math problem step by step?...\n",
      "   Last query: Can you explain the process of photosynthesis in plants in an easy-to-understand way?...\n",
      "\n",
      "2. Model: Qwen/Qwen3-0.6B\n",
      "   Number of queries: 20\n",
      "   First query: Can you explain how to solve this math problem step by step?...\n",
      "   Last query: What are the basic principles of effective time management?...\n",
      "\n",
      "3. Model: openai/gpt-oss-20b\n",
      "   Number of queries: 20\n",
      "   First query: Can you help me break down a complex math problem into smaller steps to make it easier to solve?...\n",
      "   Last query: How might a chef describe the flavor profile of a new dish using vivid and poetic language?...\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Print sample of valid entries\n",
    "print(f\"{'=' * 60}\")\n",
    "print(f\"VALID ENTRIES SAMPLES ({len(cleaned_models)} total)\")\n",
    "print(f\"{'=' * 60}\\n\")\n",
    "\n",
    "for i, entry in enumerate(cleaned_models[:3], 1):  # Show first 3\n",
    "    print(f\"{i}. Model: {entry.get('model_id', 'unknown')}\")\n",
    "    sample_queries = entry[\"queries\"]\n",
    "    print(f\"   Number of queries: {len(sample_queries)}\")\n",
    "    # An entry re-added from failed_entries can carry an empty list, so only\n",
    "    # index into it when it holds at least one query\n",
    "    if sample_queries:\n",
    "        print(f\"   First query: {sample_queries[0][:100]}...\")\n",
    "        print(f\"   Last query: {sample_queries[-1][:100]}...\")\n",
    "    print()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "fe396dc3",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total final prompts generated: 18620\n",
      "Sample final prompts:\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'Instruction': 'Can you help me solve this math problem step by step?',\n",
       " 'model_id': 'Qwen/Qwen2.5-3B-Instruct',\n",
       " 'description': \"Qwen2.5 is the latest series of Qwen large language models. The Qwen2.5 series includes a range of base language models and instruction-tuned language models with parameter sizes ranging from 0.5 to 72 billion. Qwen2.5 introduces several improvements over Qwen2, including significantly more knowledge and enhanced capabilities in coding and mathematics, achieved through specialized expert models in these domains. It also shows significant improvements in instruction following, generating long texts (over 8K tokens), understanding structured data such as tables, and generating structured outputs, especially JSON. The model is more resilient to the diversity of system prompts, which enhances role-play implementation and condition-setting for chatbots. It supports long-context processing up to 128K tokens and can generate up to 8K tokens. The model provides multilingual support for over 29 languages, including Chinese, English, French, Spanish, Portuguese, German, Italian, Russian, Japanese, Korean, Vietnamese, Thai, and Arabic. This specific model is the instruction-tuned 3B Qwen2.5 model. It is a causal language model that has undergone pretraining and post-training. The architecture includes transformers with RoPE, SwiGLU, RMSNorm, attention QKV bias, and tied word embeddings. It has 3.09 billion parameters, with 2.77 billion non-embedding parameters. The model consists of 36 layers and uses 16 attention heads for Q and 2 for KV. It supports a context length of 32,768 tokens and can generate up to 8192 tokens. The code for Qwen2.5 is available in the latest Hugging Face `transformers` library, and it is recommended to use the latest version to avoid errors. Using versions of `transformers` earlier than 4.37.0 may result in a KeyError: 'qwen2'. Detailed evaluation results are available in a separate document, and information on GPU memory requirements and throughput is also provided. 
If you find this work helpful, you may cite the associated publications.\",\n",
       " 'domain': 'text-generation',\n",
       " 'created_at': 1726582132000}"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Expand every model into one record per query in its queries list\n",
    "final_prompts = [\n",
    "    {\n",
    "        \"Instruction\": query,\n",
    "        \"model_id\": model[\"model_id\"],\n",
    "        \"description\": model[\"modelcard\"],\n",
    "        \"domain\": model[\"domain\"],\n",
    "        \"created_at\": model[\"created_at\"],\n",
    "    }\n",
    "    for model in cleaned_models\n",
    "    for query in model[\"queries\"]\n",
    "]\n",
    "\n",
    "total_prompts = len(final_prompts)\n",
    "print(f\"Total final prompts generated: {total_prompts}\")\n",
    "print(f\"Sample final prompts:\")\n",
    "# Display the first record as the cell result\n",
    "final_prompts[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "3f9a75c1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total instructions: 18620\n",
      "Unique models: 931\n",
      "Instruction length stats (chars):\n",
      "count    18620.000000\n",
      "mean        82.973201\n",
      "std         16.337795\n",
      "min         26.000000\n",
      "25%         72.000000\n",
      "50%         82.000000\n",
      "75%         93.000000\n",
      "max        197.000000\n",
      "Name: Instruction, dtype: float64\n"
     ]
    }
   ],
   "source": [
    "# Compute instruction length statistics with pandas\n",
    "import pandas as pd\n",
    "\n",
    "df = pd.DataFrame(final_prompts)\n",
    "\n",
    "num_instructions = len(df)\n",
    "print(f\"Total instructions: {num_instructions}\")\n",
    "\n",
    "num_models = df[\"model_id\"].nunique()\n",
    "print(f\"Unique models: {num_models}\")\n",
    "\n",
    "print(f\"Instruction length stats (chars):\")\n",
    "# Character count of every instruction, summarised by describe()\n",
    "instruction_lengths = df[\"Instruction\"].str.len()\n",
    "print(instruction_lengths.describe())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "aa6b4bf3",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Instructions with minimum length (26 chars):\n",
      "- Model: Seonghaa/korean-emotion-classifier-roberta, Instruction: 이 시를 분석해서 시인의 감정 상태를 설명해줘.\n",
      "- Model: huggingface-KREW/Llama-3.1-8B-Spider-SQL-Ko, Instruction: 학생들의 평균 성적을 계산하는 쿼리를 생성해줘.\n"
     ]
    }
   ],
   "source": [
    "# List every instruction whose length equals the minimum length\n",
    "lengths = df[\"Instruction\"].str.len()\n",
    "min_len = lengths.min()\n",
    "print(f\"\\nInstructions with minimum length ({min_len} chars):\")\n",
    "\n",
    "shortest_instructions = df[lengths == min_len]\n",
    "shortest_pairs = zip(\n",
    "    shortest_instructions[\"model_id\"], shortest_instructions[\"Instruction\"]\n",
    ")\n",
    "for model_name, instruction_text in shortest_pairs:\n",
    "    print(f\"- Model: {model_name}, Instruction: {instruction_text}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "5d0003b9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total unique models: 931\n",
      "Total instructions before dropping duplicates: 18620\n",
      "Total instructions after dropping duplicates: 18548\n",
      "Total unique models after dropping duplicates: 931\n"
     ]
    }
   ],
   "source": [
    "# Sort by model_id, then drop duplicated instructions (the last occurrence is kept)\n",
    "models_before = df[\"model_id\"].nunique()\n",
    "rows_before = len(df)\n",
    "print(f\"Total unique models: {models_before}\")\n",
    "print(f\"Total instructions before dropping duplicates: {rows_before}\")\n",
    "\n",
    "df = df.sort_values(by=[\"model_id\", \"Instruction\"]).drop_duplicates(\n",
    "    subset=[\"Instruction\"], keep=\"last\"\n",
    ")\n",
    "\n",
    "rows_after = len(df)\n",
    "print(f\"Total instructions after dropping duplicates: {rows_after}\")\n",
    "# Number of models that still own at least one instruction\n",
    "models_after = df[\"model_id\"].nunique()\n",
    "print(f\"Total unique models after dropping duplicates: {models_after}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "6daf8f09",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Instruction counts by domain:\n",
      "domain\n",
      "audio-classification              500\n",
      "text-to-speech                    500\n",
      "audio-to-audio                    500\n",
      "image-to-image                    499\n",
      "image-to-text                     499\n",
      "keypoint-detection                499\n",
      "visual-document-retrieval         499\n",
      "text-to-image                     499\n",
      "token-classification              498\n",
      "image-feature-extraction          497\n",
      "audio-text-to-text                497\n",
      "image-segmentation                495\n",
      "image-text-to-text                494\n",
      "image-to-video                    480\n",
      "video-to-video                    480\n",
      "zero-shot-classification          479\n",
      "text-generation                   478\n",
      "tabular-regression                460\n",
      "automatic-speech-recognition      460\n",
      "zero-shot-object-detection        460\n",
      "text-ranking                      460\n",
      "mask-generation                   440\n",
      "object-detection                  440\n",
      "image-to-3d                       440\n",
      "feature-extraction                440\n",
      "text-to-video                     439\n",
      "sentence-similarity               439\n",
      "depth-estimation                  438\n",
      "text-classification               437\n",
      "video-classification              437\n",
      "translation                       419\n",
      "video-text-to-text                418\n",
      "text-to-3d                        400\n",
      "document-question-answering       379\n",
      "fill-mask                         340\n",
      "image-classification              339\n",
      "zero-shot-image-classification    317\n",
      "tabular-classification            300\n",
      "visual-question-answering         299\n",
      "table-question-answering          280\n",
      "question-answering                275\n",
      "summarization                     220\n",
      "reinforcement-learning            199\n",
      "unconditional-image-generation    180\n",
      "Name: count, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "# Count the instructions per domain\n",
    "domain_counts = df[\"domain\"].value_counts()\n",
    "# One print call: header line, then the counts on the following lines\n",
    "print(\"\\nInstruction counts by domain:\", domain_counts, sep=\"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "51670de8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the final (deduplicated) prompts to a jsonl file\n",
    "# os.environ[...] raises a KeyError naming the missing variable, whereas\n",
    "# os.getenv would hand None to os.path.join and fail with an opaque TypeError\n",
    "output_file = os.path.join(\n",
    "    os.environ[\"SELF_INSTRUCT_ROOT_DATA\"],\n",
    "    os.environ[\"FILE_NAME_SELF_INSTRUCTED_MODELS_STEP_6\"],\n",
    ")\n",
    "# lines=True writes one JSON record per line; force_ascii=False keeps non-ASCII\n",
    "# characters readable instead of escaping them\n",
    "df.to_json(output_file, orient=\"records\", lines=True, force_ascii=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
