{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bac2cb98",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import re\n",
    "import os\n",
    "from dotenv import load_dotenv\n",
    "\n",
    "# Load .env file\n",
    "# env_path = os.path.join(os.path.dirname(os.getcwd()), \"..\", \".env\")\n",
    "load_dotenv()\n",
    "path_model_cards_generated = os.path.join(\n",
    "    os.getenv(\"SELF_INSTRUCT_ROOT_DATA\"),\n",
    "    os.getenv(\"FILE_NAME_MODEL_CARDS_GENERATED_STEP_3\"),\n",
    ")\n",
    "\n",
    "with open(path_model_cards_generated, \"r\") as f:\n",
    "    data = json.load(f)\n",
    "print(f\"Loaded {len(data)} entries from {path_model_cards_generated}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2e3dbd0d",
   "metadata": {},
   "outputs": [],
   "source": [
    "data[:2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "925446a8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Extract JSON object with {\"model_description\": \"...text..\"} from the response\n",
    "count_failed = 0\n",
    "count_no_json = 0\n",
    "failed_entries = []  # Store failed entries\n",
    "no_json_entries = []  # Store entries without JSON\n",
    "cleaned_models = []\n",
    "\n",
    "for entry in data:\n",
    "    # Get the full response text\n",
    "    description = str(entry.get(\"model_description\") or \"\")\n",
    "\n",
    "    if \"</think>\" in description:\n",
    "        # Extract text after </think>\n",
    "        text_after_think = description.split(\"</think>\", 1)[-1].strip()\n",
    "    else:\n",
    "        text_after_think = description\n",
    "\n",
    "    # Try to find JSON object containing \"model_description\" key\n",
    "    # Look for pattern: {\"model_description\": \"...\"}\n",
    "    match_json = re.search(\n",
    "        r'\\{[^}]*\"model_description\"[^}]*:.*?\\}', text_after_think, re.DOTALL\n",
    "    )\n",
    "\n",
    "    if match_json:\n",
    "        json_str = match_json.group(0).strip()\n",
    "        try:\n",
    "            # Try to parse the JSON\n",
    "            parsed_json = json.loads(json_str)\n",
    "            # Extract just the model_description value\n",
    "            if \"model_description\" in parsed_json:\n",
    "                entry[\"model_description\"] = parsed_json[\"model_description\"]\n",
    "                cleaned_models.append(entry)\n",
    "            else:\n",
    "                entry[\"model_description\"] = \"\"\n",
    "                failed_entries.append(\n",
    "                    {\n",
    "                        \"model_id\": entry.get(\"model_id\", \"unknown\"),\n",
    "                        \"reason\": \"JSON parsed but no 'model_description' key\",\n",
    "                        \"raw_text\": text_after_think,\n",
    "                    }\n",
    "                )\n",
    "                count_failed += 1\n",
    "        except json.JSONDecodeError as e:\n",
    "            # If parsing fails, try to extract manually with regex\n",
    "            manual_match = re.search(\n",
    "                r'\"model_description\"\\s*:\\s*\"([^\"]*)\"', text_after_think, re.DOTALL\n",
    "            )\n",
    "            if manual_match:\n",
    "                entry[\"model_description\"] = manual_match.group(1)\n",
    "                cleaned_models.append(entry)\n",
    "            else:\n",
    "                entry[\"model_description\"] = text_after_think\n",
    "                failed_entries.append(\n",
    "                    {\n",
    "                        \"model_id\": entry.get(\"model_id\", \"unknown\"),\n",
    "                        \"reason\": f\"JSON parse error: {str(e)}\",\n",
    "                        \"raw_text\": text_after_think,\n",
    "                    }\n",
    "                )\n",
    "                count_failed += 1\n",
    "    else:\n",
    "        # No JSON found\n",
    "        entry[\"model_description\"] = \"\"\n",
    "        no_json_entries.append(\n",
    "            {\n",
    "                \"model_id\": entry.get(\"model_id\", \"unknown\"),\n",
    "                \"raw_text\": text_after_think,\n",
    "            }\n",
    "        )\n",
    "        count_no_json += 1\n",
    "\n",
    "\n",
    "print(f\"Extraction complete:\")\n",
    "print(f\"  - Failed to parse JSON: {count_failed} entries\")\n",
    "print(f\"  - No JSON found: {count_no_json} entries\")\n",
    "print(f\"  - Successfully extracted: {len(cleaned_models)} entries\")\n",
    "\n",
    "\n",
    "print(\"\\nFirst 3 model descriptions:\")\n",
    "for i, entry in enumerate(cleaned_models[:3], 1):\n",
    "    desc = str(entry.get(\"model_description\", \"\"))\n",
    "    print(f\"\\n{i}. {entry['model_id']}:\")\n",
    "    print(f\"   {desc}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "749e2911",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print failed entries (JSON parsing errors)\n",
    "print(f\"=== FAILED ENTRIES ({len(failed_entries)}) ===\\n\")\n",
    "for i, entry in enumerate(failed_entries[:10], 1):  # Show first 10\n",
    "    print(f\"{i}. Model: {entry['model_id']}\")\n",
    "    print(f\"   Reason: {entry['reason']}\")\n",
    "    print(f\"   Raw text: {entry['raw_text'][:200]}...\")\n",
    "    print()\n",
    "\n",
    "if len(failed_entries) > 10:\n",
    "    print(f\"... and {len(failed_entries) - 10} more failed entries\")\n",
    "\n",
    "print(f\"\\n{'=' * 60}\\n\")\n",
    "\n",
    "# Print entries without JSON\n",
    "print(f\"=== NO JSON FOUND ({len(no_json_entries)}) ===\\n\")\n",
    "for i, entry in enumerate(no_json_entries[:10], 1):  # Show first 10\n",
    "    print(f\"{i}. Model: {entry['model_id']}\")\n",
    "    print(f\"   Raw text: {entry['raw_text'][:200]}...\")\n",
    "    print()\n",
    "\n",
    "if len(no_json_entries) > 10:\n",
    "    print(f\"... and {len(no_json_entries) - 10} more entries without JSON\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9cfb28fe",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove model_description entries that are empty or None\n",
    "print(\"Size before cleaning:\", len(cleaned_models))\n",
    "print(f\"\\nCleaning up entries with empty descriptions...\")\n",
    "cleaned_models = [\n",
    "    entry\n",
    "    for entry in cleaned_models\n",
    "    if entry[\"model_description\"] != \"None\" and entry[\"model_description\"] != \"\"\n",
    "]\n",
    "print(f\"After removing empty descriptions, {len(cleaned_models)} entries remain.\")\n",
    "# study len of model_description of each cleaned model with pandas, print the description with less then 500 characters\n",
    "\n",
    "print(\"\\nAnalyzing description lengths...\")\n",
    "lengths = [len(entry.get(\"model_description\", \"\")) for entry in cleaned_models]\n",
    "import pandas as pd\n",
    "\n",
    "pd.Series(lengths).describe()\n",
    "\n",
    "for entry in cleaned_models:\n",
    "    description = entry.get(\"model_description\", \"\")\n",
    "    if len(description) < 500:\n",
    "        print(description)\n",
    "        print(\"-\" * 80)\n",
    "# remove models with description less than 500   characters\n",
    "cleaned_models = [\n",
    "    entry for entry in cleaned_models if len(entry.get(\"model_description\", \"\")) >= 500\n",
    "]\n",
    "print(f\"After filtering short descriptions, {len(cleaned_models)} entries remain.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9a141a46",
   "metadata": {},
   "outputs": [],
   "source": [
    "# remove items that contain chinese characters\n",
    "def contains_chinese(text):\n",
    "    return any(\"\\u4e00\" <= char <= \"\\u9fff\" for char in text)\n",
    "\n",
    "\n",
    "print(f\"\\nRemoving entries with Chinese characters...\")\n",
    "print(\"Size before removing Chinese entries:\", len(cleaned_models))\n",
    "cleaned_models = [\n",
    "    entry\n",
    "    for entry in cleaned_models\n",
    "    if not contains_chinese(entry.get(\"model_description\", \"\"))\n",
    "]\n",
    "print(\"Size after removing Chinese entries:\", len(cleaned_models))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "471b4751",
   "metadata": {},
   "outputs": [],
   "source": [
    "# print greater than 5000 characters\n",
    "print(\"\\nDescriptions longer than 5000 characters:\")\n",
    "for entry in cleaned_models:\n",
    "    description = entry.get(\"model_description\", \"\")\n",
    "    if len(description) > 5000:\n",
    "        print(f\"Model ID: {entry['model_id']}, Length: {len(description)}\")\n",
    "        print(description)\n",
    "        print(\"-\" * 80)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "166eea34",
   "metadata": {},
   "outputs": [],
   "source": [
    "def remove_duplicates_by_modelcard(models, key=\"modelcard\", priority_key=\"downloads\"):\n",
    "    \"\"\"\n",
    "    Remove duplicate models based on a key (e.g., 'modelcard').\n",
    "    When duplicates are found, keep the one with the highest value for priority_key.\n",
    "\n",
    "    Args:\n",
    "        models: List of model dictionaries\n",
    "        key: Field to check for duplicates (default: 'modelcard')\n",
    "        priority_key: Field to use for selecting which duplicate to keep (default: 'downloads')\n",
    "\n",
    "    Returns:\n",
    "        List of unique models\n",
    "    \"\"\"\n",
    "    unique_items = {}\n",
    "\n",
    "    for item in models:\n",
    "        key_value = item.get(key)\n",
    "\n",
    "        if key_value is None:\n",
    "            # Skip items without the key\n",
    "            continue\n",
    "\n",
    "        if key_value not in unique_items:\n",
    "            # First time seeing this key value\n",
    "            unique_items[key_value] = item\n",
    "        else:\n",
    "            # Duplicate found - keep the one with higher priority_key value\n",
    "            current_priority = item.get(priority_key, 0)\n",
    "            existing_priority = unique_items[key_value].get(priority_key, 0)\n",
    "\n",
    "            if current_priority > existing_priority:\n",
    "                unique_items[key_value] = item\n",
    "\n",
    "    return list(unique_items.values())\n",
    "\n",
    "\n",
    "# Check duplicates\n",
    "model_cards = [item[\"modelcard\"] for item in cleaned_models]\n",
    "print(f\"Unique model cards: {len(set(model_cards))} / {len(model_cards)}\")\n",
    "\n",
    "# Remove duplicates\n",
    "cleaned_models = remove_duplicates_by_modelcard(\n",
    "    cleaned_models, key=\"modelcard\", priority_key=\"downloads\"\n",
    ")\n",
    "print(f\"After removing duplicates: {len(cleaned_models)} models\")\n",
    "cleaned_models = remove_duplicates_by_modelcard(\n",
    "    cleaned_models,\n",
    "    key=\"model_description\",\n",
    "    priority_key=\"downloads\",\n",
    ")\n",
    "print(f\"After removing duplicates: {len(cleaned_models)} models\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cc1ff0ba",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save failed and no-json entries to files\n",
    "\n",
    "# Save cleaned data in JSONL format\n",
    "cleaned_file = os.path.join(\n",
    "    os.getenv(\"SELF_INSTRUCT_ROOT_DATA\"),\n",
    "    os.getenv(\"FILE_NAME_CLEANED_MODEL_CARDS_STEP_4\"),\n",
    ")\n",
    "# put model description in modelcard field\n",
    "for model in cleaned_models:\n",
    "    model[\"modelcard\"] = model.pop(\"model_description\", None)\n",
    "\n",
    "\n",
    "print(\"sample of cleaned models:\")\n",
    "for model in cleaned_models[:3]:\n",
    "    print(model[\"modelcard\"])\n",
    "\n",
    "# Write JSONL with proper formatting (no blank lines, no trailing newline)\n",
    "with open(cleaned_file, \"w\") as f:\n",
    "    for i, item in enumerate(cleaned_models):\n",
    "        json_line = json.dumps(item, ensure_ascii=False)\n",
    "        if i < len(cleaned_models) - 1:\n",
    "            f.write(json_line + \"\\n\")\n",
    "        else:\n",
    "            # Last line: no trailing newline to avoid empty line\n",
    "            f.write(json_line)\n",
    "\n",
    "print(f\"Saved {len(cleaned_models)} cleaned entries to: {cleaned_file}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "18383083",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Verify the saved JSONL file can be read correctly\n",
    "data = []\n",
    "with open(cleaned_file, \"r\") as f:\n",
    "    for line in f:\n",
    "        line = line.strip()\n",
    "        if line:  # Skip empty lines\n",
    "            data.append(json.loads(line))\n",
    "\n",
    "print(f\"Successfully loaded {len(data)} entries from {cleaned_file}\")\n",
    "print(f\"\\nFirst entry keys: {list(data[0].keys())}\")\n",
    "print(f\"Sample modelcard length: {len(data[0].get('modelcard', ''))}\")\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
