{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "jozZrzp3UbtE",
        "outputId": "3fa3db33-ce9a-432d-f6a8-6dad9e12a6df"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Filtered rows saved to /content/model_release_only.tsv\n",
            "Total rows in original: 3196\n",
            "Rows with is_model_release == True: 693\n"
          ]
        }
      ],
      "source": [
        "# --- Filter rows where is_model_release == True (no stratification) ---\n",
        "#\n",
        "# Reads the merged annotation TSV, keeps only the rows flagged as model\n",
        "# releases, and writes them to OUTPUT_PATH.\n",
        "\n",
        "import pandas as pd\n",
        "\n",
        "# Path to your input file\n",
        "INPUT_PATH = \"/content/MAIN_ANNOTATIONS_MERGED.tsv\"\n",
        "\n",
        "# Load TSV\n",
        "df = pd.read_csv(INPUT_PATH, sep=\"\\t\")\n",
        "\n",
        "# Normalize column names (stray whitespace in a header would break lookups)\n",
        "df.columns = [c.strip() for c in df.columns]\n",
        "\n",
        "# Fail fast with a clear message: the filter below cannot work without the flag\n",
        "if \"is_model_release\" not in df.columns:\n",
        "    raise ValueError(\"No 'is_model_release' column found in the TSV!\")\n",
        "\n",
        "# Parse boolean-like values case-insensitively. Going through the stripped,\n",
        "# lower-cased string form accepts True/False, \"TRUE\", \" true \", 1/0 and 1.0/0.0\n",
        "# alike; anything else (including missing values) maps to NaN and is excluded\n",
        "# by the filter below.\n",
        "BOOL_TOKENS = {\n",
        "    \"true\": True, \"false\": False,\n",
        "    \"1\": True, \"0\": False,\n",
        "    \"1.0\": True, \"0.0\": False,\n",
        "}\n",
        "df[\"is_model_release\"] = (\n",
        "    df[\"is_model_release\"].astype(str).str.strip().str.lower().map(BOOL_TOKENS)\n",
        ")\n",
        "\n",
        "# Filter: keep only rows with is_model_release == True\n",
        "df_filtered = df[df[\"is_model_release\"].eq(True)].copy()\n",
        "\n",
        "# Save output\n",
        "OUTPUT_PATH = \"/content/model_release_only.tsv\"\n",
        "df_filtered.to_csv(OUTPUT_PATH, sep=\"\\t\", index=False)\n",
        "\n",
        "print(f\"Filtered rows saved to {OUTPUT_PATH}\")\n",
        "print(f\"Total rows in original: {len(df)}\")\n",
        "print(f\"Rows with is_model_release == True: {len(df_filtered)}\")\n"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# country\n",
        "\n",
        "# --- Filter rows where is_model_release == True and aggregate by country ---\n",
        "\n",
        "import pandas as pd\n",
        "\n",
        "# Path to your input file\n",
        "INPUT_PATH = \"/content/MAIN_ANNOTATIONS_MERGED.tsv\"\n",
        "\n",
        "# Load TSV\n",
        "df = pd.read_csv(INPUT_PATH, sep=\"\\t\")\n",
        "\n",
        "# Normalize column names (stray whitespace in a header would break lookups)\n",
        "df.columns = [c.strip() for c in df.columns]\n",
        "\n",
        "# Fail fast with a clear message: the filter below cannot work without the flag\n",
        "if \"is_model_release\" not in df.columns:\n",
        "    raise ValueError(\"No 'is_model_release' column found in the TSV!\")\n",
        "\n",
        "# Parse boolean-like values case-insensitively via their stripped, lower-cased\n",
        "# string form; unrecognized or missing values map to NaN and are filtered out.\n",
        "BOOL_TOKENS = {\n",
        "    \"true\": True, \"false\": False,\n",
        "    \"1\": True, \"0\": False,\n",
        "    \"1.0\": True, \"0.0\": False,\n",
        "}\n",
        "df[\"is_model_release\"] = (\n",
        "    df[\"is_model_release\"].astype(str).str.strip().str.lower().map(BOOL_TOKENS)\n",
        ")\n",
        "\n",
        "# Filter: keep only rows with is_model_release == True\n",
        "df_filtered = df[df[\"is_model_release\"].eq(True)].copy()\n",
        "\n",
        "# Save output (all rows kept)\n",
        "OUTPUT_PATH = \"/content/model_release_only.tsv\"\n",
        "df_filtered.to_csv(OUTPUT_PATH, sep=\"\\t\", index=False)\n",
        "\n",
        "# =========================\n",
        "# Aggregate by country\n",
        "# =========================\n",
        "if \"country\" not in df_filtered.columns:\n",
        "    raise ValueError(\"No 'country' column found in the TSV!\")\n",
        "\n",
        "# Rows without a country are grouped under \"Unknown\" instead of being\n",
        "# silently dropped by groupby.\n",
        "releases = df_filtered.assign(country=df_filtered[\"country\"].fillna(\"Unknown\"))\n",
        "\n",
        "# The file appears to hold one row per (model, category) annotation, so a plain\n",
        "# row count inflates every release by its number of annotated categories.\n",
        "# Collapse to one row per (country, provider, model) before counting; when\n",
        "# neither identifying column is present this falls back to the row count.\n",
        "model_key_cols = [c for c in (\"provider\", \"model\") if c in releases.columns]\n",
        "if model_key_cols:\n",
        "    releases = releases.drop_duplicates(subset=[\"country\"] + model_key_cols)\n",
        "\n",
        "country_counts = (\n",
        "    releases.groupby(\"country\")\n",
        "    .size()\n",
        "    .reset_index(name=\"release_count\")\n",
        "    .sort_values(by=\"release_count\", ascending=False)\n",
        ")\n",
        "\n",
        "# Save aggregated output\n",
        "COUNTRY_OUTPUT_PATH = \"/content/country_release_only.tsv\"\n",
        "country_counts.to_csv(COUNTRY_OUTPUT_PATH, sep=\"\\t\", index=False)\n",
        "\n",
        "print(f\"Filtered rows saved to {OUTPUT_PATH}\")\n",
        "print(f\"Aggregated by country saved to {COUNTRY_OUTPUT_PATH}\")\n",
        "print(\"\\nTop 10 countries by release count:\")\n",
        "print(country_counts.head(10).to_string(index=False))\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "YT0uTsJ4br9k",
        "outputId": "4e4ba71a-e2f9-4ae9-a398-182ca9589f41"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Filtered rows saved to /content/model_release_only.tsv\n",
            "Aggregated by country saved to /content/country_release_only.tsv\n",
            "\n",
            "Top 10 countries by release count:\n",
            "             country  release_count\n",
            "       United States            315\n",
            "               China            224\n",
            "              France             84\n",
            "              Canada             14\n",
            "      United Kingdom             14\n",
            "         South Korea             14\n",
            "             Armenia              7\n",
            "             Germany              7\n",
            "              Israel              7\n",
            "United Arab Emirates              7\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# --- Provider, Country, and Sector x Category averaged scores (fill missing categories with 0 per model, rounded to 2 decimals) ---\n",
        "\n",
        "import pandas as pd\n",
        "import numpy as np\n",
        "\n",
        "# =========================\n",
        "# 1) Path to your filtered input (is_model_release == True)\n",
        "# =========================\n",
        "INPUT_PATH = \"/content/model_release_only.tsv\"\n",
        "\n",
        "# =========================\n",
        "# 2) Load and normalize columns\n",
        "# =========================\n",
        "df = pd.read_csv(INPUT_PATH, sep=\"\\t\")\n",
        "\n",
        "# Normalize column names (strip)\n",
        "df.columns = [c.strip() for c in df.columns]\n",
        "\n",
        "# Map known header variants to standard names. An alias is skipped when its\n",
        "# standard name already exists (or was already claimed by another alias), so\n",
        "# the rename can never produce two columns with the same label.\n",
        "HEADER_ALIASES = {\n",
        "    \"Provider\": \"provider\",\n",
        "    \"Model\": \"model\",\n",
        "    \"name\": \"model\",\n",
        "    \"Category\": \"category\",\n",
        "    \"Score\": \"score\",\n",
        "    \"Country\": \"country\",\n",
        "    \"Sector\": \"sector\",\n",
        "}\n",
        "col_map = {}\n",
        "for alias, standard in HEADER_ALIASES.items():\n",
        "    taken = standard in df.columns or standard in col_map.values()\n",
        "    if alias in df.columns and not taken:\n",
        "        col_map[alias] = standard\n",
        "\n",
        "df = df.rename(columns=col_map)\n",
        "\n",
        "required = {\"provider\", \"model\", \"category\", \"score\"}\n",
        "missing = required - set(df.columns)\n",
        "if missing:\n",
        "    raise ValueError(f\"Missing required columns in input: {sorted(missing)}\")\n",
        "\n",
        "# Coerce types\n",
        "df[\"category\"] = pd.to_numeric(df[\"category\"], errors=\"coerce\").astype(\"Int64\")\n",
        "df[\"score\"] = pd.to_numeric(df[\"score\"], errors=\"coerce\")\n",
        "\n",
        "# Keep only valid categories 1..7. The explicit copy makes the column\n",
        "# assignments below operate on an independent frame instead of a slice.\n",
        "df = df[df[\"category\"].between(1, 7, inclusive=\"both\")].copy()\n",
        "\n",
        "# Clean strings\n",
        "df[\"provider\"] = df[\"provider\"].astype(str).str.strip()\n",
        "\n",
        "# Country and sector are grouping keys further down, and groupby drops rows\n",
        "# whose key is NaN. Label missing or blank values \"Unknown\" (the same label\n",
        "# used when the column is absent) so those models stay in every average and\n",
        "# never show up as a literal \"nan\" group.\n",
        "for label_col in (\"country\", \"sector\"):\n",
        "    if label_col in df.columns:\n",
        "        labels = df[label_col].astype(object)\n",
        "        labels = labels.where(labels.notna(), \"\").astype(str).str.strip()\n",
        "        df[label_col] = labels.replace({\"\": \"Unknown\"})\n",
        "    else:\n",
        "        df[label_col] = \"Unknown\"\n",
        "\n",
        "# =========================\n",
        "# 3) Build per-model 7-category vectors (fill missing with 0)\n",
        "# =========================\n",
        "cats = list(range(1, 8))\n",
        "model_index = [\"country\", \"sector\", \"provider\", \"model\"]\n",
        "\n",
        "# Collapse repeated annotations to one mean score per (model, category)\n",
        "df_model_cat = df.groupby(model_index + [\"category\"], as_index=False)[\"score\"].mean()\n",
        "\n",
        "# One row per model, one column per category; a category the model lacks counts as 0\n",
        "mat_model = pd.pivot_table(\n",
        "    df_model_cat,\n",
        "    index=model_index,\n",
        "    columns=\"category\",\n",
        "    values=\"score\",\n",
        "    aggfunc=\"mean\",\n",
        ")\n",
        "mat_model = mat_model.reindex(columns=cats).fillna(0.0)\n",
        "\n",
        "\n",
        "def _mean_over_models(matrix, level):\n",
        "    \"\"\"Average the per-model rows of `matrix` within each value of index `level`\n",
        "    and append the mean across the category columns as `overall_avg`.\"\"\"\n",
        "    averaged = matrix.groupby(level=level).mean()\n",
        "    averaged[\"overall_avg\"] = averaged[cats].mean(axis=1)\n",
        "    return averaged\n",
        "\n",
        "\n",
        "# =========================\n",
        "# 4a-c) Average across models per PROVIDER, COUNTRY and SECTOR\n",
        "#       (computed from the unrounded matrix, then rounded to 2 decimals)\n",
        "# =========================\n",
        "mat_provider = _mean_over_models(mat_model, \"provider\").round(2)\n",
        "mat_country = _mean_over_models(mat_model, \"country\").round(2)\n",
        "mat_sector = _mean_over_models(mat_model, \"sector\").round(2)\n",
        "\n",
        "# Round the per-model matrix last so the averages above use full precision\n",
        "mat_model = mat_model.round(2)\n",
        "\n",
        "# =========================\n",
        "# 5) Save outputs\n",
        "# =========================\n",
        "# The per-model matrix gets its own file name: the year/quarter cell further\n",
        "# down writes a differently-indexed matrix to\n",
        "# /content/model_category_scores_matrix.tsv, which would overwrite this one.\n",
        "MODEL_MATRIX_OUT = \"/content/model_category_scores_matrix_by_provider.tsv\"\n",
        "PROVIDER_AVG_OUT = \"/content/provider_category_avg_scores.tsv\"\n",
        "COUNTRY_AVG_OUT = \"/content/country_category_avg_scores.tsv\"\n",
        "SECTOR_AVG_OUT = \"/content/sector_category_avg_scores.tsv\"\n",
        "\n",
        "\n",
        "def _as_flat_table(matrix):\n",
        "    \"\"\"Move the index levels into ordinary columns and clear the columns-axis name.\"\"\"\n",
        "    return matrix.reset_index().rename_axis(None, axis=1)\n",
        "\n",
        "\n",
        "mat_model_reset = _as_flat_table(mat_model)        # per-model matrix\n",
        "mat_provider_reset = _as_flat_table(mat_provider)  # provider averages\n",
        "mat_country_reset = _as_flat_table(mat_country)    # country averages\n",
        "mat_sector_reset = _as_flat_table(mat_sector)      # sector averages\n",
        "\n",
        "for table, out_path in (\n",
        "    (mat_model_reset, MODEL_MATRIX_OUT),\n",
        "    (mat_provider_reset, PROVIDER_AVG_OUT),\n",
        "    (mat_country_reset, COUNTRY_AVG_OUT),\n",
        "    (mat_sector_reset, SECTOR_AVG_OUT),\n",
        "):\n",
        "    table.to_csv(out_path, sep=\"\\t\", index=False)\n",
        "\n",
        "print(f\"Saved per-model matrix to: {MODEL_MATRIX_OUT}\")\n",
        "print(f\"Saved provider averages to: {PROVIDER_AVG_OUT}\")\n",
        "print(f\"Saved country averages to:  {COUNTRY_AVG_OUT}\")\n",
        "print(f\"Saved sector averages to:   {SECTOR_AVG_OUT}\")\n",
        "\n",
        "print(\"\\nPreview (first 8 providers):\")\n",
        "print(mat_provider_reset.head(8).to_string(index=False))\n",
        "print(\"\\nPreview (top 8 countries):\")\n",
        "print(mat_country_reset.head(8).to_string(index=False))\n",
        "print(\"\\nPreview (all sectors):\")\n",
        "print(mat_sector_reset.to_string(index=False))\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "xmmhy49EgI7r",
        "outputId": "09b4985a-07e0-47bc-ebf0-d3c04141ee3a"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Saved per-model matrix to: /content/model_category_scores_matrix.tsv\n",
            "Saved provider averages to: /content/provider_category_avg_scores.tsv\n",
            "Saved country averages to:  /content/country_category_avg_scores.tsv\n",
            "Saved sector averages to:   /content/sector_category_avg_scores.tsv\n",
            "\n",
            "Preview (first 8 providers):\n",
            " provider   1   2    3   4    5    6    7  overall_avg\n",
            "    01.ai 1.0 1.0 0.00 0.0 1.00 0.50 0.00         0.50\n",
            "      ai2 0.6 1.0 0.00 1.8 0.20 1.00 0.40         0.71\n",
            "     ai21 0.0 1.0 0.00 0.0 0.00 1.00 0.00         0.29\n",
            "aiForever 0.0 0.0 0.00 1.0 0.00 1.00 0.00         0.29\n",
            "  alibaba 0.0 0.5 1.75 0.0 0.12 0.00 0.25         0.38\n",
            "   amazon 1.0 1.0 1.00 0.0 1.00 0.00 0.00         0.57\n",
            "      ant 0.0 0.0 0.00 0.0 0.00 0.00 0.00         0.00\n",
            "anthropic 1.0 1.0 1.00 0.0 0.17 0.33 0.00         0.50\n",
            "\n",
            "Preview (top 8 countries):\n",
            "             country    1    2    3    4    5    6    7  overall_avg\n",
            "             Armenia 0.00 0.00 0.00 1.00 0.00 1.00 0.00         0.29\n",
            "              Canada 2.50 2.00 2.00 0.00 0.50 0.00 0.50         1.07\n",
            "               China 0.28 0.69 0.59 0.12 0.25 0.38 0.06         0.34\n",
            "              France 0.50 0.42 1.67 0.33 0.08 0.25 0.08         0.48\n",
            "             Germany 0.00 1.00 0.00 3.00 0.00 1.00 0.00         0.71\n",
            "              Israel 0.00 1.00 0.00 0.00 0.00 1.00 0.00         0.29\n",
            "         South Korea 0.00 1.00 0.00 0.50 0.00 0.50 0.00         0.29\n",
            "United Arab Emirates 0.00 2.00 3.00 1.00 0.00 3.00 0.00         1.29\n",
            "\n",
            "Preview (all sectors):\n",
            "    sector    1    2    3    4    5    6    7  overall_avg\n",
            "  Academia 1.50 1.00 2.50 1.50 0.50 0.50 0.50         1.14\n",
            "Government 0.00 1.00 1.00 0.33 0.00 1.00 0.00         0.48\n",
            "  Industry 0.80 0.99 0.83 0.45 0.40 0.43 0.15         0.58\n",
            " Nonprofit 0.86 0.71 0.43 1.57 0.57 1.00 0.29         0.78\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# --- Year and Release-Quarter x Category averaged scores (model-balanced; fill missing cats with 0, rounded to 2 decimals) ---\n",
        "\n",
        "import pandas as pd\n",
        "import numpy as np\n",
        "import re\n",
        "from datetime import datetime\n",
        "\n",
        "INPUT_PATH = \"/content/model_release_only.tsv\"\n",
        "\n",
        "# =========================\n",
        "# 1) Load and normalize\n",
        "# =========================\n",
        "df = pd.read_csv(INPUT_PATH, sep=\"\\t\")\n",
        "df.columns = [c.strip() for c in df.columns]\n",
        "\n",
        "# Normalize column names. An alias is skipped when its standard name already\n",
        "# exists (or was already claimed by another alias), so the rename can never\n",
        "# produce two columns with the same label.\n",
        "HEADER_ALIASES = {\n",
        "    \"Provider\": \"provider\",\n",
        "    \"Model\": \"model\",\n",
        "    \"name\": \"model\",\n",
        "    \"Category\": \"category\",\n",
        "    \"Score\": \"score\",\n",
        "    \"Year\": \"year\",\n",
        "    \"Metadata\": \"metadata\",\n",
        "}\n",
        "col_map = {}\n",
        "for alias, standard in HEADER_ALIASES.items():\n",
        "    taken = standard in df.columns or standard in col_map.values()\n",
        "    if alias in df.columns and not taken:\n",
        "        col_map[alias] = standard\n",
        "df = df.rename(columns=col_map)\n",
        "\n",
        "required = {\"provider\", \"model\", \"category\", \"score\"}\n",
        "missing = required - set(df.columns)\n",
        "if missing:\n",
        "    raise ValueError(f\"Missing required columns: {sorted(missing)}\")\n",
        "\n",
        "df[\"category\"] = pd.to_numeric(df[\"category\"], errors=\"coerce\").astype(\"Int64\")\n",
        "df[\"score\"] = pd.to_numeric(df[\"score\"], errors=\"coerce\")\n",
        "\n",
        "# Keep only valid categories 1..7. The explicit copy makes the column\n",
        "# assignments that follow operate on an independent frame instead of a slice.\n",
        "df = df[df[\"category\"].between(1, 7, inclusive=\"both\")].copy()\n",
        "\n",
        "df[\"provider\"] = df[\"provider\"].astype(str).str.strip()\n",
        "\n",
        "# =========================\n",
        "# 2) Parse release_date from metadata\n",
        "# =========================\n",
        "def extract_release_date(val):\n",
        "    \"\"\"Return the release date found in a metadata cell as a ``datetime.date``.\n",
        "\n",
        "    The date that directly follows a ``release_date`` key is preferred. Only\n",
        "    when no such key is present does the function fall back to the first\n",
        "    ``YYYY-MM-DD`` token anywhere in the text, so a date embedded in a URL\n",
        "    listed earlier in the cell cannot shadow the actual release date.\n",
        "\n",
        "    Returns ``None`` for missing input, when no 20xx date is found, or when\n",
        "    the matched token is not a real calendar date (e.g. ``2023-02-30``).\n",
        "    \"\"\"\n",
        "    if pd.isna(val):\n",
        "        return None\n",
        "    text = str(val)\n",
        "    date_pat = r\"20\\d{2}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[12]\\d|3[01])\"\n",
        "    # Keyed form first: release_date, a few separator characters, then the date\n",
        "    m = re.search(r\"release_date\\W{0,6}(\" + date_pat + r\")\\b\", text)\n",
        "    if m is None:\n",
        "        m = re.search(r\"\\b(\" + date_pat + r\")\\b\", text)\n",
        "    if m is None:\n",
        "        return None\n",
        "    try:\n",
        "        return datetime.strptime(m.group(1), \"%Y-%m-%d\").date()\n",
        "    except ValueError:\n",
        "        # The pattern admits impossible dates such as 2023-02-30\n",
        "        return None\n",
        "\n",
        "# Build the parsed-date column from df's own column (or a scalar), never from\n",
        "# a fresh 0..n-1 Series: df keeps its filtered, non-contiguous index, and a\n",
        "# default-indexed Series only lines up with it by accident.\n",
        "if \"metadata\" in df.columns:\n",
        "    df[\"release_date_parsed\"] = df[\"metadata\"].apply(extract_release_date)\n",
        "else:\n",
        "    df[\"release_date_parsed\"] = None\n",
        "\n",
        "# Prefer explicit year column, else fallback to release_date\n",
        "if \"year\" not in df.columns:\n",
        "    df[\"year\"] = pd.Series(pd.NA, index=df.index, dtype=\"Int64\")\n",
        "\n",
        "# Fill only the rows that lack a year but do have a parsed date\n",
        "needs_year = df[\"year\"].isna() & df[\"release_date_parsed\"].notna()\n",
        "if needs_year.any():\n",
        "    df.loc[needs_year, \"year\"] = (\n",
        "        df.loc[needs_year, \"release_date_parsed\"].map(lambda d: d.year).astype(\"Int64\")\n",
        "    )\n",
        "\n",
        "def to_quarter_label(d):\n",
        "    \"\"\"Return `<year>-Q<n>` for a date-like `d`, or `Unknown` when `d` is missing.\"\"\"\n",
        "    if pd.isna(d):\n",
        "        return \"Unknown\"\n",
        "    # Months 1-3 -> index 0, 4-6 -> 1, 7-9 -> 2, 10-12 -> 3\n",
        "    q = (\"Q1\", \"Q2\", \"Q3\", \"Q4\")[(d.month - 1) // 3]\n",
        "    return f\"{d.year}-{q}\"\n",
        "\n",
        "# to_quarter_label already maps missing dates to \"Unknown\"\n",
        "df[\"release_quarter\"] = df[\"release_date_parsed\"].map(to_quarter_label)\n",
        "\n",
        "# =========================\n",
        "# 3) Build per-model vectors\n",
        "# =========================\n",
        "cats = list(range(1, 8))\n",
        "model_index = [\"year\", \"release_quarter\", \"provider\", \"model\"]\n",
        "\n",
        "# Average duplicates at (year, release_quarter, provider, model, category)\n",
        "df_model_cat = df.groupby(model_index + [\"category\"], as_index=False)[\"score\"].mean()\n",
        "\n",
        "# One row per model, one column per category; a category the model lacks counts as 0\n",
        "mat_model = pd.pivot_table(\n",
        "    df_model_cat,\n",
        "    index=model_index,\n",
        "    columns=\"category\",\n",
        "    values=\"score\",\n",
        "    aggfunc=\"mean\",\n",
        ")\n",
        "mat_model = mat_model.reindex(columns=cats).fillna(0.0)\n",
        "\n",
        "# =========================\n",
        "# 4a/4b) Aggregate by YEAR and by RELEASE QUARTER\n",
        "#        (model-balanced means from the unrounded matrix, then rounded)\n",
        "# =========================\n",
        "period_means = {}\n",
        "for level in (\"year\", \"release_quarter\"):\n",
        "    averaged = mat_model.groupby(level=level).mean()\n",
        "    averaged[\"overall_avg\"] = averaged[cats].mean(axis=1)\n",
        "    period_means[level] = averaged.round(2)\n",
        "\n",
        "mat_year = period_means[\"year\"]\n",
        "mat_quarter = period_means[\"release_quarter\"]\n",
        "\n",
        "# Round the per-model matrix last so the averages above use full precision\n",
        "mat_model = mat_model.round(2)\n",
        "\n",
        "# =========================\n",
        "# 5) Save\n",
        "# =========================\n",
        "# NOTE(review): the provider/country/sector cell also defines MODEL_MATRIX_OUT;\n",
        "# confirm the two cells are not meant to share one output file.\n",
        "MODEL_MATRIX_OUT   = \"/content/model_category_scores_matrix.tsv\"\n",
        "YEAR_AVG_OUT       = \"/content/year_category_avg_scores.tsv\"\n",
        "QUARTER_AVG_OUT    = \"/content/quarter_category_avg_scores.tsv\"\n",
        "\n",
        "for matrix, out_path in (\n",
        "    (mat_model, MODEL_MATRIX_OUT),\n",
        "    (mat_year, YEAR_AVG_OUT),\n",
        "    (mat_quarter, QUARTER_AVG_OUT),\n",
        "):\n",
        "    flat = matrix.reset_index().rename_axis(None, axis=1)\n",
        "    flat.to_csv(out_path, sep=\"\\t\", index=False)\n",
        "\n",
        "print(f\"Saved per-model matrix to: {MODEL_MATRIX_OUT}\")\n",
        "print(f\"Saved year averages to:    {YEAR_AVG_OUT}\")\n",
        "print(f\"Saved quarter averages to: {QUARTER_AVG_OUT}\")\n",
        "\n",
        "print(\"\\nPreview (years):\")\n",
        "print(mat_year.reset_index().to_string(index=False))\n",
        "print(\"\\nPreview (quarters):\")\n",
        "print(mat_quarter.reset_index().to_string(index=False))\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "5eoqhHvzmwgG",
        "outputId": "4dc17ed0-6c13-490f-f8a9-27016d8e0a91"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Saved per-model matrix to: /content/model_category_scores_matrix.tsv\n",
            "Saved year averages to:    /content/year_category_avg_scores.tsv\n",
            "Saved quarter averages to: /content/quarter_category_avg_scores.tsv\n",
            "\n",
            "Preview (years):\n",
            " year    1    2    3    4    5    6    7  overall_avg\n",
            " 2019 0.00 0.00 1.00 0.00 0.00 0.00 0.00         0.14\n",
            " 2020 3.00 0.00 1.00 2.00 1.00 1.00 1.00         1.29\n",
            " 2021 0.00 0.00 0.00 1.00 0.00 1.00 0.00         0.29\n",
            " 2022 1.17 1.33 1.00 2.00 0.17 0.67 0.33         0.95\n",
            " 2023 1.22 1.11 1.17 0.56 0.50 0.61 0.39         0.79\n",
            " 2024 0.79 1.21 0.72 0.51 0.51 0.41 0.08         0.60\n",
            " 2025 0.48 0.64 0.79 0.27 0.27 0.45 0.09         0.43\n",
            "\n",
            "Preview (quarters):\n",
            "release_quarter    1    2    3    4    5    6    7  overall_avg\n",
            "        2019-Q1 0.00 0.00 1.00 0.00 0.00 0.00 0.00         0.14\n",
            "        2020-Q2 3.00 0.00 1.00 2.00 1.00 1.00 1.00         1.29\n",
            "        2021-Q1 0.00 0.00 0.00 1.00 0.00 1.00 0.00         0.29\n",
            "        2022-Q2 2.00 2.00 0.50 1.50 0.00 0.50 0.50         1.00\n",
            "        2022-Q3 1.50 1.50 1.50 3.00 0.50 1.00 0.50         1.36\n",
            "        2022-Q4 0.00 0.50 1.00 1.50 0.00 0.50 0.00         0.50\n",
            "        2023-Q1 1.33 1.67 0.33 1.00 0.67 0.67 0.67         0.90\n",
            "        2023-Q2 1.50 0.75 1.50 0.75 1.50 0.75 0.75         1.07\n",
            "        2023-Q3 1.14 1.14 0.86 0.43 0.00 0.43 0.00         0.57\n",
            "        2023-Q4 1.00 1.00 2.00 0.25 0.25 0.75 0.50         0.82\n",
            "        2024-Q1 1.00 1.33 0.92 0.42 0.58 0.33 0.08         0.67\n",
            "        2024-Q2 0.78 1.33 0.56 0.22 0.44 0.33 0.00         0.52\n",
            "        2024-Q3 0.50 0.88 0.62 0.62 0.50 0.50 0.25         0.55\n",
            "        2024-Q4 0.80 1.20 0.70 0.80 0.50 0.50 0.00         0.64\n",
            "        2025-Q1 0.33 0.33 1.00 0.22 0.33 0.44 0.33         0.43\n",
            "        2025-Q2 0.33 0.47 0.53 0.33 0.00 0.33 0.00         0.29\n",
            "        2025-Q3 0.89 1.22 1.00 0.22 0.67 0.67 0.00         0.67\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
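        "# Sanity check: count models per quarter to verify the data. This cell reuses `pd` and `df` from the year/quarter cell above, so run that cell first.\n",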
        "\n",
        "#=========================\n",
        "# Category 4 → TSV with quarter headers that include total model counts (all categories)\n",
        "# =========================\n",
        "\n",
        "import numpy as np\n",
        "import re\n",
        "from datetime import datetime\n",
        "\n",
        "# -- Ensure release_quarter exists on the full df (used for counts across all categories)\n",
        "def to_quarter_label(d):\n",
        "    \"\"\"Return `<year>-Q<n>` for a date-like `d`, or `Unknown` when `d` is missing.\"\"\"\n",
        "    if pd.isna(d):\n",
        "        return \"Unknown\"\n",
        "    month = d.month\n",
        "    if month <= 3:\n",
        "        q = \"Q1\"\n",
        "    elif month <= 6:\n",
        "        q = \"Q2\"\n",
        "    elif month <= 9:\n",
        "        q = \"Q3\"\n",
        "    else:\n",
        "        q = \"Q4\"\n",
        "    return f\"{d.year}-{q}\"\n",
        "\n",
        "if \"release_quarter\" not in df.columns:\n",
        "    if \"release_date_parsed\" not in df.columns:\n",
        "        # Fallback parser, used only when df arrives without parsed dates\n",
        "        def extract_release_date(val):\n",
        "            \"\"\"Return the release date in a metadata cell as a ``datetime.date``.\n",
        "\n",
        "            The date directly following a ``release_date`` key is preferred;\n",
        "            otherwise the first ``YYYY-MM-DD`` token in the text is used, so a\n",
        "            date inside a URL listed earlier cannot shadow the release date.\n",
        "            Returns ``None`` for missing input, no match, or an impossible date.\n",
        "            \"\"\"\n",
        "            if pd.isna(val):\n",
        "                return None\n",
        "            text = str(val)\n",
        "            date_pat = r\"20\\d{2}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[12]\\d|3[01])\"\n",
        "            m = re.search(r\"release_date\\W{0,6}(\" + date_pat + r\")\\b\", text)\n",
        "            if m is None:\n",
        "                m = re.search(r\"\\b(\" + date_pat + r\")\\b\", text)\n",
        "            if m is None:\n",
        "                return None\n",
        "            try:\n",
        "                return datetime.strptime(m.group(1), \"%Y-%m-%d\").date()\n",
        "            except ValueError:\n",
        "                # The pattern admits impossible dates such as 2023-02-30\n",
        "                return None\n",
        "        # Use df's own column (or a scalar) so the result shares df's index;\n",
        "        # a fresh 0..n-1 Series would only line up with it by accident.\n",
        "        if \"metadata\" in df.columns:\n",
        "            df[\"release_date_parsed\"] = df[\"metadata\"].apply(extract_release_date)\n",
        "        else:\n",
        "            df[\"release_date_parsed\"] = None\n",
        "    df[\"release_quarter\"] = df[\"release_date_parsed\"].apply(lambda d: to_quarter_label(d) if pd.notna(d) else \"Unknown\")\n",
        "\n",
        "# -- Build distinct model counts per quarter across ALL categories\n",
        "# A model is identified by its stripped provider and model names joined with \"||\"\n",
        "provider_name = df[\"provider\"].astype(str).str.strip()\n",
        "model_name = df[\"model\"].astype(str).str.strip()\n",
        "df[\"_model_key\"] = provider_name.str.cat(model_name, sep=\"||\")\n",
        "\n",
        "# Count unique model keys within each quarter label, as a plain dict\n",
        "rows_with_quarter = df[df[\"release_quarter\"].notna()]\n",
        "quarter_model_counts = (\n",
        "    rows_with_quarter.groupby(\"release_quarter\")[\"_model_key\"].nunique().to_dict()\n",
        ")\n",
        "\n",
        "# -- Filter Category 4 rows and sort chronologically; sorting by date keeps\n",
        "#    each quarter's rows contiguous, which the header insertion below relies on\n",
        "cat4_rows = df[df[\"category\"] == 4].copy()\n",
        "cat4_rows = cat4_rows.sort_values(by=\"release_date_parsed\", ascending=True)\n",
        "\n",
        "# -- Insert labeled header rows on quarter change (and before the first block)\n",
        "output_rows = []\n",
        "prev_quarter = None\n",
        "\n",
        "for _, row in cat4_rows.iterrows():\n",
        "    current_quarter = row.get(\"release_quarter\", \"Unknown\")\n",
        "    # Insert header before first quarter and when quarter changes\n",
        "    if prev_quarter is None or current_quarter != prev_quarter:\n",
        "        header = dict.fromkeys(cat4_rows.columns, \"\")\n",
        "        # Label year and quarter; year from the quarter label if possible\n",
        "        year_val = row.get(\"year\", \"\")\n",
        "        # Test for a missing value first: truth-testing pd.NA (possible in a\n",
        "        # nullable Int64 year column) raises TypeError.\n",
        "        year_missing = (\n",
        "            pd.isna(year_val)\n",
        "            or (isinstance(year_val, str) and not year_val)\n",
        "            or year_val == 0\n",
        "        )\n",
        "        if year_missing:\n",
        "            # Try to parse year from \"YYYY-Qn\"\n",
        "            try:\n",
        "                year_val = int(str(current_quarter).split(\"-\")[0])\n",
        "            except ValueError:\n",
        "                # e.g. the \"Unknown\" quarter label carries no year\n",
        "                year_val = \"\"\n",
        "        header[\"year\"] = year_val\n",
        "        total_models = quarter_model_counts.get(current_quarter, 0)\n",
        "        header[\"release_quarter\"] = f\"=== {current_quarter} — Total models this quarter (all categories): {total_models} ===\"\n",
        "        output_rows.append(header)\n",
        "    output_rows.append(row.to_dict())\n",
        "    prev_quarter = current_quarter\n",
        "\n",
        "cat4_with_headers = pd.DataFrame(output_rows, columns=cat4_rows.columns)\n",
        "\n",
        "# -- Save\n",
        "OUTPUT_PATH = \"/content/4category.tsv\"\n",
        "cat4_with_headers.to_csv(OUTPUT_PATH, sep=\"\\t\", index=False)\n",
        "\n",
        "print(f\"Saved {len(cat4_with_headers)} rows with quarter headers to {OUTPUT_PATH}\")\n",
        "print(cat4_with_headers.head(20).to_string(index=False))\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "5dhrhAdFu_PS",
        "outputId": "be8a8528-dabb-4b0f-ab4c-d16ac03a36bf"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Saved 44 rows with quarter headers to /content/4category.tsv\n",
            "   provider                model size variant version     sector openness                       region              country      source_id is_first_party category  year                                                                                                                                                          metadata score is_model_release release_date_parsed                                                 release_quarter                        _model_key\n",
            "                                                                                                                                                                    2020                                                                                                                                                                                                              === 2020-Q2 — Total models this quarter (all categories): 1 ===                                  \n",
            "     openAi                gpt-3  NaN     NaN     NaN   Industry   closed                North America        United States first-party-60           True        4  2020                                                                                       {'url': ['https://arxiv.org/pdf/2005.14165'], 'release_date': '2020-05-29'}     2             True          2020-05-29                                                         2020-Q2                     openAi||gpt-3\n",
            "                                                                                                                                                                    2021                                                                                                                                                                                                              === 2021-Q1 — Total models this quarter (all categories): 1 ===                                  \n",
            "     openAi               dall-e  NaN     NaN     NaN   Industry   closed                North America        United States first-party-67           True        4  2021                                                   {'url': ['https://openai.com/index/dall-e/', 'https://arxiv.org/pdf/2102.12092'], 'release_date': '2021-01-05'}     1             True          2021-01-05                                                         2021-Q1                    openAi||dall-e\n",
            "                                                                                                                                                                    2022                                                                                                                                                                                                              === 2022-Q2 — Total models this quarter (all categories): 2 ===                                  \n",
            "       meta                  opt  NaN     NaN     NaN   Industry     open                North America        United States first-party-38           True        4  2022                                                                                       {'url': ['https://arxiv.org/pdf/2205.01068'], 'release_date': '2022-05-02'}     3             True          2022-05-02                                                         2022-Q2                         meta||opt\n",
            "                                                                                                                                                                    2022                                                                                                                                                                                                              === 2022-Q3 — Total models this quarter (all categories): 2 ===                                  \n",
            " bigScience                bloom  NaN     NaN     NaN   Academia     open                       Europe               France first-party-19           True        4  2022                                                                                       {'url': ['https://arxiv.org/abs/2211.05100'], 'release_date': '2022-07-12'}     3             True          2022-07-12                                                         2022-Q3                 bigScience||bloom\n",
            "stabilityAi stable-diffusion-1.0  NaN     NaN     NaN   Industry     open                       Europe       United Kingdom first-party-69           True        4  2022 {'url': ['https://github.com/CompVis/stable-diffusion/blob/main/Stable_Diffusion_v1_Model_Card.md', 'https://stability.ai/safety'], 'release_date': '2022-08-22'}     3             True          2022-08-22                                                         2022-Q3 stabilityAi||stable-diffusion-1.0\n",
            "                                                                                                                                                                    2022                                                                                                                                                                                                              === 2022-Q4 — Total models this quarter (all categories): 2 ===                                  \n",
            "stabilityAi stable-diffusion-2.0  NaN     NaN     NaN   Industry     open                       Europe       United Kingdom first-party-70           True        4  2022                                   {'url': ['https://huggingface.co/stabilityai/stable-diffusion-2', 'https://stability.ai/safety'], 'release_date': '2022-11-01'}     3             True          2022-11-01                                                         2022-Q4 stabilityAi||stable-diffusion-2.0\n",
            "                                                                                                                                                                    2023                                                                                                                                                                                                              === 2023-Q1 — Total models this quarter (all categories): 2 ===                                  \n",
            "       meta              llama-1  NaN     NaN     NaN   Industry   closed                North America        United States first-party-39           True        4  2023                                                                                       {'url': ['https://arxiv.org/pdf/2302.13971'], 'release_date': '2023-02-24'}     3             True          2023-02-24                                                         2023-Q1                     meta||llama-1\n",
            "                                                                                                                                                                    2023                                                                                                                                                                                                              === 2023-Q2 — Total models this quarter (all categories): 2 ===                                  \n",
            " eleutherAi               pythia  NaN     NaN     NaN  Nonprofit     open                North America        United States first-party-27           True        4  2023                                                                                       {'url': ['https://arxiv.org/pdf/2304.01373'], 'release_date': '2023-04-03'}     1             True          2023-04-03                                                         2023-Q2                eleutherAi||pythia\n",
            "     google               palm-2  NaN     NaN     NaN   Industry   closed                North America        United States first-party-28           True        4  2023                                                                                       {'url': ['https://arxiv.org/pdf/2305.10403'], 'release_date': '2023-05-10'}     1             True          2023-05-10                                                         2023-Q2                    google||palm-2\n",
            "                                                                                                                                                                    2023                                                                                                                                                                                                              === 2023-Q3 — Total models this quarter (all categories): 4 ===                                  \n",
            "       meta              llama-2  NaN     NaN     NaN   Industry     open                North America        United States first-party-40           True        4  2023                                                                                       {'url': ['https://arxiv.org/pdf/2307.09288'], 'release_date': '2023-07-18'}     3             True          2023-07-18                                                         2023-Q3                     meta||llama-2\n",
            "                                                                                                                                                                    2023                                                                                                                                                                                                              === 2023-Q4 — Total models this quarter (all categories): 3 ===                                  \n",
            "        tii               falcon  NaN     NaN     NaN Government     open Middle East and North Africa United Arab Emirates first-party-73           True        4  2023                                                                                       {'url': ['https://arxiv.org/pdf/2311.16867'], 'release_date': '2023-11-28'}     1             True          2023-11-28                                                         2023-Q4                       tii||falcon\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# # --- Provider x Category averaged scores (fill missing categories with 0 per model, rounded to 2 decimals) ---\n",
        "\n",
        "# import pandas as pd\n",
        "# import numpy as np\n",
        "\n",
        "# # =========================\n",
        "# # 1) Path to your filtered input (is_model_release == True)\n",
        "# # =========================\n",
        "# INPUT_PATH = \"/content/model_release_only.tsv\"\n",
        "\n",
        "# # =========================\n",
        "# # 2) Load and normalize columns\n",
        "# # =========================\n",
        "# df = pd.read_csv(INPUT_PATH, sep=\"\\t\")\n",
        "\n",
        "# # Normalize column names (strip whitespace only; case is preserved)\n",
        "# df.columns = [c.strip() for c in df.columns]\n",
        "\n",
        "# # Map to standard names\n",
        "# col_map = {}\n",
        "# if \"Provider\" in df.columns: col_map[\"Provider\"] = \"provider\"\n",
        "# if \"provider\" in df.columns: col_map[\"provider\"] = \"provider\"\n",
        "# if \"Model\" in df.columns: col_map[\"Model\"] = \"model\"\n",
        "# if \"name\" in df.columns: col_map[\"name\"] = \"model\"\n",
        "# if \"Category\" in df.columns: col_map[\"Category\"] = \"category\"\n",
        "# if \"category\" in df.columns: col_map[\"category\"] = \"category\"\n",
        "# if \"Score\" in df.columns: col_map[\"Score\"] = \"score\"\n",
        "# if \"score\" in df.columns: col_map[\"score\"] = \"score\"\n",
        "\n",
        "# df = df.rename(columns=col_map)\n",
        "\n",
        "# required = {\"provider\", \"model\", \"category\", \"score\"}\n",
        "# missing = required - set(df.columns)\n",
        "# if missing:\n",
        "#     raise ValueError(f\"Missing required columns in input: {sorted(missing)}\")\n",
        "\n",
        "# # Coerce types\n",
        "# df[\"category\"] = pd.to_numeric(df[\"category\"], errors=\"coerce\").astype(\"Int64\")\n",
        "# df[\"score\"] = pd.to_numeric(df[\"score\"], errors=\"coerce\")\n",
        "\n",
        "# # Keep only valid categories 1..7\n",
        "# df = df[df[\"category\"].between(1, 7, inclusive=\"both\")]\n",
        "\n",
        "# # Clean provider names\n",
        "# df[\"provider\"] = df[\"provider\"].astype(str).str.strip()\n",
        "\n",
        "# # =========================\n",
        "# # 3) Build per-model 7-category vectors (fill missing with 0)\n",
        "# # =========================\n",
        "# cats = [1,2,3,4,5,6,7]\n",
        "\n",
        "# df_model_cat = (\n",
        "#     df.groupby([\"provider\", \"model\", \"category\"], as_index=False)[\"score\"]\n",
        "#       .mean()\n",
        "# )\n",
        "\n",
        "# mat_model = (\n",
        "#     df_model_cat\n",
        "#     .pivot_table(index=[\"provider\",\"model\"], columns=\"category\", values=\"score\", aggfunc=\"mean\")\n",
        "#     .reindex(columns=cats)\n",
        "#     .fillna(0.0)\n",
        "# )\n",
        "\n",
        "# # =========================\n",
        "# # 4) Average across models per provider\n",
        "# # =========================\n",
        "# mat_provider = mat_model.groupby(level=\"provider\").mean()\n",
        "\n",
        "# # Add overall average\n",
        "# mat_provider[\"overall_avg\"] = mat_provider[cats].mean(axis=1)\n",
        "\n",
        "# # --- Round to 2 decimals everywhere ---\n",
        "# mat_model = mat_model.round(2)\n",
        "# mat_provider = mat_provider.round(2)\n",
        "\n",
        "# # =========================\n",
        "# # 5) Save outputs\n",
        "# # =========================\n",
        "# MODEL_MATRIX_OUT = \"/content/model_category_scores_matrix.tsv\"\n",
        "# PROVIDER_AVG_OUT = \"/content/provider_category_avg_scores.tsv\"\n",
        "\n",
        "# mat_model_reset = mat_model.reset_index().rename_axis(None, axis=1)\n",
        "# mat_model_reset.to_csv(MODEL_MATRIX_OUT, sep=\"\\t\", index=False)\n",
        "\n",
        "# mat_provider_reset = mat_provider.reset_index().rename_axis(None, axis=1)\n",
        "# mat_provider_reset.to_csv(PROVIDER_AVG_OUT, sep=\"\\t\", index=False)\n",
        "\n",
        "# print(f\"Saved per-model matrix to: {MODEL_MATRIX_OUT}\")\n",
        "# print(f\"Saved provider averages to: {PROVIDER_AVG_OUT}\")\n",
        "# print(\"\\nPreview (first 10 providers):\")\n",
        "# print(mat_provider_reset.head(10).to_string(index=False))\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "oH56mgOsNDi7",
        "outputId": "18e0a8b0-a794-452c-c6da-6e66a6a07301"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Saved per-model matrix to: /content/model_category_scores_matrix.tsv\n",
            "Saved provider averages to: /content/provider_category_avg_scores.tsv\n",
            "\n",
            "Preview (first 10 providers):\n",
            "  provider   1    2   3    4    5   6   7  overall_avg\n",
            "     01.ai 1.0 1.00 0.0 0.00 1.00 0.5 0.0         0.50\n",
            "       ai2 1.0 1.67 0.0 2.33 0.33 1.0 0.0         0.90\n",
            "      ai21 0.0 1.00 0.0 0.00 0.00 1.0 0.0         0.29\n",
            "   alibaba 0.0 1.00 2.0 0.00 0.25 0.0 0.0         0.46\n",
            "    amazon 1.0 1.00 1.0 0.00 1.00 0.0 0.0         0.57\n",
            " anthropic 1.5 1.25 1.0 0.00 0.00 0.0 0.0         0.54\n",
            "     apple 2.0 2.00 0.0 1.00 0.00 1.0 0.0         0.86\n",
            "  baichuan 2.0 2.00 2.0 0.00 0.00 1.0 0.0         1.00\n",
            "bigScience 1.5 1.00 2.5 1.50 0.50 0.5 0.5         1.14\n",
            " byteDance 0.0 2.00 0.0 0.00 0.00 0.0 0.0         0.29\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# # --- Provider & Country x Category averaged scores (fill missing categories with 0 per model, rounded to 2 decimals) ---\n",
        "\n",
        "# import pandas as pd\n",
        "# import numpy as np\n",
        "\n",
        "# # =========================\n",
        "# # 1) Path to your filtered input (is_model_release == True)\n",
        "# # =========================\n",
        "# INPUT_PATH = \"/content/model_release_only.tsv\"\n",
        "\n",
        "# # =========================\n",
        "# # 2) Load and normalize columns\n",
        "# # =========================\n",
        "# df = pd.read_csv(INPUT_PATH, sep=\"\\t\")\n",
        "\n",
        "# # Normalize column names (strip)\n",
        "# df.columns = [c.strip() for c in df.columns]\n",
        "\n",
        "# # Map to standard names\n",
        "# col_map = {}\n",
        "# if \"Provider\" in df.columns: col_map[\"Provider\"] = \"provider\"\n",
        "# if \"provider\" in df.columns: col_map[\"provider\"] = \"provider\"\n",
        "# if \"Model\" in df.columns: col_map[\"Model\"] = \"model\"\n",
        "# if \"name\" in df.columns: col_map[\"name\"] = \"model\"\n",
        "# if \"Category\" in df.columns: col_map[\"Category\"] = \"category\"\n",
        "# if \"category\" in df.columns: col_map[\"category\"] = \"category\"\n",
        "# if \"Score\" in df.columns: col_map[\"Score\"] = \"score\"\n",
        "# if \"score\" in df.columns: col_map[\"score\"] = \"score\"\n",
        "# if \"Country\" in df.columns: col_map[\"Country\"] = \"country\"\n",
        "# if \"country\" in df.columns: col_map[\"country\"] = \"country\"\n",
        "\n",
        "# df = df.rename(columns=col_map)\n",
        "\n",
        "# required = {\"provider\", \"model\", \"category\", \"score\"}\n",
        "# missing = required - set(df.columns)\n",
        "# if missing:\n",
        "#     raise ValueError(f\"Missing required columns in input: {sorted(missing)}\")\n",
        "\n",
        "# # Coerce types\n",
        "# df[\"category\"] = pd.to_numeric(df[\"category\"], errors=\"coerce\").astype(\"Int64\")\n",
        "# df[\"score\"] = pd.to_numeric(df[\"score\"], errors=\"coerce\")\n",
        "\n",
        "# # Keep only valid categories 1..7\n",
        "# df = df[df[\"category\"].between(1, 7, inclusive=\"both\")]\n",
        "\n",
        "# # Clean strings\n",
        "# df[\"provider\"] = df[\"provider\"].astype(str).str.strip()\n",
        "# if \"country\" in df.columns:\n",
        "#     df[\"country\"] = df[\"country\"].astype(str).str.strip().replace({\"\": np.nan})\n",
        "# else:\n",
        "#     # If country not present, create a placeholder so the country aggregate still works\n",
        "#     df[\"country\"] = \"Unknown\"\n",
        "\n",
        "# # =========================\n",
        "# # 3) Build per-model 7-category vectors (fill missing with 0)\n",
        "# # =========================\n",
        "# cats = [1,2,3,4,5,6,7]\n",
        "\n",
        "# # Average duplicates first at (country, provider, model, category)\n",
        "# df_model_cat = (\n",
        "#     df.groupby([\"country\", \"provider\", \"model\", \"category\"], as_index=False)[\"score\"]\n",
        "#       .mean()\n",
        "# )\n",
        "\n",
        "# # Pivot to category columns; fill missing categories with 0 PER MODEL\n",
        "# mat_model = (\n",
        "#     df_model_cat\n",
        "#     .pivot_table(index=[\"country\",\"provider\",\"model\"], columns=\"category\", values=\"score\", aggfunc=\"mean\")\n",
        "#     .reindex(columns=cats)\n",
        "#     .fillna(0.0)\n",
        "# )\n",
        "\n",
        "# # =========================\n",
        "# # 4a) Average across models per PROVIDER\n",
        "# # =========================\n",
        "# # group by provider level (ignore country) to get one row per provider\n",
        "# mat_provider = mat_model.groupby(level=\"provider\").mean()\n",
        "# mat_provider[\"overall_avg\"] = mat_provider[cats].mean(axis=1)\n",
        "\n",
        "# # =========================\n",
        "# # 4b) Average across models per COUNTRY\n",
        "# # =========================\n",
        "# mat_country = mat_model.groupby(level=\"country\").mean()\n",
        "# mat_country[\"overall_avg\"] = mat_country[cats].mean(axis=1)\n",
        "\n",
        "# # --- Round to 2 decimals everywhere ---\n",
        "# mat_model = mat_model.round(2)\n",
        "# mat_provider = mat_provider.round(2)\n",
        "# mat_country = mat_country.round(2)\n",
        "\n",
        "# # =========================\n",
        "# # 5) Save outputs\n",
        "# # =========================\n",
        "# MODEL_MATRIX_OUT = \"/content/model_category_scores_matrix.tsv\"\n",
        "# PROVIDER_AVG_OUT = \"/content/provider_category_avg_scores.tsv\"\n",
        "# COUNTRY_AVG_OUT = \"/content/country_category_avg_scores.tsv\"\n",
        "\n",
        "# # Per-model matrix (country, provider, model, 7 category columns)\n",
        "# mat_model_reset = mat_model.reset_index().rename_axis(None, axis=1)\n",
        "# mat_model_reset.to_csv(MODEL_MATRIX_OUT, sep=\"\\t\", index=False)\n",
        "\n",
        "# # Provider-level averages (provider, 7 category columns + overall_avg)\n",
        "# mat_provider_reset = mat_provider.reset_index().rename_axis(None, axis=1)\n",
        "# mat_provider_reset.to_csv(PROVIDER_AVG_OUT, sep=\"\\t\", index=False)\n",
        "\n",
        "# # Country-level averages (country, 7 category columns + overall_avg)\n",
        "# mat_country_reset = mat_country.reset_index().rename_axis(None, axis=1)\n",
        "# mat_country_reset.to_csv(COUNTRY_AVG_OUT, sep=\"\\t\", index=False)\n",
        "\n",
        "# print(f\"Saved per-model matrix to: {MODEL_MATRIX_OUT}\")\n",
        "# print(f\"Saved provider averages to: {PROVIDER_AVG_OUT}\")\n",
        "# print(f\"Saved country averages to:  {COUNTRY_AVG_OUT}\")\n",
        "\n",
        "# print(\"\\nPreview (first 8 providers):\")\n",
        "# print(mat_provider_reset.head(8).to_string(index=False))\n",
        "# print(\"\\nPreview (top 8 countries):\")\n",
        "# print(mat_country_reset.head(8).to_string(index=False))\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "VgrsL7akPejA",
        "outputId": "f8cbf34e-f7c6-40fa-b503-68d87dcdf1f4"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Saved per-model matrix to: /content/model_category_scores_matrix.tsv\n",
            "Saved provider averages to: /content/provider_category_avg_scores.tsv\n",
            "Saved country averages to:  /content/country_category_avg_scores.tsv\n",
            "\n",
            "Preview (first 8 providers):\n",
            " provider   1    2   3    4    5   6   7  overall_avg\n",
            "    01.ai 1.0 1.00 0.0 0.00 1.00 0.5 0.0         0.50\n",
            "      ai2 1.0 1.67 0.0 2.33 0.33 1.0 0.0         0.90\n",
            "     ai21 0.0 1.00 0.0 0.00 0.00 1.0 0.0         0.29\n",
            "  alibaba 0.0 1.00 2.0 0.00 0.25 0.0 0.0         0.46\n",
            "   amazon 1.0 1.00 1.0 0.00 1.00 0.0 0.0         0.57\n",
            "anthropic 1.5 1.25 1.0 0.00 0.00 0.0 0.0         0.54\n",
            "    apple 2.0 2.00 0.0 1.00 0.00 1.0 0.0         0.86\n",
            " baichuan 2.0 2.00 2.0 0.00 0.00 1.0 0.0         1.00\n",
            "\n",
            "Preview (top 8 countries):\n",
            "             country    1    2    3    4    5    6    7  overall_avg\n",
            "              Canada 2.50 2.00 2.00 0.00 0.50 0.00 0.50         1.07\n",
            "               China 0.53 1.29 0.76 0.24 0.47 0.71 0.00         0.57\n",
            "              France 0.55 0.45 1.64 0.36 0.09 0.27 0.09         0.49\n",
            "              Israel 0.00 1.00 0.00 0.00 0.00 1.00 0.00         0.29\n",
            "         South Korea 0.00 2.00 0.00 1.00 0.00 1.00 0.00         0.57\n",
            "United Arab Emirates 0.00 2.00 3.00 1.00 0.00 3.00 0.00         1.29\n",
            "      United Kingdom 0.00 1.00 0.00 2.00 0.00 0.67 0.00         0.52\n",
            "       United States 1.59 1.51 0.95 0.95 0.78 0.57 0.27         0.95\n"
          ]
        }
      ]
    }
  ]
}