{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pandas as pd\n",
    "from datetime import datetime\n",
    "import cohere\n",
    "import pandas as pd\n",
    "import asyncio\n",
    "from tqdm import tqdm\n",
    "import numpy as np\n",
    "from tqdm.asyncio import tqdm_asyncio\n",
    "import nest_asyncio\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "import ast\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "\n",
    "\n",
    "FONT_SIZES = {\"small\": 12, \"medium\": 20, \"large\": 20}\n",
    "COLORS = {\"green\": \"#355145\", \"purple\": \"#d8a6e5\", \"orange\": \"#fe7759\"}\n",
    "\n",
    "PLOT_PARAMS = {\n",
    "    \"font.size\": FONT_SIZES.get(\"medium\"),\n",
    "    \"axes.titlesize\": FONT_SIZES.get(\"large\"),\n",
    "    \"axes.labelsize\": FONT_SIZES.get(\"large\"),\n",
    "    \"xtick.labelsize\": FONT_SIZES.get(\"medium\"),\n",
    "    \"ytick.labelsize\": FONT_SIZES.get(\"medium\"),\n",
    "    \"legend.fontsize\": FONT_SIZES.get(\"medium\"),\n",
    "    \"figure.titlesize\": FONT_SIZES.get(\"medium\"),\n",
    "    \"text.usetex\": False,\n",
    "}\n",
    "\n",
    "plt.rcParams.update(PLOT_PARAMS)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Read data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "base_dir = \"/local-disk/lmsys-march-25/\" \n",
    "all_data = []\n",
    "\n",
    "for year in os.listdir(base_dir):\n",
    "    for month in os.listdir(os.path.join(base_dir, year)):\n",
    "        for day in os.listdir(os.path.join(base_dir, year, month)):\n",
    "            for file in os.listdir(os.path.join(base_dir, year, month, day)):\n",
    "                file_path = os.path.join(base_dir, year, month, day, file)\n",
    "                print(f\"Processing: {file_path}\")\n",
    "                df = pd.read_json(file_path, lines = True)  \n",
    "                df['date'] = datetime.strptime(f\"{year}-{month}-{day}\", \"%Y-%m-%d\")\n",
    "                all_data.append(df)\n",
    "\n",
    "df = pd.concat(all_data, ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_filter = df.copy()\n",
    "df_filter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_filter = df_filter.dropna(subset=['message'])\n",
    "df_filter['message'] = df_filter['message'].astype(str).str.strip()\n",
    "df_filter[df_filter['message'] == \"\"] = None \n",
    "df_filter = df_filter.dropna(subset=['message']).reset_index(drop=True)\n",
    "df_filter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_filter = df_filter[df_filter['req_chat_history'].apply(lambda x: x == [])]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_filter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_filter = df_filter.dropna(subset=['message'])\n",
    "df_filter['message'] = df_filter['message'].astype(str).str.strip()\n",
    "df_filter[df_filter['message'] == \"\"] = None \n",
    "df_filter = df_filter.dropna(subset=['message']).reset_index(drop=True)\n",
    "df_filter"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# get embedding "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import cohere\n",
    "import numpy as np\n",
    "import asyncio\n",
    "from tqdm.asyncio import tqdm_asyncio\n",
    "\n",
    "\n",
    "client = cohere.AsyncClient()\n",
    "\n",
    "\n",
    "async def get_embeddings_async(data):\n",
    "\n",
    "    messages = data['message'].tolist()\n",
    "    semaphore = asyncio.Semaphore(128)  \n",
    "\n",
    "    async def process_message(index, text):\n",
    "        async with semaphore:\n",
    "            for attempt in range(5):\n",
    "                try:\n",
    "                    response = await client.embed(\n",
    "                        texts=[text],\n",
    "                        model=\"embed-multilingual-v3.0\",\n",
    "                        input_type=\"search_document\",\n",
    "                        embedding_types=[\"float\"]\n",
    "                    )\n",
    "                    return index, response.embeddings.float[0]\n",
    "                except Exception as e:\n",
    "                    print(f\"Attempt {attempt + 1} failed at index {index}: {e}\")\n",
    "                    if attempt < 5 - 1:\n",
    "                        await asyncio.sleep(3) \n",
    "                    else:\n",
    "                        return index, None\n",
    "\n",
    "    tasks = [process_message(i, msg) for i, msg in enumerate(messages)]\n",
    "    results = await tqdm_asyncio.gather(*tasks)\n",
    "    \n",
    "    embeddings = [None] * len(messages)\n",
    "    for idx, embedding in results:\n",
    "        embeddings[idx] = embedding\n",
    "\n",
    "    data[\"embedding\"] = embeddings\n",
    "    return data\n",
    "\n",
    "nest_asyncio.apply() \n",
    "\n",
    "df_embed = await get_embeddings_async(df_filter)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_embed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_embed = df_embed.dropna()\n",
    "df_embed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_embed.to_pickle('df_embed.pkl')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# calculate similarity "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_embed = pd.read_pickle('df_embed.pkl')\n",
    "df_embed['date'] = pd.to_datetime(df_embed['date'])\n",
    "\n",
    "df_embed['month'] = df_embed['date'].dt.to_period('M')\n",
    "\n",
    "monthly_groups = df_embed.groupby('month')\n",
    "\n",
    "sorted_months = sorted(monthly_groups.groups.keys())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_embed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_embed.groupby('model').size()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a month-to-month similarity matrix\n",
    "results = []\n",
    "\n",
    "for month_1 in sorted_months:\n",
    "    for month_2 in sorted_months:\n",
    "        \n",
    "        messages_1 = monthly_groups.get_group(month_1)['message'].str.lower().to_numpy()\n",
    "        messages_2 = monthly_groups.get_group(month_2)['message'].str.lower().to_numpy()\n",
    "        \n",
    "        exact_match = (messages_1[:, None] == messages_2[None, :])\n",
    "\n",
    "        embeddings_1 = np.stack(monthly_groups.get_group(month_1)['embedding'].to_numpy()).astype(np.float32)\n",
    "        embeddings_2 = np.stack(monthly_groups.get_group(month_2)['embedding'].to_numpy()).astype(np.float32)\n",
    "        \n",
    "        similarities = cosine_similarity(embeddings_1, embeddings_2)\n",
    "\n",
    "        if month_1 == month_2:\n",
    "            np.fill_diagonal(similarities, 0)\n",
    "            np.fill_diagonal(exact_match, False)\n",
    "        \n",
    "        num_duplicate_exact_match = np.any(exact_match, axis=1).sum()\n",
    "\n",
    "        num_duplicate_embedding = np.sum(np.max(similarities, axis=1) > 0.95)\n",
    "\n",
    "        # Add to results\n",
    "        results.append({\n",
    "            'from': str(month_1),\n",
    "            'to': str(month_2),\n",
    "            'num_duplicate_embedding': num_duplicate_embedding,\n",
    "            'num_duplicate_exact_match': num_duplicate_exact_match,\n",
    "            \"num_samples\": len(messages_1),\n",
    "        })"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.DataFrame(results).to_pickle('prompt_analysis_stats.pkl')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "results = pd.read_pickle('prompt_analysis_stats.pkl')\n",
    "results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "results['duplicate_embedding_rate'] = results['num_duplicate_embedding'] / results['num_samples']\n",
    "results['duplicate_exact_match_rate'] = results['num_duplicate_exact_match'] / results['num_samples']\n",
    "results = results[['from', 'to', 'duplicate_embedding_rate', 'duplicate_exact_match_rate']]\n",
    "results = results[(results['from'] != '2025-02') & (results['to'] != '2025-02')]\n",
    "results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reshape to create a proper matrix format\n",
    "duplicate_embedding_rate_matrix = results.pivot(index='from', columns='to', values='duplicate_embedding_rate')\n",
    "\n",
    "plt.figure(figsize=(12, 6))\n",
    "sns.heatmap(duplicate_embedding_rate_matrix, annot=True, fmt=\".3f\", cmap=\"YlGnBu\", linewidths=0.5)\n",
    "plt.xlabel(None)\n",
    "plt.ylabel(None)\n",
    "plt.tight_layout()\n",
    "plt.savefig('prompt-duplication-heatmap-embedding.pdf', dpi=300, bbox_inches='tight')\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reshape to create a proper matrix format\n",
    "duplicate_exact_match_rate_matrix = results.pivot(index='from', columns='to', values='duplicate_exact_match_rate')\n",
    "\n",
    "plt.figure(figsize=(12, 6))\n",
    "sns.heatmap(duplicate_exact_match_rate_matrix, annot=True, fmt=\".3f\", cmap=\"YlGnBu\", linewidths=0.5)\n",
    "plt.xlabel(None)\n",
    "plt.ylabel(None)\n",
    "plt.tight_layout()\n",
    "plt.savefig('prompt-duplication-heatmap-exact-match.pdf', dpi=300, bbox_inches='tight')\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = results[results['from'] == results['to']]\n",
    "\n",
    "\n",
    "x = np.arange(len(df['from']))  # the label locations\n",
    "width = 0.35  # the width of the bars\n",
    "\n",
    "# Plot\n",
    "plt.figure(figsize=(12, 6))\n",
    "\n",
    "# Plot both exact same and highly similar bar plots side by side\n",
    "bar1 = plt.bar(x - width/2, df['duplicate_exact_match_rate'] * 100, width, label='Exact Match', alpha=0.7, color='tab:green',edgecolor='black')\n",
    "bar2 = plt.bar(x + width/2, df['duplicate_embedding_rate'] * 100, width, label='High Similarity (cos similarity > 0.95)', alpha=0.7, color='tab:blue',edgecolor='black')\n",
    "\n",
    "# Add text on top of bars\n",
    "for bar in bar1:\n",
    "    height = bar.get_height()\n",
    "    plt.text(bar.get_x() + bar.get_width()/2, height + 0.5, f'{height:.1f}', ha='center', va='bottom', color='black', \n",
    "                     fontweight='bold', fontsize=FONT_SIZES['large']-5)\n",
    "for bar in bar2:\n",
    "    height = bar.get_height()\n",
    "    plt.text(bar.get_x() + bar.get_width()/2, height + 0.5, f'{height:.1f}', ha='center', va='bottom', color='black', \n",
    "                     fontweight='bold', fontsize=FONT_SIZES['large']-5)\n",
    "\n",
    "ax = plt.gca()\n",
    "\n",
    "for spine in ax.spines.values():\n",
    "    spine.set_linewidth(2)\n",
    "    spine.set_color('black')\n",
    "\n",
    "plt.xticks(x, df['from'])\n",
    "\n",
    "# Add legend and labels\n",
    "plt.grid(True, axis='y', linestyle='--', alpha=0.7)\n",
    "plt.legend()\n",
    "plt.ylabel('% of Prompts')\n",
    "plt.tight_layout()\n",
    "plt.ylim(0, 40)\n",
    "plt.savefig('prompt-duplication-bar-plot.pdf', dpi=300, bbox_inches='tight')\n",
    "plt.show()\n",
    "\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "chatbotarena",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.17"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
