{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "source": [
    "import os\n",
    "import json\n",
    "import matplotlib.pyplot as plt\n",
    "from scipy import stats\n",
    "import pickle\n",
    "\n",
    "from matplotlib import rcParams\n",
    "import numpy as np\n",
    "import seaborn as sns\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "\n",
    "# Global matplotlib styling defaults, applied in one batch.\n",
    "rcParams.update({\n",
    "    \"font.family\": \"serif\",\n",
    "    \"grid.linestyle\": ':',\n",
    "    \"xtick.direction\": 'in',\n",
    "    \"ytick.direction\": 'in',\n",
    "    \"legend.fontsize\": 9,\n",
    "    \"axes.labelsize\": 20,\n",
    "    \"axes.titlesize\": 20,\n",
    "    \"xtick.labelsize\": 15,\n",
    "    \"ytick.labelsize\": 15,\n",
    "})"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "source": [
    "# Directory whose count files are listed and then loaded as JSON by the cells below.\n",
    "# The path is relative, so it is resolved against the kernel's working directory.\n",
    "search_counts_dir = '../search_counts'"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "source": [
    "# List the directory once, then split its entries into three groups of count files\n",
    "# using substrings of the file names.\n",
    "count_file_names = os.listdir(search_counts_dir)\n",
    "\n",
    "image_search_counts = [\n",
    "    os.path.join(search_counts_dir, name)\n",
    "    for name in count_file_names\n",
    "    if '0.7_' in name and 'integrated' not in name\n",
    "]\n",
    "\n",
    "text_search_counts = [\n",
    "    os.path.join(search_counts_dir, name)\n",
    "    for name in count_file_names\n",
    "    if 'lemmatized' in name and 'integrated' not in name\n",
    "]\n",
    "\n",
    "integrated_search_counts = [\n",
    "    os.path.join(search_counts_dir, name)\n",
    "    for name in count_file_names\n",
    "    if '0.7_' in name and 'lemmatized' in name and 'integrated' in name\n",
    "]"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "source": [
    "# Collect the cc3m count files.\n",
    "# NOTE(review): this cell filters integrated_search_counts, while the cc12m, yfcc15m and\n",
    "# laion400m cells filter image_search_counts - confirm this difference is intended.\n",
    "cc3m_relevant_count_files = [path for path in integrated_search_counts if 'cc3m' in path]\n",
    "\n",
    "data = []                    # one record per unique concept\n",
    "processed_concepts = set()   # concepts already recorded from an earlier file\n",
    "\n",
    "for count_path in sorted(cc3m_relevant_count_files):\n",
    "    # The text before the first '_' of the file name labels every record of this file.\n",
    "    dataset = count_path.split('/')[-1].split('_')[0]\n",
    "    with open(count_path) as handle:\n",
    "        concept_counts = json.load(handle)\n",
    "    for concept, count in concept_counts.items():\n",
    "        if concept in processed_concepts:\n",
    "            continue  # keep only the first occurrence across files\n",
    "        processed_concepts.add(concept)\n",
    "        data.append({'id': concept, 'concept_name': concept, 'dataset': dataset, 'count': count})\n",
    "\n",
    "# Counts in the same order as the rows of the DataFrame.\n",
    "cc3m_counts = [record['count'] for record in data]\n",
    "cc3m_df = pd.DataFrame(data)\n",
    "print(len(cc3m_df))"
   ],
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "3775\n"
     ]
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "source": [
    "# Collect the cc12m count files from the image-search file list.\n",
    "cc12m_relevant_count_files = [path for path in image_search_counts if 'cc12m' in path]\n",
    "\n",
    "data = []                    # one record per unique concept\n",
    "processed_concepts = set()   # concepts already recorded from an earlier file\n",
    "\n",
    "for count_path in sorted(cc12m_relevant_count_files):\n",
    "    # The text before the first '_' of the file name labels every record of this file.\n",
    "    dataset = count_path.split('/')[-1].split('_')[0]\n",
    "    with open(count_path) as handle:\n",
    "        concept_counts = json.load(handle)\n",
    "    for concept, count in concept_counts.items():\n",
    "        if concept in processed_concepts:\n",
    "            continue  # keep only the first occurrence across files\n",
    "        processed_concepts.add(concept)\n",
    "        data.append({'id': concept, 'concept_name': concept, 'dataset': dataset, 'count': count})\n",
    "\n",
    "# Counts in the same order as the rows of the DataFrame.\n",
    "cc12m_counts = [record['count'] for record in data]\n",
    "cc12m_df = pd.DataFrame(data)\n",
    "print(len(cc12m_df))"
   ],
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "3811\n"
     ]
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "source": [
    "# Collect the yfcc15m count files from the image-search file list.\n",
    "yfcc15m_relevant_count_files = [path for path in image_search_counts if 'yfcc15m' in path]\n",
    "\n",
    "data = []                    # one record per unique concept\n",
    "processed_concepts = set()   # concepts already recorded from an earlier file\n",
    "\n",
    "for count_path in sorted(yfcc15m_relevant_count_files):\n",
    "    # The text before the first '_' of the file name labels every record of this file.\n",
    "    dataset = count_path.split('/')[-1].split('_')[0]\n",
    "    with open(count_path) as handle:\n",
    "        concept_counts = json.load(handle)\n",
    "    for concept, count in concept_counts.items():\n",
    "        if concept in processed_concepts:\n",
    "            continue  # keep only the first occurrence across files\n",
    "        processed_concepts.add(concept)\n",
    "        data.append({'id': concept, 'concept_name': concept, 'dataset': dataset, 'count': count})\n",
    "\n",
    "# Counts in the same order as the rows of the DataFrame.\n",
    "yfcc15m_counts = [record['count'] for record in data]\n",
    "yfcc15m_df = pd.DataFrame(data)\n",
    "print(len(yfcc15m_df))"
   ],
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "3811\n"
     ]
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "source": [
    "# Collect the laion400m count files from the image-search file list.\n",
    "laion400m_relevant_count_files = [path for path in image_search_counts if 'laion400m' in path]\n",
    "\n",
    "data = []                    # one record per unique concept\n",
    "processed_concepts = set()   # concepts already recorded from an earlier file\n",
    "\n",
    "for count_path in sorted(laion400m_relevant_count_files):\n",
    "    # The text before the first '_' of the file name labels every record of this file.\n",
    "    dataset = count_path.split('/')[-1].split('_')[0]\n",
    "    with open(count_path) as handle:\n",
    "        concept_counts = json.load(handle)\n",
    "    for concept, count in concept_counts.items():\n",
    "        if concept in processed_concepts:\n",
    "            continue  # keep only the first occurrence across files\n",
    "        processed_concepts.add(concept)\n",
    "        data.append({'id': concept, 'concept_name': concept, 'dataset': dataset, 'count': count})\n",
    "\n",
    "# Counts in the same order as the rows of the DataFrame.\n",
    "laion400m_counts = [record['count'] for record in data]\n",
    "laion400m_df = pd.DataFrame(data)\n",
    "print(len(laion400m_df))"
   ],
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "3776\n"
     ]
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "source": [
    "print(len(laion400m_counts))\n",
    "print(len(yfcc15m_counts))\n",
    "\n",
    "res = stats.spearmanr(laion400m_counts, yfcc15m_counts)\n",
    "print(res)"
   ],
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "3840\n",
      "3840\n",
      "SignificanceResult(statistic=0.7644588019322125, pvalue=0.0)\n"
     ]
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "source": [
    "\n",
    "print(len(cc3m_counts))\n",
    "print(len(cc12m_counts))\n",
    "\n",
    "res = stats.spearmanr(cc3m_counts, cc12m_counts)\n",
    "print(res)"
   ],
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "3775\n",
      "3775\n",
      "SignificanceResult(statistic=0.7924258926755035, pvalue=0.0)\n"
     ]
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "source": [
    "\n",
    "print(len(yfcc15m_counts))\n",
    "print(len(cc12m_counts))\n",
    "\n",
    "res = stats.spearmanr(yfcc15m_counts, cc12m_counts)\n",
    "print(res)"
   ],
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "3811\n",
      "3811\n",
      "SignificanceResult(statistic=0.9637908005672169, pvalue=0.0)\n"
     ]
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "source": [
    "\n",
    "print(len(laion400m_counts))\n",
    "print(len(cc3m_counts))\n",
    "\n",
    "res = stats.spearmanr(laion400m_counts[:3775], cc3m_counts)\n",
    "print(res)"
   ],
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "3776\n",
      "3775\n",
      "SignificanceResult(statistic=0.6282374756265174, pvalue=0.0)\n"
     ]
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "source": [
    "\n",
    "print(len(laion400m_counts))\n",
    "print(len(cc12m_counts))\n",
    "\n",
    "res = stats.spearmanr(laion400m_counts, cc12m_counts[:3776])\n",
    "print(res)"
   ],
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "3776\n",
      "3811\n",
      "SignificanceResult(statistic=0.7353264196231468, pvalue=0.0)\n"
     ]
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "source": [
    "\n",
    "print(len(yfcc15m_counts))\n",
    "print(len(cc12m_counts))\n",
    "\n",
    "res = stats.spearmanr(yfcc15m_counts, cc12m_counts)\n",
    "print(res)"
   ],
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "3811\n",
      "3811\n",
      "SignificanceResult(statistic=0.9637908005672169, pvalue=0.0)\n"
     ]
    }
   ],
   "metadata": {}
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}