{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 103,
   "id": "a0fccb67",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the data\n",
    "from pathlib import Path\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "data_csv = Path(\"reviews.csv\")\n",
    "\n",
    "# If reviews.csv is missing, run the result compiler first. The import is done\n",
    "# lazily so result_compiler is only required when the file does not exist yet.\n",
    "if not data_csv.exists():\n",
    "    from result_compiler import main as data_compiler\n",
    "    data_compiler()\n",
    "\n",
    "df = pd.read_csv(data_csv)\n",
    "\n",
    "# Sort the sources in the DataFrame as AAAI, IJCAI then ICML, NeurIPS and JAIR, JMLR\n",
    "source_order = [\"AAAI\", \"IJCAI\", \"ICML\", \"NeurIPS\", \"JAIR\", \"JMLR\"]\n",
    "df[\"source\"] = pd.Categorical(df[\"source\"], source_order)\n",
    "# Drop the index column (reassigned instead of mutated in place)\n",
    "df = df.drop(columns=\"index\")\n",
    "# Drop the theoretical papers. The explicit copy makes df an independent frame,\n",
    "# so columns can be added to it later without a SettingWithCopyWarning.\n",
    "df = df[df[\"theoretical\"] == False].copy()\n",
    "\n",
    "# Load the enriched data\n",
    "enriched_data_csv = Path(\"enriched_data.csv\")\n",
    "enriched_df = pd.read_csv(enriched_data_csv)\n",
    "\n",
    "# One title per line; the context manager closes the file handle instead of leaking it.\n",
    "# NOTE(review): the file is read with the platform default encoding - confirm it\n",
    "# matches the encoding of unfindable.txt, since titles may contain non-ASCII characters.\n",
    "with Path(\"unfindable.txt\").open() as unfindable_file:\n",
    "    missing_titles_enriched = [line.strip() for line in unfindable_file]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "id": "c639689d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Could not match: Entropy Estimation via Normalizing Flow\n",
      "Could not match: SVFI: Spiking-Based Video Frame Interpolation for High-Speed Motion\n",
      "Could not match: Align, Perturb and Decouple: Toward Better Leverage of Difference Information for RSI Change Detection\n",
      "Could not match: Convergence in Multi-Issue Iterative Voting under Uncertainty\n",
      "Could not match: HEBO: Pushing The Limits of Sample-Efficient Hyper-parameter Optimisation\n",
      "Could not match: Underspecification Presents Challenges for Credibility in Modern Machine Learning\n",
      "Could not match: On the Effect of Initialization: The Scaling Path of 2-Layer Neural Networks\n",
      "Could not match: Nonparametric Inference under B-bits Quantization\n",
      "Could not match: Mean-Square Analysis of Discretized Itô Diffusions for Heavy-tailed Sampling\n",
      "Could not match: Convergence for nonconvex ADMM, with applications to CT imaging\n",
      "Could not match: Localized Debiased Machine Learning: Efficient Inference on Quantile Treatment Effects and Beyond\n",
      "Could not match: Improving physics-informed neural networks with meta-learned optimization\n",
      "Could not match: On the Sample Complexity and Metastability of Heavy-tailed Policy Search in Continuous Control\n",
      "Could not match: Seeded Graph Matching for the Correlated Gaussian Wigner Model via the Projected Power Method\n",
      "Could not match: Post-Regularization Confidence Bands for Ordinary Differential Equations\n",
      "Could not match: Numerically Stable Sparse Gaussian Processes via Minimum Separation using Cover Trees\n",
      "Could not match: On Sufficient Graphical Models\n",
      "Could not match: Additive smoothing error in backward variational inference for general state-space models\n",
      "Could not match: Pursuit of the Cluster Structure of Network Lasso: Recovery Condition and Non-convex Extension\n",
      "Could not match: Power of knockoff: The impact of ranking algorithm, augmented design, and symmetric statistic\n",
      "Could not match: Distributed Gaussian Mean Estimation under Communication Constraints: Optimal Rates and Communication-Efficient Algorithms\n",
      "Could not match: Efficient Modality Selection in Multimodal Learning\n",
      "Could not match: Multiple Descent in the Multiple Random Feature Model\n",
      "Could not match: Effect-Invariant Mechanisms for Policy Generalization\n",
      "Could not match: Decorrelated Variable Importance\n",
      "Could not match: Adam-family Methods for Nonsmooth Optimization with Convergence Guarantees\n",
      "Could not match: On the Generalization of Stochastic Gradient Descent with Momentum\n",
      "Could not match: Black Box Variational Inference with a Deterministic Objective: Faster, More Accurate, and Even More Black Box\n",
      "Could not match: On Efficient and Scalable Computation of the Nonparametric Maximum Likelihood Estimator in Mixture Models\n",
      "Could not match: On Tail Decay Rate Estimation of Loss Function Distributions\n",
      "Could not match: Sparse NMF with Archetypal Regularization: Computational and Robustness Properties\n",
      "Could not match: Invariant and Equivariant Reynolds Networks\n",
      "Could not match: Iterate Averaging in the Quest for Best Test Error\n",
      "Could not match: On Truthing Issues in Supervised Classification\n",
      "Could not match: Survival Kernets: Scalable and Interpretable Deep Kernel Survival Analysis with an Accuracy Guarantee\n",
      "Could not match: Modeling Random Networks with Heterogeneous Reciprocity\n",
      "Could not match: Personalized PCA: Decoupling Shared and Unique Features\n",
      "Could not match: Probabilistic Forecasting with Generative Networks via Scoring Rule Minimization\n",
      "Could not match: An empirical analysis of compute-optimal large language model training\n",
      "Could not match: A High-Resolution Dataset for Instance Detection with Multi-View Object Capture\n",
      "Could not match: MultiVENT: Multilingual Videos of Events and Aligned Natural Text\n",
      "734\n",
      "734\n"
     ]
    }
   ],
   "source": [
    "# Create a mapping between titles\n",
    "import difflib\n",
    "\n",
    "# The in operator on a pandas Series checks the index labels, not the values,\n",
    "# so the titles are collected into plain Python containers before the loop.\n",
    "enriched_titles = enriched_df[\"title\"].tolist()\n",
    "enriched_title_set = set(enriched_titles)\n",
    "unfindable_titles = set(missing_titles_enriched)\n",
    "\n",
    "mapping_to_enriched_title = {}\n",
    "for title in df[\"title\"]:\n",
    "    if title in unfindable_titles:  # Is not present, was not findable on scopus\n",
    "        continue\n",
    "    if title in enriched_title_set:  # Easy case: the exact title exists\n",
    "        mapping_to_enriched_title[title] = title\n",
    "        continue\n",
    "    # Fall back to fuzzy matching; only the best candidate is used, so n=1 suffices\n",
    "    close_matches = difflib.get_close_matches(title, enriched_titles, n=1, cutoff=0.6)\n",
    "    if not close_matches:\n",
    "        print(\"Could not match:\", title)\n",
    "        continue\n",
    "    mapping_to_enriched_title[title] = close_matches[0]\n",
    "\n",
    "# Number of mapped titles, then number of distinct targets. The two numbers\n",
    "# differ when several titles were mapped to the same enriched title.\n",
    "print(len(mapping_to_enriched_title))\n",
    "print(len(set(mapping_to_enriched_title.values())))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "id": "102add04",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print every title whose enriched match is shared with at least one other title\n",
    "mapped_values = list(mapping_to_enriched_title.values())\n",
    "for original_title, enriched_title in mapping_to_enriched_title.items():\n",
    "    shared_by = sum(1 for candidate in mapped_values if candidate == enriched_title)\n",
    "    if shared_by > 1:\n",
    "        print(original_title, \":\", enriched_title)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "id": "12dd4db9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Add citations count column and country of first authors\n",
    "# Add the citation count and the most frequent affiliation country of each paper\n",
    "df[\"citation_count\"] = None\n",
    "df[\"country\"] = None\n",
    "\n",
    "def most_frequent(l: list):\n",
    "    \"\"\"Return the element that occurs most often in l.\n",
    "\n",
    "    Ties are resolved in favour of the element that appears first in l, so the\n",
    "    result is reproducible across runs. Iterating over set(l) instead would make\n",
    "    the winner of a tie depend on string hash randomisation.\n",
    "\n",
    "    Raises ValueError when l is empty.\n",
    "    \"\"\"\n",
    "    # dict.fromkeys removes duplicates while keeping first-seen order\n",
    "    return max(dict.fromkeys(l), key=l.count)\n",
    "\n",
    "for title, mapping in mapping_to_enriched_title.items():\n",
    "    enriched_row = enriched_df[enriched_df[\"title\"] == mapping]\n",
    "    title_rows = df[\"title\"] == title  # computed once, reused for both columns\n",
    "    # The first enriched row carrying the mapped title provides the values\n",
    "    df.loc[title_rows, \"citation_count\"] = enriched_row[\"citedby_count\"].values[0]\n",
    "    aff = enriched_row[\"affiliation_country\"].values[0]\n",
    "    if isinstance(aff, str):  # non-string values (e.g. missing affiliations) are skipped\n",
    "        countries = aff.split(\";\")\n",
    "        # Most frequent country affiliation; countries[0] would be the first listed country\n",
    "        df.loc[title_rows, \"country\"] = most_frequent(countries)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "id": "414b5c8e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Mean rating per paper over every review category except the last one (Expertise)\n",
    "review_categories = [\n",
    "    \"Implementation\",\n",
    "    \"Data\",\n",
    "    \"Configuration\",\n",
    "    \"Experimental Procedure\",\n",
    "    \"Expertise\",\n",
    "]\n",
    "df[\"Average\"] = df.loc[:, review_categories[:-1]].mean(axis=\"columns\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "id": "fa8f1247",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                citation_count   country   Average\n",
      "citation_count        1.000000 -0.063923  0.045194\n",
      "country              -0.063923  1.000000 -0.161919\n",
      "Average               0.045194 -0.161919  1.000000\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/78/9cl0kydx0g3cdzjyk28yzr7w0000gn/T/ipykernel_90196/2552227325.py:5: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
      "  df_reduce[\"country\"] = df_reduce[\"country\"].replace(country_id)\n"
     ]
    }
   ],
   "source": [
    "# Drop every column that is not part of the correlation analysis\n",
    "df_reduce = df.drop(\n",
    "    columns=[\n",
    "        \"source\", \"year\", \"title\", \"authors\", \"keywords\", \"pdf_path\", \"awards\",\n",
    "        \"theoretical\", \"implementation_url\", \"public_datasets\", \"total_datasets\",\n",
    "        \"Implementation\", \"Data\", \"Configuration\", \"Experimental Procedure\", \"Expertise\",\n",
    "    ]\n",
    ")\n",
    "# Only use the ones with citation values; copy so the filtered frame can be modified safely\n",
    "df_reduce = df_reduce[df_reduce[\"citation_count\"].notna()].copy()\n",
    "# Encode every distinct country value (a missing country included) as an integer,\n",
    "# numbered in order of first appearance.\n",
    "# NOTE(review): these codes are arbitrary labels, so any rank correlation that\n",
    "# involves country depends on that order - confirm this is the intended analysis.\n",
    "country_id = {v: i for i, v in enumerate(df_reduce[\"country\"].unique())}\n",
    "# Look the codes up directly; Series.replace emits a FutureWarning here because its\n",
    "# silent downcasting of the result is deprecated.\n",
    "df_reduce[\"country\"] = [country_id[country] for country in df_reduce[\"country\"]]\n",
    "\n",
    "correlation = df_reduce.corr(method=\"kendall\")\n",
    "print(correlation)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
