{
 "nbformat": 4,
 "nbformat_minor": 0,
 "metadata": {
  "colab": {
   "provenance": []
  },
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3"
  },
  "language_info": {
   "name": "python"
  }
 },
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "DfSS9iYxCzT5",
    "outputId": "f9fe844d-1a54-427a-e3ab-47be6e32e307"
   },
   "outputs": [
    {
     "output_type": "stream",
     "name": "stderr",
     "text": [
      "[nltk_data] Downloading package punkt_tab to /root/nltk_data...\n",
      "[nltk_data]   Package punkt_tab is already up-to-date!\n"
     ]
    },
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "metadata": {},
     "execution_count": 1
    }
   ],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import gc\n",
    "import os\n",
    "import json\n",
    "from collections import Counter, defaultdict\n",
    "from tqdm.notebook import tqdm\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "import plotly.express as px\n",
    "import re\n",
    "year_pattern = r'([1-2][0-9]{3})'\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import gensim\n",
    "import gensim.corpora as corpora\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from gensim.models import LdaModel\n",
    "from gensim.corpora import Dictionary\n",
    "from nltk.tokenize import word_tokenize\n",
    "from nltk.corpus import stopwords\n",
    "import nltk\n",
    "nltk.download('punkt_tab')"
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "def get_metadata(path='arxiv-metadata-oai-snapshot.json'):\n",
    "    \"\"\"Lazily yield the raw lines of a line-delimited metadata file.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    path : str, optional\n",
    "        File to read. Defaults to 'arxiv-metadata-oai-snapshot.json' in the\n",
    "        current working directory, so zero-argument calls behave as before.\n",
    "\n",
    "    Yields\n",
    "    ------\n",
    "    str\n",
    "        One line of the file at a time, with its line ending intact. Nothing is\n",
    "        parsed here; the caller decodes each line (e.g. with json.loads).\n",
    "    \"\"\"\n",
    "    # Explicit UTF-8 so decoding does not depend on the platform's locale default.\n",
    "    with open(path, 'r', encoding='utf-8') as f:\n",
    "        # The file is only opened once iteration starts and is closed when the\n",
    "        # generator is exhausted or garbage-collected.\n",
    "        yield from f"
   ],
   "metadata": {
    "id": "Vx-D6MWAJTFV"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "import json\n",
    "import re\n",
    "from tqdm import tqdm\n",
    "\n",
    "# Candidate publication years: four-digit tokens in 1900-2049 delimited by word boundaries.\n",
    "# The pattern is intentionally loose; MIN_YEAR/MAX_YEAR below apply the real bounds.\n",
    "year_pattern = r'\\b(19\\d{2}|20[01234]\\d)\\b'\n",
    "MIN_YEAR, MAX_YEAR = 1960, 2024  # inclusive range of years accepted as valid\n",
    "\n",
    "# Papers with a valid journal-ref year and at least one 'stat.*' category.\n",
    "# Rebuilt from scratch on every run, so re-executing this cell never duplicates entries.\n",
    "filtered_metadata = []\n",
    "\n",
    "# get_metadata() is a generator that yields one raw line of the snapshot at a time.\n",
    "metadata = get_metadata()\n",
    "for line in tqdm(metadata):\n",
    "    # Tolerate blank lines (e.g. a trailing newline) instead of crashing in json.loads.\n",
    "    if not line.strip():\n",
    "        continue\n",
    "    paper = json.loads(line)  # Convert paper from JSON string to dictionary\n",
    "\n",
    "    # Extract year from `journal-ref`: the earliest in-range year it mentions.\n",
    "    year = None\n",
    "    if paper.get('journal-ref'):\n",
    "        valid_years = [\n",
    "            int(y)\n",
    "            for y in re.findall(year_pattern, paper['journal-ref'])\n",
    "            if MIN_YEAR <= int(y) <= MAX_YEAR\n",
    "        ]\n",
    "        if valid_years:\n",
    "            year = min(valid_years)\n",
    "\n",
    "    # Skip papers without a valid year\n",
    "    if year is None:\n",
    "        continue\n",
    "\n",
    "    # Check if any category starts with 'stat.'\n",
    "    # `or ''` covers an explicit null `categories` value, for which .get() returns\n",
    "    # None instead of the default and .split() would raise AttributeError.\n",
    "    categories = (paper.get('categories') or '').split()\n",
    "    if any(cat.startswith('stat.') for cat in categories):\n",
    "        # Add the paper to the filtered metadata\n",
    "        filtered_metadata.append(paper)\n",
    "\n",
    "# Output or save the filtered metadata\n",
    "print(f\"Found {len(filtered_metadata)} papers matching the criteria.\")\n"
   ],
   "metadata": {
    "id": "VMFkNWpoEpHa",
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "outputId": "046e4dce-3f15-48fd-b086-5b0be20b9518"
   },
   "execution_count": null,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stderr",
     "text": [
      "2647472it [01:18, 33733.67it/s]"
     ]
    },
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Found 18452 papers matching the criteria.\n"
     ]
    },
    {
     "output_type": "stream",
     "name": "stderr",
     "text": [
      "\n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "# Serialize the filtered papers to disk as a single JSON document.\n",
    "with open(\"filtered_metadata.json\", mode=\"w\") as out_file:\n",
    "    json.dump(obj=filtered_metadata, fp=out_file)"
   ],
   "metadata": {
    "id": "LTUO5gjYI3MY"
   },
   "execution_count": null,
   "outputs": []
  }
 ]
}
