{
 "nbformat": 4,
 "nbformat_minor": 0,
 "metadata": {
  "colab": {
   "provenance": []
  },
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3"
  },
  "language_info": {
   "name": "python"
  }
 },
 "cells": [
  {
   "cell_type": "code",
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import gc\n",
    "import os\n",
    "import json\n",
    "import nltk\n",
    "nltk.download('punkt')  # Download the sentence tokenizer\n",
    "from nltk.tokenize import sent_tokenize\n",
    "nltk.download('punkt_tab')"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "gDcVefqCOgF5",
    "outputId": "5ffc8178-31ff-4cfa-b798-b77e91291df3",
    "ExecuteTime": {
     "end_time": "2025-09-16T15:06:14.895089Z",
     "start_time": "2025-09-16T15:06:13.122373Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package punkt to /Users/zixuanwu/nltk_data...\n",
      "[nltk_data]   Package punkt is already up-to-date!\n",
      "[nltk_data] Downloading package punkt_tab to\n",
      "[nltk_data]     /Users/zixuanwu/nltk_data...\n",
      "[nltk_data]   Package punkt_tab is already up-to-date!\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 1
  },
  {
   "cell_type": "code",
   "source": [
    "import json\n",
    "f= open(\"filtered_metadata.json\")\n",
    "filtered_data = json.load(f)"
   ],
   "metadata": {
    "id": "mz7puZZ9MTAI",
    "ExecuteTime": {
     "end_time": "2025-09-16T15:06:15.280635Z",
     "start_time": "2025-09-16T15:06:14.950671Z"
    }
   },
   "outputs": [],
   "execution_count": 2
  },
  {
   "cell_type": "code",
   "source": [
    "df = pd.DataFrame(filtered_data)\n",
    "\n",
    "df[\"update_date\"] = pd.to_datetime(df[\"update_date\"])"
   ],
   "metadata": {
    "id": "-vQ9XqVx7U4o",
    "ExecuteTime": {
     "end_time": "2025-09-16T15:06:19.191673Z",
     "start_time": "2025-09-16T15:06:19.138011Z"
    }
   },
   "outputs": [],
   "execution_count": 4
  },
  {
   "cell_type": "code",
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Create initial DataFrame\n",
    "df[\"doc_id\"] = df[\"id\"]\n",
    "\n",
    "# Step 1: Extract (doc_id, stat_category) pairs\n",
    "rows = []\n",
    "for entry in filtered_data:\n",
    "    doc_id = entry['id']\n",
    "    abstract = entry['abstract']\n",
    "    categories = [cat for cat in entry['categories'].split() if cat.startswith(\"stat.\")]\n",
    "    if len(categories) == 1:  # Keep only if there's exactly one stat category\n",
    "        rows.append({\n",
    "            \"doc_id\": doc_id,\n",
    "            \"abstract\": abstract,\n",
    "            \"stat_category\": categories[0],\n",
    "            \"update_date\": entry[\"update_date\"]\n",
    "        })\n",
    "\n",
    "# Step 2: Create DataFrame with unique (abstract, label) pairs\n",
    "df_unique = pd.DataFrame(rows).drop_duplicates(subset=[\"doc_id\", \"stat_category\"])\n"
   ],
   "metadata": {
    "id": "yp0-e8Gh7-Lt",
    "ExecuteTime": {
     "end_time": "2025-09-16T15:06:20.498519Z",
     "start_time": "2025-09-16T15:06:20.428787Z"
    }
   },
   "outputs": [],
   "execution_count": 5
  },
  {
   "cell_type": "code",
   "source": [
    "df_unique.shape"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "EtmDiCLD8Nal",
    "outputId": "3618ad47-1a8f-42c0-d8d1-c73911a527e9",
    "ExecuteTime": {
     "end_time": "2025-09-16T15:06:27.517081Z",
     "start_time": "2025-09-16T15:06:27.512062Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(15278, 4)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 6
  },
  {
   "cell_type": "code",
   "source": [
    "df_unique = df_unique.loc[df_unique[\"update_date\"] > \"2021-01-01\"]\n",
    "df_unique = df_unique.loc[df_unique[\"stat_category\"] != \"stat.OT\"]"
   ],
   "metadata": {
    "id": "mzeArcuT7wVx",
    "ExecuteTime": {
     "end_time": "2025-09-16T15:06:28.664146Z",
     "start_time": "2025-09-16T15:06:28.627973Z"
    }
   },
   "outputs": [],
   "execution_count": 7
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-16T15:07:40.685218Z",
     "start_time": "2025-09-16T15:07:40.642295Z"
    }
   },
   "cell_type": "code",
   "source": [
    "df_sampled = (\n",
    "    df_unique.groupby(\"stat_category\", group_keys=False)\n",
    "      .apply(lambda x: x.sample(n=200, replace=False))\n",
    "      .sample(frac=1, random_state=42)   # optional: shuffle all groups together\n",
    ")"
   ],
   "outputs": [],
   "execution_count": 8
  },
  {
   "cell_type": "code",
   "source": [
    "# Step 3: Output lists\n",
    "abstracts = df_sampled[\"abstract\"].tolist()\n",
    "labels = df_sampled[\"stat_category\"].tolist()\n"
   ],
   "metadata": {
    "id": "44I-yViFMe2X",
    "ExecuteTime": {
     "end_time": "2025-09-16T15:07:42.103074Z",
     "start_time": "2025-09-16T15:07:42.099058Z"
    }
   },
   "outputs": [],
   "execution_count": 9
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-16T15:07:43.259527Z",
     "start_time": "2025-09-16T15:07:43.254623Z"
    }
   },
   "cell_type": "code",
   "source": [
    "df_train = df_sampled.iloc[:800, :].reset_index(drop = True)\n",
    "df_test = df_sampled.iloc[800:, :].reset_index(drop = True)\n"
   ],
   "outputs": [],
   "execution_count": 10
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-16T15:07:44.519132Z",
     "start_time": "2025-09-16T15:07:44.480638Z"
    }
   },
   "cell_type": "code",
   "source": [
    "import pandas as pd\n",
    "\n",
    "df_train.to_csv(\"train.csv\", index = False)\n",
    "df_test.to_csv(\"test.csv\", index = False)"
   ],
   "outputs": [],
   "execution_count": 11
  }
 ]
}
