{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "480896f7-fb04-47b9-9835-3642f3f6e402",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading tokenizer: bert-base-uncased...\n",
      "\n",
      "--- Processing SST-2 (In-Distribution) ---\n",
      "Processing Train...\n",
      "✅ Saved train (67349 samples)\n",
      "Processing Dev...\n",
      "✅ Saved dev (872 samples)\n",
      "\n",
      "--- Processing AGNews (Out-of-Distribution) ---\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "19689920f25844f7bfbb1ec6f2256d5f",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "README.md: 0.00B [00:00, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "cd782a1c582642d78999f706bc07ce8a",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "data/train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "de83e94fbeb647ea96be4ae8410559a5",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "data/test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "91f5f801a03549188dde381b765ad25c",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "28b0e98988ac4e168026411e6d316ccb",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processing OOD (AGnews)...\n",
      "✅ Saved ood (7600 samples)\n",
      "\n",
      "🎉 Pipeline Complete. Upload the 'processed_data' folder to your server.\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import numpy as np\n",
    "from datasets import load_dataset\n",
    "from transformers import AutoTokenizer\n",
    "\n",
    "# CONFIGURATION\n",
    "MAX_LEN = 64  # every example is padded/truncated to this many tokens\n",
    "SAVE_DIR = \"processed_data\"  # folder that receives the .npy arrays\n",
    "TOKENIZER_NAME = \"bert-base-uncased\"\n",
    "\n",
    "# exist_ok=True creates the folder when missing and is a no-op on re-runs,\n",
    "# which avoids the check-then-create race of a separate os.path.exists() test.\n",
    "os.makedirs(SAVE_DIR, exist_ok=True)\n",
    "\n",
    "print(f\"Loading tokenizer: {TOKENIZER_NAME}...\")\n",
    "tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)\n",
    "\n",
    "def save_numpy(prefix, ids, mask, labels):\n",
    "    \"\"\"Write one split's ids, mask and labels arrays into SAVE_DIR, then report the sample count.\"\"\"\n",
    "    targets = (\n",
    "        (f\"{SAVE_DIR}/{prefix}_ids.npy\", ids),\n",
    "        (f\"{SAVE_DIR}/{prefix}_mask.npy\", mask),\n",
    "        (f\"{SAVE_DIR}/{prefix}_labels.npy\", labels),\n",
    "    )\n",
    "    # One .npy file per array, written in the order ids, mask, labels.\n",
    "    for path, array in targets:\n",
    "        np.save(path, array)\n",
    "    print(f\"✅ Saved {prefix} ({len(labels)} samples)\")\n",
    "\n",
    "def process_batch(texts, tokenizer):\n",
    "    \"\"\"Encode a batch of texts with a fixed sequence length.\n",
    "\n",
    "    None entries become empty strings and every other entry is passed\n",
    "    through str(). The tokenizer is then called with padding='max_length'\n",
    "    and truncation at MAX_LEN, an attention mask, and return_tensors='np';\n",
    "    its result is returned unchanged.\n",
    "    \"\"\"\n",
    "    normalized = [\"\" if item is None else str(item) for item in texts]\n",
    "\n",
    "    # Options gathered in one place; they are forwarded as keyword arguments.\n",
    "    encode_options = dict(\n",
    "        add_special_tokens=True,\n",
    "        max_length=MAX_LEN,\n",
    "        padding='max_length',\n",
    "        truncation=True,\n",
    "        return_attention_mask=True,\n",
    "        return_tensors='np',\n",
    "    )\n",
    "    return tokenizer(normalized, **encode_options)\n",
    "\n",
    "# 1. SST-2: the in-distribution dataset\n",
    "\n",
    "print(\"\\n--- Processing SST-2 (In-Distribution) ---\")\n",
    "sst2 = load_dataset(\"glue\", \"sst2\")\n",
    "\n",
    "# Training split\n",
    "print(\"Processing Train...\")\n",
    "# The sentence column is cast to a plain list before tokenizing\n",
    "# (the original note says this is needed to satisfy Transformers).\n",
    "train_texts = list(sst2['train']['sentence'])\n",
    "train_enc = process_batch(train_texts, tokenizer)\n",
    "save_numpy(\n",
    "    prefix=\"train\",\n",
    "    ids=train_enc['input_ids'],\n",
    "    mask=train_enc['attention_mask'],\n",
    "    labels=np.array(sst2['train']['label']),\n",
    ")\n",
    "\n",
    "# Validation split, saved under the \"dev\" prefix\n",
    "print(\"Processing Dev...\")\n",
    "dev_texts = list(sst2['validation']['sentence'])\n",
    "dev_enc = process_batch(dev_texts, tokenizer)\n",
    "save_numpy(\n",
    "    prefix=\"dev\",\n",
    "    ids=dev_enc['input_ids'],\n",
    "    mask=dev_enc['attention_mask'],\n",
    "    labels=np.array(sst2['validation']['label']),\n",
    ")\n",
    "\n",
    "# 2. PROCESS AGNEWS (Out-of-Distribution)\n",
    "print(\"\\n--- Processing AGNews (Out-of-Distribution) ---\")\n",
    "agn = load_dataset(\"ag_news\")\n",
    "\n",
    "# The entire AG News test split is used as the OOD set; no subsampling is applied.\n",
    "print(\"Processing OOD (AGnews)...\")\n",
    "ood_texts = list(agn['test']['text'])\n",
    "ood_labels = list(agn['test']['label'])\n",
    "\n",
    "# Same fixed-length tokenization as the SST-2 splits, saved under the \"ood\" prefix.\n",
    "ood_enc = process_batch(ood_texts, tokenizer)\n",
    "save_numpy(\"ood\", ood_enc['input_ids'], ood_enc['attention_mask'], np.array(ood_labels))\n",
    "\n",
    "# Complete sentence so the reader knows what to do with the output folder.\n",
    "print(f\"\\n🎉 Pipeline Complete. Upload the '{SAVE_DIR}' folder to your server.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "6e105889-44b5-46d2-ba84-0056818dc53e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: numpy in /opt/anaconda3/lib/python3.12/site-packages (1.26.4)\n",
      "Requirement already satisfied: pandas in /opt/anaconda3/lib/python3.12/site-packages (2.2.2)\n",
      "\u001b[31mERROR: Could not find a version that satisfies the requirement random (from versions: none)\u001b[0m\u001b[31m\n",
      "\u001b[0m\u001b[31mERROR: No matching distribution found for random\u001b[0m\u001b[31m\n",
      "\u001b[0m"
     ]
    }
   ],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6206d71b-06d7-43f0-99e4-0015dd3e1029",
   "metadata": {},
   "outputs": [],
   "source": [
    "%pip install -q numpy datasets transformers"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
