{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "4d7ebe83",
   "metadata": {},
   "outputs": [
    {
     "ename": "FileNotFoundError",
     "evalue": "[Errno 2] No such file or directory: '/home/hli962/Chunhou Project/all_comments_since_2015.csv'",
     "output_type": "error",
     "traceback": [
      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
      "\u001b[31mFileNotFoundError\u001b[39m                         Traceback (most recent call last)",
      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[6]\u001b[39m\u001b[32m, line 4\u001b[39m\n\u001b[32m      1\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[34;01mpandas\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[34;01mpd\u001b[39;00m\n\u001b[32m      2\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[34;01mjson\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m4\u001b[39m df = \u001b[43mpd\u001b[49m\u001b[43m.\u001b[49m\u001b[43mread_csv\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43m/home/hli962/Chunhou Project/all_comments_since_2015.csv\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m      6\u001b[39m \u001b[38;5;66;03m# 删除空值\u001b[39;00m\n\u001b[32m      7\u001b[39m df = df.dropna(subset=[\u001b[33m\"\u001b[39m\u001b[33mauthor_flair_text\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33mbody\u001b[39m\u001b[33m\"\u001b[39m])\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1026\u001b[39m, in \u001b[36mread_csv\u001b[39m\u001b[34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)\u001b[39m\n\u001b[32m   1013\u001b[39m kwds_defaults = _refine_defaults_read(\n\u001b[32m   1014\u001b[39m     dialect,\n\u001b[32m   1015\u001b[39m     delimiter,\n\u001b[32m   (...)\u001b[39m\u001b[32m   1022\u001b[39m     dtype_backend=dtype_backend,\n\u001b[32m   1023\u001b[39m )\n\u001b[32m   1024\u001b[39m kwds.update(kwds_defaults)\n\u001b[32m-> \u001b[39m\u001b[32m1026\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/lib/python3.12/site-packages/pandas/io/parsers/readers.py:620\u001b[39m, in \u001b[36m_read\u001b[39m\u001b[34m(filepath_or_buffer, kwds)\u001b[39m\n\u001b[32m    617\u001b[39m _validate_names(kwds.get(\u001b[33m\"\u001b[39m\u001b[33mnames\u001b[39m\u001b[33m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m))\n\u001b[32m    619\u001b[39m \u001b[38;5;66;03m# Create the parser.\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m620\u001b[39m parser = \u001b[43mTextFileReader\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m    622\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m chunksize \u001b[38;5;129;01mor\u001b[39;00m iterator:\n\u001b[32m    623\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m parser\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1620\u001b[39m, in \u001b[36mTextFileReader.__init__\u001b[39m\u001b[34m(self, f, engine, **kwds)\u001b[39m\n\u001b[32m   1617\u001b[39m     \u001b[38;5;28mself\u001b[39m.options[\u001b[33m\"\u001b[39m\u001b[33mhas_index_names\u001b[39m\u001b[33m\"\u001b[39m] = kwds[\u001b[33m\"\u001b[39m\u001b[33mhas_index_names\u001b[39m\u001b[33m\"\u001b[39m]\n\u001b[32m   1619\u001b[39m \u001b[38;5;28mself\u001b[39m.handles: IOHandles | \u001b[38;5;28;01mNone\u001b[39;00m = \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m-> \u001b[39m\u001b[32m1620\u001b[39m \u001b[38;5;28mself\u001b[39m._engine = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_make_engine\u001b[49m\u001b[43m(\u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mengine\u001b[49m\u001b[43m)\u001b[49m\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1880\u001b[39m, in \u001b[36mTextFileReader._make_engine\u001b[39m\u001b[34m(self, f, engine)\u001b[39m\n\u001b[32m   1878\u001b[39m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[33m\"\u001b[39m\u001b[33mb\u001b[39m\u001b[33m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m mode:\n\u001b[32m   1879\u001b[39m         mode += \u001b[33m\"\u001b[39m\u001b[33mb\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m-> \u001b[39m\u001b[32m1880\u001b[39m \u001b[38;5;28mself\u001b[39m.handles = \u001b[43mget_handle\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m   1881\u001b[39m \u001b[43m    \u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   1882\u001b[39m \u001b[43m    \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   1883\u001b[39m \u001b[43m    \u001b[49m\u001b[43mencoding\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43moptions\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mencoding\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   1884\u001b[39m \u001b[43m    \u001b[49m\u001b[43mcompression\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43moptions\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mcompression\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   1885\u001b[39m \u001b[43m    
\u001b[49m\u001b[43mmemory_map\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43moptions\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mmemory_map\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   1886\u001b[39m \u001b[43m    \u001b[49m\u001b[43mis_text\u001b[49m\u001b[43m=\u001b[49m\u001b[43mis_text\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   1887\u001b[39m \u001b[43m    \u001b[49m\u001b[43merrors\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43moptions\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mencoding_errors\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mstrict\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   1888\u001b[39m \u001b[43m    \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43moptions\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mstorage_options\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   1889\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m   1890\u001b[39m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m.handles \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m   1891\u001b[39m f = \u001b[38;5;28mself\u001b[39m.handles.handle\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/lib/python3.12/site-packages/pandas/io/common.py:873\u001b[39m, in \u001b[36mget_handle\u001b[39m\u001b[34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[39m\n\u001b[32m    868\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(handle, \u001b[38;5;28mstr\u001b[39m):\n\u001b[32m    869\u001b[39m     \u001b[38;5;66;03m# Check whether the filename is to be opened in binary mode.\u001b[39;00m\n\u001b[32m    870\u001b[39m     \u001b[38;5;66;03m# Binary mode does not support 'encoding' and 'newline'.\u001b[39;00m\n\u001b[32m    871\u001b[39m     \u001b[38;5;28;01mif\u001b[39;00m ioargs.encoding \u001b[38;5;129;01mand\u001b[39;00m \u001b[33m\"\u001b[39m\u001b[33mb\u001b[39m\u001b[33m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m ioargs.mode:\n\u001b[32m    872\u001b[39m         \u001b[38;5;66;03m# Encoding\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m873\u001b[39m         handle = \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\n\u001b[32m    874\u001b[39m \u001b[43m            \u001b[49m\u001b[43mhandle\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    875\u001b[39m \u001b[43m            \u001b[49m\u001b[43mioargs\u001b[49m\u001b[43m.\u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    876\u001b[39m \u001b[43m            \u001b[49m\u001b[43mencoding\u001b[49m\u001b[43m=\u001b[49m\u001b[43mioargs\u001b[49m\u001b[43m.\u001b[49m\u001b[43mencoding\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    877\u001b[39m \u001b[43m            \u001b[49m\u001b[43merrors\u001b[49m\u001b[43m=\u001b[49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    878\u001b[39m \u001b[43m            \u001b[49m\u001b[43mnewline\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m    879\u001b[39m \u001b[43m        
\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m    880\u001b[39m     \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m    881\u001b[39m         \u001b[38;5;66;03m# Binary mode\u001b[39;00m\n\u001b[32m    882\u001b[39m         handle = \u001b[38;5;28mopen\u001b[39m(handle, ioargs.mode)\n",
      "\u001b[31mFileNotFoundError\u001b[39m: [Errno 2] No such file or directory: '/home/hli962/Chunhou Project/all_comments_since_2015.csv'"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import json\n",
    "\n",
    "# Input and output locations, relative to the notebook's working directory.\n",
    "# An absolute home-directory path is not portable across machines; the\n",
    "# author-grouping cell further down reads this same file by its relative name.\n",
    "comments_path = \"all_comments_since_2015.csv\"\n",
    "output_path = \"posts_chunked.json\"\n",
    "chunk_size = 10000  # rows converted to dicts per slice\n",
    "\n",
    "# Load only the two exported columns, drop rows missing either one, and\n",
    "# rename them to the type/posts schema that the later cells read.\n",
    "df = pd.read_csv(comments_path, usecols=[\"author_flair_text\", \"body\"])\n",
    "df = df.dropna(subset=[\"author_flair_text\", \"body\"])\n",
    "df = df[[\"author_flair_text\", \"body\"]].rename(columns={\n",
    "    \"author_flair_text\": \"type\",\n",
    "    \"body\": \"posts\"\n",
    "})\n",
    "\n",
    "# Write one JSON array, serialising the frame slice by slice so that only\n",
    "# chunk_size records exist as Python dicts at any time.\n",
    "total = len(df)\n",
    "with open(output_path, \"w\", encoding=\"utf-8\") as f:\n",
    "    f.write(\"[\\n\")\n",
    "    for start in range(0, total, chunk_size):\n",
    "        records = df.iloc[start:start + chunk_size].to_dict(orient=\"records\")\n",
    "        for offset, record in enumerate(records):\n",
    "            json.dump(record, f, ensure_ascii=False)\n",
    "            # Every record except the very last is followed by a comma.\n",
    "            f.write(\",\\n\" if start + offset + 1 < total else \"\\n\")\n",
    "    f.write(\"]\\n\")\n",
    "\n",
    "print(f\"✅ 分批导出完成，共 {total} 条，保存至：{output_path}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "604bcbe8",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "🧹 Cleaning batch 1: 100%|██████████| 600000/600000 [00:53<00:00, 11132.45it/s]\n",
      "📖 Streaming JSON: 797578it [00:58, 10804.01it/s]  "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "📁 已保存：cleaned_batches/batch_001.json\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "🧹 Cleaning batch 2: 100%|██████████| 600000/600000 [00:55<00:00, 10846.30it/s]\n",
      "📖 Streaming JSON: 1390987it [01:57, 9924.77it/s] "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "📁 已保存：cleaned_batches/batch_002.json\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "🧹 Cleaning batch 3: 100%|██████████| 600000/600000 [00:55<00:00, 10737.25it/s]\n",
      "📖 Streaming JSON: 1990829it [02:58, 9840.24it/s] "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "📁 已保存：cleaned_batches/batch_003.json\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "🧹 Cleaning batch 4: 100%|██████████| 600000/600000 [00:55<00:00, 10756.98it/s]\n",
      "📖 Streaming JSON: 2586817it [03:58, 9803.88it/s] "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "📁 已保存：cleaned_batches/batch_004.json\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "🧹 Cleaning batch 5: 100%|██████████| 600000/600000 [00:56<00:00, 10646.30it/s]\n",
      "📖 Streaming JSON: 3083561it [04:58, 5482.86it/s] "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "📁 已保存：cleaned_batches/batch_005.json\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "🧹 Cleaning batch 6: 100%|██████████| 600000/600000 [00:56<00:00, 10588.23it/s]\n",
      "📖 Streaming JSON: 3796380it [05:59, 8722.89it/s] "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "📁 已保存：cleaned_batches/batch_006.json\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "🧹 Cleaning batch 7: 100%|██████████| 600000/600000 [00:48<00:00, 12270.27it/s]\n",
      "📖 Streaming JSON: 4362763it [06:52, 8915.18it/s] "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "📁 已保存：cleaned_batches/batch_007.json\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "🧹 Cleaning batch 8: 100%|██████████| 600000/600000 [00:52<00:00, 11532.29it/s]\n",
      "📖 Streaming JSON: 4874505it [07:49, 5287.30it/s] "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "📁 已保存：cleaned_batches/batch_008.json\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "🧹 Cleaning batch 9: 100%|██████████| 600000/600000 [00:52<00:00, 11412.09it/s]\n",
      "📖 Streaming JSON: 5465660it [08:46, 5101.10it/s] "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "📁 已保存：cleaned_batches/batch_009.json\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "🧹 Cleaning batch 10: 100%|██████████| 600000/600000 [00:53<00:00, 11265.60it/s]\n",
      "📖 Streaming JSON: 6144250it [09:44, 7294.49it/s] "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "📁 已保存：cleaned_batches/batch_010.json\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "🧹 Cleaning batch 11: 100%|██████████| 600000/600000 [00:53<00:00, 11222.80it/s]\n",
      "📖 Streaming JSON: 6668904it [10:42, 4631.88it/s] "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "📁 已保存：cleaned_batches/batch_011.json\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "📖 Streaming JSON: 6767705it [10:42, 10535.42it/s]\n",
      "🧹 Cleaning batch 12: 100%|██████████| 167705/167705 [00:14<00:00, 11324.56it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "📁 已保存：cleaned_batches/batch_012.json\n",
      "\n",
      "✅ 清洗完成，共 6767705 条数据，分为 12 批。\n"
     ]
    }
   ],
   "source": [
    "# Clean the PANDORA comment data\n",
    "import os\n",
    "import re\n",
    "import json\n",
    "import nltk\n",
    "import ijson\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.stem import WordNetLemmatizer\n",
    "\n",
    "# --- One-time setup: uncomment to fetch the NLTK resources used below ---\n",
    "# nltk.download(\"punkt\")\n",
    "# nltk.download(\"stopwords\")\n",
    "# nltk.download(\"wordnet\")\n",
    "\n",
    "# Shared by clean_text: the lemmatizer and the English stop-word set.\n",
    "lemmatizer = WordNetLemmatizer()\n",
    "stop_words = set(stopwords.words(\"english\"))\n",
    "\n",
    "def clean_text(text):\n",
    "    \"\"\"Normalise one raw post into a space-joined string of lemmas.\n",
    "\n",
    "    The text is lower-cased, URLs and '|' characters are blanked out, and\n",
    "    every character other than a-z or whitespace is replaced by a space.\n",
    "    The remainder is tokenized with NLTK; stop words and tokens of one or\n",
    "    two characters are dropped, and the surviving tokens are lemmatized.\n",
    "    \"\"\"\n",
    "    lowered = text.lower()\n",
    "    without_urls = re.sub(r'(https?://\\S+|www\\.\\S+)', ' ', lowered)\n",
    "    without_pipes = without_urls.replace('|', ' ')\n",
    "    letters_only = re.sub(r'[^a-z\\s]', ' ', without_pipes)\n",
    "\n",
    "    lemmas = []\n",
    "    for word in nltk.word_tokenize(letters_only):\n",
    "        if len(word) <= 2 or word in stop_words:\n",
    "            continue\n",
    "        lemmas.append(lemmatizer.lemmatize(word))\n",
    "    return ' '.join(lemmas)\n",
    "\n",
    "def preprocess_large_flat_json_stream(input_path, output_dir=\"cleaned_batches\", batch_size=1000):\n",
    "    \"\"\"Stream a top-level JSON array from disk and clean it batch by batch.\n",
    "\n",
    "    Args:\n",
    "        input_path: Path to a JSON file whose root is an array of records.\n",
    "        output_dir: Directory for the cleaned batch files; created if missing.\n",
    "        batch_size: Number of records buffered before a batch is written.\n",
    "\n",
    "    Every full batch, and the final partial one, is passed to\n",
    "    save_cleaned_batch with a 1-based batch number. A summary line with the\n",
    "    record and batch totals is printed at the end.\n",
    "    \"\"\"\n",
    "    os.makedirs(output_dir, exist_ok=True)\n",
    "\n",
    "    batch = []\n",
    "    batches_written = 0\n",
    "    total = 0\n",
    "\n",
    "    with open(input_path, 'r', encoding='utf-8') as f:\n",
    "        # ijson yields the array elements one at a time instead of parsing the whole file.\n",
    "        for item in tqdm(ijson.items(f, 'item'), desc=\"📖 Streaming JSON\"):\n",
    "            batch.append(item)\n",
    "            total += 1\n",
    "\n",
    "            if len(batch) == batch_size:\n",
    "                batches_written += 1\n",
    "                save_cleaned_batch(batch, output_dir, batches_written)\n",
    "                batch = []\n",
    "\n",
    "        # Flush the final, partially filled batch.\n",
    "        if batch:\n",
    "            batches_written += 1\n",
    "            save_cleaned_batch(batch, output_dir, batches_written)\n",
    "\n",
    "    # Batches are counted as they are written, so the summary stays exact when\n",
    "    # the input is empty or its length is an exact multiple of batch_size.\n",
    "    print(f\"\\n✅ 清洗完成，共 {total} 条数据，分为 {batches_written} 批。\")\n",
    "\n",
    "def save_cleaned_batch(batch, output_dir, batch_index):\n",
    "    \"\"\"Clean the posts of one batch and write the batch as batch_NNN.json.\n",
    "\n",
    "    Args:\n",
    "        batch: List of record dicts, each with a 'posts' entry.\n",
    "        output_dir: Directory that receives the JSON file.\n",
    "        batch_index: Number shown in the progress label and zero-padded into the file name.\n",
    "    \"\"\"\n",
    "    frame = pd.DataFrame(batch)\n",
    "\n",
    "    # Register progress_apply on pandas objects with a per-batch label.\n",
    "    tqdm.pandas(desc=f\"🧹 Cleaning batch {batch_index}\")\n",
    "    raw_posts = frame[\"posts\"].astype(str)\n",
    "    frame[\"posts_cleaned\"] = raw_posts.progress_apply(clean_text)\n",
    "\n",
    "    records = frame.to_dict(orient=\"records\")\n",
    "    json_path = os.path.join(output_dir, f\"batch_{batch_index:03}.json\")\n",
    "    with open(json_path, 'w', encoding='utf-8') as f:\n",
    "        json.dump(records, f, ensure_ascii=False, indent=2)\n",
    "\n",
    "    print(f\"📁 已保存：{json_path}\")\n",
    "\n",
    "\n",
    "# --- Entry point ---\n",
    "if __name__ == \"__main__\":\n",
    "    stream_options = {\n",
    "        \"input_path\": \"posts_chunked.json\",\n",
    "        \"output_dir\": \"cleaned_batches\",\n",
    "        \"batch_size\": 600000,\n",
    "    }\n",
    "    preprocess_large_flat_json_stream(**stream_options)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "914bde3c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ 合并完成，共 6835705 条，保存至：merged_cleaned.json\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import json\n",
    "import glob\n",
    "\n",
    "# Batch directory and merged output file\n",
    "input_dir = \"cleaned_batches\"\n",
    "output_path = \"merged_cleaned.json\"\n",
    "\n",
    "# Collect the batch files. The sort is lexicographic, which coincides with\n",
    "# numeric order for the zero-padded three-digit names the cleaning step writes.\n",
    "batch_pattern = os.path.join(input_dir, \"batch_*.json\")\n",
    "json_files = glob.glob(batch_pattern)\n",
    "json_files.sort()\n",
    "\n",
    "# Append the records of every batch, in file order, to one list\n",
    "merged_data = []\n",
    "for file in json_files:\n",
    "    with open(file, \"r\", encoding=\"utf-8\") as f:\n",
    "        data = json.load(f)\n",
    "    merged_data.extend(data)\n",
    "\n",
    "# Write the merged list as a single JSON file\n",
    "with open(output_path, \"w\", encoding=\"utf-8\") as f_out:\n",
    "    json.dump(merged_data, f_out, ensure_ascii=False, indent=2)\n",
    "\n",
    "print(f\"✅ 合并完成，共 {len(merged_data)} 条，保存至：{output_path}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "028f2654",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "过滤非法类型后的样本数：\n",
      "type\n",
      "ENFJ      6467\n",
      "ENFP     48932\n",
      "ENTJ     13457\n",
      "ENTP     53810\n",
      "ESFJ       691\n",
      "ESFP      3483\n",
      "ESTJ      3695\n",
      "ESTP      9174\n",
      "INFJ     44511\n",
      "INFP     26245\n",
      "INTJ    171447\n",
      "INTP    220288\n",
      "ISFJ      2179\n",
      "ISFP      6789\n",
      "ISTJ      8430\n",
      "ISTP     21752\n",
      "Name: count, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import json\n",
    "\n",
    "# Load the merged, cleaned records\n",
    "with open('merged_cleaned_pandora.json', 'r', encoding='utf-8') as f:\n",
    "    data = json.load(f)\n",
    "\n",
    "# The 16 valid MBTI type codes\n",
    "valid_types = {\n",
    "    'INTJ', 'INTP', 'ENTJ', 'ENTP',\n",
    "    'INFJ', 'INFP', 'ENFJ', 'ENFP',\n",
    "    'ISTJ', 'ISFJ', 'ESTJ', 'ESFJ',\n",
    "    'ISTP', 'ISFP', 'ESTP', 'ESFP'\n",
    "}\n",
    "\n",
    "df = pd.DataFrame(data)\n",
    "\n",
    "# Keep rows whose cleaned text is longer than 10 characters and whose label is valid\n",
    "long_enough = df['posts_cleaned'].str.len() > 10\n",
    "has_valid_type = df['type'].isin(valid_types)\n",
    "df = df[long_enough & has_valid_type]\n",
    "\n",
    "# Per-type sample counts, listed alphabetically by type\n",
    "type_counts = df['type'].value_counts()\n",
    "print(\"过滤非法类型后的样本数：\")\n",
    "print(type_counts.sort_index())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8605e982",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package stopwords to /home/hli962/nltk_data...\n",
      "[nltk_data]   Package stopwords is already up-to-date!\n",
      "/tmp/ipykernel_2220419/741978598.py:43: FutureWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
      "  df_limited = df.groupby('type').apply(\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ 数据拆分完成，合法类型已筛选，并保存为 JSON 格式。\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split\n",
    "import re\n",
    "import nltk\n",
    "from nltk.corpus import stopwords\n",
    "\n",
    "# Fetch the NLTK stop-word corpus if needed, then build the English stop-word set\n",
    "nltk.download('stopwords')\n",
    "english_stopwords = stopwords.words('english')\n",
    "stop_words = set(english_stopwords)\n",
    "\n",
    "\n",
    "# Load the merged, cleaned records into a DataFrame\n",
    "with open('merged_cleaned_pandora.json', 'r', encoding='utf-8') as f:\n",
    "    data = json.load(f)\n",
    "df = pd.DataFrame(data)\n",
    "\n",
    "# Keep only rows whose cleaned text has more than MIN_VALID_CHARS letters or CJK characters\n",
    "MIN_VALID_CHARS = 30\n",
    "VALID_CHAR_PATTERN = re.compile(r'[A-Za-z\\u4e00-\\u9fa5]')\n",
    "\n",
    "def count_valid_characters(text):\n",
    "    \"\"\"Count the ASCII letters and CJK characters (U+4E00 to U+9FA5) in text.\"\"\"\n",
    "    return len(VALID_CHAR_PATTERN.findall(text))\n",
    "\n",
    "enough_chars = df['posts_cleaned'].apply(count_valid_characters) > MIN_VALID_CHARS\n",
    "df = df[enough_chars].copy()\n",
    "\n",
    "# Drop English stop words from the cleaned text\n",
    "def remove_stopwords(text):\n",
    "    \"\"\"Return text without the whitespace-separated words whose lower-case form is in stop_words.\"\"\"\n",
    "    kept_words = []\n",
    "    # The text is split on whitespace only; no tokenizer is run again here.\n",
    "    for word in text.split():\n",
    "        if word.lower() in stop_words:\n",
    "            continue\n",
    "        kept_words.append(word)\n",
    "    return ' '.join(kept_words)\n",
    "\n",
    "cleaned_without_stopwords = df['posts_cleaned'].apply(remove_stopwords)\n",
    "df['posts_cleaned'] = cleaned_without_stopwords\n",
    "\n",
    "# Keep only the 16 valid MBTI type codes\n",
    "valid_types = {\n",
    "    'INTJ', 'INTP', 'ENTJ', 'ENTP',\n",
    "    'INFJ', 'INFP', 'ENFJ', 'ENFP',\n",
    "    'ISTJ', 'ISFJ', 'ESTJ', 'ESFJ',\n",
    "    'ISTP', 'ISFP', 'ESTP', 'ESFP'\n",
    "}\n",
    "df = df[df['type'].isin(valid_types)].copy()\n",
    "\n",
    "# Cap every type at 2000 randomly sampled rows (fixed seed). Sampling each\n",
    "# group explicitly avoids DataFrameGroupBy.apply operating on the grouping\n",
    "# column, which pandas has deprecated.\n",
    "per_type_samples = [\n",
    "    group.sample(n=min(2000, len(group)), random_state=42)\n",
    "    for _, group in df.groupby('type')\n",
    "]\n",
    "# pd.concat rejects an empty list, so fall back to an empty slice of df.\n",
    "df_limited = (\n",
    "    pd.concat(per_type_samples) if per_type_samples else df.iloc[:0]\n",
    ").reset_index(drop=True)\n",
    "\n",
    "# Labels (MBTI type) and features (cleaned text)\n",
    "y = df_limited['type']\n",
    "X = df_limited['posts_cleaned']\n",
    "\n",
    "# 70% goes to train; the remaining 30% is halved into validation and test\n",
    "# (15% each). Both splits are stratified by label and use the same seed.\n",
    "X_train, X_temp, y_train, y_temp = train_test_split(\n",
    "    X, y, test_size=0.3, random_state=42, stratify=y\n",
    ")\n",
    "X_val, X_test, y_val, y_test = train_test_split(\n",
    "    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp\n",
    ")\n",
    "\n",
    "# Rebuild full records (posts_cleaned, type, posts) for each split\n",
    "def reconstruct_json(X_subset, y_subset, full_df):\n",
    "    \"\"\"Return the rows of full_df that belong to one split as a list of dicts.\n",
    "\n",
    "    Rows are selected by the index labels that X_subset carries over from\n",
    "    full_df. Joining on (type, posts_cleaned) instead would emit one row per\n",
    "    matching pair, so duplicated texts would be multiplied and could pull the\n",
    "    same original post into more than one split.\n",
    "\n",
    "    Args:\n",
    "        X_subset: Series of cleaned posts taken from full_df, index preserved.\n",
    "        y_subset: Labels aligned with X_subset; unused, kept so existing calls still work.\n",
    "        full_df: Frame with a unique index and type, posts and posts_cleaned columns.\n",
    "\n",
    "    Returns:\n",
    "        Records with the keys posts_cleaned, type and posts, in that order.\n",
    "    \"\"\"\n",
    "    split_rows = full_df.loc[X_subset.index, ['posts_cleaned', 'type', 'posts']]\n",
    "    return split_rows.to_dict(orient='records')\n",
    "\n",
    "train_data = reconstruct_json(X_train, y_train, df_limited)\n",
    "val_data = reconstruct_json(X_val, y_val, df_limited)\n",
    "test_data = reconstruct_json(X_test, y_test, df_limited)\n",
    "\n",
    "# Write each split to its own JSON file, in train / val / test order\n",
    "split_outputs = {\n",
    "    'pandora_train.json': train_data,\n",
    "    'pandora_val.json': val_data,\n",
    "    'pandora_test.json': test_data,\n",
    "}\n",
    "for split_path, split_records in split_outputs.items():\n",
    "    with open(split_path, 'w', encoding='utf-8') as f:\n",
    "        json.dump(split_records, f, ensure_ascii=False, indent=2)\n",
    "\n",
    "# Completion message\n",
    "print(\"✅ 数据拆分完成，合法类型已筛选，并保存为 JSON 格式。\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "0590d524",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "save to: pandora_processed_comments.csv\n"
     ]
    }
   ],
   "source": [
    "# -*- coding: utf-8 -*-\n",
    "import pandas as pd\n",
    "\n",
    "author_profiles_path = \"author_profiles.csv\"\n",
    "comments_path = \"all_comments_since_2015.csv\"\n",
    "output_path = \"pandora_processed_comments.csv\"\n",
    "\n",
    "author_profiles = pd.read_csv(author_profiles_path)\n",
    "\n",
    "# Map each author to an upper-cased MBTI label. Rows whose label is not a\n",
    "# string (for example a missing value) are skipped. Zipping the two columns\n",
    "# avoids building a Series per row as iterrows() does.\n",
    "user_to_mbti = {\n",
    "    author: mbti.upper()\n",
    "    for author, mbti in zip(author_profiles['author'], author_profiles['mbti'])\n",
    "    if isinstance(mbti, str)\n",
    "}\n",
    "\n",
    "# Keep only comments written by authors that have a label\n",
    "comments = pd.read_csv(comments_path)\n",
    "comments = comments[comments['author'].isin(list(user_to_mbti))]\n",
    "\n",
    "# Join every non-null comment of an author with the '|||' separator.\n",
    "# astype(str) keeps str.join from raising TypeError on a body that was\n",
    "# parsed as a number.\n",
    "grouped_comments = (\n",
    "    comments.groupby('author')['body']\n",
    "    .apply(lambda bodies: '|||'.join(bodies.dropna().astype(str)))\n",
    "    .reset_index()\n",
    ")\n",
    "grouped_comments['type'] = grouped_comments['author'].map(user_to_mbti)\n",
    "\n",
    "# Output schema: type, posts\n",
    "result = grouped_comments[['type', 'body']].rename(columns={'body': 'posts'})\n",
    "result.to_csv(output_path, index=False)\n",
    "\n",
    "print(\"save to:\", output_path)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "4df6fb6e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The following data contains invalid MBTI types and has been excluded:\n",
      "      type                                              posts\n",
      "1180  INTX  1) Character pick doesn't matter until the reg...\n",
      "1296  INTX  This is why whenever someone likes me, I usual...\n",
      "1407  INTX  Eh, I tried to get into them but I couldn't. B...\n",
      "2904  XNFP  When stressed I tend to withdraw and draw into...\n",
      "3058  INFX  You’re definitely correct to hesitate acceptin...\n",
      "3069  INFX  Yeah I've read it before I think. Maybe he wro...\n",
      "3130  EXXP  And twice as many as Miggy|||Request to see th...\n",
      "4382  XNTP  He should stop posting on Reddit because we ar...\n",
      "4889  XNFX  YESSS. This is so pretty and magical! I wish m...\n",
      "4926  INTX  I'm actually a freshman so I've only taken the...\n",
      "5058  INFX  There was also [this makefile](https://github....\n",
      "5751  INXJ  I beg to differ. TYPE_MENTION's can be scienti...\n",
      "5798  XNTP  LOL|||I'd love to hear your arguments|||he's n...\n",
      "6746  XSFP  If they have perceiving first it means they'll...\n",
      "6954  INFX  Dollar for you in dollar? Doing cats? What is ...\n",
      "7407  XNXJ  They recently did something similar in Austral...\n",
      "7484  INXX  hey, have you seen this picture?  http://psyph...\n",
      "Data has been validated and processed, saved to: filtered_processed_comments.csv\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# The 16 valid MBTI type codes\n",
    "VALID_MBTIs = {\n",
    "    'INTJ', 'INTP', 'INFP', 'ENTP',\n",
    "    'ISTP', 'ISFP', 'ESTJ', 'ISTJ',\n",
    "    'ESTP', 'ISFJ', 'ENFP', 'ESFP',\n",
    "    'ESFJ', 'ENFJ', 'INFJ', 'ENTJ',\n",
    "}\n",
    "\n",
    "# Input and output CSV files\n",
    "input_path = \"pandora_processed_comments.csv\"\n",
    "output_path = \"filtered_processed_comments.csv\"\n",
    "\n",
    "data = pd.read_csv(input_path)\n",
    "\n",
    "# Evaluate membership once and split the rows into kept and rejected\n",
    "is_valid = data['type'].isin(VALID_MBTIs)\n",
    "filtered_data = data[is_valid]\n",
    "invalid_data = data[~is_valid]\n",
    "\n",
    "# Report the rejected rows, if there are any\n",
    "if len(invalid_data.index) > 0:\n",
    "    print(\"The following data contains invalid MBTI types and has been excluded:\")\n",
    "    print(invalid_data)\n",
    "\n",
    "# Save the rows with a valid type\n",
    "filtered_data.to_csv(output_path, index=False)\n",
    "\n",
    "print(\"Data has been validated and processed, saved to:\", output_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "43c57349",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "📊 筛选后样本总数：1261235\n",
      "type  count proportion\n",
      "INTP 332458     26.36%\n",
      "INTJ 311165     24.67%\n",
      "INFJ 119530      9.48%\n",
      "INFP 111786      8.86%\n",
      "ENTP 108536      8.61%\n",
      "ENFP  60466      4.79%\n",
      "ISTP  57552      4.56%\n",
      "ENTJ  50234      3.98%\n",
      "ISTJ  41010      3.25%\n",
      "ENFJ  17575      1.39%\n",
      "ISFP  14853      1.18%\n",
      "ISFJ  11517      0.91%\n",
      "ESTP   8893      0.71%\n",
      "ESFP   7343      0.58%\n",
      "ESTJ   4856      0.39%\n",
      "ESFJ   3461      0.27%\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "import pandas as pd\n",
    "from collections import Counter\n",
    "\n",
    "# Path of the JSON file to summarise; adjust as needed\n",
    "input_path = \"filtered_processed_comments_300.json\"\n",
    "\n",
    "# Load the records\n",
    "with open(input_path, \"r\", encoding=\"utf-8\") as f:\n",
    "    data = json.load(f)\n",
    "\n",
    "# Count records per upper-cased MBTI label\n",
    "counter = Counter()\n",
    "for record in data:\n",
    "    counter[record[\"type\"].upper()] += 1\n",
    "total = sum(counter.values())\n",
    "\n",
    "# One row per type, most frequent first, with its share of the total\n",
    "stat_rows = []\n",
    "for mbti_type, count in counter.most_common():\n",
    "    stat_rows.append(\n",
    "        {\"type\": mbti_type, \"count\": count, \"proportion\": f\"{count / total:.2%}\"}\n",
    "    )\n",
    "df_stats = pd.DataFrame(stat_rows)\n",
    "\n",
    "# Print the total and the table\n",
    "print(f\"\\n📊 筛选后样本总数：{total}\")\n",
    "print(df_stats.to_string(index=False))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "26e9e6e4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ 已保存为 JSON 文件: filtered_processed_comments.json\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Source CSV and target JSON file\n",
    "csv_path = \"filtered_processed_comments.csv\"\n",
    "json_path = \"filtered_processed_comments.json\"\n",
    "\n",
    "# Read the CSV and write it back out as a JSON array with one object per row\n",
    "df = pd.read_csv(csv_path)\n",
    "json_options = {\"orient\": \"records\", \"force_ascii\": False, \"indent\": 2}\n",
    "df.to_json(json_path, **json_options)\n",
    "\n",
    "print(\"✅ 已保存为 JSON 文件:\", json_path)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
