{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "0f120cac",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import re\n",
    "import nltk\n",
    "import json\n",
    "import os\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.stem import WordNetLemmatizer\n",
    "from tqdm import tqdm\n",
    "\n",
    "# 第一次运行请取消注释这两行以确保下载需要的资源\n",
    "# nltk.download(\"punkt\")\n",
    "# nltk.download(\"stopwords\")\n",
    "# nltk.download(\"wordnet\")\n",
    "\n",
    "# 初始化处理器\n",
    "stop_words = set(stopwords.words(\"english\"))\n",
    "lemmatizer = WordNetLemmatizer()\n",
    "\n",
    "def clean_text(text):\n",
    "    text = text.lower()\n",
    "    text = re.sub(r'(https?://\\S+|www\\.\\S+)', ' ', text)\n",
    "    text = text.replace('|', ' ')\n",
    "    text = re.sub(r'[^a-z\\s]', ' ', text)\n",
    "    tokens = nltk.word_tokenize(text)\n",
    "    cleaned = [\n",
    "        lemmatizer.lemmatize(token)\n",
    "        for token in tokens\n",
    "        if token not in stop_words and len(token) > 2\n",
    "    ]\n",
    "    return ' '.join(cleaned)\n",
    "\n",
    "def flatten_nested_json(raw_data):\n",
    "    \"\"\"\n",
    "    将形如 {\"INFP\": [...], \"INTJ\": [...]} 的字典结构打平成一个 list[dict]\n",
    "    \"\"\"\n",
    "    flat_list = []\n",
    "    for key in raw_data:\n",
    "        entries = raw_data[key]\n",
    "        if isinstance(entries, list):\n",
    "            for item in entries:\n",
    "                if isinstance(item, dict) and \"post\" in item and \"type\" in item:\n",
    "                    flat_list.append(item)\n",
    "    return flat_list\n",
    "\n",
    "def preprocess_nested_json(input_path, output_path_csv=None, output_path_json=None):\n",
    "    print(f\"📥 加载 JSON 数据: {input_path}\")\n",
    "\n",
    "    with open(input_path, 'r', encoding='utf-8') as f:\n",
    "        raw_data = json.load(f)\n",
    "\n",
    "    flat_data = flatten_nested_json(raw_data)\n",
    "\n",
    "    if not flat_data:\n",
    "        raise ValueError(\"❌ 未找到合法的 {'type': ..., 'post': ...} 数据结构\")\n",
    "\n",
    "    df = pd.DataFrame(flat_data)\n",
    "    print(f\"✅ 扁平化完成，共 {len(df)} 条\")\n",
    "\n",
    "    tqdm.pandas(desc=\"Cleaning\")\n",
    "    df[\"posts_cleaned\"] = df[\"post\"].astype(str).progress_apply(clean_text)\n",
    "\n",
    "    # 保存 CSV\n",
    "    if output_path_csv:\n",
    "        os.makedirs(os.path.dirname(output_path_csv), exist_ok=True)\n",
    "        df.to_csv(output_path_csv, index=False)\n",
    "        print(f\"✅ 保存 CSV 至：{output_path_csv}\")\n",
    "\n",
    "    # 可选：保存为 JSON\n",
    "    # 可选：保存为 JSON\n",
    "    if output_path_json:\n",
    "        os.makedirs(os.path.dirname(output_path_json) or \".\", exist_ok=True)\n",
    "        with open(output_path_json, 'w', encoding='utf-8') as f:\n",
    "            json.dump(df.to_dict(orient=\"records\"), f, ensure_ascii=False, indent=2)\n",
    "        print(f\"✅ 保存 JSON 至：{output_path_json}\")\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "ad04ddf3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# # 示例用法\n",
    "# if __name__ == \"__main__\":\n",
    "#     input_json = \"augmented_data/all_augmented_data_v17.json\"\n",
    "#     output_csv = \"augmented_data/cleaned_all_augmented_data_with_original_posts_v17.csv\"\n",
    "#     output_json = \"cleaned_all_augmented_data_with_original_posts_v17.json\"\n",
    "\n",
    "#     preprocess_nested_json(input_json, output_path_csv=output_csv, output_path_json=output_json)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "b525dad7",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import re\n",
    "import nltk\n",
    "import json\n",
    "import os\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.stem import WordNetLemmatizer\n",
    "from tqdm import tqdm\n",
    "def preprocess_nested_json(input_path, output_path_csv=None, output_path_json=None):\n",
    "    print(f\"📥 加载 JSON 数据: {input_path}\")\n",
    "\n",
    "    with open(input_path, 'r', encoding='utf-8') as f:\n",
    "        raw_data = json.load(f)\n",
    "\n",
    "    # 判断结构：如果是 list[dict]，直接处理；如果是 dict，再 flatten\n",
    "    if isinstance(raw_data, list):\n",
    "        flat_data = raw_data\n",
    "    elif isinstance(raw_data, dict):\n",
    "        flat_data = flatten_nested_json(raw_data)\n",
    "    else:\n",
    "        raise ValueError(\"❌ 不支持的数据结构类型\")\n",
    "\n",
    "    if not flat_data:\n",
    "        raise ValueError(\"❌ 未找到合法的 {'type': ..., 'post': ...} 数据结构\")\n",
    "\n",
    "    df = pd.DataFrame(flat_data)\n",
    "    print(f\"✅ 扁平化完成，共 {len(df)} 条\")\n",
    "\n",
    "    tqdm.pandas(desc=\"Cleaning\")\n",
    "    df[\"posts_cleaned\"] = df[\"posts\"].astype(str).progress_apply(clean_text)\n",
    "\n",
    "    if output_path_csv:\n",
    "        output_dir = os.path.dirname(output_path_csv)\n",
    "        if output_dir:\n",
    "            os.makedirs(output_dir, exist_ok=True)\n",
    "        df.to_csv(output_path_csv, index=False)\n",
    "        print(f\"✅ 保存 CSV 至：{output_path_csv}\")\n",
    "\n",
    "    if output_path_json:\n",
    "        output_dir = os.path.dirname(output_path_json)\n",
    "        if output_dir:\n",
    "            os.makedirs(output_dir, exist_ok=True)\n",
    "        with open(output_path_json, 'w', encoding='utf-8') as f:\n",
    "            json.dump(df.to_dict(orient=\"records\"), f, ensure_ascii=False, indent=2)\n",
    "        print(f\"✅ 保存 JSON 至：{output_path_json}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "9d690574",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "📥 加载 JSON 数据: filtered_processed_comments_300.json\n",
      "✅ 扁平化完成，共 1261235 条\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Cleaning: 100%|██████████| 1261235/1261235 [20:00<00:00, 1050.63it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ 保存 CSV 至：filtered_processed_comments__300_cleaned.csv\n",
      "✅ 保存 JSON 至：filtered_processed_comments_300_cleaned.json\n"
     ]
    }
   ],
   "source": [
    "# 示例用法\n",
    "if __name__ == \"__main__\":\n",
    "    input_json = \"filtered_processed_comments_300.json\"\n",
    "    output_csv = \"filtered_processed_comments__300_cleaned.csv\"\n",
    "    output_json = \"filtered_processed_comments_300_cleaned.json\"\n",
    "\n",
    "    preprocess_nested_json(input_json, output_path_csv=output_csv, output_path_json=output_json)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "ad13c8d4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "🔹 第 1 个字典：\n",
      "  type: INTP\n",
      "  posts: What languages do you speak?|||That's authoritarianism versus libertarianism, not left versus right....\n",
      "  posts_cleaned: language speak authoritarianism versus libertarianism left versus right know everyone experience dep...\n",
      "\n",
      "🔹 第 2 个字典：\n",
      "  type: ENTP\n",
      "  posts: It is just arguing semantics. To many on the consumer end, beta and production phase both share the ...\n",
      "  posts_cleaned: arguing semantics many consumer end beta production phase share fact mainstream release ready intere...\n",
      "\n",
      "🔹 第 3 个字典：\n",
      "  type: ENTJ\n",
      "  posts: Come to Europe if you're interested. In many EU countries doing a PhD is actually a job, for which y...\n",
      "  posts_cleaned: come europe interested many country phd actually job get paid salary hassle tuition mean bullshit sc...\n",
      "\n",
      "🔹 第 4 个字典：\n",
      "  type: INTP\n",
      "  posts: Exactly, so I won't have to eat a dick, I knew what I was saying.Two favorite goddesses are Artemis ...\n",
      "  posts_cleaned: exactly eat dick knew saying two favorite goddess artemis athena mean complete package plus shot ath...\n",
      "\n",
      "🔹 第 5 个字典：\n",
      "  type: ENTP\n",
      "  posts: Ooohhh okay, I thought there was something else to it. Thanks!|||The PS Vita USB port does not have ...\n",
      "  posts_cleaned: ooohhh okay thought something else thanks vita usb port video output doubt still waiting finding pes...\n"
     ]
    }
   ],
   "source": [
    "import ijson\n",
    "\n",
    "filename = 'filtered_processed_comments_cleaned.json'\n",
    "count = 0\n",
    "max_items = 5\n",
    "max_value_length = 100  # 每个字段值显示的最大字符数\n",
    "\n",
    "with open(filename, 'r', encoding='utf-8') as f:\n",
    "    parser = ijson.items(f, 'item')\n",
    "\n",
    "    for obj in parser:\n",
    "        print(f\"\\n🔹 第 {count + 1} 个字典：\")\n",
    "        for key, value in obj.items():\n",
    "            # 字符串值裁剪，其他类型转为字符串后裁剪\n",
    "            val_str = str(value)\n",
    "            if len(val_str) > max_value_length:\n",
    "                val_str = val_str[:max_value_length] + '...'\n",
    "            print(f\"  {key}: {val_str}\")\n",
    "        count += 1\n",
    "        if count >= max_items:\n",
    "            break\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
