{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['./2025-09-23_13-15-10/tot_llama_op_persona_personahub_cluster_diff_case_step3_3persona.jsonl',\n",
       " './2025-09-23_13-15-10/tot_llama_op_persona_personahub_cluster_diff_case_step3_3persona_gens.jsonl']"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import os\n",
    "import json\n",
    "\n",
    "# Placeholder: replace with the name of the directory to scan.\n",
    "dir_name = 'YOUR_DIRECTORY_NAME'\n",
    "directory_path = './' + dir_name + '/'\n",
    "\n",
    "# Names of the regular files directly inside the directory that end in .jsonl.\n",
    "file_list = list(filter(\n",
    "    lambda name: name.endswith('.jsonl') and os.path.isfile(os.path.join(directory_path, name)),\n",
    "    os.listdir(directory_path),\n",
    "))\n",
    "\n",
    "# Prefix every name with the directory, then sort for a deterministic order.\n",
    "full_paths = list(map(lambda name: os.path.join(directory_path, name), file_list))\n",
    "full_paths.sort()\n",
    "\n",
    "full_paths"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "     post_id                                     author_persona  \\\n",
      "0  t3_1f0qqq  A movie enthusiast likely in his mid-to-late a...   \n",
      "1  t3_1f0qqq  A movie enthusiast likely in his mid-to-late a...   \n",
      "2  t3_1f0qqq  A movie enthusiast likely in his mid-to-late a...   \n",
      "3  t3_1h0dvp  A self-aware, introspective, and possibly marg...   \n",
      "4  t3_1h0dvp  A self-aware, introspective, and possibly marg...   \n",
      "\n",
      "                                             persona strategy  \\\n",
      "0  A film studies scholar with a focus on modern ...     None   \n",
      "1  A literature scholar specializing in 20th-cent...     None   \n",
      "2  A real estate developer interested in explorin...     None   \n",
      "3  A psychologist studying the impact of media an...     None   \n",
      "4  A sociologist studying popular culture and its...     None   \n",
      "\n",
      "                                         gen_counter  sample_idx  \n",
      "0  The proposed ban on children in movie theaters...           0  \n",
      "1  The proposal to ban children from movie theate...           1  \n",
      "2  Allowing children in movie theaters after a ce...           2  \n",
      "3  The notion that women are more valued than men...           0  \n",
      "4  The notion that women are more valued and desi...           1  \n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "from collections import defaultdict\n",
    "\n",
    "# Number of generations expected for every post (presumably one per selected persona).\n",
    "EXPECTED_SAMPLES = 3\n",
    "JSONL_EXT = '.jsonl'\n",
    "GENS_SUFFIX = '_gens' + JSONL_EXT\n",
    "MARKER = 'Counterargument'\n",
    "\n",
    "\n",
    "def read_jsonl(path):\n",
    "    \"\"\"Load a JSON Lines file into a list of objects, skipping blank lines.\"\"\"\n",
    "    with open(path, 'r', encoding='utf-8') as f:\n",
    "        return [json.loads(line) for line in f if line.strip()]\n",
    "\n",
    "\n",
    "def extract_counter(text):\n",
    "    \"\"\"Return the stripped text that follows the last 'Counterargument:' marker.\n",
    "\n",
    "    Falls back to the bare word 'Counterargument' when the colon form does not\n",
    "    occur, and returns None when neither form is present in `text`.\n",
    "    \"\"\"\n",
    "    for marker in (MARKER + ':', MARKER):\n",
    "        idx = text.rfind(marker)\n",
    "        if idx != -1:\n",
    "            return text[idx + len(marker):].strip()\n",
    "    return None\n",
    "\n",
    "\n",
    "def clean_persona_entry(entry):\n",
    "    \"\"\"Normalise one 'selected_persona_strategy' record to a (persona, strategy) pair.\n",
    "\n",
    "    Falsy values become None, strings are stripped, and a non-string persona is\n",
    "    kept as its str() representation.\n",
    "    \"\"\"\n",
    "    strategy = entry['strategy'].strip() if entry['strategy'] else None\n",
    "    persona = entry['persona']\n",
    "    if not persona:\n",
    "        persona = None\n",
    "    elif isinstance(persona, str):\n",
    "        persona = persona.strip()\n",
    "    else:\n",
    "        persona = str(persona)\n",
    "    return persona, strategy\n",
    "\n",
    "\n",
    "def build_rows(meta_records, gen_records):\n",
    "    \"\"\"Flatten one run into a list of row dicts, one per (post, sample).\n",
    "\n",
    "    meta_records: parsed lines of the metadata file; each has item['steps']['step'].\n",
    "    gen_records:  parsed lines of the generations file; each has 'post_id' and 'ys'.\n",
    "\n",
    "    Step-0 metadata is paired with generation records by position, and the i-th\n",
    "    generation of a post is paired with its i-th selected persona.\n",
    "    NOTE(review): both pairings assume the files were written in matching order - confirm.\n",
    "    \"\"\"\n",
    "    step0 = [m for m in meta_records if m['steps']['step'] == 0]\n",
    "    step1 = [m for m in meta_records if m['steps']['step'] == 1]\n",
    "    if len(step0) != len(gen_records):\n",
    "        print(f'Warning: {len(step0)} step-0 metadata records vs {len(gen_records)} '\n",
    "              'generation records; pairing stops at the shorter list.')\n",
    "\n",
    "    op_dict = {}\n",
    "    counter_dict = defaultdict(list)\n",
    "    pid_order = {}  # dict used as an insertion-ordered set of post ids\n",
    "    for item, meta in zip(gen_records, step0):\n",
    "        pid = item['post_id']\n",
    "        pid_order.setdefault(pid, None)\n",
    "\n",
    "        if 'author_persona' in meta['steps']:\n",
    "            op_dict[pid] = meta['steps']['author_persona']\n",
    "\n",
    "        for y in item['ys']:\n",
    "            gen_counter = extract_counter(y)\n",
    "            if gen_counter is None:\n",
    "                # Report the unparsable generation for manual inspection.\n",
    "                print(pid)\n",
    "                print(y)\n",
    "                print('===============')\n",
    "            # A None placeholder is stored for unparsable generations so that\n",
    "            # later samples keep their position relative to the personas.\n",
    "            counter_dict[pid].append(gen_counter)\n",
    "\n",
    "    persona_dict = defaultdict(list)\n",
    "    strategy_dict = defaultdict(list)\n",
    "    for meta in step1:\n",
    "        pid = meta['post_id']\n",
    "        n_counters = len(counter_dict[pid])\n",
    "        if n_counters != EXPECTED_SAMPLES:\n",
    "            print(f'Warning: Post ID {pid} has {n_counters} counterarguments, expected {EXPECTED_SAMPLES}.')\n",
    "            continue\n",
    "        for entry in meta['steps'].get('selected_persona_strategy', []):\n",
    "            persona, strategy = clean_persona_entry(entry)\n",
    "            persona_dict[pid].append(persona)\n",
    "            strategy_dict[pid].append(strategy)\n",
    "\n",
    "    rows = []\n",
    "    for pid in pid_order:\n",
    "        personas = persona_dict[pid]\n",
    "        strategies = strategy_dict[pid]\n",
    "        if not personas or not personas[0]:\n",
    "            # No usable persona information: emit persona-less rows instead.\n",
    "            personas = [None] * EXPECTED_SAMPLES\n",
    "            strategies = [None] * EXPECTED_SAMPLES\n",
    "        counters = counter_dict[pid]\n",
    "        for sample_idx, persona in enumerate(personas):\n",
    "            rows.append({\n",
    "                'post_id': pid,\n",
    "                # .get: step-0 metadata may lack 'author_persona'.\n",
    "                'author_persona': op_dict.get(pid),\n",
    "                'persona': persona,\n",
    "                'strategy': strategies[sample_idx],\n",
    "                # Bounds-checked: a post can have fewer generations than personas.\n",
    "                'gen_counter': counters[sample_idx] if sample_idx < len(counters) else None,\n",
    "                'sample_idx': sample_idx,\n",
    "            })\n",
    "    return rows\n",
    "\n",
    "\n",
    "# Pair each metadata file X.jsonl with X_gens.jsonl by name rather than by\n",
    "# position in the sorted list, so an unmatched file cannot shift later pairs.\n",
    "available_paths = set(full_paths)\n",
    "for meta_file in full_paths:\n",
    "    if meta_file.endswith(GENS_SUFFIX):\n",
    "        continue\n",
    "    gens_file = meta_file[:-len(JSONL_EXT)] + GENS_SUFFIX\n",
    "    if gens_file not in available_paths:\n",
    "        print(f'Warning: no generations file found for {meta_file}; skipping.')\n",
    "        continue\n",
    "\n",
    "    # The CSV is named after the metadata file and written to the working directory.\n",
    "    save_file = os.path.basename(meta_file)[:-len(JSONL_EXT)]\n",
    "\n",
    "    df = pd.DataFrame(build_rows(read_jsonl(meta_file), read_jsonl(gens_file)))\n",
    "    print(df.head(5))\n",
    "    df.to_csv(f'./{save_file}.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "cmv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
