{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1. Load Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "      <th>label</th>\n",
       "      <th>src</th>\n",
       "      <th>dataset</th>\n",
       "      <th>model</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Brian worked hard all summer during his break....</td>\n",
       "      <td>0</td>\n",
       "      <td>roct_machine_continuation_opt_1.3b</td>\n",
       "      <td>roct</td>\n",
       "      <td>opt_1.3b</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>The 18-year-old can play at right-back, centre...</td>\n",
       "      <td>0</td>\n",
       "      <td>xsum_machine_continuation_opt_iml_max_1.3b</td>\n",
       "      <td>xsum</td>\n",
       "      <td>opt_iml_max_1.3b</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>then steps back The man then steps back and wa...</td>\n",
       "      <td>0</td>\n",
       "      <td>hswag_machine_topical_text-davinci-002</td>\n",
       "      <td>hswag</td>\n",
       "      <td>text-davinci-002</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>I woke up on that burning summer morning as if...</td>\n",
       "      <td>1</td>\n",
       "      <td>wp_human</td>\n",
       "      <td>wp</td>\n",
       "      <td>human</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Capt. Gale waited patiently at the entrance to...</td>\n",
       "      <td>0</td>\n",
       "      <td>wp_machine_continuation_t0_11b</td>\n",
       "      <td>wp</td>\n",
       "      <td>t0_11b</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                text  label  \\\n",
       "0  Brian worked hard all summer during his break....      0   \n",
       "1  The 18-year-old can play at right-back, centre...      0   \n",
       "2  then steps back The man then steps back and wa...      0   \n",
       "3  I woke up on that burning summer morning as if...      1   \n",
       "4  Capt. Gale waited patiently at the entrance to...      0   \n",
       "\n",
       "                                          src dataset             model  \n",
       "0          roct_machine_continuation_opt_1.3b    roct          opt_1.3b  \n",
       "1  xsum_machine_continuation_opt_iml_max_1.3b    xsum  opt_iml_max_1.3b  \n",
       "2      hswag_machine_topical_text-davinci-002   hswag  text-davinci-002  \n",
       "3                                    wp_human      wp             human  \n",
       "4              wp_machine_continuation_t0_11b      wp            t0_11b  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_path = \"data/Deepfake/cross_domains_cross_models/train.csv\"\n",
    "import pandas as pd\n",
    "import os\n",
    "import numpy as np\n",
    "\n",
    "df = pd.read_csv(data_path)\n",
    "pd.set_option('display.max_columns', None)\n",
    "\n",
    "df['dataset'] = df['src'].apply(lambda x: x.split('_')[0])\n",
    "df['model'] = df['src'].apply(lambda x: x.split('_')[1:])\n",
    "df['model'] = df['model'].apply(lambda x: '_'.join(x))\n",
    "df['model'] = df['model'].apply(lambda x: x[25:] if x.startswith('gen_machine_continuation_') else x)\n",
    "# model的值start with “machine_continuation_” 则去掉前缀\n",
    "df['model'] = df['model'].apply(lambda x: x[21:] if x.startswith('machine_continuation_') else x)\n",
    "# model的值start with “machine_specified_” 则去掉前缀\n",
    "df['model'] = df['model'].apply(lambda x: x[18:] if x.startswith('machine_specified_') else x)\n",
    "# model的值start with “machine_topical_” 则去掉前缀\n",
    "df['model'] = df['model'].apply(lambda x: x[16:] if x.startswith('machine_topical_') else x)\n",
    "df['model'] = df['model'].apply(lambda x: x[4:] if x.startswith('gen_') else x)\n",
    "\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2. Generate Embedding - Load original RoBERTa "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3. Dimension reduction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 4. Visualization"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "detective",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
