{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "#from utils import *\n",
    "import json\n",
    "import pandas as pd\n",
    "import os\n",
    "from datasets import load_dataset\n",
    "from nltk import sent_tokenize\n",
    "from glob import glob"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Reusing dataset cnn_dailymail (/home/user/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234)\n"
     ]
    }
   ],
   "source": [
    "dataset = load_dataset('cnn_dailymail', '3.0.0', split='test')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset = pd.read_csv('~/user/paraphrasing/untargeted/textrank/articles_no_sample_10p.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Give every row its own empty summary list: `[[]] * n` repeats ONE list object\n",
    "# n times, so an in-place change to any row's summary would show up in all rows.\n",
    "empty_summaries = [[] for _ in range(len(dataset))]\n",
    "df = pd.DataFrame(\n",
    "    zip(dataset['article'], empty_summaries, dataset['id']),\n",
    "    columns=['text', 'summary', 'id'],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "      <th>summary</th>\n",
       "      <th>id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>(CNN)James Best, best known for his portrayal ...</td>\n",
       "      <td>[]</td>\n",
       "      <td>00200e794fa41d3f7ce92cbf43e9fd4cd652bb09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>(CNN)James Best, best known for his portrayal ...</td>\n",
       "      <td>[]</td>\n",
       "      <td>00200e794fa41d3f7ce92cbf43e9fd4cd652bb09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>(CNN)James Best, best known for his portrayal ...</td>\n",
       "      <td>[]</td>\n",
       "      <td>00200e794fa41d3f7ce92cbf43e9fd4cd652bb09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>(CNN)James Best, best known for his portrayal ...</td>\n",
       "      <td>[]</td>\n",
       "      <td>00200e794fa41d3f7ce92cbf43e9fd4cd652bb09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>(CNN)James Best, best known for his portrayal ...</td>\n",
       "      <td>[]</td>\n",
       "      <td>00200e794fa41d3f7ce92cbf43e9fd4cd652bb09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>57400</th>\n",
       "      <td>When his sister died in a car crash at the age...</td>\n",
       "      <td>[]</td>\n",
       "      <td>fffd506034c5275fe57220e669ad7e01605d597c</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>57401</th>\n",
       "      <td>Peter Morris was devastated when his sister, C...</td>\n",
       "      <td>[]</td>\n",
       "      <td>fffd506034c5275fe57220e669ad7e01605d597c</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>57402</th>\n",
       "      <td>Peter Morris was devastated when his sister, C...</td>\n",
       "      <td>[]</td>\n",
       "      <td>fffd506034c5275fe57220e669ad7e01605d597c</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>57403</th>\n",
       "      <td>Peter Morris was devastated when his sister, C...</td>\n",
       "      <td>[]</td>\n",
       "      <td>fffd506034c5275fe57220e669ad7e01605d597c</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>57404</th>\n",
       "      <td>Peter Morris was devastated when his sister, C...</td>\n",
       "      <td>[]</td>\n",
       "      <td>fffd506034c5275fe57220e669ad7e01605d597c</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>57405 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                    text summary  \\\n",
       "0      (CNN)James Best, best known for his portrayal ...      []   \n",
       "1      (CNN)James Best, best known for his portrayal ...      []   \n",
       "2      (CNN)James Best, best known for his portrayal ...      []   \n",
       "3      (CNN)James Best, best known for his portrayal ...      []   \n",
       "4      (CNN)James Best, best known for his portrayal ...      []   \n",
       "...                                                  ...     ...   \n",
       "57400  When his sister died in a car crash at the age...      []   \n",
       "57401  Peter Morris was devastated when his sister, C...      []   \n",
       "57402  Peter Morris was devastated when his sister, C...      []   \n",
       "57403  Peter Morris was devastated when his sister, C...      []   \n",
       "57404  Peter Morris was devastated when his sister, C...      []   \n",
       "\n",
       "                                             id  \n",
       "0      00200e794fa41d3f7ce92cbf43e9fd4cd652bb09  \n",
       "1      00200e794fa41d3f7ce92cbf43e9fd4cd652bb09  \n",
       "2      00200e794fa41d3f7ce92cbf43e9fd4cd652bb09  \n",
       "3      00200e794fa41d3f7ce92cbf43e9fd4cd652bb09  \n",
       "4      00200e794fa41d3f7ce92cbf43e9fd4cd652bb09  \n",
       "...                                         ...  \n",
       "57400  fffd506034c5275fe57220e669ad7e01605d597c  \n",
       "57401  fffd506034c5275fe57220e669ad7e01605d597c  \n",
       "57402  fffd506034c5275fe57220e669ad7e01605d597c  \n",
       "57403  fffd506034c5275fe57220e669ad7e01605d597c  \n",
       "57404  fffd506034c5275fe57220e669ad7e01605d597c  \n",
       "\n",
       "[57405 rows x 3 columns]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.text = df.text.apply(sent_tokenize)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The summary column is initialised with empty-list placeholders, and\n",
    "# sent_tokenize only works on strings, so split string summaries into\n",
    "# sentences and leave every other value untouched.\n",
    "df.summary = df.summary.apply(\n",
    "    lambda summary: sent_tokenize(summary) if isinstance(summary, str) else summary\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "      <th>summary</th>\n",
       "      <th>id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>[(CNN)James Best, best known for his portrayal...</td>\n",
       "      <td>[]</td>\n",
       "      <td>00200e794fa41d3f7ce92cbf43e9fd4cd652bb09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>[(CNN)James Best, best known for his portrayal...</td>\n",
       "      <td>[]</td>\n",
       "      <td>00200e794fa41d3f7ce92cbf43e9fd4cd652bb09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>[(CNN)James Best, best known for his portrayal...</td>\n",
       "      <td>[]</td>\n",
       "      <td>00200e794fa41d3f7ce92cbf43e9fd4cd652bb09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>[(CNN)James Best, best known for his portrayal...</td>\n",
       "      <td>[]</td>\n",
       "      <td>00200e794fa41d3f7ce92cbf43e9fd4cd652bb09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>[(CNN)James Best, best known for his portrayal...</td>\n",
       "      <td>[]</td>\n",
       "      <td>00200e794fa41d3f7ce92cbf43e9fd4cd652bb09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>57400</th>\n",
       "      <td>[When his sister died in a car crash at the ag...</td>\n",
       "      <td>[]</td>\n",
       "      <td>fffd506034c5275fe57220e669ad7e01605d597c</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>57401</th>\n",
       "      <td>[Peter Morris was devastated when his sister, ...</td>\n",
       "      <td>[]</td>\n",
       "      <td>fffd506034c5275fe57220e669ad7e01605d597c</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>57402</th>\n",
       "      <td>[Peter Morris was devastated when his sister, ...</td>\n",
       "      <td>[]</td>\n",
       "      <td>fffd506034c5275fe57220e669ad7e01605d597c</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>57403</th>\n",
       "      <td>[Peter Morris was devastated when his sister, ...</td>\n",
       "      <td>[]</td>\n",
       "      <td>fffd506034c5275fe57220e669ad7e01605d597c</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>57404</th>\n",
       "      <td>[Peter Morris was devastated when his sister, ...</td>\n",
       "      <td>[]</td>\n",
       "      <td>fffd506034c5275fe57220e669ad7e01605d597c</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>57405 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                    text summary  \\\n",
       "0      [(CNN)James Best, best known for his portrayal...      []   \n",
       "1      [(CNN)James Best, best known for his portrayal...      []   \n",
       "2      [(CNN)James Best, best known for his portrayal...      []   \n",
       "3      [(CNN)James Best, best known for his portrayal...      []   \n",
       "4      [(CNN)James Best, best known for his portrayal...      []   \n",
       "...                                                  ...     ...   \n",
       "57400  [When his sister died in a car crash at the ag...      []   \n",
       "57401  [Peter Morris was devastated when his sister, ...      []   \n",
       "57402  [Peter Morris was devastated when his sister, ...      []   \n",
       "57403  [Peter Morris was devastated when his sister, ...      []   \n",
       "57404  [Peter Morris was devastated when his sister, ...      []   \n",
       "\n",
       "                                             id  \n",
       "0      00200e794fa41d3f7ce92cbf43e9fd4cd652bb09  \n",
       "1      00200e794fa41d3f7ce92cbf43e9fd4cd652bb09  \n",
       "2      00200e794fa41d3f7ce92cbf43e9fd4cd652bb09  \n",
       "3      00200e794fa41d3f7ce92cbf43e9fd4cd652bb09  \n",
       "4      00200e794fa41d3f7ce92cbf43e9fd4cd652bb09  \n",
       "...                                         ...  \n",
       "57400  fffd506034c5275fe57220e669ad7e01605d597c  \n",
       "57401  fffd506034c5275fe57220e669ad7e01605d597c  \n",
       "57402  fffd506034c5275fe57220e669ad7e01605d597c  \n",
       "57403  fffd506034c5275fe57220e669ad7e01605d597c  \n",
       "57404  fffd506034c5275fe57220e669ad7e01605d597c  \n",
       "\n",
       "[57405 rows x 3 columns]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.to_json('~/user/paraphrasing/untargeted/matchsum/articles_same.jsonl', orient='records', lines=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the three JSON word-list files that sit next to this notebook.\n",
    "with open('./mtf_list.json', 'r') as f:\n",
    "    mtf_list = json.load(f)\n",
    "\n",
    "with open('./ftm_list.json', 'r') as f:\n",
    "    ftm_list = json.load(f)\n",
    "\n",
    "with open('./lists.json', 'r') as f:\n",
    "    lists = json.load(f)\n",
    "\n",
    "# lists.json holds both word lists under the 'male' and 'female' keys.\n",
    "male_list = lists['male']\n",
    "female_list = lists['female']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "original_text = df.text\n",
    "original_text = original_text.to_list()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "male_text = [remove_words(x, ftm_list).split('\\n') for x in original_text]\n",
    "female_text = [remove_words(x, mtf_list).split('\\n') for x in original_text]\n",
    "neutral_text = [remove_words(x, male_list+female_list, remove=True).split('\\n') for x in original_text]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "male_data = [{'text':x, 'summary':[]} for x in male_text]\n",
    "female_data = [{'text':x, 'summary':[]} for x in female_text]\n",
    "original_data = [{'text':x, 'summary':[]} for x in original_text]\n",
    "neutral_data = [{'text':x, 'summary':[]} for x in neutral_text]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Each file name minus its last four characters is parsed as an integer\n",
    "# index; the indices are kept in ascending order.\n",
    "biased_idx = sorted(\n",
    "    int(file_name[:-4])\n",
    "    for file_name in os.listdir('../data/biased/matchsum/articles')\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One record per biased article, listing every element position of that\n",
    "# article's entry in original_text.\n",
    "index = [{'sent_id': list(range(len(original_text[i])))} for i in biased_idx]\n",
    "\n",
    "# Newline-separated JSON records with no trailing newline. Joining also copes\n",
    "# with an empty index, where indexing index[-1] would raise IndexError.\n",
    "with open('../MatchSum/data/index/basic_index.id', 'w') as f:\n",
    "    f.write('\\n'.join(json.dumps(record) for record in index))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "index[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "male_data_biased = [male_data[x] for x in biased_idx]\n",
    "female_data_biased = [female_data[x] for x in biased_idx]\n",
    "neutral_data_biased = [neutral_data[x] for x in biased_idx]\n",
    "original_data_biased = [original_data[x] for x in biased_idx]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "neutral_data_biased"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Every variant is written the same way (one JSON record per line), so the\n",
    "# writes are driven from a single table instead of eight copy-pasted blocks.\n",
    "# Keys are the file-name prefixes; insertion order is the write order.\n",
    "articles_dir = '../MatchSum/data/articles'\n",
    "jsonl_outputs = {\n",
    "    'male': male_data,\n",
    "    'female': female_data,\n",
    "    'original': original_data,\n",
    "    'neutral': neutral_data,\n",
    "    'male_biased': male_data_biased,\n",
    "    'female_biased': female_data_biased,\n",
    "    'original_biased': original_data_biased,\n",
    "    'neutral_biased': neutral_data_biased,\n",
    "}\n",
    "\n",
    "for variant, records in jsonl_outputs.items():\n",
    "    with open(f'{articles_dir}/{variant}_test_CNNDM_roberta.jsonl', 'w+') as f:\n",
    "        for record in records:\n",
    "            f.write(json.dumps(record) + '\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "original_data = pd.read_json('../MatchSum/data/articles/original_test_CNNDM_roberta.jsonl', lines=True)\n",
    "male_data = pd.read_json('../MatchSum/data/articles/male_test_CNNDM_roberta.jsonl', lines=True)\n",
    "female_data = pd.read_json('../MatchSum/data/articles/female_test_CNNDM_roberta.jsonl', lines=True)\n",
    "neutral_data = pd.read_json('../MatchSum/data/articles/neutral_test_CNNDM_roberta.jsonl', lines=True)\n",
    "original_data_biased = pd.read_json('../MatchSum/data/articles/original_biased_test_CNNDM_roberta.jsonl', lines=True)\n",
    "male_data_biased = pd.read_json('../MatchSum/data/articles/male_biased_test_CNNDM_roberta.jsonl', lines=True)\n",
    "female_data_biased = pd.read_json('../MatchSum/data/articles/female_biased_test_CNNDM_roberta.jsonl', lines=True)\n",
    "neutral_data_biased = pd.read_json('../MatchSum/data/articles/neutral_biased_test_CNNDM_roberta.jsonl', lines=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "neutral_data_biased"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "biased_index = list(neutral_data_biased.text.apply(len))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def write_temp(data, path='./temp.txt'):\n",
    "    \"\"\"Write the articles in `data.text` to `path`, one article per line.\n",
    "\n",
    "    Each entry of `data.text` is joined with ' [CLS] [SEP] ' into a single\n",
    "    line. Lines are separated by newlines and the file has no trailing newline.\n",
    "\n",
    "    Args:\n",
    "        data: object exposing a pandas Series `text` whose entries are\n",
    "            iterables of strings (in this notebook, lists of sentences).\n",
    "        path: destination file; it is created or truncated.\n",
    "    \"\"\"\n",
    "    articles = data.text.apply(' [CLS] [SEP] '.join).to_list()\n",
    "\n",
    "    # Joining handles empty input as well: indexing the last element raised\n",
    "    # IndexError when there were no articles.\n",
    "    with open(path, 'w+') as f:\n",
    "        f.write('\\n'.join(articles))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "write_temp(original_data, '../PreSumm/raw_data/original.txt')\n",
    "write_temp(male_data, '../PreSumm/raw_data/male.txt')\n",
    "write_temp(female_data, '../PreSumm/raw_data/female.txt')\n",
    "write_temp(neutral_data, '../PreSumm/raw_data/neutral.txt')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_json('~/user/matchsum/processed/cnn_dm.jsonl', lines=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "new_df = df.iloc[-311971:]\n",
    "new_df.to_json('~/user/matchsum/processed/true_cnn_dm.jsonl', orient='records', lines=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!! cd ../MatchSum/preprocess && python3 get_candidate.py --tokenizer=roberta --data_path=../data/articles/female_test_CNNDM_roberta.jsonl --index_path=../data/index/basic_index.id --write_path=../data/processed/female_basic.jsonl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Order the decoded-summary files by the integer in their file name\n",
    "# (numeric order, so 10.dec comes after 9.dec rather than after 1.dec).\n",
    "paths = sorted(\n",
    "    glob('../MatchSum/roberta/result/MatchSum_cnndm_roberta.ckpt/dec/*.dec'),\n",
    "    key=lambda dec_path: int(os.path.splitext(os.path.basename(dec_path))[0]),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "article_df = pd.read_csv('~/user/paraphrasing/untargeted/textrank/articles_no_sample_10p.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "summaries = []\n",
    "\n",
    "# Read each decoded file and flatten it onto a single line.\n",
    "for path in paths:\n",
    "    with open(path, 'r') as f:\n",
    "        summaries.append(f.read().replace('\\n', ' '))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# zip() stops at the shortest input, so a missing or extra .dec file would\n",
    "# silently drop rows or pair articles with the wrong summary. Fail loudly.\n",
    "assert len(summaries) == len(article_df), (\n",
    "    f'{len(summaries)} summaries for {len(article_df)} articles'\n",
    ")\n",
    "\n",
    "matchsum_df = pd.DataFrame(\n",
    "    zip(article_df.article, summaries, article_df.id),\n",
    "    columns=['article', 'summary', 'id'],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>article</th>\n",
       "      <th>summary</th>\n",
       "      <th>id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>(CNN)James Best, best known for his portrayal ...</td>\n",
       "      <td>(CNN)James Best, best known for his portrayal ...</td>\n",
       "      <td>00200e794fa41d3f7ce92cbf43e9fd4cd652bb09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>(CNN)James Best, best known for his portrayal ...</td>\n",
       "      <td>(CNN)James Best, best known for his portrayal ...</td>\n",
       "      <td>00200e794fa41d3f7ce92cbf43e9fd4cd652bb09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>(CNN)James Best, best known for his portrayal ...</td>\n",
       "      <td>(CNN)James Best, best known for his portrayal ...</td>\n",
       "      <td>00200e794fa41d3f7ce92cbf43e9fd4cd652bb09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>(CNN)James Best, best known for his portrayal ...</td>\n",
       "      <td>(CNN)James Best, best known for his portrayal ...</td>\n",
       "      <td>00200e794fa41d3f7ce92cbf43e9fd4cd652bb09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>(CNN)James Best, best known for his portrayal ...</td>\n",
       "      <td>(CNN)James Best, best known for his portrayal ...</td>\n",
       "      <td>00200e794fa41d3f7ce92cbf43e9fd4cd652bb09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>57400</th>\n",
       "      <td>When his sister died in a car crash at the age...</td>\n",
       "      <td>When his sister died in a car crash at the age...</td>\n",
       "      <td>fffd506034c5275fe57220e669ad7e01605d597c</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>57401</th>\n",
       "      <td>Peter Morris was devastated when his sister, C...</td>\n",
       "      <td>Peter Morris was devastated when his sister, C...</td>\n",
       "      <td>fffd506034c5275fe57220e669ad7e01605d597c</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>57402</th>\n",
       "      <td>Peter Morris was devastated when his sister, C...</td>\n",
       "      <td>Peter Morris was devastated when his sister, C...</td>\n",
       "      <td>fffd506034c5275fe57220e669ad7e01605d597c</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>57403</th>\n",
       "      <td>Peter Morris was devastated when his sister, C...</td>\n",
       "      <td>Peter Morris was devastated when his sister, C...</td>\n",
       "      <td>fffd506034c5275fe57220e669ad7e01605d597c</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>57404</th>\n",
       "      <td>Peter Morris was devastated when his sister, C...</td>\n",
       "      <td>Peter Morris was devastated when his sister, C...</td>\n",
       "      <td>fffd506034c5275fe57220e669ad7e01605d597c</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>57405 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                 article  \\\n",
       "0      (CNN)James Best, best known for his portrayal ...   \n",
       "1      (CNN)James Best, best known for his portrayal ...   \n",
       "2      (CNN)James Best, best known for his portrayal ...   \n",
       "3      (CNN)James Best, best known for his portrayal ...   \n",
       "4      (CNN)James Best, best known for his portrayal ...   \n",
       "...                                                  ...   \n",
       "57400  When his sister died in a car crash at the age...   \n",
       "57401  Peter Morris was devastated when his sister, C...   \n",
       "57402  Peter Morris was devastated when his sister, C...   \n",
       "57403  Peter Morris was devastated when his sister, C...   \n",
       "57404  Peter Morris was devastated when his sister, C...   \n",
       "\n",
       "                                                 summary  \\\n",
       "0      (CNN)James Best, best known for his portrayal ...   \n",
       "1      (CNN)James Best, best known for his portrayal ...   \n",
       "2      (CNN)James Best, best known for his portrayal ...   \n",
       "3      (CNN)James Best, best known for his portrayal ...   \n",
       "4      (CNN)James Best, best known for his portrayal ...   \n",
       "...                                                  ...   \n",
       "57400  When his sister died in a car crash at the age...   \n",
       "57401  Peter Morris was devastated when his sister, C...   \n",
       "57402  Peter Morris was devastated when his sister, C...   \n",
       "57403  Peter Morris was devastated when his sister, C...   \n",
       "57404  Peter Morris was devastated when his sister, C...   \n",
       "\n",
       "                                             id  \n",
       "0      00200e794fa41d3f7ce92cbf43e9fd4cd652bb09  \n",
       "1      00200e794fa41d3f7ce92cbf43e9fd4cd652bb09  \n",
       "2      00200e794fa41d3f7ce92cbf43e9fd4cd652bb09  \n",
       "3      00200e794fa41d3f7ce92cbf43e9fd4cd652bb09  \n",
       "4      00200e794fa41d3f7ce92cbf43e9fd4cd652bb09  \n",
       "...                                         ...  \n",
       "57400  fffd506034c5275fe57220e669ad7e01605d597c  \n",
       "57401  fffd506034c5275fe57220e669ad7e01605d597c  \n",
       "57402  fffd506034c5275fe57220e669ad7e01605d597c  \n",
       "57403  fffd506034c5275fe57220e669ad7e01605d597c  \n",
       "57404  fffd506034c5275fe57220e669ad7e01605d597c  \n",
       "\n",
       "[57405 rows x 3 columns]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "matchsum_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "matchsum_df.to_csv('~/user/paraphrasing/untargeted/matchsum/summaries_same.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.9.7 ('summarization')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "fb9173d43df76b287d1b53052eec4ff84d0ee52790be7057998c9269beecf529"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
