{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/user/HS502/yl02706/LLMs_Memorize\n"
     ]
    }
   ],
   "source": [
    "%cd .."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from context_manager import ConversationContextManager"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "class parser(ConversationContextManager):\n",
    "    def __init__(\n",
    "        self,\n",
    "        path : str,\n",
    "        mask_ratio = 0.2, \n",
    "        keep_leading_word = True,\n",
    "        num_lead_words = 3,\n",
    "        ppl_threshold = None,\n",
    "        tokenizer = None,\n",
    "        compute_self_info = True,\n",
    "        sent_mask_token = \"<...some content omitted.>\",\n",
    "        phrase_mask_token = \"\",\n",
    "        num_articles = 2000,\n",
    "        lang = \"en\",\n",
    "    ):\n",
    "        self.path = path\n",
    "        self.lang = lang\n",
    "        self._prepare_phrase_tokenizer()\n",
    "        self.num_articles = num_articles\n",
    "        self.tokenizer = GPT2Tokenizer.from_pretrained(\"gpt2\") if tokenizer is None else tokenizer\n",
    "\n",
    "        self.keep_leading_word = keep_leading_word\n",
    "        self.num_lead_words = num_lead_words\n",
    "        self.ppl_threshold = ppl_threshold\n",
    "        self.max_token_len = 1800\n",
    "        self.sent_level_self_info = True\n",
    "        self.mask_ratio = mask_ratio\n",
    "\n",
    "        self.mask_token = sent_mask_token\n",
    "        self.phrase_mask_token = phrase_mask_token\n",
    "\n",
    "        # self.sent_tokenize_pattern = r\"((?<!e\\.g)(?<!i\\.e)(?<!w\\.r\\.t)(?<=\\.)\\s)|(?<=\\?\\s)|(?<=!\\s)\"\n",
    "        # self.sent_tokenize_pattern = r\"(?<!e\\.g)(?<!i\\.e)(?<=\\.\\s)|(?<=\\?\\s)|(?<=!\\s)\"\n",
    "        self.sent_tokenize_pattern = r\"(?<!\\w\\.\\w.)(?<![A-Z][a-z]\\.)(?<=\\.|\\?)\\s\"\n",
    "        self.load_articles(path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/user/HS502/yl02706/LLMs_Memorize/context_manager.py:513: GuessedAtParserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system (\"html.parser\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n",
      "\n",
      "The code that caused this warning is on line 513 of the file /user/HS502/yl02706/LLMs_Memorize/context_manager.py. To get rid of this warning, pass the additional argument 'features=\"html.parser\"' to the BeautifulSoup constructor.\n",
      "\n",
      "  bsobj = BeautifulSoup(sent['value'])\n",
      "Token indices sequence length is longer than the specified maximum sequence length for this model (9144 > 1024). Running this sequence through the model will result in indexing errors\n"
     ]
    }
   ],
   "source": [
    "manager = parser('/vol/research/lyc/llm_memorize/conversation')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import GPT2Tokenizer\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('human',\n",
       "  'how do I add multiple new columns in m for power query or power bi?'),\n",
       " ('gpt',\n",
       "  'In Power Query or Power BI, you can add multiple new columns by using the \"Add Column\" menu in the Home tab. Once you click on the \"Add Column\" menu, you will see options to add a custom column, index column, and more. To add multiple new columns, you can use the \"Custom Column\" option. You can also use \"Duplicate Column\" option to duplicate an existing column, and then rename the new column as desired. Another way to add multiple new columns is to use the \"Add Columns\" button to add columns from a table, then rename the new columns as desired. You can also add multiple columns by using the \"Add Column\" button to add columns from a table, then rename the new columns as desired. In addition, you can also use formula or other transformation in Power Query Editor to generate new columns based on existing columns or your own logic. '),\n",
       " ('human', 'can you show me the m code?'),\n",
       " ('gpt',\n",
       "  'Sure, here is an example of how you can use M code to add multiple new columns in Power Query: Copy codelet\\n    Source = Table.FromRows(Json.Document(Binary.Decompress(Binary.FromText(\"i45WMlTSUzI1V...\", BinaryEncoding.Base64), Compression.Deflate)), let _t = ((type nullable text) meta [Serialized.Text = true]) in type table [Column1 = _t, Column2 = _t, Column3 = _t]),\\n    #\"Added Custom Columns\" = Table.AddColumn(Source, \"Custom Column 1\", each \"Value 1\"),\\n    #\"Added Custom Columns1\" = Table.AddColumn(#\"Added Custom Columns\", \"Custom Column 2\", each \"Value 2\"),\\n    #\"Added Custom Columns2\" = Table.AddColumn(#\"Added Custom Columns1\", \"Custom Column 3\", each \"Value 3\")\\nin\\n    #\"Added Custom Columns2\"\\nThis code adds three new columns named \"Custom Column 1\", \"Custom Column 2\", and \"Custom Column 3\" and assigns the value \"Value 1\", \"Value 2\", and \"Value 3\" respectively to each row of the new columns. You can also use the Table.ExpandRecordColumn function to add multiple columns from a nested table or record.Copy codelet\\n    Source = Table.FromRows(Json.Document(Binary.Decompress(Binary.FromText(\"i45WMlTSUzI1V...\", BinaryEncoding.Base64), Compression.Deflate)), let _t = ((type nullable text) meta [Serialized.Text = true]) in type table [Column1 = _t, Column2 = _t, Column3 = _t]),\\n    #\"Expanded Record Column\" = Table.ExpandRecordColumn(Source, \"Record Column\", {\"Column4\", \"Column5\", \"Column6\"})\\nin\\n    #\"Expanded Record Column\"\\nThis code will add multiple new columns named \"Column4\", \"Column5\", and \"Column6\" from the record column. Please note that the example M code above is just an illustration, you should adjust the code according to the structure of your data. '),\n",
       " ('human',\n",
       "  'this means that there is no function to add multiple columns at once?'),\n",
       " ('gpt',\n",
       "  'I apologize for any confusion, there is no built-in function in Power Query or Power BI to add multiple new columns at once. However, there are a few ways to accomplish this by chaining multiple \"Add Column\" steps together, using a function such as Table.AddColumn which can be used multiple times, or using Table.ExpandRecordColumn function to add multiple columns from a nested table or record.You can also use formula in Power Query Editor to generate new columns based on existing columns or your own logic. For example, you can use the Table.DuplicateColumn function to create multiple new columns at once, by passing the name of the column you want to duplicate and the names of the new columns as a list of strings.Copy codelet\\n    Source = Table.FromRows(Json.Document(Binary.Decompress(Binary.FromText(\"i45WMlTSUzI1V...\", BinaryEncoding.Base64), Compression.Deflate)), let _t = ((type nullable text) meta [Serialized.Text = true]) in type table [Column1 = _t]),\\n    #\"Duplicate Columns\" = Table.DuplicateColumn(Source, \"Column1\", {\"Column2\", \"Column3\", \"Column4\"})\\nin\\n    #\"Duplicate Columns\"\\nThis code will create a new columns named \"Column2\", \"Column3\", and \"Column4\" with the same value as the \"Column1\" You can also use a custom function to add multiple columns at once. '),\n",
       " ('human', 'can you write a custom function for me to accomplish that?'),\n",
       " ('gpt',\n",
       "  'Sure, here\\'s an example of a custom function in Power Query that you can use to add multiple new columns at once: Copy codelet\\n    AddMultipleColumns = (table as table, columns as list, values as list) => \\n    let\\n        newtable = table,\\n        index = 0\\n    in\\n        for column in columns do\\n            newtable = Table.AddColumn(newtable, column, each values{index})\\n            index = index + 1\\n        newtable\\nin\\n    AddMultipleColumns\\nYou can use this function by passing in the table you want to add columns to, a list of column names, and a list of values. For example: Copy codelet\\n    Source = Table.FromRows(Json.Document(Binary.Decompress(Binary.FromText(\"i45WMlTSUzI1V...\", BinaryEncoding.Base64), Compression.Deflate)), let _t = ((type nullable text) meta [Serialized.Text = true]) in type table [Column1 = _t]),\\n    #\"Added Multiple Columns\" = AddMultipleColumns(Source, {\"Column2\", \"Column3\", \"Column4\"}, {\"Value1\", \"Value2\", \"Value3\"})\\nin\\n    #\"Added Multiple Columns\"\\nThis code will add three new columns named \"Column2\", \"Column3\", and \"Column4\" and assigns the value \"Value1\", \"Value2\", and \"Value3\" respectively to each row of the new columns. Please note that this is just one way to add multiple new columns at once, and you may need to adjust the code to fit your specific use case. Also, be aware that this function will only work if the number of columns and the number of values match and are in the same order. ')]"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "manager.articles[0].context"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'yn2eWCt'"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "manager.articles[0].entry_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "covs = [{'id': article.entry_id, 'chat': article.context} for article in manager.articles]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'id': 'yn2eWCt',\n",
       " 'chat': [('human',\n",
       "   'how do I add multiple new columns in m for power query or power bi?'),\n",
       "  ('gpt',\n",
       "   'In Power Query or Power BI, you can add multiple new columns by using the \"Add Column\" menu in the Home tab. Once you click on the \"Add Column\" menu, you will see options to add a custom column, index column, and more. To add multiple new columns, you can use the \"Custom Column\" option. You can also use \"Duplicate Column\" option to duplicate an existing column, and then rename the new column as desired. Another way to add multiple new columns is to use the \"Add Columns\" button to add columns from a table, then rename the new columns as desired. You can also add multiple columns by using the \"Add Column\" button to add columns from a table, then rename the new columns as desired. In addition, you can also use formula or other transformation in Power Query Editor to generate new columns based on existing columns or your own logic. '),\n",
       "  ('human', 'can you show me the m code?'),\n",
       "  ('gpt',\n",
       "   'Sure, here is an example of how you can use M code to add multiple new columns in Power Query: Copy codelet\\n    Source = Table.FromRows(Json.Document(Binary.Decompress(Binary.FromText(\"i45WMlTSUzI1V...\", BinaryEncoding.Base64), Compression.Deflate)), let _t = ((type nullable text) meta [Serialized.Text = true]) in type table [Column1 = _t, Column2 = _t, Column3 = _t]),\\n    #\"Added Custom Columns\" = Table.AddColumn(Source, \"Custom Column 1\", each \"Value 1\"),\\n    #\"Added Custom Columns1\" = Table.AddColumn(#\"Added Custom Columns\", \"Custom Column 2\", each \"Value 2\"),\\n    #\"Added Custom Columns2\" = Table.AddColumn(#\"Added Custom Columns1\", \"Custom Column 3\", each \"Value 3\")\\nin\\n    #\"Added Custom Columns2\"\\nThis code adds three new columns named \"Custom Column 1\", \"Custom Column 2\", and \"Custom Column 3\" and assigns the value \"Value 1\", \"Value 2\", and \"Value 3\" respectively to each row of the new columns. You can also use the Table.ExpandRecordColumn function to add multiple columns from a nested table or record.Copy codelet\\n    Source = Table.FromRows(Json.Document(Binary.Decompress(Binary.FromText(\"i45WMlTSUzI1V...\", BinaryEncoding.Base64), Compression.Deflate)), let _t = ((type nullable text) meta [Serialized.Text = true]) in type table [Column1 = _t, Column2 = _t, Column3 = _t]),\\n    #\"Expanded Record Column\" = Table.ExpandRecordColumn(Source, \"Record Column\", {\"Column4\", \"Column5\", \"Column6\"})\\nin\\n    #\"Expanded Record Column\"\\nThis code will add multiple new columns named \"Column4\", \"Column5\", and \"Column6\" from the record column. Please note that the example M code above is just an illustration, you should adjust the code according to the structure of your data. '),\n",
       "  ('human',\n",
       "   'this means that there is no function to add multiple columns at once?'),\n",
       "  ('gpt',\n",
       "   'I apologize for any confusion, there is no built-in function in Power Query or Power BI to add multiple new columns at once. However, there are a few ways to accomplish this by chaining multiple \"Add Column\" steps together, using a function such as Table.AddColumn which can be used multiple times, or using Table.ExpandRecordColumn function to add multiple columns from a nested table or record.You can also use formula in Power Query Editor to generate new columns based on existing columns or your own logic. For example, you can use the Table.DuplicateColumn function to create multiple new columns at once, by passing the name of the column you want to duplicate and the names of the new columns as a list of strings.Copy codelet\\n    Source = Table.FromRows(Json.Document(Binary.Decompress(Binary.FromText(\"i45WMlTSUzI1V...\", BinaryEncoding.Base64), Compression.Deflate)), let _t = ((type nullable text) meta [Serialized.Text = true]) in type table [Column1 = _t]),\\n    #\"Duplicate Columns\" = Table.DuplicateColumn(Source, \"Column1\", {\"Column2\", \"Column3\", \"Column4\"})\\nin\\n    #\"Duplicate Columns\"\\nThis code will create a new columns named \"Column2\", \"Column3\", and \"Column4\" with the same value as the \"Column1\" You can also use a custom function to add multiple columns at once. '),\n",
       "  ('human', 'can you write a custom function for me to accomplish that?'),\n",
       "  ('gpt',\n",
       "   'Sure, here\\'s an example of a custom function in Power Query that you can use to add multiple new columns at once: Copy codelet\\n    AddMultipleColumns = (table as table, columns as list, values as list) => \\n    let\\n        newtable = table,\\n        index = 0\\n    in\\n        for column in columns do\\n            newtable = Table.AddColumn(newtable, column, each values{index})\\n            index = index + 1\\n        newtable\\nin\\n    AddMultipleColumns\\nYou can use this function by passing in the table you want to add columns to, a list of column names, and a list of values. For example: Copy codelet\\n    Source = Table.FromRows(Json.Document(Binary.Decompress(Binary.FromText(\"i45WMlTSUzI1V...\", BinaryEncoding.Base64), Compression.Deflate)), let _t = ((type nullable text) meta [Serialized.Text = true]) in type table [Column1 = _t]),\\n    #\"Added Multiple Columns\" = AddMultipleColumns(Source, {\"Column2\", \"Column3\", \"Column4\"}, {\"Value1\", \"Value2\", \"Value3\"})\\nin\\n    #\"Added Multiple Columns\"\\nThis code will add three new columns named \"Column2\", \"Column3\", and \"Column4\" and assigns the value \"Value1\", \"Value2\", and \"Value3\" respectively to each row of the new columns. Please note that this is just one way to add multiple new columns at once, and you may need to adjust the code to fit your specific use case. Also, be aware that this function will only work if the number of columns and the number of values match and are in the same order. ')]}"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "covs[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.DataFrame.from_dict(covs, orient='columns')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>chat</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>yn2eWCt</td>\n",
       "      <td>[(human, how do I add multiple new columns in ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>DhelrJT</td>\n",
       "      <td>[(human, \\n Sure, I can do that. What new tech...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>VY7cMKG</td>\n",
       "      <td>[(human, Poly Ether Ether Ketone), (gpt, Poly ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>H1aZlSE</td>\n",
       "      <td>[(human, can you design a referral system simi...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>eL3wWKe</td>\n",
       "      <td>[(human, Why bollywood movies are not that goo...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>570</th>\n",
       "      <td>jIMnwHE</td>\n",
       "      <td>[(human, We will get you work ready with profe...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>571</th>\n",
       "      <td>I0teRKt</td>\n",
       "      <td>[(human, daj mi deset najbitnijih dogadjaja na...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>572</th>\n",
       "      <td>eAcBnIo</td>\n",
       "      <td>[(human, Write an email to a student), (gpt, S...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>573</th>\n",
       "      <td>Ywy0wXP</td>\n",
       "      <td>[(human, 你現在是一名專業的翻譯人員，負責將中文翻譯成英文，除了要將特殊字元移除，並...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>574</th>\n",
       "      <td>HqVGK3f</td>\n",
       "      <td>[(human, for the decimal integers 0 to 90000, ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>575 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "          id                                               chat\n",
       "0    yn2eWCt  [(human, how do I add multiple new columns in ...\n",
       "1    DhelrJT  [(human, \\n Sure, I can do that. What new tech...\n",
       "2    VY7cMKG  [(human, Poly Ether Ether Ketone), (gpt, Poly ...\n",
       "3    H1aZlSE  [(human, can you design a referral system simi...\n",
       "4    eL3wWKe  [(human, Why bollywood movies are not that goo...\n",
       "..       ...                                                ...\n",
       "570  jIMnwHE  [(human, We will get you work ready with profe...\n",
       "571  I0teRKt  [(human, daj mi deset najbitnijih dogadjaja na...\n",
       "572  eAcBnIo  [(human, Write an email to a student), (gpt, S...\n",
       "573  Ywy0wXP  [(human, 你現在是一名專業的翻譯人員，負責將中文翻譯成英文，除了要將特殊字元移除，並...\n",
       "574  HqVGK3f  [(human, for the decimal integers 0 to 90000, ...\n",
       "\n",
       "[575 rows x 2 columns]"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "ds = Dataset.from_pandas(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'id': 'eL3wWKe',\n",
       " 'chat': [['human', 'Why bollywood movies are not that good?'],\n",
       "  ['gpt',\n",
       "   'As an AI language model, I cannot make subjective judgments on the quality of movies. However, I can provide some possible reasons why some people may have the perception that Bollywood movies are not as good as movies from other industries: Lack of originality: Bollywood movies are sometimes criticized for lacking originality, with many films borrowing heavily from Hollywood and other international films. This can make the movies seem predictable and unoriginal.  Overemphasis on song and dance: Bollywood movies are known for their elaborate song and dance numbers, but some viewers may feel that this focus detracts from the story or makes the movie feel too long.  Poor production values: Some Bollywood movies may have lower production values compared to movies from other industries, which can affect the overall quality of the movie.  Cultural differences: Bollywood movies are often made for an Indian audience, and the cultural differences may make it difficult for some viewers to fully appreciate or connect with the movies.  It\\'s worth noting, however, that these are just some possible reasons why some people may not enjoy Bollywood movies. Ultimately, whether a movie is \"good\" or not is subjective and depends on personal preferences and tastes. '],\n",
       "  ['human',\n",
       "   'I want to do a road trip from Pune to Gujarat. Me and my wife will be travelling and we dont prefer very long driving sessions. Can you suggest a plan starting from Thursday early morning and ending in Pune on Sunday late night.'],\n",
       "  ['gpt',\n",
       "   \"Sure, here's a suggested itinerary for a road trip from Pune to Gujarat that takes into account your preferences for shorter driving sessions: Day 1 (Thursday): Pune to Nashik (Approx. 210 km, 4-5 hours drive) Start early in the morning from Pune and drive towards Nashik. On the way, you can stop at Bhimashankar Wildlife Sanctuary or Shani Shingnapur if interested. After reaching Nashik, you can explore the city's famous vineyards or visit the ancient Trimbakeshwar Shiva Temple. Spend the night in Nashik. Day 2 (Friday): Nashik to Saputara (Approx. 115 km, 3-4 hours drive) Leave early from Nashik and drive towards Saputara, a beautiful hill station in Gujarat. On the way, you can stop at Vani, which is famous for its beautiful flora and fauna. In Saputara, you can enjoy boating, trekking or visit the famous Sunset Point. Spend the night in Saputara. Day 3 (Saturday): Saputara to Ahmedabad (Approx. 400 km, 6-7 hours drive) Leave early from Saputara and drive towards Ahmedabad, the largest city in Gujarat. On the way, you can stop at Surat and visit the beautiful beaches and eat some local street food. In Ahmedabad, you can explore the Sabarmati Ashram, Kankaria Lake or take a heritage walk through the old city. Spend the night in Ahmedabad. Day 4 (Sunday): Ahmedabad to Pune (Approx. 670 km, 10-11 hours drive) Start early from Ahmedabad and drive towards Pune. On the way, you can stop at the famous Statue of Unity, the world's tallest statue, and spend some time there. Reach Pune by late night. Note: The driving times and distances mentioned are approximate and may vary depending on the route you take and the traffic conditions. Also, it's always a good idea to check the road and weather conditions before embarking on a road trip. \"]]}"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ds[4]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  4.77ba/s]\n",
      "Upload 1 LFS files: 100%|██████████| 1/1 [00:01<00:00,  1.11s/it]\n",
      "Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:02<00:00,  2.05s/it]\n"
     ]
    }
   ],
   "source": [
    "ds.push_to_hub('liyucheng/sharegpt-500')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'/user/HS502/yl02706/LLMs_Memorize/results'"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%pwd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/user/HS502/yl02706/LLMs_Memorize\n"
     ]
    }
   ],
   "source": [
    "%cd .."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "UsageError: Line magic function `%ssh` not found.\n"
     ]
    }
   ],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/user/HS502/yl02706/.conda/envs/lyc/lib/python3.8/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n",
      "2023-06-04 12:37:59.862541: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
      "To enable the following instructions: FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
      "2023-06-04 12:38:06.139071: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n"
     ]
    }
   ],
   "source": [
    "with open('/vol/research/lyc/llm_memorize/news/NewsContextManager_sent.pkl', 'rb') as f:\n",
    "    news = pickle.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<context_manager.NewsContextManager at 0x7f19e84a3e50>"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "news"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Members of Ukraine\\'s Armed Forces 80th Separate Air Assault Brigade at their position near the frontline city of Bakhmut, eastern Ukraine, last week\\n\\nMore than a dozen European Union member states have agreed to supply Ukraine with at least one million artillery shells over the next year.\\n\\nThe plan, worth €2bn in total, was agreed in Brussels on Monday.\\n\\nUkraine had told the EU it needed 350,000 shells a month to hold back advancing Russian troops and launch a counter-offensive this year.\\n\\nUkrainian Foreign Minister Dmytro Kuleba hailed the \"game-changing decision\" from the EU.\\n\\n\"Exactly what is needed,\" he wrote on Twitter. \"Urgent delivery and sustainable joint procurement.\"\\n\\nThe deal comes as Russia grows concerned at a Ukrainian counter-offensive near Bakhmut or in southern Ukraine, according to a report from think tank Institute for the Study of War.\\n\\nMeanwhile, Russia\\'s own new year offensive \"may be nearing culmination\", the ISW added.\\n\\nDefence and foreign ministers from 17 EU countries and Norway signed the ammunition agreement.\\n\\nThe first part of the plan commits €1bn ($1.06bn; £863m) of shared funding for EU states to draw upon their existing stockpiles, with the hope of sending this to Ukraine by the end of May.\\n\\nThe second part of the deal would see a further €1bn used to jointly order 155mm shells for Ukraine - the most sought-after artillery rounds.\\n\\nThe EU is hoping this joint order will incentivise European defence firms to increase their output, with the hope that contracts can be signed by the start of September.\\n\\nThe deal means each country will have to share details of their ammunition stockpiles - something normally kept secret.\\n\\nCurrent ammunition production in Europe is said to be lagging behind the levels Ukraine insists it needs to fight Russia.\\n\\nThere are also questions about how much EU countries can share from their own stockpiles without leaving themselves vulnerable.\\n\\n\"Increasing industrial capacity is essential,\" EU internal market commissioner Thierry Breton said during a visit to French arms industry company Nexter on Monday.\\n\\nEU foreign policy chief Josep Borrell recently warned that the bloc had to supply the artillery shells Ukraine needed, or face the prospect of Ukraine losing the war with Russia.\\n\\nHe said Russian forces were firing about 50,000 rounds of artillery each day and that Ukraine\\'s supplies must be lifted to the same level.\\n\\nHungary, which has not sent ammunition to Ukraine and has often threatened to veto sanctions against Russia, abstained from Monday\\'s vote in Brussels but foreign minister Peter Szijjarto said it would \"not prevent others from doing what they want\".\\n\\nNorway also announced on Monday that it had delivered eight German-made Leopard II tanks to Ukraine to be used in a possible Ukrainian counter-offensive against Russia this spring.\\n\\nSeparately, the US is authorising another $350m in military aid for Ukraine, Secretary of State Antony Blinken said. It includes ammunition for Himars missile launchers, 155mm shells, and howitzer cannon.\\n\\nLargest donors of military aid to Ukraine by country.'"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "news.articles[0].sections[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "context = re.sub(r\"\\n+\", \" \", news.articles[0].sections[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "===================================BUG REPORT===================================\n",
      "Welcome to bitsandbytes. For bug reports, please run\n",
      "\n",
      "python -m bitsandbytes\n",
      "\n",
      " and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues\n",
      "================================================================================\n",
      "bin /user/HS502/yl02706/.conda/envs/lyc/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cpu.so\n",
      "/user/HS502/yl02706/.conda/envs/lyc/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32\n",
      "CUDA SETUP: Loading binary /user/HS502/yl02706/.conda/envs/lyc/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cpu.so...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/user/HS502/yl02706/.conda/envs/lyc/lib/python3.8/site-packages/bitsandbytes/cextension.py:34: UserWarning: The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.\n",
      "  warn(\"The installed version of bitsandbytes was compiled without GPU support. \"\n"
     ]
    }
   ],
   "source": [
    "with open('/vol/research/lyc/llm_memorize/answer_summarisation_arxiv_0.35.pkl', 'rb') as f:\n",
    "    d = pickle.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'self-info-phrase': {'bleu': 0.2344040307581276,\n",
       "  'precisions': [0.5759471141622171,\n",
       "   0.2963437772421927,\n",
       "   0.18281015669442002,\n",
       "   0.12042977103218067],\n",
       "  'brevity_penalty': 0.9467535132081313,\n",
       "  'length_ratio': 0.9481220770454655,\n",
       "  'translation_length': 19665,\n",
       "  'reference_length': 20741,\n",
       "  'meteor': 0.44942608951729757,\n",
       "  'rouge1': 0.5429776341564643,\n",
       "  'rouge2': 0.2787215774333569,\n",
       "  'rougeL': 0.3869108258582408,\n",
       "  'rougeLsum': 0.38657165239318936,\n",
       "  'bertscore_precision': 0.9063382719348116,\n",
       "  'bertscore_recall': 0.900291630407659,\n",
       "  'bertscore_f1': 0.9032156514685329},\n",
       " 'Random-phrase': {'bleu': 0.17744827793685164,\n",
       "  'precisions': [0.5503884275381731,\n",
       "   0.25366196421815035,\n",
       "   0.13764519823308066,\n",
       "   0.08050404446156385],\n",
       "  'brevity_penalty': 0.8947381114356142,\n",
       "  'length_ratio': 0.8999083940022178,\n",
       "  'translation_length': 18665,\n",
       "  'reference_length': 20741,\n",
       "  'meteor': 0.4084892160455123,\n",
       "  'rouge1': 0.5116485251462238,\n",
       "  'rouge2': 0.2399070593008587,\n",
       "  'rougeL': 0.35334592470554504,\n",
       "  'rougeLsum': 0.3531436816128809,\n",
       "  'bertscore_precision': 0.9013194151040984,\n",
       "  'bertscore_recall': 0.8938641093853044,\n",
       "  'bertscore_f1': 0.8974812103480827},\n",
       " 'no2-phrase': {'bleu': 0.3070706729302389,\n",
       "  'precisions': [0.6059703894431928,\n",
       "   0.35829007138221936,\n",
       "   0.2498364409551848,\n",
       "   0.1853562005277045],\n",
       "  'brevity_penalty': 0.9697305346757852,\n",
       "  'length_ratio': 0.9701795472287276,\n",
       "  'translation_length': 12428,\n",
       "  'reference_length': 12810,\n",
       "  'meteor': 0.4945796528722132,\n",
       "  'rouge1': 0.5801524481851765,\n",
       "  'rouge2': 0.3412169064142398,\n",
       "  'rougeL': 0.4394513698947761,\n",
       "  'rougeLsum': 0.43945176655917995,\n",
       "  'bertscore_precision': 0.9131651520729065,\n",
       "  'bertscore_recall': 0.9119005638360977,\n",
       "  'bertscore_f1': 0.9124281704425812}}"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "d.metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "lyc",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.16"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
