{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## load original documents & HP-related index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1165029"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def read_file_as_documents(file_path):\n",
    "    with open(file_path, 'r', encoding='utf-8') as file:\n",
    "        content = file.read()\n",
    "    # Splitting the content by newlines\n",
    "    documents = content.split('\\n')\n",
    "    # Removing empty strings if any\n",
    "    documents = [doc for doc in documents if doc.strip() != '']\n",
    "    return documents\n",
    "\n",
    "# obtained from huggingface wikitext 103\n",
    "file_path = 'huggingface_wikitext_wikitext-103-v1_train_en_en_document.txt'\n",
    "documents = read_file_as_documents(file_path)\n",
    "len(documents)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "4358\n"
     ]
    }
   ],
   "source": [
    "\n",
    "import json\n",
    "file_path = 'HP_related_index_for_wikitext-103.json'\n",
    "\n",
    "# Open the JSON file\n",
    "with open(file_path, \"r\") as f:\n",
    "  # Parse the JSON data and store it in a variable\n",
    "  data = json.load(f)\n",
    "\n",
    "# Access data using dictionary \n",
    "print(len(data[\"HP-related-index\"]))  \n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## save the related/unrelated texts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def save_lines_to_file(document, filename):\n",
    "  with open(filename, 'w', encoding='utf-8') as file:\n",
    "    lines = document    \n",
    "    # Write each line with a newline character at the end\n",
    "    for line in lines:\n",
    "      file.write(f\"{line}\\n\")\n",
    "\n",
    "def get_subset_by_index(data_list, index_list):\n",
    "  # Check if indices are within valid range\n",
    "  if any(index < 0 or index >= len(data_list) for index in index_list):\n",
    "    raise IndexError(\"Invalid index provided. Indices must be within the range of the list.\")\n",
    "\n",
    "  # Use list comprehension for concise selection\n",
    "  return [data_list[i] for i in index_list]\n",
    "\n",
    "def get_difference(list_a, list_b):\n",
    "  return list(set(list_a) - set(list_b))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4358"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fname= \"hp-wikitext-103.txt\"\n",
    "\n",
    "subset_doc = get_subset_by_index(documents,data[\"HP-related-index\"] )\n",
    "save_lines_to_file(subset_doc,fname )\n",
    "len(subset_doc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1160671\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "1160671"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "unrelated_index= get_difference(range(len(documents)), data[\"HP-related-index\"])\n",
    "print(len(unrelated_index))\n",
    "fname= \"hp-unrelated-wikitext-103.txt\"\n",
    "\n",
    "subset_doc = get_subset_by_index(documents, unrelated_index)\n",
    "save_lines_to_file(subset_doc,fname )\n",
    "len(subset_doc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "8716"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## verify \n",
    "fname= \"hp-wikitext-103.txt\"\n",
    "loaded_subset_doc = read_file_as_documents(fname)\n",
    "len(loaded_subset_doc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3486371"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## verify \n",
    "fname= \"hp-unrelated-wikitext-103.txt\"\n",
    "loaded_subset_doc = read_file_as_documents(fname)\n",
    "len(loaded_subset_doc)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## examine the related texts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<pre style='white-space: pre-wrap; overflow-x: auto;'>In Philosopher 's Stone , Harry re @-@ enters the wizarding world at age 11 and enrols in Hogwarts School of Witchcraft and Wizardry . He makes friends with fellow students Ron Weasley and Hermione Granger , and is mentored by the school 's headmaster , Albus Dumbledore . He also meets Professor Severus Snape , who intensely dislikes and bullies him . Harry fights Voldemort several times while at school , as the wizard tries to regain a physical form . In Goblet of Fire , Harry is mysteriously entered in a dangerous magical competition called the Triwizard Tournament , which he discovers is a trap designed to allow the return of Lord Voldemort to full strength . During Order of the Phoenix , Harry and several of his friends face off against Voldemort 's Death Eaters , a group of Dark witches and wizards , and narrowly defeat them . In Half @-@ Blood Prince , Harry learns that Voldemort has divided his soul into several parts , creating \" horcruxes \" from various unknown objects to contain them ; in this way he has ensured his immortality as long as at least one of the horcruxes still exists . Two of these had already been destroyed , one a diary destroyed by Harry in the events of Chamber of Secrets and one a ring destroyed by Dumbledore shortly before the events of Half @-@ Blood Prince . Dumbledore takes Harry along in the attempt to destroy a third horcrux contained in a locket . However the horcrux has been taken by an unknown wizard , and upon their return Dumbledore is ambushed and disarmed by Draco Malfoy who cannot bring himself to kill him , then killed by Snape .</pre>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<pre style='white-space: pre-wrap; overflow-x: auto;'>Luna , Ron , Ginny , and Neville join them in the forest and all six fly to the Ministry on <unk> , expecting to find and rescue Sirius . Once in the Department of Mysteries , Harry realises that his vision was falsely planted by Voldemort ; however , he finds a glass sphere that bears his and the Dark Lord 's names . Death Eaters led by Lucius Malfoy attack in order to capture the sphere , which is a recording of a prophecy concerning Harry and Lord Voldemort , which is revealed to be the object Voldemort has been trying to obtain for the whole year , the Dark Lord believing that there was something he missed when he first heard the prophecy . Lucius explains that only the subjects of the prophecies , in this case Harry or Voldemort , can safely remove them from the shelves . Harry and his friends , soon joined by members of the Order , enter a battle with the Death Eaters . Amidst the chaos , Bellatrix Lestrange kills Sirius and Harry faces Voldemort . Voldemort attempts to kill Harry , but Dumbledore prevents him and <unk> fights the Dark Lord to a stalemate . In the midst of the duel , Voldemort unsuccessfully tries to possess Harry in an attempt to get Dumbledore to kill the boy . Dumbledore does not do so and Voldemort escapes just as Cornelius Fudge appears , finally faced with first @-@ hand evidence that Voldemort has truly returned .</pre>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<pre style='white-space: pre-wrap; overflow-x: auto;'>During another summer with his Aunt Petunia and Uncle Vernon , Harry Potter and Dudley are attacked by <unk> . After using magic to save Dudley and himself , Harry is expelled from Hogwarts , but the decision is later rescinded . Harry is whisked off by a group of wizards to Number 12 , Grimmauld Place , the home of his godfather , Sirius Black . The house also serves as the headquarters of the Order of the Phoenix , of which Mr. and Mrs. Weasley , Remus Lupin , Mad @-@ Eye Moody , and Sirius are members . Ron Weasley and Hermione Granger explain that the Order of the Phoenix is a secret organisation led by Hogwarts headmaster Albus Dumbledore , dedicated to fighting Lord Voldemort and his followers , the Death Eaters . From the members of the Order , Harry and the others learn that Voldemort is seeking an object that he did not have prior to his first defeat , and assume this object to be a weapon of some sort . Harry learns that the Ministry of Magic , led by Cornelius Fudge , is refusing to acknowledge Voldemort 's return because of the trouble that doing so would cause , and has been running a smear campaign against him and Dumbledore .</pre>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<pre style='white-space: pre-wrap; overflow-x: auto;'>Harry , Ron , Ginny , and Lockhart return to the main castle and reunite with McGonagall , Dumbledore , and Mr. and Mrs. Weasley . Ginny , whose <unk> by Voldemort caused all of the petrification and troubles over the course of the year , is given a reprieve by Dumbledore , who reasons that greater wizards have been duped by Voldemort before and takes great interest in the qualities of the diary , which Harry gives to him . Lucius Malfoy bursts in after this meeting , demanding to know why and how Dumbledore has returned to the school and accompanied by Dobby , revealing the family to whom he is enslaved . The house @-@ elf also provides Harry with unspoken cues regarding the diary ’ s ownership : While it was Tom Riddle ’ s , it had been in the Malfoys ’ possession , and Harry returns it , devising a scenario involving his own sock that frees Dobby from the Malfoys ’ employment . The petrified students are cured , the end @-@ of @-@ year exams are cancelled ( much to Hermione ’ s chagrin ) , Hagrid comes back in the middle of the final feast , and Harry returns to Privet Drive in higher spirits than he last left it .</pre>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<pre style='white-space: pre-wrap; overflow-x: auto;'>Scrimgeour arrives at the Burrow with Albus Dumbledore 's Will and distributes three items to Ron , Hermione , and Harry . Ron receives Dumbledore 's <unk> , Hermione a copy of The Tales of Beedle the Bard , and Harry the first Golden Snitch that he ever caught in a Quidditch match . Scrimgeour reveals that Harry was also bequeathed the Sword of Godric Gryffindor , but the Minister states that the sword was not Dumbledore 's to pass on and , in any case , is missing .</pre>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from IPython.display import HTML\n",
    "\n",
    "def display_string(long_string):\n",
    "    html_string = \"<pre style='white-space: pre-wrap; overflow-x: auto;'>\" + long_string + \"</pre>\"\n",
    "    display(HTML(html_string))\n",
    "\n",
    "for  indice in data[\"HP-related-index\"][:5]:\n",
    "    display_string(documents[indice])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<pre style='white-space: pre-wrap; overflow-x: auto;'>Typhoon Longwang , weakened from its passage of Taiwan , struck mainland China late on October 2 as a low @-@ end typhoon . Off the coast of <unk> County , gusts reached 164 km / h ( 102 mph ) on <unk> Island . Gusts onshore peaked at 137 km / h ( 85 mph ) in Changle within Fuzhou City . Though the storm brought typhoon @-@ force winds , they were mostly confined to coastal areas and its greatest impacts resulted from torrential rains . Much of Fuzhou City experienced over 200 mm ( 7 @.@ 9 in ) of rain , with a maxima of 332 mm ( 13 @.@ 1 in ) in Changle . Of that total , 316 mm ( 12 @.@ 4 in ) fell in a 12 ‑ hour span . Furthermore , one hour accumulations peaked at 152 mm ( 6 @.@ 0 in ) . These rains were described as a 1 @-@ in @-@ 100 year event . Zhejiang and Jiangxi Provinces also experienced heavy rains ; 292 mm ( 11 @.@ 5 in ) fell in Taizhou , Zhejiang while 128 mm ( 5 @.@ 0 in ) was recorded in <unk> County , Jiangxi .</pre>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<pre style='white-space: pre-wrap; overflow-x: auto;'>Reviewers commented on the differences between Free and Office of Strategic Influence . Rosenberg described Free as \" heavier , darker , more experimental and a lot more consistent \" than the first album . Stewart Mason of Allmusic described the album as \" downright commercial in a way that none of the duo 's previous projects have been \" . He regarded the album as less like the \" standard @-@ issue epic metal \" of the first album and more like \" Evanescence 's gothy metal @-@ pop crossed with late @-@ era Radiohead 's fondness for electronic interference \" . He considered the songwriting as \" catchy enough that it 's not unthinkable that unadventurous rock radio programmers could take a shine to the title track or ' Go ' \" , although noted that \" Fates Warning and Dream Theater fans might be less impressed \" . Rosenberg considers Free as musically \" a more progressive version of Nine Inch Nails . Instead of creating their darkness with angst and depression , OSI creates their darkness with sheer indifference . \" Begrand compared the album to Head Control System , \" in that it tends to stray from the typical metal template in an attempt at something a little more electronic influenced \" . Although not considering Free to be as \" enthralling \" as Head Control System , he noted that the album \" still has its moments \" .</pre>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<pre style='white-space: pre-wrap; overflow-x: auto;'>Harry Potter and the Philosopher 's Stone , along with the rest of the Harry Potter series , has been attacked by several religious groups and banned in some countries because of accusations that the novels promote witchcraft , but other religious commentators have written that the book exemplifies important viewpoints , including the power of self @-@ sacrifice and the ways in which people 's decisions shape their personalities . The series has been used as a source of object lessons in educational techniques , sociological analysis and marketing .</pre>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<pre style='white-space: pre-wrap; overflow-x: auto;'>In the third game of the 1992 season , Majkowski injured a ligament in his ankle against the Cincinnati Bengals , an injury severe enough that he would be out for four weeks . Favre replaced Majkowski for the remainder of the contest . Favre fumbled four times during the course of the game , a performance poor enough that the crowd chanted for Favre to be removed in favor of another Packers backup quarterback at the time , Ty Detmer . However , down 23 – 17 with 1 : 07 left in the game , the Packers started an offensive series on their own 8 @-@ yard line . Favre then completed a 42 yard pass to Sterling Sharpe . Three plays later , Favre threw the game – winning touchdown pass to <unk> Taylor with 13 seconds remaining .</pre>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<pre style='white-space: pre-wrap; overflow-x: auto;'>The first new series of Doctor Who featured eight scripts by Davies ; the remainder were allocated to experienced dramatists and writers for the show 's ancillary releases : Steven Moffat penned a two @-@ episode story , and Mark Gatiss , Robert Shearman , and Paul Cornell each wrote one script . Davies also approached his old friend Paul Abbott and Harry Potter author J. K. Rowling to write for the series ; both declined due to existing commitments . Shortly after he secured writers for the show , Davies stated that he had no intention of approaching writers from the old series ; the only writer he would have wished to work with was Holmes , who died in May 1986 .</pre>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<pre style='white-space: pre-wrap; overflow-x: auto;'>\" Extraordinary Merry Christmas \" is the ninth episode and mid @-@ season finale of the third season of the American musical television series Glee , and the fifty @-@ third overall . Written by Marti Noxon and directed by Matthew Morrison , the episode aired on Fox in the United States on December 13 , 2011 , and features the members of New Directions starring in a black @-@ and @-@ white Christmas television special that is presented within the episode itself .</pre>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<pre style='white-space: pre-wrap; overflow-x: auto;'>Stiller is also the only member of this group to have appeared in a Brat Pack film ( Fresh Horses ) .</pre>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<pre style='white-space: pre-wrap; overflow-x: auto;'>The front view of the building beside the River Thames with St Paul 's Cathedral in the background and the Millennium Bridge on the right is occasionally seen in popular media such as in the BBC News 60 @-@ second countdown as well as in an early scene of the 2005 movie , The Constant Gardener and in the 2009 film Harry Potter and the Half @-@ Blood Prince .</pre>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<pre style='white-space: pre-wrap; overflow-x: auto;'>In the words of Kelley , by 1959 , Sinatra was \" not simply the leader of the Rat Pack \" but had \" assumed the position of il padrone in Hollywood \" . He was asked by 20th Century Fox to be the master of ceremonies at a luncheon attended by President Nikita Khrushchev on September 19 , 1959 . Nice ' n ' Easy , a collection of ballads , topped the Billboard chart in October 1960 and remained in the charts for 86 weeks , winning critical plaudits . Granata noted the \" lifelike ambient sound \" quality of Nice and Easy , the perfection in the stereo balance , and the \" bold , bright and snappy \" sound of the band . He highlighted the \" close , warm and sharp \" feel of Sinatra 's voice , particularly on the songs \" September in the Rain \" , \" I Concentrate on You \" , and \" My Blue Heaven \" .</pre>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<pre style='white-space: pre-wrap; overflow-x: auto;'>= = Packaging = =</pre>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "for indice in data[\"HP-related-index\"][-10:]:\n",
    "    display_string(documents[indice])\n",
    "\n",
    "    "
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "mmlu",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
