{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/XXXX-30/XXXX-29/XXXX-31/scratch/XXXX-1/frontier_conda_simpleton/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "from datasets import load_from_disk"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['id', 'text', 'added', 'created', 'source'],\n",
       "    num_rows: 13095416\n",
       "})"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ds = load_from_disk(\"/XXXX-30/XXXX-29/XXXX-31/scratch/XXXX-1/llm-pretraining-root/output/data/raw_dolma_v1_6-sample/train\")\n",
    "# ds = load_from_disk(\"/XXXX-30/XXXX-29/XXXX-31/scratch/XXXX-1/llm-pretraining-root/output/data/genqa_all_shuffled\")\n",
    "ds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3.8"
      ]
     },
     "execution_count": 74,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 10 * 0.05\n",
    "4 * 0.95"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1216"
      ]
     },
     "execution_count": 75,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# mbsz = 10\n",
    "# mbsz = 9.5\n",
    "mbsz = 3.8\n",
    "# mbsz = 0.5\n",
    "# num_nodes = 16\n",
    "num_nodes = 40\n",
    "num_gpus_per_node = 8\n",
    "\n",
    "# world_batch_size = mbsz * num_nodes * num_gpus_per_node\n",
    "world_batch_size = int(mbsz * num_nodes * num_gpus_per_node)\n",
    "world_batch_size"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[3232128, 3232129, 3232130]\n",
      "{'id': ['https://rnbmag.tv/black-girls-rock/', 'http://www.readabstracts.com/Consumer-news-and-advice/Auto-test-1993-Auto-report-93.html', 'https://informvest.net/skills-needed-to-be-a-successful-entrepreneur/'], 'text': ['BLACK GIRLS ROCK! Inc. is 501(c)3 non-profit youth empowerment and mentoring organization established to promote the arts for young women of color, as well as to encourage dialogue and analysis of the ways women of color are portrayed in the media.\\nSince 2006, BLACK GIRLS ROCK! has been dedicated to the healthy development of young women and girls. BLACK GIRLS ROCK! seeks to build the self-esteem and self-worth of young women of color by changing their outlook on life, broadening their horizons, and helping them to empower themselves. For the past five years, we have enjoyed the opportunity to enrich the lives of girls aged 12 to 17 years old through mentorship, arts education, cultural exploration and public service. At BLACK GIRLS ROCK!, young women are offered access to enrichment programs and opportunities that place special emphasis on personal development through the arts and cooperative learning.\\nBLACK GIRLS ROCK! Inc. continues to partner with BET Networks in the worldwide broadcast of BLACK GIRLS ROCK! The event continues to be a historic and monumental show highlighting the accomplishments of exceptional women of color who have made outstanding contributions in their careers, and who stand as inspirational and positive role models in their communities.\\nDon’t miss this year’s BLACK GIRLS ROCK! on Sunday, November 4, 2012. For more information about Black Girls Rock, Inc., check out their website.\\nPlus Size Style – Embrасе Yоur Shape With Thеѕе Pluѕ Sizе Fаѕhiоn...', 'Used automobiles, sport-utility vehicles, trucks, and vans by Dodge, Eagle, Ford, Geo, GMC and Honda for the model years 1990 - 1998 are evaluated for performance. Best buys are noted. Recall history and trouble spots are discussed.\\nSeventeen automobiles, the Dodge Caravan minivan and Ford Explorer truck are evaluated for accommodations, ride and drive. Good points, bad points, an overall conclusion and specifications for each 1993 model included are provided.', 'Well, some people are born with the skills of becoming a successful entrepreneur and others need to work hard to achieve those skills that can help them in making a business successful. These skills are mainly determined into two sets one is a hard skill and the other is a soft skill. The skills that are related to hard skills are financial planning, marketing, and accounting, these are required to manage or run a business. Whereas soft skills help scale up your business like decision-making, communication, and problem-solving. An entrepreneur needs to possess all these skills to get success.\\nBusiness management skills are one of the most important traits of a successful entrepreneur that allows them to make their business meet all the goals and run successfully. This skill will help the entrepreneurs in understanding every aspect of their business and they will be able to manage all the departments of their business effectively. Delegation and multitasking are part of business management skills and offer better productivity in the workplace.\\n“Entrepreneurs are not driven by fear; they are driven by the idea to create impact,” said Bhavish Aggarwal, co-founder of Ola Cabs, an Indian multinational ridesharing company offering services that include vehicle for hire and food delivery.\\nWell, every successful entrepreneur knows the value of effective communication skills and how they can help in building a strong relationship with clients, customers, employees, and potential stakeholders. An entrepreneur uses various methods to provide effective communication like emails, messages, meetings, or face-to-face conversations. Along with communication active listening is also crucial that allows you to understand the emotions and feelings of others. When you offer active listening to your employees then it will make them feel valued and respected in the organization.\\nEffective communication is essential, both in the office and in life. Extraordinary leaders like Neil Mitchell and others make sure they are heard and understood, but they also know the importance of listening. Over the years Neil Mitchell Players Health has been successful in applying his out-of-the-box thinking to client problems and developing unique solutions that create value for others.\\nRisk-taking skills require the ability to provide calculated strategy and understanding of the risk, it is essential for being a successful entrepreneur. Before taking any risk entrepreneurs will tend to do deep research work and calculations to find out the end results of those risks and then take the decision. A good entrepreneur is well aware of how taking risks can make or break the business.\\nNetworking skills can offer great opportunities for future business. So, many successful entrepreneurs focus on building strong networks that allow them to connect with different successful entrepreneurs that can help them improve their skills and understand the important values for running a business successfully.'], 'added': ['2023-03-23T14:56:02.290448+00:00', '2023-03-23T14:57:35.168595+00:00', '2023-03-23T14:58:25.159324+00:00'], 'created': ['2022-10-05T01:26:22Z', '2022-10-05T01:28:48Z', '2022-10-05T03:14:52Z'], 'source': ['common-crawl', 'common-crawl', 'common-crawl']}\n",
      "[1497, 464, 3002]\n"
     ]
    }
   ],
   "source": [
    "# target_step_index = 2500\n",
    "# target_step_index = 2600\n",
    "# target_step_index = 2630\n",
    "# target_step_index = 2640\n",
    "# target_step_index = 2650\n",
    "# target_step_index = 2700\n",
    "# target_step_index = 2800\n",
    "target_step_index = 2658\n",
    "# target_step_index = 2651\n",
    "# target_step_index = 2657\n",
    "\n",
    "# for i in range(2600, 2700):\n",
    "#     batch_indices = list(range(i * world_batch_size, (i + 1) * world_batch_size))\n",
    "#     batch = ds.select(batch_indices)\n",
    "#     # print(i, set(batch[:100][\"source\"]))\n",
    "#     # print(i, set(batch[:100][\"added\"]))\n",
    "#     # print(i, set(batch[:100][\"created\"]))\n",
    "#     print(i, set(batch[:100][\"template\"]))\n",
    "#     print(i, set(batch[:100][\"category\"]))\n",
    "\n",
    "batch_indices = list(range(target_step_index * world_batch_size, (target_step_index + 1) * world_batch_size))\n",
    "print(batch_indices[:3])\n",
    "\n",
    "batch = ds.select(batch_indices)\n",
    "\n",
    "string_lengths = [len(s) for s in batch[\"text\"]]\n",
    "\n",
    "print(batch[:3])\n",
    "print(string_lengths[:3])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['https://www.tabletmag.com/sections/community/articles/walk-like-a-man', 'https://www.theaccountant-online.com/news/newshoogervorst-backs-latin-america-standard-setters-body/', 'http://prabhupadaconnect.com/PhotoAlbum726.html', 'https://www.adpi.org/find-a-supplier/hoogwegt-u-s-inc/', 'https://dunbarry.com/blog/', 'https://forum.zeovit.com/forum/equipment-forums/pumps-plumbing-sump-design/3268-rio-hf-13-16-tubing', 'https://argent-casino.com/the-eu-wants-smartphones-and-tablets-to-last-longer-heres-why-you-should-care/', 'https://huffingtonwire.com/car-and-automobile/top-10-luxury-suvs/', 'https://www.ncfr.org/events/ncfr-webinars/how-be-anti-racist-researcher', 'https://ishabellemanalo.wordpress.com/2010/09/20/', 'https://thatasianlookingchick.com/2022/02/06/my-cat-died-over-six-months-ago-and-im-not-over-it-what-not-to-say-to-someone-grieving-the-loss-of-a-pet/', 'https://beyo11.wordpress.com/', 'https://www.commonsense.news/p/the-new-founders-america-needs?utm_source=substack&utm_medium=email', 'https://www.nbcsports.com/boston/red-sox/mlb-rumors-red-sox-designate-jackie-bradley-jr-assignment?b', 'https://www.nzssda.org.nz/stainless-steel-world-asia-2022-26-27-october-singapore/', 'https://www.capecodtimes.com/story/news/1999/10/23/right-whale-death-blow-to/51019563007/', 'https://www.gamingalexandria.com/wp/2022/02/video-game-notables-2022-the-gaming-alexandria-game-hall-of-fame/', 'https://eroticon.co/speaker-session-submission/', 'https://www.starpool.com/careers', 'https://www.wilsoninsurancebrokers.com/resource-center/insurance/what-does-boat-insurance-cover']\n",
      "['common-crawl', 'common-crawl', 'common-crawl', 'common-crawl', 'common-crawl', 'common-crawl', 'common-crawl', 'common-crawl', 'common-crawl', 'common-crawl', 'common-crawl', 'common-crawl', 'common-crawl', 'common-crawl', 'common-crawl', 'common-crawl', 'common-crawl', 'common-crawl', 'common-crawl', 'common-crawl']\n"
     ]
    }
   ],
   "source": [
    "# start = 1093060 # c4 to cc change\n",
    "start = 3238000\n",
    "print(ds.select(range(start,start+20))[\"id\"])\n",
    "print(ds.select(range(start,start+20))[\"source\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "898.8486842105264"
      ]
     },
     "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# start / world_batch_size"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['id', 'text', 'added', 'created', 'source'],\n",
       "    num_rows: 13095416\n",
       "})"
      ]
     },
     "execution_count": 95,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ds = load_from_disk(\"/XXXX-30/XXXX-29/XXXX-31/scratch/XXXX-1/llm-pretraining-root/output/data/raw_dolma_v1_6-sample/train\")\n",
    "ds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Flattening the indices (num_proc=32): 100%|██████████| 13095416/13095416 [00:56<00:00, 230389.74 examples/s]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['id', 'text', 'added', 'created', 'source'],\n",
       "    num_rows: 13095416\n",
       "})"
      ]
     },
     "execution_count": 96,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "shuffled_ds = ds.shuffle(seed=42)\n",
    "flat_shuffled_ds = shuffled_ds.flatten_indices(num_proc=32)\n",
    "flat_shuffled_ds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Saving the dataset (73/73 shards): 100%|██████████| 13095416/13095416 [00:45<00:00, 288773.87 examples/s]\n"
     ]
    }
   ],
   "source": [
    "flat_shuffled_ds.save_to_disk(\"/XXXX-30/XXXX-29/XXXX-31/scratch/XXXX-1/llm-pretraining-root/output/data/raw_dolma_v1_6-sample/train_shuffled\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {},
   "outputs": [],
   "source": [
    "ds = load_from_disk(\"/XXXX-30/XXXX-29/XXXX-31/scratch/XXXX-1/llm-pretraining-root/output/data/raw_dolma_v1_6-sample/train\")\n",
    "shuff_ds = load_from_disk(\"/XXXX-30/XXXX-29/XXXX-31/scratch/XXXX-1/llm-pretraining-root/output/data/raw_dolma_v1_6-sample/train_shuffled\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['e95bdc3748368f8aff7e6f14e84e38ef0cec38e8',\n",
       " '02c98cf97f31cf4e42f9cf9ebba879bb847ac8e3',\n",
       " '042baa35a67a866347693ea501a82b1e252af646',\n",
       " '61f0f8af384163a4db1ba02f8b04f6a9a16aa9df',\n",
       " 'e29fec239360b4cd0e4c3d599b4f035044759e5e',\n",
       " '6891ce767893a54e71a1a55d572048018fd3562c',\n",
       " '0d4c789d96239e9508178b6059ec892e15e7e8b7',\n",
       " '94f70b92dcc58c75352ea2a0047a4e88ec2ad52f',\n",
       " 'ec7869bfbdd98b66f0fbb392f4c8328e9e5081ac',\n",
       " '9fecdea14f9b06046c03838a59b95d83c6080337',\n",
       " '7fc3f2c79c61a6b3bcd7beb8b5357eb02f5292a5',\n",
       " '4627b6a99fd151387d2619addaa5eb9c89a67be0',\n",
       " 'db89f7306c86e8c39561e3796bbfa5cf321b4a75',\n",
       " 'da4cfbdcb6db7f909c5c4d6e28d3b918a190af6e',\n",
       " 'cce796f71bf12ab88dac0444f17a0fd410f0bff0',\n",
       " 'e5d2594da352d935984d1b2194c3e5ebe6a7e1a2',\n",
       " '9edbe5b097aa9b55cf8acf6a156456a408f78470',\n",
       " 'dcf731f9fac11bb97519a8a1294814b2d14e8338',\n",
       " '383ae34678253f0f7234fc36b46cf6be893934e4',\n",
       " '6bc20703b15b2085dae1142f361b84682b671a79',\n",
       " 'b43167f5a4fbf5a155a99adfbc63ac978f05275b',\n",
       " 'c69df1e7168a586ebdb721333008a25b5bbfadee',\n",
       " 'ceba1a066dee74194df51ded75115fe28a6ca337',\n",
       " '5bac49adc314db7397204799e6bef69f33845af6',\n",
       " 'b73eae64fa3a3d01a12b0ff723444406ed8439e0',\n",
       " '386a91d1a6f9aada5baecc2c328432f429f6349d',\n",
       " 'cd8ee2b4dce44b0a088f9f32d6c983afd4bc1384',\n",
       " 'b08410e076d36b4388e7959af0f4fa46771e8652',\n",
       " '1d1ade587d57b735a0ce10372e31b8f7b7df3427',\n",
       " 'c5d11a52f67649a125902e43c64857263a18470a',\n",
       " '21ddbad17fe185681c8cbbed6aa2db4996834d40',\n",
       " 'bd9decdbe4f06fadd3a33cd2c1fee6a6fae9de98',\n",
       " 'c900e48e8610c3e65d9c228e4e56790e9bd4137a',\n",
       " 'a55b80d614e0e9db221404e0073c32f500fd86a5',\n",
       " '857475ac1d98395989c458b87e34fb1c96bd78f3',\n",
       " '31313d863de0b377148feb791f7ea0609e7d9942',\n",
       " 'a8999cbdf83fda68f3b3af76395072f5e8c26b08',\n",
       " 'a03494a5c39b12b09a8124a75c237c8c47682968',\n",
       " '0393a5886968f43ebb0ec4512a0a539708102730',\n",
       " '4ec0377cdf216160297bdee2f8ee0318d51e6a50',\n",
       " '71ec72ac50c7aa2aa137c19ab4407c3f86ea11da',\n",
       " '50951f68e8938b0cc1b46e55c46cfa8ccd838914',\n",
       " '3112ce97795edc74c94274a9511e27a1ac7cb3d8',\n",
       " '2d0b6c32618fe7fe71701dec6fc02e60ed8c230e',\n",
       " '2617bef621e826d2b5c43931bee78dd23c156068',\n",
       " 'c528fe39fbe650ed23f47cca522cefb242ef9485',\n",
       " '5bc24c873243a8214a34551d270f743ce7b89ec1',\n",
       " '5d0644abb2cde627067734ed26f49babed4ca9b4',\n",
       " 'ea7d8a03d0bb3588189523af9d5e2a2e10716c5e',\n",
       " '38808854764980fa729dea74f977edf1b1e29da9',\n",
       " '1a4e8c0fc1c54e09e283937434a08e0f9a6c921f',\n",
       " '1af89d121c0438f445c26f4f94cb3c70a2bf03ec',\n",
       " '9a37aa40f02e02dfb639cd5b7f0edb5356bf8153',\n",
       " '478ceddfe767787a9a5c9f1c33888dc99ea20492',\n",
       " '1adc5680aedc64c905435924da05f84a9f19ef9b',\n",
       " '004c5a55fb8f76460847951778bee510556f84a3',\n",
       " 'a32866009299068f3b292c6243d19cf411c9bd51',\n",
       " 'dec4ef3bf2ee8a0ec46a9c9c7f75f68ad7cd902a',\n",
       " '48830d86db9adcb336bb82f0c4d345af8d0155ba',\n",
       " '038f8ecc9df7a5787eb974d3260e86cadff09ed2',\n",
       " '5c921ad2d89db1e9182bf12a07aef4704d04a1a8',\n",
       " '248aa4875c289ee1ff665b20aabcb6c340561892',\n",
       " 'c0ed8dd3e850575b92784f5c5c1cfc86beb112ec',\n",
       " '605c53ac3c9774f83fbf66efda5c0395e5f51f78',\n",
       " 'ce1cfcab154ff0dd0cf3167044598f264f3fb8a2',\n",
       " 'e232fa5242b25a63296a00af724489390f57f174',\n",
       " 'd0e7ceeaef2ec66b85074527f9e84712db98cbb5',\n",
       " '862efe4a4c49e53984d20db1f8ef5bfe67a97002',\n",
       " 'c745d1450093dc91931fef4305608e052376d7b1',\n",
       " '163c14f10b5193607a5bc443302fafd1bae15728',\n",
       " '0f823412ec6a1ba57df51098070bd60c818eb17f',\n",
       " '89be1aef04a8e1ffcf7393eff970cf79931a6ead',\n",
       " '5eef027b9ce489f95ad5a284a0b90227e3315a92',\n",
       " '6adcf1248a2dbf2bc46b33d760eeb6926045f32c',\n",
       " '7c2a329bd4ac33a3451fa07851b5e75e470140e5',\n",
       " 'c8b6cf6f58a40d99204086097db829dd80ceb1d7',\n",
       " '4fed5a7569ddbd3573f3dfeb91dea121ab4b0c0e',\n",
       " 'e65e6d41e67a43daf937f4fe801921737e8f1a1d',\n",
       " '8b541b1215edd0b29f61d5ba1eae3d49d5c6da96',\n",
       " '84c49c9c05966c93321a8ea5c86f600102836786',\n",
       " 'ae0166a1d604fbd4878c257469b6b8d4e40b1eed',\n",
       " 'acbf0207c600cee9eb226166436d456d71fa29b0',\n",
       " 'ed870c7ebeac78e9e3495da51fae7937c50ec421',\n",
       " 'f6f28074ca79be6fb0977f8481ba146371b3b550',\n",
       " '663d37ca4e8fe73d44301a1d3f02f9b06878e345',\n",
       " '793570fd7f77ba38eb5a4d16c7280f567ccd3904',\n",
       " '6570264a422770691262c8eb264de7bd08b35893',\n",
       " '3e0754d3ee1edc1fe405a958337544222d96aa8f',\n",
       " '3e89e3d502aaf1a815cd3e9bd3756a37254c4ce0',\n",
       " '110416bd84eff1f466a9eeef7c2c3fdcb31daf6a',\n",
       " '3706898d74158ac8a36a2e1e5d59504a490538a4',\n",
       " 'fd1e0a50ac2b1c952254b94b75baf6a08905f85a',\n",
       " '45a50fdbaf86f0c7e2123817b1e0b30cd3414a08',\n",
       " 'a6a0ab133d21962ccb4d3bd6355ea588d3563d12',\n",
       " 'f5c744dacb048e9c2786bdb8290f83ad904b9e86',\n",
       " '4f0101120b8f3dd56c6189c0ee7f679b11638815',\n",
       " '5ddd40917c8b3be52f6095f187968408db56e4fa',\n",
       " '3a482f624556bd599082156873da2a1b7b7418a4',\n",
       " '994d3e2b9b9c06e0e3d5ad57bed10af1d291b881',\n",
       " '353e579ad4ea74271d35a9e862deb82d0100cd8d']"
      ]
     },
     "execution_count": 101,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ds[:100][\"id\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['https://www.bilaterals.org/?entry-into-force-of-the-canada&lang=es',\n",
       " 'https://www.truvox.com/news/the-interim-solution/press-releases',\n",
       " 'https://twana.wordpress.com/2008/10/30/confession-of-an-obama-blogger-exit-poll-of-americans-in-israel/',\n",
       " 'https://thedreamweddingchecklist.com/craigslist-hook-up-gays-gay-local-hookup/',\n",
       " 'fd1912e827ff448b8828ea6dc265714f',\n",
       " 'https://www.booked-online.com/hotel/en/merit-resort-samui-koh-samui/',\n",
       " 'http://androidfc.com/9669/download-backgammon-pack-18-games-6-102-apk/',\n",
       " 'https://slotenmakertedenhaag.nl/jamaica/y4ob5r-jaw-crusher.html',\n",
       " 'e0e1a14dc3f6a611a6eda7101ca771ae2e19d853',\n",
       " 'https://www.fashiongonerogue.com/60s-fashion-trends/',\n",
       " '1cd157314ad502e1c41454046ae808078c904e57',\n",
       " 'https://www.archivecollection.co/pages/collection-starter',\n",
       " 'http://connection.ebscohost.com/c/articles/110445305/long-term-effects-betamethasone-epididymal-tissue-epididymal-sperm-counts-fertility-male-mice',\n",
       " '1daaa5d38a5a4e23889aeabdf2340781',\n",
       " 'https://news.stthomas.edu/personal-counseling-sponsors-healthy-relationship-screening-day-tomorrow/',\n",
       " 'https://www.drshaibutler.com/post/overwhelmed-under-pressure',\n",
       " 'https://transparencia.aytosanlorenzo.org/paxil+cr+package+insert',\n",
       " 'https://www.hearthpwn.com/members/user-100163881/reputation',\n",
       " 'https://www.specialolympicsglobalyouth.org/product-page/digital-media-production-in-senegal',\n",
       " 'https://www.2carpros.com/questions/barely-starts-and-when-it-does-it-runs-rough-and-dies',\n",
       " 'https://thetimesofnewyork.com/2020/11/12/new-zealands-chatham-islands-is-likely-to-be-the-one-place-on-the-earth-experiencing-overtourism-proper-now/',\n",
       " '60d4845bef9cc06b0d4fb482e8045bfb',\n",
       " 'https://wolverinestatewatch.com/the-photographer-and-architect-from-ann-arbor-opens-a-studio-co-working-space-as-a-clubhouse-for-creative-people/',\n",
       " 'https://muffin.wow-womenonwriting.com/2020/05/meet-allison-hong-merrill-runner-up-in.html?showComment=1588603621549',\n",
       " 'https://bvobgyn.com/privacy-policy/',\n",
       " 'https://www.seoclerk.com/tags/Wiki-Links/backlinks',\n",
       " 'https://www.readerviews.com/keep-your-sensitivity-in-check-while-promoting-your-book/',\n",
       " 'http://www.footballtarget.com/liverpool/page/3/',\n",
       " 'http://www.tiphones.com/content/23112/Capcom-Brings-Some-Explosive-Fun-to-the-App-Store-With-the-Launch-of-%22BombLink%22-for-the-iPhone.htm',\n",
       " 'https://www.entostudio.com/staff-member/simone-martini/',\n",
       " 'https://www.sportsmobileforum.com/forums/f10/picked-up-my-dana60-in-oklahoma-city-28764-2.html',\n",
       " 'https://9to5toys.com/2019/07/23/ultimate-ears-boom-3-bluetooth-speaker-more/',\n",
       " 'https://www.lifestyle-vogue.com/new/syros-light-packaway-funnel-neck-quilted-jacket-in-golden-apricot-tokyo-laundry-yellow-tokyo-laundry-womens-jackets-coats/',\n",
       " '41675a766713faadfb7ca08a395c559c',\n",
       " 'https://42dev.eu/epididymite518431.html',\n",
       " 'https://www.spsc.tugraz.at/student-projects/wireless-power-transfer-for-6G.html',\n",
       " 'https://www.furnacerepair.net/libuse/LA/lennox-furnace-repair',\n",
       " 'https://www.flexjobs.com/search?accolade%5B%5D=Barrons+400&accolade%5B%5D=Information+Week+500&accolade%5B%5D=FlexJobs+Top+100+Remote&accolade%5B%5D=CareerBliss+50+Happiest+Companies+in+America&accolade%5B%5D=Most+Ethical+Companies&location=texas&schedule%5B%5D=Alternative+Schedule&search=&tele_level%5B%5D=Option+for+Telecommuting&title%5B%5D=Premium+Product+Support&will_travel%5B%5D=Yes%2C+a+lot',\n",
       " 'http://mcfn.org/donor-tracking?candidate=104',\n",
       " 'https://www.nurevdenevenakliye.com/credit-score-to-get-apple-credit/',\n",
       " 'https://www.parislogue.com/articles/hotels-around-gare-du-nord-train-station.html',\n",
       " 'https://www.dignitymemorial.com/obituaries/margate-fl/marie-erenstoft-10541115',\n",
       " 'http://elkinsalpacashearing.com.au/training-demonstrations/',\n",
       " 'https://www.gaiagps.com/hike/225365/duke-creek-falls-trailhead-via-dukes-creek-falls/',\n",
       " '62c1e403f4b17a9aaf536aef0a5b1d5a',\n",
       " 'adbe97cc11b44bf8a283f35b131b0719',\n",
       " 'https://thecardiffkook.info/brazzers-brazzers-exxtra-eva-notty-and-jordi-el-nino-pol-9632',\n",
       " '36e1ce9dcbf972c7f37cb2c008d439a7',\n",
       " 'https://dancesc.wordpress.com/tag/e4-udderbelly/',\n",
       " 'https://www.insider.com/best-resistance-band-workouts',\n",
       " '5f7f7a2e1b43458137507ae6028f6860',\n",
       " 'a61885acb26d4c258a210fbc90417bb7',\n",
       " 'https://www.listencorp.co.uk/post/rival-consoles-articulation',\n",
       " 'https://havenbenefits.com/hhs-dol-release-new-cobra-guidance-model-notice/',\n",
       " 'https://www.texpers.org/index.php?option=com_dailyplanetblog&view=entry&year=2022&month=10&day=12&id=175:public-outreach-is-part-of-life-saving-efforts-of-texas-29-500-firefighters',\n",
       " '18b0ee88a2b3dfc56a8802b7f22a8753093e90d5',\n",
       " 'https://www.wololo.net/talk/search.php?author_id=70552&sr=posts',\n",
       " 'e2a3fa0e0d60c480bd73937a7bda5da5e189dfec',\n",
       " 'https://greenerblog.blogspot.com/2010/09/has-stephen-hawking-got-his-sums-right.html',\n",
       " 'https://www.scranton.edu/information-technology/services/argos-faq.shtml',\n",
       " 'https://www.murumuru.nl/products/secret-key-rose-water-base-toner-550ml',\n",
       " 'https://al-salaam.com/tag/concepts',\n",
       " 'http://www.ellekae.com/my-2019-resolutions-reflections/',\n",
       " 'http://ruthrowland.blogspot.com/2016/11/',\n",
       " 'b3a3084a2e5ce5301c26e1dd28722901',\n",
       " 'f49c09ff84104bf3ad44530e0bf151a7',\n",
       " 'https://www.dunany.ca/fr/documents-fr/proces-verbal-de-lassemblee-generale-annuelle-2010',\n",
       " '52991887e4984c9c90a9b59f6a36a0cb',\n",
       " '7d8760111c01434f9c8f6cc7919a8fc8',\n",
       " 'https://www.funeralguide.co.uk/obituaries/85396?branded',\n",
       " '744973a6739de4fe14fd58b9c3e8a4a2',\n",
       " 'https://www.petitesideofstyle.com/2013/05/rolled-up-at-bottom.html',\n",
       " 'https://www.businessinsider.com.au/this-is-the-most-expensive-house-in-victoria-2016-6',\n",
       " '1b7b03f7d4684f06b19e7d25803754f5',\n",
       " 'https://sarvin.eu/collections/jumpsuits-trousers-skirts',\n",
       " 'beivhe',\n",
       " 'https://www.newghnews.com/psychologist-consultancy-firms-required/',\n",
       " '0a2207c3b9a4490294fecf780bcbe545',\n",
       " 'https://www.scisolinc.com/product/female-npt-valve/',\n",
       " 'https://www.medicalnewstoday.com/articles/322771.php',\n",
       " 'https://lvrg.com/get-downloads/',\n",
       " 'https://code.dccouncil.us/us/dc/council/code/sections/7-1303.14',\n",
       " 'https://www.nurse.com/jobs/job/4629293/travel-nurse-rn-picu-pediatric-intensive-care-2-096-per-week/',\n",
       " 'https://carolbalawyder.com/2016/04/08/how-i-got-published-julieann-dove/',\n",
       " 'http://www.snooth.com/wine/b-cellars-blend-23-2011/',\n",
       " 'a4f942d2291c4d29914a872a540f9fb1',\n",
       " 'https://www.si.com/hockey/news/islanders-veteran-goaltender-rick-dipietro-cleared-for-practice',\n",
       " 'https://timesofindia.indiatimes.com/business/india-business/india-has-grown-dramatically-for-apple-says-philip-schiller/articleshow/57954599.cms',\n",
       " '58ac5daa9dbd45a9bd07877a93512664',\n",
       " 'http://www.audiomeasurements.com/?page_id=5774',\n",
       " 'bd79711e52ba4e55befb13ff3a188165',\n",
       " 'http://www.womenagainstrape.net/category/resources/books',\n",
       " 'https://www.takesontech.com/fcc-web-site-point-to-new-motorola-handset-for-att',\n",
       " 'https://www.skypack.dev/view/part-loader',\n",
       " 'https://www.partykachevy.com/VehicleDetails/new-2020-Chevrolet-Silverado_1500-Double_Cab_Standard_Box_4_Wheel_Drive_Custom-Hamden-CT/3560221683',\n",
       " 'http://omskstar.ru/9003.php',\n",
       " 'https://www.stringfellow.bz/truck-equipment/snow-ice-equipment/material-spreaders-dump-truck/',\n",
       " 'https://stage-edx-prospectus.edx.org/masters/online-master-in-nutritional-sciences-utaustin',\n",
       " 'https://vmtconsulting.com/mentoring/',\n",
       " 'https://geoacoustics.com/geoacoustics-partner-with-tazmar-maritime/']"
      ]
     },
     "execution_count": 102,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "shuff_ds[:100][\"id\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
