{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "57e128d3-7177-494a-ab7d-4a8d813f8429",
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.tokenize import sent_tokenize"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "abe6fcdf-fd91-4d80-8d1d-2197d7110f9b",
   "metadata": {},
   "source": [
    "### NLTK"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "c3c85c46-e2e1-4b08-8eed-765cb7298b93",
   "metadata": {},
   "outputs": [],
   "source": [
    "text = \"\"\"\n",
    "Hello Mr.Smith, how are you doing today?\n",
    "The weather is great, and Python is awesome.\n",
    "\n",
    "The sky is pinkish-blue. You shouldn't eat cardboard! What about domains like example.com?\n",
    "Let's test one more sentence.\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "3be0ce68-00ed-40a8-a62d-149b12b882cb",
   "metadata": {},
   "outputs": [],
   "source": [
    "sentences = sent_tokenize(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "cbad3bae-02d5-449e-afcb-d39a9cd8a0b5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['\\nHello Mr.Smith, how are you doing today?',\n",
       " 'The weather is great, and Python is awesome.',\n",
       " 'The sky is pinkish-blue.',\n",
       " \"You shouldn't eat cardboard!\",\n",
       " 'What about domains like example.com?',\n",
       " \"Let's test one more sentence.\"]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sentences"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bd3878f0-a40f-426c-9531-6fbfc1b82ed9",
   "metadata": {},
   "source": [
    "#### Chat-GPT"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "ef890d26-8c06-4cc3-872e-47b3c89275e1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "83464c48-a533-44da-b753-d8e971611fa2",
   "metadata": {},
   "outputs": [],
   "source": [
    "def parse_sentence(paragraph, return_offsets_mapping=True):\n",
    "    \"\"\"\n",
    "    Parse a paragraph into spans based on delimiters and return offset mappings.\n",
    "    \n",
    "    Args:\n",
    "        paragraph (str): The input English paragraph.\n",
    "    \n",
    "    Returns:\n",
    "        spans (list): A list of spans split by the delimiters.\n",
    "        offset_mapping (list): A list of tuples indicating the start and end position of each span.\n",
    "    \"\"\"\n",
    "    # Regex pattern for the delimiters\n",
    "    pattern = r\"[.,;?!]\"\n",
    "    \n",
    "    spans = []\n",
    "    offset_mapping = []\n",
    "    start = 0\n",
    "    \n",
    "    for match in re.finditer(pattern, paragraph):\n",
    "        end = match.end()\n",
    "        span = paragraph[start:end].strip()\n",
    "        if span:  # Only add non-empty spans\n",
    "            spans.append(span)\n",
    "            offset_mapping.append((start, end))\n",
    "        start = end\n",
    "    \n",
    "    # Add the last span if there's any text left after the final delimiter\n",
    "    if start < len(paragraph):\n",
    "        spans.append(paragraph[start:].strip())\n",
    "        offset_mapping.append((start, len(paragraph)))\n",
    "    \n",
    "    if return_offsets_mapping:\n",
    "        return {'input_ids': spans, 'offset_mapping': offset_mapping}\n",
    "    else:\n",
    "        return {'input_ids': spans}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "bad63e06-3dd6-4abf-854d-5a00d99eaddc",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'input_ids': ['Hello Mr.',\n",
       "  'Smith,',\n",
       "  'how are you doing today?',\n",
       "  'The weather is great,',\n",
       "  'and Python is awesome.',\n",
       "  'The sky is pinkish-blue.',\n",
       "  \"You shouldn't eat cardboard!\",\n",
       "  'What about domains like example.',\n",
       "  'com?',\n",
       "  \"Let's test one more sentence.\",\n",
       "  ''],\n",
       " 'offset_mapping': [(0, 10),\n",
       "  (10, 16),\n",
       "  (16, 41),\n",
       "  (41, 63),\n",
       "  (63, 86),\n",
       "  (86, 111),\n",
       "  (111, 140),\n",
       "  (140, 173),\n",
       "  (173, 177),\n",
       "  (177, 207),\n",
       "  (207, 208)]}"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "parse_sentence(text, return_offsets_mapping=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "33346ca5-2d70-407d-ace3-036ed6ae9326",
   "metadata": {},
   "source": [
    "### Gemini"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "30adf5ff-2726-40e4-99cb-dc5d97d2f177",
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "from typing import List, Tuple\n",
    "\n",
    "def split_paragraph(paragraph: str) -> Tuple[List[str], List[Tuple[int, int]]]:\n",
    "    \"\"\"\n",
    "    Splits an English paragraph into smaller spans based on delimiters.\n",
    "\n",
    "    The splitting logic follows a priority:\n",
    "    1. Double newline ('\\n\\n') for major blocks.\n",
    "    2. Newlines followed by list markers (e.g., '1.', '*', '-') for list items.\n",
    "    3. Sentence-ending punctuation ('.', '?', '!') followed by whitespace\n",
    "       for standard prose.\n",
    "\n",
    "    Args:\n",
    "        paragraph (str): The input English paragraph.\n",
    "\n",
    "    Returns:\n",
    "        tuple[list[str], list[tuple[int, int]]]:\n",
    "            - A list of the split text spans (stripped of leading/trailing whitespace).\n",
    "            - A list of tuples indicating the start and end character offset\n",
    "              of each span in the original paragraph.\n",
    "    \"\"\"\n",
    "    spans_data = []\n",
    "    original_paragraph = paragraph # Keep original for offset finding\n",
    "    paragraph = paragraph.strip() # Work with a stripped version for logic\n",
    "\n",
    "    if not paragraph:\n",
    "        return [], []\n",
    "\n",
    "    # --- Helper function to find accurate offsets of stripped text ---\n",
    "    def find_span_offsets(text_to_find: str, search_start: int, search_end: int) -> Tuple[int, int]:\n",
    "        \"\"\"Finds start/end offsets of stripped text within a range of the original.\"\"\"\n",
    "        try:\n",
    "            # Find the first non-whitespace char of the target within the slice\n",
    "            first_char_index = -1\n",
    "            for i, char in enumerate(text_to_find):\n",
    "                if not char.isspace():\n",
    "                    first_char_index = i\n",
    "                    break\n",
    "            if first_char_index == -1: # Span is all whitespace\n",
    "                 return (-1, -1)\n",
    "\n",
    "            # Find the corresponding substring in the original paragraph slice\n",
    "            original_slice = original_paragraph[search_start:search_end]\n",
    "            relative_start = original_slice.find(text_to_find[first_char_index:])\n",
    "            \n",
    "            if relative_start != -1:\n",
    "                 # Adjust relative start back to the actual beginning of the stripped content\n",
    "                 # by skipping leading whitespace found in the original slice\n",
    "                 adjusted_relative_start = relative_start\n",
    "                 while adjusted_relative_start > 0 and original_slice[adjusted_relative_start -1].isspace():\n",
    "                      adjusted_relative_start -=1\n",
    "                 \n",
    "                 # More robust: Find the first occurrence of the stripped text within the search range\n",
    "                 stripped_text = text_to_find.strip()\n",
    "                 if not stripped_text: return (-1,-1) # Should not happen if first_char_index != -1\n",
    "\n",
    "                 start_offset_in_original = -1\n",
    "                 temp_start = search_start\n",
    "                 while temp_start < search_end:\n",
    "                     found_pos = original_paragraph.find(stripped_text, temp_start, search_end)\n",
    "                     if found_pos == -1:\n",
    "                         break # Not found in remaining range\n",
    "\n",
    "                     # Check if this found position is preceded only by whitespace\n",
    "                     # relative to the search_start or beginning of string\n",
    "                     is_valid_start = True\n",
    "                     for i in range(search_start, found_pos):\n",
    "                         if not original_paragraph[i].isspace():\n",
    "                             is_valid_start = False\n",
    "                             break\n",
    "                     \n",
    "                     if is_valid_start:\n",
    "                         start_offset_in_original = found_pos\n",
    "                         break\n",
    "                     else:\n",
    "                         # Continue searching after this invalid match\n",
    "                         temp_start = found_pos + 1\n",
    "\n",
    "\n",
    "                 if start_offset_in_original != -1:\n",
    "                     end_offset_in_original = start_offset_in_original + len(stripped_text)\n",
    "                     return start_offset_in_original, end_offset_in_original\n",
    "\n",
    "\n",
    "            # Fallback if finding stripped version fails (should be rare)\n",
    "            # Approximate based on search range - less accurate\n",
    "            start = search_start + (len(original_paragraph[search_start:search_end]) - len(original_paragraph[search_start:search_end].lstrip()))\n",
    "            end = start + len(text_to_find.strip())\n",
    "            return start, end\n",
    "\n",
    "        except Exception:\n",
    "             # Fallback in case of unexpected error\n",
    "             return (-1, -1)\n",
    "\n",
    "    # --- Splitting Logic ---\n",
    "\n",
    "    # Priority 1: Split by '\\n\\n'\n",
    "    if '\\n\\n' in paragraph:\n",
    "        delimiter = \"\\n\\n\"\n",
    "        delimiter_len = len(delimiter)\n",
    "        start_offset = 0\n",
    "        indices = [m.start() for m in re.finditer(re.escape(delimiter), original_paragraph)]\n",
    "\n",
    "        for index in indices:\n",
    "            span_text_raw = original_paragraph[start_offset:index]\n",
    "            span_text_stripped = span_text_raw.strip()\n",
    "            if span_text_stripped:\n",
    "                s, e = find_span_offsets(span_text_raw, start_offset, index)\n",
    "                if s != -1:\n",
    "                   spans_data.append((span_text_stripped, (s, e)))\n",
    "            start_offset = index + delimiter_len\n",
    "\n",
    "        # Add the last part\n",
    "        span_text_raw = original_paragraph[start_offset:]\n",
    "        span_text_stripped = span_text_raw.strip()\n",
    "        if span_text_stripped:\n",
    "            s, e = find_span_offsets(span_text_raw, start_offset, len(original_paragraph))\n",
    "            if s != -1:\n",
    "                spans_data.append((span_text_stripped, (s, e)))\n",
    "\n",
    "    # Priority 2: Check for List Items (if \\n\\n wasn't the primary structure)\n",
    "    # Use multiline flag for ^ anchor\n",
    "    elif re.search(r\"^\\s*(\\d+\\.|\\*|-)\\s\", paragraph, re.MULTILINE):\n",
    "        # Split into lines, keeping original structure for offset calculation\n",
    "        lines = original_paragraph.splitlines(keepends=True)\n",
    "        current_span_text = \"\"\n",
    "        current_span_start_char_index = 0\n",
    "        start_char_index = 0 # Tracks position in original_paragraph\n",
    "\n",
    "        for i, line in enumerate(lines):\n",
    "            line_content_stripped = line.strip()\n",
    "            is_list_item = re.match(r\"^\\s*(\\d+\\.|\\*|-)\\s\", line_content_stripped)\n",
    "\n",
    "            # Determine if the line starts a new block (list item or first line)\n",
    "            is_new_block_start = is_list_item or (i == 0 and line_content_stripped)\n",
    "\n",
    "            if is_new_block_start and current_span_text:\n",
    "                 # Finish the previous span\n",
    "                 current_span_stripped = current_span_text.strip()\n",
    "                 if current_span_stripped:\n",
    "                     s, e = find_span_offsets(current_span_text, current_span_start_char_index, start_char_index)\n",
    "                     if s != -1:\n",
    "                         spans_data.append((current_span_stripped, (s, e)))\n",
    "                 # Reset for the new span (which starts with the current line)\n",
    "                 current_span_text = \"\"\n",
    "                 current_span_start_char_index = start_char_index # Start of current line\n",
    "\n",
    "\n",
    "            if not current_span_text and line_content_stripped:\n",
    "                # Start a new span if empty and current line has content\n",
    "                current_span_start_char_index = start_char_index\n",
    "\n",
    "            # Accumulate line to the current span\n",
    "            current_span_text += line\n",
    "\n",
    "            # Update the character index for the next line\n",
    "            start_char_index += len(line)\n",
    "\n",
    "        # Add the last accumulated span\n",
    "        if current_span_text:\n",
    "            current_span_stripped = current_span_text.strip()\n",
    "            if current_span_stripped:\n",
    "                 s, e = find_span_offsets(current_span_text, current_span_start_char_index, len(original_paragraph))\n",
    "                 if s != -1:\n",
    "                     spans_data.append((current_span_stripped, (s, e)))\n",
    "\n",
    "    # Priority 3: Sentence Splitting\n",
    "    else:\n",
    "        # Find split points: positions *after* sentence-ending punctuation and whitespace\n",
    "        # We split *after* the whitespace following the punctuation.\n",
    "        split_points = [m.end() for m in re.finditer(r'[.?!]\\s+', original_paragraph)]\n",
    "        start_offset = 0\n",
    "        for point in split_points:\n",
    "            # The span ends *before* the split point if we consider the split point\n",
    "            # to be the start of the *next* sentence's leading space.\n",
    "            # Let's define span as text up to and including the punctuation.\n",
    "            match = re.search(r'([.?!])(\\s+)$', original_paragraph[start_offset:point])\n",
    "            end_of_sentence_char = point\n",
    "            if match:\n",
    "                 # Adjust end point to be right after the punctuation mark\n",
    "                 end_of_sentence_char = point - len(match.group(2)) # Subtract trailing whitespace length\n",
    "\n",
    "            span_text_raw = original_paragraph[start_offset:end_of_sentence_char]\n",
    "            span_text_stripped = span_text_raw.strip()\n",
    "\n",
    "            if span_text_stripped:\n",
    "                 # Find offsets for the stripped text within the raw span boundary\n",
    "                 s, e = find_span_offsets(span_text_raw, start_offset, end_of_sentence_char)\n",
    "                 if s!=-1:\n",
    "                     spans_data.append((span_text_stripped, (s, e)))\n",
    "\n",
    "            start_offset = point # Next span search starts after the whitespace\n",
    "\n",
    "        # Add the last part (from the last split point to the end)\n",
    "        span_text_raw = original_paragraph[start_offset:]\n",
    "        span_text_stripped = span_text_raw.strip()\n",
    "        if span_text_stripped:\n",
    "            s, e = find_span_offsets(span_text_raw, start_offset, len(original_paragraph))\n",
    "            if s!=-1:\n",
    "                spans_data.append((span_text_stripped, (s, e)))\n",
    "\n",
    "    # Handle case where no splits occurred but paragraph wasn't empty\n",
    "    if not spans_data and paragraph:\n",
    "         stripped_paragraph = original_paragraph.strip()\n",
    "         s, e = find_span_offsets(original_paragraph, 0, len(original_paragraph))\n",
    "         if s!=-1:\n",
    "            return [stripped_paragraph], [(s, e)]\n",
    "         else: # Should not happen if paragraph is not empty\n",
    "             return [], []\n",
    "\n",
    "    # Final Extraction\n",
    "    spans = [text for text, offset in spans_data]\n",
    "    offset_mapping = [offset for text, offset in spans_data]\n",
    "\n",
    "    return spans, offset_mapping"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "1effbc83-ed32-4651-8c97-5882662278ac",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--- Example 1 (Kaiser Strike - \\n\\n) ---\n",
      "Span 1: Offset (0, 298), Text: 'According to news reports, healthcare workers at Kaiser Permanente facilities across the United States are participating in a strike to protest low wages and unsafe working conditions. They are demanding better pay and benefits, along with safe staffing levels to provide quality care for patients.'\n",
      "Span 2: Offset (300, 515), Text: 'The strike, which began on Dec. 15, is expected to last for several days. Union members are picketing outside of facilities and refusing to cross picket lines established by other unions participating in the strike.'\n",
      "Span 3: Offset (517, 826), Text: 'Kaiser Permanente is a large healthcare provider with over 200,000 employees, including thousands of physician assistants, nurses, and other healthcare professionals. The strike is affecting operations at many of their facilities, and patients may experience delays or changes in their care during the strike.'\n",
      "Span 4: Offset (828, 965), Text: 'We at LPR understand the seriousness of the situation and appreciate the hard work and dedication of healthcare professionals everywhere.'\n",
      "\n",
      "--- Example 2 (Snow Driving - List) ---\n",
      "Span 1: Offset (0, 87), Text: 'Yes, here are a few tips for preparing for driving in the snow now that winter is here:'\n",
      "Span 2: Offset (89, 1135), Text: '1. Check your car's tire pressure and fill them with snow-friendly tires. Low tire pressure could lead to decreased traction and increased tire wear when driving on snowy roads.\n",
      "2. Make sure your car is properly winterized. This includes checking fluids, oil, water, belts, hoses, and brakes for any signs of wear or damage. Having your car's fluids changed and filters cleaned can prolong its longevity on the road.\n",
      "3. Review local weather forecasts to stay up to date on snowfall predictions. This allows you to plan your route ahead of time and drive during daylight hours, when traffic is less likely to be heavy.\n",
      "4. Invest in snow chains or tire chains to increase traction on snow-covered roads.\n",
      "5. Always carry snow shovels, de-icer, ice scraper, jumper cables, and a blanket in your car in case you get stranded on the road.\n",
      "6. For long trips, consider staying over at a hotel near your destination or alternatively, plan your journey in two days instead of just one. This allows for greater flexibility with scheduling travel, as opposed'\n",
      "\n",
      "--- Example 3 (Buchanan - Sentences) ---\n",
      "Span 1: Offset (1, 90), Text: 'James Buchanan, the 15th president of the United States, was in office from 1857 to 1861.'\n",
      "Span 2: Offset (91, 216), Text: 'He was known for his moderate approach to handling the growing sectional tensions over slavery and the survival of the Union.'\n",
      "Span 3: Offset (217, 327), Text: 'Although he did not promote any specific solution to these issues, he tried to maintain a peaceful resolution.'\n",
      "Span 4: Offset (328, 487), Text: 'His administration was also plagued by the worsening effects of the economic crisis, including widespread bank failures and the collapse of the railway system.'\n",
      "\n",
      "--- Example 4 (Empty String) ---\n",
      "Spans: [], Offsets: []\n",
      "\n",
      "--- Example 5 (Whitespace String) ---\n",
      "Spans: [], Offsets: []\n",
      "\n",
      "--- Example 6 (Single Sentence) ---\n",
      "Span 1: Offset (0, 31), Text: 'This is just a single sentence.'\n"
     ]
    }
   ],
   "source": [
    "# --- Example Usage ---\n",
    "paragraph1 = \"\"\"According to news reports, healthcare workers at Kaiser Permanente facilities across the United States are participating in a strike to protest low wages and unsafe working conditions. They are demanding better pay and benefits, along with safe staffing levels to provide quality care for patients.\n",
    "\n",
    "The strike, which began on Dec. 15, is expected to last for several days. Union members are picketing outside of facilities and refusing to cross picket lines established by other unions participating in the strike.\n",
    "\n",
    "Kaiser Permanente is a large healthcare provider with over 200,000 employees, including thousands of physician assistants, nurses, and other healthcare professionals. The strike is affecting operations at many of their facilities, and patients may experience delays or changes in their care during the strike.\n",
    "\n",
    "We at LPR understand the seriousness of the situation and appreciate the hard work and dedication of healthcare professionals everywhere.\"\"\"\n",
    "\n",
    "paragraph2 = \"\"\"Yes, here are a few tips for preparing for driving in the snow now that winter is here:\n",
    "\n",
    "1. Check your car's tire pressure and fill them with snow-friendly tires. Low tire pressure could lead to decreased traction and increased tire wear when driving on snowy roads.\n",
    "2. Make sure your car is properly winterized. This includes checking fluids, oil, water, belts, hoses, and brakes for any signs of wear or damage. Having your car's fluids changed and filters cleaned can prolong its longevity on the road.\n",
    "3. Review local weather forecasts to stay up to date on snowfall predictions. This allows you to plan your route ahead of time and drive during daylight hours, when traffic is less likely to be heavy.\n",
    "4. Invest in snow chains or tire chains to increase traction on snow-covered roads.\n",
    "5. Always carry snow shovels, de-icer, ice scraper, jumper cables, and a blanket in your car in case you get stranded on the road.\n",
    "6. For long trips, consider staying over at a hotel near your destination or alternatively, plan your journey in two days instead of just one. This allows for greater flexibility with scheduling travel, as opposed\"\"\" # Intentionally truncated example\n",
    "\n",
    "paragraph3 = \"\"\" James Buchanan, the 15th president of the United States, was in office from 1857 to 1861. He was known for his moderate approach to handling the growing sectional tensions over slavery and the survival of the Union. Although he did not promote any specific solution to these issues, he tried to maintain a peaceful resolution. His administration was also plagued by the worsening effects of the economic crisis, including widespread bank failures and the collapse of the railway system.\"\"\"\n",
    "\n",
    "print(\"--- Example 1 (Kaiser Strike - \\\\n\\\\n) ---\")\n",
    "spans1, offsets1 = split_paragraph(paragraph1)\n",
    "for i, (span, offset) in enumerate(zip(spans1, offsets1)):\n",
    "    print(f\"Span {i+1}: Offset {offset}, Text: '{span}'\")\n",
    "    # Verification (optional):\n",
    "    # print(f\"  Original Text Slice: '{paragraph1[offset[0]:offset[1]]}'\")\n",
    "\n",
    "\n",
    "print(\"\\n--- Example 2 (Snow Driving - List) ---\")\n",
    "spans2, offsets2 = split_paragraph(paragraph2)\n",
    "for i, (span, offset) in enumerate(zip(spans2, offsets2)):\n",
    "    print(f\"Span {i+1}: Offset {offset}, Text: '{span}'\")\n",
    "    # Verification (optional):\n",
    "    # print(f\"  Original Text Slice: '{paragraph2[offset[0]:offset[1]]}'\")\n",
    "\n",
    "print(\"\\n--- Example 3 (Buchanan - Sentences) ---\")\n",
    "spans3, offsets3 = split_paragraph(paragraph3)\n",
    "for i, (span, offset) in enumerate(zip(spans3, offsets3)):\n",
    "    print(f\"Span {i+1}: Offset {offset}, Text: '{span}'\")\n",
    "    # Verification (optional):\n",
    "    # print(f\"  Original Text Slice: '{paragraph3[offset[0]:offset[1]]}'\")\n",
    "\n",
    "print(\"\\n--- Example 4 (Empty String) ---\")\n",
    "spans4, offsets4 = split_paragraph(\"\")\n",
    "print(f\"Spans: {spans4}, Offsets: {offsets4}\")\n",
    "\n",
    "print(\"\\n--- Example 5 (Whitespace String) ---\")\n",
    "spans5, offsets5 = split_paragraph(\"   \\n \\t \")\n",
    "print(f\"Spans: {spans5}, Offsets: {offsets5}\")\n",
    "\n",
    "print(\"\\n--- Example 6 (Single Sentence) ---\")\n",
    "spans6, offsets6 = split_paragraph(\"This is just a single sentence.\")\n",
    "for i, (span, offset) in enumerate(zip(spans6, offsets6)):\n",
    "    print(f\"Span {i+1}: Offset {offset}, Text: '{span}'\")\n",
    "    # Verification (optional):\n",
    "    # print(f\"  Original Text Slice: '{'This is just a single sentence.'[offset[0]:offset[1]]}'\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "7bf7df83-f9be-4e6b-9913-dc7b528da10b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(['Hello Mr.Smith, how are you doing today?\\nThe weather is great, and Python is awesome.',\n",
       "  \"The sky is pinkish-blue. You shouldn't eat cardboard! What about domains like example.com?\\nLet's test one more sentence.\"],\n",
       " [(1, 86), (88, 208)])"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "split_paragraph(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "e5cc3649-66ce-4880-8fbc-a1a7eb74d4fa",
   "metadata": {},
   "outputs": [
    {
     "ename": "SyntaxError",
     "evalue": "incomplete input (3436623595.py, line 11)",
     "output_type": "error",
     "traceback": [
      "\u001b[0;36m  Cell \u001b[0;32mIn[11], line 11\u001b[0;36m\u001b[0m\n\u001b[0;31m    ''],\u001b[0m\n\u001b[0m        ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m incomplete input\n"
     ]
    }
   ],
   "source": [
    "# {'input_ids': ['Hello Mr.',\n",
    "#   'Smith,',\n",
    "#   'how are you doing today?',\n",
    "#   'The weather is great,',\n",
    "#   'and Python is awesome.',\n",
    "#   'The sky is pinkish-blue.',\n",
    "#   \"You shouldn't eat cardboard!\",\n",
    "#   'What about domains like example.',\n",
    "#   'com?',\n",
    "#   \"Let's test one more sentence.\",\n",
    "#   ''],\n",
    "#  'offset_mapping': [(0, 10),\n",
    "#   (10, 16),\n",
    "#   (16, 41),\n",
    "#   (41, 63),\n",
    "#   (63, 86),\n",
    "#   (86, 111),\n",
    "#   (111, 140),\n",
    "#   (140, 173),\n",
    "#   (173, 177),\n",
    "#   (177, 207),\n",
    "#   (207, 208)]}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "0463ffbe-47dc-46bd-841d-98879b05cff0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--- Example 1 (Multiple Blocks) ---\n",
      "Span 1: Offset (0, 46), Text: 'Block one, first line.\n",
      "Block one, second line.'\n",
      "Span 2: Offset (48, 73), Text: 'Block two, only one line.'\n",
      "Span 3: Offset (76, 126), Text: 'Block three, first line.\n",
      "Block three, second line.'\n",
      "\n",
      "--- Example 2 (Single Newline Only) ---\n",
      "Span 1: Offset (0, 75), Text: 'Single block paragraph with no double newlines.\n",
      "Just a single newline here.'\n",
      "\n",
      "--- Example 3 (Triple Newlines) ---\n",
      "Span 1: Offset (0, 12), Text: 'Start block.'\n",
      "Span 2: Offset (15, 58), Text: 'Middle block with three newlines before it.'\n",
      "Span 3: Offset (60, 70), Text: 'End block.'\n",
      "\n",
      "--- Example 4 (Leading/Trailing Separators) ---\n",
      "Span 1: Offset (2, 19), Text: 'Leading newlines.'\n",
      "Span 2: Offset (21, 41), Text: 'Followed by content.'\n",
      "\n",
      "--- Example 5 (No Newlines) ---\n",
      "Span 1: Offset (0, 19), Text: 'No newlines at all.'\n",
      "\n",
      "--- Example 6 (Whitespace Blocks) ---\n",
      "Span 1: Offset (8, 46), Text: 'Leading and trailing whitespace block.'\n",
      "\n",
      "--- Example 7 (Empty String) ---\n",
      "Spans: [], Offsets: []\n",
      "\n",
      "--- Example 8 (Whitespace Only String) ---\n",
      "Spans: [], Offsets: []\n"
     ]
    }
   ],
   "source": [
    "import re\n",
    "from typing import List, Tuple\n",
    "\n",
    "def parse_by_newlines(paragraph: str) -> Tuple[List[str], List[Tuple[int, int]]]:\n",
    "    \"\"\"\n",
    "    Parses a paragraph into blocks based solely on two or more consecutive newlines.\n",
    "\n",
    "    Args:\n",
    "        paragraph (str): The input English paragraph.\n",
    "\n",
    "    Returns:\n",
    "        tuple[list[str], list[tuple[int, int]]]:\n",
    "            - A list of the split text spans (stripped of leading/trailing whitespace).\n",
    "            - A list of tuples indicating the start and end character offset\n",
    "              of each span in the original paragraph. Returns ([], []) if the\n",
    "              input is empty or only whitespace.\n",
    "    \"\"\"\n",
    "    spans = []\n",
    "    offset_mapping = []\n",
    "\n",
    "    # Handle empty or whitespace-only input\n",
    "    if not paragraph or paragraph.isspace():\n",
    "        return spans, offset_mapping\n",
    "\n",
    "    # Keep the original for offset calculations\n",
    "    original_paragraph = paragraph\n",
    "\n",
    "    # Use regex to find split points: 2 or more newlines\n",
    "    # We find the *delimiters* rather than splitting directly to better manage offsets.\n",
    "    delimiters = list(re.finditer(r'\\n{2,}', original_paragraph))\n",
    "\n",
    "    search_start_index = 0\n",
    "    for i, match in enumerate(delimiters):\n",
    "        delimiter_start, delimiter_end = match.span()\n",
    "\n",
    "        # Extract the raw text block *before* this delimiter\n",
    "        raw_span = original_paragraph[search_start_index:delimiter_start]\n",
    "        stripped_span = raw_span.strip()\n",
    "\n",
    "        if stripped_span:\n",
    "            # Find the start/end of the stripped content within the raw span's location\n",
    "            strip_offset_start = 0\n",
    "            while strip_offset_start < len(raw_span) and raw_span[strip_offset_start].isspace():\n",
    "                strip_offset_start += 1\n",
    "\n",
    "            strip_offset_end = len(raw_span) - 1\n",
    "            while strip_offset_end >= 0 and raw_span[strip_offset_end].isspace():\n",
    "                strip_offset_end -= 1\n",
    "\n",
    "            # Check if span wasn't all whitespace\n",
    "            if strip_offset_start <= strip_offset_end:\n",
    "                abs_start = search_start_index + strip_offset_start\n",
    "                abs_end = search_start_index + strip_offset_end + 1 # End index is exclusive\n",
    "                spans.append(stripped_span)\n",
    "                offset_mapping.append((abs_start, abs_end))\n",
    "\n",
    "        # Update the start for the next block search to be *after* the current delimiter\n",
    "        search_start_index = delimiter_end\n",
    "\n",
    "    # Handle the last block (after the last delimiter, or the only block if no delimiters)\n",
    "    raw_span = original_paragraph[search_start_index:]\n",
    "    stripped_span = raw_span.strip()\n",
    "\n",
    "    if stripped_span:\n",
    "        # Find the start/end of the stripped content within the last raw span's location\n",
    "        strip_offset_start = 0\n",
    "        while strip_offset_start < len(raw_span) and raw_span[strip_offset_start].isspace():\n",
    "            strip_offset_start += 1\n",
    "\n",
    "        strip_offset_end = len(raw_span) - 1\n",
    "        while strip_offset_end >= 0 and raw_span[strip_offset_end].isspace():\n",
    "            strip_offset_end -= 1\n",
    "        \n",
    "        # Check if span wasn't all whitespace\n",
    "        if strip_offset_start <= strip_offset_end:\n",
    "             abs_start = search_start_index + strip_offset_start\n",
    "             abs_end = search_start_index + strip_offset_end + 1 # End index is exclusive\n",
    "             spans.append(stripped_span)\n",
    "             offset_mapping.append((abs_start, abs_end))\n",
    "\n",
    "\n",
    "    return spans, offset_mapping\n",
    "\n",
    "# --- Example Usage ---\n",
    "def show_example(header, paragraph):\n",
    "    \"\"\"Print the blocks parse_by_newlines finds in a paragraph and check their offsets.\n",
    "\n",
    "    Args:\n",
    "        header: Banner line printed verbatim before the results.\n",
    "        paragraph: Text passed to parse_by_newlines.\n",
    "\n",
    "    Returns:\n",
    "        The (spans, offsets) pair returned by parse_by_newlines.\n",
    "\n",
    "    Raises:\n",
    "        AssertionError: If the two result lists differ in length, or an offset\n",
    "            pair does not slice its span's text back out of `paragraph`.\n",
    "    \"\"\"\n",
    "    print(header)\n",
    "    spans, offsets = parse_by_newlines(paragraph)\n",
    "    # zip() would silently drop unmatched entries, so check the pairing first.\n",
    "    assert len(spans) == len(offsets), header\n",
    "    for i, (span, offset) in enumerate(zip(spans, offsets)):\n",
    "        print(f\"Span {i+1}: Offset {offset}, Text: '{span}'\")\n",
    "        # Offsets are (start, exclusive end) indices into the original string.\n",
    "        assert paragraph[offset[0]:offset[1]] == span, (header, offset)\n",
    "    return spans, offsets\n",
    "\n",
    "\n",
    "paragraph1 = \"\"\"Block one, first line.\n",
    "Block one, second line.\n",
    "\n",
    "Block two, only one line.\n",
    "\n",
    "\n",
    "Block three, first line.\n",
    "Block three, second line.\n",
    "\n",
    "\n",
    "\"\"\" # Trailing newlines\n",
    "\n",
    "paragraph2 = \"Single block paragraph with no double newlines.\\nJust a single newline here.\"\n",
    "\n",
    "paragraph3 = \"\"\"Start block.\n",
    "\n",
    "\n",
    "Middle block with three newlines before it.\n",
    "\n",
    "End block.\"\"\"\n",
    "\n",
    "paragraph4 = \"\\n\\nLeading newlines.\\n\\nFollowed by content.\\n\\n\"\n",
    "\n",
    "paragraph5 = \"No newlines at all.\"\n",
    "\n",
    "paragraph6 = \"   \\n\\n   Leading and trailing whitespace block. \\n\\n   \"\n",
    "\n",
    "paragraph7 = \"\" # Empty\n",
    "paragraph8 = \"  \\n  \\t  \" # Whitespace only\n",
    "\n",
    "# Headers after the first start with a newline so the examples are separated\n",
    "# by a blank line in the output.\n",
    "spans1, offsets1 = show_example(\"--- Example 1 (Multiple Blocks) ---\", paragraph1)\n",
    "spans2, offsets2 = show_example(\"\\n--- Example 2 (Single Newline Only) ---\", paragraph2)\n",
    "spans3, offsets3 = show_example(\"\\n--- Example 3 (Triple Newlines) ---\", paragraph3)\n",
    "spans4, offsets4 = show_example(\"\\n--- Example 4 (Leading/Trailing Separators) ---\", paragraph4)\n",
    "spans5, offsets5 = show_example(\"\\n--- Example 5 (No Newlines) ---\", paragraph5)\n",
    "spans6, offsets6 = show_example(\"\\n--- Example 6 (Whitespace Blocks) ---\", paragraph6)\n",
    "\n",
    "# Empty and whitespace-only inputs produce no spans, so the per-span loop prints\n",
    "# nothing; show the raw (empty) results instead.\n",
    "spans7, offsets7 = show_example(\"\\n--- Example 7 (Empty String) ---\", paragraph7)\n",
    "print(f\"Spans: {spans7}, Offsets: {offsets7}\")\n",
    "\n",
    "spans8, offsets8 = show_example(\"\\n--- Example 8 (Whitespace Only String) ---\", paragraph8)\n",
    "print(f\"Spans: {spans8}, Offsets: {offsets8}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ce43803a-7a6f-4e47-be61-7d7c6da1ab96",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
