{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "043d0987-0ee5-40ab-acfa-8f3543291c88",
   "metadata": {},
   "outputs": [],
   "source": [
    "import nltk\n",
    "# nltk.data.find() raises LookupError when a resource is missing,\n",
    "# so that is the exception that has to trigger the download.\n",
    "try:\n",
    "    nltk.data.find('tokenizers/punkt')\n",
    "except LookupError:\n",
    "    nltk.download('punkt')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "89a8a1c3-6677-4748-85c0-52824fece18d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Constituency Parse Tree:\n",
      "                 S                          \n",
      "      ___________|_______                    \n",
      "     |                   VP                 \n",
      "     |        ___________|_______            \n",
      "     |       |       |           PP         \n",
      "     |       |       |        ___|___        \n",
      "     NP      |       NP      |       NP     \n",
      "  ___|___    |    ___|___    |    ___|___    \n",
      "Det      N   V  Det      N   P  Det      N  \n",
      " |       |   |   |       |   |   |       |   \n",
      "the     man saw  a      dog  in the     park\n",
      "\n",
      "Constituency Parse Tree:\n",
      "                 S                      \n",
      "      ___________|___                    \n",
      "     |               VP                 \n",
      "     |        _______|___                \n",
      "     |       |           NP             \n",
      "     |       |    _______|___            \n",
      "     |       |   |   |       PP         \n",
      "     |       |   |   |    ___|___        \n",
      "     NP      |   |   |   |       NP     \n",
      "  ___|___    |   |   |   |    ___|___    \n",
      "Det      N   V  Det  N   P  Det      N  \n",
      " |       |   |   |   |   |   |       |   \n",
      "the     man saw  a  dog  in the     park\n",
      "\n",
      "\n",
      "--- Another Example ---\n",
      "Could not parse the sentence: Grammar does not cover some of the input words: \"'i'\".\n"
     ]
    }
   ],
   "source": [
    "import nltk\n",
    "\n",
    "# Define a simple context-free grammar\n",
    "# S: Sentence, NP: Noun Phrase, VP: Verb Phrase, PP: Prepositional Phrase\n",
    "# Det: Determiner, N: Noun, V: Verb, P: Preposition\n",
    "# Every terminal is lowercase because the input is lowercased before parsing,\n",
    "# so the pronoun has to be listed as 'i' rather than 'I'.\n",
    "grammar = nltk.CFG.fromstring(\"\"\"\n",
    "    S -> NP VP\n",
    "    VP -> V NP | V NP PP\n",
    "    PP -> P NP\n",
    "    NP -> Det N | Det N PP | 'i' | N\n",
    "    Det -> 'the' | 'a' | 'my'\n",
    "    N -> 'man' | 'dog' | 'cat' | 'park' | 'telescope' | 'saw'\n",
    "    V -> 'saw' | 'walked' | 'ate'\n",
    "    P -> 'in' | 'with' | 'on'\n",
    "\"\"\")\n",
    "\n",
    "# Create a parser (e.g., ChartParser or RecursiveDescentParser)\n",
    "# ChartParser is generally more efficient for ambiguous grammars.\n",
    "parser = nltk.ChartParser(grammar)\n",
    "# You could also use:\n",
    "# parser = nltk.RecursiveDescentParser(grammar)\n",
    "\n",
    "# Sentence to parse\n",
    "sentence_str = \"the man saw a dog in the park\"\n",
    "# sentence_str = \"I saw the man with a telescope\"\n",
    "\n",
    "# Tokenize the sentence\n",
    "tokens = nltk.word_tokenize(sentence_str.lower()) # Lowercase to match the grammar's terminals\n",
    "\n",
    "# Parse the sentence\n",
    "try:\n",
    "    for tree in parser.parse(tokens):\n",
    "        print(\"Constituency Parse Tree:\")\n",
    "        tree.pretty_print() # Print in a more readable tree format\n",
    "        # To draw the tree (requires tkinter and ghostscript usually)\n",
    "        # tree.draw()\n",
    "except ValueError as e:\n",
    "    print(f\"Could not parse the sentence: {e}\")\n",
    "    print(\"This might happen if the grammar cannot derive the sentence or due to recursion depth with RecursiveDescentParser for complex sentences.\")\n",
    "\n",
    "print(\"\\n--- Another Example ---\")\n",
    "# The lexicon has no entry for 'pizza', so a sentence such as \"I ate the pizza\"\n",
    "# would be rejected; extending the grammar means editing the rule string above and\n",
    "# rebuilding the CFG. This example sticks to words the grammar already covers.\n",
    "sentence_str_2 = \"I saw the cat\"\n",
    "tokens_2 = nltk.word_tokenize(sentence_str_2.lower())\n",
    "\n",
    "try:\n",
    "    for tree in parser.parse(tokens_2):\n",
    "        print(\"Constituency Parse Tree:\")\n",
    "        tree.pretty_print()\n",
    "except ValueError as e:\n",
    "    print(f\"Could not parse the sentence: {e}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "c8da6962-79cb-4ab5-961d-f8c78294d644",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['i', 'saw', 'the', 'cat']"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokens_2"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e1bb0e40-faa8-49f2-9d11-6da8dccb978e",
   "metadata": {},
   "source": [
    "### Stanza"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9c9bad2c",
   "metadata": {},
   "outputs": [],
   "source": [
    "import stanza\n",
    "# Report the real failure reason instead of assuming the model is already present.\n",
    "# Catching Exception (not a bare except) lets KeyboardInterrupt propagate.\n",
    "try:\n",
    "    stanza.download('en') # Download English model package\n",
    "except Exception as e:\n",
    "    print(f\"Stanza model 'en' download failed: {e}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "3c68b5b2-dde0-4767-b5d7-e49fc15c6103",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--- Sentence 1 ---\n",
      "Constituency Parse Tree (S-expression format):\n",
      "(ROOT (S (NP (DT The) (JJ quick) (JJ brown) (NN fox)) (VP (VBZ jumps) (PP (IN over) (NP (DT the) (JJ lazy) (NN dog)))) (. .)))\n",
      "\n",
      "NLTK Tree Visualization:\n",
      "                     ROOT                          \n",
      "                      |                             \n",
      "                      S                            \n",
      "       _______________|__________________________   \n",
      "      |                         VP               | \n",
      "      |                _________|___             |  \n",
      "      |               |             PP           | \n",
      "      |               |     ________|___         |  \n",
      "      NP              |    |            NP       | \n",
      "  ____|__________     |    |     _______|____    |  \n",
      " DT   JJ    JJ   NN  VBZ   IN   DT      JJ   NN  . \n",
      " |    |     |    |    |    |    |       |    |   |  \n",
      "The quick brown fox jumps over the     lazy dog  . \n",
      "\n"
     ]
    }
   ],
   "source": [
    "import stanza\n",
    "\n",
    "# Build the pipeline in this cell so that it does not depend on an `nlp` variable\n",
    "# left behind by a cell that is no longer part of the notebook.\n",
    "try:\n",
    "    nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,constituency', verbose=False)\n",
    "except Exception as e_init:\n",
    "    print(f\"Error initializing Stanza pipeline: {e_init}\")\n",
    "    nlp = None\n",
    "\n",
    "if nlp:\n",
    "    # Process a sentence\n",
    "    doc = nlp(\"The quick brown fox jumps over the lazy dog.\")\n",
    "    # doc = nlp(\"This is a more complex sentence with several clauses and phrases.\")\n",
    "    # doc = nlp(\"brown fox\")\n",
    "\n",
    "    for i, sentence in enumerate(doc.sentences):\n",
    "        print(f\"--- Sentence {i+1} ---\")\n",
    "        print(\"Constituency Parse Tree (S-expression format):\")\n",
    "        print(sentence.constituency) # Prints the tree in S-expression format (like LISP)\n",
    "\n",
    "        # To visualize with NLTK's Tree (optional, but nice)\n",
    "        try:\n",
    "            from nltk.tree import Tree\n",
    "            nltk_tree = Tree.fromstring(str(sentence.constituency))\n",
    "            print(\"\\nNLTK Tree Visualization:\")\n",
    "            nltk_tree.pretty_print()\n",
    "            # To draw the tree (requires tkinter and ghostscript usually)\n",
    "            # nltk_tree.draw()\n",
    "        except ImportError:\n",
    "            print(\"\\nNLTK Tree visualization skipped: NLTK not fully available or tree conversion issue.\")\n",
    "        except Exception as e_nltk:\n",
    "            print(f\"\\nError during NLTK tree visualization: {e_nltk}\")\n",
    "\n",
    "else:\n",
    "    print(\"Stanza NLP pipeline could not be initialized.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "83579096-ae6f-4410-8e89-8ec454fd65eb",
   "metadata": {},
   "outputs": [],
   "source": [
    "import stanza\n",
    "from nltk.tree import Tree\n",
    "import logging\n",
    "\n",
    "# Configure logging for Stanza (optional, to reduce verbosity during downloads/loading)\n",
    "# logging.basicConfig(level=logging.INFO) # Set to INFO or WARNING to see less\n",
    "\n",
    "# Global NLP pipeline to avoid re-initialization on every call\n",
    "STANZA_NLP_PIPELINE = None\n",
    "\n",
    "def initialize_stanza_pipeline():\n",
    "    \"\"\"Build the shared English constituency pipeline on first use; later calls do nothing.\"\"\"\n",
    "    global STANZA_NLP_PIPELINE\n",
    "    if STANZA_NLP_PIPELINE is not None:\n",
    "        return  # Already built: keep the existing pipeline.\n",
    "\n",
    "    print(\"Initializing Stanza pipeline for English constituency parsing...\")\n",
    "    try:\n",
    "        # Attempt the English model download; a failure is only reported, because the\n",
    "        # pipeline construction below is what decides whether usable models exist.\n",
    "        stanza.download('en', verbose=False, resources_url='stanford')\n",
    "    except Exception as download_error:\n",
    "        print(f\"Stanza download check/attempt info: {download_error}\")\n",
    "\n",
    "    pipeline_options = dict(\n",
    "        lang='en',\n",
    "        processors='tokenize,pos,constituency',  # Processors the span extraction relies on\n",
    "        verbose=False,\n",
    "        # Reuse models that are already on disk instead of downloading them again.\n",
    "        download_method=stanza.DownloadMethod.REUSE_RESOURCES,\n",
    "    )\n",
    "    STANZA_NLP_PIPELINE = stanza.Pipeline(**pipeline_options)\n",
    "    print(\"Stanza pipeline initialized.\")\n",
    "\n",
    "def _extract_spans_recursive(\n",
    "    node,\n",
    "    original_words, # List of Stanza Word objects for character offsets\n",
    "    current_token_index_list, # List containing a single int, to pass by reference\n",
    "    spans, # List to append span strings\n",
    "    offset_mappings, # List to append (start_char, end_char) tuples\n",
    "    phrase_labels_to_capture # Set of phrase labels to capture as whole units\n",
    "):\n",
    "    \"\"\"\n",
    "    Recursively traverses the NLTK tree (derived from Stanza's parse)\n",
    "    to extract semantic spans based on specified phrase labels.\n",
    "    \"\"\"\n",
    "    # Case 1: Node is a terminal string (a word)\n",
    "    if isinstance(node, str):\n",
    "        token_idx = current_token_index_list[0]\n",
    "        if token_idx < len(original_words):\n",
    "            word_obj = original_words[token_idx]\n",
    "            spans.append(word_obj.text) # The 'node' string is also the word text\n",
    "            offset_mappings.append((word_obj.start_char, word_obj.end_char))\n",
    "            current_token_index_list[0] += 1 # Consume this token\n",
    "        else:\n",
    "            # This indicates a mismatch, e.g., tree has more leaves than original sentence tokens.\n",
    "            # Should be rare with consistent Stanza output.\n",
    "            logging.warning(f\"Token index {token_idx} out of bounds for original_words (len {len(original_words)}). Leaf: '{node}'\")\n",
    "        return\n",
    "\n",
    "    # Case 2: Node is a non-terminal (an NLTK Tree object representing a phrase)\n",
    "    # Check if this phrase's label is one we want to capture as a single unit\n",
    "    if node.label() in phrase_labels_to_capture:\n",
    "        leaf_strings = node.leaves() # Get all word strings under this phrase\n",
    "        num_leaves = len(leaf_strings)\n",
    "        \n",
    "        start_token_idx = current_token_index_list[0]\n",
    "        # Calculate end_token_idx carefully to avoid going out of bounds\n",
    "        end_token_idx = min(start_token_idx + num_leaves - 1, len(original_words) - 1)\n",
    "\n",
    "        if num_leaves > 0 and start_token_idx <= end_token_idx:\n",
    "            # Construct the span text by joining the leaf strings\n",
    "            span_text = \" \".join(leaf_strings)\n",
    "            \n",
    "            # Get character offsets from the corresponding Stanza Word objects\n",
    "            start_char = original_words[start_token_idx].start_char\n",
    "            end_char = original_words[end_token_idx].end_char # Use the adjusted end_token_idx\n",
    "            \n",
    "            spans.append(span_text)\n",
    "            offset_mappings.append((start_char, end_char))\n",
    "            \n",
    "            # Advance the global token index past all leaves consumed by this phrase\n",
    "            current_token_index_list[0] = end_token_idx + 1\n",
    "        elif num_leaves > 0: # num_leaves > 0 but token indices became problematic\n",
    "            logging.warning(f\"Could not form span for node '{node.label()}' due to token index mismatch. Processing children instead as fallback.\")\n",
    "            # Fallback: if indices are problematic, process children to not lose tokens entirely\n",
    "            for child in node:\n",
    "                 _extract_spans_recursive(child, original_words, current_token_index_list, spans, offset_mappings, phrase_labels_to_capture)\n",
    "        # If num_leaves is 0 (empty phrase node), do nothing and don't advance token index.\n",
    "        return # This phrase is captured, so don't recurse into its children\n",
    "\n",
    "    # Case 3: Node is a non-terminal, but its label is NOT in phrase_labels_to_capture.\n",
    "    # So, recurse on its children to break it down further.\n",
    "    for child in node:\n",
    "        _extract_spans_recursive(child, original_words, current_token_index_list, spans, offset_mappings, phrase_labels_to_capture)\n",
    "\n",
    "\n",
    "def sentence_tokenizer(sentence_str: str, phrase_labels_to_capture: list = None) -> dict:\n",
    "    \"\"\"\n",
    "    Parses a sentence into semantic units (spans) using Stanza's constituency parser.\n",
    "    The \"level\" of grouping is controlled by `phrase_labels_to_capture`.\n",
    "\n",
    "    Only the first sentence Stanza detects in the input is tokenized; a warning is\n",
    "    logged when more sentences are present.\n",
    "\n",
    "    Note: a later cell of this notebook defines another `sentence_tokenizer` with a\n",
    "    different signature; once that cell has run, this version is shadowed.\n",
    "\n",
    "    Args:\n",
    "        sentence_str: The input sentence string.\n",
    "        phrase_labels_to_capture: A list of NLTK tree node labels (e.g., 'NP', 'PP', 'S').\n",
    "                                  Phrases with these labels will be captured as single spans.\n",
    "                                  If None, the default set {'NP', 'PP', 'ADJP', 'ADVP', 'SBAR', '.'}\n",
    "                                  is used, which excludes 'S' and 'VP' to encourage finer-grained units.\n",
    "                                  For example, ['NP', '.'] keeps noun phrases and the '.' node whole\n",
    "                                  and breaks every other phrase down into words.\n",
    "\n",
    "    Returns:\n",
    "        A dictionary with two keys:\n",
    "        - 'input_ids': A list of strings, where each string is a semantic span.\n",
    "        - 'offset_mapping': A list of tuples (start_char, end_char) indicating\n",
    "                            the character position of each span in `sentence_str`\n",
    "                            (leading whitespace of the input is accounted for).\n",
    "    \"\"\"\n",
    "    initialize_stanza_pipeline() # Ensure the Stanza pipeline is ready\n",
    "\n",
    "    if STANZA_NLP_PIPELINE is None: # Guard against pipeline initialization failure\n",
    "        logging.error(\"Stanza pipeline could not be initialized. Returning empty.\")\n",
    "        return {'input_ids': [], 'offset_mapping': []}\n",
    "\n",
    "    if phrase_labels_to_capture is None:\n",
    "        phrase_labels_to_capture_set = {'NP', 'PP', 'ADJP', 'ADVP', 'SBAR', '.'}\n",
    "    else:\n",
    "        phrase_labels_to_capture_set = set(phrase_labels_to_capture) # Use a set for efficient lookups\n",
    "\n",
    "    # The parser is given the stripped text, so the offsets it reports are relative to\n",
    "    # that text. Remember how many leading characters strip() removes so the returned\n",
    "    # offsets can be mapped back onto `sentence_str`.\n",
    "    leading_ws = len(sentence_str) - len(sentence_str.lstrip())\n",
    "\n",
    "    def _package(span_texts, span_offsets):\n",
    "        \"\"\"Build the result dict, shifting offsets back into `sentence_str` coordinates.\"\"\"\n",
    "        if leading_ws:\n",
    "            # None offsets (if any) are passed through unchanged.\n",
    "            span_offsets = [\n",
    "                (start if start is None else start + leading_ws,\n",
    "                 end if end is None else end + leading_ws)\n",
    "                for start, end in span_offsets\n",
    "            ]\n",
    "        return {'input_ids': span_texts, 'offset_mapping': span_offsets}\n",
    "\n",
    "    def _word_level(words):\n",
    "        \"\"\"Fallback result: one span per word.\"\"\"\n",
    "        return _package(\n",
    "            [word.text for word in words],\n",
    "            [(word.start_char, word.end_char) for word in words]\n",
    "        )\n",
    "\n",
    "    doc = STANZA_NLP_PIPELINE(sentence_str.strip()) # Process the cleaned sentence\n",
    "    if not doc.sentences:\n",
    "        return {'input_ids': [], 'offset_mapping': []} # Handle empty or unparsable input\n",
    "\n",
    "    if len(doc.sentences) > 1:\n",
    "        logging.warning(f\"Input contains {len(doc.sentences)} sentences; only the first one is tokenized.\")\n",
    "    stz_sentence = doc.sentences[0] # Process the first sentence\n",
    "\n",
    "    # Convert Stanza's constituency string to an NLTK Tree object\n",
    "    constituency_str = None # Bound before the try so the except handler can always report it\n",
    "    try:\n",
    "        constituency_str = str(stz_sentence.constituency)\n",
    "        if not constituency_str or constituency_str.strip() == \"()\" or \"XX\" in constituency_str : # Empty, trivial, or failed parse tree\n",
    "            logging.warning(f\"Empty or trivial constituency tree for: '{sentence_str}'. Defaulting to word-level spans.\")\n",
    "            return _word_level(stz_sentence.words)\n",
    "        root_nltk_tree = Tree.fromstring(constituency_str)\n",
    "    except Exception as e:\n",
    "        logging.error(f\"Error parsing Stanza constituency tree string: {e}. Tree string was: '{constituency_str}'. Defaulting to word-level spans.\")\n",
    "        return _word_level(stz_sentence.words)\n",
    "\n",
    "    original_words = stz_sentence.words # List of Stanza Word objects\n",
    "    if not original_words:\n",
    "        return {'input_ids': [], 'offset_mapping': []} # No words, no spans\n",
    "\n",
    "    spans_result = []\n",
    "    offset_mappings_result = []\n",
    "    current_token_index_list = [0] # Index for original_words, passed as a list for mutability\n",
    "\n",
    "    # When the tree is a ROOT node with a single child, start from that child (e.g., S).\n",
    "    effective_tree_root = root_nltk_tree\n",
    "    if root_nltk_tree.label() == 'ROOT' and len(root_nltk_tree) == 1:\n",
    "        effective_tree_root = root_nltk_tree[0]\n",
    "\n",
    "    # Start the recursive extraction from the effective root of the sentence tree\n",
    "    _extract_spans_recursive(\n",
    "        effective_tree_root,\n",
    "        original_words,\n",
    "        current_token_index_list,\n",
    "        spans_result,\n",
    "        offset_mappings_result,\n",
    "        phrase_labels_to_capture_set # Pass the set\n",
    "    )\n",
    "\n",
    "    # Safety check: if the traversal did not consume every word, append the\n",
    "    # remaining words as individual spans so nothing is lost.\n",
    "    final_token_idx_consumed = current_token_index_list[0]\n",
    "    if final_token_idx_consumed < len(original_words):\n",
    "        logging.warning(\n",
    "            f\"Not all tokens consumed by tree traversal. Consumed: {final_token_idx_consumed}, Total: {len(original_words)}. \"\n",
    "            f\"Appending remaining {len(original_words) - final_token_idx_consumed} token(s) individually.\"\n",
    "        )\n",
    "        for i in range(final_token_idx_consumed, len(original_words)):\n",
    "            word_obj = original_words[i]\n",
    "            spans_result.append(word_obj.text)\n",
    "            offset_mappings_result.append((word_obj.start_char, word_obj.end_char))\n",
    "\n",
    "    return _package(spans_result, offset_mappings_result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "71454386-797c-4a66-ac13-27c732781ca9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Processing sentence: 'The quick brown fox jumps over the lazy dog.'\n",
      "Initializing Stanza pipeline for English constituency parsing...\n",
      "Stanza pipeline initialized.\n",
      "\n",
      "Full Constituency Tree for Inspection:\n",
      "                     ROOT                          \n",
      "                      |                             \n",
      "                      S                            \n",
      "       _______________|__________________________   \n",
      "      |                         VP               | \n",
      "      |                _________|___             |  \n",
      "      |               |             PP           | \n",
      "      |               |     ________|___         |  \n",
      "      NP              |    |            NP       | \n",
      "  ____|__________     |    |     _______|____    |  \n",
      " DT   JJ    JJ   NN  VBZ   IN   DT      JJ   NN  . \n",
      " |    |     |    |    |    |    |       |    |   |  \n",
      "The quick brown fox jumps over the     lazy dog  . \n",
      "\n",
      "\n",
      "Tokenizing with specific phrase_labels_to_capture = ['NP', '.']:\n",
      "Spans: ['The quick brown fox', 'jumps', 'over', 'the lazy dog', '.']\n",
      "Offsets: [(0, 19), (20, 25), (26, 30), (31, 43), (43, 44)]\n",
      "------------------------------\n",
      "\n",
      "Tokenizing 'The quick brown fox jumps over the lazy dog.' with default phrase_labels_to_capture:\n",
      "Spans: ['The quick brown fox', 'jumps', 'over the lazy dog', '.']\n",
      "Offsets: [(0, 19), (20, 25), (26, 43), (43, 44)]\n",
      "------------------------------\n",
      "\n",
      "Processing sentence: 'He said that he would go to the park soon.'\n",
      "\n",
      "Tokenizing with phrase_labels_to_capture = ['NP', 'PP', 'SBAR', '.']:\n",
      "Spans: ['He', 'said', 'that he would go to the park soon', '.']\n",
      "Offsets: [(0, 2), (3, 7), (8, 41), (41, 42)]\n",
      "------------------------------\n",
      "\n",
      "Full Constituency Tree for Inspection (sentence3):\n",
      "         ROOT                        \n",
      "          |                           \n",
      "          S                          \n",
      "  ________|________________________   \n",
      " |             VP                  | \n",
      " |     ________|____               |  \n",
      " NP   |             NP             | \n",
      " |    |    _________|_______       |  \n",
      " DT  VBZ  DT        JJ      NN     . \n",
      " |    |   |         |       |      |  \n",
      "This  is  a       simple sentence  . \n",
      "\n",
      "\n",
      "Processing sentence: 'This is a simple sentence.'\n",
      "Tokenizing with default phrase_labels_to_capture:\n",
      "Spans: ['This', 'is', 'a simple sentence', '.']\n",
      "Offsets: [(0, 4), (5, 7), (8, 25), (25, 26)]\n",
      "------------------------------\n",
      "\n",
      "Processing sentence: 'A complex sentence with an adjective phrase, a very tall man, and a prepositional phrase, in the park, might be tricky.'\n",
      "Tokenizing with phrase_labels_to_capture = ['NP', 'PP', 'ADJP', 'ADVP', '.']\n",
      "Spans: ['A complex sentence with an adjective phrase , a very tall man , and a prepositional phrase , in the park ,', 'might', 'be', 'tricky', '.']\n",
      "Offsets: [(0, 102), (103, 108), (109, 111), (112, 118), (118, 119)]\n"
     ]
    }
   ],
   "source": [
    "# Initialize pipeline (can take a moment on first run or if models need downloading)\n",
    "# You might want to call initialize_stanza_pipeline() once at the start of your application.\n",
    "# initialize_stanza_pipeline() # Called implicitly by sentence_tokenizer if STANZA_NLP_PIPELINE is None\n",
    "\n",
    "sentence1 = \"The quick brown fox jumps over the lazy dog.\"\n",
    "print(f\"\\nProcessing sentence: '{sentence1}'\")\n",
    "\n",
    "# First, let's see the full constituency tree to understand its structure\n",
    "# (especially the label for punctuation)\n",
    "if STANZA_NLP_PIPELINE is None: initialize_stanza_pipeline() # Ensure initialized for inspection\n",
    "if STANZA_NLP_PIPELINE:\n",
    "    doc_inspect = STANZA_NLP_PIPELINE(sentence1)\n",
    "    if doc_inspect.sentences:\n",
    "        print(\"\\nFull Constituency Tree for Inspection:\")\n",
    "        stz_sent_inspect = doc_inspect.sentences[0]\n",
    "        Tree.fromstring(str(stz_sent_inspect.constituency)).pretty_print()\n",
    "        # For \"The quick brown fox jumps over the lazy dog.\", the last node is typically (. .)\n",
    "        # The label for this node is '.'\n",
    "\n",
    "# To get the specific output like: [\"The quick brown fox\", \"jumps\", \"over\", \"the lazy dog\", \".\"]\n",
    "# We need to capture 'NP' nodes and the punctuation node (whose label is often '.')\n",
    "# Other intermediate phrases like 'VP' or 'PP' should be broken down.\n",
    "specific_phrase_labels = ['NP', '.'] \n",
    "print(f\"\\nTokenizing with specific phrase_labels_to_capture = {specific_phrase_labels}:\")\n",
    "result_specific = sentence_tokenizer(sentence1, phrase_labels_to_capture=specific_phrase_labels)\n",
    "print(\"Spans:\", result_specific['input_ids'])\n",
    "print(\"Offsets:\", result_specific['offset_mapping'])\n",
    "# Expected: Spans: ['The quick brown fox', 'jumps', 'over', 'the lazy dog', '.']\n",
    "\n",
    "print(\"-\" * 30)\n",
    "\n",
    "# Test with default phrase_labels_to_capture (for more general semantic units)\n",
    "# Default is currently: {'NP', 'PP', 'ADJP', 'ADVP', 'SBAR', '.'}\n",
    "print(f\"\\nTokenizing '{sentence1}' with default phrase_labels_to_capture:\")\n",
    "result_default = sentence_tokenizer(sentence1)\n",
    "print(\"Spans:\", result_default['input_ids'])\n",
    "print(\"Offsets:\", result_default['offset_mapping'])\n",
    "# Expected with the default labels {'NP', 'PP', 'ADJP', 'ADVP', 'SBAR', '.'}:\n",
    "# Spans: ['The quick brown fox', 'jumps', 'over the lazy dog', '.']\n",
    "# The whole PP is captured as a single span because 'PP' is in the default set.\n",
    "# Splitting it into 'over' and 'the lazy dog' requires phrase_labels_to_capture=['NP', '.'],\n",
    "# as in the previous example.\n",
    "\n",
    "\n",
    "print(\"-\" * 30)\n",
    "sentence2 = \"He said that he would go to the park soon.\"\n",
    "print(f\"\\nProcessing sentence: '{sentence2}'\")\n",
    "# Let's capture SBAR (subordinate clauses) as units, along with NPs, PPs, and punctuation.\n",
    "sbar_labels = ['NP', 'PP', 'SBAR', '.']\n",
    "print(f\"\\nTokenizing with phrase_labels_to_capture = {sbar_labels}:\")\n",
    "result_sbar = sentence_tokenizer(sentence2, phrase_labels_to_capture=sbar_labels)\n",
    "print(\"Spans:\", result_sbar['input_ids'])\n",
    "print(\"Offsets:\", result_sbar['offset_mapping'])\n",
    "# Expect \"that he would go to the park soon\" (or parts of it if 'soon' is separate ADVP) to be captured by SBAR.\n",
    "# e.g. ['He', 'said', 'that he would go to the park soon', '.'] if \"soon\" is part of SBAR\n",
    "# or   ['He', 'said', 'that he would go to the park', 'soon', '.'] if SBAR is \"that he would go to the park\"\n",
    "\n",
    "print(\"-\" * 30)\n",
    "sentence3 = \"This is a simple sentence.\"\n",
    "if STANZA_NLP_PIPELINE:\n",
    "    doc_inspect_s3 = STANZA_NLP_PIPELINE(sentence3)\n",
    "    if doc_inspect_s3.sentences:\n",
    "        print(\"\\nFull Constituency Tree for Inspection (sentence3):\")\n",
    "        Tree.fromstring(str(doc_inspect_s3.sentences[0].constituency)).pretty_print()\n",
    "        \n",
    "print(f\"\\nProcessing sentence: '{sentence3}'\")\n",
    "print(f\"Tokenizing with default phrase_labels_to_capture:\")\n",
    "result_simple = sentence_tokenizer(sentence3)\n",
    "print(\"Spans:\", result_simple['input_ids'])\n",
    "print(\"Offsets:\", result_simple['offset_mapping'])\n",
    "\n",
    "print(\"-\" * 30)\n",
    "sentence4 = \"A complex sentence with an adjective phrase, a very tall man, and a prepositional phrase, in the park, might be tricky.\"\n",
    "print(f\"\\nProcessing sentence: '{sentence4}'\")\n",
    "complex_labels = ['NP', 'PP', 'ADJP', 'ADVP', '.']\n",
    "print(f\"Tokenizing with phrase_labels_to_capture = {complex_labels}\")\n",
    "result_complex = sentence_tokenizer(sentence4, phrase_labels_to_capture=complex_labels)\n",
    "print(\"Spans:\", result_complex['input_ids'])\n",
    "print(\"Offsets:\", result_complex['offset_mapping'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "554a7b6b-545a-4ec7-b67b-019d4686818e",
   "metadata": {},
   "outputs": [],
   "source": [
    "import stanza\n",
    "import nltk\n",
    "from nltk.tree import Tree\n",
    "\n",
    "# It's good practice to initialize the Stanza pipeline once\n",
    "# and pass it to the function if you're calling it multiple times.\n",
    "NLP_PIPELINE = None\n",
    "\n",
    "def initialize_stanza(lang='en', processors='tokenize,pos,constituency'):\n",
    "    \"\"\"\n",
    "    Initializes (or reuses) the module-level Stanza pipeline and returns it.\n",
    "\n",
    "    The pipeline is rebuilt when none exists yet or when `lang`/`processors` differ\n",
    "    from the values used to build the cached one.\n",
    "\n",
    "    Args:\n",
    "        lang: Language code passed to stanza.download and stanza.Pipeline.\n",
    "        processors: Processor specification passed to stanza.download and stanza.Pipeline.\n",
    "\n",
    "    Returns:\n",
    "        The stanza.Pipeline stored in NLP_PIPELINE.\n",
    "\n",
    "    Raises:\n",
    "        Exception: Any error from the download or pipeline construction is printed,\n",
    "                   NLP_PIPELINE is reset to None, and the error is re-raised.\n",
    "    \"\"\"\n",
    "    global NLP_PIPELINE\n",
    "    requested_config = (lang, processors)\n",
    "    # The configuration of the cached pipeline is remembered on the function object\n",
    "    # instead of being read back from the Pipeline instance, so the cache check does\n",
    "    # not rely on Pipeline attributes (`processors_str` does not appear to be a\n",
    "    # documented Pipeline attribute -- TODO confirm).\n",
    "    cached_config = getattr(initialize_stanza, '_cached_config', None)\n",
    "    if NLP_PIPELINE is None or cached_config != requested_config:\n",
    "        print(f\"Initializing Stanza pipeline for {lang} with {processors}...\")\n",
    "        try:\n",
    "            stanza.download(lang=lang, processors=processors, verbose=False)\n",
    "            NLP_PIPELINE = stanza.Pipeline(lang=lang, processors=processors, verbose=False, download_method=None)\n",
    "            initialize_stanza._cached_config = requested_config\n",
    "            print(\"Stanza pipeline initialized.\")\n",
    "        except Exception as e:\n",
    "            print(f\"Error initializing Stanza pipeline: {e}\")\n",
    "            NLP_PIPELINE = None # Ensure it's None if failed\n",
    "            initialize_stanza._cached_config = None\n",
    "            raise\n",
    "    return NLP_PIPELINE\n",
    "\n",
    "def sentence_tokenizer(sentence_text: str, target_nltk_height: int, nlp: stanza.Pipeline) -> dict:\n",
    "    \"\"\"\n",
    "    Parses a sentence into spans based on a specified NLTK tree height.\n",
    "\n",
    "    Args:\n",
    "        sentence_text: The input sentence string.\n",
    "        target_nltk_height: The desired NLTK height of subtrees to be extracted as spans.\n",
    "                            - NLTK height of a pre-terminal (e.g., (DT The)) is 2.\n",
    "                              Using target_nltk_height = 2 will generally result in token-level spans.\n",
    "                            - NLTK height of a phrase whose children are only pre-terminals\n",
    "                              (e.g., (NP (DT a) (NN dog))) is 3.\n",
    "                              Using target_nltk_height = 3 will group words under such \"flat\" phrases.\n",
    "                            - Must be an integer >= 2.\n",
    "        nlp: An initialized Stanza Pipeline object.\n",
    "\n",
    "    Returns:\n",
    "        A dictionary with:\n",
    "        - 'input_ids': A list of strings, where each string is a span.\n",
    "        - 'offset_mapping': A list of (start_char, end_char) tuples for each span,\n",
    "                            where end_char is exclusive.\n",
    "    \"\"\"\n",
    "    if not sentence_text.strip():\n",
    "        return {'input_ids': [], 'offset_mapping': []}\n",
    "    if not nlp:\n",
    "        raise ValueError(\"Stanza pipeline (nlp) not initialized or provided.\")\n",
    "    if not isinstance(target_nltk_height, int) or target_nltk_height < 2:\n",
    "        raise ValueError(\"target_nltk_height must be an integer >= 2.\")\n",
    "\n",
    "    doc = nlp(sentence_text)\n",
    "    if not doc.sentences:\n",
    "        return {'input_ids': [], 'offset_mapping': []}\n",
    "\n",
    "    # For this function, we process the first sentence.\n",
    "    # It could be extended to handle multiple sentences in a document.\n",
    "    stz_sentence = doc.sentences[0]\n",
    "\n",
    "    if not stz_sentence.constituency:\n",
    "        # Fallback to basic tokenization if no constituency tree is available for some reason\n",
    "        print(\"Warning: No constituency tree found in Stanza sentence. Falling back to tokenization.\")\n",
    "        spans = [token.text for token in stz_sentence.tokens]\n",
    "        offsets = [(token.start_char, token.end_char) for token in stz_sentence.tokens]\n",
    "        return {'input_ids': spans, 'offset_mapping': offsets}\n",
    "\n",
    "    try:\n",
    "        # The Stanza constituency output might be a simple string if parsing failed,\n",
    "        # or a LISP-style tree string.\n",
    "        if not str(stz_sentence.constituency).startswith(\"(\"): # Basic check for tree structure\n",
    "             raise ValueError(\"Constituency output does not look like a tree.\")\n",
    "        nltk_tree = Tree.fromstring(str(stz_sentence.constituency))\n",
    "    except ValueError as e:\n",
    "        print(f\"Warning: Could not parse constituency tree string: '{stz_sentence.constituency}'. Error: {e}. Falling back to tokenization.\")\n",
    "        spans = [token.text for token in stz_sentence.tokens]\n",
    "        offsets = [(token.start_char, token.end_char) for token in stz_sentence.tokens]\n",
    "        return {'input_ids': spans, 'offset_mapping': offsets}\n",
    "\n",
    "\n",
    "    stz_tokens = stz_sentence.tokens\n",
    "    collected_spans_info = [] # Will store dicts of {'text': ..., 'start_char': ..., 'end_char': ...}\n",
    "\n",
    "    # Helper recursive function to find spans\n",
    "    def _find_spans_recursive(current_node, current_leaf_idx):\n",
    "        \"\"\"\n",
    "        Traverses the tree, collects spans, and returns the updated leaf index.\n",
    "        current_leaf_idx is the index in stz_tokens that the first leaf of current_node corresponds to.\n",
    "        \"\"\"\n",
    "        if not isinstance(current_node, Tree):\n",
    "            # This case should ideally not be reached if called with Tree objects.\n",
    "            # If current_node is a leaf string, its parent (pre-terminal) should handle it.\n",
    "            return current_leaf_idx\n",
    "\n",
    "        node_height = current_node.height()\n",
    "        take_this_node_as_span = False\n",
    "\n",
    "        if node_height == target_nltk_height:\n",
    "            take_this_node_as_span = True\n",
    "        elif node_height < target_nltk_height and node_height >= 2: # NLTK Height 2 is pre-terminal.\n",
    "            # This node is \"flatter\" than the target. Take it as is to ensure full coverage.\n",
    "            take_this_node_as_span = True\n",
    "        \n",
    "        if take_this_node_as_span:\n",
    "            span_leaves = current_node.leaves()\n",
    "            num_leaves_in_span = len(span_leaves)\n",
    "\n",
    "            if num_leaves_in_span == 0: # Should not happen for valid subtrees from constituency parser\n",
    "                return current_leaf_idx\n",
    "            \n",
    "            # Boundary check for token indices\n",
    "            start_token_idx_for_span = current_leaf_idx\n",
    "            end_token_idx_for_span = current_leaf_idx + num_leaves_in_span - 1\n",
    "\n",
    "            if not (0 <= start_token_idx_for_span < len(stz_tokens) and \\\n",
    "                    0 <= end_token_idx_for_span < len(stz_tokens) and \\\n",
    "                    start_token_idx_for_span <= end_token_idx_for_span):\n",
    "                print(f\"Warning: Span token indices [{start_token_idx_for_span}-{end_token_idx_for_span}] \"\n",
    "                      f\"out of bounds (total tokens: {len(stz_tokens)}). Span leaves: '{' '.join(span_leaves)}'. Skipping.\")\n",
    "                # Still need to advance the leaf index by the number of leaves this node claims\n",
    "                return current_leaf_idx + num_leaves_in_span\n",
    "\n",
    "            span_text = \" \".join(span_leaves)\n",
    "            span_start_char = stz_tokens[start_token_idx_for_span].start_char\n",
    "            # Stanza's token.end_char is already exclusive for slicing.\n",
    "            span_end_char = stz_tokens[end_token_idx_for_span].end_char \n",
    "\n",
    "            collected_spans_info.append({\n",
    "                'text': span_text,\n",
    "                'start_char': span_start_char,\n",
    "                'end_char': span_end_char\n",
    "            })\n",
    "            return current_leaf_idx + num_leaves_in_span\n",
    "        \n",
    "        elif node_height > target_nltk_height: # Recurse on children\n",
    "            updated_leaf_idx = current_leaf_idx\n",
    "            for child in current_node:\n",
    "                if isinstance(child, Tree):\n",
    "                    updated_leaf_idx = _find_spans_recursive(child, updated_leaf_idx)\n",
    "                else:\n",
    "                    # Child is a leaf string. This means 'current_node' is a pre-terminal (height 2).\n",
    "                    # This path is taken if node_height (2) > target_nltk_height (e.g., target_nltk_height = 1, which is invalid for this func).\n",
    "                    # Or, if a token somehow isn't captured by its pre-terminal being taken as a span.\n",
    "                    # This case should ideally be covered by pre-terminals (H=2) being caught by `take_this_node_as_span`.\n",
    "                    # However, to be safe, if we encounter a direct leaf string here, we treat it as a single token span.\n",
    "                    if 0 <= updated_leaf_idx < len(stz_tokens):\n",
    "                         collected_spans_info.append({\n",
    "                            'text': child,\n",
    "                            'start_char': stz_tokens[updated_leaf_idx].start_char,\n",
    "                            'end_char': stz_tokens[updated_leaf_idx].end_char\n",
    "                        })\n",
    "                    else:\n",
    "                        print(f\"Warning: Leaf index {updated_leaf_idx} out of bounds for single token '{child}'.\")\n",
    "                    updated_leaf_idx += 1 # Advance for this single leaf\n",
    "            return updated_leaf_idx\n",
    "        else:\n",
    "            # This case implies node_height < 2 (e.g., 1 for Tree('A', [])).\n",
    "            # Standard constituency trees end in pre-terminals (H=2) with string children.\n",
    "            # If we reach here, it means the node is too small and wasn't processed.\n",
    "            # We should advance the leaf counter by its leaves to maintain consistency.\n",
    "            return current_leaf_idx + len(current_node.leaves())\n",
    "\n",
    "    # Start the recursive processing from the root of the parsed tree\n",
    "    if isinstance(nltk_tree, Tree):\n",
    "        _find_spans_recursive(nltk_tree, 0) # Initial leaf index is 0\n",
    "\n",
    "    final_spans = [info['text'] for info in collected_spans_info]\n",
    "    final_offsets = [(info['start_char'], info['end_char']) for info in collected_spans_info]\n",
    "\n",
    "    return {'input_ids': final_spans, 'offset_mapping': final_offsets}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "e616969f-361c-42de-a72f-43bc35f6ae7e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Initializing Stanza pipeline for en with tokenize,pos,constituency...\n",
      "Stanza pipeline initialized.\n",
      "Sentence: 'The quick brown fox jumps over the lazy dog.'\n",
      "  Spans (target_nltk_height=2): ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.']\n",
      "  Offsets (target_nltk_height=2): [(0, 3), (4, 9), (10, 15), (16, 19), (20, 25), (26, 30), (31, 34), (35, 39), (40, 43), (43, 44)]\n",
      "  Spans (target_nltk_height=3): ['The quick brown fox', 'jumps', 'over', 'the lazy dog', '.']\n",
      "  Offsets (target_nltk_height=3): [(0, 19), (20, 25), (26, 30), (31, 43), (43, 44)]\n",
      "  Spans (target_nltk_height=4): ['The quick brown fox', 'jumps', 'over the lazy dog', '.']\n",
      "  Offsets (target_nltk_height=4): [(0, 19), (20, 25), (26, 43), (43, 44)]\n",
      "\n",
      "Sentence: 'This is a simple sentence.'\n",
      "  Spans (target_nltk_height=2): ['This', 'is', 'a', 'simple', 'sentence', '.']\n",
      "  Spans (target_nltk_height=3): ['This', 'is', 'a simple sentence', '.']\n",
      "\n",
      "Sentence: 'Constituency parsing can be complex.'\n",
      "  Spans (target_nltk_height=3): ['Constituency parsing', 'can', 'be', 'complex', '.']\n",
      "\n",
      "Sentence: ''\n",
      "  Spans: []\n",
      "\n",
      "Sentence: 'Go.'\n",
      "  Spans (target_nltk_height=2): ['Go', '.']\n",
      "  Spans (target_nltk_height=3): ['Go', '.']\n",
      "\n",
      "Note on the example '[\"The quick brown fox\", \"jumps\", \"over\", \"the lazy dog\", \".\"]':\n",
      "This specific segmentation requires a custom logic beyond simple NLTK tree height.\n",
      "For example, with target_nltk_height=3, you get larger phrase chunks:\n",
      "  e.g., ['The quick brown fox', 'jumps', 'over', 'the lazy dog', '.']\n",
      "And with target_nltk_height=2, you get token-level chunks:\n",
      "  e.g., ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.']\n"
     ]
    }
   ],
   "source": [
    "if __name__ == '__main__':\n",
    "    # Example Usage:\n",
    "    try:\n",
    "        nlp_pipeline = initialize_stanza() # Initialize the global pipeline\n",
    "\n",
    "        sentence1 = \"The quick brown fox jumps over the lazy dog.\"\n",
    "        sentence2 = \"This is a simple sentence.\"\n",
    "        sentence3 = \"Constituency parsing can be complex.\"\n",
    "        empty_sentence = \"\"\n",
    "        short_sentence = \"Go.\"\n",
    "\n",
    "        print(f\"Sentence: '{sentence1}'\")\n",
    "        # NLTK Height 2: Pre-terminals (token-level spans)\n",
    "        result_h2_s1 = sentence_tokenizer(sentence1, target_nltk_height=2, nlp=nlp_pipeline)\n",
    "        print(f\"  Spans (target_nltk_height=2): {result_h2_s1['input_ids']}\")\n",
    "        print(f\"  Offsets (target_nltk_height=2): {result_h2_s1['offset_mapping']}\")\n",
    "\n",
    "        # NLTK Height 3: \"Flat\" phrases (phrases whose children are pre-terminals)\n",
    "        result_h3_s1 = sentence_tokenizer(sentence1, target_nltk_height=3, nlp=nlp_pipeline)\n",
    "        print(f\"  Spans (target_nltk_height=3): {result_h3_s1['input_ids']}\")\n",
    "        print(f\"  Offsets (target_nltk_height=3): {result_h3_s1['offset_mapping']}\")\n",
    "        \n",
    "        # NLTK Height 4: Higher-level phrases\n",
    "        result_h4_s1 = sentence_tokenizer(sentence1, target_nltk_height=4, nlp=nlp_pipeline)\n",
    "        print(f\"  Spans (target_nltk_height=4): {result_h4_s1['input_ids']}\") # Often gives larger chunks\n",
    "        print(f\"  Offsets (target_nltk_height=4): {result_h4_s1['offset_mapping']}\")\n",
    "\n",
    "\n",
    "        print(f\"\\nSentence: '{sentence2}'\")\n",
    "        # (ROOT (S (NP (DT This)) (VP (VBZ is) (NP (DT a) (JJ simple) (NN sentence))) (. .)))\n",
    "        # S height = 4\n",
    "        # NP(This) height = 3 (This is DT H2)\n",
    "        # VP height = 3\n",
    "        # NP(a simple sen) height = 3\n",
    "        # . height = 2\n",
    "        result_h2_s2 = sentence_tokenizer(sentence2, target_nltk_height=2, nlp=nlp_pipeline)\n",
    "        print(f\"  Spans (target_nltk_height=2): {result_h2_s2['input_ids']}\")\n",
    "        result_h3_s2 = sentence_tokenizer(sentence2, target_nltk_height=3, nlp=nlp_pipeline)\n",
    "        print(f\"  Spans (target_nltk_height=3): {result_h3_s2['input_ids']}\")\n",
    "\n",
    "\n",
    "        print(f\"\\nSentence: '{sentence3}'\")\n",
    "        result_h3_s3 = sentence_tokenizer(sentence3, target_nltk_height=3, nlp=nlp_pipeline)\n",
    "        print(f\"  Spans (target_nltk_height=3): {result_h3_s3['input_ids']}\")\n",
    "        \n",
    "        print(f\"\\nSentence: '{empty_sentence}'\")\n",
    "        result_empty = sentence_tokenizer(empty_sentence, target_nltk_height=2, nlp=nlp_pipeline)\n",
    "        print(f\"  Spans: {result_empty['input_ids']}\")\n",
    "\n",
    "        print(f\"\\nSentence: '{short_sentence}'\")\n",
    "        # (ROOT (S (VP (VB Go)) (. .)))\n",
    "        # S height = 3. VP height = 2. VB height = 2. . height = 2\n",
    "        result_h2_short = sentence_tokenizer(short_sentence, target_nltk_height=2, nlp=nlp_pipeline)\n",
    "        print(f\"  Spans (target_nltk_height=2): {result_h2_short['input_ids']}\")\n",
    "        result_h3_short = sentence_tokenizer(short_sentence, target_nltk_height=3, nlp=nlp_pipeline)\n",
    "        print(f\"  Spans (target_nltk_height=3): {result_h3_short['input_ids']}\")\n",
    "\n",
    "\n",
    "        # Regarding your example output: [\"The quick brown fox\", \"jumps\", \"over\", \"the lazy dog\", \".\"]\n",
    "        # This specific output for \"The quick brown fox jumps over the lazy dog.\" is not directly achievable\n",
    "        # with a single target_nltk_height using the logic above because it mixes levels/strategies\n",
    "        # (e.g., taking a full NP, but then taking individual words like \"jumps\" and \"over\" which are\n",
    "        # heads of VP and PP respectively, rather than the full VP or PP at a consistent height).\n",
    "        # Achieving that specific output would require a more tailored chunking strategy.\n",
    "        print(\"\\nNote on the example '[\\\"The quick brown fox\\\", \\\"jumps\\\", \\\"over\\\", \\\"the lazy dog\\\", \\\".\\\"]':\")\n",
    "        print(\"This specific segmentation requires a custom logic beyond simple NLTK tree height.\")\n",
    "        print(\"For example, with target_nltk_height=3, you get larger phrase chunks:\")\n",
    "        print(f\"  e.g., {result_h3_s1['input_ids']}\")\n",
    "        print(\"And with target_nltk_height=2, you get token-level chunks:\")\n",
    "        print(f\"  e.g., {result_h2_s1['input_ids']}\")\n",
    "\n",
    "\n",
    "    except Exception as e:\n",
    "        print(f\"An error occurred in the main execution: {e}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "29d51dd9-53bd-4622-8508-b03ff1c695cd",
   "metadata": {},
   "outputs": [],
   "source": [
    "sentence = \" movie with this red-hot guy. and he was really good. My only problem was that the movie was so incredibly\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "8ea4e70d-01d0-432c-9d20-0ed46581e64b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['movie', 'with', 'this', 'red - hot', 'guy', '.']"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sentence_tokenizer(sentence, target_nltk_height=3, nlp=nlp_pipeline)[\"input_ids\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "3e488fd2-be46-4909-84fb-86d84c498a55",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'input_ids': ['movie with this red - hot guy .'], 'offset_mapping': [(1, 29)]}"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sentence_tokenizer(sentence, target_nltk_height=6, nlp=nlp_pipeline)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "1fa5e8a4-495b-47a9-acfd-6899dece9383",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'movie with this red-hot guy. and'"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sentence[1:33]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "84dcafe5-e7ea-4636-9312-783d008fd5ea",
   "metadata": {},
   "source": [
    "## Multi-Sentence"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "44e885e2-a0af-41b9-b45b-872f4a927c56",
   "metadata": {},
   "outputs": [],
   "source": [
    "# It's good practice to initialize the Stanza pipeline once\n",
    "# and pass it to the function if you're calling it multiple times.\n",
    "NLP_PIPELINE = None\n",
    "\n",
    "def initialize_stanza(lang='en', processors='tokenize,pos,constituency'):\n",
    "    \"\"\"Initializes and returns the Stanza pipeline.\"\"\"\n",
    "    global NLP_PIPELINE\n",
    "    if NLP_PIPELINE is None or NLP_PIPELINE.lang != lang or NLP_PIPELINE.processors_str != processors:\n",
    "        print(f\"Initializing Stanza pipeline for {lang} with {processors}...\")\n",
    "        try:\n",
    "            stanza.download(lang=lang, processors=processors, verbose=False) # Ensure models are downloaded\n",
    "            NLP_PIPELINE = stanza.Pipeline(lang=lang, processors=processors, verbose=False, download_method=None)\n",
    "            print(\"Stanza pipeline initialized.\")\n",
    "        except Exception as e:\n",
    "            print(f\"Error initializing Stanza pipeline: {e}\")\n",
    "            NLP_PIPELINE = None # Ensure it's None if failed\n",
    "            raise\n",
    "    return NLP_PIPELINE\n",
    "\n",
    "def sentence_tokenizer_recursive_multisentence(sentence_text: str, target_nltk_height: int, nlp: stanza.Pipeline) -> dict:\n",
    "    \"\"\"\n",
    "    Parses multiple sentences in a text into spans based on a specified NLTK tree height for each sentence.\n",
    "\n",
    "    Args:\n",
    "        sentence_text: The input text string, potentially containing multiple sentences.\n",
    "        target_nltk_height: The desired NLTK height of subtrees to be extracted as spans.\n",
    "                            - NLTK height of a pre-terminal (e.g., (DT The)) is 2.\n",
    "                              Using target_nltk_height = 2 will generally result in token-level spans.\n",
    "                            - NLTK height of a phrase whose children are only pre-terminals\n",
    "                              (e.g., (NP (DT a) (NN dog))) is 3.\n",
    "                              Using target_nltk_height = 3 will group words under such \"flat\" phrases.\n",
    "                            - Must be an integer >= 2.\n",
    "        nlp: An initialized Stanza Pipeline object.\n",
    "\n",
    "    Returns:\n",
    "        A dictionary with:\n",
    "        - 'input_ids': A list of strings, where each string is a span, from all sentences.\n",
    "        - 'offset_mapping': A list of (start_char, end_char) tuples for each span,\n",
    "                            where end_char is exclusive. Offsets are relative to the original sentence_text.\n",
    "    \"\"\"\n",
    "    if not sentence_text.strip():\n",
    "        return {'input_ids': [], 'offset_mapping': []}\n",
    "    if not nlp:\n",
    "        raise ValueError(\"Stanza pipeline (nlp) not initialized or provided.\")\n",
    "    if not isinstance(target_nltk_height, int) or target_nltk_height < 2:\n",
    "        raise ValueError(\"target_nltk_height must be an integer >= 2.\")\n",
    "\n",
    "    doc = nlp(sentence_text)\n",
    "    if not doc.sentences:\n",
    "        return {'input_ids': [], 'offset_mapping': []}\n",
    "\n",
    "    overall_collected_spans_info = [] # Accumulates dicts from ALL sentences\n",
    "\n",
    "    # Define the recursive helper function here.\n",
    "    # It can access target_nltk_height from the outer scope.\n",
    "    def _recursive_span_finder(current_node, current_leaf_idx, current_sentence_tokens, current_sentence_output_list):\n",
    "        if not isinstance(current_node, Tree):\n",
    "            return current_leaf_idx\n",
    "\n",
    "        node_height = current_node.height()\n",
    "        take_this_node_as_span = False\n",
    "\n",
    "        if node_height == target_nltk_height:\n",
    "            take_this_node_as_span = True\n",
    "        elif node_height < target_nltk_height and node_height >= 2:\n",
    "            take_this_node_as_span = True\n",
    "        \n",
    "        if take_this_node_as_span:\n",
    "            span_leaves = current_node.leaves()\n",
    "            num_leaves_in_span = len(span_leaves)\n",
    "\n",
    "            if num_leaves_in_span == 0:\n",
    "                return current_leaf_idx\n",
    "            \n",
    "            start_token_idx_for_span = current_leaf_idx\n",
    "            end_token_idx_for_span = current_leaf_idx + num_leaves_in_span - 1\n",
    "\n",
    "            if not (0 <= start_token_idx_for_span < len(current_sentence_tokens) and \\\n",
    "                    0 <= end_token_idx_for_span < len(current_sentence_tokens) and \\\n",
    "                    start_token_idx_for_span <= end_token_idx_for_span):\n",
    "                print(f\"Warning: Span token indices [{start_token_idx_for_span}-{end_token_idx_for_span}] \"\n",
    "                      f\"out of bounds (total tokens in current sentence: {len(current_sentence_tokens)}). \"\n",
    "                      f\"Span leaves: '{' '.join(span_leaves)}'. Skipping.\")\n",
    "                return current_leaf_idx + num_leaves_in_span\n",
    "\n",
    "            span_text = \" \".join(span_leaves)\n",
    "            span_start_char = current_sentence_tokens[start_token_idx_for_span].start_char\n",
    "            span_end_char = current_sentence_tokens[end_token_idx_for_span].end_char\n",
    "\n",
    "            current_sentence_output_list.append({\n",
    "                'text': span_text,\n",
    "                'start_char': span_start_char,\n",
    "                'end_char': span_end_char\n",
    "            })\n",
    "            return current_leaf_idx + num_leaves_in_span\n",
    "        \n",
    "        elif node_height > target_nltk_height:\n",
    "            updated_leaf_idx = current_leaf_idx\n",
    "            for child in current_node:\n",
    "                if isinstance(child, Tree):\n",
    "                    updated_leaf_idx = _recursive_span_finder(child, updated_leaf_idx, current_sentence_tokens, current_sentence_output_list)\n",
    "                else: # Child is a leaf string\n",
    "                    if 0 <= updated_leaf_idx < len(current_sentence_tokens):\n",
    "                        current_sentence_output_list.append({\n",
    "                            'text': child,\n",
    "                            'start_char': current_sentence_tokens[updated_leaf_idx].start_char,\n",
    "                            'end_char': current_sentence_tokens[updated_leaf_idx].end_char\n",
    "                        })\n",
    "                    else:\n",
    "                        print(f\"Warning: Leaf index {updated_leaf_idx} out of bounds for single token '{child}' in current sentence.\")\n",
    "                    updated_leaf_idx += 1\n",
    "            return updated_leaf_idx\n",
    "        else: # node_height < 2\n",
    "            return current_leaf_idx + len(current_node.leaves())\n",
    "\n",
    "\n",
    "    for stz_sentence in doc.sentences:\n",
    "        current_sentence_collected_info = [] # Spans for *this* sentence only\n",
    "        local_sentence_tokens = stz_sentence.tokens\n",
    "\n",
    "        # Fallback to tokenization for this sentence if no valid constituency tree\n",
    "        perform_fallback = False\n",
    "        if not stz_sentence.constituency:\n",
    "            print(f\"Warning: No constituency tree found for sentence: '{stz_sentence.text}'. \"\n",
    "                  \"Falling back to tokenization for this sentence.\")\n",
    "            perform_fallback = True\n",
    "        elif not str(stz_sentence.constituency).startswith(\"(\"):\n",
    "            print(f\"Warning: Constituency output for sentence '{stz_sentence.text}' \"\n",
    "                  f\"does not look like a tree: '{stz_sentence.constituency}'. \"\n",
    "                  \"Falling back to tokenization for this sentence.\")\n",
    "            perform_fallback = True\n",
    "\n",
    "        if perform_fallback:\n",
    "            for token in local_sentence_tokens:\n",
    "                current_sentence_collected_info.append({\n",
    "                    'text': token.text,\n",
    "                    'start_char': token.start_char,\n",
    "                    'end_char': token.end_char\n",
    "                })\n",
    "            overall_collected_spans_info.extend(current_sentence_collected_info)\n",
    "            continue # Move to the next sentence\n",
    "\n",
    "        try:\n",
    "            nltk_tree = Tree.fromstring(str(stz_sentence.constituency))\n",
    "        except ValueError as e:\n",
    "            print(f\"Warning: Could not parse constituency tree string for sentence: '{stz_sentence.text}'. \"\n",
    "                  f\"Error: {e}. Falling back to tokenization for this sentence.\")\n",
    "            for token in local_sentence_tokens:\n",
    "                current_sentence_collected_info.append({\n",
    "                    'text': token.text,\n",
    "                    'start_char': token.start_char,\n",
    "                    'end_char': token.end_char\n",
    "                })\n",
    "            overall_collected_spans_info.extend(current_sentence_collected_info)\n",
    "            continue # Move to the next sentence\n",
    "\n",
    "        # Process this sentence's tree\n",
    "        if isinstance(nltk_tree, Tree):\n",
    "            _recursive_span_finder(nltk_tree, 0, local_sentence_tokens, current_sentence_collected_info)\n",
    "        \n",
    "        overall_collected_spans_info.extend(current_sentence_collected_info)\n",
    "\n",
    "    # Prepare final output\n",
    "    final_spans = [info['text'] for info in overall_collected_spans_info]\n",
    "    final_offsets = [(info['start_char'], info['end_char']) for info in overall_collected_spans_info]\n",
    "\n",
    "    return {'input_ids': final_spans, 'offset_mapping': final_offsets}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "3296593c-e1fd-4d69-ad49-5addecf1a456",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Initializing Stanza pipeline for en with tokenize,pos,constituency...\n",
      "Stanza pipeline initialized.\n",
      "Multi-sentence Text: 'The quick brown fox jumps over the lazy dog. This is the second sentence. And a third one!'\n",
      "\n",
      "  Spans (target_nltk_height=2):\n",
      "    - 'The' (Offset: (0, 3), Original: 'The')\n",
      "    - 'quick' (Offset: (4, 9), Original: 'quick')\n",
      "    - 'brown' (Offset: (10, 15), Original: 'brown')\n",
      "    - 'fox' (Offset: (16, 19), Original: 'fox')\n",
      "    - 'jumps' (Offset: (20, 25), Original: 'jumps')\n",
      "    - 'over' (Offset: (26, 30), Original: 'over')\n",
      "    - 'the' (Offset: (31, 34), Original: 'the')\n",
      "    - 'lazy' (Offset: (35, 39), Original: 'lazy')\n",
      "    - 'dog' (Offset: (40, 43), Original: 'dog')\n",
      "    - '.' (Offset: (43, 44), Original: '.')\n",
      "    - 'This' (Offset: (45, 49), Original: 'This')\n",
      "    - 'is' (Offset: (50, 52), Original: 'is')\n",
      "    - 'the' (Offset: (53, 56), Original: 'the')\n",
      "    - 'second' (Offset: (57, 63), Original: 'second')\n",
      "    - 'sentence' (Offset: (64, 72), Original: 'sentence')\n",
      "    - '.' (Offset: (72, 73), Original: '.')\n",
      "    - 'And' (Offset: (74, 77), Original: 'And')\n",
      "    - 'a' (Offset: (78, 79), Original: 'a')\n",
      "    - 'third' (Offset: (80, 85), Original: 'third')\n",
      "    - 'one' (Offset: (86, 89), Original: 'one')\n",
      "    - '!' (Offset: (89, 90), Original: '!')\n",
      "\n",
      "  Spans (target_nltk_height=3):\n",
      "    - 'The quick brown fox' (Offset: (0, 19), Original: 'The quick brown fox')\n",
      "    - 'jumps' (Offset: (20, 25), Original: 'jumps')\n",
      "    - 'over' (Offset: (26, 30), Original: 'over')\n",
      "    - 'the lazy dog' (Offset: (31, 43), Original: 'the lazy dog')\n",
      "    - '.' (Offset: (43, 44), Original: '.')\n",
      "    - 'This' (Offset: (45, 49), Original: 'This')\n",
      "    - 'is' (Offset: (50, 52), Original: 'is')\n",
      "    - 'the second sentence' (Offset: (53, 72), Original: 'the second sentence')\n",
      "    - '.' (Offset: (72, 73), Original: '.')\n",
      "    - 'And' (Offset: (74, 77), Original: 'And')\n",
      "    - 'a third one' (Offset: (78, 89), Original: 'a third one')\n",
      "    - '!' (Offset: (89, 90), Original: '!')\n",
      "\n",
      "Multi-sentence Text: 'Go. This is complex. What?'\n",
      "\n",
      "  Spans (target_nltk_height=3):\n",
      "    - 'Go' (Offset: (0, 2), Original: 'Go')\n",
      "    - '.' (Offset: (2, 3), Original: '.')\n",
      "    - 'This' (Offset: (4, 8), Original: 'This')\n",
      "    - 'is' (Offset: (9, 11), Original: 'is')\n",
      "    - 'complex' (Offset: (12, 19), Original: 'complex')\n",
      "    - '.' (Offset: (19, 20), Original: '.')\n",
      "    - 'What ?' (Offset: (21, 26), Original: 'What?')\n"
     ]
    }
   ],
   "source": [
    "if __name__ == '__main__':\n",
    "    try:\n",
    "        nlp_pipeline = initialize_stanza() # Initialize the global pipeline\n",
    "\n",
    "        multi_sentence_text = \"The quick brown fox jumps over the lazy dog. This is the second sentence. And a third one!\"\n",
    "        \n",
    "        print(f\"Multi-sentence Text: '{multi_sentence_text}'\")\n",
    "        \n",
    "        # NLTK Height 2: Pre-terminals (token-level spans for each sentence)\n",
    "        result_h2_multi = sentence_tokenizer_recursive_multisentence(multi_sentence_text, target_nltk_height=2, nlp=nlp_pipeline)\n",
    "        print(f\"\\n  Spans (target_nltk_height=2):\")\n",
    "        for i, span_text in enumerate(result_h2_multi['input_ids']):\n",
    "            offset = result_h2_multi['offset_mapping'][i]\n",
    "            original_substring = multi_sentence_text[offset[0]:offset[1]]\n",
    "            print(f\"    - '{span_text}' (Offset: {offset}, Original: '{original_substring}')\")\n",
    "\n",
    "        # NLTK Height 3: \"Flat\" phrases for each sentence\n",
    "        result_h3_multi = sentence_tokenizer_recursive_multisentence(multi_sentence_text, target_nltk_height=3, nlp=nlp_pipeline)\n",
    "        print(f\"\\n  Spans (target_nltk_height=3):\")\n",
    "        for i, span_text in enumerate(result_h3_multi['input_ids']):\n",
    "            offset = result_h3_multi['offset_mapping'][i]\n",
    "            original_substring = multi_sentence_text[offset[0]:offset[1]]\n",
    "            print(f\"    - '{span_text}' (Offset: {offset}, Original: '{original_substring}')\")\n",
    "\n",
    "        # Example with a sentence that might have parsing issues or is very short\n",
    "        test_text_2 = \"Go. This is complex. What?\"\n",
    "        print(f\"\\nMulti-sentence Text: '{test_text_2}'\")\n",
    "        result_h3_test2 = sentence_tokenizer_recursive_multisentence(test_text_2, target_nltk_height=3, nlp=nlp_pipeline)\n",
    "        print(f\"\\n  Spans (target_nltk_height=3):\")\n",
    "        for i, span_text in enumerate(result_h3_test2['input_ids']):\n",
    "            offset = result_h3_test2['offset_mapping'][i]\n",
    "            original_substring = test_text_2[offset[0]:offset[1]]\n",
    "            print(f\"    - '{span_text}' (Offset: {offset}, Original: '{original_substring}')\")\n",
    "\n",
    "\n",
    "    except Exception as e:\n",
    "        print(f\"An error occurred in the main execution: {e}\")\n",
    "        # If stanza models are missing, you might see an error here.\n",
    "        # Make sure you have run:\n",
    "        # import stanza\n",
    "        # stanza.download('en') # or the language you need"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "5ec8de5d-594e-475b-8d3b-6b88645a05b4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Multi-sentence Text: ' How do you get someone out of your head? I'm completely lost. I want those feelings to go away, to leave me alone. What do I do?'\n",
      "Warning: Span token indices [4-4] out of bounds (total tokens in current sentence: 4). Span leaves: '.'. Skipping.\n",
      "\n",
      "  Spans (target_nltk_height=3):\n",
      "    - 'How' (Offset: (1, 4), Original: 'How')\n",
      "    - 'do' (Offset: (5, 7), Original: 'do')\n",
      "    - 'you' (Offset: (8, 11), Original: 'you')\n",
      "    - 'get' (Offset: (12, 15), Original: 'get')\n",
      "    - 'someone' (Offset: (16, 23), Original: 'someone')\n",
      "    - 'out' (Offset: (24, 27), Original: 'out')\n",
      "    - 'of' (Offset: (28, 30), Original: 'of')\n",
      "    - 'your head' (Offset: (31, 40), Original: 'your head')\n",
      "    - '?' (Offset: (40, 41), Original: '?')\n",
      "    - 'I' (Offset: (42, 45), Original: 'I'm')\n",
      "    - ''m' (Offset: (46, 56), Original: 'completely')\n",
      "    - 'completely lost' (Offset: (57, 62), Original: 'lost.')\n",
      "    - 'I' (Offset: (63, 64), Original: 'I')\n",
      "    - 'want' (Offset: (65, 69), Original: 'want')\n",
      "    - 'those feelings' (Offset: (70, 84), Original: 'those feelings')\n",
      "    - 'to' (Offset: (85, 87), Original: 'to')\n",
      "    - 'go' (Offset: (88, 90), Original: 'go')\n",
      "    - 'away' (Offset: (91, 95), Original: 'away')\n",
      "    - ',' (Offset: (95, 96), Original: ',')\n",
      "    - 'to' (Offset: (97, 99), Original: 'to')\n",
      "    - 'leave' (Offset: (100, 105), Original: 'leave')\n",
      "    - 'me' (Offset: (106, 108), Original: 'me')\n",
      "    - 'alone' (Offset: (109, 114), Original: 'alone')\n",
      "    - '.' (Offset: (114, 115), Original: '.')\n",
      "    - 'What' (Offset: (116, 120), Original: 'What')\n",
      "    - 'do' (Offset: (121, 123), Original: 'do')\n",
      "    - 'I' (Offset: (124, 125), Original: 'I')\n",
      "    - 'do' (Offset: (126, 128), Original: 'do')\n",
      "    - '?' (Offset: (128, 129), Original: '?')\n"
     ]
    }
   ],
   "source": [
    "test_text_2 = \" How do you get someone out of your head? I'm completely lost. I want those feelings to go away, to leave me alone. What do I do?\"\n",
    "print(f\"\\nMulti-sentence Text: '{test_text_2}'\")\n",
    "result_h3_test2 = sentence_tokenizer_recursive_multisentence(test_text_2, target_nltk_height=3, nlp=nlp_pipeline)\n",
    "print(f\"\\n  Spans (target_nltk_height=3):\")\n",
    "for i, span_text in enumerate(result_h3_test2['input_ids']):\n",
    "    offset = result_h3_test2['offset_mapping'][i]\n",
    "    original_substring = test_text_2[offset[0]:offset[1]]\n",
    "    print(f\"    - '{span_text}' (Offset: {offset}, Original: '{original_substring}')\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b6395b5f-6247-43c2-8975-903313dd7193",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
