#!/usr/bin/env python
# coding: utf-8

import nltk
# Make sure the Punkt tokenizer models needed by nltk.word_tokenize are present.
# nltk.data.find() signals a missing resource with LookupError, so that is the
# exception that has to trigger the download.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')


# Define a simple context-free grammar
# S: Sentence, NP: Noun Phrase, VP: Verb Phrase, PP: Prepositional Phrase
# Det: Determiner, N: Noun, V: Verb, P: Preposition
#
# Terminals are matched case-sensitively and the example sentences in this
# script are lowercased before parsing, so the pronoun needs a lowercase 'i'
# alternative. The uppercase 'I' is kept for callers that parse tokens
# without lowercasing them.
grammar = nltk.CFG.fromstring("""
    S -> NP VP
    VP -> V NP | V NP PP
    PP -> P NP
    NP -> Det N | Det N PP | 'I' | 'i' | N
    Det -> 'the' | 'a' | 'my'
    N -> 'man' | 'dog' | 'cat' | 'park' | 'telescope' | 'saw'
    V -> 'saw' | 'walked' | 'ate'
    P -> 'in' | 'with' | 'on'
""")

# Create a parser (e.g., ChartParser or RecursiveDescentParser)
# ChartParser is generally more efficient for ambiguous grammars.
parser = nltk.ChartParser(grammar)
# You could also use:
# parser = nltk.RecursiveDescentParser(grammar)

# Sentence to parse
sentence_str = "the man saw a dog in the park"
# sentence_str = "I saw the man with a telescope"

# Tokenize the sentence. It is lowercased because the grammar's word
# terminals are lowercase.
tokens = nltk.word_tokenize(sentence_str.lower())

# Parse the sentence and print every derivation the grammar allows.
try:
    trees = list(parser.parse(tokens))
    for tree in trees:
        print("Constituency Parse Tree:")
        tree.pretty_print() # Print in a more readable tree format
        # To draw the tree (requires tkinter and ghostscript usually)
        # tree.draw()
    if not trees:
        # The parser yields nothing (rather than raising) when every word is
        # known to the grammar but no derivation exists; say so explicitly
        # instead of printing nothing at all.
        print("No parse found: the grammar cannot derive this sentence.")
except ValueError as e:
    print(f"Could not parse the sentence: {e}")
    print("This might happen if the grammar cannot derive the sentence or due to recursion depth with RecursiveDescentParser for complex sentences.")

print("\n--- Another Example ---")
# The grammar has no rule for words such as 'pizza'; a CFG built with
# nltk.CFG.fromstring is extended by redefining the grammar string and
# recreating the CFG object. To keep the demo simple, this example sticks to
# nouns the grammar already knows.
sentence_str_2 = "I saw the cat"
tokens_2 = nltk.word_tokenize(sentence_str_2.lower())

try:
    trees_2 = list(parser.parse(tokens_2))
    for tree in trees_2:
        print("Constituency Parse Tree:")
        tree.pretty_print()
    if not trees_2:
        # All words were covered but no derivation exists; report it rather
        # than silently printing nothing.
        print("No parse found: the grammar cannot derive this sentence.")
except ValueError as e:
    print(f"Could not parse the sentence: {e}")


import stanza
# Best-effort download of the English model package. A failure here is
# reported but not fatal: the pipeline construction further down is what
# ultimately decides whether usable models are available.
# `except Exception` (not a bare `except:`) so Ctrl-C / SystemExit still
# propagate, and the real error is shown instead of guessing at the cause.
try:
    stanza.download('en') # Download English model package
except Exception as e:
    print(f"Stanza model 'en' download attempt failed: {e}")


# Build the Stanza pipeline used by the demo below.
# The pipeline is created with the 'tokenize', 'pos' and 'constituency'
# processors. Two attempts are made:
#   1. with download_method=None
#      NOTE(review): the Stanza docs describe None as *disabling* automatic
#      downloads, so this attempt presumably only succeeds when the models
#      are already on disk - confirm against the installed Stanza version.
#   2. after an explicit stanza.download('en'), with the default download method.
# `nlp` stays None when both attempts fail.
nlp = None
try:
    nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,constituency', download_method=None)
except Exception as init_error:
    print(f"Error initializing Stanza pipeline. Ensure models are downloaded. {init_error}")
    print("Attempting to download 'en' model explicitly if not done.")
    try:
        stanza.download('en')
        nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,constituency')
    except Exception as retry_error:
        print(f"Failed to initialize Stanza even after attempting download: {retry_error}")


# Demo: parse one sentence with Stanza and show its constituency tree twice,
# first as Stanza prints it and then rendered through NLTK's Tree.
if not nlp:
    print("Stanza NLP pipeline could not be initialized.")
else:
    doc = nlp("The quick brown fox jumps over the lazy dog.")

    for i, sentence in enumerate(doc.sentences):
        print(f"--- Sentence {i+1} ---")
        print("Constituency Parse Tree (S-expression format):")
        # Printing the constituency object gives the bracketed tree.
        print(sentence.constituency)

        # Optional: re-read the bracketed string with NLTK for an ASCII-art view.
        try:
            from nltk.tree import Tree
            nltk_tree = Tree.fromstring(str(sentence.constituency))
            print("\nNLTK Tree Visualization:")
            nltk_tree.pretty_print()
            # nltk_tree.draw() would open a window (usually needs tkinter and ghostscript).
        except ImportError:
            print("\nNLTK Tree visualization skipped: NLTK not fully available or tree conversion issue.")
        except Exception as e_nltk:
            print(f"\nError during NLTK tree visualization: {e_nltk}")


import stanza
from nltk.tree import Tree
import logging

# Configure logging for Stanza (optional, to reduce verbosity during downloads/loading)
# logging.basicConfig(level=logging.INFO) # Set to INFO or WARNING to see less

# Global NLP pipeline to avoid re-initialization on every call
STANZA_NLP_PIPELINE = None

def initialize_stanza_pipeline():
    """
    Build the shared English constituency pipeline the first time it is needed.

    The pipeline is stored in the module-level STANZA_NLP_PIPELINE; later calls
    return immediately once that global is set. A failed model download is only
    printed, but an exception raised while constructing the pipeline itself
    propagates to the caller.
    """
    global STANZA_NLP_PIPELINE
    if STANZA_NLP_PIPELINE is not None:
        return  # Already built; nothing to do.

    print("Initializing Stanza pipeline for English constituency parsing...")
    try:
        # Best-effort model download; the 'stanford' value is passed through
        # to Stanza as the resources URL selector.
        stanza.download('en', verbose=False, resources_url='stanford')
    except Exception as download_error:
        # Not fatal here: constructing the pipeline below shows whether
        # usable models are actually available.
        print(f"Stanza download check/attempt info: {download_error}")

    pipeline_kwargs = {
        'lang': 'en',
        'processors': 'tokenize,pos,constituency',
        'verbose': False,
        # Ask Stanza to reuse models that are already on disk.
        'download_method': stanza.DownloadMethod.REUSE_RESOURCES,
    }
    STANZA_NLP_PIPELINE = stanza.Pipeline(**pipeline_kwargs)
    print("Stanza pipeline initialized.")

def _extract_spans_recursive(
    node,
    original_words,
    current_token_index_list,
    spans,
    offset_mappings,
    phrase_labels_to_capture
):
    """
    Walk a parse tree depth-first and emit one span per captured phrase or word.

    Args:
        node: Either a leaf string or a tree node offering label(), leaves()
            and iteration over its children (an NLTK Tree in this file).
        original_words: Word objects with text, start_char and end_char,
            in the same order as the tree's leaves.
        current_token_index_list: One-element list holding the index of the
            next unconsumed word; it is updated in place so every recursion
            level shares the same cursor.
        spans: Output list receiving the span texts.
        offset_mappings: Output list receiving (start_char, end_char) tuples,
            parallel to `spans`.
        phrase_labels_to_capture: Labels whose subtrees are emitted as a
            single span instead of being broken down further.

    Returns:
        None. Results are appended to `spans` and `offset_mappings`.
    """
    # Leaf: emit the word the cursor points at, if there is one.
    if isinstance(node, str):
        cursor = current_token_index_list[0]
        if cursor >= len(original_words):
            # The tree has more leaves than there are words to align with.
            logging.warning(f"Token index {cursor} out of bounds for original_words (len {len(original_words)}). Leaf: '{node}'")
            return
        word = original_words[cursor]
        spans.append(word.text)
        offset_mappings.append((word.start_char, word.end_char))
        current_token_index_list[0] += 1
        return

    if node.label() in phrase_labels_to_capture:
        leaves = node.leaves()
        if not leaves:
            # Empty phrase: nothing to emit and the cursor does not move.
            return

        first_idx = current_token_index_list[0]
        # Clamp so a phrase running past the word list still ends on a real word.
        last_idx = min(first_idx + len(leaves) - 1, len(original_words) - 1)

        if first_idx <= last_idx:
            spans.append(" ".join(leaves))
            offset_mappings.append(
                (original_words[first_idx].start_char, original_words[last_idx].end_char)
            )
            # Skip the cursor past every word covered by this phrase.
            current_token_index_list[0] = last_idx + 1
            return

        # The cursor is already past the last word: no span can be formed,
        # so fall through and visit the children individually.
        logging.warning(f"Could not form span for node '{node.label()}' due to token index mismatch. Processing children instead as fallback.")

    # Reached for labels that are not captured and for the fallback above.
    for child in node:
        _extract_spans_recursive(child, original_words, current_token_index_list, spans, offset_mappings, phrase_labels_to_capture)


def sentence_tokenizer(sentence_str: str, phrase_labels_to_capture: list = None) -> dict:
    """
    Parses a sentence into semantic units (spans) using Stanza's constituency parser.
    The "level" of grouping is controlled by `phrase_labels_to_capture`.

    Only the first sentence Stanza detects in the input is processed.

    Args:
        sentence_str: The input sentence string.
        phrase_labels_to_capture: A list of tree node labels (e.g., 'NP', 'PP', 'S').
                                  Phrases with these labels will be captured as single spans.
                                  If None, the default set
                                  {'NP', 'PP', 'ADJP', 'ADVP', 'SBAR', '.'} is used, which
                                  leaves out 'S' and 'VP' to favour finer-grained units.

    Returns:
        A dictionary with two keys:
        - 'input_ids': A list of strings, where each string is a semantic span.
        - 'offset_mapping': A list of tuples (start_char, end_char) indicating
                            the character position of each span in the original
                            `sentence_str` (leading whitespace included).
    """
    initialize_stanza_pipeline() # Ensure the Stanza pipeline is ready

    if STANZA_NLP_PIPELINE is None: # Guard against pipeline initialization failure
        logging.error("Stanza pipeline could not be initialized. Returning empty.")
        return {'input_ids': [], 'offset_mapping': []}

    if phrase_labels_to_capture is None:
        phrase_labels_to_capture_set = {'NP', 'PP', 'ADJP', 'ADVP', 'SBAR', '.'}
    else:
        phrase_labels_to_capture_set = set(phrase_labels_to_capture) # Use a set for efficient lookups

    # Stanza is given the stripped text, so the offsets it reports are relative
    # to that stripped text. Remember how much leading whitespace was removed
    # so results can be shifted back onto the caller's original string.
    stripped_sentence = sentence_str.strip()
    leading_offset = len(sentence_str) - len(sentence_str.lstrip())

    def _shift(position):
        # Offsets that are missing (None) are passed through untouched.
        return position if position is None else position + leading_offset

    def _result(span_texts, span_offsets):
        return {
            'input_ids': span_texts,
            'offset_mapping': [(_shift(start), _shift(end)) for start, end in span_offsets],
        }

    def _word_level_result(words):
        # Fallback used whenever no usable tree is available: one span per word.
        return _result(
            [word.text for word in words],
            [(word.start_char, word.end_char) for word in words],
        )

    doc = STANZA_NLP_PIPELINE(stripped_sentence)
    if not doc.sentences:
        return _result([], []) # Handle empty or unparsable input

    stz_sentence = doc.sentences[0] # Process the first sentence

    # Convert Stanza's constituency string to an NLTK Tree object.
    # Bound before the try so the error message below can always format it.
    constituency_str = ""
    try:
        constituency_str = str(stz_sentence.constituency)
        # NOTE(review): the "XX" test also matches any sentence whose text
        # contains "XX"; kept as-is because the intended failure marker is
        # not visible from this file.
        if not constituency_str or constituency_str.strip() == "()" or "XX" in constituency_str:
            logging.warning(f"Empty or trivial constituency tree for: '{sentence_str}'. Defaulting to word-level spans.")
            return _word_level_result(stz_sentence.words)
        root_nltk_tree = Tree.fromstring(constituency_str)
    except Exception as e:
        logging.error(f"Error parsing Stanza constituency tree string: {e}. Tree string was: '{constituency_str}'. Defaulting to word-level spans.")
        return _word_level_result(stz_sentence.words)

    original_words = stz_sentence.words # List of Stanza Word objects
    if not original_words:
        return _result([], []) # No words, no spans

    spans_result = []
    offset_mappings_result = []
    current_token_index_list = [0] # Index for original_words, passed as a list for mutability

    # When the tree is wrapped in a single-child ROOT node, start from that child.
    effective_tree_root = root_nltk_tree
    if root_nltk_tree.label() == 'ROOT' and len(root_nltk_tree) == 1:
        effective_tree_root = root_nltk_tree[0]

    _extract_spans_recursive(
        effective_tree_root,
        original_words,
        current_token_index_list,
        spans_result,
        offset_mappings_result,
        phrase_labels_to_capture_set
    )

    # Safety net: any words the traversal did not consume are appended as
    # individual spans so the output always covers the whole sentence.
    final_token_idx_consumed = current_token_index_list[0]
    if final_token_idx_consumed < len(original_words):
        logging.warning(
            f"Not all tokens consumed by tree traversal. Consumed: {final_token_idx_consumed}, Total: {len(original_words)}. "
            f"Appending remaining {len(original_words) - final_token_idx_consumed} token(s) individually."
        )
        for word_obj in original_words[final_token_idx_consumed:]:
            spans_result.append(word_obj.text)
            offset_mappings_result.append((word_obj.start_char, word_obj.end_char))

    return _result(spans_result, offset_mappings_result)



# Initialize pipeline (can take a moment on first run or if models need downloading)
# You might want to call initialize_stanza_pipeline() once at the start of your application.
# initialize_stanza_pipeline() # Called implicitly by sentence_tokenizer if STANZA_NLP_PIPELINE is None

sentence1 = "The quick brown fox jumps over the lazy dog."
print(f"\nProcessing sentence: '{sentence1}'")

# Print the complete constituency tree for the sentence so its structure
# (in particular the label used for punctuation) can be inspected.
if STANZA_NLP_PIPELINE is None:
    # Build the shared pipeline on demand so the inspection below can run.
    initialize_stanza_pipeline()
if STANZA_NLP_PIPELINE:
    doc_inspect = STANZA_NLP_PIPELINE(sentence1)
    if doc_inspect.sentences:
        print("\nFull Constituency Tree for Inspection:")
        stz_sent_inspect = doc_inspect.sentences[0]
        inspect_tree = Tree.fromstring(str(stz_sent_inspect.constituency))
        inspect_tree.pretty_print()
        # For this sentence the final node is typically (. .),
        # i.e. the punctuation node carries the label '.'.


# It's good practice to initialize the Stanza pipeline once
# and pass it to the function if you're calling it multiple times.
NLP_PIPELINE = None
# (lang, processors) that the cached NLP_PIPELINE was built with; None while
# no pipeline is cached.
_NLP_PIPELINE_CONFIG = None


def initialize_stanza(lang='en', processors='tokenize,pos,constituency'):
    """
    Return a cached Stanza pipeline, (re)building it when the request changes.

    The requested configuration is tracked in the module-level
    _NLP_PIPELINE_CONFIG rather than read back from the pipeline object:
    stanza.Pipeline does not document a `processors_str` attribute, so
    comparing against it would fail with AttributeError on the second call.

    Args:
        lang: Language code passed to stanza.download and stanza.Pipeline.
        processors: Processor specification passed to both calls.

    Returns:
        The initialized pipeline, also stored in the global NLP_PIPELINE.

    Raises:
        Exception: Whatever stanza.download or stanza.Pipeline raised; the
            cache is cleared before the exception is re-raised.
    """
    global NLP_PIPELINE, _NLP_PIPELINE_CONFIG
    requested_config = (lang, processors)
    if NLP_PIPELINE is None or _NLP_PIPELINE_CONFIG != requested_config:
        print(f"Initializing Stanza pipeline for {lang} with {processors}...")
        try:
            stanza.download(lang=lang, processors=processors, verbose=False)
            NLP_PIPELINE = stanza.Pipeline(lang=lang, processors=processors, verbose=False, download_method=None)
        except Exception as e:
            print(f"Error initializing Stanza pipeline: {e}")
            # Leave no half-initialized state behind.
            NLP_PIPELINE = None
            _NLP_PIPELINE_CONFIG = None
            raise
        _NLP_PIPELINE_CONFIG = requested_config
        print("Stanza pipeline initialized.")
    return NLP_PIPELINE

def sentence_tokenizer(sentence_text: str, target_nltk_height: int, nlp: stanza.Pipeline) -> dict:
    """
    Split the first sentence of a text into spans chosen by parse-tree height.

    The sentence is parsed with `nlp`, its constituency tree is re-read as an
    NLTK Tree, and the tree is walked top-down. A subtree becomes one span as
    soon as its NLTK height is at most `target_nltk_height` (and at least 2);
    taller subtrees are broken down into their children.

    Args:
        sentence_text: The input text. Only the first sentence Stanza detects
            is processed.
        target_nltk_height: NLTK height of the subtrees to emit; must be an
            integer >= 2. Height 2 is a pre-terminal such as (DT The), so 2
            generally yields token-level spans; height 3 is a phrase whose
            children are all pre-terminals, e.g. (NP (DT a) (NN dog)).
        nlp: An initialized Stanza Pipeline object.

    Returns:
        A dictionary with:
        - 'input_ids': list of span strings (leaves joined by single spaces).
        - 'offset_mapping': list of (start_char, end_char) tuples, one per
          span, taken from the first and last Stanza token of the span.

    Raises:
        ValueError: If `nlp` is falsy or `target_nltk_height` is not an
            integer >= 2.
    """
    if not sentence_text.strip():
        return {'input_ids': [], 'offset_mapping': []}
    if not nlp:
        raise ValueError("Stanza pipeline (nlp) not initialized or provided.")
    height_is_valid = isinstance(target_nltk_height, int) and target_nltk_height >= 2
    if not height_is_valid:
        raise ValueError("target_nltk_height must be an integer >= 2.")

    doc = nlp(sentence_text)
    if not doc.sentences:
        return {'input_ids': [], 'offset_mapping': []}

    # Only the first sentence is handled by this function.
    stz_sentence = doc.sentences[0]

    def token_level_result():
        # Fallback: one span per Stanza token, used when no tree is usable.
        fallback_tokens = stz_sentence.tokens
        return {
            'input_ids': [tok.text for tok in fallback_tokens],
            'offset_mapping': [(tok.start_char, tok.end_char) for tok in fallback_tokens],
        }

    if not stz_sentence.constituency:
        print("Warning: No constituency tree found in Stanza sentence. Falling back to tokenization.")
        return token_level_result()

    try:
        tree_text = str(stz_sentence.constituency)
        # Anything that is not a bracketed expression cannot be read as a tree.
        if not tree_text.startswith("("):
            raise ValueError("Constituency output does not look like a tree.")
        nltk_tree = Tree.fromstring(tree_text)
    except ValueError as parse_error:
        print(f"Warning: Could not parse constituency tree string: '{stz_sentence.constituency}'. Error: {parse_error}. Falling back to tokenization.")
        return token_level_result()

    stz_tokens = stz_sentence.tokens
    token_total = len(stz_tokens)
    span_texts = []
    span_offsets = []

    def record(text, first_idx, last_idx):
        # Store one span together with the character range of its tokens.
        span_texts.append(text)
        span_offsets.append(
            (stz_tokens[first_idx].start_char, stz_tokens[last_idx].end_char)
        )

    def walk(subtree, leaf_cursor):
        """
        Collect spans below `subtree` and return the advanced leaf cursor.

        `leaf_cursor` is the index in stz_tokens that the first leaf of
        `subtree` lines up with.
        """
        if not isinstance(subtree, Tree):
            # Bare leaves are handled by the parent loop below.
            return leaf_cursor

        height = subtree.height()

        if height < 2:
            # Degenerate node (no string children): skip past its leaves.
            return leaf_cursor + len(subtree.leaves())

        if height <= target_nltk_height:
            # At or below the requested height: emit the whole subtree.
            leaves = subtree.leaves()
            if not leaves:
                return leaf_cursor

            next_cursor = leaf_cursor + len(leaves)
            last_idx = next_cursor - 1
            fits = (
                0 <= leaf_cursor < token_total
                and 0 <= last_idx < token_total
                and leaf_cursor <= last_idx
            )
            joined = " ".join(leaves)
            if fits:
                record(joined, leaf_cursor, last_idx)
            else:
                print(f"Warning: Span token indices [{leaf_cursor}-{last_idx}] "
                      f"out of bounds (total tokens: {token_total}). Span leaves: '{joined}'. Skipping.")
            # The cursor moves past the leaves whether or not a span was stored.
            return next_cursor

        # Taller than the target: break the node down child by child.
        for child in subtree:
            if isinstance(child, Tree):
                leaf_cursor = walk(child, leaf_cursor)
                continue
            # A string sitting directly under a tall node: emit it on its own.
            if 0 <= leaf_cursor < token_total:
                record(child, leaf_cursor, leaf_cursor)
            else:
                print(f"Warning: Leaf index {leaf_cursor} out of bounds for single token '{child}'.")
            leaf_cursor += 1
        return leaf_cursor

    if isinstance(nltk_tree, Tree):
        walk(nltk_tree, 0)

    return {'input_ids': span_texts, 'offset_mapping': span_offsets}


# It's good practice to initialize the Stanza pipeline once
# and pass it to the function if you're calling it multiple times.
NLP_PIPELINE = None
# (lang, processors) that the cached NLP_PIPELINE was built with; None while
# no pipeline is cached.
_NLP_PIPELINE_CONFIG = None


def initialize_stanza(lang='en', processors='tokenize,pos,constituency'):
    """
    Return a cached Stanza pipeline, (re)building it when the request changes.

    The requested configuration is remembered in the module-level
    _NLP_PIPELINE_CONFIG instead of being read from the pipeline object,
    because stanza.Pipeline does not document a `processors_str` attribute
    and relying on it would raise AttributeError once a pipeline is cached.

    Args:
        lang: Language code passed to stanza.download and stanza.Pipeline.
        processors: Processor specification passed to both calls.

    Returns:
        The initialized pipeline, also stored in the global NLP_PIPELINE.

    Raises:
        Exception: Whatever stanza.download or stanza.Pipeline raised; the
            cache is cleared before the exception is re-raised.
    """
    global NLP_PIPELINE, _NLP_PIPELINE_CONFIG
    wanted = (lang, processors)
    if NLP_PIPELINE is None or _NLP_PIPELINE_CONFIG != wanted:
        print(f"Initializing Stanza pipeline for {lang} with {processors}...")
        try:
            stanza.download(lang=lang, processors=processors, verbose=False) # Ensure models are downloaded
            NLP_PIPELINE = stanza.Pipeline(lang=lang, processors=processors, verbose=False, download_method=None)
        except Exception as e:
            print(f"Error initializing Stanza pipeline: {e}")
            # Reset both globals so a later call retries from scratch.
            NLP_PIPELINE = None
            _NLP_PIPELINE_CONFIG = None
            raise
        _NLP_PIPELINE_CONFIG = wanted
        print("Stanza pipeline initialized.")
    return NLP_PIPELINE

def sentence_tokenizer_recursive_multisentence(sentence_text: str, target_nltk_height: int, nlp: stanza.Pipeline) -> dict:
    """
    Split a (possibly multi-sentence) text into constituency-based spans.

    The text is run through ``nlp``; each resulting sentence's constituency
    parse is converted to a ``Tree`` and walked top-down. Any subtree whose
    height is between 2 and ``target_nltk_height`` (inclusive) is emitted as
    one span; taller subtrees are descended into. Sentences without a usable
    constituency parse are emitted token by token instead.

    Args:
        sentence_text: Input text; may contain several sentences.
        target_nltk_height: Tree height at which subtrees become spans.
            - A pre-terminal such as (DT The) has height 2, so a value of 2
              generally yields token-level spans.
            - A phrase whose children are all pre-terminals, e.g.
              (NP (DT a) (NN dog)), has height 3, so a value of 3 groups the
              words of such "flat" phrases.
            - Must be an integer >= 2.
        nlp: An initialized Stanza Pipeline object.

    Returns:
        A dictionary with:
        - 'input_ids': list of span strings (leaves joined by single spaces),
          in document order across all sentences.
        - 'offset_mapping': list of (start_char, end_char) tuples, one per
          span, taken from the first and last token of the span
          (end_char exclusive, relative to sentence_text).
        Both lists are empty when the text is blank or yields no sentences.

    Raises:
        ValueError: If ``nlp`` is falsy or ``target_nltk_height`` is not an
            integer >= 2 (checked only after the blank-text shortcut).
    """
    if not sentence_text.strip():
        return {'input_ids': [], 'offset_mapping': []}
    if not nlp:
        raise ValueError("Stanza pipeline (nlp) not initialized or provided.")
    if not isinstance(target_nltk_height, int) or target_nltk_height < 2:
        raise ValueError("target_nltk_height must be an integer >= 2.")

    doc = nlp(sentence_text)
    if not doc.sentences:
        return {'input_ids': [], 'offset_mapping': []}

    def _make_span(text, first_token, last_token):
        # One span record: its text plus the character range it covers.
        return {
            'text': text,
            'start_char': first_token.start_char,
            'end_char': last_token.end_char
        }

    def _token_spans(tokens):
        # Fallback representation: every token becomes its own span.
        return [_make_span(tok.text, tok, tok) for tok in tokens]

    def _collect(node, leaf_pos, tokens, sink):
        # Walk `node`, appending span records to `sink`.
        # `leaf_pos` is the index (within `tokens`) of the node's first leaf;
        # the return value is the index just past the node's last leaf.
        if not isinstance(node, Tree):
            return leaf_pos

        height = node.height()

        if height < 2:
            # Too small to be a span; just keep the leaf counter consistent.
            return leaf_pos + len(node.leaves())

        if height > target_nltk_height:
            # Too tall: look for spans among the children.
            for child in node:
                if isinstance(child, Tree):
                    leaf_pos = _collect(child, leaf_pos, tokens, sink)
                    continue
                # A bare string child is a single leaf sitting directly under a tall node.
                if 0 <= leaf_pos < len(tokens):
                    sink.append(_make_span(child, tokens[leaf_pos], tokens[leaf_pos]))
                else:
                    print(f"Warning: Leaf index {leaf_pos} out of bounds for single token '{child}' in current sentence.")
                leaf_pos += 1
            return leaf_pos

        # Here 2 <= height <= target_nltk_height: the whole subtree is one span.
        leaves = node.leaves()
        leaf_count = len(leaves)
        if leaf_count == 0:
            return leaf_pos

        first = leaf_pos
        last = leaf_pos + leaf_count - 1
        next_pos = leaf_pos + leaf_count
        joined_leaves = " ".join(leaves)

        if not (0 <= first <= last < len(tokens)):
            print(f"Warning: Span token indices [{first}-{last}] "
                  f"out of bounds (total tokens in current sentence: {len(tokens)}). "
                  f"Span leaves: '{joined_leaves}'. Skipping.")
            return next_pos

        sink.append(_make_span(joined_leaves, tokens[first], tokens[last]))
        return next_pos

    def _sentence_spans(sentence):
        # Produce the span records for a single parsed sentence.
        tokens = sentence.tokens

        if not sentence.constituency:
            print(f"Warning: No constituency tree found for sentence: '{sentence.text}'. "
                  "Falling back to tokenization for this sentence.")
            return _token_spans(tokens)

        if not str(sentence.constituency).startswith("("):
            print(f"Warning: Constituency output for sentence '{sentence.text}' "
                  f"does not look like a tree: '{sentence.constituency}'. "
                  "Falling back to tokenization for this sentence.")
            return _token_spans(tokens)

        try:
            parsed_tree = Tree.fromstring(str(sentence.constituency))
        except ValueError as e:
            print(f"Warning: Could not parse constituency tree string for sentence: '{sentence.text}'. "
                  f"Error: {e}. Falling back to tokenization for this sentence.")
            return _token_spans(tokens)

        found = []
        if isinstance(parsed_tree, Tree):
            # Leaf indices restart at 0 for every sentence.
            _collect(parsed_tree, 0, tokens, found)
        return found

    # Gather span records sentence by sentence, preserving document order.
    all_spans = []
    for sentence in doc.sentences:
        all_spans.extend(_sentence_spans(sentence))

    return {
        'input_ids': [span['text'] for span in all_spans],
        'offset_mapping': [(span['start_char'], span['end_char']) for span in all_spans],
    }


# In[16]:


if __name__ == '__main__':
    # Demo: tokenize two sample texts into constituency spans and print each
    # span next to the slice of the original text its offsets point at.
    try:
        nlp_pipeline = initialize_stanza()  # Builds (or reuses) the module-level pipeline

        multi_sentence_text = "The quick brown fox jumps over the lazy dog. This is the second sentence. And a third one!"

        print(f"Multi-sentence Text: '{multi_sentence_text}'")

        # Height 2 targets pre-terminals, i.e. roughly one span per token.
        result_h2_multi = sentence_tokenizer_recursive_multisentence(
            multi_sentence_text, target_nltk_height=2, nlp=nlp_pipeline
        )
        print("\n  Spans (target_nltk_height=2):")
        for span_text, offset in zip(result_h2_multi['input_ids'], result_h2_multi['offset_mapping']):
            start, end = offset
            original_substring = multi_sentence_text[start:end]
            print(f"    - '{span_text}' (Offset: {offset}, Original: '{original_substring}')")

        # Height 3 targets "flat" phrases made only of pre-terminals.
        result_h3_multi = sentence_tokenizer_recursive_multisentence(
            multi_sentence_text, target_nltk_height=3, nlp=nlp_pipeline
        )
        print("\n  Spans (target_nltk_height=3):")
        for span_text, offset in zip(result_h3_multi['input_ids'], result_h3_multi['offset_mapping']):
            start, end = offset
            original_substring = multi_sentence_text[start:end]
            print(f"    - '{span_text}' (Offset: {offset}, Original: '{original_substring}')")

        # Very short sentences, which might have parsing issues.
        test_text_2 = "Go. This is complex. What?"
        print(f"\nMulti-sentence Text: '{test_text_2}'")
        result_h3_test2 = sentence_tokenizer_recursive_multisentence(
            test_text_2, target_nltk_height=3, nlp=nlp_pipeline
        )
        print("\n  Spans (target_nltk_height=3):")
        for span_text, offset in zip(result_h3_test2['input_ids'], result_h3_test2['offset_mapping']):
            start, end = offset
            original_substring = test_text_2[start:end]
            print(f"    - '{span_text}' (Offset: {offset}, Original: '{original_substring}')")

    except Exception as e:
        print(f"An error occurred in the main execution: {e}")
        # A failure here may mean the Stanza models are missing.
        # In that case, make sure you have run:
        # import stanza
        # stanza.download('en') # or the language you need
