import ahocorasick
from typing import List, Set, Tuple
# Seed phrases ("grams") searched for by get_planning_token_text_indices.
# Every entry MUST be lowercase: matching is done against lowercased text, so
# an entry containing an uppercase character can never match.
seed_collection = {
    '## step', 'a good starting point is', 'a more direct approach is',
    'a more straightforward approach', 'a simpler approach is', 'alright',
    'alternatively', 'an error in the thought process', 'analyze the',
    'analyzing the given', 'and then find', 'another approach',
    'are looking for', 'based on the given', 'break down the problem',
    'break it down', 'break it down into manageable steps', 'but',
    'but wait', 'but why', 'but without more information',
    'can be rewritten as', 'can conclude that', 'can see that', 'check if',
    'consider the properties of', 'correct the approach',
    'define the variables', 'determine how many',
    'directly address the problem', 'finally need to', 'finally we need to',
    'find a simpler', 'find a way to', 'find out how many',
    'find the critical points', 'first need to', 'follow these steps',
    'for simplicity', 'from earlier we have', 'from the above',
    'given the complexity', 'given the complexity of',
    'given the constraints', 'given the nature of', 'go back to the',
    'goal is to', 'hmm,', 'however', 'however, we need to', 'identify the',
    'identify the given information', 'if we consider', 'in a way that',
    'in the context of', 'it seems', "it's better", 'let me',
    'let me verify', "let's", "let's double-check", 'looking back at the',
    'maybe', 'need to', 'need to account for', 'need to analyze',
    'need to check', 'need to consider', 'need to count',
    'need to determine', 'need to ensure', 'need to express',
    'need to find', 'need to follow', 'need to identify',
    'need to minimize', 'need to reconsider', 'need to show',
    'need to solve', 'need to think about', 'need to understand',
    'need to use', 'next', 'note that', 'now', 'now let', 'now need to',
    'now we need to', 'one way to', 'perhaps', 'problem is asking',
    'problem states that', 'proceed with the following',
    'rearrange the equation', 'recall that', 'referring to a previous step',
    'rewrite the equation', 'seems a bit complicated', 'should consider',
    'should focus on', 'should look for',
    'similarly', 'simplify the problem', 'since', 'so', 'so after',
    'so again', 'specifically', 'start by', 'step by step',
    'step by step reasoning', 'step by step solution',
    'step-by-step reasoning', 'that seems', 'the correct approach is',
    'the first step is', 'the key insight is', 'the key to solving this',
    'the logical flow is', 'the next step is', 'the path to the solution',
    'the problem asks for', 'the problem is about', 'the problem mentions',
    'the problem says', 'the problem states', 'the problem states that',
    'there is a mistake', 'there seems to be', 'therefore',
    'think of this as', 'this allows us to', 'this approach seems',
    'this can be seen as', 'this implies', 'this is because',
    'this is not the correct approach', 'this leads to', 'this means',
    'this means that', 'this seems a bit', 'this suggests',
    'this suggests that', 'thus', 'to confirm',
    'to consider the constraints', 'to determine', 'to do this',
    'to ensure', 'to ensure correctness', 'to find', 'to make it easier',
    'to proceed', 'to see if', 'to solve this problem', 'to verify',
    'understand the given information', 'understanding the problem',
    'understanding the problem first', 'upon closer inspection',
    'use the concept of', 'use the fact', 'use the fact that',
    'use the method of', 'use the properties of', 'verify the solution',
    'wait', 'want to find', 'we are dealing with', 'we can',
    'we can approach this', 'we can conclude', 'we can deduce',
    'we can infer', 'we can see', 'we can start by',
    'we can think of this as', 'we can use', 'we know',
    'what is being asked', 'will consider the', 'work our way',
    # Later additions.
    'wait, no',
    'okay',
    'perhaps i can',  # was 'perhaps I can'; uppercase could never match
    'so, yes',
    'wait, but',
    'says that',
    'states that',
    'maybe i can',
    "let's denote",
    'denote',
    'try to',
}

def _lower_keep_length(text: str) -> str:
    """Lowercase ``text`` while guaranteeing the result has the same length.

    ``str.lower()`` can expand a single character into several (for example
    U+0130), which would shift every later character index. Such characters
    are left unchanged so index ``i`` of the result always corresponds to
    index ``i`` of ``text``.
    """
    lowered = text.lower()
    if len(lowered) == len(text):
        # No character lowercases to an empty string, so equal length means
        # every character mapped one-to-one.
        return lowered
    return "".join(
        ch.lower() if len(ch.lower()) == 1 else ch for ch in text
    )


def get_planning_token_text_indices(
    texts: List[str]
) -> List[List[int]]:
    """
    Identifies the character indices within original texts that are covered
    by an occurrence of any seed gram in the module-level ``seed_collection``.

    Matching is case-insensitive and purely substring-based: no word-boundary
    check is applied, so a short seed also matches inside longer words.

    Args:
        texts: The original list of text strings.

    Returns:
        A list where each item corresponds to an original text. Each item is a
        sorted list of character indices (into the original text) covered by
        at least one matched gram.
    """
    # 1. Build the Aho-Corasick automaton for efficient multi-pattern matching.
    #    Seeds are lowercased the same way as the texts so that a seed with a
    #    stray uppercase character is still matchable. Only the gram length is
    #    needed later, so that is what gets stored as the payload.
    automaton = ahocorasick.Automaton()
    for gram in seed_collection:
        gram = _lower_keep_length(gram)
        if gram:  # Skip empty strings
            automaton.add_word(gram, len(gram))
    automaton.make_automaton()

    # 2. Process each text to find and collect character indices.
    all_results = []
    for text in texts:
        # A set de-duplicates indices covered by overlapping matches.
        covered = set()

        # The automaton yields (end_character_index, payload) per match, where
        # the end index is inclusive. The searched string has the same length
        # as `text`, so these indices are valid for the original text too.
        for end_idx, gram_len in automaton.iter(_lower_keep_length(text)):
            covered.update(range(end_idx - gram_len + 1, end_idx + 1))

        all_results.append(sorted(covered))

    return all_results

def get_highlight_token_positions(text, text_highlight, offsets, tokens=None):
    """Return the tokens whose character span contains a highlighted index.

    Args:
        text: Unused by this function; kept for interface compatibility.
        text_highlight: Highlighted character indices. Assumed to be sorted
            in ascending order (a single forward-moving pointer is used).
        offsets: One ``(start, end)`` pair per token, treated as the
            half-open character span ``[start, end)``. Assumed to be in
            ascending order.
        tokens: Optional per-token values to report, zipped with ``offsets``.
            Defaults to ``offsets`` itself.

    Returns:
        A list of ``[token_index, token]`` entries, one for each token whose
        span contains at least one highlighted index, in token order.
    """
    # Return early if there are no highlights to find.
    if not text_highlight:
        return []

    if tokens is None:
        tokens = offsets

    total = len(text_highlight)
    ptr = 0
    token_indices = []

    for idx, (token, offset) in enumerate(zip(tokens, offsets)):
        start, end = offset

        # Discard highlights that lie before this token's span (e.g. on
        # whitespace no token covers). Without this the pointer would stall
        # on such an index and no later token could ever be matched.
        while ptr < total and text_highlight[ptr] < start:
            ptr += 1

        # Consume every highlight inside this token's span; the token is
        # reported once no matter how many highlights it contains.
        first_unconsumed = ptr
        while ptr < total and start <= text_highlight[ptr] < end:
            ptr += 1

        if ptr > first_unconsumed:
            token_indices.append([idx, token])

        # All highlights accounted for: remaining tokens cannot match.
        if ptr >= total:
            break

    return token_indices