Module pastalib.utils.tokenizer_utils
Utils for interacting with huggingface tokenizers.
"""Utils for interacting with huggingface tokenizers."""
from contextlib import contextmanager
from typing import Any, Iterator, Optional, Sequence, Tuple
from .typing import StrSequence, Tokenizer, TokenizerOffsetMapping
def find_token_range(
    string: str,
    substring: str,
    tokenizer: Optional[Tokenizer] = None,
    occurrence: int = 0,
    offset_mapping: Optional[TokenizerOffsetMapping] = None,
    **kwargs: Any,
) -> Tuple[int, int]:
    """Find index range of tokenized string containing tokens for substring.

    The kwargs are forwarded to the tokenizer.

    A simple example:

        string = 'The batman is the night.'
        substring = 'batman'
        tokenizer = ...

        # Example tokenization: ['the', 'bat', '##man', 'is', 'the', 'night']
        assert find_token_range(string, substring, tokenizer) == (1, 3)

    Args:
        string: The string.
        substring: The substring to find token range for.
        tokenizer: The tokenizer. If not set, offset_mapping must be.
        occurrence: The occurrence of the substring to look for.
            Zero indexed. Defaults to 0, the first occurrence.
        offset_mapping: Precomputed offset mapping. If not set, tokenizer will be run.

    Raises:
        ValueError: If substring is not actually in string or if banned
            kwargs are specified.

    Returns:
        Tuple[int, int]: The start (inclusive) and end (exclusive) token idx.

    """
    if tokenizer is None and offset_mapping is None:
        raise ValueError("must set either tokenizer= or offset_mapping=")
    if "return_offsets_mapping" in kwargs:
        raise ValueError("cannot set return_offsets_mapping")
    if substring not in string:
        raise ValueError(f'"{substring}" not found in "{string}"')

    char_start = string.index(substring)
    for _ in range(occurrence):
        try:
            char_start = string.index(substring, char_start + 1)
        except ValueError as error:
            raise ValueError(
                f"could not find {occurrence + 1} occurrences "
                f'of "{substring}" in "{string}"'
            ) from error
    char_end = char_start + len(substring)

    if offset_mapping is None:
        assert tokenizer is not None
        tokens = tokenizer(string, return_offsets_mapping=True, **kwargs)
        offset_mapping = tokens.offset_mapping

    token_start, token_end = None, None
    for index, (token_char_start, token_char_end) in enumerate(offset_mapping):
        if token_start is None:
            if token_char_start <= char_start and token_char_end >= char_start:
                token_start = index
        if token_end is None:
            if token_char_start <= char_end and token_char_end >= char_end:
                token_end = index
                break

    assert token_start is not None
    assert token_end is not None
    assert token_start <= token_end
    return (token_start, token_end + 1)
def batch_convert_ids_to_tokens(
    batch: Sequence[Sequence[int]], tokenizer: Tokenizer, **kwargs: Any
) -> Sequence[StrSequence]:
    """Call `convert_ids_to_tokens` on every sequence in the batch."""
    return [tokenizer.convert_ids_to_tokens(ids, **kwargs) for ids in batch]
@contextmanager
def set_padding_side(
    tokenizer: Tokenizer, padding_side: str = "right"
) -> Iterator[None]:
    """Temporarily set padding side for tokenizer.

    Useful when batch-generating with causal LMs like GPT: generation requires
    the padding to be on the left side, but most other workflows are easier
    with the default right-side padding.

    Example usage:

        tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2")
        with tokenizer_utils.set_padding_side(tokenizer, "left"):
            inputs = tokenizer(...)
        # ... later
        model.generate(**inputs)

    """
    _padding_side = tokenizer.padding_side
    tokenizer.padding_side = padding_side
    try:
        yield
    finally:
        # Restore the original padding side even if the body raises.
        tokenizer.padding_side = _padding_side
Functions
def batch_convert_ids_to_tokens(batch: Sequence[Sequence[int]], tokenizer: transformers.tokenization_utils_fast.PreTrainedTokenizerFast, **kwargs: Any) -> Sequence[list[str] | tuple[str, ...]]

Call `convert_ids_to_tokens` on every sequence in the batch.
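A minimal usage sketch, assuming a Hugging Face fast tokenizer; "gpt2" and the input strings are only illustrative:

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    batch = tokenizer(["The batman is the night.", "Hello world"]).input_ids

    # One sequence of token strings per id sequence in the batch.
    for tokens in batch_convert_ids_to_tokens(batch, tokenizer):
        print(tokens)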
def find_token_range(string: str, substring: str, tokenizer: transformers.tokenization_utils_fast.PreTrainedTokenizerFast | None = None, occurrence: int = 0, offset_mapping: Optional[Sequence[tuple[int, int]]] = None, **kwargs: Any) -> Tuple[int, int]
Find index range of tokenized string containing tokens for substring.
The kwargs are forwarded to the tokenizer.
A simple example:
    string = 'The batman is the night.'
    substring = 'batman'
    tokenizer = ...

    # Example tokenization: ['the', 'bat', '##man', 'is', 'the', 'night']
    assert find_token_range(string, substring, tokenizer) == (1, 3)

Args
string: The string.
substring: The substring to find token range for.
tokenizer: The tokenizer. If not set, offset_mapping must be.
occurrence: The occurrence of the substring to look for. Zero indexed. Defaults to 0, the first occurrence.
offset_mapping: Precomputed offset mapping. If not set, tokenizer will be run.
Raises
ValueError: If substring is not actually in string or if banned kwargs are specified.
Returns
Tuple[int, int]: The start (inclusive) and end (exclusive) token idx.
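For concreteness, a runnable sketch of both call paths. It assumes the transformers library is available; "bert-base-uncased" is just an illustrative WordPiece tokenizer matching the docstring's example, and the exact indices depend on the tokenizer used:

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    string = "The batman is the night."

    # Path 1: let find_token_range run the tokenizer. Extra kwargs such as
    # add_special_tokens are forwarded to it; without special tokens this
    # tokenizes to ['the', 'bat', '##man', 'is', 'the', 'night', '.'].
    start, end = find_token_range(
        string, "batman", tokenizer=tokenizer, add_special_tokens=False
    )
    assert (start, end) == (1, 3)  # Holds for this particular tokenizer.

    # Path 2: reuse a precomputed offset mapping to avoid re-tokenizing.
    encoded = tokenizer(string, return_offsets_mapping=True, add_special_tokens=False)
    start, end = find_token_range(string, "night", offset_mapping=encoded.offset_mapping)
    assert (start, end) == (5, 6)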
def set_padding_side(tokenizer: transformers.tokenization_utils_fast.PreTrainedTokenizerFast, padding_side: str = 'right') -> Iterator[None]
Temporarily set padding side for tokenizer.
Useful when batch-generating with causal LMs like GPT: generation requires the padding to be on the left side, but most other workflows are easier with the default right-side padding.
Example usage:

    tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2")
    with tokenizer_utils.set_padding_side(tokenizer, "left"):
        inputs = tokenizer(...)
    # ... later
    model.generate(**inputs)
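A fuller batch-generation sketch under the same assumptions; "gpt2" and the prompts are illustrative, and since GPT-2 ships without a pad token, the eos token is reused here:

    import transformers

    model = transformers.AutoModelForCausalLM.from_pretrained("gpt2")
    tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2")
    tokenizer.pad_token = tokenizer.eos_token  # gpt2 defines no pad token

    prompts = ["The capital of France is", "The batman is"]
    with set_padding_side(tokenizer, "left"):
        # Left padding keeps each prompt's final token in the last position,
        # which is where generate() continues from.
        inputs = tokenizer(prompts, return_tensors="pt", padding=True)

    outputs = model.generate(
        **inputs, max_new_tokens=5, pad_token_id=tokenizer.pad_token_id
    )
    print(tokenizer.batch_decode(outputs, skip_special_tokens=True))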