Skip to content

Tokenization

Tokenizer

A wrapper class for the BERT tokenizer from the Hugging Face Transformers library. Pass vocab_file to ensure that the intended vocabulary is used.

Parameters:

Name Type Description Default
checkpoint str

The name or path of the pre-trained BERT checkpoint to use.

'bert-base-uncased'
vocab_file str

The path to the custom vocabulary file to use (optional).

'config/vocab.txt'

Attributes:

Name Type Description
tokenizer BertTokenizer

The BERT tokenizer object.

Source code in notebooks/experiments/tokenization.py
class Tokenizer:
    r"""A thin wrapper around the BERT tokenizer from the Hugging Face Transformers library.

    The tokenizer is loaded from `checkpoint`, and `vocab_file` is forwarded to
    `BertTokenizer.from_pretrained` so that the intended vocabulary is used.

    Args:
        checkpoint (str): The name or path of the pre-trained BERT checkpoint to use.
        vocab_file (str): The path to the custom vocabulary file to use.
            Defaults to ``"config/vocab.txt"``.

    Attributes:
        tokenizer (BertTokenizer): The BERT tokenizer object.

    """

    def __init__(self, checkpoint: str = "bert-base-uncased", vocab_file: str = "config/vocab.txt") -> None:
        r"""Initializes the Tokenizer object with the specified checkpoint and vocabulary file.

        Args:
            checkpoint (str): The name or path of the pre-trained BERT checkpoint to use.
            vocab_file (str): The path to the custom vocabulary file to use. It is passed
                to `BertTokenizer.from_pretrained` as the `vocab_file` keyword argument.

        """
        self.tokenizer = BertTokenizer.from_pretrained(checkpoint, vocab_file=vocab_file)

    def __call__(self, text: Union[str, list[str]], add_special_tokens: bool = True) -> list[int]:
        r"""Encodes the input text into token IDs using the BERT tokenizer.

        Args:
            text (Union[str, list[str]]): The input to encode. It is handed unchanged to
                `BertTokenizer.encode`.
                NOTE(review): per the Transformers documentation a list of strings is
                treated as one already-tokenized sequence, not as a batch -- confirm
                against the installed version.
            add_special_tokens (bool): Whether to add special tokens to the tokenized text (optional).

        Returns:
            tokens (list[int]): A list of token IDs representing the tokenized text.

        """
        return self.tokenizer.encode(text, add_special_tokens=add_special_tokens)

    def decode(self, tokens: list[int], skip_special_tokens: bool = True) -> str:
        r"""Decodes a sequence of token IDs back into text.

        Args:
            tokens (list[int]): A list of token IDs to decode.
            skip_special_tokens (bool): Whether to remove special tokens from the decoded
                text (optional).

        Returns:
            text (str): The decoded text. This is whatever `BertTokenizer.decode` returns
                for the whole sequence: one string, not one string per token.

        """
        return self.tokenizer.decode(tokens, skip_special_tokens=skip_special_tokens)

__call__(text, add_special_tokens=True)

Tokenizes the input text using the BERT tokenizer.

Parameters:

Name Type Description Default
text Union[str, List[str]]

The input text to tokenize.

required
add_special_tokens bool

Whether to add special tokens to the tokenized text (optional).

True

Returns:

Name Type Description
tokens List[int]

A list of token IDs representing the tokenized text.

Source code in notebooks/experiments/tokenization.py
def __call__(self, text: Union[str, list[str]], add_special_tokens: bool = True) -> list[int]:
    r"""Encodes the input text into token IDs using the BERT tokenizer.

    Args:
        text (Union[str, list[str]]): The input to encode. It is handed unchanged to
            `self.tokenizer.encode`.
            NOTE(review): per the Transformers documentation a list of strings is
            treated as one already-tokenized sequence, not as a batch -- confirm
            against the installed version.
        add_special_tokens (bool): Whether to add special tokens to the tokenized text (optional).

    Returns:
        tokens (list[int]): A list of token IDs representing the tokenized text.

    """
    return self.tokenizer.encode(text, add_special_tokens=add_special_tokens)

__init__(checkpoint='bert-base-uncased', vocab_file='config/vocab.txt')

Initializes the Tokenizer object with the specified checkpoint and vocabulary file.

Parameters:

Name Type Description Default
checkpoint str

The name or path of the pre-trained BERT checkpoint to use.

'bert-base-uncased'
vocab_file str

The path to the custom vocabulary file to use (optional).

'config/vocab.txt'

Returns:

Type Description
None

None.

Source code in notebooks/experiments/tokenization.py
def __init__(
    self,
    checkpoint: str = "bert-base-uncased",
    vocab_file: str = "config/vocab.txt",
) -> None:
    r"""Builds the wrapped BERT tokenizer from a checkpoint and a vocabulary file.

    Args:
        checkpoint (str): The name or path of the pre-trained BERT checkpoint to use.
        vocab_file (str): The path to the custom vocabulary file to use. It is passed
            to `BertTokenizer.from_pretrained` as the `vocab_file` keyword argument.

    """
    bert_tokenizer = BertTokenizer.from_pretrained(checkpoint, vocab_file=vocab_file)
    # Stored on the instance under the name `tokenizer`.
    self.tokenizer = bert_tokenizer

decode(tokens, skip_special_tokens=True)

Decodes the input token IDs into a single string.

Parameters:

Name Type Description Default
tokens List[int]

A list of token IDs to decode.

required
skip_special_tokens bool

Whether to remove special tokens from the decoded text (optional).

True

Returns:

Name Type Description
text str

The decoded text as a single string.

Source code in notebooks/experiments/tokenization.py
def decode(self, tokens: list[int], skip_special_tokens: bool = True) -> str:
    r"""Decodes a sequence of token IDs back into text.

    Args:
        tokens (list[int]): A list of token IDs to decode.
        skip_special_tokens (bool): Whether to remove special tokens from the decoded
            text (optional).

    Returns:
        text (str): The decoded text. This is whatever `self.tokenizer.decode` returns
            for the whole sequence: one string, not one string per token.

    """
    return self.tokenizer.decode(tokens, skip_special_tokens=skip_special_tokens)