Source code for flambe.tokenizer.subword

from typing import List

from flambe.tokenizer import Tokenizer
import fastBPE


class BPETokenizer(Tokenizer):
    """Implement a subword level tokenizer using byte pair encoding.

    Tokenization is done using fastBPE
    (https://github.com/glample/fastBPE) and requires a fastBPE
    codes file.

    """

    def __init__(self, codes_path: str) -> None:
        """Initialize the tokenizer.

        Parameters
        ----------
        codes_path : str
            Path to codes file created using
            fastBPE.

        """
        # The fastBPE object is built once and reused by every
        # call to `tokenize`.
        self.bpe = fastBPE.fastBPE(codes_path)

    def tokenize(self, example: str) -> List[str]:
        """Tokenize an input example.

        Parameters
        ----------
        example : str
            The input example, as a string

        Returns
        -------
        List[str]
            The output subword tokens, as a list of strings.
            Empty tokens are never returned, so an example that
            encodes to an empty string yields an empty list.

        """
        # `apply` is called with a one-element list; its first
        # result is the encoded form of `example`.
        encoded = self.bpe.apply([example])[0]
        # `str.split(" ")` returns [""] for an empty string and
        # inserts "" entries for leading, trailing or consecutive
        # spaces, so drop those instead of emitting empty tokens.
        return [token for token in encoded.split(" ") if token]