Source code for flambe.tokenizer.word


from typing import Union, List

from nltk import ngrams

from flambe.tokenizer import Tokenizer


class WordTokenizer(Tokenizer):
    """A whitespace-based, word level tokenizer."""

    def tokenize(self, example: str) -> List[str]:
        """Tokenize an input example by splitting on whitespace.

        Parameters
        ----------
        example : str
            The input example, as a string

        Returns
        -------
        List[str]
            The output word tokens, as a list of strings

        """
        # str.split with no argument splits on any run of whitespace
        # and discards leading/trailing whitespace.
        words = example.split()
        return words
class NGramsTokenizer(Tokenizer):
    """Implement an n-gram tokenizer.

    Tokens are produced by whitespace-splitting the input and joining
    consecutive words with a single space.

    Examples
    --------
    >>> NGramsTokenizer(ngrams=2).tokenize("hi how are you?")
    ['hi how', 'how are', 'are you?']
    >>> NGramsTokenizer(ngrams=[1, 2]).tokenize("hi how are you?")
    ['hi', 'how', 'are', 'you?', 'hi how', 'how are', 'are you?']

    Parameters
    ----------
    ngrams : Union[int, List[int]]
        An int or a list of ints. If it's a list of ints, all n-grams
        (for each int) will be considered in the tokenizer.

    """

    def __init__(self, ngrams: Union[int, List[int]] = 1) -> None:
        """Initialize the tokenizer.

        Parameters
        ----------
        ngrams : Union[int, List[int]], optional
            The n-gram size(s) to produce, by default 1

        """
        self.ngrams = ngrams

    @staticmethod
    def _tokenize(example: str, n: int) -> List[str]:
        """Return the n-grams of the whitespace-split words of ``example``.

        Each n-gram is the n consecutive words joined by a single space;
        unigrams are returned as the bare word.
        """
        return [" ".join(gram) if len(gram) > 1 else gram[0]
                for gram in ngrams(example.split(), n)]

    def tokenize(self, example: str) -> List[str]:
        """Tokenize an input example.

        Parameters
        ----------
        example : str
            The input example, as a string.

        Returns
        -------
        List[str]
            The output n-gram tokens, as a list of strings. For a list of
            sizes, all n-grams of the first size come first, then the
            second, and so on.

        """
        # Check against the builtin `list`: isinstance with typing.List
        # is deprecated and not a reliable runtime check.
        if isinstance(self.ngrams, list):
            tokens: List[str] = []
            for n in self.ngrams:
                tokens.extend(self._tokenize(example, n))
            return tokens
        return self._tokenize(example, self.ngrams)