Source code for flambe.tokenizer.word


from typing import Union, List, Optional

import nltk
from nltk import ngrams
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from flambe.tokenizer import Tokenizer


class WordTokenizer(Tokenizer):
    """Implement a word level tokenizer that splits on whitespace."""

    def tokenize(self, example: str) -> List[str]:
        """Tokenize an input example.

        Parameters
        ----------
        example : str
            The input example, as a string

        Returns
        -------
        List[str]
            The output word tokens, as a list of strings

        """
        return example.split()
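
# A minimal usage sketch (hypothetical input; note that str.split leaves
# punctuation attached to the neighboring word):
#
#   >>> WordTokenizer().tokenize("machine learning, for everyone.")
#   ['machine', 'learning,', 'for', 'everyone.']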

class NLTKWordTokenizer(Tokenizer):
    """Implement a word level tokenizer using nltk.tokenize.word_tokenize."""

    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)
        # Fetch the Punkt models that word_tokenize relies on
        nltk.download('punkt', quiet=True)

    def tokenize(self, example: str) -> List[str]:
        """Tokenize an input example.

        Parameters
        ----------
        example : str
            The input example, as a string

        Returns
        -------
        List[str]
            The output word tokens, as a list of strings

        """
        return word_tokenize(example)
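
# Unlike WordTokenizer above, word_tokenize splits punctuation into
# separate tokens; a sketch with the same hypothetical input:
#
#   >>> NLTKWordTokenizer().tokenize("machine learning, for everyone.")
#   ['machine', 'learning', ',', 'for', 'everyone', '.']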

class NGramsTokenizer(Tokenizer):
    """Implement an n-gram tokenizer.

    Examples
    --------
    >>> NGramsTokenizer(ngrams=2).tokenize("hi how are you?")
    ['hi how', 'how are', 'are you', 'you ?']
    >>> NGramsTokenizer(ngrams=[1, 2]).tokenize("hi how are you?")
    ['hi', 'how', 'are', 'you', '?', 'hi how', 'how are', 'are you', 'you ?']

    Parameters
    ----------
    ngrams : Union[int, List[int]]
        An int or a list of ints. If a list of ints, all n-grams
        (one set for each int) are included in the output.
    exclude_stopwords : bool
        Whether to exclude stopwords. See the related parameter
        stop_words.
    stop_words : Optional[List]
        List of stop words to exclude when exclude_stopwords is True.
        If None, defaults to nltk.corpus.stopwords.

    """

    def __init__(self,
                 ngrams: Union[int, List[int]] = 1,
                 exclude_stopwords: bool = False,
                 stop_words: Optional[List] = None) -> None:
        """Initialize the NGramsTokenizer.

        Parameters
        ----------
        ngrams : Union[int, List[int]], optional
            The n-gram size(s) to generate, by default 1
        exclude_stopwords : bool
            Whether to exclude stopwords, by default False
        stop_words : Optional[List]
            The stopwords to exclude, by default None

        """
        self.ngrams = ngrams
        self.exclude_stopwords = exclude_stopwords

        if self.exclude_stopwords:
            self.stop_words = stop_words
            if self.stop_words is None:
                nltk.download('stopwords', quiet=True)
                self.stop_words = stopwords.words('english')

        nltk.download('punkt', quiet=True)

    @staticmethod
    def _tokenize(example: str, n: int) -> List[str]:
        """Tokenize an input example into space-joined n-grams."""
        return list(" ".join(x) if len(x) > 1 else x[0]
                    for x in ngrams(word_tokenize(example), n))

    def tokenize(self, example: str) -> List[str]:
        """Tokenize an input example.

        Parameters
        ----------
        example : str
            The input example, as a string.

        Returns
        -------
        List[str]
            The output n-gram tokens, as a list of strings

        """
        if self.exclude_stopwords and self.stop_words:
            example = ' '.join([word for word in word_tokenize(example)
                                if word not in self.stop_words])

        if isinstance(self.ngrams, list):
            ret: List[str] = []
            for i in self.ngrams:
                ret.extend(self._tokenize(example, i))
            return ret
        else:
            return NGramsTokenizer._tokenize(example, self.ngrams)
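
# A sketch of stopword exclusion (hypothetical input; the expected output
# assumes NLTK's standard English stopword list, which contains 'the'
# and 'on'):
#
#   >>> NGramsTokenizer(ngrams=2, exclude_stopwords=True).tokenize(
#   ...     "the cat sat on the mat")
#   ['cat sat', 'sat mat']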