Source code for flambe.field.bow

from typing import Dict, Optional
from collections import OrderedDict as odict

import torch

from flambe.field import Field
from flambe.tokenizer import Tokenizer, NGramsTokenizer


class BoWField(Field):
    """Featurize raw text inputs using bag of words (BoW).

    This class performs tokenization and numericalization.

    The unk token, when given, is assigned the first index (0) in the
    vocabulary.

    Examples
    --------
    >>> f = BoWField(min_freq=2, normalize=True)
    >>> f.setup(['thank you', 'thank you very much', 'thanks a lot'])
    >>> f.vocab.keys()
    ['<unk>', 'thank', 'you']

    Note that 'thank' and 'you' are the only tokens that appear at least
    twice.

    >>> f.process("thank you really. Your help was awesome")
    tensor([1, 2])

    """

    def __init__(self,  # nosec
                 tokenizer: Optional[Tokenizer] = None,
                 lower: bool = False,
                 unk_token: str = '<unk>',
                 min_freq: int = 5,
                 normalize: bool = False,
                 scale_factor: Optional[float] = None) -> None:
        """Initialize the BoW object.

        Parameters
        ----------
        tokenizer : Tokenizer, optional
            Tokenizer to use, by default NGramsTokenizer()
        lower : bool, optional
            If given, lowercase the input, by default False
        unk_token : str, optional
            The token to use for out-of-vocabulary tokens
            (defaults to '<unk>')
        min_freq : int, optional
            Minimum frequency to include a token in the vocabulary
            (defaults to 5)
        normalize : bool, optional
            Whether to normalize the bag of words using the L1 norm
            (defaults to False)
        scale_factor : float, optional
            Factor to scale the resulting normalized feature value.
            Only available when normalize is True (defaults to 1.0)

        """
        self.tokenizer = tokenizer or NGramsTokenizer()
        self.lower = lower
        self.unk = unk_token
        self.min_freq = min_freq
        self.normalize = normalize
        self.scale_factor = scale_factor

        self.vocab: Dict[str, int] = odict()
        self.vocab[unk_token] = 0

        self.full_vocab: Dict[str, int] = {}

        if scale_factor and not normalize:
            raise ValueError("Cannot specify scale_factor without normalizing")

        self.register_attrs('vocab', 'full_vocab')

    @property
    def vocab_size(self) -> int:
        """Get the vocabulary length.

        Returns
        -------
        int
            The length of the vocabulary

        """
        return len(self.vocab)
    def process(self, example):
        # Lowercase and tokenize
        example = example.lower() if self.lower else example
        tokens = self.tokenizer(example)

        # Numericalize
        numericals = [0] * len(self.vocab)
        for token in tokens:
            if token in self.vocab:
                numericals[self.vocab[token]] += 1
            else:
                if token not in self.full_vocab:
                    if self.unk is None or self.unk not in self.vocab:
                        raise ValueError("Encountered an out-of-vocabulary token "
                                         "but the unk_token is either missing "
                                         "or not defined in the vocabulary.")
                    else:
                        # Accumulate in the unk token's count
                        numericals[self.vocab[self.unk]] += 1  # type: ignore

        processed = torch.tensor(numericals).float()

        if self.normalize:
            processed = torch.nn.functional.normalize(processed, dim=0, p=1)

        if self.scale_factor:
            processed = self.scale_factor * processed

        return processed
    def setup(self, *data) -> None:
        for dataset in data:
            for example in dataset:
                # Lowercase if requested
                example = example.lower() if self.lower else example
                # Tokenize and accumulate in the full vocabulary
                for token in self.tokenizer(example):
                    self.full_vocab[token] = self.full_vocab.get(token, 0) + 1

        # Keep only the tokens that meet the minimum frequency
        for k, v in self.full_vocab.items():
            if v >= self.min_freq:
                self.vocab.setdefault(k, len(self.vocab))
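
Below is a minimal usage sketch of BoWField, not part of the module source above. It assumes the class is importable from flambe.field.bow (per the module path in the title); the sample sentences and variable names are made up for illustration.

# Usage sketch (assumes flambe is installed; sample data is hypothetical).
from flambe.field.bow import BoWField

# Keep tokens seen at least twice, lowercase input, L1-normalize the counts.
field = BoWField(lower=True, min_freq=2, normalize=True)

# Build the vocabulary from one or more datasets (iterables of raw strings).
train_text = ['thank you', 'thank you very much', 'thanks a lot']
field.setup(train_text)

# Featurize a new example: a float tensor of length field.vocab_size, where
# each position holds the (L1-normalized) count of a vocabulary token.
features = field.process('thank you so much')
print(field.vocab_size, features)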