Source code for flambe.field.bow

from typing import Dict, Optional
from collections import OrderedDict as odict

import torch

from flambe.field import Field
from flambe.tokenizer import Tokenizer, NGramsTokenizer


[docs]class BoWField(Field):
    """Featurize raw text inputs using bag of words (BoW)

    This class performs tokenization and numericalization.

    The pad, unk, when given, are assigned the first indices in the
    vocabulary, in that order. This means, that whenever a pad token
    is specified, it will always use the 0 index.

    Examples
    --------

    >>> f = BoWField(min_freq=2, normalize=True)
    >>> f.setup(['thank you', 'thank you very much', 'thanks a lot'])
    >>> f._vocab.keys()
    ['thank', you']

    Note that 'thank' and 'you' are the only ones that appear twice.

    >>> f.process("thank you really. You help was awesome")
    tensor([1, 2])

    """

    def __init__(self,  # nosec
                 tokenizer: Optional[Tokenizer] = None,
                 lower: bool = False,
                 unk_token: str = '<unk>',
                 min_freq: int = 5,
                 normalize: bool = False,
                 scale_factor: float = None) -> None:
        """Initialize the BoW object.

        Parameters
        ----------
        tokenizer : Tokenizer, optional
            Tokenizer to use, by default NGramsTokenizer()
        lower : bool, optional
            If given, lowercase the input, by default False
        unk_token : str, optional
            The token to use for out of vocabulary tokens
            (defaults to '<unk>')
        min_freq : int, optional
            Minimum frequency to include token in the vocabulary
            (defaults to 5)
        normalize : bool, optional
            Normalize or not the bag of words using L1 norm
            (defaults to False)
        scale_factor : float, optional
            Factor to scale the resulting normalized feature value.
            Only available when normalize is True (defaults to 1.0)

        """
        self.tokenizer = tokenizer or NGramsTokenizer()
        self.lower = lower
        self.unk = unk_token

        self.min_freq = min_freq
        self.normalize = normalize
        self.scale_factor = scale_factor

        self.vocab: Dict[str, int] = odict()
        self.vocab[unk_token] = 0
        self.full_vocab: Dict[str, int] = {}

        if scale_factor and not normalize:
            raise ValueError(f"Cannot specify scale_factor without normalizing")

        self.register_attrs('vocab', 'full_vocab')

    @property
[docs]    def vocab_size(self) -> int:
        """Get the vocabulary length.

        Returns
        -------
        int
            The length of the vocabulary

        """
        return len(self.vocab)

[docs]    def process(self, example):
        # Lowercase and tokenize
        example = example.lower() if self.lower else example
        tokens = self.tokenizer(example)

        # Numericalize
        numericals = [0] * len(self.vocab)
        for token in tokens:
            if token in self.vocab:
                numericals[self.vocab[token]] += 1
            else:
                if token not in self.full_vocab:
                    if self.unk is None or self.unk not in self.vocab:
                        raise ValueError("Encounterd out-of-vocabulary token \
                                          but the unk_token is either missing \
                                          or not defined in the vocabulary.")
                    else:
                        # Accumulate in numericals
                        numericals[self.vocab[self.unk]] += 1  # type: ignore

        processed = torch.tensor(numericals).float()
        if self.normalize:
            processed = torch.nn.functional.normalize(processed, dim=0, p=1)
        if self.scale_factor:
            processed = self.scale_factor * processed

        return processed

[docs]    def setup(self, *data) -> None:
        for dataset in data:
            for example in dataset:
                # Lowercase if requested
                example = example.lower() if self.lower else example
                # Tokenize and accumulate in vocabulary
                for token in self.tokenizer(example):
                    self.full_vocab[token] = self.full_vocab.get(token, 0) + 1

        # Filter only the once that have high frequency
        for k, v in self.full_vocab.items():
            if v >= self.min_freq:
                self.vocab.setdefault(k, len(self.vocab))