from typing import Optional, Dict
from collections import OrderedDict as odict
import torch
import numpy as np
import gensim.downloader as api
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import temporary_file
from flambe.field import Field
from flambe.tokenizer import Tokenizer, WordTokenizer


class TextField(Field):
"""Featurize raw text inputs
This class performs tokenization and numericalization, as well as
decorating the input sequences with optional start and end tokens.
When a vocabulary is passed during initialiazation, it is used to
map the the words to indices. However, the vocabulary can also be
generated from input data, through the `setup` method. Once
a vocabulary has been built, this object can also be used to load
external pretrained embeddings.
The pad, unk, sos and eos tokens, when given, are assigned the
first indices in the vocabulary, in that order. This means, that
whenever a pad token is specified, it will always use the 0 index.
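
    Example
    -------
    A minimal usage sketch, assuming the default WordTokenizer splits
    on whitespace; the ids reflect the default '<pad>' and '<unk>'
    specials occupying indices 0 and 1:

    >>> import numpy as np
    >>> field = TextField(lower=True)
    >>> field.setup(np.array(['hello world', 'hello there']))
    >>> field.process('Hello world')
    tensor([2, 3])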
"""
def __init__(self, # nosec
tokenizer: Optional[Tokenizer] = None,
lower: bool = False,
pad_token: Optional[str] = '<pad>',
unk_token: Optional[str] = '<unk>',
sos_token: Optional[str] = None,
eos_token: Optional[str] = None,
embeddings: Optional[str] = None,
embeddings_format: str = 'glove',
embeddings_binary: bool = False,
unk_init_all: bool = False) -> None:
"""Initialize the TextField.
Parameters
----------
tokenizer : Tokenizer, optional
Tokenizer to use, by default WordTokenizer()
        lower : bool, optional
            If True, lowercase the input, by default False
pad_token : str, optional
Reserved padding token. Note that this object does not
perform padding. Padding is done on the fly, when sampling.
(defaults to '<pad>')
unk_token : str, optional
The token to use for out of vocabulary tokens
(defaults to '<unk>')
        sos_token : str, optional
            Start of sentence token to add to the start of each
            sequence, by default None
        eos_token : str, optional
            End of sentence token to add to the end of each
            sequence, by default None
        embeddings : str, optional
            Path to pretrained embeddings, by default None
embeddings_format : str, optional
The format of the input embeddings, should be one of:
'glove', 'word2vec', 'fasttext' or 'gensim'. The latter can
be used to download embeddings hosted on gensim on the fly.
See https://github.com/RaRe-Technologies/gensim-data
for the list of available embedding aliases.
embeddings_binary : bool, optional
Whether the input embeddings are provided in binary format,
by default False
        unk_init_all : bool, optional
            If True, every token not found in the input embeddings is
            given its own random embedding, drawn from a normal
            distribution. Otherwise, all out-of-vocabulary tokens are
            mapped to the '<unk>' token's index.
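
        Example
        -------
        A sketch of loading embeddings hosted on gensim on the fly;
        'glove-wiki-gigaword-100' is one of the aliases listed in the
        gensim-data repository linked above:

        >>> field = TextField(embeddings='glove-wiki-gigaword-100',
        ...                   embeddings_format='gensim')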
"""
self.tokenizer = tokenizer or WordTokenizer()
self.lower = lower
self.pad = pad_token
self.unk = unk_token
self.sos = sos_token
self.eos = eos_token
self.embeddings = embeddings
self.embeddings_format = embeddings_format
self.embeddings_binary = embeddings_binary
self.embedding_matrix: Optional[torch.Tensor] = None
self.unk_init_all = unk_init_all
self.vocab: Dict = odict()
specials = [pad_token, unk_token, sos_token, eos_token]
self.specials = [special for special in specials if special is not None]
        # Assign the first indices, in order, to the special tokens
        for index, token in enumerate(self.specials):
            self.vocab[token] = index
self.register_attrs('vocab')

    @property
    def vocab_size(self) -> int:
"""Get the vocabulary length.
Returns
-------
int
The length of the vocabulary
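
        Example
        -------
        With the default specials, a freshly created field only
        contains the pad and unk ids:

        >>> TextField().vocab_size
        2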
"""
        unique_ids = set(self.vocab.values())
        return len(unique_ids)

    def setup(self, *data: np.ndarray) -> None:
"""Build the vocabulary and sets embeddings.
Parameters
----------
data : Iterable[str]
List of input strings.
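
        Example
        -------
        A minimal sketch without pretrained embeddings; each new token
        is appended to the end of the vocabulary:

        >>> import numpy as np
        >>> field = TextField()
        >>> field.setup(np.array(['hello world']))
        >>> field.vocab['hello']
        2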
"""
if self.embeddings is not None:
# Load embedding model
embeddings_matrix = []
            if self.embeddings_format == 'glove':
                # Convert the GloVe file to word2vec format before loading
                with temporary_file('temp.txt') as temp:
                    glove2word2vec(self.embeddings, temp)
                    model = KeyedVectors.load_word2vec_format(temp,
                                                              binary=self.embeddings_binary)
elif self.embeddings_format == 'word2vec':
model = KeyedVectors.load_word2vec_format(self.embeddings,
binary=self.embeddings_binary)
elif self.embeddings_format == 'fasttext':
model = KeyedVectors.load_fasttext_format(self.embeddings,
binary=self.embeddings_binary)
elif self.embeddings_format == 'gensim':
model = api.load(self.embeddings)
            else:
                raise ValueError("Supported formats are glove, word2vec, fasttext and gensim")
# Add embeddings for special tokens
for special in self.specials:
if special in model:
embeddings_matrix.append(torch.tensor(model[special]))
else:
embeddings_matrix.append(torch.randn(model.vector_size))
        # Iterate over all examples, skipping over datasets that are None
        examples = (e for dataset in data if dataset is not None for e in dataset)
        # Get the current last id (the specials occupy the first indices)
        index = len(self.vocab) - 1
for example in examples:
# Lowercase if requested
example = example.lower() if self.lower else example
# Tokenize and add to vocabulary
for token in self.tokenizer(example):
if token not in self.vocab:
if self.embeddings is not None:
if token in model:
self.vocab[token] = index = index + 1
embeddings_matrix.append(torch.tensor(model[token]))
                        elif self.unk_init_all:
                            # Give every OOV token its own random embedding
                            self.vocab[token] = index = index + 1
                            embeddings_matrix.append(torch.randn(model.vector_size))
                        else:
                            # Collapse all OOVs to the same token id
                            self.vocab[token] = self.vocab[self.unk]
else:
self.vocab[token] = index = index + 1
if self.embeddings is not None:
self.embedding_matrix = torch.stack(embeddings_matrix)

    # TODO update when we add generics
    def process(self, example: str) -> torch.Tensor:  # type: ignore
"""Process an example, and create a Tensor.
Parameters
----------
example: str
The example to process, as a single string
Returns
-------
torch.Tensor
The processed example, tokenized and numericalized
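
        Example
        -------
        A short sketch with sos and eos decoration; the ids follow the
        special token ordering described in the class docstring:

        >>> import numpy as np
        >>> field = TextField(sos_token='<sos>', eos_token='<eos>')
        >>> field.setup(np.array(['hi']))
        >>> field.process('hi')
        tensor([2, 4, 3])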
"""
# Lowercase and tokenize
example = example.lower() if self.lower else example
tokens = self.tokenizer(example)
# Add extra tokens
if self.sos is not None:
tokens = [self.sos] + list(tokens)
if self.eos is not None:
tokens = list(tokens) + [self.eos]
# Numericalize
numericals = []
for token in tokens:
            if token not in self.vocab:
                if self.unk is None or self.unk not in self.vocab:
                    raise ValueError("Encountered an out-of-vocabulary token "
                                     "but the unk_token is either missing "
                                     "or not defined in the vocabulary.")
                else:
                    token = self.unk
numerical = self.vocab[token] # type: ignore
numericals.append(numerical)
return torch.tensor(numericals).long()