Source code for flambe.field.text

from typing import Optional, Dict
from collections import OrderedDict as odict

import torch
import numpy as np
import gensim.downloader as api
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import temporary_file

from flambe.field import Field
from flambe.tokenizer import Tokenizer, WordTokenizer


[docs]class TextField(Field):
    """Featurize raw text inputs

    This class performs tokenization and numericalization, as well as
    decorating the input sequences with optional start and end tokens.

    When a vocabulary is passed during initialization, it is used to
    map the words to indices. However, the vocabulary can also be
    generated from input data, through the `setup` method. Once a
    vocabulary has been built, this object can also be used to load
    external pretrained embeddings.

    The pad, unk, sos and eos tokens, when given, are assigned the
    first indices in the vocabulary, in that order. This means that
    whenever a pad token is specified, it will always use the 0 index.

    """

    def __init__(self,  # nosec
                 tokenizer: Optional[Tokenizer] = None,
                 lower: bool = False,
                 pad_token: Optional[str] = '<pad>',
                 unk_token: Optional[str] = '<unk>',
                 sos_token: Optional[str] = None,
                 eos_token: Optional[str] = None,
                 embeddings: Optional[str] = None,
                 embeddings_format: str = 'glove',
                 embeddings_binary: bool = False,
                 unk_init_all: bool = False) -> None:
        """Initialize the TextField.

        Parameters
        ----------
        tokenizer : Tokenizer, optional
            Tokenizer to use, by default WordTokenizer()
        lower : bool, optional
            If given, lowercase the input, by default False
        pad_token : str, optional
            Reserved padding token. Note that this object does not
            perform padding. Padding is done on the fly, when sampling.
            (defaults to '<pad>')
        unk_token : str, optional
            The token to use for out of vocabulary tokens
            (defaults to '<unk>')
        sos_token : str, optional
            Start of sentence token to add to the start of each
            sequence (defaults to None)
        eos_token : str, optional
            End of sentence token to add to the end of each sequence
            (defaults to None)
        embeddings : Optional[str], optional
            Path to pretrained embeddings, by default None
        embeddings_format : str, optional
            The format of the input embeddings, should be one of:
            'glove', 'word2vec', 'fasttext' or 'gensim'. The latter can
            be used to download embeddings hosted on gensim on the fly.
            See https://github.com/RaRe-Technologies/gensim-data
            for the list of available embedding aliases.
        embeddings_binary : bool, optional
            Whether the input embeddings are provided in binary format,
            by default False
        unk_init_all : bool, optional
            If True, every token not provided in the input embeddings
            is given a random embedding from a normal distribution.
            Otherwise, all of them map to the '<unk>' token.

        """
        self.tokenizer = tokenizer or WordTokenizer()
        self.lower = lower

        self.pad = pad_token
        self.unk = unk_token
        self.sos = sos_token
        self.eos = eos_token

        self.embeddings = embeddings
        self.embeddings_format = embeddings_format
        self.embeddings_binary = embeddings_binary
        self.embedding_matrix: Optional[torch.Tensor] = None
        self.unk_init_all = unk_init_all

        self.vocab: Dict = odict()
        specials = [pad_token, unk_token, sos_token, eos_token]
        self.specials = [special for special in specials if special is not None]

        index = -1
        for token in self.specials:
            self.vocab[token] = index = index + 1

        self.register_attrs('vocab')
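    # Illustrative usage sketch (not part of the original module). Assuming
    # `TextField` is importable from `flambe.field`, the special tokens given
    # at construction take the first vocabulary indices, in order:
    #
    #     >>> field = TextField(sos_token='<sos>', eos_token='<eos>')
    #     >>> list(field.vocab.items())
    #     [('<pad>', 0), ('<unk>', 1), ('<sos>', 2), ('<eos>', 3)]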
    @property
[docs]    def vocab_size(self) -> int:
        """Get the vocabulary length.

        Returns
        -------
        int
            The length of the vocabulary

        """
        unique_ids = set(v for k, v in self.vocab.items())
        return len(unique_ids)
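    # Illustrative sketch (not part of the original module): vocab_size counts
    # unique indices rather than vocabulary entries, so tokens collapsed onto
    # the '<unk>' index (see `setup` below) are only counted once:
    #
    #     >>> field = TextField()
    #     >>> field.vocab['oov-word'] = field.vocab['<unk>']  # hypothetical collapsed entry
    #     >>> len(field.vocab), field.vocab_size
    #     (3, 2)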
[docs]    def setup(self, *data: np.ndarray) -> None:
        """Build the vocabulary and set the embeddings.

        Parameters
        ----------
        data : Iterable[str]
            List of input strings.

        """
        if self.embeddings is not None:
            # Load the pretrained embedding model
            embeddings_matrix = []
            if self.embeddings_format == 'glove':
                with temporary_file('temp.txt') as temp:
                    glove2word2vec(self.embeddings, temp)
                    model = KeyedVectors.load_word2vec_format(temp,
                                                              binary=self.embeddings_binary)
            elif self.embeddings_format == 'word2vec':
                model = KeyedVectors.load_word2vec_format(self.embeddings,
                                                          binary=self.embeddings_binary)
            elif self.embeddings_format == 'fasttext':
                model = KeyedVectors.load_fasttext_format(self.embeddings,
                                                          binary=self.embeddings_binary)
            elif self.embeddings_format == 'gensim':
                model = api.load(self.embeddings)
            else:
                raise ValueError("The only supported formats are glove, "
                                 "word2vec, fasttext and gensim")

            # Add embeddings for special tokens
            for special in self.specials:
                if special in model:
                    embeddings_matrix.append(torch.tensor(model[special]))
                else:
                    embeddings_matrix.append(torch.randn(model.vector_size))

        # Iterate over all examples
        examples = (e for dataset in data for e in dataset if dataset is not None)

        # Get the current last id
        index = len(self.vocab) - 1

        for example in examples:
            # Lowercase if requested
            example = example.lower() if self.lower else example

            # Tokenize and add to the vocabulary
            for token in self.tokenizer(example):
                if token not in self.vocab:
                    if self.embeddings is not None:
                        if token in model:
                            self.vocab[token] = index = index + 1
                            embeddings_matrix.append(torch.tensor(model[token]))
                        elif self.unk_init_all:
                            # Give every OOV token its own random embedding
                            self.vocab[token] = index = index + 1
                            embeddings_matrix.append(torch.randn(model.vector_size))
                        else:
                            # Collapse all OOV tokens to the same token id
                            self.vocab[token] = self.vocab[self.unk]
                    else:
                        self.vocab[token] = index = index + 1

        if self.embeddings is not None:
            self.embedding_matrix = torch.stack(embeddings_matrix)
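    # Illustrative sketch (not part of the original module): building the
    # vocabulary from raw strings, without pretrained embeddings. The input
    # strings below are made up:
    #
    #     >>> import numpy as np
    #     >>> field = TextField(lower=True)
    #     >>> field.setup(np.array(['Hello world', 'hello there']))
    #     >>> field.vocab_size  # '<pad>', '<unk>', 'hello', 'world', 'there'
    #     5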
    # TODO update when we add generics
[docs]    def process(self, example: str) -> torch.Tensor:  # type: ignore
        """Process an example, and create a Tensor.

        Parameters
        ----------
        example: str
            The example to process, as a single string

        Returns
        -------
        torch.Tensor
            The processed example, tokenized and numericalized

        """
        # Lowercase and tokenize
        example = example.lower() if self.lower else example
        tokens = self.tokenizer(example)

        # Add extra tokens
        if self.sos is not None:
            tokens = [self.sos] + list(tokens)
        if self.eos is not None:
            tokens = list(tokens) + [self.eos]

        # Numericalize
        numericals = []
        for token in tokens:
            if token not in self.vocab:
                if self.unk is None or self.unk not in self.vocab:
                    raise ValueError("Encountered an out-of-vocabulary token "
                                     "but the unk_token is either missing "
                                     "or not defined in the vocabulary.")
                else:
                    token = self.unk

            numerical = self.vocab[token]  # type: ignore
            numericals.append(numerical)

        return torch.tensor(numericals).long()
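    # Illustrative sketch (not part of the original module): numericalizing a
    # new example after `setup`, continuing the hypothetical field built above.
    # Out-of-vocabulary tokens fall back to the '<unk>' index:
    #
    #     >>> field.process('Hello unseen world')
    #     tensor([2, 1, 3])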