Source code for flambe.nlp.transformers.bert

"""
Integration of the pytorch_transformers bert module
"""

from typing import Tuple, Dict, Any, Optional, Union

import torch
from torch import Tensor

from flambe.compile import registrable_factory
from flambe.field import TextField
from flambe.nn import Module

import pytorch_transformers as pt


class BERTTextField(TextField, pt.BertTokenizer):
    """Perform WordPiece tokenization.

    Inspired by: https://github.com/huggingface/pytorch-pretrained-BERT/
    blob/master/pytorch_pretrained_bert/tokenization.py.

    Note that this object requires a pretrained vocabulary.

    """

    def __init__(self,  # nosec
                 vocab_file: str,
                 sos_token: str = '[CLS]',
                 eos_token: str = '[SEP]',
                 do_lower_case: bool = False,
                 max_len_truncate: int = 100,
                 **kwargs) -> None:
        """Initialize the BERTTextField.

        Parameters
        ----------
        vocab_file : str
            Where to load the vocabulary from
        sos_token : str, optional
            The start-of-sequence token to prepend to each example
        eos_token : str, optional
            The end-of-sequence token to append to each example
        do_lower_case : bool, optional
            Set to lowercase the input data
        max_len_truncate : int, optional
            The maximum length of a sequence
        never_split : tuple, optional
            Tokens that won't be passed to the WordPieceTokenizer

        """
        pt.BertTokenizer.__init__(self, vocab_file, do_lower_case=do_lower_case)

        self._vocab = self.vocab
        self.sos = sos_token
        self.eos = eos_token
        self.lower = do_lower_case
        self.tokenizer = self.tokenize
        self.max_len_truncate = max_len_truncate

    @registrable_factory
    @classmethod
    def from_alias(cls,
                   path: str = 'bert-base-cased',
                   cache_dir: Optional[str] = None,
                   do_lower_case: bool = False,
                   max_len_truncate: int = 100,
                   **kwargs) -> 'BERTTextField':
        """Initialize from a pretrained tokenizer.

        Parameters
        ----------
        path: str
            Path to a pretrained model, or one of the following string
            aliases currently available:
            . `bert-base-uncased`
            . `bert-large-uncased`
            . `bert-base-cased`
            . `bert-large-cased`
            . `bert-base-multilingual-uncased`
            . `bert-base-multilingual-cased`
            . `bert-base-chinese`

        """
        if 'uncased' in path and not do_lower_case:
            raise ValueError("Using uncased model but do_lower_case is False.")

        field = super().from_pretrained(path, cache_dir=cache_dir, **kwargs)
        field.basic_tokenizer.do_lower_case = do_lower_case
        field.max_len_truncate = max_len_truncate
        return field

    def process(self, example: str) -> torch.Tensor:  # type: ignore
        """Process an example, and create a Tensor.

        Parameters
        ----------
        example: str
            The example to process, as a single string

        Returns
        -------
        torch.Tensor
            The processed example, tokenized and numericalized

        """
        tokens = self.tokenize(example)
        tokens = tokens[:self.max_len_truncate]

        # Add extra tokens
        if self.sos is not None:
            tokens = [self.sos] + list(tokens)
        if self.eos is not None:
            tokens = list(tokens) + [self.eos]

        numericals = self.convert_tokens_to_ids(tokens)
        return torch.tensor(numericals)

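# Illustrative usage sketch, not part of the original module: shows how a
# BERTTextField built from the `bert-base-cased` alias tokenizes and
# numericalizes a single string. The helper name and the sample sentence are
# hypothetical, and downloading the pretrained vocabulary via `from_alias`
# is assumed to succeed.
def _example_bert_text_field() -> torch.Tensor:
    """Tokenize and numericalize a sentence with a pretrained vocabulary."""
    field = BERTTextField.from_alias('bert-base-cased', max_len_truncate=32)
    # `process` truncates, prepends [CLS], appends [SEP], and maps tokens to ids
    return field.process("Flambe makes training pipelines easy.")

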
class BERTEmbeddings(Module, pt.modeling_bert.BertPreTrainedModel):
    """Integrate the pytorch_transformers BERT word embedding model.

    This module can be used as any normal encoder, or it can be
    loaded with the official pretrained BERT models. Simply use the
    `from_pretrained` class method when initializing the model.

    Currently available:

    . `bert-base-uncased`
    . `bert-large-uncased`
    . `bert-base-cased`
    . `bert-large-cased`
    . `bert-base-multilingual-uncased`
    . `bert-base-multilingual-cased`
    . `bert-base-chinese`

    """

    def __init__(self,
                 input_size_or_config: Union[int, pt.BertConfig],
                 embedding_size: int = 768,
                 embedding_dropout: float = 0.1,
                 embedding_freeze: bool = False,
                 pad_index: int = 0,
                 max_position_embeddings: int = 512,
                 type_vocab_size: int = 2,
                 **kwargs) -> None:
        """Initialize the BERTEmbeddings.

        Parameters
        ----------
        input_size_or_config: Union[int, pt.BertConfig]
            The vocabulary size of `input_ids`, or a full BertConfig
        embedding_size: int, optional
            The size of the embedding vectors
        embedding_dropout: float, optional
            The dropout probability applied to the embeddings
        embedding_freeze: bool, optional
            If True, the embedding weights are not updated during training
        pad_index: int, optional
            The index of the padding token, used to build the padding mask
        max_position_embeddings: int, optional
            The maximum sequence length that this model might
            ever be used with. Typically set this to something
            large just in case (e.g., 512 or 1024 or 2048).
        type_vocab_size: int, optional
            The vocabulary size of the `token_type_ids`

        """
        Module.__init__(self)

        if isinstance(input_size_or_config, int):
            tmp_config: Dict[str, Any] = {}
            tmp_config['vocab_size_or_config_json_file'] = input_size_or_config
            tmp_config['hidden_size'] = embedding_size
            tmp_config['hidden_dropout_prob'] = embedding_dropout
            tmp_config['max_position_embeddings'] = max_position_embeddings
            tmp_config['type_vocab_size'] = type_vocab_size
            config: pt.BertConfig = pt.BertConfig(**tmp_config)
        else:
            config = input_size_or_config

        self.config = config
        self.pad_index = pad_index
        self.embeddings = pt.modeling_bert.BertEmbeddings(config)
        self.apply(self.init_bert_weights)

        if embedding_freeze:
            for param in self.parameters():
                param.requires_grad = False

    @registrable_factory
    @classmethod
    def from_alias(cls,
                   path: str = 'bert-base-cased',
                   cache_dir: Optional[str] = None,
                   **kwargs) -> 'BERTEmbeddings':
        """Initialize from a pretrained model.

        Parameters
        ----------
        path: str
            Path to a pretrained model, or one of the following string
            aliases currently available:
            . `bert-base-uncased`
            . `bert-large-uncased`
            . `bert-base-cased`
            . `bert-large-cased`
            . `bert-base-multilingual-uncased`
            . `bert-base-multilingual-cased`
            . `bert-base-chinese`

        """
        return super().from_pretrained(path, cache_dir=cache_dir, **kwargs)

    def forward(self, data: Tensor) -> Tuple[Tensor, Optional[Tensor]]:
        """Performs a forward pass through the network.

        Parameters
        ----------
        data : torch.Tensor
            The input data, as a long tensor of token ids, batch first

        Returns
        -------
        torch.Tensor
            The encoded output, as a float tensor, batch first
        torch.Tensor, optional
            The padding mask if a pad index was given

        """
        mask = None
        if self.pad_index is not None:
            mask = (data != self.pad_index).float()

        embedded = self.embeddings(data)
        return embedded, mask

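# Illustrative usage sketch, not part of the original module: embeds a padded
# batch of token ids with pretrained BERT embeddings and returns the padding
# mask alongside the embedded sequence. The helper name, the dummy token ids,
# and the shape comments are assumptions for illustration only.
def _example_bert_embeddings() -> Tuple[Tensor, Optional[Tensor]]:
    """Embed a small batch of token ids with pretrained BERT embeddings."""
    embedder = BERTEmbeddings.from_alias('bert-base-cased')
    data = torch.tensor([[101, 7592, 102, 0, 0]])  # dummy ids, 0 = padding
    # embedded: (batch, seq_len, 768); mask: 1.0 for real tokens, 0.0 for pads
    embedded, mask = embedder(data)
    return embedded, mask

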
class BERTEncoder(Module, pt.modeling_bert.BertPreTrainedModel):
    """Integrate the pytorch_transformers BERT encoder model.

    This module can be used as any normal encoder, or it can be
    loaded with the official pretrained BERT models. Simply use the
    `from_pretrained` class method when initializing the model.

    Currently available:

    . `bert-base-uncased`
    . `bert-large-uncased`
    . `bert-base-cased`
    . `bert-large-cased`
    . `bert-base-multilingual-uncased`
    . `bert-base-multilingual-cased`
    . `bert-base-chinese`

    """

    def __init__(self,
                 input_size_or_config: Union[int, pt.modeling_bert.BertConfig],
                 hidden_size: int = 768,
                 num_hidden_layers: int = 12,
                 num_attention_heads: int = 12,
                 intermediate_size: int = 3072,
                 hidden_act: str = "gelu",
                 hidden_dropout_prob: float = 0.1,
                 attention_probs_dropout_prob: float = 0.1,
                 max_position_embeddings: int = 512,
                 type_vocab_size: int = 2,
                 initializer_range: float = 0.02,
                 pool_last: bool = False,
                 **kwargs) -> None:
        """Initialize the BERTEncoder.

        Parameters
        ----------
        input_size_or_config: Union[int, pt.BertConfig]
            The vocabulary size of `input_ids`, or a full BertConfig
        hidden_size: int, optional
            Size of the encoder layers and the pooler layer.
        num_hidden_layers: int, optional
            Number of hidden layers in the Transformer encoder.
        num_attention_heads: int, optional
            Number of attention heads for each attention layer in
            the Transformer encoder.
        intermediate_size: int, optional
            The size of the "intermediate" (i.e., feed-forward)
            layer in the Transformer encoder.
        hidden_act: str, optional
            The non-linear activation function (function or string)
            in the encoder and pooler. If string, "gelu", "relu"
            and "swish" are supported.
        hidden_dropout_prob: float, optional
            The dropout probability for all fully connected layers
            in the embeddings, encoder, and pooler.
        attention_probs_dropout_prob: float, optional
            The dropout ratio for the attention probabilities.
        max_position_embeddings: int, optional
            The maximum sequence length that this model might
            ever be used with. Typically set this to something
            large just in case (e.g., 512 or 1024 or 2048).
        type_vocab_size: int, optional
            The vocabulary size of the `token_type_ids`
        initializer_range: float, optional
            The stddev of the truncated_normal_initializer for
            initializing all weight matrices.
        pool_last: bool, optional
            If True, return the pooled output instead of the full
            sequence of hidden states.

        """
        Module.__init__(self)

        if isinstance(input_size_or_config, int):
            tmp_config: Dict[str, Any] = {}
            tmp_config['vocab_size_or_config_json_file'] = input_size_or_config
            tmp_config['hidden_size'] = hidden_size
            tmp_config['num_hidden_layers'] = num_hidden_layers
            tmp_config['num_attention_heads'] = num_attention_heads
            tmp_config['hidden_act'] = hidden_act
            tmp_config['intermediate_size'] = intermediate_size
            tmp_config['hidden_dropout_prob'] = hidden_dropout_prob
            tmp_config['attention_probs_dropout_prob'] = attention_probs_dropout_prob
            tmp_config['max_position_embeddings'] = max_position_embeddings
            tmp_config['type_vocab_size'] = type_vocab_size
            tmp_config['initializer_range'] = initializer_range
            config: pt.BertConfig = pt.BertConfig(**tmp_config)
        else:
            config = input_size_or_config

        self.config = config
        self.encoder = pt.modeling_bert.BertEncoder(config)
        self.pooler = pt.modeling_bert.BertPooler(config)
        self.pool_last = pool_last
        self.apply(self.init_bert_weights)

    @registrable_factory
    @classmethod
    def from_alias(cls,
                   path: str = 'bert-base-cased',
                   cache_dir: Optional[str] = None,
                   pool_last: bool = False,
                   **kwargs) -> 'BERTEncoder':
        """Initialize from a pretrained model.

        Parameters
        ----------
        path: str
            Path to a pretrained model, or one of the following string
            aliases currently available:
            . `bert-base-uncased`
            . `bert-large-uncased`
            . `bert-base-cased`
            . `bert-large-cased`
            . `bert-base-multilingual-uncased`
            . `bert-base-multilingual-cased`
            . `bert-base-chinese`

        """
        model = super().from_pretrained(path, cache_dir=cache_dir, **kwargs)
        model.pool_last = pool_last
        return model

    def forward(self,
                data: Tensor,
                mask: Optional[Tensor] = None) -> Tensor:
        """Performs a forward pass through the network.

        Parameters
        ----------
        data : torch.Tensor
            The embedded input, as a float tensor, batch first
        mask : torch.Tensor, optional
            The attention mask, 1.0 for positions to attend to and
            0.0 for padded positions

        Returns
        -------
        torch.Tensor
            The encoded output, as a float tensor, or the pooled
            output if `pool_last` is True

        """
        attention_mask = mask if mask is not None else torch.ones_like(data)

        # We create a 3D attention mask from a 2D tensor mask.
        # Sizes are [batch_size, 1, 1, to_seq_length]
        # So we can broadcast to
        # [batch_size, num_heads, from_seq_length, to_seq_length]
        # This attention mask is simpler than the triangular masking of
        # causal attention used in OpenAI GPT; we just need to prepare
        # the broadcast dimension here.
        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)

        # Since attention_mask is 1.0 for positions we want to attend
        # and 0.0 for masked positions, this operation will create a
        # tensor which is 0.0 for positions we want to attend and
        # -10000.0 for masked positions.
        # Since we are adding it to the raw scores before the softmax,
        # this is effectively the same as removing these entirely.
        extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype)
        # Write (1.0 - extended_attention_mask) in this roundabout way for
        # mypy so it uses Tensor.__add__ instead of float's
        extended_attention_mask = (-extended_attention_mask + 1.0) * -10000.0

        encoded_layers = self.encoder(data,
                                      extended_attention_mask,
                                      output_all_encoded_layers=True)
        sequence_output = encoded_layers[-1]
        pooled_output = self.pooler(sequence_output)

        out = pooled_output if self.pool_last else sequence_output
        return out

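# Illustrative usage sketch, not part of the original module: chains the two
# modules the way a flambe text pipeline would, feeding the embedded sequence
# and the padding mask from BERTEmbeddings into BERTEncoder. The helper name,
# the dummy token ids, and the shape comment are assumptions for illustration.
def _example_bert_encoder() -> Tensor:
    """Run token ids through pretrained embeddings and the BERT encoder."""
    embedder = BERTEmbeddings.from_alias('bert-base-cased')
    encoder = BERTEncoder.from_alias('bert-base-cased', pool_last=True)
    data = torch.tensor([[101, 7592, 102, 0, 0]])  # dummy ids, 0 = padding
    embedded, mask = embedder(data)
    # With pool_last=True the encoder returns the pooled [CLS] representation,
    # of shape (batch, hidden_size)
    return encoder(embedded, mask)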