"""
Integration of the pytorch_transformers BERT module
"""
from typing import Tuple, Dict, Any, Optional, Union
import torch
from torch import Tensor
from flambe.compile import registrable_factory
from flambe.field import TextField
from flambe.nn import Module
import pytorch_transformers as pt
class BERTTextField(TextField, pt.BertTokenizer):
"""Perform WordPiece tokenization.
Inspired by: https://github.com/huggingface/pytorch-pretrained-BERT/
blob/master/pytorch_pretrained_bert/tokenization.py.
Note that this object requires a pretrained vocabulary.
"""
def __init__(self, # nosec
vocab_file: str,
sos_token: str = '[CLS]',
eos_token: str = '[SEP]',
do_lower_case: bool = False,
max_len_truncate: int = 100, **kwargs) -> None:
"""Initialize the BERTTextField.
Parameters
----------
vocab_file : str
Where to load the vocabulary from
do_lower_case : bool, optional
            Set to lowercase the input data
max_len_truncate : int, optional
The maximum length of a sequence
never_split : tuple, optional
            Tokens that will never be split by the WordPiece tokenizer
"""
pt.BertTokenizer.__init__(self, vocab_file, do_lower_case=do_lower_case)
self._vocab = self.vocab
self.sos = sos_token
self.eos = eos_token
self.lower = do_lower_case
self.tokenizer = self.tokenize
self.max_len_truncate = max_len_truncate
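    # Direct construction requires a local WordPiece vocabulary file;
    # the path below is illustrative:
    #
    #     field = BERTTextField('/path/to/vocab.txt', do_lower_case=True)
    #
    # In most cases the `from_alias` factory below is the easier entry point.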
@registrable_factory
@classmethod
    def from_alias(cls,
path: str = 'bert-base-cased',
cache_dir: Optional[str] = None,
do_lower_case: bool = False,
max_len_truncate: int = 100,
**kwargs) -> 'BERTTextField':
"""Initialize from a pretrained tokenizer.
Parameters
----------
path: str
Path to a pretrained model, or one of the following string
aliases currently available:
. `bert-base-uncased`
. `bert-large-uncased`
. `bert-base-cased`
. `bert-large-cased`
. `bert-base-multilingual-uncased`
. `bert-base-multilingual-cased`
. `bert-base-chinese`
"""
if 'uncased' in path and not do_lower_case:
raise ValueError("Using uncased model but do_lower_case is False.")
field = super().from_pretrained(path, cache_dir=cache_dir, **kwargs)
field.basic_tokenizer.do_lower_case = do_lower_case
field.max_len_truncate = max_len_truncate
return field
    def process(self, example: str) -> torch.Tensor:  # type: ignore
"""Process an example, and create a Tensor.
Parameters
----------
example: str
The example to process, as a single string
Returns
-------
torch.Tensor
The processed example, tokenized and numericalized
"""
tokens = self.tokenize(example)
tokens = tokens[:self.max_len_truncate]
# Add extra tokens
if self.sos is not None:
tokens = [self.sos] + list(tokens)
if self.eos is not None:
tokens = list(tokens) + [self.eos]
numericals = self.convert_tokens_to_ids(tokens)
return torch.tensor(numericals)
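# Usage sketch for BERTTextField (downloads the `bert-base-cased`
# vocabulary on first use, so it assumes network access):
#
#     field = BERTTextField.from_alias('bert-base-cased')
#     tensor = field.process("a sentence to tokenize")
#     # tensor holds the [CLS] id, up to 100 WordPiece ids, and the [SEP] id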
class BERTEmbeddings(Module, pt.modeling_bert.BertPreTrainedModel):
"""Integrate the pytorch_pretrained_bert BERT word embedding model.
This module can be used as any normal encoder, or it can be
loaded with the official pretrained BERT models. Simply used
the `from_pretrained` class method when initializing the model.
Currently available:
. `bert-base-uncased`
. `bert-large-uncased`
. `bert-base-cased`
. `bert-large-cased`
. `bert-base-multilingual-uncased`
. `bert-base-multilingual-cased`
. `bert-base-chinese`
"""
def __init__(self,
input_size_or_config: Union[int, pt.BertConfig],
embedding_size: int = 768,
embedding_dropout: float = 0.1,
embedding_freeze: bool = False,
pad_index: int = 0,
max_position_embeddings: int = 512,
type_vocab_size: int = 2, **kwargs) -> None:
"""Initialize the BERTEmbeddings.
Parameters
----------
        input_size_or_config: Union[int, pt.BertConfig]
            Vocabulary size of `inputs_ids` in the model's input,
            or a prebuilt `BertConfig` object
        embedding_size: int, optional
            The size of the resulting embeddings
        embedding_dropout: float, optional
            The dropout probability for the embedding layers
        embedding_freeze: bool, optional
            If True, the embedding weights are frozen during training
        pad_index: int, optional
            The index of the padding token, used to build the
            padding mask
max_position_embeddings: int, optional
The maximum sequence length that this model might
ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
type_vocab_size: int, optional
The vocabulary size of the `token_type_ids`
"""
Module.__init__(self)
if isinstance(input_size_or_config, int):
tmp_config: Dict[str, Any] = {}
            tmp_config['vocab_size_or_config_json_file'] = input_size_or_config
tmp_config['hidden_size'] = embedding_size
tmp_config['hidden_dropout_prob'] = embedding_dropout
tmp_config['max_position_embeddings'] = max_position_embeddings
tmp_config['type_vocab_size'] = type_vocab_size
config: pt.BertConfig = pt.BertConfig(**tmp_config)
else:
config = input_size_or_config
self.config = config
self.pad_index = pad_index
        self.embeddings = pt.modeling_bert.BertEmbeddings(config)
        self.apply(self.init_weights)
if embedding_freeze:
for param in self.parameters():
param.requires_grad = False
@registrable_factory
@classmethod
    def from_alias(cls,
path: str = 'bert-base-cased',
cache_dir: Optional[str] = None, **kwargs) -> 'BERTEmbeddings':
"""Initialize from a pretrained model.
Parameters
----------
path: str
Path to a pretrained model, or one of the following string
aliases currently available:
. `bert-base-uncased`
. `bert-large-uncased`
. `bert-base-cased`
. `bert-large-cased`
. `bert-base-multilingual-uncased`
. `bert-base-multilingual-cased`
. `bert-base-chinese`
"""
return super().from_pretrained(path, cache_dir=cache_dir, **kwargs)
    def forward(self, data: Tensor) -> Tuple[Tensor, Optional[Tensor]]:
"""Performs a forward pass through the network.
Parameters
----------
        data : torch.Tensor
            The input data, as a long tensor of token ids, batch first
Returns
-------
        torch.Tensor
            The embedded output, as a float tensor, batch first
torch.Tensor, optional
The padding mask if a pad index was given
"""
mask = None
if self.pad_index is not None:
mask = (data != self.pad_index).float()
embedded = self.embeddings(data)
return embedded, mask
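# Usage sketch for BERTEmbeddings. Loading from an alias downloads
# pretrained weights; passing an int (e.g. BERTEmbeddings(30522)) builds a
# randomly initialized table with the defaults above. The ids here are
# illustrative:
#
#     embedder = BERTEmbeddings.from_alias('bert-base-cased')
#     ids = torch.tensor([[101, 7592, 102, 0]])  # batch first, 0 = padding
#     embedded, mask = embedder(ids)
#     # embedded: (1, 4, 768) float tensor; mask: (1, 4) with 0. at the pad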
class BERTEncoder(Module, pt.modeling_bert.BertPreTrainedModel):
"""Integrate the pytorch_pretrained_bert BERT encoder model.
This module can be used as any normal encoder, or it can be
loaded with the official pretrained BERT models. Simply used
the `from_pretrained` class method when initializing the model.
Currently available:
. `bert-base-uncased`
. `bert-large-uncased`
. `bert-base-cased`
. `bert-large-cased`
. `bert-base-multilingual-uncased`
. `bert-base-multilingual-cased`
. `bert-base-chinese`
"""
def __init__(self,
input_size_or_config: Union[int, pt.modeling_bert.BertConfig],
hidden_size: int = 768,
num_hidden_layers: int = 12,
num_attention_heads: int = 12,
intermediate_size: int = 3072,
hidden_act: str = "gelu",
hidden_dropout_prob: float = 0.1,
attention_probs_dropout_prob: float = 0.1,
max_position_embeddings: int = 512,
type_vocab_size: int = 2,
initializer_range: float = 0.02,
pool_last: bool = False, **kwargs) -> None:
"""Initialize the BertEncoder.
Parameters
----------
        input_size_or_config: Union[int, pt.BertConfig]
Vocabulary size of `inputs_ids` in the model's input
hidden_size: int, optional
Size of the encoder layers and the pooler layer.
num_hidden_layers: int, optional
Number of hidden layers in the Transformer encoder.
num_attention_heads: int, optional
Number of attention heads for each attention layer in
the Transformer encoder.
intermediate_size: int, optional
The size of the "intermediate" (i.e., feed-forward)
layer in the Transformer encoder.
hidden_act: str, optional
The non-linear activation function (function or string) in
the encoder and pooler.
If string, "gelu", "relu" and "swish" are supported.
hidden_dropout_prob: float, optional
            The dropout probability for all fully connected layers
in the embeddings, encoder, and pooler.
attention_probs_dropout_prob: float, optional
The dropout ratio for the attention probabilities.
max_position_embeddings: int, optional
The maximum sequence length that this model might
ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
type_vocab_size: int, optional
The vocabulary size of the `token_type_ids`
initializer_range: float, optional
            The stddev of the truncated_normal_initializer for
initializing all weight matrices.
"""
Module.__init__(self)
if isinstance(input_size_or_config, int):
tmp_config: Dict[str, Any] = {}
tmp_config['vocab_size_or_config_json_file'] = input_size_or_config
tmp_config['hidden_size'] = hidden_size
tmp_config['num_hidden_layers'] = num_hidden_layers
tmp_config['num_attention_heads'] = num_attention_heads
tmp_config['hidden_act'] = hidden_act
tmp_config['intermediate_size'] = intermediate_size
tmp_config['hidden_dropout_prob'] = hidden_dropout_prob
tmp_config['attention_probs_dropout_prob'] = attention_probs_dropout_prob
tmp_config['max_position_embeddings'] = max_position_embeddings
tmp_config['type_vocab_size'] = type_vocab_size
tmp_config['initializer_range'] = initializer_range
config: pt.BertConfig = pt.BertConfig(**tmp_config)
else:
config = input_size_or_config
self.config = config
self.encoder = pt.modeling_bert.BertEncoder(config)
self.pooler = pt.modeling_bert.BertPooler(config)
self.pool_last = pool_last
        self.apply(self.init_weights)
@registrable_factory
@classmethod
    def from_alias(cls,
path: str = 'bert-base-cased',
cache_dir: Optional[str] = None,
pool_last: bool = False, **kwargs) -> 'BERTEncoder':
"""Initialize from a pretrained model.
Parameters
----------
path: str
Path to a pretrained model, or one of the following string
aliases currently available:
. `bert-base-uncased`
. `bert-large-uncased`
. `bert-base-cased`
. `bert-large-cased`
. `bert-base-multilingual-uncased`
. `bert-base-multilingual-cased`
. `bert-base-chinese`
"""
model = super().from_pretrained(path, cache_dir=cache_dir, **kwargs)
model.pool_last = pool_last
return model
    def forward(self,
                data: Tensor,
                mask: Optional[Tensor] = None) -> Tensor:
"""Performs a forward pass through the network.
Parameters
----------
        data : torch.Tensor
            The input data, as a float tensor of embeddings, batch first
        mask : torch.Tensor, optional
            The padding mask, as produced by `BERTEmbeddings`
        Returns
        -------
        torch.Tensor
            The encoded sequence as a float tensor, or the pooled
            output if `pool_last` is True
"""
        # data is (batch, seq, hidden), so the default mask is (batch, seq)
        attention_mask = mask if mask is not None else data.new_ones(data.shape[:2])
# We create a 3D attention mask from a 2D tensor mask.
# Sizes are [batch_size, 1, 1, to_seq_length]
# So we can broadcast to
# [batch_size, num_heads, from_seq_length, to_seq_length]
        # this attention mask is simpler than the triangular
        # masking of causal attention used in OpenAI GPT; we just need
        # to prepare the broadcast dimension here.
extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
# Since attention_mask is 1.0 for positions we want to attend
# and 0.0 for masked positions, this operation will create a
# tensor which is 0.0 for
# positions we want to attend and -10000.0 for masked positions.
# Since we are adding it to the raw scores before the softmax,
# this is effectively the same as removing these entirely.
extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype)
# Write (1.0 - extended_attention_mask) weird way for mypy
# so it uses Tensor.__add__ instead of float's
extended_attention_mask = (-extended_attention_mask + 1.0) * -10000.0
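        # e.g. a row mask of [1., 1., 0.] becomes additive scores
        # [0., 0., -10000.], leaving real positions untouched while pushing
        # the padded position far below the softmax's effective range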
        # BertEncoder indexes head_mask per layer, so pass explicit Nones
        head_mask = [None] * self.config.num_hidden_layers
        encoder_outputs = self.encoder(data, extended_attention_mask, head_mask=head_mask)
        sequence_output = encoder_outputs[0]
pooled_output = self.pooler(sequence_output)
out = pooled_output if self.pool_last else sequence_output
return out
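# End-to-end sketch combining the three modules (the aliases must match so
# the tokenizer's ids line up with the pretrained embedding table):
#
#     field = BERTTextField.from_alias('bert-base-cased')
#     embedder = BERTEmbeddings.from_alias('bert-base-cased')
#     encoder = BERTEncoder.from_alias('bert-base-cased', pool_last=True)
#
#     ids = field.process("Flambé integrates BERT.").unsqueeze(0)
#     embedded, mask = embedder(ids)
#     pooled = encoder(embedded, mask)  # shape (1, 768) since pool_last=True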