Source code for flambe.nlp.transformers.openai

"""
Integration of the pytorch_transformers openai module
"""

import copy
from typing import Dict, Any, Optional, Tuple, Union

import torch
import torch.nn as nn
from torch import Tensor

from flambe.compile import registrable_factory
from flambe.nn import Module
from flambe.field import TextField

import pytorch_transformers as pt


class OpenAIGPTTextField(TextField, pt.OpenAIGPTTokenizer):
    """Perform byte-pair encoding (BPE) tokenization.

    Inspired by: https://github.com/huggingface/pytorch-pretrained-BERT/
    blob/master/pytorch_pretrained_bert/tokenization_openai.py.

    Note that this object requires a pretrained vocabulary.

    """

    def __init__(self,
                 vocab_file: str,
                 merges_file: str,
                 max_len: int = 100,
                 lower: bool = False) -> None:
        """Initialize the OpenAIGPTTextField.

        Parameters
        ----------
        vocab_file : str
            Where to load the vocabulary from
        merges_file : str
            Where to load the BPE merges from
        max_len : int, optional
            Maximum number of tokens to keep per example
        lower : bool, optional
            Whether to lowercase examples before tokenizing

        """
        pt.OpenAIGPTTokenizer.__init__(self, vocab_file, merges_file)

        self.max_len = max_len  # used by `process` to truncate long inputs
        self.lower = lower
        self._vocab = self.encoder

    @registrable_factory
    @classmethod
    def from_alias(cls,
                   path: str = 'openai-gpt',
                   cache_dir: Optional[str] = None) -> 'OpenAIGPTTextField':
        """Initialize from a pretrained tokenizer."""
        field = super().from_pretrained(path, cache_dir=cache_dir)
        return field

    def process(self, example: str) -> torch.Tensor:  # type: ignore
        """Process an example, and create a Tensor.

        Parameters
        ----------
        example: str
            The example to process, as a single string

        Returns
        -------
        torch.Tensor
            The processed example, tokenized and numericalized

        """
        if self.lower:
            example = example.lower()

        tokens = self.tokenize(example)
        tokens = tokens[:self.max_len]
        numericals = self.convert_tokens_to_ids(tokens)
        return torch.tensor(numericals)
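

# Illustrative usage sketch, not part of the original module: load the field
# from the pretrained 'openai-gpt' alias and numericalize one sentence. The
# helper name and sentence are made up; downloading the vocabulary requires
# network access (or a populated cache_dir).
def _example_field_usage() -> torch.Tensor:
    field = OpenAIGPTTextField.from_alias('openai-gpt')
    # Returns a 1-D long tensor of BPE token ids, truncated to `max_len`.
    return field.process("flambe wraps the openai gpt tokenizer")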


class OpenAIGPTEmbeddings(Module, pt.modeling_openai.OpenAIGPTPreTrainedModel):
    """Integrate the pytorch_transformers OpenAI GPT embedding model.

    This module can be used as any normal encoder, or it can be loaded
    with the official pretrained OpenAI models. Simply use the
    `from_pretrained` class method when initializing the model.

    """

    def __init__(self,
                 input_size_or_config: Union[int, pt.OpenAIGPTConfig] = 40478,
                 embedding_size: int = 768,
                 embedding_dropout: float = 0.1,
                 embedding_freeze: bool = False,
                 pad_index: int = 0,
                 n_special: int = 0,
                 n_positions: int = 512,
                 initializer_range=0.02) -> None:
        """Initialize the OpenAIGPTEmbeddings.

        Parameters
        ----------
        input_size_or_config: Union[int, OpenAIGPTConfig]
            Vocabulary size or configuration
        embedding_size: int, optional
            Dimensionality of the embeddings and hidden states.
        embedding_dropout: float, optional
            The dropout ratio for the embeddings.
        embedding_freeze: bool, optional
            If True, freeze the embedding parameters.
        pad_index: int, optional
            Index of the padding token, used to build the padding mask.
        n_special: int
            The number of special tokens to learn during fine-tuning
            ('[SEP]', '[CLF]', ...)
        n_positions: int, optional
            Number of positional embeddings.
        initializer_range: float, optional
            The stddev of the truncated_normal_initializer for
            initializing all weight matrices.

        """
        Module.__init__(self)

        if isinstance(input_size_or_config, int):
            tmp_config: Dict[str, Any] = {}
            tmp_config['vocab_size_or_config_json_file'] = input_size_or_config
            tmp_config['n_embd'] = embedding_size
            tmp_config['embd_pdrop'] = embedding_dropout
            tmp_config['n_special'] = n_special
            tmp_config['n_positions'] = n_positions
            self.config: pt.OpenAIGPTConfig = pt.OpenAIGPTConfig(**tmp_config)
        else:
            self.config = input_size_or_config

        # `forward` builds the padding mask from this index
        self.pad_index = pad_index

        num_tokens = self.config.vocab_size + self.config.n_special
        self.tokens_embed = nn.Embedding(num_tokens, self.config.n_embd)
        self.positions_embed = nn.Embedding(self.config.n_positions, self.config.n_embd)
        self.drop = nn.Dropout(self.config.embd_pdrop)

        self.apply(self.init_weights)

        if embedding_freeze:
            for param in self.parameters():
                param.requires_grad = False

    @registrable_factory
    @classmethod
    def from_alias(cls,
                   path: str = 'openai-gpt',
                   cache_dir: Optional[str] = None) -> 'OpenAIGPTEmbeddings':
        """Initialize from a pretrained model.

        Parameters
        ----------
        path: str
            Path to a pretrained model, or one of the following string
            aliases currently available:
            . `openai-gpt`

        """
        return super().from_pretrained(path, cache_dir=cache_dir)

    def set_num_special_tokens(self, num_special_tokens):
        """Update the input embeddings with a new embedding matrix if needed."""
        return super().set_num_special_tokens(num_special_tokens)

    def forward(self, data: Tensor) -> Tuple[Tensor, Optional[Tensor]]:
        """Performs a forward pass through the network.

        Parameters
        ----------
        data : torch.Tensor
            The input data, as a long tensor of token indices, batch first

        Returns
        -------
        torch.Tensor
            The encoded output, as a float tensor, batch first
        torch.Tensor, optional
            The padding mask if a pad index was given

        """
        mask = None
        if self.pad_index is not None:
            mask = (data != self.pad_index).float()

        # This was used when we had a single embedding
        # matrix for position and token embeddings
        # start = self.config.vocab_size + self.config.n_special
        # end = start + data.size(-1)
        # position_ids = torch.arange(start, end,
        #                             dtype=torch.long, device=data.device)

        position_ids = torch.arange(data.size(-1), dtype=torch.long, device=data.device)
        position_ids = position_ids.unsqueeze(0).expand_as(data)
        position_ids = position_ids.view(-1, position_ids.size(-1))

        inputs_embeds = self.tokens_embed(data)
        position_embeds = self.positions_embed(position_ids)
        token_type_embeds = 0
        # Add the position information to the input embeddings
        # h = e.sum(dim=2)
        hidden_states = inputs_embeds + position_embeds + token_type_embeds
        return hidden_states, mask
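

# Illustrative usage sketch, not part of the original module: embed a small
# batch of token ids with the pretrained weights. The alias, token ids, and
# helper name are assumptions for the example; shapes assume embedding_size=768.
def _example_embeddings_usage() -> Tuple[Tensor, Optional[Tensor]]:
    embedder = OpenAIGPTEmbeddings.from_alias('openai-gpt')
    token_ids = torch.tensor([[40, 481, 328, 0, 0]])  # (batch=1, seq_len=5), padded with 0
    # Returns (hidden_states, mask): hidden_states is (1, 5, 768) and mask
    # marks non-pad positions with 1.0.
    return embedder(token_ids)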


class OpenAIGPTEncoder(Module, pt.modeling_openai.OpenAIGPTPreTrainedModel):
    """Integrate the pytorch_transformers OpenAI GPT encoder model.

    This module can be used as any normal encoder, or it can be loaded
    with the official pretrained OpenAI GPT models. Simply use the
    `from_pretrained` class method when initializing the model.

    Currently available:
    . `openai-gpt`

    """

    def __init__(self,
                 input_size_or_config: Union[int, pt.OpenAIGPTConfig] = 768,
                 n_ctx: int = 512,
                 n_layer: int = 12,
                 n_head: int = 12,
                 afn: Union[str, nn.Module] = "gelu",
                 resid_pdrop: float = 0.1,
                 embd_pdrop: float = 0.1,
                 attn_pdrop: float = 0.1,
                 layer_norm_epsilon: float = 1e-5,
                 initializer_range=0.02) -> None:
        """Initialize the OpenAIGPTEncoder.

        Parameters
        ----------
        input_size_or_config: Union[int, OpenAIGPTConfig]
            Hidden size (n_embd) or configuration
        n_ctx: int
            Size of the causal mask (usually same as n_positions).
        n_layer: int, optional
            Number of hidden layers in the Transformer encoder.
        n_head: int, optional
            Number of attention heads for each attention layer in
            the Transformer encoder.
        afn: Union[str, nn.Module]
            The non-linear activation function (module or string) in the
            encoder and pooler. If string, "gelu", "relu" and "swish"
            are supported.
        resid_pdrop: float, optional
            The dropout probability for all fully connected layers in
            the embeddings, encoder, and pooler.
        embd_pdrop: float, optional
            The dropout ratio for the embeddings.
        attn_pdrop: float, optional
            The dropout ratio for the attention probabilities.
        layer_norm_epsilon: float, optional
            epsilon to use in the layer norm layers
        initializer_range: float, optional
            The stddev of the truncated_normal_initializer for
            initializing all weight matrices.

        """
        Module.__init__(self)

        if isinstance(input_size_or_config, int):
            tmp_config: Dict[str, Any] = {}
            tmp_config['n_embd'] = input_size_or_config
            tmp_config['n_ctx'] = n_ctx
            tmp_config['n_layer'] = n_layer
            tmp_config['n_head'] = n_head
            tmp_config['afn'] = afn
            tmp_config['resid_pdrop'] = resid_pdrop
            tmp_config['embd_pdrop'] = embd_pdrop
            tmp_config['attn_pdrop'] = attn_pdrop
            tmp_config['layer_norm_epsilon'] = layer_norm_epsilon
            tmp_config['initializer_range'] = initializer_range
            self.config: pt.OpenAIGPTConfig = pt.OpenAIGPTConfig(**tmp_config)
        else:
            self.config = input_size_or_config

        block = pt.modeling_openai.Block(self.config.n_ctx, self.config, scale=True)
        self.h = nn.ModuleList([copy.deepcopy(block) for _ in range(self.config.n_layer)])

        self.apply(self.init_weights)

    @registrable_factory
    @classmethod
    def from_alias(cls,
                   path: str = 'openai-gpt',
                   cache_dir: Optional[str] = None) -> 'OpenAIGPTEncoder':
        """Initialize from a pretrained model.

        Parameters
        ----------
        path: str
            Path to a pretrained model, or one of the following string
            aliases currently available:
            . `openai-gpt`

        """
        return super().from_pretrained(path, cache_dir=cache_dir)

    def forward(self,
                data: Tensor,
                mask: Optional[Tensor] = None) -> Tensor:
        """Performs a forward pass through the network.

        Parameters
        ----------
        data : torch.Tensor
            The input embeddings, as a float tensor, batch first

        Returns
        -------
        torch.Tensor
            The encoded output, as a float tensor, batch first

        """
        hidden_states = data
        for block in self.h:
            # Each Block returns a list whose first element is the
            # updated hidden state.
            hidden_states = block(hidden_states)[0]
        return hidden_states
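

# Illustrative sketch, not part of the original module: chain the embedding and
# encoder modules the way a downstream flambe model might. Aliases, token ids,
# and the helper name are assumptions for the example.
def _example_encoder_usage() -> Tensor:
    embedder = OpenAIGPTEmbeddings.from_alias('openai-gpt')
    encoder = OpenAIGPTEncoder.from_alias('openai-gpt')
    token_ids = torch.tensor([[40, 481, 328]])  # (batch=1, seq_len=3)
    hidden_states, _mask = embedder(token_ids)
    # hidden_states: (1, 3, 768); the encoder runs it through the 12 Blocks.
    return encoder(hidden_states)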