Source code for flambe.nlp.transformers.field

from typing import Optional

import torch
from transformers import AutoTokenizer

from flambe.field import Field


class PretrainedTransformerField(Field):
    """Field integration of the transformers library.

    Instantiate this object using any alias available in the
    `transformers` library. More information can be found here:
    https://huggingface.co/transformers/

    """

    def __init__(self,
                 alias: str,
                 cache_dir: Optional[str] = None,
                 max_len_truncate: Optional[int] = None,
                 add_special_tokens: bool = True,
                 **kwargs) -> None:
        """Initialize a pretrained tokenizer.

        Parameters
        ----------
        alias: str
            Alias of a pretrained tokenizer.
        cache_dir: str, optional
            A directory where to cache the downloaded vocabularies.
        max_len_truncate: int, optional
            If given, truncates the length of the tokenized sequence.
        add_special_tokens: bool, optional
            Add the special tokens to the inputs. Default ``True``.

        """
        self._tokenizer = AutoTokenizer.from_pretrained(alias, cache_dir=cache_dir, **kwargs)
        self.max_len_truncate = max_len_truncate
        self.add_special_tokens = add_special_tokens

    @property
    def padding_idx(self) -> int:
        """Get the padding index.

        Returns
        -------
        int
            The padding index in the vocabulary

        """
        pad_token = self._tokenizer.pad_token
        return self._tokenizer.convert_tokens_to_ids(pad_token)
    @property
    def vocab_size(self) -> int:
        """Get the vocabulary length.

        Returns
        -------
        int
            The length of the vocabulary

        """
        return len(self._tokenizer)
    def process(self, example: str) -> torch.Tensor:  # type: ignore
        """Process an example, and create a Tensor.

        Parameters
        ----------
        example: str
            The example to process, as a single string

        Returns
        -------
        torch.Tensor
            The processed example, tokenized and numericalized

        """
        tokens = self._tokenizer.encode(example, add_special_tokens=self.add_special_tokens)
        if self.max_len_truncate is not None:
            tokens = tokens[:self.max_len_truncate]
        return torch.tensor(tokens)
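
A minimal usage sketch, not part of the module source: the `bert-base-uncased` alias, the truncation length, and the example sentence are assumptions chosen for illustration; any tokenizer alias supported by `transformers` works the same way.

# Assumption: the 'bert-base-uncased' tokenizer can be downloaded (or is cached).
field = PretrainedTransformerField(alias='bert-base-uncased', max_len_truncate=128)

# Tokenize and numericalize a single string into a 1-D tensor of token ids,
# truncated to at most 128 ids, with special tokens added by the tokenizer.
ids = field.process("Flambe wraps the transformers tokenizers as a Field.")

# The padding index and vocabulary size are read from the underlying tokenizer.
print(ids.shape, field.padding_idx, field.vocab_size)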