Source code for flambe.nlp.transformers.field

from typing import Optional

import torch
from transformers import AutoTokenizer

from flambe.field import Field


class PretrainedTransformerField(Field):
    """Field integration of the transformers library.

    Instantiate this object using any alias available in the
    `transformers` library. More information can be found here:
    https://huggingface.co/transformers/

    """

    def __init__(self,
                 alias: str,
                 cache_dir: Optional[str] = None,
                 max_len_truncate: Optional[int] = None,
                 add_special_tokens: bool = True,
                 **kwargs) -> None:
        """Initialize a pretrained tokenizer.

        Parameters
        ----------
        alias: str
            Alias of a pretrained tokenizer.
        cache_dir: str, optional
            A directory where to cache the downloaded vocabularies.
        max_len_truncate: int, optional
            If given, truncates the length of the tokenized sequence.
        add_special_tokens: bool, optional
            Add the special tokens to the inputs. Default ``True``.

        """
        self._tokenizer = AutoTokenizer.from_pretrained(alias, cache_dir=cache_dir, **kwargs)
        self.max_len_truncate = max_len_truncate
        self.add_special_tokens = add_special_tokens

    @property
    def padding_idx(self) -> int:
        """Get the padding index.

        Returns
        -------
        int
            The padding index in the vocabulary

        """
        pad_token = self._tokenizer.pad_token
        return self._tokenizer.convert_tokens_to_ids(pad_token)
    @property
    def vocab_size(self) -> int:
        """Get the vocabulary length.

        Returns
        -------
        int
            The length of the vocabulary

        """
        return len(self._tokenizer)
    def process(self, example: str) -> torch.Tensor:  # type: ignore
        """Process an example, and create a Tensor.

        Parameters
        ----------
        example: str
            The example to process, as a single string

        Returns
        -------
        torch.Tensor
            The processed example, tokenized and numericalized

        """
        tokens = self._tokenizer.encode(example, add_special_tokens=self.add_special_tokens)
        if self.max_len_truncate is not None:
            tokens = tokens[:self.max_len_truncate]
        return torch.tensor(tokens)
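
A minimal usage sketch, not part of the module source: the `bert-base-uncased` alias, the truncation length, and the example sentence are assumptions chosen for illustration; any tokenizer alias supported by `transformers` works the same way.

# Assumption: the 'bert-base-uncased' tokenizer can be downloaded (or is cached).
field = PretrainedTransformerField(alias='bert-base-uncased', max_len_truncate=128)

# Tokenize and numericalize a single string into a 1-D tensor of token ids,
# truncated to at most 128 ids, with special tokens added by the tokenizer.
ids = field.process("Flambe wraps the transformers tokenizers as a Field.")

# The padding index and vocabulary size are read from the underlying tokenizer.
print(ids.shape, field.padding_idx, field.vocab_size)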