Source code for flambe.dataset.tabular

import os
from typing import Optional, List, Tuple, Iterable, Dict, Union, Any

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from flambe.dataset import Dataset
from flambe.compile import registrable_factory
from flambe.field import Field


class DataView:
    """TabularDataset view for the train, val or test split.

    This class should only be used internally by the TabularDataset
    class. A DataView is a lazy Iterable that receives its operations
    from the TabularDataset object. When __getitem__ is called, all
    the fields defined in the transform are applied.

    This object can cache examples that were already transformed. To
    enable this, make sure to use this view under a Singleton pattern
    (there must be only one DataView per split in the TabularDataset).

    """

    def __init__(self,
                 data: np.ndarray,
                 transform_hooks: List[Tuple[Field, Union[int, List[int]]]],
                 cache: bool) -> None:
        """
        Parameters
        ----------
        data: np.ndarray
            A 2d numpy array holding the data
        transform_hooks: List[Tuple[Field, Union[int, List[int]]]]
            The transformations that will be applied to each example.
        cache: bool
            Whether to cache transformed examples.

        """
        self.data = data  # Stores the raw data
        self.transform_hooks = transform_hooks
        self.cache = cache  # Whether to cache the transformed data
        self.cached_data: Dict[int, Any] = {}
    @property
    def raw(self):
        """Return a subscriptable version of the data."""
        return self.data
    def __getitem__(self, index):
        """Get an item from an index and apply the transformations
        dynamically.

        """
        if self.data is None:
            raise IndexError()

        if self.cache and index in self.cached_data:
            return self.cached_data[index]

        ex = self.data[index]

        if len(self.transform_hooks) > 0:
            ret = []
            for field, cols in self.transform_hooks:
                _ex = ex[cols]
                if isinstance(cols, list):
                    processed_ex = field.process(*_ex)
                else:
                    processed_ex = field.process(_ex)

                if isinstance(processed_ex, tuple):
                    ret.extend(processed_ex)
                else:
                    ret.append(processed_ex)
            ret = tuple(ret)
        else:
            ret = tuple(ex)

        if self.cache:
            self.cached_data[index] = ret

        return ret
    def is_empty(self) -> bool:
        """Return whether the DataView contains no data."""
        return len(self) == 0
    def cols(self) -> int:
        """Return the number of columns the DataView has."""
        if self.is_empty():
            raise ValueError("Empty DataView contains no columns")
        return len(self[0])
    def __len__(self) -> int:
        """Return the length of the DataView, i.e. the number of
        examples it contains.

        """
        if self.data is None:
            return 0
        return len(self.data)
    def __setitem__(self, key, value):
        """Raise an error, as DataViews are immutable."""
        raise ValueError("DataView objects are immutable")
    def __delitem__(self, key):
        """Raise an error, as DataViews are immutable."""
        raise ValueError("DataView objects are immutable")
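
# A minimal sketch of the lazy behavior described above, assuming a
# hypothetical `LowercaseField` whose `process` method lowercases a single
# string. Fields run on first access; the raw rows stay untransformed:
#
#     data = np.array([["Hello", "pos"]], dtype=object)
#     view = DataView(data, [(LowercaseField(), 0)], cache=True)
#     view[0]      # -> ("hello",)
#     view.raw[0]  # -> the original ["Hello", "pos"] row
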
class TabularDataset(Dataset):
    """Loader for tabular data, usually in `csv` or `tsv` format.

    A TabularDataset can represent any data that can be organized
    in a table. Internally, we store all information in a 2D numpy
    generic array. This object also behaves as a sequence over the
    whole dataset, chaining the training, validation and test data,
    in that order. This is useful for creating vocabularies or
    loading embeddings over the full dataset.

    Attributes
    ----------
    train: DataView
        The list of training examples
    val: DataView
        The list of validation examples
    test: DataView
        The list of test examples

    """

    def __init__(self,
                 train: Iterable[Iterable],
                 val: Optional[Iterable[Iterable]] = None,
                 test: Optional[Iterable[Iterable]] = None,
                 cache: bool = True,
                 named_columns: Optional[List[str]] = None,
                 transform: Optional[Dict[str, Union[Field, Dict]]] = None) -> None:
        """Initialize the TabularDataset.

        Parameters
        ----------
        train: Iterable[Iterable]
            The train data
        val: Iterable[Iterable], optional
            The val data, optional
        test: Iterable[Iterable], optional
            The test data, optional
        cache: bool
            Whether to cache fetched examples. Only use True if the
            dataset fits in memory. Defaults to True.
        named_columns: Optional[List[str]]
            The columns' names of the dataset, in order.
        transform: Dict[str, Union[Field, Dict]]
            The fields to be applied to the columns. Each field is
            identified with a name for easy linking. For example:
            {
                'text': {'field': SomeField(), 'columns': [0, 1]},
                'label': {'field': SomeOtherField(), 'columns': 2}
            }

        """
        self._train = np.array(train, dtype=object)
        self._val = None
        self._test = None

        if val is not None:
            self._val = np.array(val, dtype=object)
        if test is not None:
            self._test = np.array(test, dtype=object)

        self.cache = cache
        self.named_columns = named_columns

        cols = []
        # All datasets should be 2-dimensional
        for k, d in {"val": self._val, "test": self._test, "train": self._train}.items():
            if d is not None:
                cols.append(d.shape[-1])
                if len(d.shape) != 2:
                    # This happens when examples differ in their number
                    # of columns and numpy stores them in a 1-D array
                    # (with tuples as values)
                    raise ValueError(
                        f"{k} dataset contains examples with differing numbers of columns"
                    )

        # Check that all splits contain the same number of columns
        if np.unique(cols).shape != (1,):
            raise ValueError("All splits containing data should have the same number of columns")

        if named_columns and len(named_columns) != cols[0]:
            raise ValueError("Columns parameter should have the same size as the "
                             "dataset's number of columns")

        # Store the hooks for lazy loading
        self.transform_hooks: List[Tuple[Field, Union[int, List[int]]]] = []
        self.transform = transform

        if transform:
            self._set_transforms(transform)

        self.train_view: Optional[DataView] = None
        self.val_view: Optional[DataView] = None
        self.test_view: Optional[DataView] = None
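    # A usage sketch for the constructor, assuming hypothetical `TextField`
    # and `LabelField` classes (any `Field` subclass works the same way):
    #
    #     dataset = TabularDataset(
    #         train=[["good movie", "pos"], ["bad movie", "neg"]],
    #         val=[["fine movie", "pos"]],
    #         named_columns=["text", "label"],
    #         transform={
    #             "text": {"field": TextField(), "columns": "text"},
    #             "label": {"field": LabelField(), "columns": "label"},
    #         },
    #     )
    #
    # Each named field also becomes an attribute (`dataset.text`,
    # `dataset.label`), and its `process` output is what indexing a split
    # returns for the hooked columns.
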
    def _set_transforms(self, transform: Dict[str, Union[Field, Dict]]) -> None:
        """Set transformation attributes and hooks on the data splits.

        This method adds an attribute for each field in the transform
        dict. It also adds a hook for the 'process' call of each field.

        ATTENTION: This method works with the _train, _val and _test
        hidden attributes, as it runs in the constructor and creates
        the hooks that are later used to build the split properties.

        """
        columns: Union[int, List[int]]

        for k, t in enumerate(transform.items()):
            name, value = t
            if isinstance(value, Field):
                field = value
                columns = k
            else:
                try:
                    field, tmp_cols = value['field'], value.get('columns', k)

                    # Process as a list to avoid repeating code
                    if not isinstance(tmp_cols, list):
                        tmp_cols = [tmp_cols]

                    for i, c in enumerate(tmp_cols[:]):
                        if isinstance(c, str):
                            if not self.named_columns:
                                raise ValueError(
                                    "Columns parameter is required for str-based indexing"
                                )
                            try:
                                tmp_cols[i] = self.named_columns.index(c)
                            except ValueError:
                                raise ValueError(
                                    f"Dataset has no column named {c}. "
                                    f"Available columns: {self.named_columns}"
                                )

                    columns = tmp_cols
                    # If it was originally a single value, process it
                    # as a single value
                    if len(tmp_cols) == 1:
                        columns = tmp_cols[0]
                except KeyError:
                    raise ValueError(
                        f"If a dict is provided in 'transform', then it must have the 'field' key."
                        f" transform item = {k, t}"
                    )

            setattr(self, name, field)

            args = [self._train[:, columns]]
            if self._val is not None:
                args.append(self._val[:, columns])
            if self._test is not None:
                args.append(self._test[:, columns])
            field.setup(*args)

            self.transform_hooks.append((field, columns))
    @registrable_factory
    @classmethod
    def from_path(cls,
                  train_path: str,
                  val_path: Optional[str] = None,
                  test_path: Optional[str] = None,
                  sep: Optional[str] = '\t',
                  header: Optional[Union[str, int]] = 'infer',
                  columns: Optional[Union[List[str], List[int]]] = None,
                  encoding: Optional[str] = 'utf-8',
                  transform: Optional[Dict[str, Union[Field, Dict]]] = None
                  ) -> 'TabularDataset':
        """Load a TabularDataset from the given file paths.

        Parameters
        ----------
        train_path : str
            The path to the train data
        val_path : str, optional
            The path to the optional validation data
        test_path : str, optional
            The path to the optional test data
        sep: str
            Separator to pass to the `read_csv` method
        header: Optional[Union[str, int]]
            Use 0 for the first line, None for no headers, and 'infer'
            to detect it automatically; defaults to 'infer'
        columns: List[str]
            List of columns to load; can be used to select a subset of
            columns or change their order at loading time
        encoding: str
            The encoding format passed to the pandas reader
        transform: Dict[str, Union[Field, Dict]]
            The fields to be applied to the columns. Each field is
            identified with a name for easy linking.

        """
        if (
            columns and
            any(isinstance(c, int) for c in columns) and
            any(isinstance(c, str) for c in columns)
        ):
            raise ValueError("The columns parameter must be all strings or all integers.")

        train, cols = cls._load_file(train_path, sep, header, columns, encoding)

        val, test = None, None
        if val_path is not None:
            val, _ = cls._load_file(val_path, sep, header, columns, encoding)
        if test_path is not None:
            test, _ = cls._load_file(test_path, sep, header, columns, encoding)

        return cls(train=train, val=val, test=test,
                   transform=transform, named_columns=cols)
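    # A loading sketch (hypothetical file names; `TextField` stands in for
    # any `Field` subclass):
    #
    #     dataset = TabularDataset.from_path(
    #         train_path='train.tsv',
    #         val_path='val.tsv',
    #         columns=['text', 'label'],
    #         transform={'text': {'field': TextField(), 'columns': 'text'}},
    #     )
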
    @registrable_factory
    @classmethod
    def autogen(cls,
                data_path: str,
                test_path: Optional[str] = None,
                seed: Optional[int] = None,
                test_ratio: Optional[float] = 0.2,
                val_ratio: Optional[float] = 0.2,
                sep: Optional[str] = '\t',
                header: Optional[Union[str, int]] = 'infer',
                columns: Optional[Union[List[str], List[int]]] = None,
                encoding: Optional[str] = 'utf-8',
                transform: Optional[Dict[str, Union[Field, Dict]]] = None
                ) -> 'TabularDataset':
        """Generate a test and validation set from the given file
        paths, then load a TabularDataset.

        Parameters
        ----------
        data_path: str
            The path to the data
        test_path: Optional[str]
            The path to the test data
        seed: Optional[int]
            Random seed to be used in the test/val generation
        test_ratio: Optional[float]
            The ratio of the test dataset in relation to the whole
            dataset. If `test_path` is specified, this parameter has
            no effect.
        val_ratio: Optional[float]
            The ratio of the validation dataset in relation to the
            training dataset (whole - test)
        sep: str
            Separator to pass to the `read_csv` method
        header: Optional[Union[str, int]]
            Use 0 for the first line, None for no headers, and 'infer'
            to detect it automatically; defaults to 'infer'
        columns: List[str]
            List of columns to load; can be used to select a subset of
            columns or change their order at loading time
        encoding: str
            The encoding format passed to the pandas reader
        transform: Dict[str, Union[Field, Dict]]
            The fields to be applied to the columns. Each field is
            identified with a name for easy linking.

        """
        if (
            columns and
            any(isinstance(c, int) for c in columns) and
            any(isinstance(c, str) for c in columns)
        ):
            raise ValueError("The columns parameter must be all strings or all integers.")

        data, cols = cls._load_file(data_path, sep=sep, header=header,
                                    columns=columns, encoding=encoding)

        train, val, test = None, None, None
        if test_path is not None:
            train, val = train_test_split(data, test_size=val_ratio, random_state=seed)
            test, _ = cls._load_file(test_path, sep=sep, header=header,
                                     columns=columns, encoding=encoding)
        else:
            train_val, test = train_test_split(data, test_size=test_ratio, random_state=seed)
            train, val = train_test_split(train_val, test_size=val_ratio, random_state=seed)

        return cls(train=train, val=val, test=test,
                   transform=transform, named_columns=cols)
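    # Split arithmetic sketch (hypothetical file name): with 1000 rows and
    # the default ratios, 200 rows (test_ratio=0.2) become the test set, and
    # the remaining 800 are split again into 160 val and 640 train rows
    # (val_ratio=0.2):
    #
    #     dataset = TabularDataset.autogen('all_data.tsv', seed=42)
    #     # len(dataset.train), len(dataset.val), len(dataset.test)
    #     # -> (640, 160, 200)
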
    @classmethod
    def _load_file(cls,
                   path: str,
                   sep: Optional[str] = '\t',
                   header: Optional[Union[str, int]] = 'infer',
                   columns: Optional[Union[List[str], List[int]]] = None,
                   encoding: Optional[str] = 'utf-8'
                   ) -> Tuple[List[Tuple], Optional[List[str]]]:
        """Load data from the given path.

        The path may be either a single file or a directory. If it is
        a directory, each file is loaded according to the specified
        options and all the data is concatenated into a single list.
        The files are processed in order based on file name.

        Parameters
        ----------
        path : str
            Path to the data; could be a directory, a file, or a
            smart_open link
        sep: str
            Separator to pass to the `read_csv` method
        header: Optional[Union[str, int]]
            Use 0 for the first line, None for no headers, and 'infer'
            to detect it automatically; defaults to 'infer'
        columns: Optional[Union[List[str], List[int]]]
            List of columns to load; can be used to select a subset of
            columns or change their order at loading time
        encoding: str
            The encoding format passed to the pandas reader

        Returns
        -------
        Tuple[List[Tuple], Optional[List[str]]]
            A tuple containing the list of examples (where each
            example is itself also a list or tuple of entries in the
            dataset) and an optional list of named columns (one string
            for each column in the dataset)

        """
        # Get all paths
        if isinstance(path, str) and os.path.isdir(path):
            file_paths = [os.path.join(path, name) for name in os.listdir(path)]
            file_paths = sorted(file_paths)
        else:
            file_paths = [path]

        data: List = []
        for file_path in file_paths:
            # Don't fail on buggy files
            try:
                examples = pd.read_csv(file_path,
                                       sep=sep,
                                       header=header,
                                       index_col=False,
                                       dtype=str,
                                       encoding=encoding,
                                       keep_default_na=False)
                # Select columns
                if columns is not None:
                    examples = examples[columns]
                data.extend(examples.values.tolist())
            except Exception as e:
                print(f"Warning: failed to load file {file_path}")
                print(e)

        if len(data) == 0:
            raise ValueError(f"No data found at {path}")

        # Take the named columns from the columns parameter if they
        # are strings, or fall back to the pd.DataFrame column names
        # if those are all strings.
        named_cols: List[str] = []
        if columns:
            for i, c in enumerate(columns):  # type: ignore
                if isinstance(c, str):
                    named_cols.append(c)
        elif all(isinstance(c, str) for c in examples.columns):
            named_cols = examples.columns.tolist()

        return data, named_cols if len(named_cols) > 0 else None
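    # Directory behavior sketch (hypothetical layout): if 'data/' contains
    # 'a.tsv' and 'b.tsv', both are parsed with the same options and their
    # rows concatenated, with 'a.tsv' first because paths are sorted by name:
    #
    #     rows, names = TabularDataset._load_file('data/', sep='\t')
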
    @property
    def train(self) -> DataView:
        """Return the training data as a DataView."""
        if self.train_view is None:
            self.train_view = DataView(self._train, self.transform_hooks, self.cache)
        return self.train_view
    @property
    def val(self) -> DataView:
        """Return the validation data as a DataView."""
        if self.val_view is None:
            self.val_view = DataView(self._val, self.transform_hooks, self.cache)
        return self.val_view
    @property
    def test(self) -> DataView:
        """Return the test data as a DataView."""
        if self.test_view is None:
            self.test_view = DataView(self._test, self.transform_hooks, self.cache)
        return self.test_view
    @property
    def raw(self) -> np.ndarray:
        """Return all partitions of the data as a numpy ndarray."""
        args = [self._train]
        if not self.val.is_empty():
            args.append(self.val.raw)
        if not self.test.is_empty():
            args.append(self.test.raw)
        return np.concatenate(args, axis=0)
    @property
    def cols(self) -> int:
        """Return the number of columns in the tabular dataset."""
        return self.train.cols()
    def __len__(self):
        """Get the length of the dataset."""
        return len(self.train) + len(self.val) + len(self.test)
    def __iter__(self):
        """Iterate through the dataset."""
        for i in range(len(self)):
            yield self[i]
    def __getitem__(self, index):
        """Get the item at the given index, chaining the train, val
        and test splits in that order.

        """
        ceiling = len(self.train)
        if index < ceiling:
            return self.train[index]

        offset = ceiling
        ceiling += len(self.val)
        if index < ceiling:
            return self.val[index - offset]

        offset = ceiling
        ceiling += len(self.test)
        if index < ceiling:
            return self.test[index - offset]

        # The index is past the end of the chained splits
        raise IndexError("TabularDataset index out of range")
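
# Chained-indexing sketch: with 3 train, 2 val and 1 test examples, the
# dataset behaves as one sequence of length 6, so index 3 falls in the val
# split and index 5 in the test split:
#
#     dataset[3] == dataset.val[0]
#     dataset[5] == dataset.test[0]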