Source code for renate.benchmark.datasets.nlp_datasets

# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
import functools
import logging
from typing import Any, Dict, Optional

import datasets
import torch
import transformers
from datasets import load_dataset

from renate import defaults
from renate.benchmark.datasets.base import DataIncrementalDataModule
from renate.data.data_module import RenateDataModule


class _InputTargetWrapper(torch.utils.data.Dataset):
    """Make a Hugging Face dataset comply with the `(input, target)` format."""

    def __init__(self, dataset, target_column: str = "label"):
        self._dataset = dataset
        self._target_column = target_column

    def __len__(self):
        return len(self._dataset)

    def __getitem__(self, idx):
        item = self._dataset[idx]
        target = item.pop(self._target_column)
        return item, target


class HuggingFaceTextDataModule(RenateDataModule):
    """Data module wrapping Hugging Face text datasets.

    This is a convenience wrapper to expose a Hugging Face dataset as a `RenateDataModule`.
    Datasets will be pre-tokenized and will return `input, target = dataset[i]`, where `input`
    is a dictionary with fields `["input_ids", "attention_mask"]`, and `target` is a tensor.

    We expect the dataset to have a "train" and a "test" split. An additional "validation"
    split will be used if present. Otherwise, a validation set may be split off of the
    training data using the `val_size` argument.

    Args:
        data_path: The path to the folder containing the dataset files.
        tokenizer: Tokenizer to apply to the dataset. See https://huggingface.co/docs/tokenizers/
            for more information on tokenizers.
        dataset_name: Name of the dataset, see https://huggingface.co/datasets. This is a
            wrapper for text datasets only.
        input_column: Name of the column containing the input text.
        target_column: Name of the column containing the target (e.g., class label).
        tokenizer_kwargs: Keyword arguments passed when calling the tokenizer's ``__call__``
            function. Typical options are `max_length`, `padding` and `truncation`. See
            https://huggingface.co/docs/tokenizers/ for more information on tokenizers. If
            `None` is passed, this defaults to
            `{"padding": "max_length", "max_length": 128, "truncation": True}`.
        val_size: Fraction of the training data to be used for validation.
        seed: Seed used to fix random number generation.
    """

    def __init__(
        self,
        data_path: str,
        tokenizer: transformers.PreTrainedTokenizer,
        dataset_name: str = "ag_news",
        input_column: str = "text",
        target_column: str = "label",
        tokenizer_kwargs: Optional[Dict[str, Any]] = None,
        val_size: float = defaults.VALIDATION_SIZE,
        seed: int = defaults.SEED,
    ):
        super(HuggingFaceTextDataModule, self).__init__(
            data_path=data_path,
            val_size=val_size,
            seed=seed,
        )
        self._dataset_name = dataset_name
        self._input_column = input_column
        self._target_column = target_column
        self._tokenizer = tokenizer
        self._tokenizer_kwargs = tokenizer_kwargs or defaults.TOKENIZER_KWARGS

    def prepare_data(self) -> None:
        """Download data."""
        split_names = datasets.get_dataset_split_names(self._dataset_name)
        if "train" not in split_names:
            raise RuntimeError(f"Dataset {self._dataset_name} does not contain a 'train' split.")
        if "test" not in split_names:
            raise RuntimeError(f"Dataset {self._dataset_name} does not contain a 'test' split.")
        self._train_data = datasets.load_dataset(
            self._dataset_name, split="train", cache_dir=self._data_path
        )
        available_columns = list(self._train_data.features)
        if self._input_column not in available_columns:
            raise ValueError(
                f"Input column '{self._input_column}' does not exist in {self._dataset_name}. "
                f"Available columns: {available_columns}."
            )
        if self._target_column not in available_columns:
            raise ValueError(
                f"Target column '{self._target_column}' does not exist in {self._dataset_name}. "
                f"Available columns: {available_columns}."
            )
        self._test_data = datasets.load_dataset(
            self._dataset_name, split="test", cache_dir=self._data_path
        )
        if "validation" in split_names:
            logging.info(f"Using 'validation' split of dataset {self._dataset_name}.")
            self._val_data = datasets.load_dataset(
                self._dataset_name, split="validation", cache_dir=self._data_path
            )
        else:
            logging.info(
                f"No 'validation' split in dataset {self._dataset_name}. Splitting validation "
                f"data from the 'train' split using `val_size={self._val_size}`."
            )
            self._val_data = None

    def setup(self) -> None:
        """Set up train, test and val datasets."""
        self.prepare_data()  # This will use cached datasets if they have already been downloaded.

        def tokenize_fn(batch):
            return self._tokenizer(batch[self._input_column], **self._tokenizer_kwargs)

        columns = ["input_ids", "attention_mask", self._target_column]
        self._train_data = self._train_data.map(tokenize_fn, batched=True)
        self._train_data.set_format(type="torch", columns=columns)
        self._train_data = _InputTargetWrapper(self._train_data, self._target_column)
        self._test_data = self._test_data.map(tokenize_fn, batched=True)
        self._test_data.set_format(type="torch", columns=columns)
        self._test_data = _InputTargetWrapper(self._test_data, self._target_column)
        if self._val_data is not None:
            self._val_data = self._val_data.map(tokenize_fn, batched=True)
            self._val_data.set_format(type="torch", columns=columns)
            self._val_data = _InputTargetWrapper(self._val_data, self._target_column)
        else:
            self._train_data, self._val_data = self._split_train_val_data(self._train_data)
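
# A hedged usage sketch for `HuggingFaceTextDataModule`. The tokenizer name, data
# path, and validation fraction below are illustrative assumptions, and
# `train_data()` is the accessor inherited from `RenateDataModule`:
#
#     tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-uncased")
#     data_module = HuggingFaceTextDataModule(
#         data_path="./data",
#         tokenizer=tokenizer,
#         dataset_name="ag_news",
#         val_size=0.1,
#     )
#     data_module.prepare_data()  # Downloads and validates the splits.
#     data_module.setup()         # Tokenizes and wraps the splits.
#     inputs, target = data_module.train_data()[0]
#     # `inputs` holds "input_ids" and "attention_mask" tensors; `target` is a tensor.
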
class MultiTextDataModule(DataIncrementalDataModule):
    """Inspired by the dataset used in "Episodic Memory in Lifelong Language Learning" by
    d’Autume et al., this is a collection of four different datasets that we call domains:
    AGNews, Yelp, DBPedia and Yahoo Answers.

    The output space is the union of the output spaces of all the domains. The dataset has
    33 classes: 4 from AGNews, 5 from Yelp, 14 from DBPedia, and 10 from Yahoo.

    The largest available size for the training set is 115000 and for the test set is 7600.

    Args:
        data_path: The path to the folder where the data files will be downloaded to.
        tokenizer: Tokenizer to apply to the dataset. See https://huggingface.co/docs/tokenizers/
            for more information on tokenizers.
        tokenizer_kwargs: Keyword arguments passed when calling the tokenizer's ``__call__``
            function. Typical options are `max_length`, `padding` and `truncation`. See
            https://huggingface.co/docs/tokenizers/ for more information on tokenizers. If
            `None` is passed, this defaults to
            `{"padding": "max_length", "max_length": 128, "truncation": True}`.
        data_id: The dataset (domain) to be used.
        train_size: The size of the data stored as training set; must be at most 115000.
        test_size: The size of the data stored as test set; must be at most 7600.
        val_size: Fraction of the training data to be used for validation.
        seed: Seed used to fix random number generation.
    """

    _multi_dataset_info = {
        "ag_news": ["text", "label"],
        "yelp_review_full": ["text", "label"],
        "dbpedia_14": ["content", "label"],
        "yahoo_answers_topics": ["question_title", "topic"],
    }
    _label_offset = {
        "ag_news": 0,
        "yelp_review_full": 4,
        "dbpedia_14": 9,
        "yahoo_answers_topics": 23,
    }

    domains = list(_multi_dataset_info)

    def __init__(
        self,
        data_path: str,
        tokenizer: transformers.PreTrainedTokenizer,
        data_id: str,
        tokenizer_kwargs: Optional[Dict[str, Any]] = None,
        train_size: int = 115000,
        test_size: int = 7600,
        val_size: float = defaults.VALIDATION_SIZE,
        seed: int = defaults.SEED,
    ):
        super().__init__(data_path=data_path, data_id=data_id, val_size=val_size, seed=seed)
        if train_size > 115000:
            raise ValueError("The `train_size` must be smaller than or equal to 115000.")
        self._train_size = train_size
        if test_size > 7600:
            raise ValueError("The `test_size` must be smaller than or equal to 7600.")
        self._test_size = test_size
        self._tokenizer = tokenizer
        self._tokenizer_kwargs = tokenizer_kwargs or defaults.TOKENIZER_KWARGS
        if data_id not in self.domains:
            raise ValueError(
                f"The selected domain is not available. Select one among {self.domains}."
            )
        self.data_id = data_id

    def prepare_data(self) -> None:
        """Download dataset."""
        for split in ["train", "test"]:
            load_dataset(self.data_id, split=split, cache_dir=self._data_path)

    def setup(self) -> None:
        """Set up train, test and val datasets."""
        rnd_gen = torch.Generator().manual_seed(self._seed)

        def preprocess(example, text_field_name, label_field_name):
            return {
                **self._tokenizer(example[text_field_name], **self._tokenizer_kwargs),
                "label": example[label_field_name]
                + MultiTextDataModule._label_offset[self.data_id],
            }

        def get_split(split_name):
            dataset = load_dataset(self.data_id, split=split_name, cache_dir=self._data_path)
            # The following is a hack needed because the output space of the new dataset is
            # the union of the output spaces of the single datasets. HF datasets check for
            # the max label id, and we need to make sure we update that. Without this change,
            # the setup will fail with a ValueError (label id > max labels).
            new_features = dataset.features.copy()
            new_features[self._multi_dataset_info[self.data_id][1]] = datasets.ClassLabel(
                num_classes=33
            )
            dataset = dataset.cast(new_features)
            if "train" == split_name:
                set_size = self._train_size
            else:
                set_size = self._test_size
            rnd_idx = torch.randint(
                high=len(dataset),
                size=(set_size,),
                generator=rnd_gen,
            ).tolist()
            dataset = dataset.select(indices=rnd_idx)
            dataset = dataset.map(
                functools.partial(
                    preprocess,
                    text_field_name=self._multi_dataset_info[self.data_id][0],
                    label_field_name=self._multi_dataset_info[self.data_id][1],
                ),
                remove_columns=list(dataset.features),
                num_proc=4,
            )
            dataset.set_format(type="torch")
            return _InputTargetWrapper(dataset)

        self._train_data = get_split("train")
        self._train_data, self._val_data = self._split_train_val_data(self._train_data)
        self._test_data = get_split("test")
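
# A hedged usage sketch for `MultiTextDataModule` (the tokenizer and the reduced
# `train_size`/`test_size` below are illustrative assumptions):
#
#     tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-uncased")
#     data_module = MultiTextDataModule(
#         data_path="./data",
#         tokenizer=tokenizer,
#         data_id="dbpedia_14",  # One of MultiTextDataModule.domains.
#         train_size=1000,
#         test_size=500,
#     )
#     data_module.prepare_data()
#     data_module.setup()
#     # DBPedia labels are shifted by `_label_offset["dbpedia_14"] == 9`, so its
#     # 14 classes map to ids 9..22 within the 33-class union output space.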