Preprocessing Datasets Using PyTorch

PyTorch

Published June 26, 2023

load and explore the GLUE SST-2 dataset

GLUE SST-2 (the Stanford Sentiment Treebank) is a dataset of movie-review sentences, each labeled with a binary sentiment (negative or positive).

import torch
from transformers import AutoTokenizer, DataCollatorWithPadding
from torch.utils.data import DataLoader
from datasets import load_dataset

raw_datasets = load_dataset("glue", "sst2")

raw_datasets
DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})
raw_datasets['train'].features
{'sentence': Value(dtype='string', id=None),
 'label': ClassLabel(names=['negative', 'positive'], id=None),
 'idx': Value(dtype='int32', id=None)}
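
To get a feel for the raw data, index into a split and map the integer label back to its name (a small sketch; the exact sentence you see depends on the dataset version):

example = raw_datasets['train'][0]
print(example['sentence'], example['label'])
# ClassLabel.int2str converts a label id back to its name ('negative' / 'positive')
print(raw_datasets['train'].features['label'].int2str(example['label']))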

use the BERT tokenizer for tokenization

tokenize the dataset using the Dataset.map() function

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# no padding at this stage; padding is applied per batch later by the data collator
def tokenize_function(batch):
    return tokenizer(batch["sentence"], truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True).with_format('pytorch')
train_dataset = tokenized_datasets['train']
train_dataset
Dataset({
    features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 67349
})
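
The three new columns are exactly what the tokenizer returns; a minimal sketch on a single made-up sentence:

# tokenizing one sentence produces the same fields that map() added to the dataset
encoded = tokenizer("a gripping, beautifully shot film")  # made-up example sentence
print(encoded.keys())
print(encoded['input_ids'])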
# remove the raw text and index columns; the data collator only handles tensor-convertible fields

train_dataset = train_dataset.remove_columns(['sentence', 'idx'])
train_dataset
Dataset({
    features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 67349
})
train_dataset[0]
{'label': tensor(0),
 'input_ids': tensor([  101,  5342,  2047,  3595,  8496,  2013,  1996, 18643,  3197,   102]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])}
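
As a sanity check, the input_ids can be decoded back into text (a sketch; the output will include the special [CLS] and [SEP] tokens that the tokenizer added):

# decode the first training example's ids back into words
print(tokenizer.decode(train_dataset[0]['input_ids']))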

use DataCollatorWithPadding for dynamic padding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# try it with some samples
samples = train_dataset[:10]
print([len(x) for x in samples["input_ids"]])

batch = data_collator(samples)
{k: v.shape for k, v in batch.items()}
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
[10, 11, 15, 10, 22, 13, 29, 6, 13, 16]
{'input_ids': torch.Size([10, 29]),
 'token_type_ids': torch.Size([10, 29]),
 'attention_mask': torch.Size([10, 29]),
 'labels': torch.Size([10])}
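Each tensor is padded to length 29, the longest of the ten samples, rather than to a global maximum. For comparison, a sketch of static padding at tokenization time (max_length=128 here is an arbitrary choice, not something used elsewhere in this post):

# static padding: every example is padded to one fixed length up front
def tokenize_with_static_padding(batch):
    return tokenizer(batch["sentence"], truncation=True, padding="max_length", max_length=128)

Dynamic padding is usually preferable for a dataset like SST-2, where sentence lengths vary a lot: batches of short sentences stay short, so less compute is wasted on padding tokens.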
# instantiate a train dataloader
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=data_collator)
it = iter(train_dataloader)
for _ in range(2):
    batch = next(it)
    print({k: batch[k].shape for k in batch.keys()})
{'input_ids': torch.Size([32, 40]), 'token_type_ids': torch.Size([32, 40]), 'attention_mask': torch.Size([32, 40]), 'labels': torch.Size([32])}
{'input_ids': torch.Size([32, 46]), 'token_type_ids': torch.Size([32, 46]), 'attention_mask': torch.Size([32, 46]), 'labels': torch.Size([32])}
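
The validation split can be wrapped the same way, reusing the collator (a sketch; no shuffling is needed for evaluation):

# build an evaluation dataloader from the validation split, dropping the same raw columns
eval_dataset = tokenized_datasets['validation'].remove_columns(['sentence', 'idx'])
eval_dataloader = DataLoader(eval_dataset, batch_size=32, shuffle=False, collate_fn=data_collator)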

how many batches?

len(list(train_dataloader))
2105
train_dataset.num_rows / 32
2104.65625
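
Since 67349 is not divisible by 32, the DataLoader rounds up and the final batch holds only the remaining 21 examples. A quick sketch to confirm, which also avoids materializing every batch the way list() does:

import math

# number of batches is ceil(num_rows / batch_size); the remainder is the size of the last batch
print(math.ceil(train_dataset.num_rows / 32))  # 2105
print(train_dataset.num_rows % 32)             # 21
# len() on the dataloader gives the batch count directly
print(len(train_dataloader))                   # 2105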