Currently my custom dataset yields None values in the DataLoader, but NOT in the pure dataset; it only fails when I wrap it in a PyTorch DataLoader.
The code is in Colab, but I'll put it here in case the Colab dies someday:
pip install datasets
pip install torch
pip install transformers
then run:
token = None
batch_size = 10
from datasets import load_dataset
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token
probe_network = GPT2LMHeadModel.from_pretrained("gpt2")
device = torch.device(f"cuda:{0}" if torch.cuda.is_available() else "cpu")
probe_network = probe_network.to(device)
# -- Get batch from dataset
# path, name = 'brando/debug1_af', 'debug1_af'
path, name = 'brando/debug0_af', 'debug0_af'
remove_columns = []
dataset = load_dataset(path, name, streaming=True, split="train", token=token).with_format("torch")
print(f'{dataset=}')
batch = dataset.take(batch_size)
# print(f'{next(iter(batch))=}')
# - Prepare functions to tokenize batch
def preprocess(examples):  # tokenize the raw text; "link" is the name of the text column in this dataset's table
    return tokenizer(examples["link"], padding="max_length", max_length=128, truncation=True, return_tensors="pt")
def map(batch):  # apply preprocess to every example in the batch, represented as a dataset
    return batch.map(preprocess, batched=True, remove_columns=remove_columns)
tokenized_batch = map(batch)
# print(f'{next(iter(tokenized_batch))=}')
from torch.utils.data import Dataset, DataLoader, SequentialSampler
dataset = tokenized_batch
print(f'{type(dataset)=}')
print(f'{dataset.__class__=}')
print(f'{isinstance(dataset, Dataset)=}')
# for i, d in enumerate(dataset):
#     assert isinstance(d, dict)
#     # dd = dataset[i]
#     # assert isinstance(dd, dict)
loader_opts = {}
classifier_opts = {}
# data_loader = DataLoader(dataset, shuffle=False, batch_size=loader_opts.get('batch_size', 1),
#                          num_workers=loader_opts.get('num_workers', 0), drop_last=False, sampler=SequentialSampler(range(512)))
data_loader = DataLoader(dataset, shuffle=False, batch_size=loader_opts.get('batch_size', 1),
                         num_workers=loader_opts.get('num_workers', 0), drop_last=False, sampler=None)
print(f'{iter(data_loader)=}')
print(f'{next(iter(data_loader))=}')
print('Done\a')
Error:
dataset=<datasets.iterable_dataset.IterableDataset object at 0x7e42c2f21d20>
type(dataset)=<class 'datasets.iterable_dataset.IterableDataset'>
dataset.__class__=<class 'datasets.iterable_dataset.IterableDataset'>
isinstance(dataset, Dataset)=True
iter(data_loader)=<torch.utils.data.dataloader._SingleProcessDataLoaderIter object at 0x7e42c2f21660>
/usr/local/lib/python3.10/dist-packages/datasets/formatting/torch_formatter.py:68: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py in collate(batch, collate_fn_map)
126 try:
--> 127 return elem_type({key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem})
128 except TypeError:
[... 9 frames ...]
/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py in <dictcomp>(.0)
126 try:
--> 127 return elem_type({key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem})
128 except TypeError:
/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py in collate(batch, collate_fn_map)
149
--> 150 raise TypeError(default_collate_err_msg_format.format(elem_type))
151
TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'NoneType'>
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
<ipython-input-6-1153c5915bd8> in <cell line: 49>()
47 num_workers=loader_opts.get('num_workers', 0), drop_last=False, sampler=None)
48 print(f'{iter(data_loader)=}')
---> 49 print(f'{next(iter(data_loader))=}')
50 print('Done\a')
/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py in __next__(self)
631 # TODO(https://github.com/pytorch/pytorch/issues/76750)
632 self._reset() # type: ignore[call-arg]
--> 633 data = self._next_data()
634 self._num_yielded += 1
635 if self._dataset_kind == _DatasetKind.Iterable and \
/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py in _next_data(self)
675 def _next_data(self):
676 index = self._next_index() # may raise StopIteration
--> 677 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
678 if self._pin_memory:
679 data = _utils.pin_memory.pin_memory(data, self._pin_memory_device)
/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/fetch.py in fetch(self, possibly_batched_index)
40 else:
41 data = next(self.dataset_iter)
---> 42 return self.collate_fn(data)
43
44
/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py in default_collate(batch)
263 >>> default_collate(batch) # Handle `CustomType` automatically
264 """
--> 265 return collate(batch, collate_fn_map=default_collate_fn_map)
/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py in collate(batch, collate_fn_map)
128 except TypeError:
129 # The mapping type may not support `__init__(iterable)`.
--> 130 return {key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem}
131 elif isinstance(elem, tuple) and hasattr(elem, '_fields'): # namedtuple
132 return elem_type(*(collate(samples, collate_fn_map=collate_fn_map) for samples in zip(*batch)))
/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py in <dictcomp>(.0)
128 except TypeError:
129 # The mapping type may not support `__init__(iterable)`.
--> 130 return {key: collate([d[key] for d in batch], collate_fn_map=collate_fn_map) for key in elem}
131 elif isinstance(elem, tuple) and hasattr(elem, '_fields'): # namedtuple
132 return elem_type(*(collate(samples, collate_fn_map=collate_fn_map) for samples in zip(*batch)))
/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py in collate(batch, collate_fn_map)
148 return [collate(samples, collate_fn_map=collate_fn_map) for samples in transposed]
149
--> 150 raise TypeError(default_collate_err_msg_format.format(elem_type))
151
152
TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'NoneType'>
Why is this error happening?
I've done all the checks, e.g., made sure the returned examples are dicts, and even went into detailed debugging with pdb inside PyTorch's code.
The mapped dataset still carries all of the original raw columns, because .map was called with remove_columns=[]; evidently at least one of those columns is None for some example, and default_collate cannot handle None, which is exactly the found <class 'NoneType'> failure above. There are two fixes. Without using a custom collate fn: remove the raw columns during .map, so that default collate only ever sees the tokenizer's tensor output.
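A minimal sketch of that first option, assuming "link" is the only raw column in this dataset (check your dataset's features to list them all; the column list below is an assumption, not from the original post):

# Drop the raw columns during .map so the tokenized dataset only contains
# the tokenizer's outputs (input_ids, attention_mask), which are tensors
# that default_collate can stack without hitting None values.
remove_columns = ["link"]  # hypothetical: list every raw column of your dataset here
tokenized_batch = batch.map(preprocess, batched=True, remove_columns=remove_columns)
data_loader = DataLoader(tokenized_batch, shuffle=False, batch_size=1)
print(f'{next(iter(data_loader))=}')  # should now be a dict of tensors, no None values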
Otherwise, write your own collate fn, e.g.:
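A minimal sketch of the second option (the function name and the kept keys are illustrative assumptions): keep only the tokenizer's tensor fields and stack them, silently dropping everything else, including None-valued raw columns:

import torch
from torch.utils.data import DataLoader

def collate_fn(examples):
    # Keep only the tensor fields produced by the tokenizer; raw columns
    # (strings or None) are dropped so stacking cannot fail on NoneType.
    keep = ("input_ids", "attention_mask")
    return {k: torch.stack([torch.as_tensor(ex[k]) for ex in examples])
            for k in keep if k in examples[0]}

data_loader = DataLoader(dataset, shuffle=False, batch_size=loader_opts.get('batch_size', 1),
                         num_workers=loader_opts.get('num_workers', 0), drop_last=False,
                         collate_fn=collate_fn)
print(f'{next(iter(data_loader))=}')

Because every example was padded to max_length=128 during tokenization, torch.stack sees tensors of identical shape and each key in the batch comes out as (batch_size, 128).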