PyTorch: "TypeError: Caught TypeError in DataLoader worker process 0."

I am trying to implement a RoBERTa model for sentiment analysis. First, I declared a GPReviewDataset class to create a PyTorch Dataset.

import torch
from torch.utils.data import Dataset, DataLoader

MAX_LEN = 160

class GPReviewDataset(Dataset):
  def __init__(self, reviews, targets, tokenizer, max_len):
    self.reviews = reviews
    self.targets = targets
    self.tokenizer = tokenizer
    self.max_len = max_len
  def __len__(self):
    return len(self.reviews)
  def __getitem__(self, item):
    review = str(self.reviews[item])
    target = self.targets[item]
    encoding = self.tokenizer.encode_plus(
      review,
      add_special_tokens=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      pad_to_max_length=True,
      return_attention_mask=True,
      return_tensors='pt',
    )
    return {
      'review_text': review,
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'targets': torch.tensor(target, dtype=torch.long)
    }

Next, I implemented create_data_loader, a helper function that builds the data loaders:

def create_data_loader(df, tokenizer, max_len, batch_size):
  ds = GPReviewDataset(
    reviews=df.text.to_numpy(),
    targets=df.sentiment.to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len
  )
  return DataLoader(
    ds,
    batch_size=batch_size,
    num_workers=4
  )

BATCH_SIZE = 16
train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)
dt = next(iter(train_data_loader))

However, when I run this code, it stops and gives me this error:

TypeError                                 Traceback (most recent call last)
<ipython-input-35-a673c0794f60> in <module>()
----> 1 dt = next(iter(train_data_loader))

3 frames
/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py in __next__(self)
    433         if self._sampler_iter is None:
    434             self._reset()
--> 435         data = self._next_data()
    436         self._num_yielded += 1
    437         if self._dataset_kind == _DatasetKind.Iterable and \

/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py in _next_data(self)
   1083             else:
   1084                 del self._task_info[idx]
-> 1085                 return self._process_data(data)
   1086 
   1087     def _try_put_index(self):

/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py in _process_data(self, data)
   1109         self._try_put_index()
   1110         if isinstance(data, ExceptionWrapper):
-> 1111             data.reraise()
   1112         return data
   1113 

/usr/local/lib/python3.6/dist-packages/torch/_utils.py in reraise(self)
    426             # have message field
    427             raise self.exc_type(message=msg)
--> 428         raise self.exc_type(msg)
    429 
    430 

TypeError: Caught TypeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/torch/utils/data/_utils/worker.py", line 198, in _worker_loop
    data = fetcher.fetch(index)
  File "/usr/local/lib/python3.6/dist-packages/torch/utils/data/_utils/fetch.py", line 44, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/usr/local/lib/python3.6/dist-packages/torch/utils/data/_utils/fetch.py", line 44, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "<ipython-input-18-1e537ce5a428>", line 25, in __getitem__
    'targets': torch.tensor(target, dtype=torch.long)
TypeError: new(): invalid data type 'str'

I do not understand why this happens. Can anyone please help me and explain?

1 Answer

You need to define your class labels as integers. I assume you are working on a classification problem, but it looks like the labels in df.sentiment are strings. Convert them from strings to integers in a new column: for example, represent "positive" with 0 and "negative" with 1.
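The traceback points at 'targets': torch.tensor(target, dtype=torch.long) in __getitem__, and an integer tensor cannot be built from a string. You can reproduce that in isolation (purely illustrative, not tied to your data):

import torch

torch.tensor("positive", dtype=torch.long)
# TypeError: new(): invalid data type 'str'

A small helper function can do the conversion: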

def to_int_sentiment(label):
  if label == "positive":
    return 0
  elif label == "negative":
    return 1

df['int_sentiment'] = df.sentiment.apply(to_int_sentiment)
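
If df.sentiment can contain any other value (for instance a "neutral" label or missing entries), the helper above silently returns None for those rows and loading will fail again inside __getitem__. A dictionary-based mapping with a sanity check makes that explicit; this is only a sketch assuming the label names above:

label_map = {"positive": 0, "negative": 1}
df['int_sentiment'] = df.sentiment.map(label_map)
# Any label missing from label_map becomes NaN, so verify everything was mapped
assert df.int_sentiment.notna().all(), "found sentiment labels outside label_map"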

Then use the df.int_sentiment column instead of df.sentiment, which means changing the create_data_loader function as follows:

def create_data_loader(df, tokenizer, max_len, batch_size):
  ds = GPReviewDataset(
    reviews=df.text.to_numpy(),
    targets=df.int_sentiment.to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len
  )
  return DataLoader(
    ds,
    batch_size=batch_size,
    num_workers=4
  )
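
With integer targets, the batch that originally failed should now load. A quick check (assuming train_data_loader was rebuilt after adding the int_sentiment column):

dt = next(iter(train_data_loader))
print(dt['input_ids'].shape)   # torch.Size([16, 160]) with BATCH_SIZE = 16 and MAX_LEN = 160
print(dt['targets'].dtype)     # torch.int64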