How to create a huggingface dataset and import from a list?

209 Views Asked by At

I have a list that I want to convert to a Hugging Face dataset for training a model. I followed some tips, and here is my code:

from datasets import Dataset
class MkqaChineseDataset(Dataset):
    """Attempt to wrap an in-memory list of samples as a `datasets.Dataset` subclass.

    NOTE: this class reproduces the problem being asked about. Constructing it
    fails with ``AttributeError: can't set attribute`` on ``self.data = data``.
    """

    def __init__(self, data):
        """Try to store ``data`` (a list of sample dicts) on the instance."""
        # Calling super().__init__() here fails with:
        # TypeError: __init__() missing 1 required positional argument: 'arrow_table'
        # The assignment below is the line that raises the reported AttributeError;
        # presumably the base class already exposes `data` as a read-only attribute.
        self.data = data
        
    def __len__(self):
        """Return the number of stored samples."""
        return len(self.data)
    
    def __getitem__(self, idx):
        """Return the sample at ``idx`` as a dict of model inputs.

        ``labels`` is taken from the same ``input_ids`` entry as the inputs.
        """
        sample = self.data[idx]
        return {
            "input_ids": sample["input_ids"], 
            "attention_mask": sample["attention_mask"],
            "labels":sample["input_ids"]
        }


# Three toy samples, each a dict with an `input_ids` and an `attention_mask` tensor.
# NOTE: `torch` is used here but no `import torch` appears in the snippet as posted.
buffer_test = [
    {'input_ids': torch.Tensor([9437,29,210]), 'attention_mask': torch.Tensor([1, 1, 1])},
    {'input_ids': torch.Tensor([37,9,211]), 'attention_mask':  torch.Tensor([1, 1, 1])},
    {'input_ids': torch.Tensor([937,19,212]), 'attention_mask': torch.Tensor([1, 1, 1])}
]
print(buffer_test)
# Instantiating the subclass is where the reported AttributeError is raised.
mkqa = MkqaChineseDataset(buffer_test)
# Check whether the wrapper is recognised as a `datasets.Dataset`.
res = isinstance(mkqa, Dataset)
print(res)

However, it raises an AttributeError:

    self.data = data
AttributeError: can't set attribute
1

There is 1 best solution below

0
On

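The error occurs because `datasets.Dataset` already defines `data` as a read-only property, so `self.data = data` cannot be assigned in a subclass. Instead of subclassing, you can use `Dataset.from_list`, e.g. in your case: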

# Imports added so the snippet runs on its own.
import torch
from datasets import Dataset

# One dict per row; every dict uses the same keys, which become the dataset columns.
# NOTE: torch.Tensor([...]) produces float32 values. If integer token ids are
# required downstream, build the tensors with torch.tensor([...]) instead.
buffer_test = [
    {'input_ids': torch.Tensor([9437, 29, 210]), 'attention_mask': torch.Tensor([1, 1, 1])},
    {'input_ids': torch.Tensor([37, 9, 211]), 'attention_mask': torch.Tensor([1, 1, 1])},
    {'input_ids': torch.Tensor([937, 19, 212]), 'attention_mask': torch.Tensor([1, 1, 1])},
]

# Build the dataset directly from the list of row dicts; no Dataset subclass is needed.
ds = Dataset.from_list(buffer_test)