I am new to DistilBERT and want to use it for token classification. I have my own dataset with my own class labels. I used the following function to tokenize the text and align the labels:
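For reference, each example in my dataset looks roughly like this (the values are just illustrative; the column names match what the function below expects):

example = {
    "tokens": ["John", "lives", "in", "Berlin"],  # pre-split words
    "ner_tags": [1, 0, 0, 2],                     # one integer label per word
}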
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
label_all_tokens = True

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding=True,
    )
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id of None. Set their label to -100
            # so they are automatically ignored by the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # Set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens of a word, use either the word's label or
            # -100, depending on the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs
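I apply the function with Dataset.map, roughly like this (a minimal sketch; raw_datasets is just the name I use for my own datasets.DatasetDict):

tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,  # the function indexes examples["ner_tags"] per batch item
)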
When I run it, I get this error:
TypeError: PreTokenizedEncodeInput must be Union[PreTokenizedInputSequence, Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence]]
It would be great if anyone could help. Thanks.
Screenshot of the full error: https://i.stack.imgur.com/kP4J6.jpg