from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
tokenized_input = tokenizer(
    sentences, truncation=True, is_split_into_words=True, padding='max_length', max_length=120
)
sentences is a list of lists, where each inner list holds the tokens of one sentence:
for sen in sentences[:5]:
    print(sen)
['Thousands', 'of', 'demonstrators', 'have', 'marched', 'through', 'London', 'to', 'protest', 'the', 'war', 'in', 'Iraq', 'and', 'demand', 'the', 'withdrawal', 'of', 'British', 'troops', 'from', 'that', 'country', '.']
['Families', 'of', 'soldiers', 'killed', 'in', 'the', 'conflict', 'joined', 'the', 'protesters', 'who', 'carried', 'banners', 'with', 'such', 'slogans', 'as', '"', 'Bush', 'Number', 'One', 'Terrorist', '"', 'and', '"', 'Stop', 'the', 'Bombings', '.', '"']
['They', 'marched', 'from', 'the', 'Houses', 'of', 'Parliament', 'to', 'a', 'rally', 'in', 'Hyde', 'Park', '.']
['Police', 'put', 'the', 'number', 'of', 'marchers', 'at', '10,000', 'while', 'organizers', 'claimed', 'it', 'was', '1,00,000', '.']
['The', 'protest', 'comes', 'on', 'the', 'eve', 'of', 'the', 'annual', 'conference', 'of', 'Britain', "'s", 'ruling', 'Labor', 'Party', 'in', 'the', 'southern', 'English', 'seaside', 'resort', 'of', 'Brighton', '.']
When I run this, I get the following error:
<ipython-input-79-1d6d1ec05183> in <module>()
2 tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
3 tokenized_input = tokenizer(
----> 4 sentences, truncation=True, is_split_into_words=True, padding='max_length', max_length=120
5 )
2 frames
/usr/local/lib/python3.7/dist-packages/transformers/tokenization_utils_fast.py in _batch_encode_plus(self, batch_text_or_text_pairs, add_special_tokens, padding_strategy, truncation_strategy, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose)
407 batch_text_or_text_pairs,
408 add_special_tokens=add_special_tokens,
--> 409 is_pretokenized=is_split_into_words,
410 )
411
TypeError: PreTokenizedEncodeInput must be Union[PreTokenizedInputSequence, Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence]]
It works for some slices of sentences. For example,
tokenized_input = tokenizer(
    sentences[:76], truncation=True, is_split_into_words=True, padding='max_length', max_length=120
)
runs just fine, but if I use sentences[:77] I get the error again.
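Since sentences[:76] works and sentences[:77] fails, the problem presumably lies in sentences[76]. Here is a quick diagnostic I can run to inspect it (just a sketch, assuming sentences is the same list shown above):

# Check whether sentences[76] is really a list of plain strings,
# which is what the fast tokenizer expects when is_split_into_words=True.
suspect = sentences[76]
print(type(suspect), suspect)
if isinstance(suspect, list):
    # flag any tokens that are not str (e.g. a stray float/None left over from preprocessing)
    print([(i, tok, type(tok)) for i, tok in enumerate(suspect) if not isinstance(tok, str)])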