from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab import pathlib import tensorflow as tf
BATCH_SIZE = 1028
text_dataset = tf.keras.utils.text_dataset_from_directory(pathlib.Path('path').parent, labels="inferred", label_mode="int", batch_size=BATCH_SIZE) text_examples = tf.keras.utils.text_dataset_from_directory(pathlib.Path('anotherpath').parent, subset='both', labels="inferred", label_mode="int", batch_size=BATCH_SIZE, validation_split=0.2, seed=2)
bert_tokenizer_params=dict(lower_case=True) reserved_tokens=["[PAD]", "[UNK]", "[START]", "[END]"]
bert_vocab_args = dict(
# Arguments for text.BertTokenizer
bert_tokenizer_params=bert_tokenizer_params,
vocab_size=1048576,
reserved_tokens=reserved_tokens,
# Arguments for wordpiece_vocab.wordpiece_tokenizer_learner_lib.learn
learn_params={},
)
train_examles = text_dataset, text_examples myvocab = bert_vocab.bert_vocab_from_dataset( train_examles, **bert_vocab_args)
def write_vocab_file(filepath, vocab): with open(filepath, 'w') as f: for token in vocab: print(token, file=f)
write_vocab_file('vocab.txt', myvocab)
Traceback (most recent call last): File ".py", line 28, in myvocab = bert_vocab.bert_vocab_from_dataset( File ".venv\lib\site-packages\tensorflow_text\tools\wordpiece_vocab\bert_vocab_from_dataset.py", line 82, in bert_vocab_from_dataset element_spec = dataset.element_spec AttributeError: 'tuple' object has no attribute 'element_spec'
try write vocab from https://www.tensorflow.org/text/guide/subwords_tokenizer?hl=ru, but cant.