I am trying to use TensorFlow Transform (1.13.0) with TensorFlow (2.12.1) as part of my pipeline, and I noticed that it does not return the result I expect.
This is what I am running:
with beam.Pipeline() as pipeline:
    with tft_beam.Context(temp_dir=os.path.join(working_dir, TRANSFORM_TEMP_DIR)):

        def preprocessing_fn(inputs):
            """Preprocess input columns into transformed columns.

            Args:
                inputs: Mapping that provides the 'text' and 'entities'
                    columns (presumably batched string tensors -- confirm
                    against raw_data_metadata).

            Returns:
                A dict with the keys 'entities', 'chars', 'word_characters',
                'words' and 'text'.
            """
            # Extract the columns and assign them to local variables.
            text = inputs['text']
            entities = inputs['entities']

            # Split the comma-separated entity string and parse each piece
            # as an int64.
            entities = tf.strings.to_number(
                tf.strings.split(entities, ','), tf.int64)

            # Split the text on whitespace into words, then each word into
            # single bytes.
            words = tf.strings.split(text)
            # NOTE: tf.strings.unicode_split(words, input_encoding='UTF-8')
            # was the alternative tried here.
            word_characters = tf.compat.v1.strings.bytes_split(words)

            # Build the character -> id table inside init_scope; known
            # characters get the ids 2 .. len(characters_list) + 1.
            # NOTE(review): per the StaticVocabularyTable docs, OOV ids start
            # at the initializer's size, which may overlap the 2-based ids
            # assigned here -- confirm.
            with tf.init_scope():
                init = tf.lookup.KeyValueTensorInitializer(
                    keys=tf.constant(list(characters_list)),
                    values=tf.constant(
                        list(range(2, len(characters_list) + 2)),
                        dtype=tf.int64))
                table = tf.lookup.StaticVocabularyTable(
                    init, num_oov_buckets=1, experimental_is_anonymous=True)

            # Map every character to its id.
            characters = table[word_characters]

            # Return the transformed data.
            return {
                'entities': entities,
                'chars': characters,
                'word_characters': word_characters,
                'words': words,
                'text': text,
            }

        transformed_dataset, transform_fun = (
            (raw_data, raw_data_metadata)
            | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn)
        )
For the following example input:
[{'text': 'Well get to Korea in a minute .', 'entities': '1,1,1,1,3,1,5,5,1'}]
I get the following results:
Transformed data:
[{'chars$ragged_values': array([61, 17, 24, 24, 19, 17, 32, 32, 27, 49, 27, 30, 17, 13, 21, 26, 13,
25, 21, 26, 33, 32, 17, 65]),
'chars$row_lengths_1': array([4, 3, 2, 5, 2, 1, 6, 1]),
'entities$ragged_values': array([1, 1, 1, 1, 3, 1, 5, 5, 1]),
'text': b'Well get to Korea in a minute .',
'word_characters$ragged_values': array([b'W', b'e', b'l', b'l', b'g', b'e', b't', b't', b'o', b'K', b'o',
b'r', b'e', b'a', b'i', b'n', b'a', b'm', b'i', b'n', b'u', b't',
b'e', b'.'], dtype=object),
dtype=object)}]
However, I expected the result to be:
Transformed data:
[{'chars$ragged_values': array([[61, 17, 24, 24], [19, 17, 32], [32, 27], [49, 27, 30, 17, 13], [21, 26],
[13], [25, 21, 26, 33, 32, 17], [65]]),
'chars$row_lengths_1': array([4, 3, 2, 5, 2, 1, 6, 1]),
'entities$ragged_values': array([1, 1, 1, 1, 3, 1, 5, 5, 1]),
'text': b'Well get to Korea in a minute .',
'word_characters$ragged_values': array([[b'W', b'e', b'l', b'l'], [b'g', b'e', b't'], [b't', b'o'],
[b'K', b'o', b'r', b'e', b'a'], [b'i', b'n'], [b'a'],
[b'm', b'i', b'n', b'u', b't', b'e'], [b'.']], dtype=object),
dtype=object)}]
This behavior only happens within the TensorFlow Transform pipeline; I get the expected result when I run the same operations outside of the pipeline.
Link to the Google Colab: enter link description here
I would appreciate any hints on this.