What is a fast way to read tfrecords, add a field, and write them back out? Below is a program that does this, given a directory of *.tfrecords files. Say doing this
example = tf.train.Example()
example.ParseFromString(rec_str.numpy())
output_string = example.SerializeToString()
writer.write(output_string)
for each record in the input is one unit of time, then adding
features = dict(example.features.feature.items())
features["pred"] = tf.train.Feature(float_list=tf.train.FloatList(value=prediction_output))
example = tf.train.Example(features=tf.train.Features(feature=features))
before serializing and writing back is like 3 units of time.
It seems to add a lot — my tfrecords have around 50 scalar fields in them, and I'm only adding one more. Does anyone know of a more efficient way to do this? A full program is below. You can experiment with gzip compression, but that didn't seem to make much difference for my files.
import os
import time
import glob
import logging
import numpy as np
import tensorflow as tf
def write_tfrecords_with_ap(input_dir, output_dir, write_zip, add_val, num_files):
    """Copy ``*.tfrecords`` files from input_dir to output_dir, optionally
    adding a scalar float feature ``"pred"`` to every record.

    Args:
        input_dir: directory containing GZIP-compressed ``*.tfrecords`` files.
        output_dir: directory the copies are written to; created if missing.
        write_zip: if True, GZIP-compress the output files.
        add_val: if True, add a ``"pred"`` float feature to each record.
        num_files: process at most this many input files.

    Raises:
        FileNotFoundError: if no ``*.tfrecords`` files are found in input_dir.
    """
    t0_all = time.time()
    tm_write = 0.0
    tm_feat_dict = 0.0
    os.makedirs(output_dir, exist_ok=True)
    # Lazy %-style args avoid f-string formatting work when INFO is disabled.
    logging.info("Writing new tfrecords to %s", output_dir)
    input_files = glob.glob(os.path.join(input_dir, "*.tfrecords"))[:num_files]
    if not input_files:
        # An assert would be stripped under ``python -O``; raise instead.
        raise FileNotFoundError(f"No *.tfrecords files found in {input_dir}")
    prediction_output = np.empty(shape=(1,), dtype=np.float32)
    num_records = 0
    for input_tfrec_file in input_files:
        print(f"input {input_tfrec_file}")
        t0_write = time.time()
        # NOTE(review): reading always assumes GZIP input while writing is
        # only compressed when write_zip is set — confirm this asymmetry is
        # intentional.
        input_dset = tf.data.TFRecordDataset(input_tfrec_file, compression_type="GZIP")
        basename = os.path.basename(input_tfrec_file)
        output_tfrec_file = os.path.join(output_dir, basename)
        options = tf.io.TFRecordOptions(compression_type="GZIP") if write_zip else None
        with tf.io.TFRecordWriter(output_tfrec_file, options=options) as writer:
            for rec_str in input_dset:
                num_records += 1
                example = tf.train.Example()
                example.ParseFromString(rec_str.numpy())
                if add_val:
                    t0_feat_dict = time.time()
                    prediction_output[0] = 3.0
                    # Mutate the parsed proto in place instead of copying the
                    # feature map and rebuilding a whole new Example. This
                    # skips re-encoding the existing ~50 features and is the
                    # main speedup over the original dict-copy approach.
                    example.features.feature["pred"].float_list.value[:] = prediction_output
                    tm_feat_dict += time.time() - t0_feat_dict
                writer.write(example.SerializeToString())
        print(f"out {output_tfrec_file}")
        tm_write += time.time() - t0_write
    tm_all = time.time() - t0_all
    # Report the number of files actually processed (the directory may hold
    # fewer than num_files), not the requested cap.
    print(
        f"Wrote {len(input_files)} tfrecords files with {num_records} records. minutes: total={round(tm_all/60.0, 2)} "
        f"writing={round(tm_write/60.0, 2)} "
        f"feat_dict={round(tm_feat_dict/60.0, 2)}"
    )
Iterating over the entire
Dataset
(= loading every sample into memory), and writing it back to disk again after appending your data is the fastest way to do it. As the IO is the most time-consuming step, I don't think you'll gain much time by optimizing your code. You would need to append to the existing files, but you can't do that with tfrecords
, and it won't be implemented any time soon: How to append data to TensorFlow tfrecords file.