Why is keras slower than estimator in TensorFlow model trainning?

170 Views Asked by At

I trainning the same model through keras and estimator with TensorFlow in CPU, and found that keras was much slower than estimator.

batch_size = 256
emb_hash_bucket_size = 5000000
emb_dimension = 100


def get_dataset():
    def data_generator():
        sample = random.randint(0, emb_hash_bucket_size - 1)
        label = random.random()
        while True:
            yield sample, label

    dataset = tf.compat.v1.data.Dataset.from_generator(
        data_generator,
        output_signature=(
            tf.TensorSpec(shape=(), dtype=tf.int64),
            tf.TensorSpec(shape=(), dtype=tf.float32),
        )
    )
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
    return dataset


class Model(tf.keras.models.Model):
    def __init__(self, *args, **kwargs):
        super(Model, self).__init__(*args, **kwargs)

        feature_columns = [
            tf.feature_column.embedding_column(
                tf.feature_column.categorical_column_with_hash_bucket(
                    "id", hash_bucket_size=emb_hash_bucket_size, dtype=tf.int64),
                dimension=emb_dimension,
            ),
        ]
        self.dense_features = tf.keras.layers.DenseFeatures(
            feature_columns, trainable=True, name="embeddings")
        self.dense = tf.keras.layers.Dense(1, trainable=True, activation=tf.nn.relu)

    def call(self, inputs, *args, **kwargs):
        features = self.dense_features({"id": inputs})
        return self.dense(features)

def estimator_train():
    def model_fn(features, labels, mode, params):
        model = Model()
        outputs = model(features)

        loss = tf.reduce_mean(tf.math.abs(tf.subtract(outputs, labels)))

        optimizer = tf.compat.v1.train.AdagradOptimizer(learning_rate=0.01)
        train_op = optimizer.minimize(loss, global_step=tf.compat.v1.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        model_dir="./checkpoint",
        config=tf.estimator.RunConfig(
            save_checkpoints_secs=None, save_checkpoints_steps=None, log_step_count_steps=10),
    )
    estimator.train(input_fn=get_dataset)


def keras_train():
    model = Model()

    def loss(y_true, y_pred):
        return tf.reduce_mean(tf.math.abs(tf.subtract(y_pred, y_true)))

    optimizer = tf.keras.optimizers.Adagrad(learning_rate=0.01)
    model.compile(optimizer=optimizer, loss=loss)
    model.fit(get_dataset(), batch_size=batch_size, epochs=1)
  • Using TensorFlow 2.5.0 (docker image tensorflow/tensorflow:2.5.0)

    Run estimator_train() outputs:

    INFO:tensorflow:global_step/sec: 39.5762 INFO:tensorflow:loss = 0.008541529, step = 40 (0.253 sec) INFO:tensorflow:global_step/sec: 41.4785 INFO:tensorflow:loss = 0.007802263, step = 50 (0.241 sec) INFO:tensorflow:global_step/sec: 44.8391 INFO:tensorflow:loss = 0.0073087374, step = 60 (0.223 sec) INFO:tensorflow:global_step/sec: 40.6107 INFO:tensorflow:loss = 0.006965399, step = 70 (0.246 sec)

    But run keras_train() outputs:

    181/Unknown - 372s 2s/step - loss: 0.0237

  • Using TensorFlow 2.6.0 (docker image tensorflow/tensorflow:2.6.0)

    Run estimator_train() outputs:

    INFO:tensorflow:global_step/sec: 0.494564 INFO:tensorflow:loss = 0.2802072, step = 40 (20.220 sec) INFO:tensorflow:global_step/sec: 0.496139 INFO:tensorflow:loss = 0.2802072, step = 50 (20.156 sec) INFO:tensorflow:global_step/sec: 0.495584 INFO:tensorflow:loss = 0.2802072, step = 60 (20.178 sec) INFO:tensorflow:global_step/sec: 0.496082 INFO:tensorflow:loss = 0.2802072, step = 70 (20.158 sec)

    But run keras_train() outputs:

    30/Unknown - 62s 2s/step - loss: 0.1707

0

There are 0 best solutions below