# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""NCF framework to train and evaluate the NeuMF model. | |
The NeuMF model assembles both MF and MLP models under the NCF framework. Check | |
`neumf_model.py` for more details about the models. | |
""" | |

import json
import os

# pylint: disable=g-bad-import-order
from absl import app
from absl import flags
from absl import logging
import tensorflow as tf, tf_keras
# pylint: enable=g-bad-import-order

from official.common import distribute_utils
from official.recommendation import constants as rconst
from official.recommendation import movielens
from official.recommendation import ncf_common
from official.recommendation import ncf_input_pipeline
from official.recommendation import neumf_model
from official.utils.flags import core as flags_core
from official.utils.misc import keras_utils
from official.utils.misc import model_helpers

FLAGS = flags.FLAGS


def metric_fn(logits, dup_mask, match_mlperf):
  """Returns in-top-k indicators and metric weights for hit-rate computation."""
  dup_mask = tf.cast(dup_mask, tf.float32)
  # Drop the leading column of zeros that was concatenated onto the raw
  # logits in _get_keras_model, leaving only the model's scores.
  logits = tf.slice(logits, [0, 1], [-1, -1])
  in_top_k, _, metric_weights, _ = neumf_model.compute_top_k_and_ndcg(
      logits, dup_mask, match_mlperf)
  metric_weights = tf.cast(metric_weights, tf.float32)
  return in_top_k, metric_weights


class MetricLayer(tf_keras.layers.Layer):
  """Custom layer of metrics for NCF model."""

  def __init__(self, match_mlperf):
    super(MetricLayer, self).__init__()
    self.match_mlperf = match_mlperf

  def get_config(self):
    return {"match_mlperf": self.match_mlperf}

  @classmethod
  def from_config(cls, config, custom_objects=None):
    return cls(**config)

  def call(self, inputs, training=False):
    logits, dup_mask = inputs

    if training:
      hr_sum = 0.0
      hr_count = 0.0
    else:
      metric, metric_weights = metric_fn(logits, dup_mask, self.match_mlperf)
      hr_sum = tf.reduce_sum(metric * metric_weights)
      hr_count = tf.reduce_sum(metric_weights)

    self.add_metric(hr_sum, name="hr_sum", aggregation="mean")
    self.add_metric(hr_count, name="hr_count", aggregation="mean")
    return logits
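
# Note: hr_sum and hr_count are exported with mean aggregation above; the
# overall hit rate is recovered downstream as hr_sum / hr_count (see the
# Keras evaluate() handling in run_ncf).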


class LossLayer(tf_keras.layers.Layer):
  """Pass-through loss layer for NCF model."""

  def __init__(self, loss_normalization_factor):
    # The loss may overflow in float16, so we use float32 instead.
    super(LossLayer, self).__init__(dtype="float32")
    self.loss_normalization_factor = loss_normalization_factor
    self.loss = tf_keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction="sum")

  def get_config(self):
    return {"loss_normalization_factor": self.loss_normalization_factor}

  @classmethod
  def from_config(cls, config, custom_objects=None):
    return cls(**config)

  def call(self, inputs):
    logits, labels, valid_pt_mask_input = inputs
    loss = self.loss(
        y_true=labels, y_pred=logits, sample_weight=valid_pt_mask_input)
    loss = loss * (1.0 / self.loss_normalization_factor)
    self.add_loss(loss)
    return logits


class IncrementEpochCallback(tf_keras.callbacks.Callback):
  """A callback to increase the requested epoch for the data producer.

  The data producer can only buffer a limited amount of data, so it keeps a
  moving window to represent the buffer. This callback advances one of the
  window's boundaries at the start of each epoch.
  """

  def __init__(self, producer):
    self._producer = producer

  def on_epoch_begin(self, epoch, logs=None):
    self._producer.increment_request_epoch()


class CustomEarlyStopping(tf_keras.callbacks.Callback):
  """Stops training once the monitored metric reaches a desired hit rate."""

  def __init__(self, monitor, desired_value):
    super(CustomEarlyStopping, self).__init__()

    self.monitor = monitor
    self.desired = desired_value
    self.stopped_epoch = 0

  def on_epoch_end(self, epoch, logs=None):
    current = self.get_monitor_value(logs)
    if current and current >= self.desired:
      self.stopped_epoch = epoch
      self.model.stop_training = True

  def on_train_end(self, logs=None):
    if self.stopped_epoch > 0:
      print("Epoch %05d: early stopping" % (self.stopped_epoch + 1))

  def get_monitor_value(self, logs):
    logs = logs or {}
    monitor_value = logs.get(self.monitor)
    if monitor_value is None:
      logging.warning(
          "Early stopping conditioned on metric `%s` "
          "which is not available. Available metrics are: %s", self.monitor,
          ",".join(list(logs.keys())))
    return monitor_value
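
# Example use (mirroring run_ncf below, assuming early stopping is enabled):
#   CustomEarlyStopping("val_HR_METRIC", desired_value=FLAGS.hr_threshold)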


def _get_keras_model(params):
  """Constructs and returns the model."""
  batch_size = params["batch_size"]

  user_input = tf_keras.layers.Input(
      shape=(1,), name=movielens.USER_COLUMN, dtype=tf.int32)
  item_input = tf_keras.layers.Input(
      shape=(1,), name=movielens.ITEM_COLUMN, dtype=tf.int32)
  valid_pt_mask_input = tf_keras.layers.Input(
      shape=(1,), name=rconst.VALID_POINT_MASK, dtype=tf.bool)
  dup_mask_input = tf_keras.layers.Input(
      shape=(1,), name=rconst.DUPLICATE_MASK, dtype=tf.int32)
  label_input = tf_keras.layers.Input(
      shape=(1,), name=rconst.TRAIN_LABEL_KEY, dtype=tf.bool)

  base_model = neumf_model.construct_model(user_input, item_input, params)
  logits = base_model.output

  # Prepend a column of zeros so each example has two-class logits; with the
  # real logit in class 1, sparse softmax cross entropy on the binary labels
  # is equivalent to sigmoid cross entropy on the single logit.
  zeros = tf_keras.layers.Lambda(lambda x: x * 0)(logits)
  softmax_logits = tf_keras.layers.concatenate([zeros, logits], axis=-1)

  # Custom training loop calculates loss and metric as a part of
  # training/evaluation step function.
  if not params["keras_use_ctl"]:
    softmax_logits = MetricLayer(
        params["match_mlperf"])([softmax_logits, dup_mask_input])
    # TODO(b/134744680): Use model.add_loss() instead once the API is well
    # supported.
    softmax_logits = LossLayer(batch_size)(
        [softmax_logits, label_input, valid_pt_mask_input])

  keras_model = tf_keras.Model(
      inputs={
          movielens.USER_COLUMN: user_input,
          movielens.ITEM_COLUMN: item_input,
          rconst.VALID_POINT_MASK: valid_pt_mask_input,
          rconst.DUPLICATE_MASK: dup_mask_input,
          rconst.TRAIN_LABEL_KEY: label_input
      },
      outputs=softmax_logits)

  keras_model.summary()
  return keras_model


def run_ncf(_):
  """Run NCF training and eval with Keras."""
  keras_utils.set_session_config(enable_xla=FLAGS.enable_xla)

  if FLAGS.seed is not None:
    print("Setting tf seed")
    tf.random.set_seed(FLAGS.seed)

  model_helpers.apply_clean(FLAGS)

  if FLAGS.dtype == "fp16" and FLAGS.fp16_implementation == "keras":
    tf_keras.mixed_precision.set_global_policy("mixed_float16")

  strategy = distribute_utils.get_distribution_strategy(
      distribution_strategy=FLAGS.distribution_strategy,
      num_gpus=FLAGS.num_gpus,
      tpu_address=FLAGS.tpu)

  params = ncf_common.parse_flags(FLAGS)
  params["distribute_strategy"] = strategy
  params["use_tpu"] = (FLAGS.distribution_strategy == "tpu")

  if params["use_tpu"] and not params["keras_use_ctl"]:
    logging.error("Custom training loop must be used when using TPUStrategy.")
    return

  batch_size = params["batch_size"]
  time_callback = keras_utils.TimeHistory(batch_size, FLAGS.log_steps)
  callbacks = [time_callback]

  producer, input_meta_data = None, None
  generate_input_online = params["train_dataset_path"] is None

  if generate_input_online:
    # Start data producing thread.
    num_users, num_items, _, _, producer = ncf_common.get_inputs(params)
    producer.start()
    per_epoch_callback = IncrementEpochCallback(producer)
    callbacks.append(per_epoch_callback)
  else:
    assert params["eval_dataset_path"] and params["input_meta_data_path"]
    with tf.io.gfile.GFile(params["input_meta_data_path"], "rb") as reader:
      input_meta_data = json.loads(reader.read().decode("utf-8"))
      num_users = input_meta_data["num_users"]
      num_items = input_meta_data["num_items"]

  params["num_users"], params["num_items"] = num_users, num_items

  if FLAGS.early_stopping:
    early_stopping_callback = CustomEarlyStopping(
        "val_HR_METRIC", desired_value=FLAGS.hr_threshold)
    callbacks.append(early_stopping_callback)

  (train_input_dataset, eval_input_dataset, num_train_steps,
   num_eval_steps) = ncf_input_pipeline.create_ncf_input_data(
       params, producer, input_meta_data, strategy)
  steps_per_epoch = None if generate_input_online else num_train_steps

  with distribute_utils.get_strategy_scope(strategy):
    keras_model = _get_keras_model(params)

    optimizer = tf_keras.optimizers.Adam(
        learning_rate=params["learning_rate"],
        beta_1=params["beta1"],
        beta_2=params["beta2"],
        epsilon=params["epsilon"])

    if FLAGS.fp16_implementation == "graph_rewrite":
      optimizer = \
          tf.compat.v1.train.experimental.enable_mixed_precision_graph_rewrite(
              optimizer,
              loss_scale=flags_core.get_loss_scale(FLAGS,
                                                   default_for_fp16="dynamic"))
    elif FLAGS.dtype == "fp16":
      loss_scale = flags_core.get_loss_scale(FLAGS, default_for_fp16="dynamic")
      # Note Model.compile automatically wraps the optimizer with a
      # LossScaleOptimizer using dynamic loss scaling. We explicitly wrap it
      # here for the case where a custom training loop or fixed loss scale is
      # used.
      if loss_scale == "dynamic":
        optimizer = tf_keras.mixed_precision.LossScaleOptimizer(optimizer)
      else:
        optimizer = tf_keras.mixed_precision.LossScaleOptimizer(
            optimizer, dynamic=False, initial_scale=loss_scale)

    if params["keras_use_ctl"]:
      train_loss, eval_results = run_ncf_custom_training(
          params,
          strategy,
          keras_model,
          optimizer,
          callbacks,
          train_input_dataset,
          eval_input_dataset,
          num_train_steps,
          num_eval_steps,
          generate_input_online=generate_input_online)
    else:
      keras_model.compile(optimizer=optimizer, run_eagerly=FLAGS.run_eagerly)

      if not FLAGS.ml_perf:
        # Create Tensorboard summary and checkpoint callbacks.
        summary_dir = os.path.join(FLAGS.model_dir, "summaries")
        summary_callback = tf_keras.callbacks.TensorBoard(
            summary_dir, profile_batch=0)
        checkpoint_path = os.path.join(FLAGS.model_dir, "checkpoint")
        checkpoint_callback = tf_keras.callbacks.ModelCheckpoint(
            checkpoint_path, save_weights_only=True)
        callbacks += [summary_callback, checkpoint_callback]

      history = keras_model.fit(
          train_input_dataset,
          epochs=FLAGS.train_epochs,
          steps_per_epoch=steps_per_epoch,
          callbacks=callbacks,
          validation_data=eval_input_dataset,
          validation_steps=num_eval_steps,
          verbose=2)

      logging.info("Training done. Start evaluating")

      eval_loss_and_metrics = keras_model.evaluate(
          eval_input_dataset, steps=num_eval_steps, verbose=2)

      logging.info("Keras evaluation is done.")

      # Keras evaluate() API returns scalar loss and metric values from
      # evaluation as a list. Here, the returned list would contain
      # [evaluation loss, hr sum, hr count].
      eval_hit_rate = eval_loss_and_metrics[1] / eval_loss_and_metrics[2]

      # Format evaluation result into [eval loss, eval hit accuracy].
      eval_results = [eval_loss_and_metrics[0], eval_hit_rate]

      if history and history.history:
        train_history = history.history
        train_loss = train_history["loss"][-1]

  stats = build_stats(train_loss, eval_results, time_callback)
  return stats


def run_ncf_custom_training(params,
                            strategy,
                            keras_model,
                            optimizer,
                            callbacks,
                            train_input_dataset,
                            eval_input_dataset,
                            num_train_steps,
                            num_eval_steps,
                            generate_input_online=True):
  """Runs custom training loop.

  Args:
    params: Dictionary containing training parameters.
    strategy: Distribution strategy to be used for distributed training.
    keras_model: Model used for training.
    optimizer: Optimizer used for training.
    callbacks: Callbacks to be invoked between batches/epochs.
    train_input_dataset: tf.data.Dataset used for training.
    eval_input_dataset: tf.data.Dataset used for evaluation.
    num_train_steps: Total number of steps to run for training.
    num_eval_steps: Total number of steps to run for evaluation.
    generate_input_online: Whether input data was generated by the data
      producer. When data is generated by the data producer, the train dataset
      must be re-initialized after every epoch.

  Returns:
    A tuple of train loss and a list of training and evaluation results.
  """
  loss_object = tf_keras.losses.SparseCategoricalCrossentropy(
      reduction="sum", from_logits=True)
  train_input_iterator = iter(
      strategy.experimental_distribute_dataset(train_input_dataset))

  def train_step(train_iterator):
    """Called once per step to train the model."""

    def step_fn(features):
      """Computes loss and applies gradients per replica."""
      with tf.GradientTape() as tape:
        softmax_logits = keras_model(features)
        # The loss can overflow in float16, so we cast to float32.
        softmax_logits = tf.cast(softmax_logits, "float32")
        labels = features[rconst.TRAIN_LABEL_KEY]
        loss = loss_object(
            labels,
            softmax_logits,
            sample_weight=features[rconst.VALID_POINT_MASK])
        loss *= (1.0 / params["batch_size"])
        if FLAGS.dtype == "fp16":
          loss = optimizer.get_scaled_loss(loss)

      grads = tape.gradient(loss, keras_model.trainable_variables)
      if FLAGS.dtype == "fp16":
        grads = optimizer.get_unscaled_gradients(grads)
      # Converting gradients to dense form helps in perf on GPU for NCF.
      grads = neumf_model.sparse_to_dense_grads(
          list(zip(grads, keras_model.trainable_variables)))
      optimizer.apply_gradients(grads)
      return loss

    per_replica_losses = strategy.run(step_fn, args=(next(train_iterator),))
    mean_loss = strategy.reduce(
        tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None)
    return mean_loss
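
  # Note: step_fn divides each replica's loss by the global batch size, so the
  # SUM reduction above yields the global (mean) loss for the step.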

  def eval_step(eval_iterator):
    """Called once per eval step to compute eval metrics."""

    def step_fn(features):
      """Computes eval metrics per replica."""
      softmax_logits = keras_model(features)
      in_top_k, metric_weights = metric_fn(softmax_logits,
                                           features[rconst.DUPLICATE_MASK],
                                           params["match_mlperf"])
      hr_sum = tf.reduce_sum(in_top_k * metric_weights)
      hr_count = tf.reduce_sum(metric_weights)
      return hr_sum, hr_count

    per_replica_hr_sum, per_replica_hr_count = (
        strategy.run(step_fn, args=(next(eval_iterator),)))
    hr_sum = strategy.reduce(
        tf.distribute.ReduceOp.SUM, per_replica_hr_sum, axis=None)
    hr_count = strategy.reduce(
        tf.distribute.ReduceOp.SUM, per_replica_hr_count, axis=None)
    return hr_sum, hr_count
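
  # hr_sum / hr_count gives the hit rate: the (weighted) fraction of eval
  # users whose held-out positive item is ranked in the model's top k, as
  # computed by neumf_model.compute_top_k_and_ndcg.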

  if not FLAGS.run_eagerly:
    train_step = tf.function(train_step)
    eval_step = tf.function(eval_step)

  for callback in callbacks:
    callback.on_train_begin()

  # TensorBoard summaries are not written when running in MLPerf mode.
  if FLAGS.ml_perf:
    eval_summary_writer, train_summary_writer = None, None
  else:
    summary_dir = os.path.join(FLAGS.model_dir, "summaries")
    eval_summary_writer = tf.summary.create_file_writer(
        os.path.join(summary_dir, "eval"))
    train_summary_writer = tf.summary.create_file_writer(
        os.path.join(summary_dir, "train"))

  train_loss = 0
  for epoch in range(FLAGS.train_epochs):
    for cb in callbacks:
      cb.on_epoch_begin(epoch)

    # Because the NCF dataset is sampled with randomness, not repeating data
    # elements across epochs has a significant impact on convergence. For that
    # reason, offline-generated TFRecord files already contain data for all
    # epochs, so the dataset does not need to be re-initialized when reading
    # from them.
    if generate_input_online:
      train_input_iterator = iter(
          strategy.experimental_distribute_dataset(train_input_dataset))

    train_loss = 0
    for step in range(num_train_steps):
      current_step = step + epoch * num_train_steps
      for c in callbacks:
        c.on_batch_begin(current_step)

      train_loss += train_step(train_input_iterator)

      # Write train loss once in every 1000 steps.
      if train_summary_writer and step % 1000 == 0:
        with train_summary_writer.as_default():
          tf.summary.scalar(
              "training_loss", train_loss / (step + 1), step=current_step)

      for c in callbacks:
        c.on_batch_end(current_step)

    train_loss /= num_train_steps
    logging.info("Done training epoch %s, epoch loss=%.3f", epoch + 1,
                 train_loss)

    eval_input_iterator = iter(
        strategy.experimental_distribute_dataset(eval_input_dataset))

    hr_sum = 0.0
    hr_count = 0.0
    for _ in range(num_eval_steps):
      step_hr_sum, step_hr_count = eval_step(eval_input_iterator)
      hr_sum += step_hr_sum
      hr_count += step_hr_count

    logging.info("Done eval epoch %s, hit_rate=%.3f", epoch + 1,
                 hr_sum / hr_count)

    if eval_summary_writer:
      with eval_summary_writer.as_default():
        tf.summary.scalar("hit_rate", hr_sum / hr_count, step=current_step)

    if (FLAGS.early_stopping and
        float(hr_sum / hr_count) > params["hr_threshold"]):
      break

  for c in callbacks:
    c.on_train_end()

  # Saving the model at the end of training.
  if not FLAGS.ml_perf:
    checkpoint = tf.train.Checkpoint(model=keras_model, optimizer=optimizer)
    checkpoint_path = os.path.join(FLAGS.model_dir, "ctl_checkpoint")
    checkpoint.save(checkpoint_path)
    logging.info("Saving model as TF checkpoint: %s", checkpoint_path)

  return train_loss, [None, hr_sum / hr_count]


def build_stats(loss, eval_result, time_callback):
  """Normalizes and returns dictionary of stats.

  Args:
    loss: The final loss at training time.
    eval_result: Output of the eval step. Assumes first value is eval_loss and
      second value is accuracy_top_1.
    time_callback: Time tracking callback likely used during keras.fit.

  Returns:
    Dictionary of normalized results.
  """
  stats = {}
  if loss:
    stats["loss"] = loss

  if eval_result:
    stats["eval_loss"] = eval_result[0]
    stats["eval_hit_rate"] = eval_result[1]

  if time_callback:
    timestamp_log = time_callback.timestamp_log
    stats["step_timestamp_log"] = timestamp_log
    stats["train_finish_time"] = time_callback.train_finish_time
    if len(timestamp_log) > 1:
      stats["avg_exp_per_second"] = (
          time_callback.batch_size * time_callback.log_steps *
          (len(time_callback.timestamp_log) - 1) /
          (timestamp_log[-1].timestamp - timestamp_log[0].timestamp))

  return stats
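
# Illustrative arithmetic for avg_exp_per_second (hypothetical values): with
# batch_size=1024, log_steps=100, and 5 logged timestamps spanning 20 seconds,
# throughput = 1024 * 100 * (5 - 1) / 20 = 20480 examples per second.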


def main(_):
  logging.info("Result is %s", run_ncf(FLAGS))


if __name__ == "__main__":
  ncf_common.define_ncf_flags()
  app.run(main)