"""NCF framework to train and evaluate the NeuMF model. |
|
|
|
The NeuMF model assembles both MF and MLP models under the NCF framework. Check |
|
`neumf_model.py` for more details about the models. |
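
Example invocation, assuming this module is saved as ncf_keras_main.py (flag

names here are illustrative; see ncf_common.define_ncf_flags for the full set):

  python ncf_keras_main.py --dataset ml-1m --model_dir /tmp/ncf --train_epochs 2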
|
""" |
|
|
|
from __future__ import absolute_import |
|
from __future__ import division |
|
from __future__ import print_function |
|
|
|
import json |
|
import os |
|
|
|
|
|
from absl import app |
|
from absl import flags |
|
from absl import logging |
|
import tensorflow.compat.v2 as tf |
|
|
|
|
|
from official.recommendation import constants as rconst |
|
from official.recommendation import movielens |
|
from official.recommendation import ncf_common |
|
from official.recommendation import ncf_input_pipeline |
|
from official.recommendation import neumf_model |
|
from official.utils.flags import core as flags_core |
|
from official.utils.misc import distribution_utils |
|
from official.utils.misc import keras_utils |
|
from official.utils.misc import model_helpers |
|
|
|
|
|
FLAGS = flags.FLAGS |
|
|
|
|
|
def metric_fn(logits, dup_mask, match_mlperf): |
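  """Returns in-top-K indicators and metric weights for hit-rate metrics."""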
|
dup_mask = tf.cast(dup_mask, tf.float32) |
|
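  # logits carries an extra leading column of zeros (see _get_keras_model);
  # drop it so only the true logits remain.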
logits = tf.slice(logits, [0, 1], [-1, -1]) |
|
in_top_k, _, metric_weights, _ = neumf_model.compute_top_k_and_ndcg( |
|
logits, |
|
dup_mask, |
|
match_mlperf) |
|
metric_weights = tf.cast(metric_weights, tf.float32) |
|
return in_top_k, metric_weights |
|
|
|
|
|
class MetricLayer(tf.keras.layers.Layer): |
|
"""Custom layer of metrics for NCF model.""" |
|
|
|
def __init__(self, match_mlperf): |
|
super(MetricLayer, self).__init__() |
|
self.match_mlperf = match_mlperf |
|
|
|
def get_config(self): |
|
return {"match_mlperf": self.match_mlperf} |
|
|
|
@classmethod |
|
def from_config(cls, config, custom_objects=None): |
|
return cls(**config) |
|
|
|
def call(self, inputs, training=False): |
|
logits, dup_mask = inputs |
|
|
|
if training: |
|
hr_sum = 0.0 |
|
hr_count = 0.0 |
|
else: |
|
metric, metric_weights = metric_fn(logits, dup_mask, self.match_mlperf) |
|
hr_sum = tf.reduce_sum(metric * metric_weights) |
|
hr_count = tf.reduce_sum(metric_weights) |
|
|
|
self.add_metric(hr_sum, name="hr_sum", aggregation="mean") |
|
self.add_metric(hr_count, name="hr_count", aggregation="mean") |
|
return logits |
|
|
|
|
|
class LossLayer(tf.keras.layers.Layer): |
|
"""Pass-through loss layer for NCF model.""" |
|
|
|
def __init__(self, loss_normalization_factor): |
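    # Pin this layer to float32 so the loss is always computed in full
    # precision, even when a mixed-precision (float16) policy is active.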
|
|
|
super(LossLayer, self).__init__(dtype="float32") |
|
self.loss_normalization_factor = loss_normalization_factor |
|
self.loss = tf.keras.losses.SparseCategoricalCrossentropy( |
|
from_logits=True, reduction="sum") |
|
|
|
def get_config(self): |
|
return {"loss_normalization_factor": self.loss_normalization_factor} |
|
|
|
@classmethod |
|
def from_config(cls, config, custom_objects=None): |
|
return cls(**config) |
|
|
|
def call(self, inputs): |
|
logits, labels, valid_pt_mask_input = inputs |
|
loss = self.loss( |
|
y_true=labels, y_pred=logits, sample_weight=valid_pt_mask_input) |
|
loss = loss * (1.0 / self.loss_normalization_factor) |
|
self.add_loss(loss) |
|
return logits |
|
|
|
|
|
class IncrementEpochCallback(tf.keras.callbacks.Callback): |
|
"""A callback to increase the requested epoch for the data producer. |
|
|
|
  The data producer can only buffer a limited amount of data, so it keeps a

  moving window over that buffer. This callback advances one of the window's

  boundaries at the start of each epoch.
|
""" |
|
|
|
def __init__(self, producer): |
|
self._producer = producer |
|
|
|
def on_epoch_begin(self, epoch, logs=None): |
|
self._producer.increment_request_epoch() |
|
|
|
|
|
class CustomEarlyStopping(tf.keras.callbacks.Callback): |
|
"""Stop training has reached a desired hit rate.""" |
|
|
|
def __init__(self, monitor, desired_value): |
|
super(CustomEarlyStopping, self).__init__() |
|
|
|
self.monitor = monitor |
|
self.desired = desired_value |
|
self.stopped_epoch = 0 |
|
|
|
def on_epoch_end(self, epoch, logs=None): |
|
current = self.get_monitor_value(logs) |
|
if current and current >= self.desired: |
|
self.stopped_epoch = epoch |
|
self.model.stop_training = True |
|
|
|
def on_train_end(self, logs=None): |
|
if self.stopped_epoch > 0: |
|
print("Epoch %05d: early stopping" % (self.stopped_epoch + 1)) |
|
|
|
def get_monitor_value(self, logs): |
|
logs = logs or {} |
|
monitor_value = logs.get(self.monitor) |
|
if monitor_value is None: |
|
logging.warning("Early stopping conditioned on metric `%s` " |
|
"which is not available. Available metrics are: %s", |
|
self.monitor, ",".join(list(logs.keys()))) |
|
return monitor_value |
|
|
|
|
|
def _get_keras_model(params): |
|
"""Constructs and returns the model.""" |
|
batch_size = params["batch_size"] |
|
|
|
user_input = tf.keras.layers.Input( |
|
shape=(1,), name=movielens.USER_COLUMN, dtype=tf.int32) |
|
|
|
item_input = tf.keras.layers.Input( |
|
shape=(1,), name=movielens.ITEM_COLUMN, dtype=tf.int32) |
|
|
|
valid_pt_mask_input = tf.keras.layers.Input( |
|
shape=(1,), name=rconst.VALID_POINT_MASK, dtype=tf.bool) |
|
|
|
dup_mask_input = tf.keras.layers.Input( |
|
shape=(1,), name=rconst.DUPLICATE_MASK, dtype=tf.int32) |
|
|
|
label_input = tf.keras.layers.Input( |
|
shape=(1,), name=rconst.TRAIN_LABEL_KEY, dtype=tf.bool) |
|
|
|
base_model = neumf_model.construct_model(user_input, item_input, params) |
|
|
|
logits = base_model.output |
|
|
|
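  # A softmax over [0, logit] equals a sigmoid of the logit, so a zeros column
  # is concatenated below to allow reuse of sparse categorical cross-entropy.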
zeros = tf.keras.layers.Lambda( |
|
lambda x: x * 0)(logits) |
|
|
|
softmax_logits = tf.keras.layers.concatenate( |
|
[zeros, logits], |
|
axis=-1) |
|
|
|
|
|
|
|
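  # The custom training loop computes loss and metrics inside its own step
  # functions, so the metric/loss layers are only attached for Model.fit().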
if not params["keras_use_ctl"]: |
|
softmax_logits = MetricLayer( |
|
params["match_mlperf"])([softmax_logits, dup_mask_input]) |
|
|
|
|
|
softmax_logits = LossLayer(batch_size)( |
|
[softmax_logits, label_input, valid_pt_mask_input]) |
|
|
|
keras_model = tf.keras.Model( |
|
inputs={ |
|
movielens.USER_COLUMN: user_input, |
|
movielens.ITEM_COLUMN: item_input, |
|
rconst.VALID_POINT_MASK: valid_pt_mask_input, |
|
rconst.DUPLICATE_MASK: dup_mask_input, |
|
rconst.TRAIN_LABEL_KEY: label_input}, |
|
outputs=softmax_logits) |
|
|
|
keras_model.summary() |
|
return keras_model |
|
|
|
|
|
def run_ncf(_): |
|
"""Run NCF training and eval with Keras.""" |
|
|
|
keras_utils.set_session_config(enable_xla=FLAGS.enable_xla) |
|
|
|
if FLAGS.seed is not None: |
|
print("Setting tf seed") |
|
tf.random.set_seed(FLAGS.seed) |
|
|
|
model_helpers.apply_clean(FLAGS) |
|
|
|
if FLAGS.dtype == "fp16" and FLAGS.fp16_implementation == "keras": |
|
policy = tf.keras.mixed_precision.experimental.Policy( |
|
"mixed_float16", |
|
loss_scale=flags_core.get_loss_scale(FLAGS, default_for_fp16="dynamic")) |
|
tf.keras.mixed_precision.experimental.set_policy(policy) |
|
|
|
strategy = distribution_utils.get_distribution_strategy( |
|
distribution_strategy=FLAGS.distribution_strategy, |
|
num_gpus=FLAGS.num_gpus, |
|
tpu_address=FLAGS.tpu) |
|
|
|
params = ncf_common.parse_flags(FLAGS) |
|
params["distribute_strategy"] = strategy |
|
params["use_tpu"] = (FLAGS.distribution_strategy == "tpu") |
|
|
|
if params["use_tpu"] and not params["keras_use_ctl"]: |
|
logging.error("Custom training loop must be used when using TPUStrategy.") |
|
return |
|
|
|
batch_size = params["batch_size"] |
|
time_callback = keras_utils.TimeHistory(batch_size, FLAGS.log_steps) |
|
callbacks = [time_callback] |
|
|
|
producer, input_meta_data = None, None |
|
generate_input_online = params["train_dataset_path"] is None |
|
|
|
if generate_input_online: |
|
|
|
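    # No materialized dataset paths were given, so build the data producer and
    # start generating training data in the background.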
num_users, num_items, _, _, producer = ncf_common.get_inputs(params) |
|
producer.start() |
|
per_epoch_callback = IncrementEpochCallback(producer) |
|
callbacks.append(per_epoch_callback) |
|
else: |
|
assert params["eval_dataset_path"] and params["input_meta_data_path"] |
|
with tf.io.gfile.GFile(params["input_meta_data_path"], "rb") as reader: |
|
input_meta_data = json.loads(reader.read().decode("utf-8")) |
|
num_users = input_meta_data["num_users"] |
|
num_items = input_meta_data["num_items"] |
|
|
|
params["num_users"], params["num_items"] = num_users, num_items |
|
|
|
if FLAGS.early_stopping: |
|
early_stopping_callback = CustomEarlyStopping( |
|
"val_HR_METRIC", desired_value=FLAGS.hr_threshold) |
|
callbacks.append(early_stopping_callback) |
|
|
|
(train_input_dataset, eval_input_dataset, |
|
num_train_steps, num_eval_steps) = \ |
|
(ncf_input_pipeline.create_ncf_input_data( |
|
params, producer, input_meta_data, strategy)) |
|
steps_per_epoch = None if generate_input_online else num_train_steps |
|
|
|
with distribution_utils.get_strategy_scope(strategy): |
|
keras_model = _get_keras_model(params) |
|
optimizer = tf.keras.optimizers.Adam( |
|
learning_rate=params["learning_rate"], |
|
beta_1=params["beta1"], |
|
beta_2=params["beta2"], |
|
epsilon=params["epsilon"]) |
|
if FLAGS.fp16_implementation == "graph_rewrite": |
|
optimizer = \ |
|
tf.compat.v1.train.experimental.enable_mixed_precision_graph_rewrite( |
|
optimizer, |
|
loss_scale=flags_core.get_loss_scale(FLAGS, |
|
default_for_fp16="dynamic")) |
|
elif FLAGS.dtype == "fp16" and params["keras_use_ctl"]: |
|
|
|
|
|
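      # The custom training loop applies loss scaling explicitly, so wrap the
      # optimizer in a LossScaleOptimizer here; Model.fit() handles scaling on
      # its own when keras_use_ctl is False.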
optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer( |
|
optimizer, |
|
tf.keras.mixed_precision.experimental.global_policy().loss_scale) |
|
|
|
if params["keras_use_ctl"]: |
|
train_loss, eval_results = run_ncf_custom_training( |
|
params, |
|
strategy, |
|
keras_model, |
|
optimizer, |
|
callbacks, |
|
train_input_dataset, |
|
eval_input_dataset, |
|
num_train_steps, |
|
num_eval_steps, |
|
generate_input_online=generate_input_online) |
|
else: |
|
keras_model.compile(optimizer=optimizer, run_eagerly=FLAGS.run_eagerly) |
|
|
|
if not FLAGS.ml_perf: |
|
|
|
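        # Create TensorBoard summary and checkpoint callbacks for non-MLPerf
        # runs.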
summary_dir = os.path.join(FLAGS.model_dir, "summaries") |
|
summary_callback = tf.keras.callbacks.TensorBoard(summary_dir) |
|
checkpoint_path = os.path.join(FLAGS.model_dir, "checkpoint") |
|
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint( |
|
checkpoint_path, save_weights_only=True) |
|
|
|
callbacks += [summary_callback, checkpoint_callback] |
|
|
|
history = keras_model.fit( |
|
train_input_dataset, |
|
epochs=FLAGS.train_epochs, |
|
steps_per_epoch=steps_per_epoch, |
|
callbacks=callbacks, |
|
validation_data=eval_input_dataset, |
|
validation_steps=num_eval_steps, |
|
verbose=2) |
|
|
|
logging.info("Training done. Start evaluating") |
|
|
|
eval_loss_and_metrics = keras_model.evaluate( |
|
eval_input_dataset, steps=num_eval_steps, verbose=2) |
|
|
|
logging.info("Keras evaluation is done.") |
|
|
|
|
|
|
|
|
|
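      # keras_model.evaluate() returns [eval loss, hr_sum, hr_count]; hr_sum
      # and hr_count come from MetricLayer, and their ratio is the hit rate.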
eval_hit_rate = eval_loss_and_metrics[1] / eval_loss_and_metrics[2] |
|
|
|
|
|
eval_results = [eval_loss_and_metrics[0], eval_hit_rate] |
|
|
|
if history and history.history: |
|
train_history = history.history |
|
train_loss = train_history["loss"][-1] |
|
|
|
stats = build_stats(train_loss, eval_results, time_callback) |
|
return stats |
|
|
|
|
|
def run_ncf_custom_training(params, |
|
strategy, |
|
keras_model, |
|
optimizer, |
|
callbacks, |
|
train_input_dataset, |
|
eval_input_dataset, |
|
num_train_steps, |
|
num_eval_steps, |
|
generate_input_online=True): |
|
"""Runs custom training loop. |
|
|
|
Args: |
|
params: Dictionary containing training parameters. |
|
strategy: Distribution strategy to be used for distributed training. |
|
keras_model: Model used for training. |
|
optimizer: Optimizer used for training. |
|
callbacks: Callbacks to be invoked between batches/epochs. |
|
train_input_dataset: tf.data.Dataset used for training. |
|
eval_input_dataset: tf.data.Dataset used for evaluation. |
|
num_train_steps: Total number of steps to run for training. |
|
num_eval_steps: Total number of steps to run for evaluation. |
|
    generate_input_online: Whether the input data is generated online by the

      data producer. When data is generated online, the train dataset must be

      re-initialized after every epoch.
|
|
|
Returns: |
|
A tuple of train loss and a list of training and evaluation results. |
|
""" |
|
loss_object = tf.keras.losses.SparseCategoricalCrossentropy( |
|
reduction="sum", from_logits=True) |
|
train_input_iterator = iter( |
|
strategy.experimental_distribute_dataset(train_input_dataset)) |
|
|
|
def train_step(train_iterator): |
|
"""Called once per step to train the model.""" |
|
|
|
def step_fn(features): |
|
"""Computes loss and applied gradient per replica.""" |
|
with tf.GradientTape() as tape: |
|
softmax_logits = keras_model(features) |
|
|
|
softmax_logits = tf.cast(softmax_logits, "float32") |
|
labels = features[rconst.TRAIN_LABEL_KEY] |
|
loss = loss_object( |
|
labels, |
|
softmax_logits, |
|
sample_weight=features[rconst.VALID_POINT_MASK]) |
|
loss *= (1.0 / params["batch_size"]) |
|
if FLAGS.dtype == "fp16": |
|
loss = optimizer.get_scaled_loss(loss) |
|
|
|
grads = tape.gradient(loss, keras_model.trainable_variables) |
|
if FLAGS.dtype == "fp16": |
|
grads = optimizer.get_unscaled_gradients(grads) |
|
|
|
grads = neumf_model.sparse_to_dense_grads( |
|
list(zip(grads, keras_model.trainable_variables))) |
|
optimizer.apply_gradients(grads) |
|
return loss |
|
|
|
per_replica_losses = strategy.run( |
|
step_fn, args=(next(train_iterator),)) |
|
mean_loss = strategy.reduce( |
|
tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None) |
|
return mean_loss |
|
|
|
def eval_step(eval_iterator): |
|
"""Called once per eval step to compute eval metrics.""" |
|
|
|
def step_fn(features): |
|
"""Computes eval metrics per replica.""" |
|
softmax_logits = keras_model(features) |
|
in_top_k, metric_weights = metric_fn(softmax_logits, |
|
features[rconst.DUPLICATE_MASK], |
|
params["match_mlperf"]) |
|
hr_sum = tf.reduce_sum(in_top_k * metric_weights) |
|
hr_count = tf.reduce_sum(metric_weights) |
|
return hr_sum, hr_count |
|
|
|
per_replica_hr_sum, per_replica_hr_count = ( |
|
strategy.run( |
|
step_fn, args=(next(eval_iterator),))) |
|
hr_sum = strategy.reduce( |
|
tf.distribute.ReduceOp.SUM, per_replica_hr_sum, axis=None) |
|
hr_count = strategy.reduce( |
|
tf.distribute.ReduceOp.SUM, per_replica_hr_count, axis=None) |
|
return hr_sum, hr_count |
|
|
|
if not FLAGS.run_eagerly: |
|
train_step = tf.function(train_step) |
|
eval_step = tf.function(eval_step) |
|
|
|
for callback in callbacks: |
|
callback.on_train_begin() |
|
|
|
|
|
if FLAGS.ml_perf: |
|
eval_summary_writer, train_summary_writer = None, None |
|
else: |
|
summary_dir = os.path.join(FLAGS.model_dir, "summaries") |
|
eval_summary_writer = tf.summary.create_file_writer( |
|
os.path.join(summary_dir, "eval")) |
|
train_summary_writer = tf.summary.create_file_writer( |
|
os.path.join(summary_dir, "train")) |
|
|
|
train_loss = 0 |
|
for epoch in range(FLAGS.train_epochs): |
|
for cb in callbacks: |
|
cb.on_epoch_begin(epoch) |
|
|
|
|
|
|
|
|
|
|
|
|
|
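    # Data from the online producer cannot be re-iterated across epochs, so the
    # distributed train iterator is re-created at the start of every epoch.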
if generate_input_online: |
|
train_input_iterator = iter( |
|
strategy.experimental_distribute_dataset(train_input_dataset)) |
|
|
|
train_loss = 0 |
|
for step in range(num_train_steps): |
|
current_step = step + epoch * num_train_steps |
|
for c in callbacks: |
|
c.on_batch_begin(current_step) |
|
|
|
train_loss += train_step(train_input_iterator) |
|
|
|
|
|
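      # Log the running mean of the training loss to TensorBoard every 1000
      # steps.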
if train_summary_writer and step % 1000 == 0: |
|
with train_summary_writer.as_default(): |
|
tf.summary.scalar("training_loss", train_loss/(step + 1), |
|
step=current_step) |
|
|
|
for c in callbacks: |
|
c.on_batch_end(current_step) |
|
|
|
train_loss /= num_train_steps |
|
logging.info("Done training epoch %s, epoch loss=%.3f", epoch + 1, |
|
train_loss) |
|
|
|
eval_input_iterator = iter( |
|
strategy.experimental_distribute_dataset(eval_input_dataset)) |
|
|
|
hr_sum = 0.0 |
|
hr_count = 0.0 |
|
for _ in range(num_eval_steps): |
|
step_hr_sum, step_hr_count = eval_step(eval_input_iterator) |
|
hr_sum += step_hr_sum |
|
hr_count += step_hr_count |
|
|
|
logging.info("Done eval epoch %s, hit_rate=%.3f", epoch + 1, |
|
hr_sum / hr_count) |
|
if eval_summary_writer: |
|
with eval_summary_writer.as_default(): |
|
tf.summary.scalar("hit_rate", hr_sum / hr_count, step=current_step) |
|
|
|
if (FLAGS.early_stopping and |
|
float(hr_sum / hr_count) > params["hr_threshold"]): |
|
break |
|
|
|
for c in callbacks: |
|
c.on_train_end() |
|
|
|
|
|
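  # Unless running in MLPerf mode, save the final model and optimizer state as
  # a TF checkpoint.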
if not FLAGS.ml_perf: |
|
checkpoint = tf.train.Checkpoint(model=keras_model, optimizer=optimizer) |
|
checkpoint_path = os.path.join(FLAGS.model_dir, "ctl_checkpoint") |
|
checkpoint.save(checkpoint_path) |
|
logging.info("Saving model as TF checkpoint: %s", checkpoint_path) |
|
|
|
return train_loss, [None, hr_sum / hr_count] |
|
|
|
|
|
def build_stats(loss, eval_result, time_callback): |
|
"""Normalizes and returns dictionary of stats. |
|
|
|
Args: |
|
loss: The final loss at training time. |
|
    eval_result: Output of the eval step. The first value is the eval loss and

      the second value is the eval hit rate.

    time_callback: Time tracking callback used during keras.fit() or the

      custom training loop.
|
|
|
Returns: |
|
Dictionary of normalized results. |
|
""" |
|
stats = {} |
|
if loss: |
|
stats["loss"] = loss |
|
|
|
if eval_result: |
|
stats["eval_loss"] = eval_result[0] |
|
stats["eval_hit_rate"] = eval_result[1] |
|
|
|
if time_callback: |
|
timestamp_log = time_callback.timestamp_log |
|
stats["step_timestamp_log"] = timestamp_log |
|
stats["train_finish_time"] = time_callback.train_finish_time |
|
if len(timestamp_log) > 1: |
|
stats["avg_exp_per_second"] = ( |
|
time_callback.batch_size * time_callback.log_steps * |
|
(len(time_callback.timestamp_log)-1) / |
|
(timestamp_log[-1].timestamp - timestamp_log[0].timestamp)) |
|
|
|
return stats |
|
|
|
|
|
def main(_): |
|
logging.info("Result is %s", run_ncf(FLAGS)) |
|
|
|
|
|
if __name__ == "__main__": |
|
ncf_common.define_ncf_flags() |
|
app.run(main) |
|
|