deanna-emery's picture
updates
93528c6
raw
history blame
12.8 kB
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Test NCF data pipeline."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from collections import defaultdict
import hashlib
import os
import mock
import numpy as np
import scipy.stats
import tensorflow as tf, tf_keras
from official.recommendation import constants as rconst
from official.recommendation import data_preprocessing
from official.recommendation import movielens
from official.recommendation import popen_helper
DATASET = "ml-test"
NUM_USERS = 1000
NUM_ITEMS = 2000
NUM_PTS = 50000
BATCH_SIZE = 2048
EVAL_BATCH_SIZE = 4000
NUM_NEG = 4
END_TO_END_TRAIN_MD5 = "b218738e915e825d03939c5e305a2698"
END_TO_END_EVAL_MD5 = "d753d0f3186831466d6e218163a9501e"
FRESH_RANDOMNESS_MD5 = "63d0dff73c0e5f1048fbdc8c65021e22"
def mock_download(*args, **kwargs):
return
# The forkpool used by data producers interacts badly with the threading
# used by TestCase. Without this patch tests will hang, and no amount
# of diligent closing and joining within the producer will prevent it.
@mock.patch.object(popen_helper, "get_forkpool", popen_helper.get_fauxpool)
class BaseTest(tf.test.TestCase):
def setUp(self):
tf.compat.v1.disable_eager_execution()
self.temp_data_dir = self.get_temp_dir()
ratings_folder = os.path.join(self.temp_data_dir, DATASET)
tf.io.gfile.makedirs(ratings_folder)
np.random.seed(0)
raw_user_ids = np.arange(NUM_USERS * 3)
np.random.shuffle(raw_user_ids)
raw_user_ids = raw_user_ids[:NUM_USERS]
raw_item_ids = np.arange(NUM_ITEMS * 3)
np.random.shuffle(raw_item_ids)
raw_item_ids = raw_item_ids[:NUM_ITEMS]
users = np.random.choice(raw_user_ids, NUM_PTS)
items = np.random.choice(raw_item_ids, NUM_PTS)
scores = np.random.randint(low=0, high=5, size=NUM_PTS)
times = np.random.randint(low=1000000000, high=1200000000, size=NUM_PTS)
self.rating_file = os.path.join(ratings_folder, movielens.RATINGS_FILE)
self.seen_pairs = set()
self.holdout = {}
with tf.io.gfile.GFile(self.rating_file, "w") as f:
f.write("user_id,item_id,rating,timestamp\n")
for usr, itm, scr, ts in zip(users, items, scores, times):
pair = (usr, itm)
if pair in self.seen_pairs:
continue
self.seen_pairs.add(pair)
if usr not in self.holdout or (ts, itm) > self.holdout[usr]:
self.holdout[usr] = (ts, itm)
f.write("{},{},{},{}\n".format(usr, itm, scr, ts))
movielens.download = mock_download
movielens.NUM_RATINGS[DATASET] = NUM_PTS
movielens.DATASET_TO_NUM_USERS_AND_ITEMS[DATASET] = (NUM_USERS, NUM_ITEMS)
def make_params(self, train_epochs=1):
return {
"train_epochs": train_epochs,
"batches_per_step": 1,
"use_seed": False,
"batch_size": BATCH_SIZE,
"eval_batch_size": EVAL_BATCH_SIZE,
"num_neg": NUM_NEG,
"match_mlperf": True,
"use_tpu": False,
"use_xla_for_gpu": False,
"stream_files": False,
}
def test_preprocessing(self):
# For the most part the necessary checks are performed within
# _filter_index_sort()
cache_path = os.path.join(self.temp_data_dir, "test_cache.pickle")
data, valid_cache = data_preprocessing._filter_index_sort(
self.rating_file, cache_path=cache_path)
assert len(data[rconst.USER_MAP]) == NUM_USERS
assert len(data[rconst.ITEM_MAP]) == NUM_ITEMS
def drain_dataset(self, dataset, g):
# type: (tf.data.Dataset, tf.Graph) -> list
with self.session(graph=g) as sess:
with g.as_default():
batch = tf.compat.v1.data.make_one_shot_iterator(dataset).get_next()
output = []
while True:
try:
output.append(sess.run(batch))
except tf.errors.OutOfRangeError:
break
return output
def _test_end_to_end(self, constructor_type):
params = self.make_params(train_epochs=1)
_, _, producer = data_preprocessing.instantiate_pipeline(
dataset=DATASET,
data_dir=self.temp_data_dir,
params=params,
constructor_type=constructor_type,
deterministic=True)
producer.start()
producer.join()
assert producer._fatal_exception is None
user_inv_map = {v: k for k, v in producer.user_map.items()}
item_inv_map = {v: k for k, v in producer.item_map.items()}
# ==========================================================================
# == Training Data =========================================================
# ==========================================================================
g = tf.Graph()
with g.as_default():
input_fn = producer.make_input_fn(is_training=True)
dataset = input_fn(params)
first_epoch = self.drain_dataset(dataset=dataset, g=g)
counts = defaultdict(int)
train_examples = {
True: set(),
False: set(),
}
md5 = hashlib.md5()
for features, labels in first_epoch:
data_list = [
features[movielens.USER_COLUMN].flatten(),
features[movielens.ITEM_COLUMN].flatten(),
features[rconst.VALID_POINT_MASK].flatten(),
labels.flatten()
]
for i in data_list:
md5.update(i.tobytes())
for u, i, v, l in zip(*data_list):
if not v:
continue # ignore padding
u_raw = user_inv_map[u]
i_raw = item_inv_map[i]
if ((u_raw, i_raw) in self.seen_pairs) != l:
# The evaluation item is not considered during false negative
# generation, so it will occasionally appear as a negative example
# during training.
assert not l
self.assertEqual(i_raw, self.holdout[u_raw][1])
train_examples[l].add((u_raw, i_raw))
counts[(u_raw, i_raw)] += 1
self.assertRegexpMatches(md5.hexdigest(), END_TO_END_TRAIN_MD5)
num_positives_seen = len(train_examples[True])
self.assertEqual(producer._train_pos_users.shape[0], num_positives_seen)
# This check is more heuristic because negatives are sampled with
# replacement. It only checks that negative generation is reasonably random.
self.assertGreater(
len(train_examples[False]) / NUM_NEG / num_positives_seen, 0.9)
# This checks that the samples produced are independent by checking the
# number of duplicate entries. If workers are not properly independent there
# will be lots of repeated pairs.
self.assertLess(np.mean(list(counts.values())), 1.1)
# ==========================================================================
# == Eval Data =============================================================
# ==========================================================================
with g.as_default():
input_fn = producer.make_input_fn(is_training=False)
dataset = input_fn(params)
eval_data = self.drain_dataset(dataset=dataset, g=g)
current_user = None
md5 = hashlib.md5()
for features in eval_data:
data_list = [
features[movielens.USER_COLUMN].flatten(),
features[movielens.ITEM_COLUMN].flatten(),
features[rconst.DUPLICATE_MASK].flatten()
]
for i in data_list:
md5.update(i.tobytes())
for idx, (u, i, d) in enumerate(zip(*data_list)):
u_raw = user_inv_map[u]
i_raw = item_inv_map[i]
if current_user is None:
current_user = u
# Ensure that users appear in blocks, as the evaluation logic expects
# this structure.
self.assertEqual(u, current_user)
# The structure of evaluation data is 999 negative examples followed
# by the holdout positive.
if not (idx + 1) % (rconst.NUM_EVAL_NEGATIVES + 1):
# Check that the last element in each chunk is the holdout item.
self.assertEqual(i_raw, self.holdout[u_raw][1])
current_user = None
elif i_raw == self.holdout[u_raw][1]:
# Because the holdout item is not given to the negative generation
# process, it can appear as a negative. In that case, it should be
# masked out as a duplicate. (Since the true positive is placed at
# the end and would therefore lose the tie.)
assert d
else:
# Otherwise check that the other 999 points for a user are selected
# from the negatives.
assert (u_raw, i_raw) not in self.seen_pairs
self.assertRegexpMatches(md5.hexdigest(), END_TO_END_EVAL_MD5)
def _test_fresh_randomness(self, constructor_type):
train_epochs = 5
params = self.make_params(train_epochs=train_epochs)
_, _, producer = data_preprocessing.instantiate_pipeline(
dataset=DATASET,
data_dir=self.temp_data_dir,
params=params,
constructor_type=constructor_type,
deterministic=True)
producer.start()
results = []
g = tf.Graph()
with g.as_default():
for _ in range(train_epochs):
input_fn = producer.make_input_fn(is_training=True)
dataset = input_fn(params)
results.extend(self.drain_dataset(dataset=dataset, g=g))
producer.join()
assert producer._fatal_exception is None
positive_counts, negative_counts = defaultdict(int), defaultdict(int)
md5 = hashlib.md5()
for features, labels in results:
data_list = [
features[movielens.USER_COLUMN].flatten(),
features[movielens.ITEM_COLUMN].flatten(),
features[rconst.VALID_POINT_MASK].flatten(),
labels.flatten()
]
for i in data_list:
md5.update(i.tobytes())
for u, i, v, l in zip(*data_list):
if not v:
continue # ignore padding
if l:
positive_counts[(u, i)] += 1
else:
negative_counts[(u, i)] += 1
self.assertRegexpMatches(md5.hexdigest(), FRESH_RANDOMNESS_MD5)
# The positive examples should appear exactly once each epoch
self.assertAllEqual(
list(positive_counts.values()), [train_epochs for _ in positive_counts])
# The threshold for the negatives is heuristic, but in general repeats are
# expected, but should not appear too frequently.
pair_cardinality = NUM_USERS * NUM_ITEMS
neg_pair_cardinality = pair_cardinality - len(self.seen_pairs)
# Approximation for the expectation number of times that a particular
# negative will appear in a given epoch. Implicit in this calculation is the
# treatment of all negative pairs as equally likely. Normally is not
# necessarily reasonable; however the generation in self.setUp() will
# approximate this behavior sufficiently for heuristic testing.
e_sample = len(self.seen_pairs) * NUM_NEG / neg_pair_cardinality
# The frequency of occurance of a given negative pair should follow an
# approximately binomial distribution in the limit that the cardinality of
# the negative pair set >> number of samples per epoch.
approx_pdf = scipy.stats.binom.pmf(
k=np.arange(train_epochs + 1), n=train_epochs, p=e_sample)
# Tally the actual observed counts.
count_distribution = [0 for _ in range(train_epochs + 1)]
for i in negative_counts.values():
i = min([i, train_epochs]) # round down tail for simplicity.
count_distribution[i] += 1
count_distribution[0] = neg_pair_cardinality - sum(count_distribution[1:])
# Check that the frequency of negative pairs is approximately binomial.
for i in range(train_epochs + 1):
if approx_pdf[i] < 0.05:
continue # Variance will be high at the tails.
observed_fraction = count_distribution[i] / neg_pair_cardinality
deviation = (2 * abs(observed_fraction - approx_pdf[i]) /
(observed_fraction + approx_pdf[i]))
self.assertLess(deviation, 0.2)
def test_end_to_end_materialized(self):
self._test_end_to_end("materialized")
def test_end_to_end_bisection(self):
self._test_end_to_end("bisection")
def test_fresh_randomness_materialized(self):
self._test_fresh_randomness("materialized")
def test_fresh_randomness_bisection(self):
self._test_fresh_randomness("bisection")
if __name__ == "__main__":
tf.test.main()