#!/bin/bash
#
# Driver for repeated NCF training runs on the ml-20m dataset.
#
# Invokes movielens.py for the dataset, then performs five consecutive runs of
# the selected main script. Each run's complete output is written to a per-run
# log file, and a filtered view of all runs is written to summary.log.
#
# Usage:
#   <this script> [keras]
#     keras  - run ncf_keras_main.py instead of ncf_estimator_main.py.
#
# Environment:
#   BUCKET - optional root location for data and model directories;
#            /tmp is used when it is unset or empty.
#   TPU    - optional value passed through as --tpu; when unset or empty the
#            runs are started with --num_gpus -1 instead.

# Abort on the first command that exits with a non-zero status.
set -e

# When not already running as root, run a trivial command through sudo so the
# user is prompted for credentials up front instead of in the middle of a run.
# NOTE(review): the message says root is needed to clear caches, but no
# cache-clearing command is visible in this script; presumably a child process
# does it -- confirm.
if [[ "$(id -u)" != 0 ]]; then
  echo "Calling sudo to gain root for this shell. (Needed to clear caches.)"
  sudo echo "Success"
fi

# Directory containing this script; the Python entry points are looked up
# relative to it.
SCRIPT_DIR=$(dirname "${BASH_SOURCE[0]}")

# Point Python's import path at the directory two levels above this script.
# NOTE: this replaces (does not extend) any PYTHONPATH inherited from the
# caller.
export PYTHONPATH="${SCRIPT_DIR}/../../"

# Default training entry point; the "keras" command-line argument selects a
# different one later in the script.
MAIN_SCRIPT="ncf_estimator_main.py"

# Dataset identifier passed as --dataset to the Python scripts.
DATASET="ml-20m"

# Root location for data and model directories. BUCKET is optional; when it
# is unset or empty everything lives under /tmp.
BUCKET=${BUCKET:-""}
ROOT_DIR="${BUCKET:-/tmp}/MLPerf_NCF"
echo "Root directory: ${ROOT_DIR}"

# LOCAL_ROOT is where log files are written. Without a bucket it is the same
# directory as ROOT_DIR; with a bucket a separate directory under /tmp is
# created for files that cannot be stored in the bucket.
if [[ -z "${BUCKET}" ]]; then
  LOCAL_ROOT="${ROOT_DIR}"
else
  LOCAL_ROOT="/tmp/MLPerf_NCF"
  mkdir -p "${LOCAL_ROOT}"
  echo "Local root (for files which cannot use GCS): ${LOCAL_ROOT}"
fi

# One timestamp per invocation, shared by the root and local directories so
# that logs and model directories of the same invocation carry the same name.
DATE=$(date '+%Y-%m-%d_%H:%M:%S')
TEST_DIR="${ROOT_DIR}/${DATE}"
LOCAL_TEST_DIR="${LOCAL_ROOT}/${DATE}"

# Only the local directory is created here; TEST_DIR is passed to the main
# script as the parent of each run's --model_dir.
mkdir -p "${LOCAL_TEST_DIR}"

# Choose the device flags handed to the main script. DEVICE_FLAG is kept as a
# single space-separated string because it is later split into separate
# arguments when the main script is invoked.
# NOTE(review): presumably "--num_gpus -1" means "use every available GPU" --
# confirm against the main script's flag definitions.
TPU=${TPU:-""}
if [[ -z "${TPU}" ]]; then
  DEVICE_FLAG="--num_gpus -1"
else
  DEVICE_FLAG="--tpu ${TPU} --num_gpus 0"
fi

# Run movielens.py for the chosen dataset with DATA_DIR as its data directory.
# NOTE(review): presumably this downloads and/or preprocesses the dataset --
# confirm against movielens.py.
DATA_DIR="${ROOT_DIR}/movielens_data"
python "${SCRIPT_DIR}/movielens.py" --data_dir "${DATA_DIR}" --dataset "${DATASET}"

# Optional first argument "keras" switches to the Keras entry point with its
# own batch size. In that mode DEVICE_FLAG is replaced unconditionally, so a
# TPU setting made earlier is discarded.
if [[ "${1:-}" == "keras" ]]; then
  MAIN_SCRIPT="ncf_keras_main.py"
  BATCH_SIZE=99000
  DEVICE_FLAG="--num_gpus 1"
else
  BATCH_SIZE=98340
fi

# DEVICE_FLAG is a space-separated string such as "--num_gpus -1". Split it
# into an array once so each flag reaches python as its own argument without
# relying on unquoted expansion (which would also be subject to globbing).
read -r -a device_args <<< "${DEVICE_FLAG}"

# Perform five runs (indices 0 through 4). Everything printed inside the group,
# stdout and stderr alike, is also written to summary.log.
{
  for i in {0..4}; do
    START_TIME=$(date +%s)
    MODEL_DIR="${TEST_DIR}/model_dir_${i}"

    RUN_LOG="${LOCAL_TEST_DIR}/run_${i}.log"
    # Exported so the values are visible in the environment of the Python
    # process started below.
    # NOTE(review): presumably read by the main script's compliance logging --
    # confirm.
    export COMPLIANCE_FILE="${LOCAL_TEST_DIR}/run_${i}_compliance_raw.log"
    export STITCHED_COMPLIANCE_FILE="${LOCAL_TEST_DIR}/run_${i}_compliance_submission.log"
    echo ""
    echo "Beginning run ${i}"
    echo " Complete output logs are in ${RUN_LOG}"
    echo " Compliance logs: (submission log is created after run.)"
    echo " ${COMPLIANCE_FILE}"
    echo " ${STITCHED_COMPLIANCE_FILE}"

    # The full output (stdout and stderr) goes to RUN_LOG; only lines matching
    # the grep pattern are passed on to the console and summary.log.
    # NOTE(review): the pipeline's exit status is grep's, so a failure of the
    # python process alone is not detected here, and under `set -e` a run that
    # prints no matching line ends the loop -- confirm this is intended.
    python -u "${SCRIPT_DIR}/${MAIN_SCRIPT}" \
      --model_dir "${MODEL_DIR}" \
      --data_dir "${DATA_DIR}" \
      --dataset "${DATASET}" --hooks "" \
      "${device_args[@]}" \
      --clean \
      --train_epochs 14 \
      --batch_size "${BATCH_SIZE}" \
      --eval_batch_size 160000 \
      --learning_rate 0.00382059 \
      --beta1 0.783529 \
      --beta2 0.909003 \
      --epsilon 1.45439e-07 \
      --layers 256,256,128,64 --num_factors 64 \
      --hr_threshold 0.635 \
      --ml_perf \
      |& tee "${RUN_LOG}" \
      | grep --line-buffered -E --regexp="(Iteration [0-9]+: HR = [0-9\.]+, NDCG = [0-9\.]+, Loss = [0-9\.]+)|(pipeline_hash)|(MLPerf time:)"

    END_TIME=$(date +%s)
    echo "Run ${i} complete: $(( END_TIME - START_TIME )) seconds."

    # Without a bucket the model directory is on the local disk; delete it
    # after each run. `--` and `:?` guard against a path that is empty or
    # starts with a dash.
    if [[ -z "${BUCKET}" ]]; then
      echo "Removing model directory to save space."
      rm -r -- "${MODEL_DIR:?}"
    fi
  done
} |& tee "${LOCAL_TEST_DIR}/summary.log"