# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""TFX Beam preprocessing pipeline for Criteo data.

Preprocessing utility for Criteo data. Transformations:
1. Fill missing features with zeros.
2. Set negative integer features to zero.
3. Normalize integer features using log(x + 1).
4. For categorical features (hex), convert to integer and take the value
   modulo max_vocab_size.

Usage:
For raw Criteo data, this script should be run twice. The first run should
set --vocab_gen_mode; it processes the full dataset and generates vocabulary
files in the temp_dir location. The second run should omit --vocab_gen_mode
and must point to the same temp_dir used during the first run.
"""

import argparse
import datetime
import os

from absl import logging
import apache_beam as beam
import numpy as np
import tensorflow as tf
import tf_keras
import tensorflow_transform as tft
import tensorflow_transform.beam as tft_beam
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import schema_utils
from tfx_bsl.public import tfxio

parser = argparse.ArgumentParser()
parser.add_argument(
    "--input_path",
    default=None,
    required=True,
    help="Input path. Be sure to set this to cover all data, to ensure "
    "that sparse vocabs are complete.")
parser.add_argument(
    "--output_path",
    default=None,
    required=True,
    help="Output path.")
parser.add_argument(
    "--temp_dir",
    default=None,
    required=True,
    help="Directory to store temporary metadata. Important because vocab "
    "dictionaries will be stored here. Ideally co-located with the data.")
parser.add_argument(
    "--csv_delimiter",
    default="\t",
    help="Delimiter string for input and output.")
parser.add_argument(
    "--vocab_gen_mode",
    action="store_true",
    default=False,
    help="If set, process the full dataset and do not write CSV output. In "
    "this mode, vocab files are written to temp_dir. input_path should "
    "cover all data, e.g. train, test, eval.")
parser.add_argument(
    "--runner",
    help="Runner for Apache Beam; must be one of {DirectRunner, "
    "DataflowRunner}.",
    default="DirectRunner")
parser.add_argument(
    "--project",
    default=None,
    help="ID of your GCP project. Ignored by DirectRunner.")
parser.add_argument(
    "--region",
    default=None,
    help="GCP region. Ignored by DirectRunner.")
parser.add_argument(
    "--max_vocab_size",
    type=int,
    default=10_000_000,
    help="Maximum vocabulary size; categorical features are converted to "
    "integer and taken modulo this value.")
args = parser.parse_args()

NUM_NUMERIC_FEATURES = 13

NUMERIC_FEATURE_KEYS = [
    f"int-feature-{x + 1}" for x in range(NUM_NUMERIC_FEATURES)]
CATEGORICAL_FEATURE_KEYS = [
    f"categorical-feature-{x}" for x in range(NUM_NUMERIC_FEATURES + 1, 40)]
LABEL_KEY = "clicked"

# Data is first preprocessed in pure Apache Beam using numpy.
# This removes missing values and hexadecimal-encoded values.
# For the TF schema, we can thus specify the schema as FixedLenFeature
# for TensorFlow Transform.
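# As an illustration (the values below are made up), a raw TSV record such as
#     "1<TAB>5<TAB>-3<TAB>...<TAB>68fd1e64"
# flows through the Beam stages defined further down roughly as:
#     FillMissing:       empty fields -> "0"
#     NegsToZeroLog:     5 -> log(5 + 1) ~= 1.7918; -3 -> 0 -> log(1) = 0.0
#     HexToIntModRange:  "68fd1e64" -> int("68fd1e64", 16) % max_vocab_size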
FEATURE_SPEC = dict([(name, tf.io.FixedLenFeature([], dtype=tf.int64))
                     for name in CATEGORICAL_FEATURE_KEYS] +
                    [(name, tf.io.FixedLenFeature([], dtype=tf.float32))
                     for name in NUMERIC_FEATURE_KEYS] +
                    [(LABEL_KEY, tf.io.FixedLenFeature([], tf.float32))])
INPUT_METADATA = dataset_metadata.DatasetMetadata(
    schema_utils.schema_from_feature_spec(FEATURE_SPEC))


def apply_vocab_fn(inputs):
  """Preprocessing fn for sparse features.

  Applies vocab to bucketize sparse features. This function operates using
  previously-created vocab files.

  Pre-condition: Full vocab has been materialized.

  Args:
    inputs: Input features to transform.

  Returns:
    Output dict with transformed features.
  """
  outputs = {}

  outputs[LABEL_KEY] = inputs[LABEL_KEY]
  for key in NUMERIC_FEATURE_KEYS:
    outputs[key] = inputs[key]
  for idx, key in enumerate(CATEGORICAL_FEATURE_KEYS):
    vocab_filename = os.path.join(
        args.temp_dir, "tftransform_tmp", "feature_{}_vocab".format(idx))
    outputs[key] = tft.apply_vocabulary(inputs[key], vocab_filename)

  return outputs


def compute_vocab_fn(inputs):
  """Preprocessing fn for sparse features.

  This function computes unique IDs for the sparse features. We rely on
  implicit behavior which writes the vocab files to the vocab_filename
  specified in tft.compute_and_apply_vocabulary.

  Pre-condition: Sparse features have been converted to integer and mod'ed
  with args.max_vocab_size.

  Args:
    inputs: Input features to transform.

  Returns:
    Output dict with transformed features.
  """
  outputs = {}

  outputs[LABEL_KEY] = inputs[LABEL_KEY]
  for key in NUMERIC_FEATURE_KEYS:
    outputs[key] = inputs[key]
  for idx, key in enumerate(CATEGORICAL_FEATURE_KEYS):
    outputs[key] = tft.compute_and_apply_vocabulary(
        x=inputs[key],
        vocab_filename="feature_{}_vocab".format(idx))

  return outputs


class FillMissing(beam.DoFn):
  """Fills missing elements with the zero string value."""

  def process(self, element):
    elem_list = element.split(args.csv_delimiter)
    out_list = []
    for val in elem_list:
      new_val = "0" if not val else val
      out_list.append(new_val)
    yield args.csv_delimiter.join(out_list)


class NegsToZeroLog(beam.DoFn):
  """For int features, sets negative values to zero and takes log(x+1)."""

  def process(self, element):
    elem_list = element.split(args.csv_delimiter)
    out_list = []
    for i, val in enumerate(elem_list):
      # Index 0 is the label; indices 1..NUM_NUMERIC_FEATURES are numeric.
      if 0 < i <= NUM_NUMERIC_FEATURES:
        new_val = "0" if int(val) < 0 else val
        new_val = np.log(int(new_val) + 1)
        new_val = str(new_val)
      else:
        new_val = val
      out_list.append(new_val)
    yield args.csv_delimiter.join(out_list)


class HexToIntModRange(beam.DoFn):
  """Converts hex categorical values to ints, modulo args.max_vocab_size."""

  def process(self, element):
    elem_list = element.split(args.csv_delimiter)
    out_list = []
    for i, val in enumerate(elem_list):
      if i > NUM_NUMERIC_FEATURES:
        new_val = int(val, 16) % args.max_vocab_size
      else:
        new_val = val
      out_list.append(str(new_val))
    # Emit bytes: the downstream CSV TFXIO source consumes raw byte records.
    yield args.csv_delimiter.join(out_list).encode("utf-8")


def transform_data(data_path, output_path):
  """Preprocesses Criteo data.

  Two processing modes are supported. Raw data will require two passes. If
  full vocab files already exist, only one pass is necessary.

  Args:
    data_path: File(s) to read.
    output_path: Path to which output CSVs are written, if necessary.
  """
  preprocessing_fn = compute_vocab_fn if args.vocab_gen_mode else apply_vocab_fn

  gcp_project = args.project
  region = args.region

  job_name = (f"criteo-preprocessing-"
              f"{datetime.datetime.now().strftime('%y%m%d-%H%M%S')}")

  # Set up the Beam pipeline.
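  # DataflowRunner requires --project and --region plus staging/temp
  # locations (derived from output_path below); DirectRunner runs locally,
  # multi-threaded, and ignores those flags.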
  pipeline_options = None
  if args.runner == "DataflowRunner":
    options = {
        "staging_location": os.path.join(output_path, "tmp", "staging"),
        "temp_location": os.path.join(output_path, "tmp"),
        "job_name": job_name,
        "project": gcp_project,
        "save_main_session": True,
        "region": region,
        "setup_file": "./setup.py",
    }
    pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)
  elif args.runner == "DirectRunner":
    pipeline_options = beam.options.pipeline_options.DirectOptions(
        direct_num_workers=os.cpu_count(),
        direct_running_mode="multi_threading")

  with beam.Pipeline(args.runner, options=pipeline_options) as pipeline:
    with tft_beam.Context(temp_dir=args.temp_dir):
      processed_lines = (
          pipeline
          # Read in TSV data.
          | beam.io.ReadFromText(data_path, coder=beam.coders.StrUtf8Coder())
          # Fill in missing elements with the defaults (zeros).
          | "FillMissing" >> beam.ParDo(FillMissing())
          # For numerical features, set negatives to zero. Then take log(x+1).
          | "NegsToZeroLog" >> beam.ParDo(NegsToZeroLog())
          # For categorical features, mod the values with vocab size.
          | "HexToIntModRange" >> beam.ParDo(HexToIntModRange()))

      # CSV reader: list the columns in order, since the dataset schema is
      # not ordered.
      ordered_columns = (
          [LABEL_KEY] + NUMERIC_FEATURE_KEYS + CATEGORICAL_FEATURE_KEYS)

      csv_tfxio = tfxio.BeamRecordCsvTFXIO(
          physical_format="text",
          column_names=ordered_columns,
          delimiter=args.csv_delimiter,
          schema=INPUT_METADATA.schema)

      converted_data = (
          processed_lines
          | "DecodeData" >> csv_tfxio.BeamSource())

      raw_dataset = (converted_data, csv_tfxio.TensorAdapterConfig())

      # The TFXIO input format is chosen for improved performance.
      transformed_dataset, _ = (
          raw_dataset
          | tft_beam.AnalyzeAndTransformDataset(
              preprocessing_fn, output_record_batches=False))

      # The transformed schema is needed below to construct the CSV coder.
      transformed_data, transformed_metadata = transformed_dataset

      if not args.vocab_gen_mode:
        # Write to CSV.
        transformed_csv_coder = tft.coders.CsvCoder(
            ordered_columns,
            transformed_metadata.schema,
            delimiter=args.csv_delimiter)
        _ = (
            transformed_data
            | "EncodeDataCsv" >> beam.Map(transformed_csv_coder.encode)
            | "WriteDataCsv" >> beam.io.WriteToText(output_path))


if __name__ == "__main__":
  logging.set_verbosity(logging.INFO)
  transform_data(data_path=args.input_path, output_path=args.output_path)
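# Example of the two-pass invocation described in the module docstring. The
# script name and bucket paths are hypothetical; substitute your own:
#
#   python criteo_preprocess.py \
#     --input_path="gs://my-bucket/criteo/raw/*" \
#     --temp_dir="gs://my-bucket/criteo/tmp" \
#     --output_path="gs://my-bucket/criteo/vocab_pass" \
#     --vocab_gen_mode
#
#   python criteo_preprocess.py \
#     --input_path="gs://my-bucket/criteo/raw/train/*" \
#     --temp_dir="gs://my-bucket/criteo/tmp" \
#     --output_path="gs://my-bucket/criteo/out/train"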