# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Common configuration settings."""

import dataclasses
from typing import Optional, Sequence, Union

from official.modeling.hyperparams import base_config
from official.modeling.optimization.configs import optimization_config
from official.modeling.privacy import configs as dp_configs

OptimizationConfig = optimization_config.OptimizationConfig

@dataclasses.dataclass
class DataConfig(base_config.Config):
  """The base configuration for building datasets.

  Attributes:
    input_path: The path to the input. It can be either (1) a str indicating a
      file path/pattern, or (2) a str indicating multiple file paths/patterns
      separated by comma (e.g. "a, b, c", or "a,b,c" with no spaces), or (3) a
      list of str, each of which is a file path/pattern or multiple file
      paths/patterns separated by comma, or (4) a dictionary of the previous
      three approaches for more advanced data mixing using named access. It
      should not be specified when the following `tfds_name` is specified.
    tfds_name: The name of the tensorflow dataset (TFDS). It should not be
      specified when the above `input_path` is specified.
    tfds_split: A str indicating which split of the data to load from TFDS. It
      is required when the above `tfds_name` is specified.
    global_batch_size: The global batch size across all replicas.
    is_training: Whether this data is used for training or not. This flag is
      useful for consumers of this object to determine whether the data should
      be repeated or shuffled.
    drop_remainder: Whether the last batch should be dropped in the case it
      has fewer than `global_batch_size` elements.
    shuffle_buffer_size: The buffer size used for shuffling training data.
    cache: Whether to cache dataset examples. If `True`, we will cache the
      dataset after applying the decode_fn and parse_fn. It can be used to
      avoid re-reading from disk, re-decoding and re-parsing the example on
      the second epoch, but it requires significant memory overhead.
    cycle_length: The number of files that will be processed concurrently when
      interleaving files.
    block_length: The number of consecutive elements to produce from each
      input element before cycling to another input element when interleaving
      files.
    deterministic: A boolean controlling whether determinism should be
      enforced.
    sharding: Whether sharding is used in the input pipeline.
    enable_tf_data_service: A boolean indicating whether to enable tf.data
      service for the input pipeline.
    tf_data_service_address: The URI of a tf.data service to offload
      preprocessing onto during training. The URI should be in the format
      "protocol://address", e.g. "grpc://tf-data-service:5050". It can be
      overridden by the `FLAGS.tf_data_service` flag in the binary.
    tf_data_service_job_name: The name of the tf.data service job. This
      argument makes it possible for multiple datasets to share the same job.
      The default behavior is that the dataset creates anonymous, exclusively
      owned jobs.
    tfds_data_dir: A str specifying the directory to read/write TFDS data.
    tfds_as_supervised: A bool. When loading a dataset from TFDS, if True, the
      returned tf.data.Dataset will have a 2-tuple structure (input, label)
      according to builder.info.supervised_keys; if False, the default, the
      returned tf.data.Dataset will have a dictionary with all the features.
    tfds_skip_decoding_feature: A str indicating which features to skip
      decoding when loading a dataset from TFDS. Use comma to separate
      multiple features. The main use case is to skip the image/video decoding
      for better performance.
    enable_shared_tf_data_service_between_parallel_trainers: A bool. When set
      to true, only a single tf.data service will be started, and it will be
      shared between all the trainers running simultaneously, e.g. when using
      vizier to tune hyperparameters. This will save CPU and RAM resources
      compared to running a separate tf.data service for each trainer. Note
      that if the batch size differs between trainers, the field
      `apply_tf_data_service_before_batching` also needs to be true so that
      only a single tf.data service instance will be created. In this case,
      tf.data service will be applied before the batching operation, so make
      sure not to apply any processing steps after batching (e.g. in
      postprocess_fn), since they would not be parallelized by tf.data service
      and may slow down your tf.data pipeline. When using a shared tf.data
      service, the tf.data dataset must be infinite, and a slow trainer may
      skip certain training examples. More details about shared tf.data
      service can be found at:
      https://www.tensorflow.org/api_docs/python/tf/data/experimental/service#sharing_tfdata_service_with_concurrent_trainers.
    apply_tf_data_service_before_batching: A bool. If set to True, tf.data
      service will be applied before the batching operation. This is useful to
      make sure only a single tf.data service instance is created when
      `enable_shared_tf_data_service_between_parallel_trainers` is true and
      the batch size is changing between parallel trainers.
    trainer_id: A string. The id of the trainer if there are multiple parallel
      trainers running at the same time, e.g. in the vizier tuning case. It
      will be set automatically if this field is needed. Users do not need to
      set it when creating experiment configs.
    seed: An optional seed to use for deterministic shuffling/preprocessing.
    prefetch_buffer_size: An int specifying the buffer size of prefetch
      datasets. If None, the buffer size is autotuned. Specifying this is
      useful in case autotuning uses up too much memory by making the buffer
      size too high.
    autotune_algorithm: If specified, use this algorithm for AUTOTUNE. See:
      https://www.tensorflow.org/api_docs/python/tf/data/experimental/AutotuneAlgorithm
  """
  input_path: Union[Sequence[str], str, base_config.Config] = ""
  tfds_name: Union[str, base_config.Config] = ""
  tfds_split: str = ""
  global_batch_size: int = 0
  is_training: Optional[bool] = None
  drop_remainder: bool = True
  shuffle_buffer_size: int = 100
  cache: bool = False
  cycle_length: Optional[int] = None
  block_length: int = 1
  deterministic: Optional[bool] = None
  sharding: bool = True
  enable_tf_data_service: bool = False
  tf_data_service_address: Optional[str] = None
  tf_data_service_job_name: Optional[str] = None
  tfds_data_dir: str = ""
  tfds_as_supervised: bool = False
  tfds_skip_decoding_feature: str = ""
  enable_shared_tf_data_service_between_parallel_trainers: bool = False
  apply_tf_data_service_before_batching: bool = False
  trainer_id: Optional[str] = None
  seed: Optional[int] = None
  prefetch_buffer_size: Optional[int] = None
  autotune_algorithm: Optional[str] = None
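
# A minimal usage sketch with hypothetical paths and values (not part of the
# library): builds a training DataConfig using form (2) of `input_path`
# described in the docstring above, a comma-separated set of file patterns.
#
#   train_data = DataConfig(
#       input_path="train-a*.tfrecord,train-b*.tfrecord",  # hypothetical
#       global_batch_size=256,
#       is_training=True,
#       shuffle_buffer_size=10000)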

@dataclasses.dataclass
class RuntimeConfig(base_config.Config):
  """High-level configurations for the runtime.

  These include parameters that are not directly related to the experiment,
  e.g. directories, accelerator type, etc.

  Attributes:
    distribution_strategy: e.g. 'mirrored', 'tpu', etc.
    enable_xla: Whether or not to enable XLA.
    per_gpu_thread_count: Thread count per GPU.
    gpu_thread_mode: Whether and how the GPU device uses its own threadpool.
    dataset_num_private_threads: Number of threads for a private threadpool
      created for all datasets computation.
    tpu: The address of the TPU to use, if any.
    num_gpus: The number of GPUs to use, if any.
    worker_hosts: A comma-separated list of worker ip:port pairs for running
      multi-worker models with DistributionStrategy.
    task_index: If multi-worker training, the task index of this worker.
    all_reduce_alg: Defines the algorithm for performing all-reduce.
    num_packs: Sets `num_packs` in the cross device ops used in
      MirroredStrategy. For details, see tf.distribute.NcclAllReduce.
    mixed_precision_dtype: The dtype of the mixed precision policy. It can be
      'float32', 'float16', or 'bfloat16'.
    loss_scale: The type of loss scale, or a float value. This is used when
      setting the mixed precision policy.
    run_eagerly: Whether or not to run the experiment eagerly.
    batchnorm_spatial_persistent: Whether or not to enable the spatial
      persistent mode for the CuDNN batch norm kernel for improved GPU
      performance.
  """
  distribution_strategy: str = "mirrored"
  enable_xla: bool = False
  gpu_thread_mode: Optional[str] = None
  dataset_num_private_threads: Optional[int] = None
  per_gpu_thread_count: int = 0
  tpu: Optional[str] = None
  num_gpus: int = 0
  worker_hosts: Optional[str] = None
  task_index: int = -1
  all_reduce_alg: Optional[str] = None
  num_packs: int = 1
  mixed_precision_dtype: Optional[str] = None
  loss_scale: Optional[Union[str, float]] = None
  run_eagerly: bool = False
  batchnorm_spatial_persistent: bool = False

  # XLA runtime params.
  # XLA params are only applied to the train_step.
  # These arguments can improve training speed. They can also improve eval,
  # but may reduce usability and users would need to make changes to code.

  # Whether to enable the XLA dynamic padder infrastructure to handle dynamic
  # shape inputs inside XLA. True by default. Disabling this may cause
  # correctness issues with dynamic shape inputs, as XLA will just assume the
  # inputs have padded shapes. However, users can optionally set it to False
  # to improve device time if masking is already handled on the user side.
  # If None, will respect the XLA default.
  tpu_enable_xla_dynamic_padder: Optional[bool] = None

  # Global model parallelism configurations.
  num_cores_per_replica: int = 1
  default_shard_dim: int = -1
  use_tpu_mp_strategy: bool = False

  def model_parallelism(self):
    return dict(
        num_cores_per_replica=self.num_cores_per_replica,
        default_shard_dim=self.default_shard_dim)
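
# A minimal usage sketch with hypothetical values (not part of the library):
# a single-host, multi-GPU run with a float16 mixed precision policy.
# "dynamic" is a commonly used string value for `loss_scale`; a float also
# satisfies the field's type.
#
#   runtime = RuntimeConfig(
#       distribution_strategy="mirrored",
#       num_gpus=4,
#       mixed_precision_dtype="float16",
#       loss_scale="dynamic")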

@dataclasses.dataclass
class TrainerConfig(base_config.Config):
  """Configuration for the trainer.

  Attributes:
    optimizer_config: Optimizer config; it includes optimizer, learning rate,
      and warmup schedule configs.
    train_tf_while_loop: Whether or not to use a tf while loop for training.
    train_tf_function: Whether or not to use tf_function for the training
      loop.
    eval_tf_function: Whether or not to use tf_function for eval.
    eval_tf_while_loop: Whether or not to use a tf while loop for eval.
    allow_tpu_summary: Whether to allow summaries to happen inside the XLA
      program that runs on TPU, through automatic outside compilation.
    steps_per_loop: Number of steps per loop to report training metrics. This
      can also be used to reduce host-worker communication in a TPU setup.
    summary_interval: Number of steps between each summary.
    checkpoint_interval: Number of steps between checkpoints.
    max_to_keep: Max checkpoints to keep.
    continuous_eval_timeout: Maximum number of seconds to wait between
      checkpoints. If set to None, continuous eval will wait indefinitely.
      This is only used in the continuous_train_and_eval and continuous_eval
      modes. The default value is 1 hour.
    train_steps: Number of train steps.
    validation_steps: Number of eval steps. If -1, the entire eval dataset is
      used.
    validation_interval: Number of training steps to run between evaluations.
    best_checkpoint_export_subdir: If set, the trainer will keep track of the
      best evaluation metric, and export the corresponding best checkpoint
      under `model_dir/best_checkpoint_export_subdir`. Note that this only
      works if the mode contains eval (such as `train_and_eval`,
      `continuous_eval`, and `continuous_train_and_eval`).
    best_checkpoint_eval_metric: For exporting the best checkpoint, which
      evaluation metric the trainer should monitor. This can be any evaluation
      metric that appears on TensorBoard.
    best_checkpoint_metric_comp: For exporting the best checkpoint, how the
      trainer should compare the evaluation metrics. This can be either
      `higher` (the higher the better) or `lower` (the lower the better).
    validation_summary_subdir: A str, the subdirectory for saving eval
      summaries.
    preemption_on_demand_checkpoint: Whether or not to save on-demand
      checkpoints after a preemption.
  """
  optimizer_config: OptimizationConfig = dataclasses.field(
      default_factory=OptimizationConfig
  )
  # Orbit settings.
  train_tf_while_loop: bool = True
  train_tf_function: bool = True
  eval_tf_function: bool = True
  eval_tf_while_loop: bool = False
  allow_tpu_summary: bool = False
  # Trainer intervals.
  steps_per_loop: int = 1000
  summary_interval: int = 1000
  checkpoint_interval: int = 1000
  # Checkpoint manager.
  max_to_keep: int = 5
  continuous_eval_timeout: int = 60 * 60
  # Train/Eval routines.
  train_steps: int = 0
  # Set validation_steps to -1 to evaluate the entire dataset.
  validation_steps: int = -1
  validation_interval: int = 1000
  # Best checkpoint export.
  best_checkpoint_export_subdir: str = ""
  best_checkpoint_eval_metric: str = ""
  best_checkpoint_metric_comp: str = "higher"
  # Blowup recovery.
  loss_upper_bound: float = 1e6
  recovery_begin_steps: int = 0  # Enforce the loss bound after these steps.
  # When max trials < 0, there is no recovery module; when max trials == 0, we
  # will check the condition and fail the job if the condition happens; when
  # max trials > 0, we will restore the model states.
  recovery_max_trials: int = 0
  validation_summary_subdir: str = "validation"
  # Preemption on-demand checkpoint.
  preemption_on_demand_checkpoint: bool = True  # copybara-replace
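
# A minimal usage sketch with hypothetical values (not part of the library):
# train for 100k steps, evaluate every 5k steps, and export the best
# checkpoint by a metric named "accuracy" (a hypothetical name; use whichever
# metric your task reports to TensorBoard).
#
#   trainer = TrainerConfig(
#       train_steps=100_000,
#       validation_interval=5_000,
#       checkpoint_interval=5_000,
#       best_checkpoint_export_subdir="best_ckpt",
#       best_checkpoint_eval_metric="accuracy",
#       best_checkpoint_metric_comp="higher")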

@dataclasses.dataclass
class TaskConfig(base_config.Config):
  """Config passed to task."""

  init_checkpoint: str = ""
  model: Optional[base_config.Config] = None
  train_data: DataConfig = dataclasses.field(default_factory=DataConfig)
  validation_data: DataConfig = dataclasses.field(default_factory=DataConfig)
  name: Optional[str] = None
  # Configs for differential privacy.
  # These configs are only effective if you use create_optimizer in
  # tensorflow_models/official/core/base_task.py.
  # DEPRECATED b/264611883
  differential_privacy_config: Optional[
      dp_configs.DifferentialPrivacyConfig] = None
  # Whether to show image summaries. Useful for visualizing model predictions.
  # Only works for vision tasks.
  allow_image_summary: bool = False

@dataclasses.dataclass
class ExperimentConfig(base_config.Config):
  """Top-level configuration."""

  task: TaskConfig = dataclasses.field(default_factory=TaskConfig)
  trainer: TrainerConfig = dataclasses.field(default_factory=TrainerConfig)
  runtime: RuntimeConfig = dataclasses.field(default_factory=RuntimeConfig)
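
# A minimal composition sketch with hypothetical values (not part of the
# library): the three sub-configs above nest under ExperimentConfig. The
# `override()` call assumes the dict-based override method that
# base_config.Config inherits from ParamsDict.
#
#   experiment = ExperimentConfig(
#       task=TaskConfig(
#           train_data=DataConfig(
#               input_path="train*.tfrecord",  # hypothetical pattern
#               global_batch_size=256,
#               is_training=True)),
#       trainer=TrainerConfig(train_steps=100_000),
#       runtime=RuntimeConfig(num_gpus=4))
#   experiment.override({"trainer": {"train_steps": 10_000}})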