# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""
Global flags for aot autograd
"""

import os
import sys
from typing import TYPE_CHECKING

# Converts torch rng ops to their functional philox rng equivalents. Note that
# we functionalize only CUDA rng ops today.
functionalize_rng_ops = False

# can be useful for debugging if we are incorrectly creating meta fake tensors
fake_tensor_allow_meta = os.environ.get("FAKE_ALLOW_META", True)

# Enables optional asserts in hotpath code to check for errors. If
# you are seeing weird accuracy problems, try turning this on.
# This is currently off by default as it will harm tracing time,
# but it is on by default for aot_eager.
debug_assert = False
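
# Hedged usage sketch (commented out; not executed on import): these flags can be
# set directly on this module, or overridden in a scope via the `patch` helper
# that install_config_module() adds at the bottom of this file. `model` and
# `inputs` below are placeholders.
#
#   import torch
#   import torch._functorch.config as functorch_config
#
#   functorch_config.debug_assert = True             # process-wide override
#   with functorch_config.patch(debug_assert=True):  # scoped override
#       out = torch.compile(model)(*inputs)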

debug_partitioner = os.environ.get("AOT_PARTITIONER_DEBUG", False)

# Today, if you are in a situation where there is "false aliasing"
# (e.g. you have a bunch of model parameters that all alias the same underlying buffer),
# our checks for this situation are very slow if these inputs have dynamic shapes.
# This config is set to ensure that there aren't too many aliased inputs in this situation,
# so that we error loudly instead of compiling forever.
# Eventually, we should make these checks faster.
# For now, however, you can simply turn off dynamic shapes by marking your inputs
# static when you run into this situation (see the commented sketch below).
_max_aliased_inputs_with_dynamic_shapes_enabled = 5
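
# Hedged sketch of that workaround (commented out; assumes torch._dynamo.mark_static
# is available in your build; `aliased_params` is a placeholder for the offending
# inputs):
#
#   import torch._dynamo
#
#   for p in aliased_params:
#       torch._dynamo.mark_static(p)  # opt these inputs out of dynamic shapes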

static_weight_shapes = True

# Applies CSE to the graph before partitioning
cse = True

# When AOTAutograd regenerates aliased graph outputs,
# attempt to use functionalization's view-replay logic
# before falling back to the autograd engine's view replay or as_strided.
# This can have some perf implications
# (although for many models this will not matter):
# (1) If you have many view ops chained together, replaying all of them
#     at runtime can have more overhead compared to a single as_strided call.
# (2) If you are doing training, AsStridedBackward is quite slow,
#     and the individual view op backward formulas will likely be faster.
# (3) Some backends like XLA do not support as_strided.
# Temporary hack: disable this flag for internal (fbcode) builds
# (needed to fix an internal issue while avoiding bumping the XLA pin).
# Eventually: either default this config to False completely
# once the XLA pin update works,
# or default the config to True and fix the relevant bugs.
from torch._inductor.config import is_fbcode

view_replay_for_aliased_outputs = not is_fbcode()

# Restricts the amount of computation AOTAutograd can do.
# NB: We have essentially disabled this heuristic now. However, this is kept
# here for now in case it's useful. Setting it low can artificially reduce the
# amount of recomputation AOTAutograd performs, although not in any kind of
# principled way.
max_dist_from_bw = 1000

# Bans recomputation of nodes that read from nodes that are far before
# the current node.
ban_recompute_used_far_apart = True

# Breaks up long chains of fusible ops, as otherwise we can have an arbitrarily
# long chain of recomputation in the backwards pass.
ban_recompute_long_fusible_chains = True

# Bans recomputation of nodes that must be materialized in the backwards pass
# (used by a non-fusible node).
ban_recompute_materialized_backward = True

# Chooses to ban recomputation of nodes based off an allowlist. Setting it to
# False changes it to use a denylist. The main change is for operators like
# sort/pool, which aren't cheap enough to be fusible for free but also aren't
# that expensive.
ban_recompute_not_in_allowlist = True

# Chooses to ban recomputation of reductions. This is generally a good idea, as
# the result of a reduction is generally very small but recomputing reductions
# in a fusion can be expensive.
ban_recompute_reductions = True

# By default, the partitioner purely tries to optimize for runtime (although
# it should always use less memory than eager).
# This knob lets the partitioner make that tradeoff for you, choosing the
# fastest option whose saved activations fit within the memory budget.
# Specifically, 0.0 corresponds to the activation memory from applying
# activation checkpointing to the full compiled region, and 1.0 corresponds to
# the activation memory from the default runtime-optimized strategy. So, 0.4
# would result in a strategy that saves 40% of the activations compared to the
# default strategy.
# It solves a 0-1 knapsack to find the minimum recompute necessary to stay below
# the activation memory budget.
# NOTE: This *cannot* be treated as
activation_memory_budget = 1.0
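
# Worked illustration of the scale above (numbers are made up): if the default
# runtime-optimized strategy stores 10 GiB of activations, a budget of 0.4
# targets a plan that stores roughly 4 GiB (40% of the default), while 0.0
# corresponds to checkpointing the entire compiled region.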

# This controls how we estimate the runtime when deciding which operators are
# cheapest to recompute. The 3 options are:
# "flops": Bases it off of the flop count provided by torch.utils.flop_counter
# "profile": Benchmarks each operator to come up with a runtime
# "testing": Returns 1 for everything
activation_memory_budget_runtime_estimator = "flops"

# This controls the solver used for the 0-1 knapsack. By default we use a
# quantized DP solution ("dp"). The other options are "greedy" and "ilp"
# (the latter has a scipy dependency).
activation_memory_budget_solver = "dp"
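
# Hedged usage sketch (commented out; `model` is a placeholder): overriding the
# memory-budget settings together via the `patch` helper added by
# install_config_module() at the bottom of this file.
#
#   import torch
#   import torch._functorch.config as functorch_config
#
#   with functorch_config.patch(
#       activation_memory_budget=0.4,
#       activation_memory_budget_runtime_estimator="profile",
#       activation_memory_budget_solver="greedy",
#   ):
#       compiled_model = torch.compile(model)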

# This dumps out a png visualization of the expected runtime vs. activation
# memory tradeoffs for all memory budget values from 0 to 1 in increments of
# 0.5. See an example here:
# https://github.com/pytorch/pytorch/pull/126320#discussion_r1625104015
visualize_memory_budget_pareto = (
    os.environ.get("PARTITIONER_MEMORY_BUDGET_PARETO", "0") == "1"
)
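
# Hedged usage note: because this reads the environment at import time, the
# variable must be set before this module is imported (in practice, before
# importing torch), e.g.:
#
#   PARTITIONER_MEMORY_BUDGET_PARETO=1 python train_repro.py
#
# (train_repro.py is a placeholder script name.)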

# Sets all of the ban_recompute heuristics to False except ban_recompute_reductions.
# Generally, this will probably result in some memory improvement, but at the
# cost of some performance.
aggressive_recomputation = False

# If FakeTensor.data_ptr() should error.
# This option is independent of AOTAutograd and torch.compile, but our policy
# is to turn it off during torch.compile.
fake_tensor_allow_unsafe_data_ptr_access = True

# Unlifts effect tokens from the inputs/outputs in the traced graph and instead
# inserts make_token/sink_token calls in the graph to create tokens and then
# sink them at the end. Note that this means the graph is no longer functional,
# which may lead to silent errors unless the backend knows how to handle the
# tokens.
unlift_effect_tokens = False

# This mode specifies that we should also keep track of the real
# tensor along with the fake tensor, and do real compute. While
# seemingly this eliminates the whole point of fake tensors, there are
# two obvious use cases for it:
#
# 1. When users call item()/other data dependent operations,
#    if we propagate_real_tensors we are able to determine what
#    the true value is and keep going.
#
# 2. It can be useful for testing, when you want to see if the fake
#    and real tensors agree with each other. (Note that there are
#    currently known inaccuracies in how we clone real tensors, which
#    would have to be tightened up for this to be useful in this
#    case.)
#
# Note that fake tensors are typically understood to be cheap to store
# indefinitely, so we tend to hold on to them longer than we would
# hold onto the real tensors. So we also support explicitly
# deallocating the real tensor associated with a fake tensor, at which
# point we will stop propagating real tensors.
#
# One more thing: when you provide a real tensor to fakeify, we will
# clone it, so that we can safely perform mutations on it if necessary.
# This will increase live memory usage. This could potentially be
# optimized by using COW. We also currently do not faithfully
# maintain autograd metadata on the real tensor; this is fine because
# AOTAutograd will only use the fake tensor to determine leafness/etc.
# of the tensors in question.
fake_tensor_propagate_real_tensors = False
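
# Hedged sketch (commented out): one way to turn real-tensor propagation on for
# a single trace via the config patch helper. Whether a given tracing entry
# point honors this flag is version-dependent; `fn` and `example_input` are
# placeholders.
#
#   with torch._functorch.config.patch(fake_tensor_propagate_real_tensors=True):
#       ep = torch.export.export(fn, (example_input,))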

# Controls the default graph output format used by draw_graph.
# Supported formats are defined here: https://graphviz.org/docs/outputs/
torch_compile_graph_format = os.environ.get("TORCH_COMPILE_GRAPH_FORMAT", "svg")
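
# Hedged usage note: similarly read at import time, so set it before importing
# torch; e.g. to have draw_graph emit PNG instead of SVG:
#
#   TORCH_COMPILE_GRAPH_FORMAT=png python repro.py
#
# (repro.py is a placeholder; "png" is assumed to be among the graphviz output
# formats listed at the link above.)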

if TYPE_CHECKING:
    from torch.utils._config_typing import *  # noqa: F401, F403

from torch.utils._config_module import install_config_module

# adds patch, save_config, invalid config checks, etc.
install_config_module(sys.modules[__name__])