diff --git a/MLPY/Lib/site-packages/mlagents/__init__.py b/MLPY/Lib/site-packages/mlagents/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/MLPY/Lib/site-packages/mlagents/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e1fc331dcaa50d42d451399c27253555544bff6e Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/plugins/__init__.py b/MLPY/Lib/site-packages/mlagents/plugins/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b63a39732d8027e1ab6c893725b3c6912af161d3 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/plugins/__init__.py @@ -0,0 +1,8 @@ +from typing import Dict, Any + +ML_AGENTS_STATS_WRITER = "mlagents.stats_writer" +ML_AGENTS_TRAINER_TYPE = "mlagents.trainer_type" + +# TODO: the real type is Dict[str, HyperparamSettings] +all_trainer_types: Dict[str, Any] = {} +all_trainer_settings: Dict[str, Any] = {} diff --git a/MLPY/Lib/site-packages/mlagents/plugins/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/plugins/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a459379f72ef09fd864d2a2e2c1b54b40d2695a5 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/plugins/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/plugins/__pycache__/stats_writer.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/plugins/__pycache__/stats_writer.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e18dfb26ce23cb88d269c779720c333794ba811e Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/plugins/__pycache__/stats_writer.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/plugins/__pycache__/trainer_type.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/plugins/__pycache__/trainer_type.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..286875aa9a2ce89acbea53f6e8a6e20b6f49dba6 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/plugins/__pycache__/trainer_type.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/plugins/stats_writer.py b/MLPY/Lib/site-packages/mlagents/plugins/stats_writer.py new file mode 100644 index 0000000000000000000000000000000000000000..17acefd32e25f0a9e49b3f89fc4f90733f8a9495 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/plugins/stats_writer.py @@ -0,0 +1,72 @@ +import sys +from typing import List + +# importlib.metadata is new in python3.8 +# We use the backport for older python versions. 
+if sys.version_info < (3, 8): + import importlib_metadata +else: + import importlib.metadata as importlib_metadata # pylint: disable=E0611 + +from mlagents.trainers.stats import StatsWriter + +from mlagents_envs import logging_util +from mlagents.plugins import ML_AGENTS_STATS_WRITER +from mlagents.trainers.settings import RunOptions +from mlagents.trainers.stats import TensorboardWriter, GaugeWriter, ConsoleWriter + + +logger = logging_util.get_logger(__name__) + + +def get_default_stats_writers(run_options: RunOptions) -> List[StatsWriter]: + """ + The StatsWriters that mlagents-learn always uses: + * A TensorboardWriter to write information to TensorBoard + * A GaugeWriter to record our internal stats + * A ConsoleWriter to output to stdout. + """ + checkpoint_settings = run_options.checkpoint_settings + return [ + TensorboardWriter( + checkpoint_settings.write_path, + clear_past_data=not checkpoint_settings.resume, + hidden_keys=["Is Training", "Step"], + ), + GaugeWriter(), + ConsoleWriter(), + ] + + +def register_stats_writer_plugins(run_options: RunOptions) -> List[StatsWriter]: + """ + Registers all StatsWriter plugins (including the default one), + and evaluates them, and returns the list of all the StatsWriter implementations. + """ + all_stats_writers: List[StatsWriter] = [] + if ML_AGENTS_STATS_WRITER not in importlib_metadata.entry_points(): + logger.warning( + f"Unable to find any entry points for {ML_AGENTS_STATS_WRITER}, even the default ones. " + "Uninstalling and reinstalling ml-agents via pip should resolve. " + "Using default plugins for now." + ) + return get_default_stats_writers(run_options) + + entry_points = importlib_metadata.entry_points()[ML_AGENTS_STATS_WRITER] + + for entry_point in entry_points: + + try: + logger.debug(f"Initializing StatsWriter plugins: {entry_point.name}") + plugin_func = entry_point.load() + plugin_stats_writers = plugin_func(run_options) + logger.debug( + f"Found {len(plugin_stats_writers)} StatsWriters for plugin {entry_point.name}" + ) + all_stats_writers += plugin_stats_writers + except BaseException: + # Catch all exceptions from setting up the plugin, so that bad user code doesn't break things. + logger.exception( + f"Error initializing StatsWriter plugins for {entry_point.name}. This plugin will not be used." + ) + return all_stats_writers diff --git a/MLPY/Lib/site-packages/mlagents/plugins/trainer_type.py b/MLPY/Lib/site-packages/mlagents/plugins/trainer_type.py new file mode 100644 index 0000000000000000000000000000000000000000..2766368863caca5e74182c02a94429f09ba3c148 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/plugins/trainer_type.py @@ -0,0 +1,80 @@ +import sys +from typing import Dict, Tuple, Any + +# importlib.metadata is new in python3.8 +# We use the backport for older python versions. 
+if sys.version_info < (3, 8): + import importlib_metadata +else: + import importlib.metadata as importlib_metadata # pylint: disable=E0611 + + +from mlagents_envs import logging_util +from mlagents.plugins import ML_AGENTS_TRAINER_TYPE +from mlagents.trainers.ppo.trainer import PPOTrainer +from mlagents.trainers.sac.trainer import SACTrainer +from mlagents.trainers.poca.trainer import POCATrainer +from mlagents.trainers.ppo.optimizer_torch import PPOSettings +from mlagents.trainers.sac.optimizer_torch import SACSettings +from mlagents.trainers.poca.optimizer_torch import POCASettings +from mlagents import plugins as mla_plugins + +logger = logging_util.get_logger(__name__) + + +def get_default_trainer_types() -> Tuple[Dict[str, Any], Dict[str, Any]]: + """ + The Trainers that mlagents-learn always uses: + """ + + mla_plugins.all_trainer_types.update( + { + PPOTrainer.get_trainer_name(): PPOTrainer, + SACTrainer.get_trainer_name(): SACTrainer, + POCATrainer.get_trainer_name(): POCATrainer, + } + ) + # global all_trainer_settings + mla_plugins.all_trainer_settings.update( + { + PPOTrainer.get_trainer_name(): PPOSettings, + SACTrainer.get_trainer_name(): SACSettings, + POCATrainer.get_trainer_name(): POCASettings, + } + ) + + return mla_plugins.all_trainer_types, mla_plugins.all_trainer_settings + + +def register_trainer_plugins() -> Tuple[Dict[str, Any], Dict[str, Any]]: + """ + Registers all Trainer plugins (including the default one), + and evaluates them, and returns the list of all the Trainer implementations. + """ + if ML_AGENTS_TRAINER_TYPE not in importlib_metadata.entry_points(): + logger.warning( + f"Unable to find any entry points for {ML_AGENTS_TRAINER_TYPE}, even the default ones. " + "Uninstalling and reinstalling ml-agents via pip should resolve. " + "Using default plugins for now." + ) + return get_default_trainer_types() + + entry_points = importlib_metadata.entry_points()[ML_AGENTS_TRAINER_TYPE] + + for entry_point in entry_points: + + try: + logger.debug(f"Initializing Trainer plugins: {entry_point.name}") + plugin_func = entry_point.load() + plugin_trainer_types, plugin_trainer_settings = plugin_func() + logger.debug( + f"Found {len(plugin_trainer_types)} Trainers for plugin {entry_point.name}" + ) + mla_plugins.all_trainer_types.update(plugin_trainer_types) + mla_plugins.all_trainer_settings.update(plugin_trainer_settings) + except BaseException: + # Catch all exceptions from setting up the plugin, so that bad user code doesn't break things. + logger.exception( + f"Error initializing Trainer plugins for {entry_point.name}. This plugin will not be used." 
+ ) + return mla_plugins.all_trainer_types, mla_plugins.all_trainer_settings diff --git a/MLPY/Lib/site-packages/mlagents/torch_utils/__init__.py b/MLPY/Lib/site-packages/mlagents/torch_utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0acc96997d94d8d9968950e8db21c651484b8cf3 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/torch_utils/__init__.py @@ -0,0 +1,4 @@ +from mlagents.torch_utils.torch import torch as torch # noqa +from mlagents.torch_utils.torch import nn # noqa +from mlagents.torch_utils.torch import set_torch_config # noqa +from mlagents.torch_utils.torch import default_device # noqa diff --git a/MLPY/Lib/site-packages/mlagents/torch_utils/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/torch_utils/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f7235c1375171c5f9647619ff49438ee14d50c4f Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/torch_utils/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/torch_utils/__pycache__/cpu_utils.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/torch_utils/__pycache__/cpu_utils.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3446f50f861006bb3be343b899dc27c687b12119 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/torch_utils/__pycache__/cpu_utils.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/torch_utils/__pycache__/globals.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/torch_utils/__pycache__/globals.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..515fc26b31cb944de9173b14744f98fdab276230 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/torch_utils/__pycache__/globals.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/torch_utils/__pycache__/torch.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/torch_utils/__pycache__/torch.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..62fd647d9b47ccf7aac470754be1ee92fb125e19 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/torch_utils/__pycache__/torch.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/torch_utils/cpu_utils.py b/MLPY/Lib/site-packages/mlagents/torch_utils/cpu_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..7f577375703e0066d596cef16f5d4660881b55f9 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/torch_utils/cpu_utils.py @@ -0,0 +1,41 @@ +from typing import Optional + +import os + + +def get_num_threads_to_use() -> Optional[int]: + """ + Gets the number of threads to use. For most problems, 4 is all you + need, but for smaller machines, we'd like to scale to less than that. + By default, PyTorch uses 1/2 of the available cores. + """ + num_cpus = _get_num_available_cpus() + return max(min(num_cpus // 2, 4), 1) if num_cpus is not None else None + + +def _get_num_available_cpus() -> Optional[int]: + """ + Returns number of CPUs using cgroups if possible. This accounts + for Docker containers that are limited in cores. 
+ """ + period = _read_in_integer_file("/sys/fs/cgroup/cpu/cpu.cfs_period_us") + quota = _read_in_integer_file("/sys/fs/cgroup/cpu/cpu.cfs_quota_us") + share = _read_in_integer_file("/sys/fs/cgroup/cpu/cpu.shares") + is_kubernetes = os.getenv("KUBERNETES_SERVICE_HOST") is not None + + if period > 0 and quota > 0: + return int(quota // period) + elif period > 0 and share > 0 and is_kubernetes: + # In kubernetes, each requested CPU is 1024 CPU shares + # https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#how-pods-with-resource-limits-are-run + return int(share // 1024) + else: + return os.cpu_count() + + +def _read_in_integer_file(filename: str) -> int: + try: + with open(filename) as f: + return int(f.read().rstrip()) + except FileNotFoundError: + return -1 diff --git a/MLPY/Lib/site-packages/mlagents/torch_utils/globals.py b/MLPY/Lib/site-packages/mlagents/torch_utils/globals.py new file mode 100644 index 0000000000000000000000000000000000000000..99705b1067305890316036cb700845af626f9a59 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/torch_utils/globals.py @@ -0,0 +1,13 @@ +from typing import Optional + +_rank: Optional[int] = None + + +def get_rank() -> Optional[int]: + """ + Returns the rank (in the MPI sense) of the current node. + For local training, this will always be None. + If this needs to be used, it should be done from outside ml-agents. + :return: + """ + return _rank diff --git a/MLPY/Lib/site-packages/mlagents/torch_utils/torch.py b/MLPY/Lib/site-packages/mlagents/torch_utils/torch.py new file mode 100644 index 0000000000000000000000000000000000000000..24dc45cca3ff5ca99300fff7d86ffc27c0fc49b9 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/torch_utils/torch.py @@ -0,0 +1,68 @@ +import os + +from distutils.version import LooseVersion +import pkg_resources +from mlagents.torch_utils import cpu_utils +from mlagents.trainers.settings import TorchSettings +from mlagents_envs.logging_util import get_logger + + +logger = get_logger(__name__) + + +def assert_torch_installed(): + # Check that torch version 1.6.0 or later has been installed. If not, refer + # user to the PyTorch webpage for install instructions. + torch_pkg = None + try: + torch_pkg = pkg_resources.get_distribution("torch") + except pkg_resources.DistributionNotFound: + pass + assert torch_pkg is not None and LooseVersion(torch_pkg.version) >= LooseVersion( + "1.6.0" + ), ( + "A compatible version of PyTorch was not installed. Please visit the PyTorch homepage " + + "(https://pytorch.org/get-started/locally/) and follow the instructions to install. " + + "Version 1.6.0 and later are supported." + ) + + +assert_torch_installed() + +# This should be the only place that we import torch directly. 
+# Everywhere else is caught by the banned-modules setting for flake8 +import torch # noqa I201 + + +torch.set_num_threads(cpu_utils.get_num_threads_to_use()) +os.environ["KMP_BLOCKTIME"] = "0" + + +_device = torch.device("cpu") + + +def set_torch_config(torch_settings: TorchSettings) -> None: + global _device + + if torch_settings.device is None: + device_str = "cuda" if torch.cuda.is_available() else "cpu" + else: + device_str = torch_settings.device + + _device = torch.device(device_str) + + if _device.type == "cuda": + torch.set_default_tensor_type(torch.cuda.FloatTensor) + else: + torch.set_default_tensor_type(torch.FloatTensor) + logger.debug(f"default Torch device: {_device}") + + +# Initialize to default settings +set_torch_config(TorchSettings(device=None)) + +nn = torch.nn + + +def default_device(): + return _device diff --git a/MLPY/Lib/site-packages/mlagents/trainers/__init__.py b/MLPY/Lib/site-packages/mlagents/trainers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8476df42fa19998794ea2bcbe8c61682f45f5df5 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/__init__.py @@ -0,0 +1,5 @@ +# Version of the library that will be used to upload to pypi +__version__ = "0.30.0" + +# Git tag that will be checked to determine whether to trigger upload to pypi +__release_tag__ = "release_20" diff --git a/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..92fcaded558ee2337e7c708d6581b2390239b53c Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/action_info.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/action_info.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..335cd4c1c9ea86f970eae15c8f914dc45c390cea Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/action_info.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/agent_processor.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/agent_processor.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1e172529a6e1e7ec4771d30ae304ffb4bb0826df Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/agent_processor.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/behavior_id_utils.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/behavior_id_utils.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d4ca00581e6ad646ea4bac8c71e0f568a76df96c Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/behavior_id_utils.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/buffer.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/buffer.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cc366be6891c31a25958da748dcfa1b1491b6288 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/buffer.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/cli_utils.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/cli_utils.cpython-39.pyc new file mode 
100644 index 0000000000000000000000000000000000000000..1d040950c5e69fd25c8d931e37bc07b0b8f6ab9c Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/cli_utils.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/demo_loader.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/demo_loader.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..92d4e5593951b1cab2ee2e2d85a2710b34559ce0 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/demo_loader.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/directory_utils.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/directory_utils.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1d60eea7e059dca67f6bc64ebbb72f19596eb54d Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/directory_utils.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/env_manager.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/env_manager.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cbdbdbd4865c896d0e37fa2cc936dc45642a8bb1 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/env_manager.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/environment_parameter_manager.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/environment_parameter_manager.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..154850531d958f4f58ba44dd7e853ccef8864f86 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/environment_parameter_manager.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/exception.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/exception.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5afbf16ca335670aa26551916143ab1ddb6c413a Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/exception.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/learn.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/learn.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bb25f1b64c5a0df78e195cf834c81384166609dc Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/learn.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/run_experiment.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/run_experiment.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4956e2b0b085fd5b517dd41bd744a36312b96600 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/run_experiment.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/settings.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/settings.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6d79cc905dc8546cd7e38fe83a948d02e5f31df4 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/settings.cpython-39.pyc differ diff --git 
a/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/simple_env_manager.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/simple_env_manager.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eca690e9d68e59d12c319c68a9cf7603f7b6d7bf Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/simple_env_manager.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/stats.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/stats.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..38dab601aa83e3d4113ad3cf2d9aff78402c42a5 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/stats.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/subprocess_env_manager.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/subprocess_env_manager.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d2aa86d7bd7840e34c9bf8b19b77fab7e617951c Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/subprocess_env_manager.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/trainer_controller.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/trainer_controller.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..40cbcc954fc21678fd4fca10a38be9ef7fbc8934 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/trainer_controller.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/training_analytics_side_channel.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/training_analytics_side_channel.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c443c32390d6263ba3c81a5b388398aeb9fe222b Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/training_analytics_side_channel.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/training_status.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/training_status.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d60965cfd69a9615653cfddf8ec47f406a8342b9 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/training_status.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/trajectory.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/trajectory.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..50bedcdf4fa8a07cee4c6cee2f2dfae5ba137dd6 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/trajectory.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/upgrade_config.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/upgrade_config.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..38487e42cc76f6f3851293d9248d8ac8ccfd24cc Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/upgrade_config.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/action_info.py b/MLPY/Lib/site-packages/mlagents/trainers/action_info.py new file mode 100644 index 
0000000000000000000000000000000000000000..c0ec02327116119c3115891c4acea4227d3791f3 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/action_info.py @@ -0,0 +1,25 @@ +from typing import NamedTuple, Any, Dict, List +import numpy as np +from mlagents_envs.base_env import AgentId + +ActionInfoOutputs = Dict[str, np.ndarray] + + +class ActionInfo(NamedTuple): + """ + A NamedTuple containing actions and related quantities to the policy forward + pass. Additionally contains the agent ids in the corresponding DecisionStep + :param action: The action output of the policy + :param env_action: The possibly clipped action to be executed in the environment + :param outputs: Dict of all quantities associated with the policy forward pass + :param agent_ids: List of int agent ids in DecisionStep + """ + + action: Any + env_action: Any + outputs: ActionInfoOutputs + agent_ids: List[AgentId] + + @staticmethod + def empty() -> "ActionInfo": + return ActionInfo([], [], {}, []) diff --git a/MLPY/Lib/site-packages/mlagents/trainers/agent_processor.py b/MLPY/Lib/site-packages/mlagents/trainers/agent_processor.py new file mode 100644 index 0000000000000000000000000000000000000000..720f3d14bdc9060b8def61b5b8fbead30bbf375e --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/agent_processor.py @@ -0,0 +1,469 @@ +import sys +import numpy as np +from typing import List, Dict, TypeVar, Generic, Tuple, Any, Union +from collections import defaultdict, Counter +import queue +from mlagents.torch_utils import torch + +from mlagents_envs.base_env import ( + ActionTuple, + DecisionSteps, + DecisionStep, + TerminalSteps, + TerminalStep, +) +from mlagents_envs.side_channel.stats_side_channel import ( + StatsAggregationMethod, + EnvironmentStats, +) +from mlagents.trainers.exception import UnityTrainerException +from mlagents.trainers.trajectory import AgentStatus, Trajectory, AgentExperience +from mlagents.trainers.policy import Policy +from mlagents.trainers.action_info import ActionInfo, ActionInfoOutputs +from mlagents.trainers.stats import StatsReporter +from mlagents.trainers.behavior_id_utils import ( + get_global_agent_id, + get_global_group_id, + GlobalAgentId, + GlobalGroupId, +) +from mlagents.trainers.torch_entities.action_log_probs import LogProbsTuple +from mlagents.trainers.torch_entities.utils import ModelUtils + +T = TypeVar("T") + + +class AgentProcessor: + """ + AgentProcessor contains a dictionary per-agent trajectory buffers. The buffers are indexed by agent_id. + Buffer also contains an update_buffer that corresponds to the buffer used when updating the model. + One AgentProcessor should be created per agent group. + """ + + def __init__( + self, + policy: Policy, + behavior_id: str, + stats_reporter: StatsReporter, + max_trajectory_length: int = sys.maxsize, + ): + """ + Create an AgentProcessor. + + :param trainer: Trainer instance connected to this AgentProcessor. Trainer is given trajectory + when it is finished. + :param policy: Policy instance associated with this AgentProcessor. + :param max_trajectory_length: Maximum length of a trajectory before it is added to the trainer. + :param stats_category: The category under which to write the stats. Usually, this comes from the Trainer. + """ + self._experience_buffers: Dict[ + GlobalAgentId, List[AgentExperience] + ] = defaultdict(list) + self._last_step_result: Dict[GlobalAgentId, Tuple[DecisionStep, int]] = {} + # current_group_obs is used to collect the current (i.e. 
the most recently seen) + # obs of all the agents in the same group, and assemble the group obs. + # It is a dictionary of GlobalGroupId to dictionaries of GlobalAgentId to observation. + self._current_group_obs: Dict[ + GlobalGroupId, Dict[GlobalAgentId, List[np.ndarray]] + ] = defaultdict(lambda: defaultdict(list)) + # group_status is used to collect the current, most recently seen + # group status of all the agents in the same group, and assemble the group's status. + # It is a dictionary of GlobalGroupId to dictionaries of GlobalAgentId to AgentStatus. + self._group_status: Dict[ + GlobalGroupId, Dict[GlobalAgentId, AgentStatus] + ] = defaultdict(lambda: defaultdict(None)) + # last_take_action_outputs stores the action a_t taken before the current observation s_(t+1), while + # grabbing previous_action from the policy grabs the action PRIOR to that, a_(t-1). + self._last_take_action_outputs: Dict[GlobalAgentId, ActionInfoOutputs] = {} + + self._episode_steps: Counter = Counter() + self._episode_rewards: Dict[GlobalAgentId, float] = defaultdict(float) + self._stats_reporter = stats_reporter + self._max_trajectory_length = max_trajectory_length + self._trajectory_queues: List[AgentManagerQueue[Trajectory]] = [] + self._behavior_id = behavior_id + + # Note: In the future this policy reference will be the policy of the env_manager and not the trainer. + # We can in that case just grab the action from the policy rather than having it passed in. + self.policy = policy + + def add_experiences( + self, + decision_steps: DecisionSteps, + terminal_steps: TerminalSteps, + worker_id: int, + previous_action: ActionInfo, + ) -> None: + """ + Adds experiences to each agent's experience history. + :param decision_steps: current DecisionSteps. + :param terminal_steps: current TerminalSteps. + :param previous_action: The outputs of the Policy's get_action method. + """ + take_action_outputs = previous_action.outputs + if take_action_outputs: + try: + for _entropy in take_action_outputs["entropy"]: + if isinstance(_entropy, torch.Tensor): + _entropy = ModelUtils.to_numpy(_entropy) + self._stats_reporter.add_stat("Policy/Entropy", _entropy) + except KeyError: + pass + + # Make unique agent_ids that are global across workers + action_global_agent_ids = [ + get_global_agent_id(worker_id, ag_id) for ag_id in previous_action.agent_ids + ] + for global_id in action_global_agent_ids: + if global_id in self._last_step_result: # Don't store if agent just reset + self._last_take_action_outputs[global_id] = take_action_outputs + + # Iterate over all the terminal steps, first gather all the group obs + # and then create the AgentExperiences/Trajectories. _add_to_group_status + # stores Group statuses in a common data structure self.group_status + for terminal_step in terminal_steps.values(): + self._add_group_status_and_obs(terminal_step, worker_id) + for terminal_step in terminal_steps.values(): + local_id = terminal_step.agent_id + global_id = get_global_agent_id(worker_id, local_id) + self._process_step( + terminal_step, worker_id, terminal_steps.agent_id_to_index[local_id] + ) + + # Iterate over all the decision steps, first gather all the group obs + # and then create the trajectories. 
_add_to_group_status + # stores Group statuses in a common data structure self.group_status + for ongoing_step in decision_steps.values(): + self._add_group_status_and_obs(ongoing_step, worker_id) + for ongoing_step in decision_steps.values(): + local_id = ongoing_step.agent_id + self._process_step( + ongoing_step, worker_id, decision_steps.agent_id_to_index[local_id] + ) + # Clear the last seen group obs when agents die, but only after all of the group + # statuses were added to the trajectory. + for terminal_step in terminal_steps.values(): + local_id = terminal_step.agent_id + global_id = get_global_agent_id(worker_id, local_id) + self._clear_group_status_and_obs(global_id) + + for _gid in action_global_agent_ids: + # If the ID doesn't have a last step result, the agent just reset, + # don't store the action. + if _gid in self._last_step_result: + if "action" in take_action_outputs: + self.policy.save_previous_action( + [_gid], take_action_outputs["action"] + ) + + def _add_group_status_and_obs( + self, step: Union[TerminalStep, DecisionStep], worker_id: int + ) -> None: + """ + Takes a TerminalStep or DecisionStep and adds the information in it + to self.group_status. This information can then be retrieved + when constructing trajectories to get the status of group mates. Also stores the current + observation into current_group_obs, to be used to get the next group observations + for bootstrapping. + :param step: TerminalStep or DecisionStep + :param worker_id: Worker ID of this particular environment. Used to generate a + global group id. + """ + global_agent_id = get_global_agent_id(worker_id, step.agent_id) + stored_decision_step, idx = self._last_step_result.get( + global_agent_id, (None, None) + ) + stored_take_action_outputs = self._last_take_action_outputs.get( + global_agent_id, None + ) + if stored_decision_step is not None and stored_take_action_outputs is not None: + # 0, the default group_id, means that the agent doesn't belong to an agent group. + # If 0, don't add any groupmate information. + if step.group_id > 0: + global_group_id = get_global_group_id(worker_id, step.group_id) + stored_actions = stored_take_action_outputs["action"] + action_tuple = ActionTuple( + continuous=stored_actions.continuous[idx], + discrete=stored_actions.discrete[idx], + ) + group_status = AgentStatus( + obs=stored_decision_step.obs, + reward=step.reward, + action=action_tuple, + done=isinstance(step, TerminalStep), + ) + self._group_status[global_group_id][global_agent_id] = group_status + self._current_group_obs[global_group_id][global_agent_id] = step.obs + + def _clear_group_status_and_obs(self, global_id: GlobalAgentId) -> None: + """ + Clears an agent from self._group_status and self._current_group_obs. 
+ """ + self._delete_in_nested_dict(self._current_group_obs, global_id) + self._delete_in_nested_dict(self._group_status, global_id) + + def _delete_in_nested_dict(self, nested_dict: Dict[str, Any], key: str) -> None: + for _manager_id in list(nested_dict.keys()): + _team_group = nested_dict[_manager_id] + self._safe_delete(_team_group, key) + if not _team_group: # if dict is empty + self._safe_delete(nested_dict, _manager_id) + + def _process_step( + self, step: Union[TerminalStep, DecisionStep], worker_id: int, index: int + ) -> None: + terminated = isinstance(step, TerminalStep) + global_agent_id = get_global_agent_id(worker_id, step.agent_id) + global_group_id = get_global_group_id(worker_id, step.group_id) + stored_decision_step, idx = self._last_step_result.get( + global_agent_id, (None, None) + ) + stored_take_action_outputs = self._last_take_action_outputs.get( + global_agent_id, None + ) + if not terminated: + # Index is needed to grab from last_take_action_outputs + self._last_step_result[global_agent_id] = (step, index) + + # This state is the consequence of a past action + if stored_decision_step is not None and stored_take_action_outputs is not None: + obs = stored_decision_step.obs + if self.policy.use_recurrent: + memory = self.policy.retrieve_previous_memories([global_agent_id])[0, :] + else: + memory = None + done = terminated # Since this is an ongoing step + interrupted = step.interrupted if terminated else False + # Add the outputs of the last eval + stored_actions = stored_take_action_outputs["action"] + action_tuple = ActionTuple( + continuous=stored_actions.continuous[idx], + discrete=stored_actions.discrete[idx], + ) + try: + stored_action_probs = stored_take_action_outputs["log_probs"] + if not isinstance(stored_action_probs, LogProbsTuple): + stored_action_probs = stored_action_probs.to_log_probs_tuple() + log_probs_tuple = LogProbsTuple( + continuous=stored_action_probs.continuous[idx], + discrete=stored_action_probs.discrete[idx], + ) + except KeyError: + log_probs_tuple = LogProbsTuple.empty_log_probs() + + action_mask = stored_decision_step.action_mask + prev_action = self.policy.retrieve_previous_action([global_agent_id])[0, :] + + # Assemble teammate_obs. If none saved, then it will be an empty list. 
+ group_statuses = [] + for _id, _mate_status in self._group_status[global_group_id].items(): + if _id != global_agent_id: + group_statuses.append(_mate_status) + + experience = AgentExperience( + obs=obs, + reward=step.reward, + done=done, + action=action_tuple, + action_probs=log_probs_tuple, + action_mask=action_mask, + prev_action=prev_action, + interrupted=interrupted, + memory=memory, + group_status=group_statuses, + group_reward=step.group_reward, + ) + # Add the value outputs if needed + self._experience_buffers[global_agent_id].append(experience) + self._episode_rewards[global_agent_id] += step.reward + if not terminated: + self._episode_steps[global_agent_id] += 1 + + # Add a trajectory segment to the buffer if terminal or the length has reached the time horizon + if ( + len(self._experience_buffers[global_agent_id]) + >= self._max_trajectory_length + or terminated + ): + next_obs = step.obs + next_group_obs = [] + for _id, _obs in self._current_group_obs[global_group_id].items(): + if _id != global_agent_id: + next_group_obs.append(_obs) + + trajectory = Trajectory( + steps=self._experience_buffers[global_agent_id], + agent_id=global_agent_id, + next_obs=next_obs, + next_group_obs=next_group_obs, + behavior_id=self._behavior_id, + ) + for traj_queue in self._trajectory_queues: + traj_queue.put(trajectory) + self._experience_buffers[global_agent_id] = [] + if terminated: + # Record episode length. + self._stats_reporter.add_stat( + "Environment/Episode Length", + self._episode_steps.get(global_agent_id, 0), + ) + self._clean_agent_data(global_agent_id) + + def _clean_agent_data(self, global_id: GlobalAgentId) -> None: + """ + Removes the data for an Agent. + """ + self._safe_delete(self._experience_buffers, global_id) + self._safe_delete(self._last_take_action_outputs, global_id) + self._safe_delete(self._last_step_result, global_id) + self._safe_delete(self._episode_steps, global_id) + self._safe_delete(self._episode_rewards, global_id) + self.policy.remove_previous_action([global_id]) + self.policy.remove_memories([global_id]) + + def _safe_delete(self, my_dictionary: Dict[Any, Any], key: Any) -> None: + """ + Safe removes data from a dictionary. If not found, + don't delete. + """ + if key in my_dictionary: + del my_dictionary[key] + + def publish_trajectory_queue( + self, trajectory_queue: "AgentManagerQueue[Trajectory]" + ) -> None: + """ + Adds a trajectory queue to the list of queues to publish to when this AgentProcessor + assembles a Trajectory + :param trajectory_queue: Trajectory queue to publish to. + """ + self._trajectory_queues.append(trajectory_queue) + + def end_episode(self) -> None: + """ + Ends the episode, terminating the current trajectory and stopping stats collection for that + episode. Used for forceful reset (e.g. in curriculum or generalization training.) + """ + all_gids = list(self._experience_buffers.keys()) # Need to make copy + for _gid in all_gids: + self._clean_agent_data(_gid) + + +class AgentManagerQueue(Generic[T]): + """ + Queue used by the AgentManager. Note that we make our own class here because in most implementations + deque is sufficient and faster. However, if we want to switch to multiprocessing, we'll need to change + out this implementation. + """ + + class Empty(Exception): + """ + Exception for when the queue is empty. + """ + + pass + + def __init__(self, behavior_id: str, maxlen: int = 0): + """ + Initializes an AgentManagerQueue. 
Note that we can give it a behavior_id so that it can be identified + separately from an AgentManager. + """ + self._maxlen: int = maxlen + self._queue: queue.Queue = queue.Queue(maxsize=maxlen) + self._behavior_id = behavior_id + + @property + def maxlen(self): + """ + The maximum length of the queue. + :return: Maximum length of the queue. + """ + return self._maxlen + + @property + def behavior_id(self): + """ + The Behavior ID of this queue. + :return: Behavior ID associated with the queue. + """ + return self._behavior_id + + def qsize(self) -> int: + """ + Returns the approximate size of the queue. Note that values may differ + depending on the underlying queue implementation. + """ + return self._queue.qsize() + + def empty(self) -> bool: + return self._queue.empty() + + def get_nowait(self) -> T: + """ + Gets the next item from the queue, throwing an AgentManagerQueue.Empty exception + if the queue is empty. + """ + try: + return self._queue.get_nowait() + except queue.Empty: + raise self.Empty("The AgentManagerQueue is empty.") + + def put(self, item: T) -> None: + self._queue.put(item) + + +class AgentManager(AgentProcessor): + """ + An AgentManager is an AgentProcessor that also holds a single trajectory and policy queue. + Note: this leaves room for adding AgentProcessors that publish multiple trajectory queues. + """ + + def __init__( + self, + policy: Policy, + behavior_id: str, + stats_reporter: StatsReporter, + max_trajectory_length: int = sys.maxsize, + threaded: bool = True, + ): + super().__init__(policy, behavior_id, stats_reporter, max_trajectory_length) + trajectory_queue_len = 20 if threaded else 0 + self.trajectory_queue: AgentManagerQueue[Trajectory] = AgentManagerQueue( + self._behavior_id, maxlen=trajectory_queue_len + ) + # NOTE: we make policy queues of infinite length to avoid lockups of the trainers. + # In the environment manager, we make sure to empty the policy queue before continuing to produce steps. + self.policy_queue: AgentManagerQueue[Policy] = AgentManagerQueue( + self._behavior_id, maxlen=0 + ) + self.publish_trajectory_queue(self.trajectory_queue) + + def record_environment_stats( + self, env_stats: EnvironmentStats, worker_id: int + ) -> None: + """ + Pass stats from the environment to the StatsReporter. + Depending on the StatsAggregationMethod, either StatsReporter.add_stat or StatsReporter.set_stat is used. + The worker_id is used to determine whether StatsReporter.set_stat should be used. + + :param env_stats: + :param worker_id: + :return: + """ + for stat_name, value_list in env_stats.items(): + for val, agg_type in value_list: + if agg_type == StatsAggregationMethod.AVERAGE: + self._stats_reporter.add_stat(stat_name, val, agg_type) + elif agg_type == StatsAggregationMethod.SUM: + self._stats_reporter.add_stat(stat_name, val, agg_type) + elif agg_type == StatsAggregationMethod.HISTOGRAM: + self._stats_reporter.add_stat(stat_name, val, agg_type) + elif agg_type == StatsAggregationMethod.MOST_RECENT: + # In order to prevent conflicts between multiple environments, + # only stats from the first environment are recorded. + if worker_id == 0: + self._stats_reporter.set_stat(stat_name, val) + else: + raise UnityTrainerException( + f"Unknown StatsAggregationMethod encountered. 
{agg_type}" + ) diff --git a/MLPY/Lib/site-packages/mlagents/trainers/behavior_id_utils.py b/MLPY/Lib/site-packages/mlagents/trainers/behavior_id_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..c23069161b8996c2a148b87ca884589cc0c7e8e3 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/behavior_id_utils.py @@ -0,0 +1,64 @@ +from typing import NamedTuple +from urllib.parse import urlparse, parse_qs +from mlagents_envs.base_env import AgentId, GroupId + +GlobalGroupId = str +GlobalAgentId = str + + +class BehaviorIdentifiers(NamedTuple): + """ + BehaviorIdentifiers is a named tuple of the identifiers that uniquely distinguish + an agent encountered in the trainer_controller. The named tuple consists of the + fully qualified behavior name, the name of the brain name (corresponds to trainer + in the trainer controller) and the team id. In the future, this can be extended + to support further identifiers. + """ + + behavior_id: str + brain_name: str + team_id: int + + @staticmethod + def from_name_behavior_id(name_behavior_id: str) -> "BehaviorIdentifiers": + """ + Parses a name_behavior_id of the form name?team=0 + into a BehaviorIdentifiers NamedTuple. + This allows you to access the brain name and team id of an agent + :param name_behavior_id: String of behavior params in HTTP format. + :returns: A BehaviorIdentifiers object. + """ + + parsed = urlparse(name_behavior_id) + name = parsed.path + ids = parse_qs(parsed.query) + team_id: int = 0 + if "team" in ids: + team_id = int(ids["team"][0]) + return BehaviorIdentifiers( + behavior_id=name_behavior_id, brain_name=name, team_id=team_id + ) + + +def create_name_behavior_id(name: str, team_id: int) -> str: + """ + Reconstructs fully qualified behavior name from name and team_id + :param name: brain name + :param team_id: team ID + :return: name_behavior_id + """ + return name + "?team=" + str(team_id) + + +def get_global_agent_id(worker_id: int, agent_id: AgentId) -> GlobalAgentId: + """ + Create an agent id that is unique across environment workers using the worker_id. + """ + return f"agent_{worker_id}-{agent_id}" + + +def get_global_group_id(worker_id: int, group_id: GroupId) -> GlobalGroupId: + """ + Create a group id that is unique across environment workers when using the worker_id. + """ + return f"group_{worker_id}-{group_id}" diff --git a/MLPY/Lib/site-packages/mlagents/trainers/buffer.py b/MLPY/Lib/site-packages/mlagents/trainers/buffer.py new file mode 100644 index 0000000000000000000000000000000000000000..ea6a2d51114906d9c755f4b7d0a411cdd345846e --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/buffer.py @@ -0,0 +1,521 @@ +from collections import defaultdict +from collections.abc import MutableMapping +import enum +import itertools +from typing import BinaryIO, DefaultDict, List, Tuple, Union, Optional + +import numpy as np +import h5py + +from mlagents_envs.exception import UnityException + +# Elements in the buffer can be np.ndarray, or in the case of teammate obs, actions, rewards, +# a List of np.ndarray. This is done so that we don't have duplicated np.ndarrays, only references. +BufferEntry = Union[np.ndarray, List[np.ndarray]] + + +class BufferException(UnityException): + """ + Related to errors with the Buffer. 
+ """ + + pass + + +class BufferKey(enum.Enum): + ACTION_MASK = "action_mask" + CONTINUOUS_ACTION = "continuous_action" + NEXT_CONT_ACTION = "next_continuous_action" + CONTINUOUS_LOG_PROBS = "continuous_log_probs" + DISCRETE_ACTION = "discrete_action" + NEXT_DISC_ACTION = "next_discrete_action" + DISCRETE_LOG_PROBS = "discrete_log_probs" + DONE = "done" + ENVIRONMENT_REWARDS = "environment_rewards" + MASKS = "masks" + MEMORY = "memory" + CRITIC_MEMORY = "critic_memory" + BASELINE_MEMORY = "poca_baseline_memory" + PREV_ACTION = "prev_action" + + ADVANTAGES = "advantages" + DISCOUNTED_RETURNS = "discounted_returns" + + GROUP_DONES = "group_dones" + GROUPMATE_REWARDS = "groupmate_reward" + GROUP_REWARD = "group_reward" + GROUP_CONTINUOUS_ACTION = "group_continuous_action" + GROUP_DISCRETE_ACTION = "group_discrete_aaction" + GROUP_NEXT_CONT_ACTION = "group_next_cont_action" + GROUP_NEXT_DISC_ACTION = "group_next_disc_action" + + +class ObservationKeyPrefix(enum.Enum): + OBSERVATION = "obs" + NEXT_OBSERVATION = "next_obs" + + GROUP_OBSERVATION = "group_obs" + NEXT_GROUP_OBSERVATION = "next_group_obs" + + +class RewardSignalKeyPrefix(enum.Enum): + # Reward signals + REWARDS = "rewards" + VALUE_ESTIMATES = "value_estimates" + RETURNS = "returns" + ADVANTAGE = "advantage" + BASELINES = "baselines" + + +AgentBufferKey = Union[ + BufferKey, Tuple[ObservationKeyPrefix, int], Tuple[RewardSignalKeyPrefix, str] +] + + +class RewardSignalUtil: + @staticmethod + def rewards_key(name: str) -> AgentBufferKey: + return RewardSignalKeyPrefix.REWARDS, name + + @staticmethod + def value_estimates_key(name: str) -> AgentBufferKey: + return RewardSignalKeyPrefix.RETURNS, name + + @staticmethod + def returns_key(name: str) -> AgentBufferKey: + return RewardSignalKeyPrefix.RETURNS, name + + @staticmethod + def advantage_key(name: str) -> AgentBufferKey: + return RewardSignalKeyPrefix.ADVANTAGE, name + + @staticmethod + def baseline_estimates_key(name: str) -> AgentBufferKey: + return RewardSignalKeyPrefix.BASELINES, name + + +class AgentBufferField(list): + """ + AgentBufferField is a list of numpy arrays, or List[np.ndarray] for group entries. + When an agent collects a field, you can add it to its AgentBufferField with the append method. + """ + + def __init__(self, *args, **kwargs): + self.padding_value = 0 + super().__init__(*args, **kwargs) + + def __str__(self) -> str: + return f"AgentBufferField: {super().__str__()}" + + def __getitem__(self, index): + return_data = super().__getitem__(index) + if isinstance(return_data, list): + return AgentBufferField(return_data) + else: + return return_data + + @property + def contains_lists(self) -> bool: + """ + Checks whether this AgentBufferField contains List[np.ndarray]. + """ + return len(self) > 0 and isinstance(self[0], list) + + def append(self, element: BufferEntry, padding_value: float = 0.0) -> None: + """ + Adds an element to this list. Also lets you change the padding + type, so that it can be set on append (e.g. action_masks should + be padded with 1.) + :param element: The element to append to the list. + :param padding_value: The value used to pad when get_batch is called. + """ + super().append(element) + self.padding_value = padding_value + + def set(self, data: List[BufferEntry]) -> None: + """ + Sets the list of BufferEntry to the input data + :param data: The BufferEntry list to be set. 
+ """ + self[:] = data + + def get_batch( + self, + batch_size: int = None, + training_length: Optional[int] = 1, + sequential: bool = True, + ) -> List[BufferEntry]: + """ + Retrieve the last batch_size elements of length training_length + from the list of np.array + :param batch_size: The number of elements to retrieve. If None: + All elements will be retrieved. + :param training_length: The length of the sequence to be retrieved. If + None: only takes one element. + :param sequential: If true and training_length is not None: the elements + will not repeat in the sequence. [a,b,c,d,e] with training_length = 2 and + sequential=True gives [[0,a],[b,c],[d,e]]. If sequential=False gives + [[a,b],[b,c],[c,d],[d,e]] + """ + if training_length is None: + training_length = 1 + if sequential: + # The sequences will not have overlapping elements (this involves padding) + leftover = len(self) % training_length + # leftover is the number of elements in the first sequence (this sequence might need 0 padding) + if batch_size is None: + # retrieve the maximum number of elements + batch_size = len(self) // training_length + 1 * (leftover != 0) + # The maximum number of sequences taken from a list of length len(self) without overlapping + # with padding is equal to batch_size + if batch_size > (len(self) // training_length + 1 * (leftover != 0)): + raise BufferException( + "The batch size and training length requested for get_batch where" + " too large given the current number of data points." + ) + if batch_size * training_length > len(self): + if self.contains_lists: + padding = [] + else: + # We want to duplicate the last value in the array, multiplied by the padding_value. + padding = np.array(self[-1], dtype=np.float32) * self.padding_value + return self[:] + [padding] * (training_length - leftover) + + else: + return self[len(self) - batch_size * training_length :] + else: + # The sequences will have overlapping elements + if batch_size is None: + # retrieve the maximum number of elements + batch_size = len(self) - training_length + 1 + # The number of sequences of length training_length taken from a list of len(self) elements + # with overlapping is equal to batch_size + if (len(self) - training_length + 1) < batch_size: + raise BufferException( + "The batch size and training length requested for get_batch where" + " too large given the current number of data points." + ) + tmp_list: List[np.ndarray] = [] + for end in range(len(self) - batch_size + 1, len(self) + 1): + tmp_list += self[end - training_length : end] + return tmp_list + + def reset_field(self) -> None: + """ + Resets the AgentBufferField + """ + self[:] = [] + + def padded_to_batch( + self, pad_value: np.float = 0, dtype: np.dtype = np.float32 + ) -> Union[np.ndarray, List[np.ndarray]]: + """ + Converts this AgentBufferField (which is a List[BufferEntry]) into a numpy array + with first dimension equal to the length of this AgentBufferField. If this AgentBufferField + contains a List[List[BufferEntry]] (i.e., in the case of group observations), return a List + containing numpy arrays or tensors, of length equal to the maximum length of an entry. Missing + For entries with less than that length, the array will be padded with pad_value. + :param pad_value: Value to pad List AgentBufferFields, when there are less than the maximum + number of agents present. + :param dtype: Dtype of output numpy array. 
+ :return: Numpy array or List of numpy arrays representing this AgentBufferField, where the first + dimension is equal to the length of the AgentBufferField. + """ + if len(self) > 0 and not isinstance(self[0], list): + return np.asanyarray(self, dtype=dtype) + + shape = None + for _entry in self: + # _entry could be an empty list if there are no group agents in this + # step. Find the first non-empty list and use that shape. + if _entry: + shape = _entry[0].shape + break + # If there were no groupmate agents in the entire batch, return an empty List. + if shape is None: + return [] + + # Convert to numpy array while padding with 0's + new_list = list( + map( + lambda x: np.asanyarray(x, dtype=dtype), + itertools.zip_longest(*self, fillvalue=np.full(shape, pad_value)), + ) + ) + return new_list + + def to_ndarray(self): + """ + Returns the AgentBufferField which is a list of numpy ndarrays (or List[np.ndarray]) as an ndarray. + """ + return np.array(self) + + +class AgentBuffer(MutableMapping): + """ + AgentBuffer contains a dictionary of AgentBufferFields. Each agent has his own AgentBuffer. + The keys correspond to the name of the field. Example: state, action + """ + + # Whether or not to validate the types of keys at runtime + # This should be off for training, but enabled for testing + CHECK_KEY_TYPES_AT_RUNTIME = False + + def __init__(self): + self.last_brain_info = None + self.last_take_action_outputs = None + self._fields: DefaultDict[AgentBufferKey, AgentBufferField] = defaultdict( + AgentBufferField + ) + + def __str__(self): + return ", ".join([f"'{k}' : {str(self[k])}" for k in self._fields.keys()]) + + def reset_agent(self) -> None: + """ + Resets the AgentBuffer + """ + for f in self._fields.values(): + f.reset_field() + self.last_brain_info = None + self.last_take_action_outputs = None + + @staticmethod + def _check_key(key): + if isinstance(key, BufferKey): + return + if isinstance(key, tuple): + key0, key1 = key + if isinstance(key0, ObservationKeyPrefix): + if isinstance(key1, int): + return + raise KeyError(f"{key} has type ({type(key0)}, {type(key1)})") + if isinstance(key0, RewardSignalKeyPrefix): + if isinstance(key1, str): + return + raise KeyError(f"{key} has type ({type(key0)}, {type(key1)})") + raise KeyError(f"{key} is a {type(key)}") + + @staticmethod + def _encode_key(key: AgentBufferKey) -> str: + """ + Convert the key to a string representation so that it can be used for serialization. + """ + if isinstance(key, BufferKey): + return key.value + prefix, suffix = key + return f"{prefix.value}:{suffix}" + + @staticmethod + def _decode_key(encoded_key: str) -> AgentBufferKey: + """ + Convert the string representation back to a key after serialization. 
+ """ + # Simple case: convert the string directly to a BufferKey + try: + return BufferKey(encoded_key) + except ValueError: + pass + + # Not a simple key, so split into two parts + prefix_str, _, suffix_str = encoded_key.partition(":") + + # See if it's an ObservationKeyPrefix first + try: + return ObservationKeyPrefix(prefix_str), int(suffix_str) + except ValueError: + pass + + # If not, it had better be a RewardSignalKeyPrefix + try: + return RewardSignalKeyPrefix(prefix_str), suffix_str + except ValueError: + raise ValueError(f"Unable to convert {encoded_key} to an AgentBufferKey") + + def __getitem__(self, key: AgentBufferKey) -> AgentBufferField: + if self.CHECK_KEY_TYPES_AT_RUNTIME: + self._check_key(key) + return self._fields[key] + + def __setitem__(self, key: AgentBufferKey, value: AgentBufferField) -> None: + if self.CHECK_KEY_TYPES_AT_RUNTIME: + self._check_key(key) + self._fields[key] = value + + def __delitem__(self, key: AgentBufferKey) -> None: + if self.CHECK_KEY_TYPES_AT_RUNTIME: + self._check_key(key) + self._fields.__delitem__(key) + + def __iter__(self): + return self._fields.__iter__() + + def __len__(self) -> int: + return self._fields.__len__() + + def __contains__(self, key): + if self.CHECK_KEY_TYPES_AT_RUNTIME: + self._check_key(key) + return self._fields.__contains__(key) + + def check_length(self, key_list: List[AgentBufferKey]) -> bool: + """ + Some methods will require that some fields have the same length. + check_length will return true if the fields in key_list + have the same length. + :param key_list: The fields which length will be compared + """ + if self.CHECK_KEY_TYPES_AT_RUNTIME: + for k in key_list: + self._check_key(k) + + if len(key_list) < 2: + return True + length = None + for key in key_list: + if key not in self._fields: + return False + if (length is not None) and (length != len(self[key])): + return False + length = len(self[key]) + return True + + def shuffle( + self, sequence_length: int, key_list: List[AgentBufferKey] = None + ) -> None: + """ + Shuffles the fields in key_list in a consistent way: The reordering will + be the same across fields. + :param key_list: The fields that must be shuffled. + """ + if key_list is None: + key_list = list(self._fields.keys()) + if not self.check_length(key_list): + raise BufferException( + "Unable to shuffle if the fields are not of same length" + ) + s = np.arange(len(self[key_list[0]]) // sequence_length) + np.random.shuffle(s) + for key in key_list: + buffer_field = self[key] + tmp: List[np.ndarray] = [] + for i in s: + tmp += buffer_field[i * sequence_length : (i + 1) * sequence_length] + buffer_field.set(tmp) + + def make_mini_batch(self, start: int, end: int) -> "AgentBuffer": + """ + Creates a mini-batch from buffer. + :param start: Starting index of buffer. + :param end: Ending index of buffer. + :return: Dict of mini batch. + """ + mini_batch = AgentBuffer() + for key, field in self._fields.items(): + # slicing AgentBufferField returns a List[Any} + mini_batch[key] = field[start:end] # type: ignore + return mini_batch + + def sample_mini_batch( + self, batch_size: int, sequence_length: int = 1 + ) -> "AgentBuffer": + """ + Creates a mini-batch from a random start and end. + :param batch_size: number of elements to withdraw. + :param sequence_length: Length of sequences to sample. + Number of sequences to sample will be batch_size/sequence_length. 
+ """ + num_seq_to_sample = batch_size // sequence_length + mini_batch = AgentBuffer() + buff_len = self.num_experiences + num_sequences_in_buffer = buff_len // sequence_length + start_idxes = ( + np.random.randint(num_sequences_in_buffer, size=num_seq_to_sample) + * sequence_length + ) # Sample random sequence starts + for key in self: + buffer_field = self[key] + mb_list = (buffer_field[i : i + sequence_length] for i in start_idxes) + # See comparison of ways to make a list from a list of lists here: + # https://stackoverflow.com/questions/952914/how-to-make-a-flat-list-out-of-list-of-lists + mini_batch[key].set(list(itertools.chain.from_iterable(mb_list))) + return mini_batch + + def save_to_file(self, file_object: BinaryIO) -> None: + """ + Saves the AgentBuffer to a file-like object. + """ + with h5py.File(file_object, "w") as write_file: + for key, data in self.items(): + write_file.create_dataset( + self._encode_key(key), data=data, dtype="f", compression="gzip" + ) + + def load_from_file(self, file_object: BinaryIO) -> None: + """ + Loads the AgentBuffer from a file-like object. + """ + with h5py.File(file_object, "r") as read_file: + for key in list(read_file.keys()): + decoded_key = self._decode_key(key) + self[decoded_key] = AgentBufferField() + # extend() will convert the numpy array's first dimension into list + self[decoded_key].extend(read_file[key][()]) + + def truncate(self, max_length: int, sequence_length: int = 1) -> None: + """ + Truncates the buffer to a certain length. + + This can be slow for large buffers. We compensate by cutting further than we need to, so that + we're not truncating at each update. Note that we must truncate an integer number of sequence_lengths + param: max_length: The length at which to truncate the buffer. + """ + current_length = self.num_experiences + # make max_length an integer number of sequence_lengths + max_length -= max_length % sequence_length + if current_length > max_length: + for _key in self.keys(): + self[_key][:] = self[_key][current_length - max_length :] + + def resequence_and_append( + self, + target_buffer: "AgentBuffer", + key_list: List[AgentBufferKey] = None, + batch_size: int = None, + training_length: int = None, + ) -> None: + """ + Takes in a batch size and training length (sequence length), and appends this AgentBuffer to target_buffer + properly padded for LSTM use. Optionally, use key_list to restrict which fields are inserted into the new + buffer. + :param target_buffer: The buffer which to append the samples to. + :param key_list: The fields that must be added. If None: all fields will be appended. + :param batch_size: The number of elements that must be appended. If None: All of them will be. + :param training_length: The length of the samples that must be appended. If None: only takes one element. + """ + if key_list is None: + key_list = list(self.keys()) + if not self.check_length(key_list): + raise BufferException( + f"The length of the fields {key_list} were not of same length" + ) + for field_key in key_list: + target_buffer[field_key].extend( + self[field_key].get_batch( + batch_size=batch_size, training_length=training_length + ) + ) + + @property + def num_experiences(self) -> int: + """ + The number of agent experiences in the AgentBuffer, i.e. the length of the buffer. + + An experience consists of one element across all of the fields of this AgentBuffer. + Note that these all have to be the same length, otherwise shuffle and append_to_update_buffer + will fail. 
+ """ + if self.values(): + return len(next(iter(self.values()))) + else: + return 0 diff --git a/MLPY/Lib/site-packages/mlagents/trainers/cli_utils.py b/MLPY/Lib/site-packages/mlagents/trainers/cli_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..de420c42a42c5519f196c502f77ad36f80346abb --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/cli_utils.py @@ -0,0 +1,331 @@ +from typing import Set, Dict, Any, TextIO +import os +import yaml +from mlagents.trainers.exception import TrainerConfigError +from mlagents_envs.environment import UnityEnvironment +import argparse +from mlagents_envs import logging_util + +logger = logging_util.get_logger(__name__) + + +class RaiseRemovedWarning(argparse.Action): + """ + Internal custom Action to raise warning when argument is called. + """ + + def __init__(self, nargs=0, **kwargs): + super().__init__(nargs=nargs, **kwargs) + + def __call__(self, arg_parser, namespace, values, option_string=None): + logger.warning(f"The command line argument {option_string} was removed.") + + +class DetectDefault(argparse.Action): + """ + Internal custom Action to help detect arguments that aren't default. + """ + + non_default_args: Set[str] = set() + + def __call__(self, arg_parser, namespace, values, option_string=None): + setattr(namespace, self.dest, values) + DetectDefault.non_default_args.add(self.dest) + + +class DetectDefaultStoreTrue(DetectDefault): + """ + Internal class to help detect arguments that aren't default. + Used for store_true arguments. + """ + + def __init__(self, nargs=0, **kwargs): + super().__init__(nargs=nargs, **kwargs) + + def __call__(self, arg_parser, namespace, values, option_string=None): + super().__call__(arg_parser, namespace, True, option_string) + + +class StoreConfigFile(argparse.Action): + """ + Custom Action to store the config file location not as part of the CLI args. + This is because we want to maintain an equivalence between the config file's + contents and the args themselves. + """ + + trainer_config_path: str + + def __call__(self, arg_parser, namespace, values, option_string=None): + delattr(namespace, self.dest) + StoreConfigFile.trainer_config_path = values + + +def _create_parser() -> argparse.ArgumentParser: + argparser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + argparser.add_argument( + "trainer_config_path", action=StoreConfigFile, nargs="?", default=None + ) + argparser.add_argument( + "--env", + default=None, + dest="env_path", + help="Path to the Unity executable to train", + action=DetectDefault, + ) + argparser.add_argument( + "--load", + default=False, + dest="load_model", + action=DetectDefaultStoreTrue, + help=argparse.SUPPRESS, # Deprecated but still usable for now. + ) + argparser.add_argument( + "--resume", + default=False, + dest="resume", + action=DetectDefaultStoreTrue, + help="Whether to resume training from a checkpoint. Specify a --run-id to use this option. " + "If set, the training code loads an already trained model to initialize the neural network " + "before resuming training. This option is only valid when the models exist, and have the same " + "behavior names as the current agents in your scene.", + ) + argparser.add_argument( + "--deterministic", + default=False, + dest="deterministic", + action=DetectDefaultStoreTrue, + help="Whether to select actions deterministically in policy. 
`dist.mean` for continuous action " + "space, and `dist.argmax` for discrete action space ", + ) + argparser.add_argument( + "--force", + default=False, + dest="force", + action=DetectDefaultStoreTrue, + help="Whether to force-overwrite this run-id's existing summary and model data. (Without " + "this flag, attempting to train a model with a run-id that has been used before will throw " + "an error.)", + ) + argparser.add_argument( + "--run-id", + default="ppo", + help="The identifier for the training run. This identifier is used to name the " + "subdirectories in which the trained model and summary statistics are saved as well " + "as the saved model itself. If you use TensorBoard to view the training statistics, " + "always set a unique run-id for each training run. (The statistics for all runs with the " + "same id are combined as if they were produced by the same session.)", + action=DetectDefault, + ) + argparser.add_argument( + "--initialize-from", + metavar="RUN_ID", + default=None, + help="Specify a previously saved run ID from which to initialize the model. " + "This can be used, for instance, to fine-tune an existing model on a new environment. " + "Note that the previously saved models must have the same behavior parameters as your " + "current environment.", + action=DetectDefault, + ) + argparser.add_argument( + "--seed", + default=-1, + type=int, + help="A number to use as a seed for the random number generator used by the training code", + action=DetectDefault, + ) + argparser.add_argument( + "--train", + default=False, + dest="train_model", + action=DetectDefaultStoreTrue, + help=argparse.SUPPRESS, + ) + argparser.add_argument( + "--inference", + default=False, + dest="inference", + action=DetectDefaultStoreTrue, + help="Whether to run in Python inference mode (i.e. no training). Use with --resume to load " + "a model trained with an existing run ID.", + ) + argparser.add_argument( + "--base-port", + default=UnityEnvironment.BASE_ENVIRONMENT_PORT, + type=int, + help="The starting port for environment communication. Each concurrent Unity environment " + "instance will get assigned a port sequentially, starting from the base-port. Each instance " + "will use the port (base_port + worker_id), where the worker_id is a sequential ID given to " + "each instance from 0 to (num_envs - 1). Note that when training using the Editor rather " + "than an executable, the base port will be ignored.", + action=DetectDefault, + ) + argparser.add_argument( + "--num-envs", + default=1, + type=int, + help="The number of concurrent Unity environment instances to collect experiences " + "from when training", + action=DetectDefault, + ) + + argparser.add_argument( + "--num-areas", + default=1, + type=int, + help="The number of parallel training areas in each Unity environment instance.", + action=DetectDefault, + ) + + argparser.add_argument( + "--debug", + default=False, + action=DetectDefaultStoreTrue, + help="Whether to enable debug-level logging for some parts of the code", + ) + argparser.add_argument( + "--env-args", + default=None, + nargs=argparse.REMAINDER, + help="Arguments passed to the Unity executable. Be aware that the standalone build will also " + "process these as Unity Command Line Arguments. You should choose different argument names if " + "you want to create environment-specific arguments. 
All arguments after this flag will be " + "passed to the executable.", + action=DetectDefault, + ) + argparser.add_argument( + "--max-lifetime-restarts", + default=10, + help="The max number of times a single Unity executable can crash over its lifetime before ml-agents exits. " + "Can be set to -1 if no limit is desired.", + action=DetectDefault, + ) + argparser.add_argument( + "--restarts-rate-limit-n", + default=1, + help="The maximum number of times a single Unity executable can crash over a period of time (period set in " + "restarts-rate-limit-period-s). Can be set to -1 to not use rate limiting with restarts.", + action=DetectDefault, + ) + argparser.add_argument( + "--restarts-rate-limit-period-s", + default=60, + help="The period of time --restarts-rate-limit-n applies to.", + action=DetectDefault, + ) + argparser.add_argument( + "--torch", + default=False, + action=RaiseRemovedWarning, + help="(Removed) Use the PyTorch framework.", + ) + argparser.add_argument( + "--tensorflow", + default=False, + action=RaiseRemovedWarning, + help="(Removed) Use the TensorFlow framework.", + ) + argparser.add_argument( + "--results-dir", + default="results", + action=DetectDefault, + help="Results base directory", + ) + + eng_conf = argparser.add_argument_group(title="Engine Configuration") + eng_conf.add_argument( + "--width", + default=84, + type=int, + help="The width of the executable window of the environment(s) in pixels " + "(ignored for editor training).", + action=DetectDefault, + ) + eng_conf.add_argument( + "--height", + default=84, + type=int, + help="The height of the executable window of the environment(s) in pixels " + "(ignored for editor training)", + action=DetectDefault, + ) + eng_conf.add_argument( + "--quality-level", + default=5, + type=int, + help="The quality level of the environment(s). Equivalent to calling " + "QualitySettings.SetQualityLevel in Unity.", + action=DetectDefault, + ) + eng_conf.add_argument( + "--time-scale", + default=20, + type=float, + help="The time scale of the Unity environment(s). Equivalent to setting " + "Time.timeScale in Unity.", + action=DetectDefault, + ) + eng_conf.add_argument( + "--target-frame-rate", + default=-1, + type=int, + help="The target frame rate of the Unity environment(s). Equivalent to setting " + "Application.targetFrameRate in Unity.", + action=DetectDefault, + ) + eng_conf.add_argument( + "--capture-frame-rate", + default=60, + type=int, + help="The capture frame rate of the Unity environment(s). Equivalent to setting " + "Time.captureFramerate in Unity.", + action=DetectDefault, + ) + eng_conf.add_argument( + "--no-graphics", + default=False, + action=DetectDefaultStoreTrue, + help="Whether to run the Unity executable in no-graphics mode (i.e. without initializing " + "the graphics driver. 
Use this only if your agents don't use visual observations.", + ) + + torch_conf = argparser.add_argument_group(title="Torch Configuration") + torch_conf.add_argument( + "--torch-device", + default=None, + dest="device", + action=DetectDefault, + help='Settings for the default torch.device used in training, for example, "cpu", "cuda", or "cuda:0"', + ) + return argparser + + +def load_config(config_path: str) -> Dict[str, Any]: + try: + with open(config_path) as data_file: + return _load_config(data_file) + except OSError: + abs_path = os.path.abspath(config_path) + raise TrainerConfigError(f"Config file could not be found at {abs_path}.") + except UnicodeDecodeError: + raise TrainerConfigError( + f"There was an error decoding Config file from {config_path}. " + f"Make sure your file is save using UTF-8" + ) + + +def _load_config(fp: TextIO) -> Dict[str, Any]: + """ + Load the yaml config from the file-like object. + """ + try: + return yaml.safe_load(fp) + except yaml.parser.ParserError as e: + raise TrainerConfigError( + "Error parsing yaml file. Please check for formatting errors. " + "A tool such as http://www.yamllint.com/ can be helpful with this." + ) from e + + +parser = _create_parser() diff --git a/MLPY/Lib/site-packages/mlagents/trainers/demo_loader.py b/MLPY/Lib/site-packages/mlagents/trainers/demo_loader.py new file mode 100644 index 0000000000000000000000000000000000000000..eba0200b8830ce884ec08e674c3be26f8e0e3ea7 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/demo_loader.py @@ -0,0 +1,246 @@ +import os +from typing import List, Tuple +import numpy as np +from mlagents.trainers.buffer import AgentBuffer, BufferKey +from mlagents_envs.communicator_objects.agent_info_action_pair_pb2 import ( + AgentInfoActionPairProto, +) +from mlagents.trainers.trajectory import ObsUtil +from mlagents_envs.rpc_utils import behavior_spec_from_proto, steps_from_proto +from mlagents_envs.base_env import BehaviorSpec +from mlagents_envs.communicator_objects.brain_parameters_pb2 import BrainParametersProto +from mlagents_envs.communicator_objects.demonstration_meta_pb2 import ( + DemonstrationMetaProto, +) +from mlagents_envs.timers import timed, hierarchical_timer +from google.protobuf.internal.decoder import _DecodeVarint32 # type: ignore +from google.protobuf.internal.encoder import _EncodeVarint # type: ignore + + +INITIAL_POS = 33 +SUPPORTED_DEMONSTRATION_VERSIONS = frozenset([0, 1]) + + +@timed +def make_demo_buffer( + pair_infos: List[AgentInfoActionPairProto], + behavior_spec: BehaviorSpec, + sequence_length: int, +) -> AgentBuffer: + # Create and populate buffer using experiences + demo_raw_buffer = AgentBuffer() + demo_processed_buffer = AgentBuffer() + for idx, current_pair_info in enumerate(pair_infos): + if idx > len(pair_infos) - 2: + break + next_pair_info = pair_infos[idx + 1] + current_decision_step, current_terminal_step = steps_from_proto( + [current_pair_info.agent_info], behavior_spec + ) + next_decision_step, next_terminal_step = steps_from_proto( + [next_pair_info.agent_info], behavior_spec + ) + previous_action = ( + np.array( + pair_infos[idx].action_info.vector_actions_deprecated, dtype=np.float32 + ) + * 0 + ) + if idx > 0: + previous_action = np.array( + pair_infos[idx - 1].action_info.vector_actions_deprecated, + dtype=np.float32, + ) + + next_done = len(next_terminal_step) == 1 + next_reward = 0 + if len(next_terminal_step) == 1: + next_reward = next_terminal_step.reward[0] + else: + next_reward = next_decision_step.reward[0] + current_obs = None + if 
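# A hypothetical example of what _load_config returns: a plain dict mirroring
# the YAML file. The trainer configuration shown is illustrative, not a
# complete or validated settings file.
import io
from mlagents.trainers.cli_utils import _load_config

sample_yaml = """
behaviors:
  3DBall:
    trainer_type: ppo
    hyperparameters:
      batch_size: 64
      learning_rate: 3.0e-4
    max_steps: 500000
"""
config = _load_config(io.StringIO(sample_yaml))
assert config["behaviors"]["3DBall"]["hyperparameters"]["batch_size"] == 64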
len(current_terminal_step) == 1: + current_obs = list(current_terminal_step.values())[0].obs + else: + current_obs = list(current_decision_step.values())[0].obs + + demo_raw_buffer[BufferKey.DONE].append(next_done) + demo_raw_buffer[BufferKey.ENVIRONMENT_REWARDS].append(next_reward) + for i, obs in enumerate(current_obs): + demo_raw_buffer[ObsUtil.get_name_at(i)].append(obs) + if ( + len(current_pair_info.action_info.continuous_actions) == 0 + and len(current_pair_info.action_info.discrete_actions) == 0 + ): + if behavior_spec.action_spec.continuous_size > 0: + demo_raw_buffer[BufferKey.CONTINUOUS_ACTION].append( + current_pair_info.action_info.vector_actions_deprecated + ) + else: + demo_raw_buffer[BufferKey.DISCRETE_ACTION].append( + current_pair_info.action_info.vector_actions_deprecated + ) + else: + if behavior_spec.action_spec.continuous_size > 0: + demo_raw_buffer[BufferKey.CONTINUOUS_ACTION].append( + current_pair_info.action_info.continuous_actions + ) + if behavior_spec.action_spec.discrete_size > 0: + demo_raw_buffer[BufferKey.DISCRETE_ACTION].append( + current_pair_info.action_info.discrete_actions + ) + demo_raw_buffer[BufferKey.PREV_ACTION].append(previous_action) + if next_done: + demo_raw_buffer.resequence_and_append( + demo_processed_buffer, batch_size=None, training_length=sequence_length + ) + demo_raw_buffer.reset_agent() + demo_raw_buffer.resequence_and_append( + demo_processed_buffer, batch_size=None, training_length=sequence_length + ) + return demo_processed_buffer + + +@timed +def demo_to_buffer( + file_path: str, sequence_length: int, expected_behavior_spec: BehaviorSpec = None +) -> Tuple[BehaviorSpec, AgentBuffer]: + """ + Loads demonstration file and uses it to fill training buffer. + :param file_path: Location of demonstration file (.demo). + :param sequence_length: Length of trajectories to fill buffer. + :return: + """ + behavior_spec, info_action_pair, _ = load_demonstration(file_path) + demo_buffer = make_demo_buffer(info_action_pair, behavior_spec, sequence_length) + if expected_behavior_spec: + # check action dimensions in demonstration match + if behavior_spec.action_spec != expected_behavior_spec.action_spec: + raise RuntimeError( + "The actions {} in demonstration do not match the policy's {}.".format( + behavior_spec.action_spec, expected_behavior_spec.action_spec + ) + ) + # check observations match + if len(behavior_spec.observation_specs) != len( + expected_behavior_spec.observation_specs + ): + raise RuntimeError( + "The demonstrations do not have the same number of observations as the policy." + ) + else: + for i, (demo_obs, policy_obs) in enumerate( + zip( + behavior_spec.observation_specs, + expected_behavior_spec.observation_specs, + ) + ): + if demo_obs.shape != policy_obs.shape: + raise RuntimeError( + f"The shape {demo_obs} for observation {i} in demonstration \ + do not match the policy's {policy_obs}." + ) + return behavior_spec, demo_buffer + + +def get_demo_files(path: str) -> List[str]: + """ + Retrieves the demonstration file(s) from a path. + :param path: Path of demonstration file or directory. + :return: List of demonstration files + + Raises errors if |path| is invalid. 
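# A hypothetical call into demo_to_buffer defined above: load a recorded .demo
# file and turn it into an AgentBuffer of length-1 sequences (e.g. for
# behavioral cloning or GAIL). The path is a placeholder.
from mlagents.trainers.demo_loader import demo_to_buffer

behavior_spec, demo_buffer = demo_to_buffer(
    "Demos/ExpertPolicy.demo",  # placeholder path to a demonstration file
    sequence_length=1,
)
print(behavior_spec.action_spec, demo_buffer.num_experiences)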
+ """ + if os.path.isfile(path): + if not path.endswith(".demo"): + raise ValueError("The path provided is not a '.demo' file.") + return [path] + elif os.path.isdir(path): + paths = [ + os.path.join(path, name) + for name in os.listdir(path) + if name.endswith(".demo") + ] + if not paths: + raise ValueError("There are no '.demo' files in the provided directory.") + return paths + else: + raise FileNotFoundError( + f"The demonstration file or directory {path} does not exist." + ) + + +@timed +def load_demonstration( + file_path: str, +) -> Tuple[BehaviorSpec, List[AgentInfoActionPairProto], int]: + """ + Loads and parses a demonstration file. + :param file_path: Location of demonstration file (.demo). + :return: BrainParameter and list of AgentInfoActionPairProto containing demonstration data. + """ + + # First 32 bytes of file dedicated to meta-data. + file_paths = get_demo_files(file_path) + behavior_spec = None + brain_param_proto = None + info_action_pairs = [] + total_expected = 0 + for _file_path in file_paths: + with open(_file_path, "rb") as fp: + with hierarchical_timer("read_file"): + data = fp.read() + next_pos, pos, obs_decoded = 0, 0, 0 + while pos < len(data): + next_pos, pos = _DecodeVarint32(data, pos) + if obs_decoded == 0: + meta_data_proto = DemonstrationMetaProto() + meta_data_proto.ParseFromString(data[pos : pos + next_pos]) + if ( + meta_data_proto.api_version + not in SUPPORTED_DEMONSTRATION_VERSIONS + ): + raise RuntimeError( + f"Can't load Demonstration data from an unsupported version ({meta_data_proto.api_version})" + ) + total_expected += meta_data_proto.number_steps + pos = INITIAL_POS + if obs_decoded == 1: + brain_param_proto = BrainParametersProto() + brain_param_proto.ParseFromString(data[pos : pos + next_pos]) + pos += next_pos + if obs_decoded > 1: + agent_info_action = AgentInfoActionPairProto() + agent_info_action.ParseFromString(data[pos : pos + next_pos]) + if behavior_spec is None: + behavior_spec = behavior_spec_from_proto( + brain_param_proto, agent_info_action.agent_info + ) + info_action_pairs.append(agent_info_action) + if len(info_action_pairs) == total_expected: + break + pos += next_pos + obs_decoded += 1 + if not behavior_spec: + raise RuntimeError( + f"No BrainParameters found in demonstration file at {file_path}." 
+ ) + return behavior_spec, info_action_pairs, total_expected + + +def write_delimited(f, message): + msg_string = message.SerializeToString() + msg_size = len(msg_string) + _EncodeVarint(f.write, msg_size) + f.write(msg_string) + + +def write_demo(demo_path, meta_data_proto, brain_param_proto, agent_info_protos): + with open(demo_path, "wb") as f: + # write metadata + write_delimited(f, meta_data_proto) + f.seek(INITIAL_POS) + write_delimited(f, brain_param_proto) + + for agent in agent_info_protos: + write_delimited(f, agent) diff --git a/MLPY/Lib/site-packages/mlagents/trainers/directory_utils.py b/MLPY/Lib/site-packages/mlagents/trainers/directory_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..80379d81e998743d6bc5e3759f8f7a3b71e3c970 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/directory_utils.py @@ -0,0 +1,76 @@ +import os +from mlagents.trainers.exception import UnityTrainerException +from mlagents.trainers.settings import TrainerSettings +from mlagents.trainers.model_saver.torch_model_saver import DEFAULT_CHECKPOINT_NAME + + +def validate_existing_directories( + output_path: str, resume: bool, force: bool, init_path: str = None +) -> None: + """ + Validates that if the run_id model exists, we do not overwrite it unless --force is specified. + Throws an exception if resume isn't specified and run_id exists. Throws an exception + if --resume is specified and run-id was not found. + :param model_path: The model path specified. + :param summary_path: The summary path to be used. + :param resume: Whether or not the --resume flag was passed. + :param force: Whether or not the --force flag was passed. + :param init_path: Path to run-id dir to initialize from + """ + + output_path_exists = os.path.isdir(output_path) + + if output_path_exists: + if not resume and not force: + raise UnityTrainerException( + "Previous data from this run ID was found. " + "Either specify a new run ID, use --resume to resume this run, " + "or use the --force parameter to overwrite existing data." + ) + else: + if resume: + raise UnityTrainerException( + "Previous data from this run ID was not found. " + "Train a new run by removing the --resume flag." + ) + + # Verify init path if specified. + if init_path is not None: + if not os.path.isdir(init_path): + raise UnityTrainerException( + "Could not initialize from {}. " + "Make sure models have already been saved with that run ID.".format( + init_path + ) + ) + + +def setup_init_path( + behaviors: TrainerSettings.DefaultTrainerDict, init_dir: str +) -> None: + """ + For each behavior, setup full init_path to checkpoint file to initialize policy from + :param behaviors: mapping from behavior_name to TrainerSettings + :param init_dir: Path to run-id dir to initialize from + """ + for behavior_name, ts in behaviors.items(): + if ts.init_path is None: + # set default if None + ts.init_path = os.path.join( + init_dir, behavior_name, DEFAULT_CHECKPOINT_NAME + ) + elif not os.path.dirname(ts.init_path): + # update to full path if just the file name + ts.init_path = os.path.join(init_dir, behavior_name, ts.init_path) + _validate_init_full_path(ts.init_path) + + +def _validate_init_full_path(init_file: str) -> None: + """ + Validate initialization path to be a .pt file + :param init_file: full path to initialization checkpoint file + """ + if not (os.path.isfile(init_file) and init_file.endswith(".pt")): + raise UnityTrainerException( + f"Could not initialize from {init_file}. 
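# A minimal, hypothetical round trip for the varint length-prefixed framing
# used by write_delimited and the load_demonstration parsing loop above, with
# a DemonstrationMetaProto as the payload.
import io
from google.protobuf.internal.decoder import _DecodeVarint32  # type: ignore
from mlagents_envs.communicator_objects.demonstration_meta_pb2 import (
    DemonstrationMetaProto,
)
from mlagents.trainers.demo_loader import write_delimited

meta = DemonstrationMetaProto()
meta.number_steps = 10

stream = io.BytesIO()
write_delimited(stream, meta)

data = stream.getvalue()
size, pos = _DecodeVarint32(data, 0)  # read the length prefix
decoded = DemonstrationMetaProto()
decoded.ParseFromString(data[pos : pos + size])
assert decoded.number_steps == 10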
File does not exist or is not a `.pt` file" + ) diff --git a/MLPY/Lib/site-packages/mlagents/trainers/env_manager.py b/MLPY/Lib/site-packages/mlagents/trainers/env_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..a4a90fdc709d4829112a11d9dce1ee5dd428ff0e --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/env_manager.py @@ -0,0 +1,157 @@ +from abc import ABC, abstractmethod + +from typing import List, Dict, NamedTuple, Iterable, Tuple +from mlagents_envs.base_env import ( + DecisionSteps, + TerminalSteps, + BehaviorSpec, + BehaviorName, +) +from mlagents_envs.side_channel.stats_side_channel import EnvironmentStats + +from mlagents.trainers.policy import Policy +from mlagents.trainers.agent_processor import AgentManager, AgentManagerQueue +from mlagents.trainers.action_info import ActionInfo +from mlagents.trainers.settings import TrainerSettings +from mlagents_envs.logging_util import get_logger + +AllStepResult = Dict[BehaviorName, Tuple[DecisionSteps, TerminalSteps]] +AllGroupSpec = Dict[BehaviorName, BehaviorSpec] + +logger = get_logger(__name__) + + +class EnvironmentStep(NamedTuple): + current_all_step_result: AllStepResult + worker_id: int + brain_name_to_action_info: Dict[BehaviorName, ActionInfo] + environment_stats: EnvironmentStats + + @property + def name_behavior_ids(self) -> Iterable[BehaviorName]: + return self.current_all_step_result.keys() + + @staticmethod + def empty(worker_id: int) -> "EnvironmentStep": + return EnvironmentStep({}, worker_id, {}, {}) + + +class EnvManager(ABC): + def __init__(self): + self.policies: Dict[BehaviorName, Policy] = {} + self.agent_managers: Dict[BehaviorName, AgentManager] = {} + self.first_step_infos: List[EnvironmentStep] = [] + + def set_policy(self, brain_name: BehaviorName, policy: Policy) -> None: + self.policies[brain_name] = policy + if brain_name in self.agent_managers: + self.agent_managers[brain_name].policy = policy + + def set_agent_manager( + self, brain_name: BehaviorName, manager: AgentManager + ) -> None: + self.agent_managers[brain_name] = manager + + @abstractmethod + def _step(self) -> List[EnvironmentStep]: + pass + + @abstractmethod + def _reset_env(self, config: Dict = None) -> List[EnvironmentStep]: + pass + + def reset(self, config: Dict = None) -> int: + for manager in self.agent_managers.values(): + manager.end_episode() + # Save the first step infos, after the reset. + # They will be processed on the first advance(). + self.first_step_infos = self._reset_env(config) + return len(self.first_step_infos) + + @abstractmethod + def set_env_parameters(self, config: Dict = None) -> None: + """ + Sends environment parameter settings to C# via the + EnvironmentParametersSideChannel. + :param config: Dict of environment parameter keys and values + """ + pass + + def on_training_started( + self, behavior_name: str, trainer_settings: TrainerSettings + ) -> None: + """ + Handle training starting for a new behavior type. Generally nothing is necessary here. + :param behavior_name: + :param trainer_settings: + :return: + """ + pass + + @property + @abstractmethod + def training_behaviors(self) -> Dict[BehaviorName, BehaviorSpec]: + pass + + @abstractmethod + def close(self): + pass + + def get_steps(self) -> List[EnvironmentStep]: + """ + Updates the policies, steps the environments, and returns the step information from the environments. + Calling code should pass the returned EnvironmentSteps to process_steps() after calling this. 
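# A hypothetical driver loop over the EnvManager interface defined above:
# reset once, then repeatedly pull EnvironmentSteps with get_steps() and hand
# them to the AgentManagers via process_steps(). `env_manager` stands in for
# any concrete implementation (e.g. SubprocessEnvManager).
from mlagents.trainers.env_manager import EnvManager

def drive_environment(env_manager: EnvManager, num_iterations: int) -> int:
    total_steps = 0
    env_manager.reset(config=None)
    for _ in range(num_iterations):
        step_infos = env_manager.get_steps()
        total_steps += env_manager.process_steps(step_infos)
    env_manager.close()
    return total_steps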
+ :return: The list of EnvironmentSteps + """ + # If we had just reset, process the first EnvironmentSteps. + # Note that we do it here instead of in reset() so that on the very first reset(), + # we can create the needed AgentManagers before calling advance() and processing the EnvironmentSteps. + if self.first_step_infos: + self._process_step_infos(self.first_step_infos) + self.first_step_infos = [] + # Get new policies if found. Always get the latest policy. + for brain_name in self.agent_managers.keys(): + _policy = None + try: + # We make sure to empty the policy queue before continuing to produce steps. + # This halts the trainers until the policy queue is empty. + while True: + _policy = self.agent_managers[brain_name].policy_queue.get_nowait() + except AgentManagerQueue.Empty: + if _policy is not None: + self.set_policy(brain_name, _policy) + # Step the environments + new_step_infos = self._step() + return new_step_infos + + def process_steps(self, new_step_infos: List[EnvironmentStep]) -> int: + # Add to AgentProcessor + num_step_infos = self._process_step_infos(new_step_infos) + return num_step_infos + + def _process_step_infos(self, step_infos: List[EnvironmentStep]) -> int: + for step_info in step_infos: + for name_behavior_id in step_info.name_behavior_ids: + if name_behavior_id not in self.agent_managers: + logger.warning( + "Agent manager was not created for behavior id {}.".format( + name_behavior_id + ) + ) + continue + decision_steps, terminal_steps = step_info.current_all_step_result[ + name_behavior_id + ] + self.agent_managers[name_behavior_id].add_experiences( + decision_steps, + terminal_steps, + step_info.worker_id, + step_info.brain_name_to_action_info.get( + name_behavior_id, ActionInfo.empty() + ), + ) + + self.agent_managers[name_behavior_id].record_environment_stats( + step_info.environment_stats, step_info.worker_id + ) + return len(step_infos) diff --git a/MLPY/Lib/site-packages/mlagents/trainers/environment_parameter_manager.py b/MLPY/Lib/site-packages/mlagents/trainers/environment_parameter_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..7dd00f98a04a29067a1cd4f5a83c319a237add38 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/environment_parameter_manager.py @@ -0,0 +1,186 @@ +from typing import Dict, List, Tuple, Optional +from mlagents.trainers.settings import ( + EnvironmentParameterSettings, + ParameterRandomizationSettings, +) +from collections import defaultdict +from mlagents.trainers.training_status import GlobalTrainingStatus, StatusType + +from mlagents_envs.logging_util import get_logger + +logger = get_logger(__name__) + + +class EnvironmentParameterManager: + def __init__( + self, + settings: Optional[Dict[str, EnvironmentParameterSettings]] = None, + run_seed: int = -1, + restore: bool = False, + ): + """ + EnvironmentParameterManager manages all the environment parameters of a training + session. It determines when parameters should change and gives access to the + current sampler of each parameter. + :param settings: A dictionary from environment parameter to + EnvironmentParameterSettings. + :param run_seed: When the seed is not provided for an environment parameter, + this seed will be used instead. + :param restore: If true, the EnvironmentParameterManager will use the + GlobalTrainingStatus to try and reload the lesson status of each environment + parameter. 
+ """ + if settings is None: + settings = {} + self._dict_settings = settings + for parameter_name in self._dict_settings.keys(): + initial_lesson = GlobalTrainingStatus.get_parameter_state( + parameter_name, StatusType.LESSON_NUM + ) + if initial_lesson is None or not restore: + GlobalTrainingStatus.set_parameter_state( + parameter_name, StatusType.LESSON_NUM, 0 + ) + self._smoothed_values: Dict[str, float] = defaultdict(float) + for key in self._dict_settings.keys(): + self._smoothed_values[key] = 0.0 + # Update the seeds of the samplers + self._set_sampler_seeds(run_seed) + + def _set_sampler_seeds(self, seed): + """ + Sets the seeds for the samplers (if no seed was already present). Note that + using the provided seed. + """ + offset = 0 + for settings in self._dict_settings.values(): + for lesson in settings.curriculum: + if lesson.value.seed == -1: + lesson.value.seed = seed + offset + offset += 1 + + def get_minimum_reward_buffer_size(self, behavior_name: str) -> int: + """ + Calculates the minimum size of the reward buffer a behavior must use. This + method uses the 'min_lesson_length' sampler_parameter to determine this value. + :param behavior_name: The name of the behavior the minimum reward buffer + size corresponds to. + """ + result = 1 + for settings in self._dict_settings.values(): + for lesson in settings.curriculum: + if lesson.completion_criteria is not None: + if lesson.completion_criteria.behavior == behavior_name: + result = max( + result, lesson.completion_criteria.min_lesson_length + ) + return result + + def get_current_samplers(self) -> Dict[str, ParameterRandomizationSettings]: + """ + Creates a dictionary from environment parameter name to their corresponding + ParameterRandomizationSettings. If curriculum is used, the + ParameterRandomizationSettings corresponds to the sampler of the current lesson. + """ + samplers: Dict[str, ParameterRandomizationSettings] = {} + for param_name, settings in self._dict_settings.items(): + lesson_num = GlobalTrainingStatus.get_parameter_state( + param_name, StatusType.LESSON_NUM + ) + lesson = settings.curriculum[lesson_num] + samplers[param_name] = lesson.value + return samplers + + def get_current_lesson_number(self) -> Dict[str, int]: + """ + Creates a dictionary from environment parameter to the current lesson number. + If not using curriculum, this number is always 0 for that environment parameter. + """ + result: Dict[str, int] = {} + for parameter_name in self._dict_settings.keys(): + result[parameter_name] = GlobalTrainingStatus.get_parameter_state( + parameter_name, StatusType.LESSON_NUM + ) + return result + + def log_current_lesson(self, parameter_name: Optional[str] = None) -> None: + """ + Logs the current lesson number and sampler value of the parameter with name + parameter_name. If no parameter_name is provided, the values and lesson + numbers of all parameters will be displayed. + """ + if parameter_name is not None: + settings = self._dict_settings[parameter_name] + lesson_number = GlobalTrainingStatus.get_parameter_state( + parameter_name, StatusType.LESSON_NUM + ) + lesson_name = settings.curriculum[lesson_number].name + lesson_value = settings.curriculum[lesson_number].value + logger.info( + f"Parameter '{parameter_name}' is in lesson '{lesson_name}' " + f"and has value '{lesson_value}'." 
+ ) + else: + for parameter_name, settings in self._dict_settings.items(): + lesson_number = GlobalTrainingStatus.get_parameter_state( + parameter_name, StatusType.LESSON_NUM + ) + lesson_name = settings.curriculum[lesson_number].name + lesson_value = settings.curriculum[lesson_number].value + logger.info( + f"Parameter '{parameter_name}' is in lesson '{lesson_name}' " + f"and has value '{lesson_value}'." + ) + + def update_lessons( + self, + trainer_steps: Dict[str, int], + trainer_max_steps: Dict[str, int], + trainer_reward_buffer: Dict[str, List[float]], + ) -> Tuple[bool, bool]: + """ + Given progress metrics, calculates if at least one environment parameter is + in a new lesson and if at least one environment parameter requires the env + to reset. + :param trainer_steps: A dictionary from behavior_name to the number of training + steps this behavior's trainer has performed. + :param trainer_max_steps: A dictionary from behavior_name to the maximum number + of training steps this behavior's trainer has performed. + :param trainer_reward_buffer: A dictionary from behavior_name to the list of + the most recent episode returns for this behavior's trainer. + :returns: A tuple of two booleans : (True if any lesson has changed, True if + environment needs to reset) + """ + must_reset = False + updated = False + for param_name, settings in self._dict_settings.items(): + lesson_num = GlobalTrainingStatus.get_parameter_state( + param_name, StatusType.LESSON_NUM + ) + next_lesson_num = lesson_num + 1 + lesson = settings.curriculum[lesson_num] + if ( + lesson.completion_criteria is not None + and len(settings.curriculum) > next_lesson_num + ): + behavior_to_consider = lesson.completion_criteria.behavior + if behavior_to_consider in trainer_steps: + ( + must_increment, + new_smoothing, + ) = lesson.completion_criteria.need_increment( + float(trainer_steps[behavior_to_consider]) + / float(trainer_max_steps[behavior_to_consider]), + trainer_reward_buffer[behavior_to_consider], + self._smoothed_values[param_name], + ) + self._smoothed_values[param_name] = new_smoothing + if must_increment: + GlobalTrainingStatus.set_parameter_state( + param_name, StatusType.LESSON_NUM, next_lesson_num + ) + self.log_current_lesson(param_name) + updated = True + if lesson.completion_criteria.require_reset: + must_reset = True + return updated, must_reset diff --git a/MLPY/Lib/site-packages/mlagents/trainers/exception.py b/MLPY/Lib/site-packages/mlagents/trainers/exception.py new file mode 100644 index 0000000000000000000000000000000000000000..3c0742bcec7a45f96a7ccb4c320010f845320d4c --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/exception.py @@ -0,0 +1,75 @@ +""" +Contains exceptions for the trainers package. +""" + + +class TrainerError(Exception): + """ + Any error related to the trainers in the ML-Agents Toolkit. + """ + + pass + + +class TrainerConfigError(Exception): + """ + Any error related to the configuration of trainers in the ML-Agents Toolkit. + """ + + pass + + +class TrainerConfigWarning(Warning): + """ + Any warning related to the configuration of trainers in the ML-Agents Toolkit. + """ + + pass + + +class CurriculumError(TrainerError): + """ + Any error related to training with a curriculum. + """ + + pass + + +class CurriculumLoadingError(CurriculumError): + """ + Any error related to loading the Curriculum config file. + """ + + pass + + +class CurriculumConfigError(CurriculumError): + """ + Any error related to processing the Curriculum config file. 
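# A hypothetical call to update_lessons above. With no environment parameters
# configured, both returned flags stay False; the dictionary keys ("Striker")
# and values are illustrative placeholders for a behavior's progress metrics.
from mlagents.trainers.environment_parameter_manager import EnvironmentParameterManager

param_manager = EnvironmentParameterManager(settings=None, run_seed=0, restore=False)
updated, must_reset = param_manager.update_lessons(
    trainer_steps={"Striker": 250_000},
    trainer_max_steps={"Striker": 1_000_000},
    trainer_reward_buffer={"Striker": [0.8, 1.0, 0.9]},
)
# With a curriculum configured and its completion criteria met, `updated`
# becomes True and `must_reset` may request an environment reset using the
# samplers of the new lesson (see get_current_samplers above).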
+ """ + + pass + + +class MetaCurriculumError(TrainerError): + """ + Any error related to the configuration of a metacurriculum. + """ + + pass + + +class SamplerException(TrainerError): + """ + Related to errors with the sampler actions. + """ + + pass + + +class UnityTrainerException(TrainerError): + """ + Related to errors with the Trainer. + """ + + pass diff --git a/MLPY/Lib/site-packages/mlagents/trainers/ghost/__init__.py b/MLPY/Lib/site-packages/mlagents/trainers/ghost/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/MLPY/Lib/site-packages/mlagents/trainers/ghost/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/ghost/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dbedb9f414f336cb0a7312f4d8644c3dca2cc74a Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/ghost/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/ghost/__pycache__/controller.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/ghost/__pycache__/controller.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..52e60d362256e9209391803531b42358d52372cf Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/ghost/__pycache__/controller.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/ghost/__pycache__/trainer.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/ghost/__pycache__/trainer.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c9fff08fc583e976e6fad640ed58cd6b84f499a8 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/ghost/__pycache__/trainer.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/ghost/controller.py b/MLPY/Lib/site-packages/mlagents/trainers/ghost/controller.py new file mode 100644 index 0000000000000000000000000000000000000000..84901e14f6e57993317a1a5a3f94c7571c7c39d9 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/ghost/controller.py @@ -0,0 +1,103 @@ +from mlagents_envs.logging_util import get_logger +from typing import Deque, Dict +from collections import deque +from mlagents.trainers.ghost.trainer import GhostTrainer + +logger = get_logger(__name__) + + +class GhostController: + """ + GhostController contains a queue of team ids. GhostTrainers subscribe to the GhostController and query + it to get the current learning team. The GhostController cycles through team ids every 'swap_interval' + which corresponds to the number of trainer steps between changing learning teams. + The GhostController is a unique object and there can only be one per training run. + """ + + def __init__(self, maxlen: int = 10): + """ + Create a GhostController. + :param maxlen: Maximum number of GhostTrainers allowed in this GhostController + """ + + # Tracks last swap step for each learning team because trainer + # steps of all GhostTrainers do not increment together + self._queue: Deque[int] = deque(maxlen=maxlen) + self._learning_team: int = -1 + # Dict from team id to GhostTrainer for ELO calculation + self._ghost_trainers: Dict[int, GhostTrainer] = {} + # Signals to the trainer control to perform a hard change_training_team + self._changed_training_team = False + + @property + def get_learning_team(self) -> int: + """ + Returns the current learning team. 
+ :return: The learning team id + """ + return self._learning_team + + def should_reset(self) -> bool: + """ + Whether or not team change occurred. Causes full reset in trainer_controller + :return: The truth value of the team changing + """ + changed_team = self._changed_training_team + if self._changed_training_team: + self._changed_training_team = False + return changed_team + + def subscribe_team_id(self, team_id: int, trainer: GhostTrainer) -> None: + """ + Given a team_id and trainer, add to queue and trainers if not already. + The GhostTrainer is used later by the controller to get ELO ratings of agents. + :param team_id: The team_id of an agent managed by this GhostTrainer + :param trainer: A GhostTrainer that manages this team_id. + """ + if team_id not in self._ghost_trainers: + self._ghost_trainers[team_id] = trainer + if self._learning_team < 0: + self._learning_team = team_id + else: + self._queue.append(team_id) + + def change_training_team(self, step: int) -> None: + """ + The current learning team is added to the end of the queue and then updated with the + next in line. + :param step: The step of the trainer for debugging + """ + self._queue.append(self._learning_team) + self._learning_team = self._queue.popleft() + logger.debug(f"Learning team {self._learning_team} swapped on step {step}") + self._changed_training_team = True + + # Adapted from https://github.com/Unity-Technologies/ml-agents/pull/1975 and + # https://metinmediamath.wordpress.com/2013/11/27/how-to-calculate-the-elo-rating-including-example/ + # ELO calculation + # TODO : Generalize this to more than two teams + def compute_elo_rating_changes(self, rating: float, result: float) -> float: + """ + Calculates ELO. Given the rating of the learning team and result. The GhostController + queries the other GhostTrainers for the ELO of their agent that is currently being deployed. + Note, this could be the current agent or a past snapshot. + :param rating: Rating of the learning team. + :param result: Win, loss, or draw from the perspective of the learning team. + :return: The change in ELO. 
+ """ + opponent_rating: float = 0.0 + for team_id, trainer in self._ghost_trainers.items(): + if team_id != self._learning_team: + opponent_rating = trainer.get_opponent_elo() + r1 = pow(10, rating / 400) + r2 = pow(10, opponent_rating / 400) + + summed = r1 + r2 + e1 = r1 / summed + + change = result - e1 + for team_id, trainer in self._ghost_trainers.items(): + if team_id != self._learning_team: + trainer.change_opponent_elo(change) + + return change diff --git a/MLPY/Lib/site-packages/mlagents/trainers/ghost/trainer.py b/MLPY/Lib/site-packages/mlagents/trainers/ghost/trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..f49a643574b8eef7b1f64c28acd9c1a48b0ec0d7 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/ghost/trainer.py @@ -0,0 +1,480 @@ +# # Unity ML-Agents Toolkit +# ## ML-Agent Learning (Ghost Trainer) + +from collections import defaultdict +from typing import Deque, Dict, DefaultDict, List + +import numpy as np + +from mlagents_envs.logging_util import get_logger +from mlagents_envs.base_env import BehaviorSpec +from mlagents.trainers.policy import Policy + +from mlagents.trainers.trainer import Trainer +from mlagents.trainers.optimizer.torch_optimizer import TorchOptimizer +from mlagents.trainers.trajectory import Trajectory +from mlagents.trainers.agent_processor import AgentManagerQueue +from mlagents.trainers.stats import StatsPropertyType +from mlagents.trainers.behavior_id_utils import ( + BehaviorIdentifiers, + create_name_behavior_id, +) +from mlagents.trainers.training_status import GlobalTrainingStatus, StatusType + + +logger = get_logger(__name__) + + +class GhostTrainer(Trainer): + """ + The GhostTrainer trains agents in adversarial games (there are teams in opposition) using a self-play mechanism. + In adversarial settings with self-play, at any time, there is only a single learning team. The other team(s) is + "ghosted" which means that its agents are executing fixed policies and not learning. The GhostTrainer wraps + a standard RL trainer which trains the learning team and ensures that only the trajectories collected + by the learning team are used for training. The GhostTrainer also maintains past policy snapshots to be used + as the fixed policies when the team is not learning. The GhostTrainer is 1:1 with brain_names as the other + trainers, and is responsible for one or more teams. Note, a GhostTrainer can have only one team in + asymmetric games where there is only one team with a particular behavior i.e. Hide and Seek. + The GhostController manages high level coordination between multiple ghost trainers. The learning team id + is cycled throughout a training run. + """ + + def __init__( + self, + trainer, + brain_name, + controller, + reward_buff_cap, + trainer_settings, + training, + artifact_path, + ): + """ + Creates a GhostTrainer. + :param trainer: The trainer of the policy/policies being trained with self_play + :param brain_name: The name of the brain associated with trainer config + :param controller: GhostController that coordinates all ghost trainers and calculates ELO + :param reward_buff_cap: Max reward history to track in the reward buffer + :param trainer_settings: The parameters for the trainer. + :param training: Whether the trainer is set for training. + :param artifact_path: Path to store artifacts from this trainer. 
+ """ + + super().__init__( + brain_name, trainer_settings, training, artifact_path, reward_buff_cap + ) + + self.trainer = trainer + self.controller = controller + + self._internal_trajectory_queues: Dict[str, AgentManagerQueue[Trajectory]] = {} + self._internal_policy_queues: Dict[str, AgentManagerQueue[Policy]] = {} + + self._team_to_name_to_policy_queue: DefaultDict[ + int, Dict[str, AgentManagerQueue[Policy]] + ] = defaultdict(dict) + + self._name_to_parsed_behavior_id: Dict[str, BehaviorIdentifiers] = {} + + # assign ghost's stats collection to wrapped trainer's + self._stats_reporter = self.trainer.stats_reporter + # Set the logging to print ELO in the console + self._stats_reporter.add_property(StatsPropertyType.SELF_PLAY, True) + + self_play_parameters = trainer_settings.self_play + self.window = self_play_parameters.window + self.play_against_latest_model_ratio = ( + self_play_parameters.play_against_latest_model_ratio + ) + if ( + self.play_against_latest_model_ratio > 1.0 + or self.play_against_latest_model_ratio < 0.0 + ): + logger.warning( + "The play_against_latest_model_ratio is not between 0 and 1." + ) + + self.steps_between_save = self_play_parameters.save_steps + self.steps_between_swap = self_play_parameters.swap_steps + self.steps_to_train_team = self_play_parameters.team_change + if self.steps_to_train_team > self.get_max_steps: + logger.warning( + "The max steps of the GhostTrainer for behavior name {} is less than team change. This team will not face \ + opposition that has been trained if the opposition is managed by a different GhostTrainer as in an \ + asymmetric game.".format( + self.brain_name + ) + ) + + # Counts the number of steps of the ghost policies. Snapshot swapping + # depends on this counter whereas snapshot saving and team switching depends + # on the wrapped. This ensures that all teams train for the same number of trainer + # steps. + self.ghost_step: int = 0 + + # A list of dicts from brain name to a single snapshot for this trainer's policies + self.policy_snapshots: List[Dict[str, List[float]]] = [] + + # A dict from brain name to the current snapshot of this trainer's policies + self.current_policy_snapshot: Dict[str, List[float]] = {} + + self.snapshot_counter: int = 0 + + # wrapped_training_team and learning team need to be separate + # in the situation where new agents are created destroyed + # after learning team switches. These agents need to be added + # to trainers properly. + self._learning_team: int = None + self.wrapped_trainer_team: int = None + self.last_save: int = 0 + self.last_swap: int = 0 + self.last_team_change: int = 0 + + self.initial_elo = GlobalTrainingStatus.get_parameter_state( + self.brain_name, StatusType.ELO + ) + if self.initial_elo is None: + self.initial_elo = self_play_parameters.initial_elo + self.policy_elos: List[float] = [self.initial_elo] * ( + self.window + 1 + ) # for learning policy + self.current_opponent: int = 0 + + @property + def get_step(self) -> int: + """ + Returns the number of steps the wrapped trainer has performed + :return: the step count of the wrapped trainer + """ + return self.trainer.get_step + + @property + def reward_buffer(self) -> Deque[float]: + """ + Returns the reward buffer. The reward buffer contains the cumulative + rewards of the most recent episodes completed by agents using this + trainer. + :return: the reward buffer. 
+ """ + return self.trainer.reward_buffer + + @property + def current_elo(self) -> float: + """ + Gets ELO of current policy which is always last in the list + :return: ELO of current policy + """ + return self.policy_elos[-1] + + def change_current_elo(self, change: float) -> None: + """ + Changes elo of current policy which is always last in the list + :param change: Amount to change current elo by + """ + self.policy_elos[-1] += change + + def get_opponent_elo(self) -> float: + """ + Get elo of current opponent policy + :return: ELO of current opponent policy + """ + return self.policy_elos[self.current_opponent] + + def change_opponent_elo(self, change: float) -> None: + """ + Changes elo of current opponent policy + :param change: Amount to change current opponent elo by + """ + self.policy_elos[self.current_opponent] -= change + + def _process_trajectory(self, trajectory: Trajectory) -> None: + """ + Determines the final result of an episode and asks the GhostController + to calculate the ELO change. The GhostController changes the ELO + of the opponent policy since this may be in a different GhostTrainer + i.e. in asymmetric games. We assume the last reward determines the winner. + :param trajectory: Trajectory. + """ + if ( + trajectory.done_reached + and trajectory.all_group_dones_reached + and not trajectory.interrupted + ): + # Assumption is that final reward is >0/0/<0 for win/draw/loss + final_reward = ( + trajectory.steps[-1].reward + trajectory.steps[-1].group_reward + ) + result = 0.5 + if final_reward > 0: + result = 1.0 + elif final_reward < 0: + result = 0.0 + + change = self.controller.compute_elo_rating_changes( + self.current_elo, result + ) + self.change_current_elo(change) + self._stats_reporter.add_stat("Self-play/ELO", self.current_elo) + + def advance(self) -> None: + """ + Steps the trainer, passing trajectories to wrapped trainer and calling trainer advance + """ + for trajectory_queue in self.trajectory_queues: + parsed_behavior_id = self._name_to_parsed_behavior_id[ + trajectory_queue.behavior_id + ] + if parsed_behavior_id.team_id == self._learning_team: + # With a future multiagent trainer, this will be indexed by 'role' + internal_trajectory_queue = self._internal_trajectory_queues[ + parsed_behavior_id.brain_name + ] + try: + # We grab at most the maximum length of the queue. + # This ensures that even if the queue is being filled faster than it is + # being emptied, the trajectories in the queue are on-policy. + for _ in range(trajectory_queue.qsize()): + t = trajectory_queue.get_nowait() + # adds to wrapped trainers queue + internal_trajectory_queue.put(t) + self._process_trajectory(t) + except AgentManagerQueue.Empty: + pass + else: + # Dump trajectories from non-learning policy + try: + for _ in range(trajectory_queue.qsize()): + t = trajectory_queue.get_nowait() + # count ghost steps + self.ghost_step += len(t.steps) + except AgentManagerQueue.Empty: + pass + + self._next_summary_step = self.trainer._next_summary_step + self.trainer.advance() + if self.get_step - self.last_team_change > self.steps_to_train_team: + self.controller.change_training_team(self.get_step) + self.last_team_change = self.get_step + + next_learning_team = self.controller.get_learning_team + + # Case 1: No team change. The if statement just continues to push the policy + # into the correct queue (or not if not learning team). 
+ for brain_name in self._internal_policy_queues: + internal_policy_queue = self._internal_policy_queues[brain_name] + try: + policy = internal_policy_queue.get_nowait() + self.current_policy_snapshot[brain_name] = policy.get_weights() + except AgentManagerQueue.Empty: + continue + if ( + self._learning_team == next_learning_team + and next_learning_team in self._team_to_name_to_policy_queue + ): + name_to_policy_queue = self._team_to_name_to_policy_queue[ + next_learning_team + ] + if brain_name in name_to_policy_queue: + behavior_id = create_name_behavior_id( + brain_name, next_learning_team + ) + policy = self.get_policy(behavior_id) + policy.load_weights(self.current_policy_snapshot[brain_name]) + name_to_policy_queue[brain_name].put(policy) + + # CASE 2: Current learning team is managed by this GhostTrainer. + # If the learning team changes, the following loop over queues will push the + # new policy into the policy queue for the new learning agent if + # that policy is managed by this GhostTrainer. Otherwise, it will save the current snapshot. + # CASE 3: Current learning team is managed by a different GhostTrainer. + # If the learning team changes to a team managed by this GhostTrainer, this loop + # will push the current_snapshot into the correct queue. Otherwise, + # it will continue skipping and swap_snapshot will continue to handle + # pushing fixed snapshots + if ( + self._learning_team != next_learning_team + and next_learning_team in self._team_to_name_to_policy_queue + ): + name_to_policy_queue = self._team_to_name_to_policy_queue[ + next_learning_team + ] + for brain_name in name_to_policy_queue: + behavior_id = create_name_behavior_id(brain_name, next_learning_team) + policy = self.get_policy(behavior_id) + policy.load_weights(self.current_policy_snapshot[brain_name]) + name_to_policy_queue[brain_name].put(policy) + + # Note save and swap should be on different step counters. + # We don't want to save unless the policy is learning. + if self.get_step - self.last_save > self.steps_between_save: + self._save_snapshot() + self.last_save = self.get_step + + if ( + self._learning_team != next_learning_team + or self.ghost_step - self.last_swap > self.steps_between_swap + ): + self._learning_team = next_learning_team + self._swap_snapshots() + self.last_swap = self.ghost_step + + def end_episode(self): + """ + Forwarding call to wrapped trainers end_episode + """ + self.trainer.end_episode() + + def save_model(self) -> None: + """ + Forwarding call to wrapped trainers save_model. + """ + GlobalTrainingStatus.set_parameter_state( + self.brain_name, StatusType.ELO, self.current_elo + ) + self.trainer.save_model() + + def create_policy( + self, + parsed_behavior_id: BehaviorIdentifiers, + behavior_spec: BehaviorSpec, + ) -> Policy: + """ + Creates policy with the wrapped trainer's create_policy function + The first policy encountered sets the wrapped + trainer team. This is to ensure that all agents from the same multi-agent + team are grouped. All policies associated with this team are added to the + wrapped trainer to be trained. 
+ """ + policy = self.trainer.create_policy(parsed_behavior_id, behavior_spec) + team_id = parsed_behavior_id.team_id + self.controller.subscribe_team_id(team_id, self) + + # First policy or a new agent on the same team encountered + if self.wrapped_trainer_team is None or team_id == self.wrapped_trainer_team: + internal_trainer_policy = self.trainer.create_policy( + parsed_behavior_id, behavior_spec + ) + self.trainer.add_policy(parsed_behavior_id, internal_trainer_policy) + self.current_policy_snapshot[ + parsed_behavior_id.brain_name + ] = internal_trainer_policy.get_weights() + + policy.load_weights(internal_trainer_policy.get_weights()) + self._save_snapshot() # Need to save after trainer initializes policy + self._learning_team = self.controller.get_learning_team + self.wrapped_trainer_team = team_id + else: + # Load the weights of the ghost policy from the wrapped one + policy.load_weights( + self.trainer.get_policy(parsed_behavior_id).get_weights() + ) + return policy + + def create_optimizer(self) -> TorchOptimizer: + pass + + def add_policy( + self, parsed_behavior_id: BehaviorIdentifiers, policy: Policy + ) -> None: + """ + Adds policy to GhostTrainer. + :param parsed_behavior_id: Behavior ID that the policy should belong to. + :param policy: Policy to associate with name_behavior_id. + """ + name_behavior_id = parsed_behavior_id.behavior_id + self._name_to_parsed_behavior_id[name_behavior_id] = parsed_behavior_id + self.policies[name_behavior_id] = policy + + def _save_snapshot(self) -> None: + """ + Saves a snapshot of the current weights of the policy and maintains the policy_snapshots + according to the window size + """ + for brain_name in self.current_policy_snapshot: + current_snapshot_for_brain_name = self.current_policy_snapshot[brain_name] + + try: + self.policy_snapshots[self.snapshot_counter][ + brain_name + ] = current_snapshot_for_brain_name + except IndexError: + self.policy_snapshots.append( + {brain_name: current_snapshot_for_brain_name} + ) + self.policy_elos[self.snapshot_counter] = self.current_elo + self.snapshot_counter = (self.snapshot_counter + 1) % self.window + + def _swap_snapshots(self) -> None: + """ + Swaps the appropriate weight to the policy and pushes it to respective policy queues + """ + + for team_id in self._team_to_name_to_policy_queue: + if team_id == self._learning_team: + continue + elif np.random.uniform() < (1 - self.play_against_latest_model_ratio): + x = np.random.randint(len(self.policy_snapshots)) + snapshot = self.policy_snapshots[x] + else: + snapshot = self.current_policy_snapshot + x = "current" + + self.current_opponent = -1 if x == "current" else x + name_to_policy_queue = self._team_to_name_to_policy_queue[team_id] + for brain_name in self._team_to_name_to_policy_queue[team_id]: + behavior_id = create_name_behavior_id(brain_name, team_id) + policy = self.get_policy(behavior_id) + policy.load_weights(snapshot[brain_name]) + name_to_policy_queue[brain_name].put(policy) + logger.debug( + "Step {}: Swapping snapshot {} to id {} with team {} learning".format( + self.ghost_step, x, behavior_id, self._learning_team + ) + ) + + def publish_policy_queue(self, policy_queue: AgentManagerQueue[Policy]) -> None: + """ + Adds a policy queue for every member of the team to the list of queues to publish to when this Trainer + makes a policy update. Creates an internal policy queue for the wrapped + trainer to push to. The GhostTrainer pushes all policies to the env. + :param queue: Policy queue to publish to. 
+ """ + super().publish_policy_queue(policy_queue) + parsed_behavior_id = self._name_to_parsed_behavior_id[policy_queue.behavior_id] + self._team_to_name_to_policy_queue[parsed_behavior_id.team_id][ + parsed_behavior_id.brain_name + ] = policy_queue + if parsed_behavior_id.team_id == self.wrapped_trainer_team: + # With a future multiagent trainer, this will be indexed by 'role' + internal_policy_queue: AgentManagerQueue[Policy] = AgentManagerQueue( + parsed_behavior_id.brain_name + ) + + self._internal_policy_queues[ + parsed_behavior_id.brain_name + ] = internal_policy_queue + self.trainer.publish_policy_queue(internal_policy_queue) + + def subscribe_trajectory_queue( + self, trajectory_queue: AgentManagerQueue[Trajectory] + ) -> None: + """ + Adds a trajectory queue for every member of the team to the list of queues for the trainer + to ingest Trajectories from. Creates an internal trajectory queue to push trajectories from + the learning team. The wrapped trainer subscribes to this queue. + :param queue: Trajectory queue to publish to. + """ + super().subscribe_trajectory_queue(trajectory_queue) + parsed_behavior_id = self._name_to_parsed_behavior_id[ + trajectory_queue.behavior_id + ] + if parsed_behavior_id.team_id == self.wrapped_trainer_team: + # With a future multiagent trainer, this will be indexed by 'role' + internal_trajectory_queue: AgentManagerQueue[ + Trajectory + ] = AgentManagerQueue(parsed_behavior_id.brain_name) + + self._internal_trajectory_queues[ + parsed_behavior_id.brain_name + ] = internal_trajectory_queue + self.trainer.subscribe_trajectory_queue(internal_trajectory_queue) diff --git a/MLPY/Lib/site-packages/mlagents/trainers/learn.py b/MLPY/Lib/site-packages/mlagents/trainers/learn.py new file mode 100644 index 0000000000000000000000000000000000000000..69320920a585d8782c3596cea5e4b2fbefc24f69 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/learn.py @@ -0,0 +1,269 @@ +# # Unity ML-Agents Toolkit +from mlagents import torch_utils +import yaml + +import os +import numpy as np +import json + +from typing import Callable, Optional, List + +import mlagents.trainers +import mlagents_envs +from mlagents.trainers.trainer_controller import TrainerController +from mlagents.trainers.environment_parameter_manager import EnvironmentParameterManager +from mlagents.trainers.trainer import TrainerFactory +from mlagents.trainers.directory_utils import ( + validate_existing_directories, + setup_init_path, +) +from mlagents.trainers.stats import StatsReporter +from mlagents.trainers.cli_utils import parser +from mlagents_envs.environment import UnityEnvironment +from mlagents.trainers.settings import RunOptions + +from mlagents.trainers.training_status import GlobalTrainingStatus +from mlagents_envs.base_env import BaseEnv +from mlagents.trainers.subprocess_env_manager import SubprocessEnvManager +from mlagents_envs.side_channel.side_channel import SideChannel +from mlagents_envs.timers import ( + hierarchical_timer, + get_timer_tree, + add_metadata as add_timer_metadata, +) +from mlagents_envs import logging_util +from mlagents.plugins.stats_writer import register_stats_writer_plugins +from mlagents.plugins.trainer_type import register_trainer_plugins + +logger = logging_util.get_logger(__name__) + +TRAINING_STATUS_FILE_NAME = "training_status.json" + + +def get_version_string() -> str: + return f""" Version information: + ml-agents: {mlagents.trainers.__version__}, + ml-agents-envs: {mlagents_envs.__version__}, + Communicator API: {UnityEnvironment.API_VERSION}, + 
PyTorch: {torch_utils.torch.__version__}""" + + +def parse_command_line( + argv: Optional[List[str]] = None, +) -> RunOptions: + _, _ = register_trainer_plugins() + args = parser.parse_args(argv) + return RunOptions.from_argparse(args) + + +def run_training(run_seed: int, options: RunOptions, num_areas: int) -> None: + """ + Launches training session. + :param run_seed: Random seed used for training. + :param num_areas: Number of training areas to instantiate + :param options: parsed command line arguments + """ + with hierarchical_timer("run_training.setup"): + torch_utils.set_torch_config(options.torch_settings) + checkpoint_settings = options.checkpoint_settings + env_settings = options.env_settings + engine_settings = options.engine_settings + + run_logs_dir = checkpoint_settings.run_logs_dir + port: Optional[int] = env_settings.base_port + # Check if directory exists + validate_existing_directories( + checkpoint_settings.write_path, + checkpoint_settings.resume, + checkpoint_settings.force, + checkpoint_settings.maybe_init_path, + ) + # Make run logs directory + os.makedirs(run_logs_dir, exist_ok=True) + # Load any needed states in case of resume + if checkpoint_settings.resume: + GlobalTrainingStatus.load_state( + os.path.join(run_logs_dir, "training_status.json") + ) + # In case of initialization, set full init_path for all behaviors + elif checkpoint_settings.maybe_init_path is not None: + setup_init_path(options.behaviors, checkpoint_settings.maybe_init_path) + + # Configure Tensorboard Writers and StatsReporter + stats_writers = register_stats_writer_plugins(options) + for sw in stats_writers: + StatsReporter.add_writer(sw) + + if env_settings.env_path is None: + port = None + env_factory = create_environment_factory( + env_settings.env_path, + engine_settings.no_graphics, + run_seed, + num_areas, + port, + env_settings.env_args, + os.path.abspath(run_logs_dir), # Unity environment requires absolute path + ) + + env_manager = SubprocessEnvManager(env_factory, options, env_settings.num_envs) + env_parameter_manager = EnvironmentParameterManager( + options.environment_parameters, run_seed, restore=checkpoint_settings.resume + ) + + trainer_factory = TrainerFactory( + trainer_config=options.behaviors, + output_path=checkpoint_settings.write_path, + train_model=not checkpoint_settings.inference, + load_model=checkpoint_settings.resume, + seed=run_seed, + param_manager=env_parameter_manager, + init_path=checkpoint_settings.maybe_init_path, + multi_gpu=False, + ) + # Create controller and begin training. + tc = TrainerController( + trainer_factory, + checkpoint_settings.write_path, + checkpoint_settings.run_id, + env_parameter_manager, + not checkpoint_settings.inference, + run_seed, + ) + + # Begin training + try: + tc.start_learning(env_manager) + finally: + env_manager.close() + write_run_options(checkpoint_settings.write_path, options) + write_timing_tree(run_logs_dir) + write_training_status(run_logs_dir) + + +def write_run_options(output_dir: str, run_options: RunOptions) -> None: + run_options_path = os.path.join(output_dir, "configuration.yaml") + try: + with open(run_options_path, "w") as f: + try: + yaml.dump(run_options.as_dict(), f, sort_keys=False) + except TypeError: # Older versions of pyyaml don't support sort_keys + yaml.dump(run_options.as_dict(), f) + except FileNotFoundError: + logger.warning( + f"Unable to save configuration to {run_options_path}. 
Make sure the directory exists" + ) + + +def write_training_status(output_dir: str) -> None: + GlobalTrainingStatus.save_state(os.path.join(output_dir, TRAINING_STATUS_FILE_NAME)) + + +def write_timing_tree(output_dir: str) -> None: + timing_path = os.path.join(output_dir, "timers.json") + try: + with open(timing_path, "w") as f: + json.dump(get_timer_tree(), f, indent=4) + except FileNotFoundError: + logger.warning( + f"Unable to save to {timing_path}. Make sure the directory exists" + ) + + +def create_environment_factory( + env_path: Optional[str], + no_graphics: bool, + seed: int, + num_areas: int, + start_port: Optional[int], + env_args: Optional[List[str]], + log_folder: str, +) -> Callable[[int, List[SideChannel]], BaseEnv]: + def create_unity_environment( + worker_id: int, side_channels: List[SideChannel] + ) -> UnityEnvironment: + # Make sure that each environment gets a different seed + env_seed = seed + worker_id + return UnityEnvironment( + file_name=env_path, + worker_id=worker_id, + seed=env_seed, + num_areas=num_areas, + no_graphics=no_graphics, + base_port=start_port, + additional_args=env_args, + side_channels=side_channels, + log_folder=log_folder, + ) + + return create_unity_environment + + +def run_cli(options: RunOptions) -> None: + try: + print( + """ + ┐ ╖ + ╓╖╬│╡ ││╬╖╖ + ╓╖╬│││││┘ ╬│││││╬╖ + ╖╬│││││╬╜ ╙╬│││││╖╖ ╗╗╗ + ╬╬╬╬╖││╦╖ ╖╬││╗╣╣╣╬ ╟╣╣╬ ╟╣╣╣ ╜╜╜ ╟╣╣ + ╬╬╬╬╬╬╬╬╖│╬╖╖╓╬╪│╓╣╣╣╣╣╣╣╬ ╟╣╣╬ ╟╣╣╣ ╒╣╣╖╗╣╣╣╗ ╣╣╣ ╣╣╣╣╣╣ ╟╣╣╖ ╣╣╣ + ╬╬╬╬┐ ╙╬╬╬╬│╓╣╣╣╝╜ ╫╣╣╣╬ ╟╣╣╬ ╟╣╣╣ ╟╣╣╣╙ ╙╣╣╣ ╣╣╣ ╙╟╣╣╜╙ ╫╣╣ ╟╣╣ + ╬╬╬╬┐ ╙╬╬╣╣ ╫╣╣╣╬ ╟╣╣╬ ╟╣╣╣ ╟╣╣╬ ╣╣╣ ╣╣╣ ╟╣╣ ╣╣╣┌╣╣╜ + ╬╬╬╜ ╬╬╣╣ ╙╝╣╣╬ ╙╣╣╣╗╖╓╗╣╣╣╜ ╟╣╣╬ ╣╣╣ ╣╣╣ ╟╣╣╦╓ ╣╣╣╣╣ + ╙ ╓╦╖ ╬╬╣╣ ╓╗╗╖ ╙╝╣╣╣╣╝╜ ╘╝╝╜ ╝╝╝ ╝╝╝ ╙╣╣╣ ╟╣╣╣ + ╩╬╬╬╬╬╬╦╦╬╬╣╣╗╣╣╣╣╣╣╣╝ ╫╣╣╣╣ + ╙╬╬╬╬╬╬╬╣╣╣╣╣╣╝╜ + ╙╬╬╬╣╣╣╜ + ╙ + """ + ) + except Exception: + print("\n\n\tUnity Technologies\n") + print(get_version_string()) + + if options.debug: + log_level = logging_util.DEBUG + else: + log_level = logging_util.INFO + + logging_util.set_log_level(log_level) + + logger.debug("Configuration for this run:") + logger.debug(json.dumps(options.as_dict(), indent=4)) + + # Options deprecation warnings + if options.checkpoint_settings.load_model: + logger.warning( + "The --load option has been deprecated. Please use the --resume option instead." + ) + if options.checkpoint_settings.train_model: + logger.warning( + "The --train option has been deprecated. Train mode is now the default. Use " + "--inference to run in inference mode." 
+ ) + + run_seed = options.env_settings.seed + num_areas = options.env_settings.num_areas + + # Add some timer metadata + add_timer_metadata("mlagents_version", mlagents.trainers.__version__) + add_timer_metadata("mlagents_envs_version", mlagents_envs.__version__) + add_timer_metadata("communication_protocol_version", UnityEnvironment.API_VERSION) + add_timer_metadata("pytorch_version", torch_utils.torch.__version__) + add_timer_metadata("numpy_version", np.__version__) + + if options.env_settings.seed == -1: + run_seed = np.random.randint(0, 10000) + logger.debug(f"run_seed set to {run_seed}") + run_training(run_seed, options, num_areas) + + +def main(): + run_cli(parse_command_line()) + + +# For python debugger to directly run this script +if __name__ == "__main__": + main() diff --git a/MLPY/Lib/site-packages/mlagents/trainers/model_saver/__init__.py b/MLPY/Lib/site-packages/mlagents/trainers/model_saver/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/MLPY/Lib/site-packages/mlagents/trainers/model_saver/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/model_saver/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d4f55d529f7761190dd1804e42dacd164dd2936c Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/model_saver/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/model_saver/__pycache__/model_saver.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/model_saver/__pycache__/model_saver.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6f496daddf5e3f6a50faeae8d141843831674105 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/model_saver/__pycache__/model_saver.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/model_saver/__pycache__/torch_model_saver.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/model_saver/__pycache__/torch_model_saver.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..677fa2db96ed631363e469e8ee4673f22f3536b4 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/model_saver/__pycache__/torch_model_saver.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/model_saver/model_saver.py b/MLPY/Lib/site-packages/mlagents/trainers/model_saver/model_saver.py new file mode 100644 index 0000000000000000000000000000000000000000..c1594ff08e79e251a8cb7ce6c074d929819787ff --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/model_saver/model_saver.py @@ -0,0 +1,69 @@ +# # Unity ML-Agents Toolkit +import abc +from typing import Any, Tuple, List + + +class BaseModelSaver(abc.ABC): + """This class is the base class for the ModelSaver""" + + def __init__(self): + pass + + @abc.abstractmethod + def register(self, module: Any) -> None: + """ + Register the modules to the ModelSaver. + The ModelSaver will store the module and include it in the saved files + when saving checkpoint/exporting graph. + :param module: the module to be registered + """ + pass + + def _register_policy(self, policy): + """ + Helper function for registering policy to the ModelSaver. + :param policy: the policy to be registered + """ + pass + + def _register_optimizer(self, optimizer): + """ + Helper function for registering optimizer to the ModelSaver. 
+ :param optimizer: the optimizer to be registered + """ + pass + + @abc.abstractmethod + def save_checkpoint(self, behavior_name: str, step: int) -> Tuple[str, List[str]]: + """ + Checkpoints the policy on disk. + :param checkpoint_path: filepath to write the checkpoint + :param behavior_name: Behavior name of bevavior to be trained + :return: A Tuple of the path to the exported file, as well as a List of any + auxillary files that were returned. For instance, an exported file would be Model.onnx, + and the auxillary files would be [Model.pt] for PyTorch + """ + pass + + @abc.abstractmethod + def export(self, output_filepath: str, behavior_name: str) -> None: + """ + Saves the serialized model, given a path and behavior name. + This method will save the policy graph to the given filepath. The path + should be provided without an extension as multiple serialized model formats + may be generated as a result. + :param output_filepath: path (without suffix) for the model file(s) + :param behavior_name: Behavior name of behavior to be trained. + """ + pass + + @abc.abstractmethod + def initialize_or_load(self, policy): + """ + Initialize/Load registered modules by default. + If given input argument policy, do with the input policy instead. + This argument is mainly for the initialization of the ghost trainer's fixed policy. + :param policy (optional): if given, perform the initializing/loading on this input policy. + Otherwise, do with the registered policy + """ + pass diff --git a/MLPY/Lib/site-packages/mlagents/trainers/model_saver/torch_model_saver.py b/MLPY/Lib/site-packages/mlagents/trainers/model_saver/torch_model_saver.py new file mode 100644 index 0000000000000000000000000000000000000000..70c3f19e431d6cdd9b346a311e79745326238ff5 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/model_saver/torch_model_saver.py @@ -0,0 +1,153 @@ +import os +import shutil +from mlagents.torch_utils import torch +from typing import Dict, Union, Optional, cast, Tuple, List +from mlagents_envs.exception import UnityPolicyException +from mlagents_envs.logging_util import get_logger +from mlagents.trainers.model_saver.model_saver import BaseModelSaver +from mlagents.trainers.settings import TrainerSettings, SerializationSettings +from mlagents.trainers.policy.torch_policy import TorchPolicy +from mlagents.trainers.optimizer.torch_optimizer import TorchOptimizer +from mlagents.trainers.torch_entities.model_serialization import ModelSerializer + + +logger = get_logger(__name__) +DEFAULT_CHECKPOINT_NAME = "checkpoint.pt" + + +class TorchModelSaver(BaseModelSaver): + """ + ModelSaver class for PyTorch + """ + + def __init__( + self, trainer_settings: TrainerSettings, model_path: str, load: bool = False + ): + super().__init__() + self.model_path = model_path + self.initialize_path = trainer_settings.init_path + self._keep_checkpoints = trainer_settings.keep_checkpoints + self.load = load + + self.policy: Optional[TorchPolicy] = None + self.exporter: Optional[ModelSerializer] = None + self.modules: Dict[str, torch.nn.Modules] = {} + + def register(self, module: Union[TorchPolicy, TorchOptimizer]) -> None: + if isinstance(module, TorchPolicy) or isinstance(module, TorchOptimizer): + self.modules.update(module.get_modules()) # type: ignore + else: + raise UnityPolicyException( + "Registering Object of unsupported type {} to ModelSaver ".format( + type(module) + ) + ) + if self.policy is None and isinstance(module, TorchPolicy): + self.policy = module + self.exporter = ModelSerializer(self.policy) + + def 
save_checkpoint(self, behavior_name: str, step: int) -> Tuple[str, List[str]]: + if not os.path.exists(self.model_path): + os.makedirs(self.model_path) + checkpoint_path = os.path.join(self.model_path, f"{behavior_name}-{step}") + state_dict = { + name: module.state_dict() for name, module in self.modules.items() + } + pytorch_ckpt_path = f"{checkpoint_path}.pt" + export_ckpt_path = f"{checkpoint_path}.onnx" + torch.save(state_dict, f"{checkpoint_path}.pt") + torch.save(state_dict, os.path.join(self.model_path, DEFAULT_CHECKPOINT_NAME)) + self.export(checkpoint_path, behavior_name) + return export_ckpt_path, [pytorch_ckpt_path] + + def export(self, output_filepath: str, behavior_name: str) -> None: + if self.exporter is not None: + self.exporter.export_policy_model(output_filepath) + + def initialize_or_load(self, policy: Optional[TorchPolicy] = None) -> None: + # Initialize/Load registered self.policy by default. + # If given input argument policy, use the input policy instead. + # This argument is mainly for initialization of the ghost trainer's fixed policy. + reset_steps = not self.load + if self.initialize_path is not None: + logger.info(f"Initializing from {self.initialize_path}.") + self._load_model( + self.initialize_path, policy, reset_global_steps=reset_steps + ) + elif self.load: + logger.info(f"Resuming from {self.model_path}.") + self._load_model( + os.path.join(self.model_path, DEFAULT_CHECKPOINT_NAME), + policy, + reset_global_steps=reset_steps, + ) + + def _load_model( + self, + load_path: str, + policy: Optional[TorchPolicy] = None, + reset_global_steps: bool = False, + ) -> None: + saved_state_dict = torch.load(load_path) + if policy is None: + modules = self.modules + policy = self.policy + else: + modules = policy.get_modules() + policy = cast(TorchPolicy, policy) + + for name, mod in modules.items(): + try: + if isinstance(mod, torch.nn.Module): + missing_keys, unexpected_keys = mod.load_state_dict( + saved_state_dict[name], strict=False + ) + if missing_keys: + logger.warning( + f"Did not find these keys {missing_keys} in checkpoint. Initializing." + ) + if unexpected_keys: + logger.warning( + f"Did not expect these keys {unexpected_keys} in checkpoint. Ignoring." + ) + else: + # If module is not an nn.Module, try to load as one piece + mod.load_state_dict(saved_state_dict[name]) + + # KeyError is raised if the module was not present in the last run but is being + # accessed in the saved_state_dict. + # ValueError is raised by the optimizer's load_state_dict if the parameters have + # have changed. Note, the optimizer uses a completely different load_state_dict + # function because it is not an nn.Module. + # RuntimeError is raised by PyTorch if there is a size mismatch between modules + # of the same name. This will still partially assign values to those layers that + # have not changed shape. + except (KeyError, ValueError, RuntimeError) as err: + logger.warning(f"Failed to load for module {name}. Initializing") + logger.debug(f"Module loading error : {err}") + + if reset_global_steps: + policy.set_step(0) + logger.info( + "Starting training from step 0 and saving to {}.".format( + self.model_path + ) + ) + else: + logger.info(f"Resuming training from step {policy.get_current_step()}.") + + def copy_final_model(self, source_nn_path: str) -> None: + """ + Copy the .nn file at the given source to the destination. + Also copies the corresponding .onnx file if it exists. 
+ """ + final_model_name = os.path.splitext(source_nn_path)[0] + + if SerializationSettings.convert_to_onnx: + try: + source_path = f"{final_model_name}.onnx" + destination_path = f"{self.model_path}.onnx" + shutil.copyfile(source_path, destination_path) + logger.info(f"Copied {source_path} to {destination_path}.") + except OSError: + pass diff --git a/MLPY/Lib/site-packages/mlagents/trainers/optimizer/__init__.py b/MLPY/Lib/site-packages/mlagents/trainers/optimizer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ba7b9a71313c198fad246562c0e0f796053eb562 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/optimizer/__init__.py @@ -0,0 +1 @@ +from mlagents.trainers.optimizer.optimizer import Optimizer # noqa diff --git a/MLPY/Lib/site-packages/mlagents/trainers/optimizer/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/optimizer/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..91a2c96bb345fb41c0116e7d7c8c49951c04b917 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/optimizer/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/optimizer/__pycache__/optimizer.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/optimizer/__pycache__/optimizer.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6f3e62685ca86a1156dc09579a15965efbaeeafc Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/optimizer/__pycache__/optimizer.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/optimizer/__pycache__/torch_optimizer.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/optimizer/__pycache__/torch_optimizer.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b031ab29d01c2bd1b036eb57bec92df5ef621439 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/optimizer/__pycache__/torch_optimizer.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/optimizer/optimizer.py b/MLPY/Lib/site-packages/mlagents/trainers/optimizer/optimizer.py new file mode 100644 index 0000000000000000000000000000000000000000..d18e3c60ec292084127f28e77d144723804c4e9a --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/optimizer/optimizer.py @@ -0,0 +1,24 @@ +import abc +from typing import Dict + +from mlagents.trainers.buffer import AgentBuffer + + +class Optimizer(abc.ABC): + """ + Creates loss functions and auxillary networks (e.g. Q or Value) needed for training. + Provides methods to update the Policy. + """ + + def __init__(self): + self.reward_signals = {} + + @abc.abstractmethod + def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: + """ + Update the Policy based on the batch that was passed in. + :param batch: AgentBuffer that contains the minibatch of data used for this update. + :param num_sequences: Number of recurrent sequences found in the minibatch. + :return: A Dict containing statistics (name, value) from the update (e.g. 
loss) + """ + pass diff --git a/MLPY/Lib/site-packages/mlagents/trainers/optimizer/torch_optimizer.py b/MLPY/Lib/site-packages/mlagents/trainers/optimizer/torch_optimizer.py new file mode 100644 index 0000000000000000000000000000000000000000..8cb0a6ee8c0809ccbf7a15cdd474e92059575efc --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/optimizer/torch_optimizer.py @@ -0,0 +1,211 @@ +from typing import Dict, Optional, Tuple, List +from mlagents.torch_utils import torch +import numpy as np +from collections import defaultdict + +from mlagents.trainers.buffer import AgentBuffer, AgentBufferField +from mlagents.trainers.trajectory import ObsUtil +from mlagents.trainers.torch_entities.components.bc.module import BCModule +from mlagents.trainers.torch_entities.components.reward_providers import ( + create_reward_provider, +) + +from mlagents.trainers.policy.torch_policy import TorchPolicy +from mlagents.trainers.optimizer import Optimizer +from mlagents.trainers.settings import ( + TrainerSettings, + RewardSignalSettings, + RewardSignalType, +) +from mlagents.trainers.torch_entities.utils import ModelUtils + + +class TorchOptimizer(Optimizer): + def __init__(self, policy: TorchPolicy, trainer_settings: TrainerSettings): + super().__init__() + self.policy = policy + self.trainer_settings = trainer_settings + self.update_dict: Dict[str, torch.Tensor] = {} + self.value_heads: Dict[str, torch.Tensor] = {} + self.memory_in: torch.Tensor = None + self.memory_out: torch.Tensor = None + self.m_size: int = 0 + self.global_step = torch.tensor(0) + self.bc_module: Optional[BCModule] = None + self.create_reward_signals(trainer_settings.reward_signals) + self.critic_memory_dict: Dict[str, torch.Tensor] = {} + if trainer_settings.behavioral_cloning is not None: + self.bc_module = BCModule( + self.policy, + trainer_settings.behavioral_cloning, + policy_learning_rate=trainer_settings.hyperparameters.learning_rate, + default_batch_size=trainer_settings.hyperparameters.batch_size, + default_num_epoch=3, + ) + + @property + def critic(self): + raise NotImplementedError + + def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: + pass + + def create_reward_signals( + self, reward_signal_configs: Dict[RewardSignalType, RewardSignalSettings] + ) -> None: + """ + Create reward signals + :param reward_signal_configs: Reward signal config. + """ + for reward_signal, settings in reward_signal_configs.items(): + # Name reward signals by string in case we have duplicates later + self.reward_signals[reward_signal.value] = create_reward_provider( + reward_signal, self.policy.behavior_spec, settings + ) + + def _evaluate_by_sequence( + self, tensor_obs: List[torch.Tensor], initial_memory: torch.Tensor + ) -> Tuple[Dict[str, torch.Tensor], AgentBufferField, torch.Tensor]: + """ + Evaluate a trajectory sequence-by-sequence, assembling the result. This enables us to get the + intermediate memories for the critic. + :param tensor_obs: A List of tensors of shape (trajectory_len, ) that are the agent's + observations for this trajectory. + :param initial_memory: The memory that preceeds this trajectory. Of shape (1,1,), i.e. + what is returned as the output of a MemoryModules. + :return: A Tuple of the value estimates as a Dict of [name, tensor], an AgentBufferField of the initial + memories to be used during value function update, and the final memory at the end of the trajectory. 
+ """ + num_experiences = tensor_obs[0].shape[0] + all_next_memories = AgentBufferField() + # When using LSTM, we need to divide the trajectory into sequences of equal length. Sometimes, + # that division isn't even, and we must pad the leftover sequence. + # When it is added to the buffer, the last sequence will be padded. So if seq_len = 3 and + # trajectory is of length 10, the last sequence is [obs,pad,pad] once it is added to the buffer. + # Compute the number of elements in this sequence that will end up being padded. + leftover_seq_len = num_experiences % self.policy.sequence_length + + all_values: Dict[str, List[np.ndarray]] = defaultdict(list) + _mem = initial_memory + # Evaluate other trajectories, carrying over _mem after each + # trajectory + for seq_num in range(num_experiences // self.policy.sequence_length): + seq_obs = [] + for _ in range(self.policy.sequence_length): + all_next_memories.append(ModelUtils.to_numpy(_mem.squeeze())) + start = seq_num * self.policy.sequence_length + end = (seq_num + 1) * self.policy.sequence_length + + for _obs in tensor_obs: + seq_obs.append(_obs[start:end]) + values, _mem = self.critic.critic_pass( + seq_obs, _mem, sequence_length=self.policy.sequence_length + ) + for signal_name, _val in values.items(): + all_values[signal_name].append(_val) + + # Compute values for the potentially truncated last sequence. Note that this + # sequence isn't padded yet, but will be. + seq_obs = [] + + if leftover_seq_len > 0: + for _obs in tensor_obs: + last_seq_obs = _obs[-leftover_seq_len:] + seq_obs.append(last_seq_obs) + + # For the last sequence, the initial memory should be the one at the + # end of this trajectory. + for _ in range(leftover_seq_len): + all_next_memories.append(ModelUtils.to_numpy(_mem.squeeze())) + + last_values, _mem = self.critic.critic_pass( + seq_obs, _mem, sequence_length=leftover_seq_len + ) + for signal_name, _val in last_values.items(): + all_values[signal_name].append(_val) + + # Create one tensor per reward signal + all_value_tensors = { + signal_name: torch.cat(value_list, dim=0) + for signal_name, value_list in all_values.items() + } + next_mem = _mem + return all_value_tensors, all_next_memories, next_mem + + def update_reward_signals(self, batch: AgentBuffer) -> Dict[str, float]: + update_stats: Dict[str, float] = {} + for reward_provider in self.reward_signals.values(): + update_stats.update(reward_provider.update(batch)) + return update_stats + + def get_trajectory_value_estimates( + self, + batch: AgentBuffer, + next_obs: List[np.ndarray], + done: bool, + agent_id: str = "", + ) -> Tuple[Dict[str, np.ndarray], Dict[str, float], Optional[AgentBufferField]]: + """ + Get value estimates and memories for a trajectory, in batch form. + :param batch: An AgentBuffer that consists of a trajectory. + :param next_obs: the next observation (after the trajectory). Used for boostrapping + if this is not a termiinal trajectory. + :param done: Set true if this is a terminal trajectory. + :param agent_id: Agent ID of the agent that this trajectory belongs to. + :returns: A Tuple of the Value Estimates as a Dict of [name, np.ndarray(trajectory_len)], + the final value estimate as a Dict of [name, float], and optionally (if using memories) + an AgentBufferField of initial critic memories to be used during update. 
+ """ + n_obs = len(self.policy.behavior_spec.observation_specs) + + if agent_id in self.critic_memory_dict: + memory = self.critic_memory_dict[agent_id] + else: + memory = ( + torch.zeros((1, 1, self.critic.memory_size)) + if self.policy.use_recurrent + else None + ) + + # Convert to tensors + current_obs = [ + ModelUtils.list_to_tensor(obs) for obs in ObsUtil.from_buffer(batch, n_obs) + ] + next_obs = [ModelUtils.list_to_tensor(obs) for obs in next_obs] + + next_obs = [obs.unsqueeze(0) for obs in next_obs] + + # If we're using LSTM, we want to get all the intermediate memories. + all_next_memories: Optional[AgentBufferField] = None + + # To prevent memory leak and improve performance, evaluate with no_grad. + with torch.no_grad(): + if self.policy.use_recurrent: + ( + value_estimates, + all_next_memories, + next_memory, + ) = self._evaluate_by_sequence(current_obs, memory) + else: + value_estimates, next_memory = self.critic.critic_pass( + current_obs, memory, sequence_length=batch.num_experiences + ) + + # Store the memory for the next trajectory. This should NOT have a gradient. + self.critic_memory_dict[agent_id] = next_memory + + next_value_estimate, _ = self.critic.critic_pass( + next_obs, next_memory, sequence_length=1 + ) + + for name, estimate in value_estimates.items(): + value_estimates[name] = ModelUtils.to_numpy(estimate) + next_value_estimate[name] = ModelUtils.to_numpy(next_value_estimate[name]) + + if done: + for k in next_value_estimate: + if not self.reward_signals[k].ignore_done: + next_value_estimate[k] = 0.0 + if agent_id in self.critic_memory_dict: + self.critic_memory_dict.pop(agent_id) + return value_estimates, next_value_estimate, all_next_memories diff --git a/MLPY/Lib/site-packages/mlagents/trainers/poca/__init__.py b/MLPY/Lib/site-packages/mlagents/trainers/poca/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/MLPY/Lib/site-packages/mlagents/trainers/poca/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/poca/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f8d6e46d80918231d0c49c9e8a7660c96d9a60e1 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/poca/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/poca/__pycache__/optimizer_torch.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/poca/__pycache__/optimizer_torch.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..20f71e62e75ec4c80baa7f8b08b679380aa0dfec Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/poca/__pycache__/optimizer_torch.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/poca/__pycache__/trainer.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/poca/__pycache__/trainer.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1040c036b396c886d9209463f1410ad607a32b58 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/poca/__pycache__/trainer.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/poca/optimizer_torch.py b/MLPY/Lib/site-packages/mlagents/trainers/poca/optimizer_torch.py new file mode 100644 index 0000000000000000000000000000000000000000..4f77de4ebbf3f005fb7e2f4eb5ccda0f1bb640e0 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/poca/optimizer_torch.py @@ -0,0 +1,690 @@ +from typing 
import Dict, cast, List, Tuple, Optional +from collections import defaultdict +import attr + +from mlagents.trainers.torch_entities.components.reward_providers.extrinsic_reward_provider import ( + ExtrinsicRewardProvider, +) +import numpy as np +from mlagents.torch_utils import torch, default_device + +from mlagents.trainers.buffer import ( + AgentBuffer, + BufferKey, + RewardSignalUtil, + AgentBufferField, +) + +from mlagents_envs.timers import timed +from mlagents_envs.base_env import ObservationSpec, ActionSpec +from mlagents.trainers.policy.torch_policy import TorchPolicy +from mlagents.trainers.optimizer.torch_optimizer import TorchOptimizer +from mlagents.trainers.settings import ( + RewardSignalSettings, + RewardSignalType, + TrainerSettings, + NetworkSettings, + OnPolicyHyperparamSettings, + ScheduleType, +) +from mlagents.trainers.torch_entities.networks import Critic, MultiAgentNetworkBody +from mlagents.trainers.torch_entities.decoders import ValueHeads +from mlagents.trainers.torch_entities.agent_action import AgentAction +from mlagents.trainers.torch_entities.action_log_probs import ActionLogProbs +from mlagents.trainers.torch_entities.utils import ModelUtils +from mlagents.trainers.trajectory import ObsUtil, GroupObsUtil + +from mlagents_envs.logging_util import get_logger + +logger = get_logger(__name__) + + +@attr.s(auto_attribs=True) +class POCASettings(OnPolicyHyperparamSettings): + beta: float = 5.0e-3 + epsilon: float = 0.2 + lambd: float = 0.95 + num_epoch: int = 3 + learning_rate_schedule: ScheduleType = ScheduleType.LINEAR + beta_schedule: ScheduleType = ScheduleType.LINEAR + epsilon_schedule: ScheduleType = ScheduleType.LINEAR + + +class TorchPOCAOptimizer(TorchOptimizer): + class POCAValueNetwork(torch.nn.Module, Critic): + """ + The POCAValueNetwork uses the MultiAgentNetworkBody to compute the value + and POCA baseline for a variable number of agents in a group that all + share the same observation and action space. + """ + + def __init__( + self, + stream_names: List[str], + observation_specs: List[ObservationSpec], + network_settings: NetworkSettings, + action_spec: ActionSpec, + ): + torch.nn.Module.__init__(self) + self.network_body = MultiAgentNetworkBody( + observation_specs, network_settings, action_spec + ) + if network_settings.memory is not None: + encoding_size = network_settings.memory.memory_size // 2 + else: + encoding_size = network_settings.hidden_units + + self.value_heads = ValueHeads(stream_names, encoding_size + 1, 1) + # The + 1 is for the normalized number of agents + + @property + def memory_size(self) -> int: + return self.network_body.memory_size + + def update_normalization(self, buffer: AgentBuffer) -> None: + self.network_body.update_normalization(buffer) + + def baseline( + self, + obs_without_actions: List[torch.Tensor], + obs_with_actions: Tuple[List[List[torch.Tensor]], List[AgentAction]], + memories: Optional[torch.Tensor] = None, + sequence_length: int = 1, + ) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]: + """ + The POCA baseline marginalizes the action of the agent associated with self_obs. + It calls the forward pass of the MultiAgentNetworkBody with the state action + pairs of groupmates but just the state of the agent in question. + :param obs_without_actions: The obs of the agent for which to compute the baseline. + :param obs_with_actions: Tuple of observations and actions for all groupmates. + :param memories: If using memory, a Tensor of initial memories. 
+ :param sequence_length: If using memory, the sequence length. + + :return: A Tuple of Dict of reward stream to tensor and critic memories. + """ + (obs, actions) = obs_with_actions + encoding, memories = self.network_body( + obs_only=[obs_without_actions], + obs=obs, + actions=actions, + memories=memories, + sequence_length=sequence_length, + ) + + value_outputs, critic_mem_out = self.forward( + encoding, memories, sequence_length + ) + return value_outputs, critic_mem_out + + def critic_pass( + self, + obs: List[List[torch.Tensor]], + memories: Optional[torch.Tensor] = None, + sequence_length: int = 1, + ) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]: + """ + A centralized value function. It calls the forward pass of MultiAgentNetworkBody + with just the states of all agents. + :param obs: List of observations for all agents in group + :param memories: If using memory, a Tensor of initial memories. + :param sequence_length: If using memory, the sequence length. + :return: A Tuple of Dict of reward stream to tensor and critic memories. + """ + encoding, memories = self.network_body( + obs_only=obs, + obs=[], + actions=[], + memories=memories, + sequence_length=sequence_length, + ) + + value_outputs, critic_mem_out = self.forward( + encoding, memories, sequence_length + ) + return value_outputs, critic_mem_out + + def forward( + self, + encoding: torch.Tensor, + memories: Optional[torch.Tensor] = None, + sequence_length: int = 1, + ) -> Tuple[torch.Tensor, torch.Tensor]: + + output = self.value_heads(encoding) + return output, memories + + def __init__(self, policy: TorchPolicy, trainer_settings: TrainerSettings): + """ + Takes a Policy and a Dict of trainer parameters and creates an Optimizer around the policy. + :param policy: A TorchPolicy object that will be updated by this POCA Optimizer. + :param trainer_params: Trainer parameters dictionary that specifies the + properties of the trainer. + """ + # Create the graph here to give more granular control of the TF graph to the Optimizer. 
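# ---------------------------------------------------------------------------
# Illustrative aside (reviewer note, not part of the patch): the
# ModelUtils.DecayedValue objects created just below anneal the learning
# rate, epsilon and beta from their configured values down to a floor over
# max_steps. A minimal sketch of a LINEAR schedule, under the assumption that
# it is a straight interpolation clamped at max_steps; the real DecayedValue
# implementation may differ in details.
def linear_decay(initial: float, min_value: float, max_steps: int, step: int) -> float:
    """Interpolate from `initial` at step 0 down to `min_value` at `max_steps`."""
    frac = min(float(step) / float(max_steps), 1.0)
    return (initial - min_value) * (1.0 - frac) + min_value

# e.g. linear_decay(3e-4, 1e-10, 500_000, 250_000) is ~1.5e-4, i.e. the
# learning rate is halved midway through training under this assumption.
# ---------------------------------------------------------------------------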
+ + super().__init__(policy, trainer_settings) + reward_signal_configs = trainer_settings.reward_signals + reward_signal_names = [key.value for key, _ in reward_signal_configs.items()] + + self._critic = TorchPOCAOptimizer.POCAValueNetwork( + reward_signal_names, + policy.behavior_spec.observation_specs, + network_settings=trainer_settings.network_settings, + action_spec=policy.behavior_spec.action_spec, + ) + # Move to GPU if needed + self._critic.to(default_device()) + + params = list(self.policy.actor.parameters()) + list(self.critic.parameters()) + + self.hyperparameters: POCASettings = cast( + POCASettings, trainer_settings.hyperparameters + ) + + self.decay_learning_rate = ModelUtils.DecayedValue( + self.hyperparameters.learning_rate_schedule, + self.hyperparameters.learning_rate, + 1e-10, + self.trainer_settings.max_steps, + ) + self.decay_epsilon = ModelUtils.DecayedValue( + self.hyperparameters.epsilon_schedule, + self.hyperparameters.epsilon, + 0.1, + self.trainer_settings.max_steps, + ) + self.decay_beta = ModelUtils.DecayedValue( + self.hyperparameters.beta_schedule, + self.hyperparameters.beta, + 1e-5, + self.trainer_settings.max_steps, + ) + + self.optimizer = torch.optim.Adam( + params, lr=self.trainer_settings.hyperparameters.learning_rate + ) + self.stats_name_to_update_name = { + "Losses/Value Loss": "value_loss", + "Losses/Policy Loss": "policy_loss", + } + + self.stream_names = list(self.reward_signals.keys()) + self.value_memory_dict: Dict[str, torch.Tensor] = {} + self.baseline_memory_dict: Dict[str, torch.Tensor] = {} + + def create_reward_signals( + self, reward_signal_configs: Dict[RewardSignalType, RewardSignalSettings] + ) -> None: + """ + Create reward signals. Override default to provide warnings for Curiosity and + GAIL, and make sure Extrinsic adds team rewards. + :param reward_signal_configs: Reward signal config. + """ + for reward_signal in reward_signal_configs.keys(): + if reward_signal != RewardSignalType.EXTRINSIC: + logger.warning( + f"Reward signal {reward_signal.value.capitalize()} is not supported with the POCA trainer; " + "results may be unexpected." + ) + super().create_reward_signals(reward_signal_configs) + # Make sure we add the groupmate rewards in POCA, so agents learn how to help each + # other achieve individual rewards as well + for reward_provider in self.reward_signals.values(): + if isinstance(reward_provider, ExtrinsicRewardProvider): + reward_provider.add_groupmate_rewards = True + + @property + def critic(self): + return self._critic + + @timed + def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: + """ + Performs update on model. + :param batch: Batch of experiences. + :param num_sequences: Number of sequences to process. + :return: Results of update. 
+ """ + # Get decayed parameters + decay_lr = self.decay_learning_rate.get_value(self.policy.get_current_step()) + decay_eps = self.decay_epsilon.get_value(self.policy.get_current_step()) + decay_bet = self.decay_beta.get_value(self.policy.get_current_step()) + returns = {} + old_values = {} + old_baseline_values = {} + for name in self.reward_signals: + old_values[name] = ModelUtils.list_to_tensor( + batch[RewardSignalUtil.value_estimates_key(name)] + ) + returns[name] = ModelUtils.list_to_tensor( + batch[RewardSignalUtil.returns_key(name)] + ) + old_baseline_values[name] = ModelUtils.list_to_tensor( + batch[RewardSignalUtil.baseline_estimates_key(name)] + ) + + n_obs = len(self.policy.behavior_spec.observation_specs) + current_obs = ObsUtil.from_buffer(batch, n_obs) + # Convert to tensors + current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs] + groupmate_obs = GroupObsUtil.from_buffer(batch, n_obs) + groupmate_obs = [ + [ModelUtils.list_to_tensor(obs) for obs in _groupmate_obs] + for _groupmate_obs in groupmate_obs + ] + + act_masks = ModelUtils.list_to_tensor(batch[BufferKey.ACTION_MASK]) + actions = AgentAction.from_buffer(batch) + groupmate_actions = AgentAction.group_from_buffer(batch) + + memories = [ + ModelUtils.list_to_tensor(batch[BufferKey.MEMORY][i]) + for i in range(0, len(batch[BufferKey.MEMORY]), self.policy.sequence_length) + ] + if len(memories) > 0: + memories = torch.stack(memories).unsqueeze(0) + value_memories = [ + ModelUtils.list_to_tensor(batch[BufferKey.CRITIC_MEMORY][i]) + for i in range( + 0, len(batch[BufferKey.CRITIC_MEMORY]), self.policy.sequence_length + ) + ] + + baseline_memories = [ + ModelUtils.list_to_tensor(batch[BufferKey.BASELINE_MEMORY][i]) + for i in range( + 0, len(batch[BufferKey.BASELINE_MEMORY]), self.policy.sequence_length + ) + ] + + if len(value_memories) > 0: + value_memories = torch.stack(value_memories).unsqueeze(0) + baseline_memories = torch.stack(baseline_memories).unsqueeze(0) + + run_out = self.policy.actor.get_stats( + current_obs, + actions, + masks=act_masks, + memories=memories, + sequence_length=self.policy.sequence_length, + ) + + log_probs = run_out["log_probs"] + entropy = run_out["entropy"] + + all_obs = [current_obs] + groupmate_obs + values, _ = self.critic.critic_pass( + all_obs, + memories=value_memories, + sequence_length=self.policy.sequence_length, + ) + groupmate_obs_and_actions = (groupmate_obs, groupmate_actions) + baselines, _ = self.critic.baseline( + current_obs, + groupmate_obs_and_actions, + memories=baseline_memories, + sequence_length=self.policy.sequence_length, + ) + old_log_probs = ActionLogProbs.from_buffer(batch).flatten() + log_probs = log_probs.flatten() + loss_masks = ModelUtils.list_to_tensor(batch[BufferKey.MASKS], dtype=torch.bool) + + baseline_loss = ModelUtils.trust_region_value_loss( + baselines, old_baseline_values, returns, decay_eps, loss_masks + ) + value_loss = ModelUtils.trust_region_value_loss( + values, old_values, returns, decay_eps, loss_masks + ) + policy_loss = ModelUtils.trust_region_policy_loss( + ModelUtils.list_to_tensor(batch[BufferKey.ADVANTAGES]), + log_probs, + old_log_probs, + loss_masks, + decay_eps, + ) + + loss = ( + policy_loss + + 0.5 * (value_loss + 0.5 * baseline_loss) + - decay_bet * ModelUtils.masked_mean(entropy, loss_masks) + ) + + # Set optimizer learning rate + ModelUtils.update_learning_rate(self.optimizer, decay_lr) + self.optimizer.zero_grad() + loss.backward() + + self.optimizer.step() + update_stats = { + # NOTE: abs() is not technically 
correct, but matches the behavior in TensorFlow. + # TODO: After PyTorch is default, change to something more correct. + "Losses/Policy Loss": torch.abs(policy_loss).item(), + "Losses/Value Loss": value_loss.item(), + "Losses/Baseline Loss": baseline_loss.item(), + "Policy/Learning Rate": decay_lr, + "Policy/Epsilon": decay_eps, + "Policy/Beta": decay_bet, + } + + return update_stats + + def get_modules(self): + modules = {"Optimizer:adam": self.optimizer, "Optimizer:critic": self._critic} + for reward_provider in self.reward_signals.values(): + modules.update(reward_provider.get_modules()) + return modules + + def _evaluate_by_sequence_team( + self, + self_obs: List[torch.Tensor], + obs: List[List[torch.Tensor]], + actions: List[AgentAction], + init_value_mem: torch.Tensor, + init_baseline_mem: torch.Tensor, + ) -> Tuple[ + Dict[str, torch.Tensor], + Dict[str, torch.Tensor], + AgentBufferField, + AgentBufferField, + torch.Tensor, + torch.Tensor, + ]: + """ + Evaluate a trajectory sequence-by-sequence, assembling the result. This enables us to get the + intermediate memories for the critic. + :param tensor_obs: A List of tensors of shape (trajectory_len, ) that are the agent's + observations for this trajectory. + :param initial_memory: The memory that preceeds this trajectory. Of shape (1,1,), i.e. + what is returned as the output of a MemoryModules. + :return: A Tuple of the value estimates as a Dict of [name, tensor], an AgentBufferField of the initial + memories to be used during value function update, and the final memory at the end of the trajectory. + """ + num_experiences = self_obs[0].shape[0] + all_next_value_mem = AgentBufferField() + all_next_baseline_mem = AgentBufferField() + + # When using LSTM, we need to divide the trajectory into sequences of equal length. Sometimes, + # that division isn't even, and we must pad the leftover sequence. + # In the buffer, the last sequence are the ones that are padded. So if seq_len = 3 and + # trajectory is of length 10, the last sequence is [obs,pad,pad]. + # Compute the number of elements in this padded seq. 
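# ---------------------------------------------------------------------------
# Illustrative aside (reviewer note, not part of the patch): the modulo below
# splits the trajectory into full sequences plus one shorter leftover slice.
# A minimal sketch with a hypothetical helper, just to make the arithmetic in
# the comment above concrete:
def sequence_lengths(trajectory_len: int, seq_len: int) -> list:
    """Lengths of the full sequences plus any truncated leftover sequence."""
    full = [seq_len] * (trajectory_len // seq_len)
    leftover = trajectory_len % seq_len
    return full + ([leftover] if leftover else [])

# sequence_lengths(10, 3) -> [3, 3, 3, 1]: three full sequences are evaluated
# with sequence_length=3, and only the final 1-step slice (which the buffer
# later pads to [obs, pad, pad]) is evaluated with the shorter length.
# ---------------------------------------------------------------------------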
+ leftover_seq_len = num_experiences % self.policy.sequence_length + + all_values: Dict[str, List[np.ndarray]] = defaultdict(list) + all_baseline: Dict[str, List[np.ndarray]] = defaultdict(list) + _baseline_mem = init_baseline_mem + _value_mem = init_value_mem + + # Evaluate other trajectories, carrying over _mem after each + # trajectory + for seq_num in range(num_experiences // self.policy.sequence_length): + for _ in range(self.policy.sequence_length): + all_next_value_mem.append(ModelUtils.to_numpy(_value_mem.squeeze())) + all_next_baseline_mem.append( + ModelUtils.to_numpy(_baseline_mem.squeeze()) + ) + + start = seq_num * self.policy.sequence_length + end = (seq_num + 1) * self.policy.sequence_length + + self_seq_obs = [] + groupmate_seq_obs = [] + groupmate_seq_act = [] + seq_obs = [] + for _self_obs in self_obs: + seq_obs.append(_self_obs[start:end]) + self_seq_obs.append(seq_obs) + + for groupmate_obs, groupmate_action in zip(obs, actions): + seq_obs = [] + for _obs in groupmate_obs: + sliced_seq_obs = _obs[start:end] + seq_obs.append(sliced_seq_obs) + groupmate_seq_obs.append(seq_obs) + _act = groupmate_action.slice(start, end) + groupmate_seq_act.append(_act) + + all_seq_obs = self_seq_obs + groupmate_seq_obs + values, _value_mem = self.critic.critic_pass( + all_seq_obs, _value_mem, sequence_length=self.policy.sequence_length + ) + for signal_name, _val in values.items(): + all_values[signal_name].append(_val) + + groupmate_obs_and_actions = (groupmate_seq_obs, groupmate_seq_act) + baselines, _baseline_mem = self.critic.baseline( + self_seq_obs[0], + groupmate_obs_and_actions, + _baseline_mem, + sequence_length=self.policy.sequence_length, + ) + for signal_name, _val in baselines.items(): + all_baseline[signal_name].append(_val) + + # Compute values for the potentially truncated initial sequence + if leftover_seq_len > 0: + self_seq_obs = [] + groupmate_seq_obs = [] + groupmate_seq_act = [] + seq_obs = [] + for _self_obs in self_obs: + last_seq_obs = _self_obs[-leftover_seq_len:] + seq_obs.append(last_seq_obs) + self_seq_obs.append(seq_obs) + + for groupmate_obs, groupmate_action in zip(obs, actions): + seq_obs = [] + for _obs in groupmate_obs: + last_seq_obs = _obs[-leftover_seq_len:] + seq_obs.append(last_seq_obs) + groupmate_seq_obs.append(seq_obs) + _act = groupmate_action.slice(len(_obs) - leftover_seq_len, len(_obs)) + groupmate_seq_act.append(_act) + + # For the last sequence, the initial memory should be the one at the + # beginning of this trajectory. 
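# ---------------------------------------------------------------------------
# Illustrative aside (reviewer note, not part of the patch): the value and
# baseline estimates assembled by this method are consumed later by the POCA
# trainer (see trainer.py further down in this patch), which computes roughly
#
#     returns   = lambda_return(rewards, V(all agents' obs), gamma, lambd, value_next)
#     advantage = returns - baseline(own obs, groupmates' obs + actions)
#
# i.e. the centralized value bootstraps the lambda-return while the
# counterfactual baseline, which marginalizes this agent's own action, is
# subtracted to assign per-agent credit. This is a paraphrase of the trainer
# code, not an exact transcription.
# ---------------------------------------------------------------------------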
+ seq_obs = [] + for _ in range(leftover_seq_len): + all_next_value_mem.append(ModelUtils.to_numpy(_value_mem.squeeze())) + all_next_baseline_mem.append( + ModelUtils.to_numpy(_baseline_mem.squeeze()) + ) + + all_seq_obs = self_seq_obs + groupmate_seq_obs + last_values, _value_mem = self.critic.critic_pass( + all_seq_obs, _value_mem, sequence_length=leftover_seq_len + ) + for signal_name, _val in last_values.items(): + all_values[signal_name].append(_val) + groupmate_obs_and_actions = (groupmate_seq_obs, groupmate_seq_act) + last_baseline, _baseline_mem = self.critic.baseline( + self_seq_obs[0], + groupmate_obs_and_actions, + _baseline_mem, + sequence_length=leftover_seq_len, + ) + for signal_name, _val in last_baseline.items(): + all_baseline[signal_name].append(_val) + # Create one tensor per reward signal + all_value_tensors = { + signal_name: torch.cat(value_list, dim=0) + for signal_name, value_list in all_values.items() + } + all_baseline_tensors = { + signal_name: torch.cat(baseline_list, dim=0) + for signal_name, baseline_list in all_baseline.items() + } + next_value_mem = _value_mem + next_baseline_mem = _baseline_mem + return ( + all_value_tensors, + all_baseline_tensors, + all_next_value_mem, + all_next_baseline_mem, + next_value_mem, + next_baseline_mem, + ) + + def get_trajectory_value_estimates( + self, + batch: AgentBuffer, + next_obs: List[np.ndarray], + done: bool, + agent_id: str = "", + ) -> Tuple[Dict[str, np.ndarray], Dict[str, float], Optional[AgentBufferField]]: + """ + Override base class method. Unused in the trainer, but needed to make sure class heirarchy is maintained. + Assume that there are no group obs. + """ + ( + value_estimates, + _, + next_value_estimates, + all_next_value_mem, + _, + ) = self.get_trajectory_and_baseline_value_estimates( + batch, next_obs, [], done, agent_id + ) + + return value_estimates, next_value_estimates, all_next_value_mem + + def get_trajectory_and_baseline_value_estimates( + self, + batch: AgentBuffer, + next_obs: List[np.ndarray], + next_groupmate_obs: List[List[np.ndarray]], + done: bool, + agent_id: str = "", + ) -> Tuple[ + Dict[str, np.ndarray], + Dict[str, np.ndarray], + Dict[str, float], + Optional[AgentBufferField], + Optional[AgentBufferField], + ]: + """ + Get value estimates, baseline estimates, and memories for a trajectory, in batch form. + :param batch: An AgentBuffer that consists of a trajectory. + :param next_obs: the next observation (after the trajectory). Used for boostrapping + if this is not a termiinal trajectory. + :param next_groupmate_obs: the next observations from other members of the group. + :param done: Set true if this is a terminal trajectory. + :param agent_id: Agent ID of the agent that this trajectory belongs to. + :returns: A Tuple of the Value Estimates as a Dict of [name, np.ndarray(trajectory_len)], + the baseline estimates as a Dict, the final value estimate as a Dict of [name, float], and + optionally (if using memories) an AgentBufferField of initial critic and baseline memories to be used + during update. 
+ """ + + n_obs = len(self.policy.behavior_spec.observation_specs) + + current_obs = ObsUtil.from_buffer(batch, n_obs) + groupmate_obs = GroupObsUtil.from_buffer(batch, n_obs) + + current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs] + groupmate_obs = [ + [ModelUtils.list_to_tensor(obs) for obs in _groupmate_obs] + for _groupmate_obs in groupmate_obs + ] + + groupmate_actions = AgentAction.group_from_buffer(batch) + + next_obs = [ModelUtils.list_to_tensor(obs) for obs in next_obs] + next_obs = [obs.unsqueeze(0) for obs in next_obs] + + next_groupmate_obs = [ + ModelUtils.list_to_tensor_list(_list_obs) + for _list_obs in next_groupmate_obs + ] + # Expand dimensions of next critic obs + next_groupmate_obs = [ + [_obs.unsqueeze(0) for _obs in _list_obs] + for _list_obs in next_groupmate_obs + ] + + if agent_id in self.value_memory_dict: + # The agent_id should always be in both since they are added together + _init_value_mem = self.value_memory_dict[agent_id] + _init_baseline_mem = self.baseline_memory_dict[agent_id] + else: + _init_value_mem = ( + torch.zeros((1, 1, self.critic.memory_size)) + if self.policy.use_recurrent + else None + ) + _init_baseline_mem = ( + torch.zeros((1, 1, self.critic.memory_size)) + if self.policy.use_recurrent + else None + ) + + all_obs = ( + [current_obs] + groupmate_obs + if groupmate_obs is not None + else [current_obs] + ) + all_next_value_mem: Optional[AgentBufferField] = None + all_next_baseline_mem: Optional[AgentBufferField] = None + with torch.no_grad(): + if self.policy.use_recurrent: + ( + value_estimates, + baseline_estimates, + all_next_value_mem, + all_next_baseline_mem, + next_value_mem, + next_baseline_mem, + ) = self._evaluate_by_sequence_team( + current_obs, + groupmate_obs, + groupmate_actions, + _init_value_mem, + _init_baseline_mem, + ) + else: + value_estimates, next_value_mem = self.critic.critic_pass( + all_obs, _init_value_mem, sequence_length=batch.num_experiences + ) + groupmate_obs_and_actions = (groupmate_obs, groupmate_actions) + baseline_estimates, next_baseline_mem = self.critic.baseline( + current_obs, + groupmate_obs_and_actions, + _init_baseline_mem, + sequence_length=batch.num_experiences, + ) + # Store the memory for the next trajectory + self.value_memory_dict[agent_id] = next_value_mem + self.baseline_memory_dict[agent_id] = next_baseline_mem + + all_next_obs = ( + [next_obs] + next_groupmate_obs + if next_groupmate_obs is not None + else [next_obs] + ) + + next_value_estimates, _ = self.critic.critic_pass( + all_next_obs, next_value_mem, sequence_length=1 + ) + + for name, estimate in baseline_estimates.items(): + baseline_estimates[name] = ModelUtils.to_numpy(estimate) + + for name, estimate in value_estimates.items(): + value_estimates[name] = ModelUtils.to_numpy(estimate) + + # the base line and V shpuld not be on the same done flag + for name, estimate in next_value_estimates.items(): + next_value_estimates[name] = ModelUtils.to_numpy(estimate) + + if done: + for k in next_value_estimates: + if not self.reward_signals[k].ignore_done: + next_value_estimates[k][-1] = 0.0 + + return ( + value_estimates, + baseline_estimates, + next_value_estimates, + all_next_value_mem, + all_next_baseline_mem, + ) diff --git a/MLPY/Lib/site-packages/mlagents/trainers/poca/trainer.py b/MLPY/Lib/site-packages/mlagents/trainers/poca/trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..266a14932154fc7413c9949bef221a1c268c1d4f --- /dev/null +++ 
b/MLPY/Lib/site-packages/mlagents/trainers/poca/trainer.py @@ -0,0 +1,249 @@ +# # Unity ML-Agents Toolkit +# ## ML-Agents Learning (POCA) +# Contains an implementation of MA-POCA. + +from collections import defaultdict +from typing import cast, Dict, Union, Any, Type + +import numpy as np + +from mlagents_envs.side_channel.stats_side_channel import StatsAggregationMethod +from mlagents_envs.logging_util import get_logger +from mlagents_envs.base_env import BehaviorSpec +from mlagents.trainers.buffer import BufferKey, RewardSignalUtil +from mlagents.trainers.trainer.on_policy_trainer import OnPolicyTrainer +from mlagents.trainers.trainer.trainer_utils import lambda_return +from mlagents.trainers.policy import Policy +from mlagents.trainers.policy.torch_policy import TorchPolicy +from mlagents.trainers.poca.optimizer_torch import TorchPOCAOptimizer, POCASettings +from mlagents.trainers.trajectory import Trajectory +from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers +from mlagents.trainers.settings import TrainerSettings + +from mlagents.trainers.torch_entities.networks import SimpleActor, SharedActorCritic + +logger = get_logger(__name__) + +TRAINER_NAME = "poca" + + +class POCATrainer(OnPolicyTrainer): + """The POCATrainer is an implementation of the MA-POCA algorithm.""" + + def __init__( + self, + behavior_name: str, + reward_buff_cap: int, + trainer_settings: TrainerSettings, + training: bool, + load: bool, + seed: int, + artifact_path: str, + ): + """ + Responsible for collecting experiences and training POCA model. + :param behavior_name: The name of the behavior associated with trainer config + :param reward_buff_cap: Max reward history to track in the reward buffer + :param trainer_settings: The parameters for the trainer. + :param training: Whether the trainer is set for training. + :param load: Whether the model should be loaded. + :param seed: The seed the model will be initialized with + :param artifact_path: The directory within which to store artifacts from this trainer. + """ + super().__init__( + behavior_name, + reward_buff_cap, + trainer_settings, + training, + load, + seed, + artifact_path, + ) + self.hyperparameters: POCASettings = cast( + POCASettings, self.trainer_settings.hyperparameters + ) + self.seed = seed + self.policy: TorchPolicy = None # type: ignore + self.optimizer: TorchPOCAOptimizer = None # type: ignore + self.collected_group_rewards: Dict[str, int] = defaultdict(lambda: 0) + + def _process_trajectory(self, trajectory: Trajectory) -> None: + """ + Takes a trajectory and processes it, putting it into the update buffer. + Processing involves calculating value and advantage targets for model updating step. + :param trajectory: The Trajectory tuple containing the steps to be processed. 
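Illustrative, self-contained sketch of the advantage target computed here (plain NumPy; a
        standard TD(lambda) return recursion, not necessarily identical to the `lambda_return`
        helper imported above; gamma/lambd values are placeholders):

            import numpy as np

            def td_lambda_returns(rewards, values, value_next, gamma=0.99, lambd=0.95):
                # Backward recursion: G_t = r_t + gamma * ((1 - lambd) * V(s_{t+1}) + lambd * G_{t+1})
                returns = np.zeros_like(rewards, dtype=np.float32)
                g = value_next
                for t in reversed(range(len(rewards))):
                    v_next = values[t + 1] if t + 1 < len(values) else value_next
                    g = rewards[t] + gamma * ((1.0 - lambd) * v_next + lambd * g)
                    returns[t] = g
                return returns

            rewards = np.array([0.0, 0.0, 1.0], dtype=np.float32)
            values = np.array([0.5, 0.6, 0.9], dtype=np.float32)
            baselines = np.array([0.4, 0.5, 0.8], dtype=np.float32)
            # POCA-style advantage: lambda returns relative to the counterfactual baseline.
            advantage = td_lambda_returns(rewards, values, value_next=0.0) - baselines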
+ """ + super()._process_trajectory(trajectory) + agent_id = trajectory.agent_id # All the agents should have the same ID + + agent_buffer_trajectory = trajectory.to_agentbuffer() + # Update the normalization + if self.is_training: + self.policy.actor.update_normalization(agent_buffer_trajectory) + self.optimizer.critic.update_normalization(agent_buffer_trajectory) + + # Get all value estimates + ( + value_estimates, + baseline_estimates, + value_next, + value_memories, + baseline_memories, + ) = self.optimizer.get_trajectory_and_baseline_value_estimates( + agent_buffer_trajectory, + trajectory.next_obs, + trajectory.next_group_obs, + trajectory.all_group_dones_reached + and trajectory.done_reached + and not trajectory.interrupted, + ) + + if value_memories is not None and baseline_memories is not None: + agent_buffer_trajectory[BufferKey.CRITIC_MEMORY].set(value_memories) + agent_buffer_trajectory[BufferKey.BASELINE_MEMORY].set(baseline_memories) + + for name, v in value_estimates.items(): + agent_buffer_trajectory[RewardSignalUtil.value_estimates_key(name)].extend( + v + ) + agent_buffer_trajectory[ + RewardSignalUtil.baseline_estimates_key(name) + ].extend(baseline_estimates[name]) + self._stats_reporter.add_stat( + f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Baseline Estimate", + np.mean(baseline_estimates[name]), + ) + self._stats_reporter.add_stat( + f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Value Estimate", + np.mean(value_estimates[name]), + ) + + self.collected_rewards["environment"][agent_id] += np.sum( + agent_buffer_trajectory[BufferKey.ENVIRONMENT_REWARDS] + ) + self.collected_group_rewards[agent_id] += np.sum( + agent_buffer_trajectory[BufferKey.GROUP_REWARD] + ) + for name, reward_signal in self.optimizer.reward_signals.items(): + evaluate_result = ( + reward_signal.evaluate(agent_buffer_trajectory) * reward_signal.strength + ) + agent_buffer_trajectory[RewardSignalUtil.rewards_key(name)].extend( + evaluate_result + ) + # Report the reward signals + self.collected_rewards[name][agent_id] += np.sum(evaluate_result) + + # Compute lambda returns and advantage + tmp_advantages = [] + for name in self.optimizer.reward_signals: + + local_rewards = np.array( + agent_buffer_trajectory[RewardSignalUtil.rewards_key(name)].get_batch(), + dtype=np.float32, + ) + + baseline_estimate = agent_buffer_trajectory[ + RewardSignalUtil.baseline_estimates_key(name) + ].get_batch() + v_estimates = agent_buffer_trajectory[ + RewardSignalUtil.value_estimates_key(name) + ].get_batch() + + lambd_returns = lambda_return( + r=local_rewards, + value_estimates=v_estimates, + gamma=self.optimizer.reward_signals[name].gamma, + lambd=self.hyperparameters.lambd, + value_next=value_next[name], + ) + + local_advantage = np.array(lambd_returns) - np.array(baseline_estimate) + + agent_buffer_trajectory[RewardSignalUtil.returns_key(name)].set( + lambd_returns + ) + agent_buffer_trajectory[RewardSignalUtil.advantage_key(name)].set( + local_advantage + ) + tmp_advantages.append(local_advantage) + + # Get global advantages + global_advantages = list( + np.mean(np.array(tmp_advantages, dtype=np.float32), axis=0) + ) + agent_buffer_trajectory[BufferKey.ADVANTAGES].set(global_advantages) + + self._append_to_update_buffer(agent_buffer_trajectory) + + # If this was a terminal trajectory, append stats and reset reward collection + if trajectory.done_reached: + self._update_end_episode_stats(agent_id, self.optimizer) + # Remove dead agents from group reward recording + if not 
trajectory.all_group_dones_reached: + self.collected_group_rewards.pop(agent_id) + + # If the whole team is done, average the remaining group rewards. + if trajectory.all_group_dones_reached and trajectory.done_reached: + self.stats_reporter.add_stat( + "Environment/Group Cumulative Reward", + self.collected_group_rewards.get(agent_id, 0), + aggregation=StatsAggregationMethod.HISTOGRAM, + ) + self.collected_group_rewards.pop(agent_id) + + def _is_ready_update(self): + """ + Returns whether or not the trainer has enough elements to run update model + :return: A boolean corresponding to whether or not update_model() can be run + """ + size_of_buffer = self.update_buffer.num_experiences + return size_of_buffer > self.hyperparameters.buffer_size + + def end_episode(self) -> None: + """ + A signal that the Episode has ended. The buffer must be reset. + Get only called when the academy resets. For POCA, we should + also zero out the group rewards. + """ + super().end_episode() + self.collected_group_rewards.clear() + + def create_policy( + self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec + ) -> TorchPolicy: + """ + Creates a policy with a PyTorch backend and POCA hyperparameters + :param parsed_behavior_id: + :param behavior_spec: specifications for policy construction + :return policy + """ + actor_cls: Union[Type[SimpleActor], Type[SharedActorCritic]] = SimpleActor + actor_kwargs: Dict[str, Any] = { + "conditional_sigma": False, + "tanh_squash": False, + } + + policy = TorchPolicy( + self.seed, + behavior_spec, + self.trainer_settings.network_settings, + actor_cls, + actor_kwargs, + ) + return policy + + def create_optimizer(self) -> TorchPOCAOptimizer: + return TorchPOCAOptimizer(self.policy, self.trainer_settings) + + def get_policy(self, name_behavior_id: str) -> Policy: + """ + Gets policy from trainer associated with name_behavior_id + :param name_behavior_id: full identifier of policy + """ + + return self.policy + + @staticmethod + def get_trainer_name() -> str: + return TRAINER_NAME diff --git a/MLPY/Lib/site-packages/mlagents/trainers/policy/__init__.py b/MLPY/Lib/site-packages/mlagents/trainers/policy/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8348d8738b939b384661966836d61d4509b19f87 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/policy/__init__.py @@ -0,0 +1 @@ +from mlagents.trainers.policy.policy import Policy # noqa diff --git a/MLPY/Lib/site-packages/mlagents/trainers/policy/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/policy/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..da57693726e5894ec1ad99d7891a3364d6e2965b Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/policy/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/policy/__pycache__/checkpoint_manager.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/policy/__pycache__/checkpoint_manager.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d5674d230aa50ff60add6ec5c5ee4aa756da9622 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/policy/__pycache__/checkpoint_manager.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/policy/__pycache__/policy.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/policy/__pycache__/policy.cpython-39.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..68b66f6f868fa2e13d6e95619b153b8963989e92 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/policy/__pycache__/policy.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/policy/__pycache__/torch_policy.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/policy/__pycache__/torch_policy.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9ae17b2bdf44c825457634f1cb09b5b8c8c26cd0 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/policy/__pycache__/torch_policy.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/policy/checkpoint_manager.py b/MLPY/Lib/site-packages/mlagents/trainers/policy/checkpoint_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..5f3e2762b36357c97c6114d969a627d3b874a661 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/policy/checkpoint_manager.py @@ -0,0 +1,101 @@ +# # Unity ML-Agents Toolkit +from typing import Dict, Any, Optional, List +import os +import attr +from mlagents.trainers.training_status import GlobalTrainingStatus, StatusType +from mlagents_envs.logging_util import get_logger + +logger = get_logger(__name__) + + +@attr.s(auto_attribs=True) +class ModelCheckpoint: + steps: int + file_path: str + reward: Optional[float] + creation_time: float + auxillary_file_paths: List[str] = attr.ib(factory=list) + + +class ModelCheckpointManager: + @staticmethod + def get_checkpoints(behavior_name: str) -> List[Dict[str, Any]]: + checkpoint_list = GlobalTrainingStatus.get_parameter_state( + behavior_name, StatusType.CHECKPOINTS + ) + if not checkpoint_list: + checkpoint_list = [] + GlobalTrainingStatus.set_parameter_state( + behavior_name, StatusType.CHECKPOINTS, checkpoint_list + ) + return checkpoint_list + + @staticmethod + def remove_checkpoint(checkpoint: Dict[str, Any]) -> None: + """ + Removes a checkpoint stored in checkpoint_list. + If checkpoint cannot be found, no action is done. + + :param checkpoint: A checkpoint stored in checkpoint_list + """ + file_paths: List[str] = [checkpoint["file_path"]] + file_paths.extend(checkpoint["auxillary_file_paths"]) + for file_path in file_paths: + if os.path.exists(file_path): + os.remove(file_path) + logger.debug(f"Removed checkpoint model {file_path}.") + else: + logger.debug(f"Checkpoint at {file_path} could not be found.") + return + + @classmethod + def _cleanup_extra_checkpoints( + cls, checkpoints: List[Dict], keep_checkpoints: int + ) -> List[Dict]: + """ + Ensures that the number of checkpoints stored are within the number + of checkpoints the user defines. If the limit is hit, checkpoints are + removed to create room for the next checkpoint to be inserted. + + :param behavior_name: The behavior name whose checkpoints we will mange. + :param keep_checkpoints: Number of checkpoints to record (user-defined). + """ + while len(checkpoints) > keep_checkpoints: + if keep_checkpoints <= 0 or len(checkpoints) == 0: + break + ModelCheckpointManager.remove_checkpoint(checkpoints.pop(0)) + return checkpoints + + @classmethod + def add_checkpoint( + cls, behavior_name: str, new_checkpoint: ModelCheckpoint, keep_checkpoints: int + ) -> None: + """ + Make room for new checkpoint if needed and insert new checkpoint information. + :param behavior_name: Behavior name for the checkpoint. + :param new_checkpoint: The new checkpoint to be recorded. + :param keep_checkpoints: Number of checkpoints to record (user-defined). 
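Hypothetical usage sketch (paths and values are placeholders; assumes it runs inside a
        training session where GlobalTrainingStatus has been initialized):

            import time
            from mlagents.trainers.policy.checkpoint_manager import (
                ModelCheckpoint,
                ModelCheckpointManager,
            )

            ckpt = ModelCheckpoint(
                steps=50000,
                file_path="results/run_id/MyBehavior/MyBehavior-50000.onnx",
                reward=1.23,
                creation_time=time.time(),
            )
            # Keeps at most `keep_checkpoints` entries, removing the oldest checkpoint files first.
            ModelCheckpointManager.add_checkpoint("MyBehavior", ckpt, keep_checkpoints=5)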
+ """ + new_checkpoint_dict = attr.asdict(new_checkpoint) + checkpoints = cls.get_checkpoints(behavior_name) + checkpoints.append(new_checkpoint_dict) + cls._cleanup_extra_checkpoints(checkpoints, keep_checkpoints) + GlobalTrainingStatus.set_parameter_state( + behavior_name, StatusType.CHECKPOINTS, checkpoints + ) + + @classmethod + def track_final_checkpoint( + cls, behavior_name: str, final_checkpoint: ModelCheckpoint + ) -> None: + """ + Ensures number of checkpoints stored is within the max number of checkpoints + defined by the user and finally stores the information about the final + model (or intermediate model if training is interrupted). + :param behavior_name: Behavior name of the model. + :param final_checkpoint: Checkpoint information for the final model. + """ + final_model_dict = attr.asdict(final_checkpoint) + GlobalTrainingStatus.set_parameter_state( + behavior_name, StatusType.FINAL_CHECKPOINT, final_model_dict + ) diff --git a/MLPY/Lib/site-packages/mlagents/trainers/policy/policy.py b/MLPY/Lib/site-packages/mlagents/trainers/policy/policy.py new file mode 100644 index 0000000000000000000000000000000000000000..0c5e9f72472d071676b9fbaa9ba845d0f31ab40b --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/policy/policy.py @@ -0,0 +1,146 @@ +from abc import abstractmethod +from typing import Dict, List, Optional +import numpy as np + +from mlagents_envs.base_env import ActionTuple, BehaviorSpec, DecisionSteps +from mlagents_envs.exception import UnityException + +from mlagents.trainers.action_info import ActionInfo +from mlagents.trainers.settings import NetworkSettings +from mlagents.trainers.behavior_id_utils import GlobalAgentId + + +class UnityPolicyException(UnityException): + """ + Related to errors with the Trainer. + """ + + pass + + +class Policy: + def __init__( + self, + seed: int, + behavior_spec: BehaviorSpec, + network_settings: NetworkSettings, + ): + self.behavior_spec = behavior_spec + self.network_settings: NetworkSettings = network_settings + self.seed = seed + self.previous_action_dict: Dict[str, np.ndarray] = {} + self.previous_memory_dict: Dict[str, np.ndarray] = {} + self.memory_dict: Dict[str, np.ndarray] = {} + self.normalize = network_settings.normalize + self.use_recurrent = self.network_settings.memory is not None + self.m_size = 0 + self.sequence_length = 1 + if self.use_recurrent: + self.m_size = self.network_settings.memory.memory_size + self.sequence_length = self.network_settings.memory.sequence_length + + def make_empty_memory(self, num_agents): + """ + Creates empty memory for use with RNNs + :param num_agents: Number of agents. + :return: Numpy array of zeros. 
+ """ + return np.zeros((num_agents, self.m_size), dtype=np.float32) + + def save_memories( + self, agent_ids: List[GlobalAgentId], memory_matrix: Optional[np.ndarray] + ) -> None: + if memory_matrix is None: + return + + # Pass old memories into previous_memory_dict + for agent_id in agent_ids: + if agent_id in self.memory_dict: + self.previous_memory_dict[agent_id] = self.memory_dict[agent_id] + + for index, agent_id in enumerate(agent_ids): + self.memory_dict[agent_id] = memory_matrix[index, :] + + def retrieve_memories(self, agent_ids: List[GlobalAgentId]) -> np.ndarray: + memory_matrix = np.zeros((len(agent_ids), self.m_size), dtype=np.float32) + for index, agent_id in enumerate(agent_ids): + if agent_id in self.memory_dict: + memory_matrix[index, :] = self.memory_dict[agent_id] + return memory_matrix + + def retrieve_previous_memories(self, agent_ids: List[GlobalAgentId]) -> np.ndarray: + memory_matrix = np.zeros((len(agent_ids), self.m_size), dtype=np.float32) + for index, agent_id in enumerate(agent_ids): + if agent_id in self.previous_memory_dict: + memory_matrix[index, :] = self.previous_memory_dict[agent_id] + return memory_matrix + + def remove_memories(self, agent_ids: List[GlobalAgentId]) -> None: + for agent_id in agent_ids: + if agent_id in self.memory_dict: + self.memory_dict.pop(agent_id) + if agent_id in self.previous_memory_dict: + self.previous_memory_dict.pop(agent_id) + + def make_empty_previous_action(self, num_agents: int) -> np.ndarray: + """ + Creates empty previous action for use with RNNs and discrete control + :param num_agents: Number of agents. + :return: Numpy array of zeros. + """ + return np.zeros( + (num_agents, self.behavior_spec.action_spec.discrete_size), dtype=np.int32 + ) + + def save_previous_action( + self, agent_ids: List[GlobalAgentId], action_tuple: ActionTuple + ) -> None: + for index, agent_id in enumerate(agent_ids): + self.previous_action_dict[agent_id] = action_tuple.discrete[index, :] + + def retrieve_previous_action(self, agent_ids: List[GlobalAgentId]) -> np.ndarray: + action_matrix = self.make_empty_previous_action(len(agent_ids)) + for index, agent_id in enumerate(agent_ids): + if agent_id in self.previous_action_dict: + action_matrix[index, :] = self.previous_action_dict[agent_id] + return action_matrix + + def remove_previous_action(self, agent_ids: List[GlobalAgentId]) -> None: + for agent_id in agent_ids: + if agent_id in self.previous_action_dict: + self.previous_action_dict.pop(agent_id) + + def get_action( + self, decision_requests: DecisionSteps, worker_id: int = 0 + ) -> ActionInfo: + raise NotImplementedError + + @staticmethod + def check_nan_action(action: Optional[ActionTuple]) -> None: + # Fast NaN check on the action + # See https://stackoverflow.com/questions/6736590/fast-check-for-nan-in-numpy for background. 
+ if action is not None: + d = np.sum(action.continuous) + has_nan = np.isnan(d) + if has_nan: + raise RuntimeError("Continuous NaN action detected.") + + @abstractmethod + def increment_step(self, n_steps): + pass + + @abstractmethod + def get_current_step(self): + pass + + @abstractmethod + def load_weights(self, values: List[np.ndarray]) -> None: + pass + + @abstractmethod + def get_weights(self) -> List[np.ndarray]: + return [] + + @abstractmethod + def init_load_weights(self) -> None: + pass diff --git a/MLPY/Lib/site-packages/mlagents/trainers/policy/torch_policy.py b/MLPY/Lib/site-packages/mlagents/trainers/policy/torch_policy.py new file mode 100644 index 0000000000000000000000000000000000000000..fceacda6e95c8fb31a7dbeb837a65992af94b41f --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/policy/torch_policy.py @@ -0,0 +1,173 @@ +from typing import Any, Dict, List +import numpy as np +from mlagents.torch_utils import torch, default_device +import copy + +from mlagents.trainers.action_info import ActionInfo +from mlagents.trainers.behavior_id_utils import get_global_agent_id +from mlagents.trainers.policy import Policy +from mlagents_envs.base_env import DecisionSteps, BehaviorSpec +from mlagents_envs.timers import timed + +from mlagents.trainers.settings import NetworkSettings +from mlagents.trainers.torch_entities.networks import GlobalSteps + +from mlagents.trainers.torch_entities.utils import ModelUtils + +EPSILON = 1e-7 # Small value to avoid divide by zero + + +class TorchPolicy(Policy): + def __init__( + self, + seed: int, + behavior_spec: BehaviorSpec, + network_settings: NetworkSettings, + actor_cls: type, + actor_kwargs: Dict[str, Any], + ): + """ + Policy that uses a multilayer perceptron to map the observations to actions. Could + also use a CNN to encode visual input prior to the MLP. Supports discrete and + continuous actions, as well as recurrent networks. + :param seed: Random seed. + :param behavior_spec: Assigned BehaviorSpec object. + :param network_settings: Defined network parameters. + :param actor_cls: The type of Actor + :param actor_kwargs: Keyword args for the Actor class + """ + super().__init__(seed, behavior_spec, network_settings) + self.global_step = ( + GlobalSteps() + ) # could be much simpler if TorchPolicy is nn.Module + + self.stats_name_to_update_name = { + "Losses/Value Loss": "value_loss", + "Losses/Policy Loss": "policy_loss", + } + + self.actor = actor_cls( + observation_specs=self.behavior_spec.observation_specs, + network_settings=network_settings, + action_spec=behavior_spec.action_spec, + **actor_kwargs, + ) + + # Save the m_size needed for export + self._export_m_size = self.m_size + # m_size needed for training is determined by network, not trainer settings + self.m_size = self.actor.memory_size + + self.actor.to(default_device()) + + @property + def export_memory_size(self) -> int: + """ + Returns the memory size of the exported ONNX policy. This only includes the memory + of the Actor and not any auxillary networks. 
+ """ + return self._export_m_size + + def _extract_masks(self, decision_requests: DecisionSteps) -> np.ndarray: + mask = None + if self.behavior_spec.action_spec.discrete_size > 0: + num_discrete_flat = np.sum(self.behavior_spec.action_spec.discrete_branches) + mask = torch.ones([len(decision_requests), num_discrete_flat]) + if decision_requests.action_mask is not None: + mask = torch.as_tensor( + 1 - np.concatenate(decision_requests.action_mask, axis=1) + ) + return mask + + @timed + def evaluate( + self, decision_requests: DecisionSteps, global_agent_ids: List[str] + ) -> Dict[str, Any]: + """ + Evaluates policy for the agent experiences provided. + :param global_agent_ids: + :param decision_requests: DecisionStep object containing inputs. + :return: Outputs from network as defined by self.inference_dict. + """ + obs = decision_requests.obs + masks = self._extract_masks(decision_requests) + tensor_obs = [torch.as_tensor(np_ob) for np_ob in obs] + + memories = torch.as_tensor(self.retrieve_memories(global_agent_ids)).unsqueeze( + 0 + ) + with torch.no_grad(): + action, run_out, memories = self.actor.get_action_and_stats( + tensor_obs, masks=masks, memories=memories + ) + run_out["action"] = action.to_action_tuple() + if "log_probs" in run_out: + run_out["log_probs"] = run_out["log_probs"].to_log_probs_tuple() + if "entropy" in run_out: + run_out["entropy"] = ModelUtils.to_numpy(run_out["entropy"]) + if self.use_recurrent: + run_out["memory_out"] = ModelUtils.to_numpy(memories).squeeze(0) + return run_out + + def get_action( + self, decision_requests: DecisionSteps, worker_id: int = 0 + ) -> ActionInfo: + """ + Decides actions given observations information, and takes them in environment. + :param worker_id: + :param decision_requests: A dictionary of behavior names and DecisionSteps from environment. + :return: an ActionInfo containing action, memories, values and an object + to be passed to add experiences + """ + if len(decision_requests) == 0: + return ActionInfo.empty() + + global_agent_ids = [ + get_global_agent_id(worker_id, int(agent_id)) + for agent_id in decision_requests.agent_id + ] # For 1-D array, the iterator order is correct. + + run_out = self.evaluate(decision_requests, global_agent_ids) + self.save_memories(global_agent_ids, run_out.get("memory_out")) + self.check_nan_action(run_out.get("action")) + return ActionInfo( + action=run_out.get("action"), + env_action=run_out.get("env_action"), + outputs=run_out, + agent_ids=list(decision_requests.agent_id), + ) + + def get_current_step(self): + """ + Gets current model step. + :return: current model step. + """ + return self.global_step.current_step + + def set_step(self, step: int) -> int: + """ + Sets current model step to step without creating additional ops. + :param step: Step to set the current model step to. + :return: The step the model was set to. + """ + self.global_step.current_step = step + return step + + def increment_step(self, n_steps): + """ + Increments model step. 
+ """ + self.global_step.increment(n_steps) + return self.get_current_step() + + def load_weights(self, values: List[np.ndarray]) -> None: + self.actor.load_state_dict(values) + + def init_load_weights(self) -> None: + pass + + def get_weights(self) -> List[np.ndarray]: + return copy.deepcopy(self.actor.state_dict()) + + def get_modules(self): + return {"Policy": self.actor, "global_step": self.global_step} diff --git a/MLPY/Lib/site-packages/mlagents/trainers/ppo/__init__.py b/MLPY/Lib/site-packages/mlagents/trainers/ppo/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/MLPY/Lib/site-packages/mlagents/trainers/ppo/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/ppo/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..27701694d1965d32e08249fc92ce943bb5483cff Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/ppo/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/ppo/__pycache__/optimizer_torch.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/ppo/__pycache__/optimizer_torch.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..32cde79232d9c19fdd1582ba7c9edfde2fb50cc2 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/ppo/__pycache__/optimizer_torch.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/ppo/__pycache__/trainer.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/ppo/__pycache__/trainer.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f8bf6bf44fff1af85cc7242253cc84e5cfb4fb20 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/ppo/__pycache__/trainer.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/ppo/optimizer_torch.py b/MLPY/Lib/site-packages/mlagents/trainers/ppo/optimizer_torch.py new file mode 100644 index 0000000000000000000000000000000000000000..41a452c65cb114e81767d92ac90d198aa44ef6e9 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/ppo/optimizer_torch.py @@ -0,0 +1,207 @@ +from typing import Dict, cast +import attr + +from mlagents.torch_utils import torch, default_device + +from mlagents.trainers.buffer import AgentBuffer, BufferKey, RewardSignalUtil + +from mlagents_envs.timers import timed +from mlagents.trainers.policy.torch_policy import TorchPolicy +from mlagents.trainers.optimizer.torch_optimizer import TorchOptimizer +from mlagents.trainers.settings import ( + TrainerSettings, + OnPolicyHyperparamSettings, + ScheduleType, +) +from mlagents.trainers.torch_entities.networks import ValueNetwork +from mlagents.trainers.torch_entities.agent_action import AgentAction +from mlagents.trainers.torch_entities.action_log_probs import ActionLogProbs +from mlagents.trainers.torch_entities.utils import ModelUtils +from mlagents.trainers.trajectory import ObsUtil + + +@attr.s(auto_attribs=True) +class PPOSettings(OnPolicyHyperparamSettings): + beta: float = 5.0e-3 + epsilon: float = 0.2 + lambd: float = 0.95 + num_epoch: int = 3 + shared_critic: bool = False + learning_rate_schedule: ScheduleType = ScheduleType.LINEAR + beta_schedule: ScheduleType = ScheduleType.LINEAR + epsilon_schedule: ScheduleType = ScheduleType.LINEAR + + +class TorchPPOOptimizer(TorchOptimizer): + def __init__(self, policy: TorchPolicy, trainer_settings: TrainerSettings): + """ + Takes a Policy and a 
Dict of trainer parameters and creates an Optimizer around the policy. + The PPO optimizer has a value estimator and a loss function. + :param policy: A TorchPolicy object that will be updated by this PPO Optimizer. + :param trainer_params: Trainer parameters dictionary that specifies the + properties of the trainer. + """ + # Create the graph here to give more granular control of the TF graph to the Optimizer. + + super().__init__(policy, trainer_settings) + reward_signal_configs = trainer_settings.reward_signals + reward_signal_names = [key.value for key, _ in reward_signal_configs.items()] + + self.hyperparameters: PPOSettings = cast( + PPOSettings, trainer_settings.hyperparameters + ) + + params = list(self.policy.actor.parameters()) + if self.hyperparameters.shared_critic: + self._critic = policy.actor + else: + self._critic = ValueNetwork( + reward_signal_names, + policy.behavior_spec.observation_specs, + network_settings=trainer_settings.network_settings, + ) + self._critic.to(default_device()) + params += list(self._critic.parameters()) + + self.decay_learning_rate = ModelUtils.DecayedValue( + self.hyperparameters.learning_rate_schedule, + self.hyperparameters.learning_rate, + 1e-10, + self.trainer_settings.max_steps, + ) + self.decay_epsilon = ModelUtils.DecayedValue( + self.hyperparameters.epsilon_schedule, + self.hyperparameters.epsilon, + 0.1, + self.trainer_settings.max_steps, + ) + self.decay_beta = ModelUtils.DecayedValue( + self.hyperparameters.beta_schedule, + self.hyperparameters.beta, + 1e-5, + self.trainer_settings.max_steps, + ) + + self.optimizer = torch.optim.Adam( + params, lr=self.trainer_settings.hyperparameters.learning_rate + ) + self.stats_name_to_update_name = { + "Losses/Value Loss": "value_loss", + "Losses/Policy Loss": "policy_loss", + } + + self.stream_names = list(self.reward_signals.keys()) + + @property + def critic(self): + return self._critic + + @timed + def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: + """ + Performs update on model. + :param batch: Batch of experiences. + :param num_sequences: Number of sequences to process. + :return: Results of update. 
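Illustrative standalone sketch of the clipped surrogate term minimized in this update (not
        necessarily identical to ModelUtils.trust_region_policy_loss; epsilon stands in for the
        decayed epsilon used below):

            import torch

            def clipped_policy_loss(advantages, log_probs, old_log_probs, epsilon=0.2):
                # Standard PPO clipped surrogate objective, returned as a loss to minimize.
                ratio = torch.exp(log_probs - old_log_probs)
                unclipped = ratio * advantages
                clipped = torch.clamp(ratio, 1.0 - epsilon, 1.0 + epsilon) * advantages
                return -torch.min(unclipped, clipped).mean()

            adv = torch.tensor([1.0, -0.5])
            loss = clipped_policy_loss(adv, torch.tensor([-1.0, -1.2]), torch.tensor([-1.1, -1.0]))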
+ """ + # Get decayed parameters + decay_lr = self.decay_learning_rate.get_value(self.policy.get_current_step()) + decay_eps = self.decay_epsilon.get_value(self.policy.get_current_step()) + decay_bet = self.decay_beta.get_value(self.policy.get_current_step()) + returns = {} + old_values = {} + for name in self.reward_signals: + old_values[name] = ModelUtils.list_to_tensor( + batch[RewardSignalUtil.value_estimates_key(name)] + ) + returns[name] = ModelUtils.list_to_tensor( + batch[RewardSignalUtil.returns_key(name)] + ) + + n_obs = len(self.policy.behavior_spec.observation_specs) + current_obs = ObsUtil.from_buffer(batch, n_obs) + # Convert to tensors + current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs] + + act_masks = ModelUtils.list_to_tensor(batch[BufferKey.ACTION_MASK]) + actions = AgentAction.from_buffer(batch) + + memories = [ + ModelUtils.list_to_tensor(batch[BufferKey.MEMORY][i]) + for i in range(0, len(batch[BufferKey.MEMORY]), self.policy.sequence_length) + ] + if len(memories) > 0: + memories = torch.stack(memories).unsqueeze(0) + + # Get value memories + value_memories = [ + ModelUtils.list_to_tensor(batch[BufferKey.CRITIC_MEMORY][i]) + for i in range( + 0, len(batch[BufferKey.CRITIC_MEMORY]), self.policy.sequence_length + ) + ] + if len(value_memories) > 0: + value_memories = torch.stack(value_memories).unsqueeze(0) + + run_out = self.policy.actor.get_stats( + current_obs, + actions, + masks=act_masks, + memories=memories, + sequence_length=self.policy.sequence_length, + ) + + log_probs = run_out["log_probs"] + entropy = run_out["entropy"] + + values, _ = self.critic.critic_pass( + current_obs, + memories=value_memories, + sequence_length=self.policy.sequence_length, + ) + old_log_probs = ActionLogProbs.from_buffer(batch).flatten() + log_probs = log_probs.flatten() + loss_masks = ModelUtils.list_to_tensor(batch[BufferKey.MASKS], dtype=torch.bool) + value_loss = ModelUtils.trust_region_value_loss( + values, old_values, returns, decay_eps, loss_masks + ) + policy_loss = ModelUtils.trust_region_policy_loss( + ModelUtils.list_to_tensor(batch[BufferKey.ADVANTAGES]), + log_probs, + old_log_probs, + loss_masks, + decay_eps, + ) + loss = ( + policy_loss + + 0.5 * value_loss + - decay_bet * ModelUtils.masked_mean(entropy, loss_masks) + ) + + # Set optimizer learning rate + ModelUtils.update_learning_rate(self.optimizer, decay_lr) + self.optimizer.zero_grad() + loss.backward() + + self.optimizer.step() + update_stats = { + # NOTE: abs() is not technically correct, but matches the behavior in TensorFlow. + # TODO: After PyTorch is default, change to something more correct. 
+ "Losses/Policy Loss": torch.abs(policy_loss).item(), + "Losses/Value Loss": value_loss.item(), + "Policy/Learning Rate": decay_lr, + "Policy/Epsilon": decay_eps, + "Policy/Beta": decay_bet, + } + + return update_stats + + # TODO move module update into TorchOptimizer for reward_provider + def get_modules(self): + modules = { + "Optimizer:value_optimizer": self.optimizer, + "Optimizer:critic": self._critic, + } + for reward_provider in self.reward_signals.values(): + modules.update(reward_provider.get_modules()) + return modules diff --git a/MLPY/Lib/site-packages/mlagents/trainers/ppo/trainer.py b/MLPY/Lib/site-packages/mlagents/trainers/ppo/trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..e7421f0da155244058ac234edc7b49fedc3f046e --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/ppo/trainer.py @@ -0,0 +1,213 @@ +# # Unity ML-Agents Toolkit +# ## ML-Agent Learning (PPO) +# Contains an implementation of PPO as described in: https://arxiv.org/abs/1707.06347 + +from typing import cast, Type, Union, Dict, Any + +import numpy as np + +from mlagents_envs.base_env import BehaviorSpec +from mlagents_envs.logging_util import get_logger +from mlagents.trainers.buffer import BufferKey, RewardSignalUtil +from mlagents.trainers.trainer.on_policy_trainer import OnPolicyTrainer +from mlagents.trainers.policy.policy import Policy +from mlagents.trainers.trainer.trainer_utils import get_gae +from mlagents.trainers.optimizer.torch_optimizer import TorchOptimizer +from mlagents.trainers.policy.torch_policy import TorchPolicy +from mlagents.trainers.ppo.optimizer_torch import TorchPPOOptimizer, PPOSettings +from mlagents.trainers.trajectory import Trajectory +from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers +from mlagents.trainers.settings import TrainerSettings + +from mlagents.trainers.torch_entities.networks import SimpleActor, SharedActorCritic + +logger = get_logger(__name__) + +TRAINER_NAME = "ppo" + + +class PPOTrainer(OnPolicyTrainer): + """The PPOTrainer is an implementation of the PPO algorithm.""" + + def __init__( + self, + behavior_name: str, + reward_buff_cap: int, + trainer_settings: TrainerSettings, + training: bool, + load: bool, + seed: int, + artifact_path: str, + ): + """ + Responsible for collecting experiences and training PPO model. + :param behavior_name: The name of the behavior associated with trainer config + :param reward_buff_cap: Max reward history to track in the reward buffer + :param trainer_settings: The parameters for the trainer. + :param training: Whether the trainer is set for training. + :param load: Whether the model should be loaded. + :param seed: The seed the model will be initialized with + :param artifact_path: The directory within which to store artifacts from this trainer. + """ + super().__init__( + behavior_name, + reward_buff_cap, + trainer_settings, + training, + load, + seed, + artifact_path, + ) + self.hyperparameters: PPOSettings = cast( + PPOSettings, self.trainer_settings.hyperparameters + ) + self.seed = seed + self.shared_critic = self.hyperparameters.shared_critic + self.policy: TorchPolicy = None # type: ignore + + def _process_trajectory(self, trajectory: Trajectory) -> None: + """ + Takes a trajectory and processes it, putting it into the update buffer. + Processing involves calculating value and advantage targets for model updating step. + :param trajectory: The Trajectory tuple containing the steps to be processed. 
+ """ + super()._process_trajectory(trajectory) + agent_id = trajectory.agent_id # All the agents should have the same ID + + agent_buffer_trajectory = trajectory.to_agentbuffer() + # Check if we used group rewards, warn if so. + self._warn_if_group_reward(agent_buffer_trajectory) + + # Update the normalization + if self.is_training: + self.policy.actor.update_normalization(agent_buffer_trajectory) + self.optimizer.critic.update_normalization(agent_buffer_trajectory) + + # Get all value estimates + ( + value_estimates, + value_next, + value_memories, + ) = self.optimizer.get_trajectory_value_estimates( + agent_buffer_trajectory, + trajectory.next_obs, + trajectory.done_reached and not trajectory.interrupted, + ) + if value_memories is not None: + agent_buffer_trajectory[BufferKey.CRITIC_MEMORY].set(value_memories) + + for name, v in value_estimates.items(): + agent_buffer_trajectory[RewardSignalUtil.value_estimates_key(name)].extend( + v + ) + self._stats_reporter.add_stat( + f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Value Estimate", + np.mean(v), + ) + + # Evaluate all reward functions + self.collected_rewards["environment"][agent_id] += np.sum( + agent_buffer_trajectory[BufferKey.ENVIRONMENT_REWARDS] + ) + for name, reward_signal in self.optimizer.reward_signals.items(): + evaluate_result = ( + reward_signal.evaluate(agent_buffer_trajectory) * reward_signal.strength + ) + agent_buffer_trajectory[RewardSignalUtil.rewards_key(name)].extend( + evaluate_result + ) + # Report the reward signals + self.collected_rewards[name][agent_id] += np.sum(evaluate_result) + + # Compute GAE and returns + tmp_advantages = [] + tmp_returns = [] + for name in self.optimizer.reward_signals: + bootstrap_value = value_next[name] + + local_rewards = agent_buffer_trajectory[ + RewardSignalUtil.rewards_key(name) + ].get_batch() + local_value_estimates = agent_buffer_trajectory[ + RewardSignalUtil.value_estimates_key(name) + ].get_batch() + + local_advantage = get_gae( + rewards=local_rewards, + value_estimates=local_value_estimates, + value_next=bootstrap_value, + gamma=self.optimizer.reward_signals[name].gamma, + lambd=self.hyperparameters.lambd, + ) + local_return = local_advantage + local_value_estimates + # This is later use as target for the different value estimates + agent_buffer_trajectory[RewardSignalUtil.returns_key(name)].set( + local_return + ) + agent_buffer_trajectory[RewardSignalUtil.advantage_key(name)].set( + local_advantage + ) + tmp_advantages.append(local_advantage) + tmp_returns.append(local_return) + + # Get global advantages + global_advantages = list( + np.mean(np.array(tmp_advantages, dtype=np.float32), axis=0) + ) + global_returns = list(np.mean(np.array(tmp_returns, dtype=np.float32), axis=0)) + agent_buffer_trajectory[BufferKey.ADVANTAGES].set(global_advantages) + agent_buffer_trajectory[BufferKey.DISCOUNTED_RETURNS].set(global_returns) + + self._append_to_update_buffer(agent_buffer_trajectory) + + # If this was a terminal trajectory, append stats and reset reward collection + if trajectory.done_reached: + self._update_end_episode_stats(agent_id, self.optimizer) + + def create_optimizer(self) -> TorchOptimizer: + return TorchPPOOptimizer( # type: ignore + cast(TorchPolicy, self.policy), self.trainer_settings # type: ignore + ) # type: ignore + + def create_policy( + self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec + ) -> TorchPolicy: + """ + Creates a policy with a PyTorch backend and PPO hyperparameters + :param parsed_behavior_id: + 
:param behavior_spec: specifications for policy construction + :return policy + """ + actor_cls: Union[Type[SimpleActor], Type[SharedActorCritic]] = SimpleActor + actor_kwargs: Dict[str, Any] = { + "conditional_sigma": False, + "tanh_squash": False, + } + if self.shared_critic: + reward_signal_configs = self.trainer_settings.reward_signals + reward_signal_names = [ + key.value for key, _ in reward_signal_configs.items() + ] + actor_cls = SharedActorCritic + actor_kwargs.update({"stream_names": reward_signal_names}) + + policy = TorchPolicy( + self.seed, + behavior_spec, + self.trainer_settings.network_settings, + actor_cls, + actor_kwargs, + ) + return policy + + def get_policy(self, name_behavior_id: str) -> Policy: + """ + Gets policy from trainer associated with name_behavior_id + :param name_behavior_id: full identifier of policy + """ + + return self.policy + + @staticmethod + def get_trainer_name() -> str: + return TRAINER_NAME diff --git a/MLPY/Lib/site-packages/mlagents/trainers/run_experiment.py b/MLPY/Lib/site-packages/mlagents/trainers/run_experiment.py new file mode 100644 index 0000000000000000000000000000000000000000..8544b673bcf209ac1c42a4967788dec5ef45c94b --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/run_experiment.py @@ -0,0 +1,31 @@ +import argparse +from typing import Optional, List +from mlagents.trainers.learn import run_cli +from mlagents.trainers.settings import RunOptions +from mlagents.trainers.cli_utils import load_config + +from mlagents.plugins.trainer_type import register_trainer_plugins + + +def parse_command_line(argv: Optional[List[str]] = None) -> argparse.Namespace: + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument("experiment_config_path") + return parser.parse_args(argv) + + +def main(): + """ + Provides an alternative CLI interface to mlagents-learn, 'mlagents-run-experiment'. + Accepts a JSON/YAML formatted mlagents.trainers.learn.RunOptions object, and executes + the run loop as defined in mlagents.trainers.learn.run_cli. 
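Hypothetical invocation sketch (the config file name is a placeholder):

            # Shell usage:
            #   mlagents-run-experiment experiment_config.yaml
            # The argument parsing can also be exercised directly:
            from mlagents.trainers.run_experiment import parse_command_line

            args = parse_command_line(["experiment_config.yaml"])
            print(args.experiment_config_path)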
+ """ + args = parse_command_line() + expt_config = load_config(args.experiment_config_path) + _, _ = register_trainer_plugins() + run_cli(RunOptions.from_dict(expt_config)) + + +if __name__ == "__main__": + main() diff --git a/MLPY/Lib/site-packages/mlagents/trainers/sac/__init__.py b/MLPY/Lib/site-packages/mlagents/trainers/sac/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/MLPY/Lib/site-packages/mlagents/trainers/sac/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/sac/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fb216e3d08c3e4aa43b01e7776954288e37e66cc Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/sac/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/sac/__pycache__/optimizer_torch.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/sac/__pycache__/optimizer_torch.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8d9b022d73a26de28c08d1db78a6a4935f838547 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/sac/__pycache__/optimizer_torch.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/sac/__pycache__/trainer.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/sac/__pycache__/trainer.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..18b510bde245fb576c6a0aadc9a2126f5ad5b9cf Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/sac/__pycache__/trainer.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/sac/optimizer_torch.py b/MLPY/Lib/site-packages/mlagents/trainers/sac/optimizer_torch.py new file mode 100644 index 0000000000000000000000000000000000000000..a7d566859d07937c36d0b1cc7389c84b91155b9d --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/sac/optimizer_torch.py @@ -0,0 +1,655 @@ +import numpy as np +from typing import Dict, List, NamedTuple, cast, Tuple, Optional +import attr + +from mlagents.torch_utils import torch, nn, default_device + +from mlagents_envs.logging_util import get_logger +from mlagents.trainers.optimizer.torch_optimizer import TorchOptimizer +from mlagents.trainers.policy.torch_policy import TorchPolicy +from mlagents.trainers.settings import NetworkSettings +from mlagents.trainers.torch_entities.networks import ValueNetwork, SharedActorCritic +from mlagents.trainers.torch_entities.agent_action import AgentAction +from mlagents.trainers.torch_entities.action_log_probs import ActionLogProbs +from mlagents.trainers.torch_entities.utils import ModelUtils +from mlagents.trainers.buffer import AgentBuffer, BufferKey, RewardSignalUtil +from mlagents_envs.timers import timed +from mlagents_envs.base_env import ActionSpec, ObservationSpec +from mlagents.trainers.exception import UnityTrainerException +from mlagents.trainers.settings import TrainerSettings, OffPolicyHyperparamSettings +from contextlib import ExitStack +from mlagents.trainers.trajectory import ObsUtil + +EPSILON = 1e-6 # Small value to avoid divide by zero + +logger = get_logger(__name__) + + +@attr.s(auto_attribs=True) +class SACSettings(OffPolicyHyperparamSettings): + batch_size: int = 128 + buffer_size: int = 50000 + buffer_init_steps: int = 0 + tau: float = 0.005 + steps_per_update: float = 1 + save_replay_buffer: bool = False + init_entcoef: float = 1.0 + 
reward_signal_steps_per_update: float = attr.ib() + + @reward_signal_steps_per_update.default + def _reward_signal_steps_per_update_default(self): + return self.steps_per_update + + +class TorchSACOptimizer(TorchOptimizer): + class PolicyValueNetwork(nn.Module): + def __init__( + self, + stream_names: List[str], + observation_specs: List[ObservationSpec], + network_settings: NetworkSettings, + action_spec: ActionSpec, + ): + super().__init__() + num_value_outs = max(sum(action_spec.discrete_branches), 1) + num_action_ins = int(action_spec.continuous_size) + + self.q1_network = ValueNetwork( + stream_names, + observation_specs, + network_settings, + num_action_ins, + num_value_outs, + ) + self.q2_network = ValueNetwork( + stream_names, + observation_specs, + network_settings, + num_action_ins, + num_value_outs, + ) + + def forward( + self, + inputs: List[torch.Tensor], + actions: Optional[torch.Tensor] = None, + memories: Optional[torch.Tensor] = None, + sequence_length: int = 1, + q1_grad: bool = True, + q2_grad: bool = True, + ) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]]: + """ + Performs a forward pass on the value network, which consists of a Q1 and Q2 + network. Optionally does not evaluate gradients for either the Q1, Q2, or both. + :param inputs: List of observation tensors. + :param actions: For a continuous Q function (has actions), tensor of actions. + Otherwise, None. + :param memories: Initial memories if using memory. Otherwise, None. + :param sequence_length: Sequence length if using memory. + :param q1_grad: Whether or not to compute gradients for the Q1 network. + :param q2_grad: Whether or not to compute gradients for the Q2 network. + :return: Tuple of two dictionaries, which both map {reward_signal: Q} for Q1 and Q2, + respectively. 
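The optional-gradient behaviour relies on contextlib.ExitStack; a minimal standalone sketch
            of that pattern (names are illustrative, not part of this module):

                import torch
                from contextlib import ExitStack

                def call_with_optional_grad(module, x, compute_grad):
                    # Enter torch.no_grad() only when gradients are not wanted for this pass.
                    with ExitStack() as stack:
                        if not compute_grad:
                            stack.enter_context(torch.no_grad())
                        return module(x)

                layer = torch.nn.Linear(4, 2)
                out = call_with_optional_grad(layer, torch.ones(1, 4), compute_grad=False)
                assert not out.requires_grad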
+ """ + # ExitStack allows us to enter the torch.no_grad() context conditionally + with ExitStack() as stack: + if not q1_grad: + stack.enter_context(torch.no_grad()) + q1_out, _ = self.q1_network( + inputs, + actions=actions, + memories=memories, + sequence_length=sequence_length, + ) + with ExitStack() as stack: + if not q2_grad: + stack.enter_context(torch.no_grad()) + q2_out, _ = self.q2_network( + inputs, + actions=actions, + memories=memories, + sequence_length=sequence_length, + ) + return q1_out, q2_out + + class TargetEntropy(NamedTuple): + + discrete: List[float] = [] # One per branch + continuous: float = 0.0 + + class LogEntCoef(nn.Module): + def __init__(self, discrete, continuous): + super().__init__() + self.discrete = discrete + self.continuous = continuous + + def __init__(self, policy: TorchPolicy, trainer_settings: TrainerSettings): + super().__init__(policy, trainer_settings) + reward_signal_configs = trainer_settings.reward_signals + reward_signal_names = [key.value for key, _ in reward_signal_configs.items()] + if isinstance(policy.actor, SharedActorCritic): + raise UnityTrainerException("SAC does not support SharedActorCritic") + self._critic = ValueNetwork( + reward_signal_names, + policy.behavior_spec.observation_specs, + policy.network_settings, + ) + hyperparameters: SACSettings = cast( + SACSettings, trainer_settings.hyperparameters + ) + + self.tau = hyperparameters.tau + self.init_entcoef = hyperparameters.init_entcoef + + self.policy = policy + policy_network_settings = policy.network_settings + + self.tau = hyperparameters.tau + self.burn_in_ratio = 0.0 + + # Non-exposed SAC parameters + self.discrete_target_entropy_scale = 0.2 # Roughly equal to e-greedy 0.05 + self.continuous_target_entropy_scale = 1.0 + + self.stream_names = list(self.reward_signals.keys()) + # Use to reduce "survivor bonus" when using Curiosity or GAIL. + self.gammas = [_val.gamma for _val in trainer_settings.reward_signals.values()] + self.use_dones_in_backup = { + name: int(not self.reward_signals[name].ignore_done) + for name in self.stream_names + } + self._action_spec = self.policy.behavior_spec.action_spec + + self.q_network = TorchSACOptimizer.PolicyValueNetwork( + self.stream_names, + self.policy.behavior_spec.observation_specs, + policy_network_settings, + self._action_spec, + ) + + self.target_network = ValueNetwork( + self.stream_names, + self.policy.behavior_spec.observation_specs, + policy_network_settings, + ) + ModelUtils.soft_update(self._critic, self.target_network, 1.0) + + # We create one entropy coefficient per action, whether discrete or continuous. 
+ _disc_log_ent_coef = torch.nn.Parameter( + torch.log( + torch.as_tensor( + [self.init_entcoef] * len(self._action_spec.discrete_branches) + ) + ), + requires_grad=True, + ) + _cont_log_ent_coef = torch.nn.Parameter( + torch.log(torch.as_tensor([self.init_entcoef])), requires_grad=True + ) + self._log_ent_coef = TorchSACOptimizer.LogEntCoef( + discrete=_disc_log_ent_coef, continuous=_cont_log_ent_coef + ) + _cont_target = ( + -1 + * self.continuous_target_entropy_scale + * np.prod(self._action_spec.continuous_size).astype(np.float32) + ) + _disc_target = [ + self.discrete_target_entropy_scale * np.log(i).astype(np.float32) + for i in self._action_spec.discrete_branches + ] + self.target_entropy = TorchSACOptimizer.TargetEntropy( + continuous=_cont_target, discrete=_disc_target + ) + policy_params = list(self.policy.actor.parameters()) + value_params = list(self.q_network.parameters()) + list( + self._critic.parameters() + ) + + logger.debug("value_vars") + for param in value_params: + logger.debug(param.shape) + logger.debug("policy_vars") + for param in policy_params: + logger.debug(param.shape) + + self.decay_learning_rate = ModelUtils.DecayedValue( + hyperparameters.learning_rate_schedule, + hyperparameters.learning_rate, + 1e-10, + self.trainer_settings.max_steps, + ) + self.policy_optimizer = torch.optim.Adam( + policy_params, lr=hyperparameters.learning_rate + ) + self.value_optimizer = torch.optim.Adam( + value_params, lr=hyperparameters.learning_rate + ) + self.entropy_optimizer = torch.optim.Adam( + self._log_ent_coef.parameters(), lr=hyperparameters.learning_rate + ) + self._move_to_device(default_device()) + + @property + def critic(self): + return self._critic + + def _move_to_device(self, device: torch.device) -> None: + self._log_ent_coef.to(device) + self.target_network.to(device) + self._critic.to(device) + self.q_network.to(device) + + def sac_q_loss( + self, + q1_out: Dict[str, torch.Tensor], + q2_out: Dict[str, torch.Tensor], + target_values: Dict[str, torch.Tensor], + dones: torch.Tensor, + rewards: Dict[str, torch.Tensor], + loss_masks: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor]: + q1_losses = [] + q2_losses = [] + # Multiple q losses per stream + for i, name in enumerate(q1_out.keys()): + q1_stream = q1_out[name].squeeze() + q2_stream = q2_out[name].squeeze() + with torch.no_grad(): + q_backup = rewards[name] + ( + (1.0 - self.use_dones_in_backup[name] * dones) + * self.gammas[i] + * target_values[name] + ) + _q1_loss = 0.5 * ModelUtils.masked_mean( + torch.nn.functional.mse_loss(q_backup, q1_stream), loss_masks + ) + _q2_loss = 0.5 * ModelUtils.masked_mean( + torch.nn.functional.mse_loss(q_backup, q2_stream), loss_masks + ) + + q1_losses.append(_q1_loss) + q2_losses.append(_q2_loss) + q1_loss = torch.mean(torch.stack(q1_losses)) + q2_loss = torch.mean(torch.stack(q2_losses)) + return q1_loss, q2_loss + + def sac_value_loss( + self, + log_probs: ActionLogProbs, + values: Dict[str, torch.Tensor], + q1p_out: Dict[str, torch.Tensor], + q2p_out: Dict[str, torch.Tensor], + loss_masks: torch.Tensor, + ) -> torch.Tensor: + min_policy_qs = {} + with torch.no_grad(): + _cont_ent_coef = self._log_ent_coef.continuous.exp() + _disc_ent_coef = self._log_ent_coef.discrete.exp() + for name in values.keys(): + if self._action_spec.discrete_size <= 0: + min_policy_qs[name] = torch.min(q1p_out[name], q2p_out[name]) + else: + disc_action_probs = log_probs.all_discrete_tensor.exp() + _branched_q1p = ModelUtils.break_into_branches( + q1p_out[name] * disc_action_probs, + 
self._action_spec.discrete_branches, + ) + _branched_q2p = ModelUtils.break_into_branches( + q2p_out[name] * disc_action_probs, + self._action_spec.discrete_branches, + ) + _q1p_mean = torch.mean( + torch.stack( + [ + torch.sum(_br, dim=1, keepdim=True) + for _br in _branched_q1p + ] + ), + dim=0, + ) + _q2p_mean = torch.mean( + torch.stack( + [ + torch.sum(_br, dim=1, keepdim=True) + for _br in _branched_q2p + ] + ), + dim=0, + ) + + min_policy_qs[name] = torch.min(_q1p_mean, _q2p_mean) + + value_losses = [] + if self._action_spec.discrete_size <= 0: + for name in values.keys(): + with torch.no_grad(): + v_backup = min_policy_qs[name] - torch.sum( + _cont_ent_coef * log_probs.continuous_tensor, dim=1 + ) + value_loss = 0.5 * ModelUtils.masked_mean( + torch.nn.functional.mse_loss(values[name], v_backup), loss_masks + ) + value_losses.append(value_loss) + else: + disc_log_probs = log_probs.all_discrete_tensor + branched_per_action_ent = ModelUtils.break_into_branches( + disc_log_probs * disc_log_probs.exp(), + self._action_spec.discrete_branches, + ) + # We have to do entropy bonus per action branch + branched_ent_bonus = torch.stack( + [ + torch.sum(_disc_ent_coef[i] * _lp, dim=1, keepdim=True) + for i, _lp in enumerate(branched_per_action_ent) + ] + ) + for name in values.keys(): + with torch.no_grad(): + v_backup = min_policy_qs[name] - torch.mean( + branched_ent_bonus, axis=0 + ) + # Add continuous entropy bonus to minimum Q + if self._action_spec.continuous_size > 0: + v_backup += torch.sum( + _cont_ent_coef * log_probs.continuous_tensor, + dim=1, + keepdim=True, + ) + value_loss = 0.5 * ModelUtils.masked_mean( + torch.nn.functional.mse_loss(values[name], v_backup.squeeze()), + loss_masks, + ) + value_losses.append(value_loss) + value_loss = torch.mean(torch.stack(value_losses)) + if torch.isinf(value_loss).any() or torch.isnan(value_loss).any(): + raise UnityTrainerException("Inf found") + return value_loss + + def sac_policy_loss( + self, + log_probs: ActionLogProbs, + q1p_outs: Dict[str, torch.Tensor], + loss_masks: torch.Tensor, + ) -> torch.Tensor: + _cont_ent_coef, _disc_ent_coef = ( + self._log_ent_coef.continuous, + self._log_ent_coef.discrete, + ) + _cont_ent_coef = _cont_ent_coef.exp() + _disc_ent_coef = _disc_ent_coef.exp() + + mean_q1 = torch.mean(torch.stack(list(q1p_outs.values())), axis=0) + batch_policy_loss = 0 + if self._action_spec.discrete_size > 0: + disc_log_probs = log_probs.all_discrete_tensor + disc_action_probs = disc_log_probs.exp() + branched_per_action_ent = ModelUtils.break_into_branches( + disc_log_probs * disc_action_probs, self._action_spec.discrete_branches + ) + branched_q_term = ModelUtils.break_into_branches( + mean_q1 * disc_action_probs, self._action_spec.discrete_branches + ) + branched_policy_loss = torch.stack( + [ + torch.sum(_disc_ent_coef[i] * _lp - _qt, dim=1, keepdim=False) + for i, (_lp, _qt) in enumerate( + zip(branched_per_action_ent, branched_q_term) + ) + ], + dim=1, + ) + batch_policy_loss += torch.sum(branched_policy_loss, dim=1) + all_mean_q1 = torch.sum(disc_action_probs * mean_q1, dim=1) + else: + all_mean_q1 = mean_q1 + if self._action_spec.continuous_size > 0: + cont_log_probs = log_probs.continuous_tensor + batch_policy_loss += ( + _cont_ent_coef * torch.sum(cont_log_probs, dim=1) - all_mean_q1 + ) + policy_loss = ModelUtils.masked_mean(batch_policy_loss, loss_masks) + + return policy_loss + + def sac_entropy_loss( + self, log_probs: ActionLogProbs, loss_masks: torch.Tensor + ) -> torch.Tensor: + _cont_ent_coef, _disc_ent_coef 
= ( + self._log_ent_coef.continuous, + self._log_ent_coef.discrete, + ) + entropy_loss = 0 + if self._action_spec.discrete_size > 0: + with torch.no_grad(): + # Break discrete log probs into separate branches + disc_log_probs = log_probs.all_discrete_tensor + branched_per_action_ent = ModelUtils.break_into_branches( + disc_log_probs * disc_log_probs.exp(), + self._action_spec.discrete_branches, + ) + target_current_diff_branched = torch.stack( + [ + torch.sum(_lp, axis=1, keepdim=True) + _te + for _lp, _te in zip( + branched_per_action_ent, self.target_entropy.discrete + ) + ], + axis=1, + ) + target_current_diff = torch.squeeze( + target_current_diff_branched, axis=2 + ) + entropy_loss += -1 * ModelUtils.masked_mean( + torch.mean(_disc_ent_coef * target_current_diff, axis=1), loss_masks + ) + if self._action_spec.continuous_size > 0: + with torch.no_grad(): + cont_log_probs = log_probs.continuous_tensor + target_current_diff = ( + torch.sum(cont_log_probs, dim=1) + self.target_entropy.continuous + ) + # We update all the _cont_ent_coef as one block + entropy_loss += -1 * ModelUtils.masked_mean( + _cont_ent_coef * target_current_diff, loss_masks + ) + + return entropy_loss + + def _condense_q_streams( + self, q_output: Dict[str, torch.Tensor], discrete_actions: torch.Tensor + ) -> Dict[str, torch.Tensor]: + condensed_q_output = {} + onehot_actions = ModelUtils.actions_to_onehot( + discrete_actions, self._action_spec.discrete_branches + ) + for key, item in q_output.items(): + branched_q = ModelUtils.break_into_branches( + item, self._action_spec.discrete_branches + ) + only_action_qs = torch.stack( + [ + torch.sum(_act * _q, dim=1, keepdim=True) + for _act, _q in zip(onehot_actions, branched_q) + ] + ) + + condensed_q_output[key] = torch.mean(only_action_qs, dim=0) + return condensed_q_output + + @timed + def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: + """ + Updates model using buffer. + :param batch: Experience mini-batch. + :param num_sequences: Number of trajectories in batch. + :return: Output from update process. + """ + rewards = {} + for name in self.reward_signals: + rewards[name] = ModelUtils.list_to_tensor( + batch[RewardSignalUtil.rewards_key(name)] + ) + + n_obs = len(self.policy.behavior_spec.observation_specs) + current_obs = ObsUtil.from_buffer(batch, n_obs) + # Convert to tensors + current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs] + + next_obs = ObsUtil.from_buffer_next(batch, n_obs) + # Convert to tensors + next_obs = [ModelUtils.list_to_tensor(obs) for obs in next_obs] + + act_masks = ModelUtils.list_to_tensor(batch[BufferKey.ACTION_MASK]) + actions = AgentAction.from_buffer(batch) + + memories_list = [ + ModelUtils.list_to_tensor(batch[BufferKey.MEMORY][i]) + for i in range(0, len(batch[BufferKey.MEMORY]), self.policy.sequence_length) + ] + # LSTM shouldn't have sequence length <1, but stop it from going out of the index if true.
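# A minimal, self-contained sketch of what _condense_q_streams above computes, written with
# plain torch instead of the ModelUtils helpers; the branch sizes and tensor values are made
# up for illustration. For each discrete branch, keep only the Q-value of the action that was
# actually taken, then average across branches.
import torch

q_values = torch.tensor([[1.0, 2.0, 3.0, 10.0, 20.0]])  # one sample, two branches of size 3 and 2
taken = torch.tensor([[2, 0]])                           # chosen action index per branch
branched_q = torch.split(q_values, [3, 2], dim=1)        # ([1., 2., 3.],), ([10., 20.],)
per_branch_q = [
    torch.gather(q, 1, taken[:, i : i + 1]) for i, q in enumerate(branched_q)
]                                                        # [[3.0]], [[10.0]]
condensed = torch.mean(torch.stack(per_branch_q), dim=0)  # tensor([[6.5]]), cf. condensed_q_output[key]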
+ value_memories_list = [ + ModelUtils.list_to_tensor(batch[BufferKey.CRITIC_MEMORY][i]) + for i in range( + 0, len(batch[BufferKey.CRITIC_MEMORY]), self.policy.sequence_length + ) + ] + + if len(memories_list) > 0: + memories = torch.stack(memories_list).unsqueeze(0) + value_memories = torch.stack(value_memories_list).unsqueeze(0) + else: + memories = None + value_memories = None + + # Q and V network memories are 0'ed out, since we don't have them during inference. + q_memories = ( + torch.zeros_like(value_memories) if value_memories is not None else None + ) + + # Copy normalizers from policy + self.q_network.q1_network.network_body.copy_normalization( + self.policy.actor.network_body + ) + self.q_network.q2_network.network_body.copy_normalization( + self.policy.actor.network_body + ) + self.target_network.network_body.copy_normalization( + self.policy.actor.network_body + ) + self._critic.network_body.copy_normalization(self.policy.actor.network_body) + sampled_actions, run_out, _, = self.policy.actor.get_action_and_stats( + current_obs, + masks=act_masks, + memories=memories, + sequence_length=self.policy.sequence_length, + ) + log_probs = run_out["log_probs"] + value_estimates, _ = self._critic.critic_pass( + current_obs, value_memories, sequence_length=self.policy.sequence_length + ) + + cont_sampled_actions = sampled_actions.continuous_tensor + cont_actions = actions.continuous_tensor + q1p_out, q2p_out = self.q_network( + current_obs, + cont_sampled_actions, + memories=q_memories, + sequence_length=self.policy.sequence_length, + q2_grad=False, + ) + q1_out, q2_out = self.q_network( + current_obs, + cont_actions, + memories=q_memories, + sequence_length=self.policy.sequence_length, + ) + + if self._action_spec.discrete_size > 0: + disc_actions = actions.discrete_tensor + q1_stream = self._condense_q_streams(q1_out, disc_actions) + q2_stream = self._condense_q_streams(q2_out, disc_actions) + else: + q1_stream, q2_stream = q1_out, q2_out + + with torch.no_grad(): + # Since we didn't record the next value memories, evaluate one step in the critic to + # get them. 
+ if value_memories is not None: + # Get the first observation in each sequence + just_first_obs = [ + _obs[:: self.policy.sequence_length] for _obs in current_obs + ] + _, next_value_memories = self._critic.critic_pass( + just_first_obs, value_memories, sequence_length=1 + ) + else: + next_value_memories = None + target_values, _ = self.target_network( + next_obs, + memories=next_value_memories, + sequence_length=self.policy.sequence_length, + ) + masks = ModelUtils.list_to_tensor(batch[BufferKey.MASKS], dtype=torch.bool) + dones = ModelUtils.list_to_tensor(batch[BufferKey.DONE]) + + q1_loss, q2_loss = self.sac_q_loss( + q1_stream, q2_stream, target_values, dones, rewards, masks + ) + value_loss = self.sac_value_loss( + log_probs, value_estimates, q1p_out, q2p_out, masks + ) + policy_loss = self.sac_policy_loss(log_probs, q1p_out, masks) + entropy_loss = self.sac_entropy_loss(log_probs, masks) + + total_value_loss = q1_loss + q2_loss + value_loss + + decay_lr = self.decay_learning_rate.get_value(self.policy.get_current_step()) + ModelUtils.update_learning_rate(self.policy_optimizer, decay_lr) + self.policy_optimizer.zero_grad() + policy_loss.backward() + self.policy_optimizer.step() + + ModelUtils.update_learning_rate(self.value_optimizer, decay_lr) + self.value_optimizer.zero_grad() + total_value_loss.backward() + self.value_optimizer.step() + + ModelUtils.update_learning_rate(self.entropy_optimizer, decay_lr) + self.entropy_optimizer.zero_grad() + entropy_loss.backward() + self.entropy_optimizer.step() + + # Update target network + ModelUtils.soft_update(self._critic, self.target_network, self.tau) + update_stats = { + "Losses/Policy Loss": policy_loss.item(), + "Losses/Value Loss": value_loss.item(), + "Losses/Q1 Loss": q1_loss.item(), + "Losses/Q2 Loss": q2_loss.item(), + "Policy/Discrete Entropy Coeff": torch.mean( + torch.exp(self._log_ent_coef.discrete) + ).item(), + "Policy/Continuous Entropy Coeff": torch.mean( + torch.exp(self._log_ent_coef.continuous) + ).item(), + "Policy/Learning Rate": decay_lr, + } + + return update_stats + + def get_modules(self): + modules = { + "Optimizer:q_network": self.q_network, + "Optimizer:value_network": self._critic, + "Optimizer:target_network": self.target_network, + "Optimizer:policy_optimizer": self.policy_optimizer, + "Optimizer:value_optimizer": self.value_optimizer, + "Optimizer:entropy_optimizer": self.entropy_optimizer, + } + for reward_provider in self.reward_signals.values(): + modules.update(reward_provider.get_modules()) + return modules diff --git a/MLPY/Lib/site-packages/mlagents/trainers/sac/trainer.py b/MLPY/Lib/site-packages/mlagents/trainers/sac/trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..56860c7381849dfb4ec4edab2813972c76f568ed --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/sac/trainer.py @@ -0,0 +1,181 @@ +# ## ML-Agent Learning (SAC) +# Contains an implementation of SAC as described in https://arxiv.org/abs/1801.01290 +# and implemented in https://github.com/hill-a/stable-baselines + +from typing import cast + +import numpy as np + +from mlagents_envs.logging_util import get_logger +from mlagents_envs.base_env import BehaviorSpec +from mlagents.trainers.buffer import BufferKey +from mlagents.trainers.optimizer.torch_optimizer import TorchOptimizer +from mlagents.trainers.trainer.off_policy_trainer import OffPolicyTrainer +from mlagents.trainers.policy.torch_policy import TorchPolicy +from mlagents.trainers.policy.policy import Policy +from 
mlagents.trainers.sac.optimizer_torch import TorchSACOptimizer, SACSettings +from mlagents.trainers.trajectory import Trajectory, ObsUtil +from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers +from mlagents.trainers.settings import TrainerSettings + +from mlagents.trainers.torch_entities.networks import SimpleActor + +logger = get_logger(__name__) + +BUFFER_TRUNCATE_PERCENT = 0.8 + +TRAINER_NAME = "sac" + + +class SACTrainer(OffPolicyTrainer): + """ + The SACTrainer is an implementation of the SAC algorithm, with support + for discrete actions and recurrent networks. + """ + + def __init__( + self, + behavior_name: str, + reward_buff_cap: int, + trainer_settings: TrainerSettings, + training: bool, + load: bool, + seed: int, + artifact_path: str, + ): + """ + Responsible for collecting experiences and training SAC model. + :param behavior_name: The name of the behavior associated with trainer config + :param reward_buff_cap: Max reward history to track in the reward buffer + :param trainer_settings: The parameters for the trainer. + :param training: Whether the trainer is set for training. + :param load: Whether the model should be loaded. + :param seed: The seed the model will be initialized with + :param artifact_path: The directory within which to store artifacts from this trainer. + """ + super().__init__( + behavior_name, + reward_buff_cap, + trainer_settings, + training, + load, + seed, + artifact_path, + ) + + self.seed = seed + self.policy: TorchPolicy = None # type: ignore + self.optimizer: TorchSACOptimizer = None # type: ignore + self.hyperparameters: SACSettings = cast( + SACSettings, trainer_settings.hyperparameters + ) + self._step = 0 + + # Don't divide by zero + self.update_steps = 1 + self.reward_signal_update_steps = 1 + + self.steps_per_update = self.hyperparameters.steps_per_update + self.reward_signal_steps_per_update = ( + self.hyperparameters.reward_signal_steps_per_update + ) + + self.checkpoint_replay_buffer = self.hyperparameters.save_replay_buffer + + def _process_trajectory(self, trajectory: Trajectory) -> None: + """ + Takes a trajectory and processes it, putting it into the replay buffer. + """ + super()._process_trajectory(trajectory) + last_step = trajectory.steps[-1] + agent_id = trajectory.agent_id # All the agents should have the same ID + + agent_buffer_trajectory = trajectory.to_agentbuffer() + # Check if we used group rewards, warn if so. 
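# The hyperparameters cast to SACSettings in __init__ above (steps_per_update,
# reward_signal_steps_per_update, save_replay_buffer, ...) come from the trainer
# configuration. A minimal sketch, assuming mlagents is importable and that SACSettings
# accepts these OffPolicyHyperparamSettings fields as keyword arguments; the values are made up:
from mlagents.trainers.sac.optimizer_torch import SACSettings

hp = SACSettings(
    batch_size=128,                       # minibatch size sampled from the replay buffer
    buffer_size=50000,                    # replay buffer capacity
    steps_per_update=10.0,                # roughly, agent steps collected per model update
    reward_signal_steps_per_update=10.0,  # same ratio, for the reward-signal updates
    save_replay_buffer=False,             # surfaces as checkpoint_replay_buffer in the trainer
)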
+ self._warn_if_group_reward(agent_buffer_trajectory) + + # Update the normalization + if self.is_training: + self.policy.actor.update_normalization(agent_buffer_trajectory) + self.optimizer.critic.update_normalization(agent_buffer_trajectory) + + # Evaluate all reward functions for reporting purposes + self.collected_rewards["environment"][agent_id] += np.sum( + agent_buffer_trajectory[BufferKey.ENVIRONMENT_REWARDS] + ) + for name, reward_signal in self.optimizer.reward_signals.items(): + evaluate_result = ( + reward_signal.evaluate(agent_buffer_trajectory) * reward_signal.strength + ) + + # Report the reward signals + self.collected_rewards[name][agent_id] += np.sum(evaluate_result) + + # Get all value estimates for reporting purposes + ( + value_estimates, + _, + value_memories, + ) = self.optimizer.get_trajectory_value_estimates( + agent_buffer_trajectory, trajectory.next_obs, trajectory.done_reached + ) + if value_memories is not None: + agent_buffer_trajectory[BufferKey.CRITIC_MEMORY].set(value_memories) + + for name, v in value_estimates.items(): + self._stats_reporter.add_stat( + f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Value", + np.mean(v), + ) + + # Bootstrap using the last step rather than the bootstrap step if max step is reached. + # Set last element to duplicate obs and remove dones. + if last_step.interrupted: + last_step_obs = last_step.obs + for i, obs in enumerate(last_step_obs): + agent_buffer_trajectory[ObsUtil.get_name_at_next(i)][-1] = obs + agent_buffer_trajectory[BufferKey.DONE][-1] = False + + self._append_to_update_buffer(agent_buffer_trajectory) + + if trajectory.done_reached: + self._update_end_episode_stats(agent_id, self.optimizer) + + def create_optimizer(self) -> TorchOptimizer: + return TorchSACOptimizer( # type: ignore + cast(TorchPolicy, self.policy), self.trainer_settings # type: ignore + ) # type: ignore + + def create_policy( + self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec + ) -> TorchPolicy: + """ + Creates a policy with a PyTorch backend and SAC hyperparameters + :param parsed_behavior_id: + :param behavior_spec: specifications for policy construction + :return policy + """ + actor_cls = SimpleActor + actor_kwargs = {"conditional_sigma": True, "tanh_squash": True} + + policy = TorchPolicy( + self.seed, + behavior_spec, + self.trainer_settings.network_settings, + actor_cls, + actor_kwargs, + ) + self.maybe_load_replay_buffer() + return policy + + def get_policy(self, name_behavior_id: str) -> Policy: + """ + Gets policy from trainer associated with name_behavior_id + :param name_behavior_id: full identifier of policy + """ + + return self.policy + + @staticmethod + def get_trainer_name() -> str: + return TRAINER_NAME diff --git a/MLPY/Lib/site-packages/mlagents/trainers/settings.py b/MLPY/Lib/site-packages/mlagents/trainers/settings.py new file mode 100644 index 0000000000000000000000000000000000000000..7cff991ba22ec8c55e8fc785e46b346a58c7ceaf --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/settings.py @@ -0,0 +1,961 @@ +import os.path +import warnings + +import attr +import cattr +from typing import ( + Dict, + Optional, + List, + Any, + DefaultDict, + Mapping, + Tuple, + Union, + ClassVar, +) +from enum import Enum +import collections +import argparse +import abc +import numpy as np +import math +import copy + +from mlagents.trainers.cli_utils import StoreConfigFile, DetectDefault, parser +from mlagents.trainers.cli_utils import load_config +from mlagents.trainers.exception import 
TrainerConfigError, TrainerConfigWarning + +from mlagents_envs import logging_util +from mlagents_envs.side_channel.environment_parameters_channel import ( + EnvironmentParametersChannel, +) +from mlagents.plugins import all_trainer_settings, all_trainer_types + +logger = logging_util.get_logger(__name__) + + +def check_and_structure(key: str, value: Any, class_type: type) -> Any: + attr_fields_dict = attr.fields_dict(class_type) + if key not in attr_fields_dict: + raise TrainerConfigError( + f"The option {key} was specified in your YAML file for {class_type.__name__}, but is invalid." + ) + # Apply cattr structure to the values + return cattr.structure(value, attr_fields_dict[key].type) + + +def check_hyperparam_schedules(val: Dict, trainer_type: str) -> Dict: + # Check if beta and epsilon are set. If not, set to match learning rate schedule. + if trainer_type == "ppo" or trainer_type == "poca": + if "beta_schedule" not in val.keys() and "learning_rate_schedule" in val.keys(): + val["beta_schedule"] = val["learning_rate_schedule"] + if ( + "epsilon_schedule" not in val.keys() + and "learning_rate_schedule" in val.keys() + ): + val["epsilon_schedule"] = val["learning_rate_schedule"] + return val + + +def strict_to_cls(d: Mapping, t: type) -> Any: + if not isinstance(d, Mapping): + raise TrainerConfigError(f"Unsupported config {d} for {t.__name__}.") + d_copy: Dict[str, Any] = {} + d_copy.update(d) + for key, val in d_copy.items(): + d_copy[key] = check_and_structure(key, val, t) + return t(**d_copy) + + +def defaultdict_to_dict(d: DefaultDict) -> Dict: + return {key: cattr.unstructure(val) for key, val in d.items()} + + +def deep_update_dict(d: Dict, update_d: Mapping) -> None: + """ + Similar to dict.update(), but works for nested dicts of dicts as well. + """ + for key, val in update_d.items(): + if key in d and isinstance(d[key], Mapping) and isinstance(val, Mapping): + deep_update_dict(d[key], val) + else: + d[key] = val + + +class SerializationSettings: + convert_to_onnx = True + onnx_opset = 9 + + +@attr.s(auto_attribs=True) +class ExportableSettings: + def as_dict(self): + return cattr.unstructure(self) + + +class EncoderType(Enum): + FULLY_CONNECTED = "fully_connected" + MATCH3 = "match3" + SIMPLE = "simple" + NATURE_CNN = "nature_cnn" + RESNET = "resnet" + + +class ScheduleType(Enum): + CONSTANT = "constant" + LINEAR = "linear" + # TODO add support for lesson based scheduling + # LESSON = "lesson" + + +class ConditioningType(Enum): + HYPER = "hyper" + NONE = "none" + + +@attr.s(auto_attribs=True) +class NetworkSettings: + @attr.s + class MemorySettings: + sequence_length: int = attr.ib(default=64) + memory_size: int = attr.ib(default=128) + + @memory_size.validator + def _check_valid_memory_size(self, attribute, value): + if value <= 0: + raise TrainerConfigError( + "When using a recurrent network, memory size must be greater than 0." + ) + elif value % 2 != 0: + raise TrainerConfigError( + "When using a recurrent network, memory size must be divisible by 2." 
+ ) + + normalize: bool = False + hidden_units: int = 128 + num_layers: int = 2 + vis_encode_type: EncoderType = EncoderType.SIMPLE + memory: Optional[MemorySettings] = None + goal_conditioning_type: ConditioningType = ConditioningType.HYPER + deterministic: bool = parser.get_default("deterministic") + + +@attr.s(auto_attribs=True) +class BehavioralCloningSettings: + demo_path: str + steps: int = 0 + strength: float = 1.0 + samples_per_update: int = 0 + # Setting either of these to None will allow the Optimizer + # to decide these parameters, based on Trainer hyperparams + num_epoch: Optional[int] = None + batch_size: Optional[int] = None + + +@attr.s(auto_attribs=True) +class HyperparamSettings: + batch_size: int = 1024 + buffer_size: int = 10240 + learning_rate: float = 3.0e-4 + learning_rate_schedule: ScheduleType = ScheduleType.CONSTANT + + +@attr.s(auto_attribs=True) +class OnPolicyHyperparamSettings(HyperparamSettings): + num_epoch: int = 3 + + +@attr.s(auto_attribs=True) +class OffPolicyHyperparamSettings(HyperparamSettings): + batch_size: int = 128 + buffer_size: int = 50000 + buffer_init_steps: int = 0 + steps_per_update: float = 1 + save_replay_buffer: bool = False + reward_signal_steps_per_update: float = 4 + + +# INTRINSIC REWARD SIGNALS ############################################################# +class RewardSignalType(Enum): + EXTRINSIC: str = "extrinsic" + GAIL: str = "gail" + CURIOSITY: str = "curiosity" + RND: str = "rnd" + + def to_settings(self) -> type: + _mapping = { + RewardSignalType.EXTRINSIC: RewardSignalSettings, + RewardSignalType.GAIL: GAILSettings, + RewardSignalType.CURIOSITY: CuriositySettings, + RewardSignalType.RND: RNDSettings, + } + return _mapping[self] + + +@attr.s(auto_attribs=True) +class RewardSignalSettings: + gamma: float = 0.99 + strength: float = 1.0 + network_settings: NetworkSettings = attr.ib(factory=NetworkSettings) + + @staticmethod + def structure(d: Mapping, t: type) -> Any: + """ + Helper method to structure a Dict of RewardSignalSettings class. Meant to be registered with + cattr.register_structure_hook() and called with cattr.structure(). This is needed to handle + the special Enum selection of RewardSignalSettings classes. + """ + if not isinstance(d, Mapping): + raise TrainerConfigError(f"Unsupported reward signal configuration {d}.") + d_final: Dict[RewardSignalType, RewardSignalSettings] = {} + for key, val in d.items(): + enum_key = RewardSignalType(key) + t = enum_key.to_settings() + d_final[enum_key] = strict_to_cls(val, t) + # Checks to see if user specifying deprecated encoding_size for RewardSignals. + # If network_settings is not specified, this updates the default hidden_units + # to the value of encoding size. If specified, this ignores encoding size and + # uses network_settings values. + if "encoding_size" in val: + logger.warning( + "'encoding_size' was deprecated for RewardSignals. Please use network_settings." + ) + # If network settings was not specified, use the encoding size. 
Otherwise, use hidden_units + if "network_settings" not in val: + d_final[enum_key].network_settings.hidden_units = val[ + "encoding_size" + ] + return d_final + + +@attr.s(auto_attribs=True) +class GAILSettings(RewardSignalSettings): + learning_rate: float = 3e-4 + encoding_size: Optional[int] = None + use_actions: bool = False + use_vail: bool = False + demo_path: str = attr.ib(kw_only=True) + + +@attr.s(auto_attribs=True) +class CuriositySettings(RewardSignalSettings): + learning_rate: float = 3e-4 + encoding_size: Optional[int] = None + + +@attr.s(auto_attribs=True) +class RNDSettings(RewardSignalSettings): + learning_rate: float = 1e-4 + encoding_size: Optional[int] = None + + +# SAMPLERS ############################################################################# +class ParameterRandomizationType(Enum): + UNIFORM: str = "uniform" + GAUSSIAN: str = "gaussian" + MULTIRANGEUNIFORM: str = "multirangeuniform" + CONSTANT: str = "constant" + + def to_settings(self) -> type: + _mapping = { + ParameterRandomizationType.UNIFORM: UniformSettings, + ParameterRandomizationType.GAUSSIAN: GaussianSettings, + ParameterRandomizationType.MULTIRANGEUNIFORM: MultiRangeUniformSettings, + ParameterRandomizationType.CONSTANT: ConstantSettings + # Constant type is handled if a float is provided instead of a config + } + return _mapping[self] + + +@attr.s(auto_attribs=True) +class ParameterRandomizationSettings(abc.ABC): + seed: int = parser.get_default("seed") + + def __str__(self) -> str: + """ + Helper method to output sampler stats to console. + """ + raise TrainerConfigError(f"__str__ not implemented for type {self.__class__}.") + + @staticmethod + def structure( + d: Union[Mapping, float], t: type + ) -> "ParameterRandomizationSettings": + """ + Helper method to a ParameterRandomizationSettings class. Meant to be registered with + cattr.register_structure_hook() and called with cattr.structure(). This is needed to handle + the special Enum selection of ParameterRandomizationSettings classes. + """ + if isinstance(d, (float, int)): + return ConstantSettings(value=d) + if not isinstance(d, Mapping): + raise TrainerConfigError( + f"Unsupported parameter randomization configuration {d}." + ) + if "sampler_type" not in d: + raise TrainerConfigError( + f"Sampler configuration does not contain sampler_type : {d}." + ) + if "sampler_parameters" not in d: + raise TrainerConfigError( + f"Sampler configuration does not contain sampler_parameters : {d}." + ) + enum_key = ParameterRandomizationType(d["sampler_type"]) + t = enum_key.to_settings() + return strict_to_cls(d["sampler_parameters"], t) + + @staticmethod + def unstructure(d: "ParameterRandomizationSettings") -> Mapping: + """ + Helper method to a ParameterRandomizationSettings class. Meant to be registered with + cattr.register_unstructure_hook() and called with cattr.unstructure(). 
+ """ + _reversed_mapping = { + UniformSettings: ParameterRandomizationType.UNIFORM, + GaussianSettings: ParameterRandomizationType.GAUSSIAN, + MultiRangeUniformSettings: ParameterRandomizationType.MULTIRANGEUNIFORM, + ConstantSettings: ParameterRandomizationType.CONSTANT, + } + sampler_type: Optional[str] = None + for t, name in _reversed_mapping.items(): + if isinstance(d, t): + sampler_type = name.value + sampler_parameters = attr.asdict(d) + return {"sampler_type": sampler_type, "sampler_parameters": sampler_parameters} + + @abc.abstractmethod + def apply(self, key: str, env_channel: EnvironmentParametersChannel) -> None: + """ + Helper method to send sampler settings over EnvironmentParametersChannel + Calls the appropriate sampler type set method. + :param key: environment parameter to be sampled + :param env_channel: The EnvironmentParametersChannel to communicate sampler settings to environment + """ + pass + + +@attr.s(auto_attribs=True) +class ConstantSettings(ParameterRandomizationSettings): + value: float = 0.0 + + def __str__(self) -> str: + """ + Helper method to output sampler stats to console. + """ + return f"Float: value={self.value}" + + def apply(self, key: str, env_channel: EnvironmentParametersChannel) -> None: + """ + Helper method to send sampler settings over EnvironmentParametersChannel + Calls the constant sampler type set method. + :param key: environment parameter to be sampled + :param env_channel: The EnvironmentParametersChannel to communicate sampler settings to environment + """ + env_channel.set_float_parameter(key, self.value) + + +@attr.s(auto_attribs=True) +class UniformSettings(ParameterRandomizationSettings): + min_value: float = attr.ib() + max_value: float = 1.0 + + def __str__(self) -> str: + """ + Helper method to output sampler stats to console. + """ + return f"Uniform sampler: min={self.min_value}, max={self.max_value}" + + @min_value.default + def _min_value_default(self): + return 0.0 + + @min_value.validator + def _check_min_value(self, attribute, value): + if self.min_value > self.max_value: + raise TrainerConfigError( + "Minimum value is greater than maximum value in uniform sampler." + ) + + def apply(self, key: str, env_channel: EnvironmentParametersChannel) -> None: + """ + Helper method to send sampler settings over EnvironmentParametersChannel + Calls the uniform sampler type set method. + :param key: environment parameter to be sampled + :param env_channel: The EnvironmentParametersChannel to communicate sampler settings to environment + """ + env_channel.set_uniform_sampler_parameters( + key, self.min_value, self.max_value, self.seed + ) + + +@attr.s(auto_attribs=True) +class GaussianSettings(ParameterRandomizationSettings): + mean: float = 1.0 + st_dev: float = 1.0 + + def __str__(self) -> str: + """ + Helper method to output sampler stats to console. + """ + return f"Gaussian sampler: mean={self.mean}, stddev={self.st_dev}" + + def apply(self, key: str, env_channel: EnvironmentParametersChannel) -> None: + """ + Helper method to send sampler settings over EnvironmentParametersChannel + Calls the gaussian sampler type set method. 
+ :param key: environment parameter to be sampled + :param env_channel: The EnvironmentParametersChannel to communicate sampler settings to environment + """ + env_channel.set_gaussian_sampler_parameters( + key, self.mean, self.st_dev, self.seed + ) + + +@attr.s(auto_attribs=True) +class MultiRangeUniformSettings(ParameterRandomizationSettings): + intervals: List[Tuple[float, float]] = attr.ib() + + def __str__(self) -> str: + """ + Helper method to output sampler stats to console. + """ + return f"MultiRangeUniform sampler: intervals={self.intervals}" + + @intervals.default + def _intervals_default(self): + return [[0.0, 1.0]] + + @intervals.validator + def _check_intervals(self, attribute, value): + for interval in self.intervals: + if len(interval) != 2: + raise TrainerConfigError( + f"The sampling interval {interval} must contain exactly two values." + ) + min_value, max_value = interval + if min_value > max_value: + raise TrainerConfigError( + f"Minimum value is greater than maximum value in interval {interval}." + ) + + def apply(self, key: str, env_channel: EnvironmentParametersChannel) -> None: + """ + Helper method to send sampler settings over EnvironmentParametersChannel + Calls the multirangeuniform sampler type set method. + :param key: environment parameter to be sampled + :param env_channel: The EnvironmentParametersChannel to communicate sampler settings to environment + """ + env_channel.set_multirangeuniform_sampler_parameters( + key, self.intervals, self.seed + ) + + +# ENVIRONMENT PARAMETERS ############################################################### +@attr.s(auto_attribs=True) +class CompletionCriteriaSettings: + """ + CompletionCriteriaSettings contains the information needed to figure out if the next + lesson must start. + """ + + class MeasureType(Enum): + PROGRESS: str = "progress" + REWARD: str = "reward" + + behavior: str + measure: MeasureType = attr.ib(default=MeasureType.REWARD) + min_lesson_length: int = 0 + signal_smoothing: bool = True + threshold: float = attr.ib(default=0.0) + require_reset: bool = False + + @threshold.validator + def _check_threshold_value(self, attribute, value): + """ + Verify that the threshold has a value between 0 and 1 when the measure is + PROGRESS + """ + if self.measure == self.MeasureType.PROGRESS: + if self.threshold > 1.0: + raise TrainerConfigError( + "Threshold for next lesson cannot be greater than 1 when the measure is progress." + ) + if self.threshold < 0.0: + raise TrainerConfigError( + "Threshold for next lesson cannot be negative when the measure is progress." + ) + + def need_increment( + self, progress: float, reward_buffer: List[float], smoothing: float + ) -> Tuple[bool, float]: + """ + Given measures, this method returns a boolean indicating if the lesson + needs to change now, and a float corresponding to the new smoothed value. 
+ """ + # Is the min number of episodes reached + if len(reward_buffer) < self.min_lesson_length: + return False, smoothing + if self.measure == CompletionCriteriaSettings.MeasureType.PROGRESS: + if progress > self.threshold: + return True, smoothing + if self.measure == CompletionCriteriaSettings.MeasureType.REWARD: + if len(reward_buffer) < 1: + return False, smoothing + measure = np.mean(reward_buffer) + if math.isnan(measure): + return False, smoothing + if self.signal_smoothing: + measure = 0.25 * smoothing + 0.75 * measure + smoothing = measure + if measure > self.threshold: + return True, smoothing + return False, smoothing + + +@attr.s(auto_attribs=True) +class Lesson: + """ + Gathers the data of one lesson for one environment parameter including its name, + the condition that must be fullfiled for the lesson to be completed and a sampler + for the environment parameter. If the completion_criteria is None, then this is + the last lesson in the curriculum. + """ + + value: ParameterRandomizationSettings + name: str + completion_criteria: Optional[CompletionCriteriaSettings] = attr.ib(default=None) + + +@attr.s(auto_attribs=True) +class EnvironmentParameterSettings: + """ + EnvironmentParameterSettings is an ordered list of lessons for one environment + parameter. + """ + + curriculum: List[Lesson] + + @staticmethod + def _check_lesson_chain(lessons, parameter_name): + """ + Ensures that when using curriculum, all non-terminal lessons have a valid + CompletionCriteria, and that the terminal lesson does not contain a CompletionCriteria. + """ + num_lessons = len(lessons) + for index, lesson in enumerate(lessons): + if index < num_lessons - 1 and lesson.completion_criteria is None: + raise TrainerConfigError( + f"A non-terminal lesson does not have a completion_criteria for {parameter_name}." + ) + if index == num_lessons - 1 and lesson.completion_criteria is not None: + warnings.warn( + f"Your final lesson definition contains completion_criteria for {parameter_name}." + f"It will be ignored.", + TrainerConfigWarning, + ) + + @staticmethod + def structure(d: Mapping, t: type) -> Dict[str, "EnvironmentParameterSettings"]: + """ + Helper method to structure a Dict of EnvironmentParameterSettings class. Meant + to be registered with cattr.register_structure_hook() and called with + cattr.structure(). + """ + if not isinstance(d, Mapping): + raise TrainerConfigError( + f"Unsupported parameter environment parameter settings {d}." 
+ ) + d_final: Dict[str, EnvironmentParameterSettings] = {} + for environment_parameter, environment_parameter_config in d.items(): + if ( + isinstance(environment_parameter_config, Mapping) + and "curriculum" in environment_parameter_config + ): + d_final[environment_parameter] = strict_to_cls( + environment_parameter_config, EnvironmentParameterSettings + ) + EnvironmentParameterSettings._check_lesson_chain( + d_final[environment_parameter].curriculum, environment_parameter + ) + else: + sampler = ParameterRandomizationSettings.structure( + environment_parameter_config, ParameterRandomizationSettings + ) + d_final[environment_parameter] = EnvironmentParameterSettings( + curriculum=[ + Lesson( + completion_criteria=None, + value=sampler, + name=environment_parameter, + ) + ] + ) + return d_final + + +# TRAINERS ############################################################################# +@attr.s(auto_attribs=True) +class SelfPlaySettings: + save_steps: int = 20000 + team_change: int = attr.ib() + + @team_change.default + def _team_change_default(self): + # Assign team_change to about 4x save_steps + return self.save_steps * 5 + + swap_steps: int = 2000 + window: int = 10 + play_against_latest_model_ratio: float = 0.5 + initial_elo: float = 1200.0 + + +@attr.s(auto_attribs=True) +class TrainerSettings(ExportableSettings): + default_override: ClassVar[Optional["TrainerSettings"]] = None + trainer_type: str = "ppo" + hyperparameters: HyperparamSettings = attr.ib() + + @hyperparameters.default + def _set_default_hyperparameters(self): + return all_trainer_settings[self.trainer_type]() + + network_settings: NetworkSettings = attr.ib(factory=NetworkSettings) + reward_signals: Dict[RewardSignalType, RewardSignalSettings] = attr.ib( + factory=lambda: {RewardSignalType.EXTRINSIC: RewardSignalSettings()} + ) + init_path: Optional[str] = None + keep_checkpoints: int = 5 + checkpoint_interval: int = 500000 + max_steps: int = 500000 + time_horizon: int = 64 + summary_freq: int = 50000 + threaded: bool = False + self_play: Optional[SelfPlaySettings] = None + behavioral_cloning: Optional[BehavioralCloningSettings] = None + + cattr.register_structure_hook_func( + lambda t: t == Dict[RewardSignalType, RewardSignalSettings], + RewardSignalSettings.structure, + ) + + @network_settings.validator + def _check_batch_size_seq_length(self, attribute, value): + if self.network_settings.memory is not None: + if ( + self.network_settings.memory.sequence_length + > self.hyperparameters.batch_size + ): + raise TrainerConfigError( + "When using memory, sequence length must be less than or equal to batch size. " + ) + + @staticmethod + def dict_to_trainerdict(d: Dict, t: type) -> "TrainerSettings.DefaultTrainerDict": + return TrainerSettings.DefaultTrainerDict( + cattr.structure(d, Dict[str, TrainerSettings]) + ) + + @staticmethod + def structure(d: Mapping, t: type) -> Any: + """ + Helper method to structure a TrainerSettings class. Meant to be registered with + cattr.register_structure_hook() and called with cattr.structure(). + """ + + if not isinstance(d, Mapping): + raise TrainerConfigError(f"Unsupported config {d} for {t.__name__}.") + + d_copy: Dict[str, Any] = {} + + # Check if a default_settings was specified. If so, used those as the default + # rather than an empty dict. 
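# TrainerSettings.structure above, like the reward-signal and sampler hooks earlier in this
# file, funnels YAML-derived dicts through strict_to_cls / check_and_structure. A minimal
# sketch on NetworkSettings, assuming mlagents is importable; the field values are made up:
from mlagents.trainers.settings import NetworkSettings, strict_to_cls

ns = strict_to_cls({"hidden_units": 256, "num_layers": 3, "normalize": True}, NetworkSettings)
print(ns.hidden_units)  # 256; an unrecognized key would raise TrainerConfigError instead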
+ if TrainerSettings.default_override is not None: + d_copy.update(cattr.unstructure(TrainerSettings.default_override)) + + deep_update_dict(d_copy, d) + + if "framework" in d_copy: + logger.warning("Framework option was deprecated but was specified") + d_copy.pop("framework", None) + + for key, val in d_copy.items(): + if attr.has(type(val)): + # Don't convert already-converted attrs classes. + continue + if key == "hyperparameters": + if "trainer_type" not in d_copy: + raise TrainerConfigError( + "Hyperparameters were specified but no trainer_type was given." + ) + else: + d_copy[key] = check_hyperparam_schedules( + val, d_copy["trainer_type"] + ) + try: + d_copy[key] = strict_to_cls( + d_copy[key], all_trainer_settings[d_copy["trainer_type"]] + ) + except KeyError: + raise TrainerConfigError( + f"Settings for trainer type {d_copy['trainer_type']} were not found" + ) + elif key == "max_steps": + d_copy[key] = int(float(val)) + # In some legacy configs, max steps was specified as a float + elif key == "trainer_type": + if val not in all_trainer_types.keys(): + raise TrainerConfigError(f"Invalid trainer type {val} was found") + else: + d_copy[key] = check_and_structure(key, val, t) + return t(**d_copy) + + class DefaultTrainerDict(collections.defaultdict): + def __init__(self, *args): + # Depending on how this is called, args may have the defaultdict + # callable at the start of the list or not. In particular, unpickling + # will pass [TrainerSettings]. + if args and args[0] == TrainerSettings: + super().__init__(*args) + else: + super().__init__(TrainerSettings, *args) + self._config_specified = True + + def set_config_specified(self, require_config_specified: bool) -> None: + self._config_specified = require_config_specified + + def __missing__(self, key: Any) -> "TrainerSettings": + if TrainerSettings.default_override is not None: + self[key] = copy.deepcopy(TrainerSettings.default_override) + elif self._config_specified: + raise TrainerConfigError( + f"The behavior name {key} has not been specified in the trainer configuration. " + f"Please add an entry in the configuration file for {key}, or set default_settings." + ) + else: + logger.warning( + f"Behavior name {key} does not match any behaviors specified " + f"in the trainer configuration file. A default configuration will be used." + ) + self[key] = TrainerSettings() + return self[key] + + +# COMMAND LINE ######################################################################### +@attr.s(auto_attribs=True) +class CheckpointSettings: + run_id: str = parser.get_default("run_id") + initialize_from: Optional[str] = parser.get_default("initialize_from") + load_model: bool = parser.get_default("load_model") + resume: bool = parser.get_default("resume") + force: bool = parser.get_default("force") + train_model: bool = parser.get_default("train_model") + inference: bool = parser.get_default("inference") + results_dir: str = parser.get_default("results_dir") + + @property + def write_path(self) -> str: + return os.path.join(self.results_dir, self.run_id) + + @property + def maybe_init_path(self) -> Optional[str]: + return ( + os.path.join(self.results_dir, self.initialize_from) + if self.initialize_from is not None + else None + ) + + @property + def run_logs_dir(self) -> str: + return os.path.join(self.write_path, "run_logs") + + def prioritize_resume_init(self) -> None: + """Prioritize explicit command line resume/init over conflicting yaml options. 
+ if both resume/init are set at one place use resume""" + _non_default_args = DetectDefault.non_default_args + if "resume" in _non_default_args: + if self.initialize_from is not None: + logger.warning( + f"Both 'resume' and 'initialize_from={self.initialize_from}' are set!" + f" Current run will be resumed ignoring initialization." + ) + self.initialize_from = parser.get_default("initialize_from") + elif "initialize_from" in _non_default_args: + if self.resume: + logger.warning( + f"Both 'resume' and 'initialize_from={self.initialize_from}' are set!" + f" {self.run_id} is initialized_from {self.initialize_from} and resume will be ignored." + ) + self.resume = parser.get_default("resume") + elif self.resume and self.initialize_from is not None: + # no cli args but both are set in yaml file + logger.warning( + f"Both 'resume' and 'initialize_from={self.initialize_from}' are set in yaml file!" + f" Current run will be resumed ignoring initialization." + ) + self.initialize_from = parser.get_default("initialize_from") + + +@attr.s(auto_attribs=True) +class EnvironmentSettings: + env_path: Optional[str] = parser.get_default("env_path") + env_args: Optional[List[str]] = parser.get_default("env_args") + base_port: int = parser.get_default("base_port") + num_envs: int = attr.ib(default=parser.get_default("num_envs")) + num_areas: int = attr.ib(default=parser.get_default("num_areas")) + seed: int = parser.get_default("seed") + max_lifetime_restarts: int = parser.get_default("max_lifetime_restarts") + restarts_rate_limit_n: int = parser.get_default("restarts_rate_limit_n") + restarts_rate_limit_period_s: int = parser.get_default( + "restarts_rate_limit_period_s" + ) + + @num_envs.validator + def validate_num_envs(self, attribute, value): + if value > 1 and self.env_path is None: + raise ValueError("num_envs must be 1 if env_path is not set.") + + @num_areas.validator + def validate_num_area(self, attribute, value): + if value <= 0: + raise ValueError("num_areas must be set to a positive number >= 1.") + + +@attr.s(auto_attribs=True) +class EngineSettings: + width: int = parser.get_default("width") + height: int = parser.get_default("height") + quality_level: int = parser.get_default("quality_level") + time_scale: float = parser.get_default("time_scale") + target_frame_rate: int = parser.get_default("target_frame_rate") + capture_frame_rate: int = parser.get_default("capture_frame_rate") + no_graphics: bool = parser.get_default("no_graphics") + + +@attr.s(auto_attribs=True) +class TorchSettings: + device: Optional[str] = parser.get_default("device") + + +@attr.s(auto_attribs=True) +class RunOptions(ExportableSettings): + default_settings: Optional[TrainerSettings] = None + behaviors: TrainerSettings.DefaultTrainerDict = attr.ib( + factory=TrainerSettings.DefaultTrainerDict + ) + env_settings: EnvironmentSettings = attr.ib(factory=EnvironmentSettings) + engine_settings: EngineSettings = attr.ib(factory=EngineSettings) + environment_parameters: Optional[Dict[str, EnvironmentParameterSettings]] = None + checkpoint_settings: CheckpointSettings = attr.ib(factory=CheckpointSettings) + torch_settings: TorchSettings = attr.ib(factory=TorchSettings) + + # These are options that are relevant to the run itself, and not the engine or environment. + # They will be left here. 
+ debug: bool = parser.get_default("debug") + + # Convert to settings while making sure all fields are valid + cattr.register_structure_hook(EnvironmentSettings, strict_to_cls) + cattr.register_structure_hook(EngineSettings, strict_to_cls) + cattr.register_structure_hook(CheckpointSettings, strict_to_cls) + cattr.register_structure_hook_func( + lambda t: t == Dict[str, EnvironmentParameterSettings], + EnvironmentParameterSettings.structure, + ) + cattr.register_structure_hook(Lesson, strict_to_cls) + cattr.register_structure_hook( + ParameterRandomizationSettings, ParameterRandomizationSettings.structure + ) + cattr.register_unstructure_hook( + ParameterRandomizationSettings, ParameterRandomizationSettings.unstructure + ) + cattr.register_structure_hook(TrainerSettings, TrainerSettings.structure) + cattr.register_structure_hook( + TrainerSettings.DefaultTrainerDict, TrainerSettings.dict_to_trainerdict + ) + cattr.register_unstructure_hook(collections.defaultdict, defaultdict_to_dict) + + @staticmethod + def from_argparse(args: argparse.Namespace) -> "RunOptions": + """ + Takes an argparse.Namespace as specified in `parse_command_line`, loads input configuration files + from file paths, and converts to a RunOptions instance. + :param args: collection of command-line parameters passed to mlagents-learn + :return: RunOptions representing the passed in arguments, with trainer config, curriculum and sampler + configs loaded from files. + """ + argparse_args = vars(args) + config_path = StoreConfigFile.trainer_config_path + + # Load YAML + configured_dict: Dict[str, Any] = { + "checkpoint_settings": {}, + "env_settings": {}, + "engine_settings": {}, + "torch_settings": {}, + } + _require_all_behaviors = True + if config_path is not None: + configured_dict.update(load_config(config_path)) + else: + # If we're not loading from a file, we don't require all behavior names to be specified. + _require_all_behaviors = False + + # Use the YAML file values for all values not specified in the CLI. + for key in configured_dict.keys(): + # Detect bad config options + if key not in attr.fields_dict(RunOptions): + raise TrainerConfigError( + "The option {} was specified in your YAML file, but is invalid.".format( + key + ) + ) + + # Override with CLI args + # Keep deprecated --load working, TODO: remove + argparse_args["resume"] = argparse_args["resume"] or argparse_args["load_model"] + + for key, val in argparse_args.items(): + if key in DetectDefault.non_default_args: + if key in attr.fields_dict(CheckpointSettings): + configured_dict["checkpoint_settings"][key] = val + elif key in attr.fields_dict(EnvironmentSettings): + configured_dict["env_settings"][key] = val + elif key in attr.fields_dict(EngineSettings): + configured_dict["engine_settings"][key] = val + elif key in attr.fields_dict(TorchSettings): + configured_dict["torch_settings"][key] = val + else: # Base options + configured_dict[key] = val + + final_runoptions = RunOptions.from_dict(configured_dict) + final_runoptions.checkpoint_settings.prioritize_resume_init() + # Need check to bypass type checking but keep structure on dict working + if isinstance(final_runoptions.behaviors, TrainerSettings.DefaultTrainerDict): + # configure whether or not we should require all behavior names to be found in the config YAML + final_runoptions.behaviors.set_config_specified(_require_all_behaviors) + + _non_default_args = DetectDefault.non_default_args + + # Prioritize the deterministic mode from the cli for deterministic actions. 
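# from_argparse above hands the merged CLI/YAML dict to RunOptions.from_dict, which
# cattr-structures each section into the settings classes defined in this file. A minimal
# sketch, assuming mlagents is importable; the run id and flag are made up:
from mlagents.trainers.settings import RunOptions

opts = RunOptions.from_dict({"checkpoint_settings": {"run_id": "my_run", "force": True}})
print(opts.checkpoint_settings.write_path)  # <results_dir>/my_run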
+ if "deterministic" in _non_default_args: + for behaviour in final_runoptions.behaviors.keys(): + final_runoptions.behaviors[ + behaviour + ].network_settings.deterministic = argparse_args["deterministic"] + + return final_runoptions + + @staticmethod + def from_dict( + options_dict: Dict[str, Any], + ) -> "RunOptions": + # If a default settings was specified, set the TrainerSettings class override + if ( + "default_settings" in options_dict.keys() + and options_dict["default_settings"] is not None + ): + TrainerSettings.default_override = cattr.structure( + options_dict["default_settings"], TrainerSettings + ) + return cattr.structure(options_dict, RunOptions) diff --git a/MLPY/Lib/site-packages/mlagents/trainers/simple_env_manager.py b/MLPY/Lib/site-packages/mlagents/trainers/simple_env_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..b29fd1ad3b969e33512171090f45f57f0037e305 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/simple_env_manager.py @@ -0,0 +1,84 @@ +from typing import Dict, List + +from mlagents_envs.base_env import BaseEnv, BehaviorName, BehaviorSpec +from mlagents.trainers.env_manager import EnvManager, EnvironmentStep, AllStepResult +from mlagents_envs.timers import timed +from mlagents.trainers.action_info import ActionInfo +from mlagents.trainers.settings import ParameterRandomizationSettings +from mlagents_envs.side_channel.environment_parameters_channel import ( + EnvironmentParametersChannel, +) + + +class SimpleEnvManager(EnvManager): + """ + Simple implementation of the EnvManager interface that only handles one BaseEnv at a time. + This is generally only useful for testing; see SubprocessEnvManager for a production-quality implementation. + """ + + def __init__(self, env: BaseEnv, env_params: EnvironmentParametersChannel): + super().__init__() + self.env_params = env_params + self.env = env + self.previous_step: EnvironmentStep = EnvironmentStep.empty(0) + self.previous_all_action_info: Dict[str, ActionInfo] = {} + + def _step(self) -> List[EnvironmentStep]: + all_action_info = self._take_step(self.previous_step) + self.previous_all_action_info = all_action_info + + for brain_name, action_info in all_action_info.items(): + self.env.set_actions(brain_name, action_info.env_action) + self.env.step() + all_step_result = self._generate_all_results() + + step_info = EnvironmentStep( + all_step_result, 0, self.previous_all_action_info, {} + ) + self.previous_step = step_info + return [step_info] + + def _reset_env( + self, config: Dict[BehaviorName, float] = None + ) -> List[EnvironmentStep]: # type: ignore + self.set_env_parameters(config) + self.env.reset() + all_step_result = self._generate_all_results() + self.previous_step = EnvironmentStep(all_step_result, 0, {}, {}) + return [self.previous_step] + + def set_env_parameters(self, config: Dict = None) -> None: + """ + Sends environment parameter settings to C# via the + EnvironmentParametersSidehannel. 
+ :param config: Dict of environment parameter keys and values + """ + if config is not None: + for k, v in config.items(): + if isinstance(v, float): + self.env_params.set_float_parameter(k, v) + elif isinstance(v, ParameterRandomizationSettings): + v.apply(k, self.env_params) + + @property + def training_behaviors(self) -> Dict[BehaviorName, BehaviorSpec]: + return self.env.behavior_specs + + def close(self): + self.env.close() + + @timed + def _take_step(self, last_step: EnvironmentStep) -> Dict[BehaviorName, ActionInfo]: + all_action_info: Dict[str, ActionInfo] = {} + for brain_name, step_tuple in last_step.current_all_step_result.items(): + all_action_info[brain_name] = self.policies[brain_name].get_action( + step_tuple[0], + 0, # As there is only one worker, we assign the worker_id to 0. + ) + return all_action_info + + def _generate_all_results(self) -> AllStepResult: + all_step_result: AllStepResult = {} + for brain_name in self.env.behavior_specs: + all_step_result[brain_name] = self.env.get_steps(brain_name) + return all_step_result diff --git a/MLPY/Lib/site-packages/mlagents/trainers/stats.py b/MLPY/Lib/site-packages/mlagents/trainers/stats.py new file mode 100644 index 0000000000000000000000000000000000000000..78c86116e2abe2724a0d1ce563b828dec5c5c527 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/stats.py @@ -0,0 +1,393 @@ +from collections import defaultdict +from enum import Enum +from typing import List, Dict, NamedTuple, Any, Optional +import numpy as np +import abc +import os +import time +from threading import RLock + +from mlagents_envs.side_channel.stats_side_channel import StatsAggregationMethod + +from mlagents_envs.logging_util import get_logger +from mlagents_envs.timers import set_gauge +from torch.utils.tensorboard import SummaryWriter +from mlagents.torch_utils.globals import get_rank + +logger = get_logger(__name__) + + +def _dict_to_str(param_dict: Dict[str, Any], num_tabs: int) -> str: + """ + Takes a parameter dictionary and converts it to a human-readable string. + Recurses if there are multiple levels of dict. Used to print out hyperparameters. + + :param param_dict: A Dictionary of key, value parameters. + :return: A string version of this dictionary. + """ + if not isinstance(param_dict, dict): + return str(param_dict) + else: + append_newline = "\n" if num_tabs > 0 else "" + return append_newline + "\n".join( + [ + "\t" + + " " * num_tabs + + f"{x}:\t{_dict_to_str(param_dict[x], num_tabs + 1)}" + for x in param_dict + ] + ) + + +class StatsSummary(NamedTuple): + full_dist: List[float] + aggregation_method: StatsAggregationMethod + + @staticmethod + def empty() -> "StatsSummary": + return StatsSummary([], StatsAggregationMethod.AVERAGE) + + @property + def aggregated_value(self): + if self.aggregation_method == StatsAggregationMethod.SUM: + return self.sum + else: + return self.mean + + @property + def mean(self): + return np.mean(self.full_dist) + + @property + def std(self): + return np.std(self.full_dist) + + @property + def num(self): + return len(self.full_dist) + + @property + def sum(self): + return np.sum(self.full_dist) + + +class StatsPropertyType(Enum): + HYPERPARAMETERS = "hyperparameters" + SELF_PLAY = "selfplay" + + +class StatsWriter(abc.ABC): + """ + A StatsWriter abstract class. A StatsWriter takes in a category, key, scalar value, and step + and writes it out by some method. 
+ """ + + def on_add_stat( + self, + category: str, + key: str, + value: float, + aggregation: StatsAggregationMethod = StatsAggregationMethod.AVERAGE, + ) -> None: + """ + Callback method for handling an individual stat value as reported to the StatsReporter add_stat + or set_stat methods. + + :param category: Category of the statistics. Usually this is the behavior name. + :param key: The type of statistic, e.g. Environment/Reward. + :param value: The value of the statistic. + :param aggregation: The aggregation method for the statistic, default StatsAggregationMethod.AVERAGE. + """ + pass + + @abc.abstractmethod + def write_stats( + self, category: str, values: Dict[str, StatsSummary], step: int + ) -> None: + """ + Callback to record training information + :param category: Category of the statistics. Usually this is the behavior name. + :param values: Dictionary of statistics. + :param step: The current training step. + :return: + """ + pass + + def add_property( + self, category: str, property_type: StatsPropertyType, value: Any + ) -> None: + """ + Add a generic property to the StatsWriter. This could be e.g. a Dict of hyperparameters, + a max step count, a trainer type, etc. Note that not all StatsWriters need to be compatible + with all types of properties. For instance, a TB writer doesn't need a max step. + + :param category: The category that the property belongs to. + :param property_type: The type of property. + :param value: The property itself. + """ + pass + + +class GaugeWriter(StatsWriter): + """ + Write all stats that we receive to the timer gauges, so we can track them offline easily + """ + + @staticmethod + def sanitize_string(s: str) -> str: + """ + Clean up special characters in the category and value names. + """ + return s.replace("/", ".").replace(" ", "") + + def write_stats( + self, category: str, values: Dict[str, StatsSummary], step: int + ) -> None: + for val, stats_summary in values.items(): + set_gauge( + GaugeWriter.sanitize_string(f"{category}.{val}.mean"), + float(stats_summary.mean), + ) + set_gauge( + GaugeWriter.sanitize_string(f"{category}.{val}.sum"), + float(stats_summary.sum), + ) + + +class ConsoleWriter(StatsWriter): + def __init__(self): + self.training_start_time = time.time() + # If self-play, we want to print ELO as well as reward + self.self_play = False + self.self_play_team = -1 + self.rank = get_rank() + + def write_stats( + self, category: str, values: Dict[str, StatsSummary], step: int + ) -> None: + is_training = "Not Training" + if "Is Training" in values: + stats_summary = values["Is Training"] + if stats_summary.aggregated_value > 0.0: + is_training = "Training" + + elapsed_time = time.time() - self.training_start_time + log_info: List[str] = [category] + log_info.append(f"Step: {step}") + log_info.append(f"Time Elapsed: {elapsed_time:0.3f} s") + if "Environment/Cumulative Reward" in values: + stats_summary = values["Environment/Cumulative Reward"] + if self.rank is not None: + log_info.append(f"Rank: {self.rank}") + + log_info.append(f"Mean Reward: {stats_summary.mean:0.3f}") + if "Environment/Group Cumulative Reward" in values: + group_stats_summary = values["Environment/Group Cumulative Reward"] + log_info.append(f"Mean Group Reward: {group_stats_summary.mean:0.3f}") + else: + log_info.append(f"Std of Reward: {stats_summary.std:0.3f}") + log_info.append(is_training) + + if self.self_play and "Self-play/ELO" in values: + elo_stats = values["Self-play/ELO"] + log_info.append(f"ELO: {elo_stats.mean:0.3f}") + else: + 
log_info.append("No episode was completed since last summary") + log_info.append(is_training) + logger.info(". ".join(log_info) + ".") + + def add_property( + self, category: str, property_type: StatsPropertyType, value: Any + ) -> None: + if property_type == StatsPropertyType.HYPERPARAMETERS: + logger.info( + """Hyperparameters for behavior name {}: \n{}""".format( + category, _dict_to_str(value, 0) + ) + ) + elif property_type == StatsPropertyType.SELF_PLAY: + assert isinstance(value, bool) + self.self_play = value + + +class TensorboardWriter(StatsWriter): + def __init__( + self, + base_dir: str, + clear_past_data: bool = False, + hidden_keys: Optional[List[str]] = None, + ): + """ + A StatsWriter that writes to a Tensorboard summary. + + :param base_dir: The directory within which to place all the summaries. Tensorboard files will be written to a + {base_dir}/{category} directory. + :param clear_past_data: Whether or not to clean up existing Tensorboard files associated with the base_dir and + category. + :param hidden_keys: If provided, Tensorboard Writer won't write statistics identified with these Keys in + Tensorboard summary. + """ + self.summary_writers: Dict[str, SummaryWriter] = {} + self.base_dir: str = base_dir + self._clear_past_data = clear_past_data + self.hidden_keys: List[str] = hidden_keys if hidden_keys is not None else [] + + def write_stats( + self, category: str, values: Dict[str, StatsSummary], step: int + ) -> None: + self._maybe_create_summary_writer(category) + for key, value in values.items(): + if key in self.hidden_keys: + continue + self.summary_writers[category].add_scalar( + f"{key}", value.aggregated_value, step + ) + if value.aggregation_method == StatsAggregationMethod.HISTOGRAM: + self.summary_writers[category].add_histogram( + f"{key}_hist", np.array(value.full_dist), step + ) + self.summary_writers[category].flush() + + def _maybe_create_summary_writer(self, category: str) -> None: + if category not in self.summary_writers: + filewriter_dir = "{basedir}/{category}".format( + basedir=self.base_dir, category=category + ) + os.makedirs(filewriter_dir, exist_ok=True) + if self._clear_past_data: + self._delete_all_events_files(filewriter_dir) + self.summary_writers[category] = SummaryWriter(filewriter_dir) + + def _delete_all_events_files(self, directory_name: str) -> None: + for file_name in os.listdir(directory_name): + if file_name.startswith("events.out"): + logger.warning( + f"Deleting TensorBoard data {file_name} that was left over from a " + "previous run." + ) + full_fname = os.path.join(directory_name, file_name) + try: + os.remove(full_fname) + except OSError: + logger.error( + "{} was left over from a previous run and " + "not deleted.".format(full_fname) + ) + + def add_property( + self, category: str, property_type: StatsPropertyType, value: Any + ) -> None: + if property_type == StatsPropertyType.HYPERPARAMETERS: + assert isinstance(value, dict) + summary = _dict_to_str(value, 0) + self._maybe_create_summary_writer(category) + if summary is not None: + self.summary_writers[category].add_text("Hyperparameters", summary) + self.summary_writers[category].flush() + + +class StatsReporter: + writers: List[StatsWriter] = [] + stats_dict: Dict[str, Dict[str, List]] = defaultdict(lambda: defaultdict(list)) + lock = RLock() + stats_aggregation: Dict[str, Dict[str, StatsAggregationMethod]] = defaultdict( + lambda: defaultdict(lambda: StatsAggregationMethod.AVERAGE) + ) + + def __init__(self, category: str): + """ + Generic StatsReporter. 
A category is the broadest type of storage (would + correspond the run name and trainer name, e.g. 3DBalltest_3DBall. A key is the + type of stat it is (e.g. Environment/Reward). Finally the Value is the float value + attached to this stat. + """ + self.category: str = category + + @staticmethod + def add_writer(writer: StatsWriter) -> None: + with StatsReporter.lock: + StatsReporter.writers.append(writer) + + def add_property(self, property_type: StatsPropertyType, value: Any) -> None: + """ + Add a generic property to the StatsReporter. This could be e.g. a Dict of hyperparameters, + a max step count, a trainer type, etc. Note that not all StatsWriters need to be compatible + with all types of properties. For instance, a TB writer doesn't need a max step. + + :param property_type: The type of property. + :param value: The property itself. + """ + with StatsReporter.lock: + for writer in StatsReporter.writers: + writer.add_property(self.category, property_type, value) + + def add_stat( + self, + key: str, + value: float, + aggregation: StatsAggregationMethod = StatsAggregationMethod.AVERAGE, + ) -> None: + """ + Add a float value stat to the StatsReporter. + + :param key: The type of statistic, e.g. Environment/Reward. + :param value: the value of the statistic. + :param aggregation: the aggregation method for the statistic, default StatsAggregationMethod.AVERAGE. + """ + with StatsReporter.lock: + StatsReporter.stats_dict[self.category][key].append(value) + StatsReporter.stats_aggregation[self.category][key] = aggregation + for writer in StatsReporter.writers: + writer.on_add_stat(self.category, key, value, aggregation) + + def set_stat(self, key: str, value: float) -> None: + """ + Sets a stat value to a float. This is for values that we don't want to average, and just + want the latest. + + :param key: The type of statistic, e.g. Environment/Reward. + :param value: the value of the statistic. + """ + with StatsReporter.lock: + StatsReporter.stats_dict[self.category][key] = [value] + StatsReporter.stats_aggregation[self.category][ + key + ] = StatsAggregationMethod.MOST_RECENT + for writer in StatsReporter.writers: + writer.on_add_stat( + self.category, key, value, StatsAggregationMethod.MOST_RECENT + ) + + def write_stats(self, step: int) -> None: + """ + Write out all stored statistics that fall under the category specified. + The currently stored values will be averaged, written out as a single value, + and the buffer cleared. + + :param step: Training step which to write these stats as. + """ + with StatsReporter.lock: + values: Dict[str, StatsSummary] = {} + for key in StatsReporter.stats_dict[self.category]: + if len(StatsReporter.stats_dict[self.category][key]) > 0: + stat_summary = self.get_stats_summaries(key) + values[key] = stat_summary + for writer in StatsReporter.writers: + writer.write_stats(self.category, values, step) + del StatsReporter.stats_dict[self.category] + + def get_stats_summaries(self, key: str) -> StatsSummary: + """ + Get the mean, std, count, sum and aggregation method of a particular statistic, since last write. + + :param key: The type of statistic, e.g. Environment/Reward. + :returns: A StatsSummary containing summary statistics. 
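+ For example, after several add_stat("Environment/Reward", value) calls, calling this with key "Environment/Reward" returns a StatsSummary built from every value recorded since the last write_stats.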
+ """ + stat_values = StatsReporter.stats_dict[self.category][key] + if len(stat_values) == 0: + return StatsSummary.empty() + + return StatsSummary( + full_dist=stat_values, + aggregation_method=StatsReporter.stats_aggregation[self.category][key], + ) diff --git a/MLPY/Lib/site-packages/mlagents/trainers/subprocess_env_manager.py b/MLPY/Lib/site-packages/mlagents/trainers/subprocess_env_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..43d468f2bc338549d5fe95659804d31ae7602000 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/subprocess_env_manager.py @@ -0,0 +1,546 @@ +import datetime +from typing import Dict, NamedTuple, List, Any, Optional, Callable, Set +import cloudpickle +import enum +import time + +from mlagents_envs.environment import UnityEnvironment +from mlagents_envs.exception import ( + UnityCommunicationException, + UnityTimeOutException, + UnityEnvironmentException, + UnityCommunicatorStoppedException, +) +from multiprocessing import Process, Pipe, Queue +from multiprocessing.connection import Connection +from queue import Empty as EmptyQueueException +from mlagents_envs.base_env import BaseEnv, BehaviorName, BehaviorSpec +from mlagents_envs import logging_util +from mlagents.trainers.env_manager import EnvManager, EnvironmentStep, AllStepResult +from mlagents.trainers.settings import TrainerSettings +from mlagents_envs.timers import ( + TimerNode, + timed, + hierarchical_timer, + reset_timers, + get_timer_root, +) +from mlagents.trainers.settings import ParameterRandomizationSettings, RunOptions +from mlagents.trainers.action_info import ActionInfo +from mlagents_envs.side_channel.environment_parameters_channel import ( + EnvironmentParametersChannel, +) +from mlagents_envs.side_channel.engine_configuration_channel import ( + EngineConfigurationChannel, + EngineConfig, +) +from mlagents_envs.side_channel.stats_side_channel import ( + EnvironmentStats, + StatsSideChannel, +) +from mlagents.trainers.training_analytics_side_channel import ( + TrainingAnalyticsSideChannel, +) +from mlagents_envs.side_channel.side_channel import SideChannel + + +logger = logging_util.get_logger(__name__) +WORKER_SHUTDOWN_TIMEOUT_S = 10 + + +class EnvironmentCommand(enum.Enum): + STEP = 1 + BEHAVIOR_SPECS = 2 + ENVIRONMENT_PARAMETERS = 3 + RESET = 4 + CLOSE = 5 + ENV_EXITED = 6 + CLOSED = 7 + TRAINING_STARTED = 8 + + +class EnvironmentRequest(NamedTuple): + cmd: EnvironmentCommand + payload: Any = None + + +class EnvironmentResponse(NamedTuple): + cmd: EnvironmentCommand + worker_id: int + payload: Any + + +class StepResponse(NamedTuple): + all_step_result: AllStepResult + timer_root: Optional[TimerNode] + environment_stats: EnvironmentStats + + +class UnityEnvWorker: + def __init__(self, process: Process, worker_id: int, conn: Connection): + self.process = process + self.worker_id = worker_id + self.conn = conn + self.previous_step: EnvironmentStep = EnvironmentStep.empty(worker_id) + self.previous_all_action_info: Dict[str, ActionInfo] = {} + self.waiting = False + self.closed = False + + def send(self, cmd: EnvironmentCommand, payload: Any = None) -> None: + try: + req = EnvironmentRequest(cmd, payload) + self.conn.send(req) + except (BrokenPipeError, EOFError): + raise UnityCommunicationException("UnityEnvironment worker: send failed.") + + def recv(self) -> EnvironmentResponse: + try: + response: EnvironmentResponse = self.conn.recv() + if response.cmd == EnvironmentCommand.ENV_EXITED: + env_exception: Exception = response.payload + raise 
env_exception + return response + except (BrokenPipeError, EOFError): + raise UnityCommunicationException("UnityEnvironment worker: recv failed.") + + def request_close(self): + try: + self.conn.send(EnvironmentRequest(EnvironmentCommand.CLOSE)) + except (BrokenPipeError, EOFError): + logger.debug( + f"UnityEnvWorker {self.worker_id} got exception trying to close." + ) + pass + + +def worker( + parent_conn: Connection, + step_queue: Queue, + pickled_env_factory: str, + worker_id: int, + run_options: RunOptions, + log_level: int = logging_util.INFO, +) -> None: + env_factory: Callable[ + [int, List[SideChannel]], UnityEnvironment + ] = cloudpickle.loads(pickled_env_factory) + env_parameters = EnvironmentParametersChannel() + + engine_config = EngineConfig( + width=run_options.engine_settings.width, + height=run_options.engine_settings.height, + quality_level=run_options.engine_settings.quality_level, + time_scale=run_options.engine_settings.time_scale, + target_frame_rate=run_options.engine_settings.target_frame_rate, + capture_frame_rate=run_options.engine_settings.capture_frame_rate, + ) + engine_configuration_channel = EngineConfigurationChannel() + engine_configuration_channel.set_configuration(engine_config) + + stats_channel = StatsSideChannel() + training_analytics_channel: Optional[TrainingAnalyticsSideChannel] = None + if worker_id == 0: + training_analytics_channel = TrainingAnalyticsSideChannel() + env: UnityEnvironment = None + # Set log level. On some platforms, the logger isn't common with the + # main process, so we need to set it again. + logging_util.set_log_level(log_level) + + def _send_response(cmd_name: EnvironmentCommand, payload: Any) -> None: + parent_conn.send(EnvironmentResponse(cmd_name, worker_id, payload)) + + def _generate_all_results() -> AllStepResult: + all_step_result: AllStepResult = {} + for brain_name in env.behavior_specs: + all_step_result[brain_name] = env.get_steps(brain_name) + return all_step_result + + try: + side_channels = [env_parameters, engine_configuration_channel, stats_channel] + if training_analytics_channel is not None: + side_channels.append(training_analytics_channel) + + env = env_factory(worker_id, side_channels) + if ( + not env.academy_capabilities + or not env.academy_capabilities.trainingAnalytics + ): + # Make sure we don't try to send training analytics if the environment doesn't know how to process + # them. This wouldn't be catastrophic, but would result in unknown SideChannel UUIDs being used. + training_analytics_channel = None + if training_analytics_channel: + training_analytics_channel.environment_initialized(run_options) + + while True: + req: EnvironmentRequest = parent_conn.recv() + if req.cmd == EnvironmentCommand.STEP: + all_action_info = req.payload + for brain_name, action_info in all_action_info.items(): + if len(action_info.agent_ids) > 0: + env.set_actions(brain_name, action_info.env_action) + env.step() + all_step_result = _generate_all_results() + # The timers in this process are independent from all the processes and the "main" process + # So after we send back the root timer, we can safely clear them. + # Note that we could randomly return timers a fraction of the time if we wanted to reduce + # the data transferred. + # TODO get gauges from the workers and merge them in the main process too. 
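+ # Send the step results, this worker's timer tree, and the side-channel env stats back to the
+ # main process via the step queue, then clear the local timers for the next step.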
+ env_stats = stats_channel.get_and_reset_stats() + step_response = StepResponse( + all_step_result, get_timer_root(), env_stats + ) + step_queue.put( + EnvironmentResponse( + EnvironmentCommand.STEP, worker_id, step_response + ) + ) + reset_timers() + elif req.cmd == EnvironmentCommand.BEHAVIOR_SPECS: + _send_response(EnvironmentCommand.BEHAVIOR_SPECS, env.behavior_specs) + elif req.cmd == EnvironmentCommand.ENVIRONMENT_PARAMETERS: + for k, v in req.payload.items(): + if isinstance(v, ParameterRandomizationSettings): + v.apply(k, env_parameters) + elif req.cmd == EnvironmentCommand.TRAINING_STARTED: + behavior_name, trainer_config = req.payload + if training_analytics_channel: + training_analytics_channel.training_started( + behavior_name, trainer_config + ) + elif req.cmd == EnvironmentCommand.RESET: + env.reset() + all_step_result = _generate_all_results() + _send_response(EnvironmentCommand.RESET, all_step_result) + elif req.cmd == EnvironmentCommand.CLOSE: + break + except ( + KeyboardInterrupt, + UnityCommunicationException, + UnityTimeOutException, + UnityEnvironmentException, + UnityCommunicatorStoppedException, + ) as ex: + logger.debug(f"UnityEnvironment worker {worker_id}: environment stopping.") + step_queue.put( + EnvironmentResponse(EnvironmentCommand.ENV_EXITED, worker_id, ex) + ) + _send_response(EnvironmentCommand.ENV_EXITED, ex) + except Exception as ex: + logger.exception( + f"UnityEnvironment worker {worker_id}: environment raised an unexpected exception." + ) + step_queue.put( + EnvironmentResponse(EnvironmentCommand.ENV_EXITED, worker_id, ex) + ) + _send_response(EnvironmentCommand.ENV_EXITED, ex) + finally: + logger.debug(f"UnityEnvironment worker {worker_id} closing.") + if env is not None: + env.close() + logger.debug(f"UnityEnvironment worker {worker_id} done.") + parent_conn.close() + step_queue.put(EnvironmentResponse(EnvironmentCommand.CLOSED, worker_id, None)) + step_queue.close() + + +class SubprocessEnvManager(EnvManager): + def __init__( + self, + env_factory: Callable[[int, List[SideChannel]], BaseEnv], + run_options: RunOptions, + n_env: int = 1, + ): + super().__init__() + self.env_workers: List[UnityEnvWorker] = [] + self.step_queue: Queue = Queue() + self.workers_alive = 0 + self.env_factory = env_factory + self.run_options = run_options + self.env_parameters: Optional[Dict] = None + # Each worker is correlated with a list of times they restarted within the last time period. + self.recent_restart_timestamps: List[List[datetime.datetime]] = [ + [] for _ in range(n_env) + ] + self.restart_counts: List[int] = [0] * n_env + for worker_idx in range(n_env): + self.env_workers.append( + self.create_worker( + worker_idx, self.step_queue, env_factory, run_options + ) + ) + self.workers_alive += 1 + + @staticmethod + def create_worker( + worker_id: int, + step_queue: Queue, + env_factory: Callable[[int, List[SideChannel]], BaseEnv], + run_options: RunOptions, + ) -> UnityEnvWorker: + parent_conn, child_conn = Pipe() + + # Need to use cloudpickle for the env factory function since function objects aren't picklable + # on Windows as of Python 3.6. 
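+ # The worker process unpickles this factory (cloudpickle.loads) before constructing its UnityEnvironment.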
+ pickled_env_factory = cloudpickle.dumps(env_factory) + child_process = Process( + target=worker, + args=( + child_conn, + step_queue, + pickled_env_factory, + worker_id, + run_options, + logger.level, + ), + ) + child_process.start() + return UnityEnvWorker(child_process, worker_id, parent_conn) + + def _queue_steps(self) -> None: + for env_worker in self.env_workers: + if not env_worker.waiting: + env_action_info = self._take_step(env_worker.previous_step) + env_worker.previous_all_action_info = env_action_info + env_worker.send(EnvironmentCommand.STEP, env_action_info) + env_worker.waiting = True + + def _restart_failed_workers(self, first_failure: EnvironmentResponse) -> None: + if first_failure.cmd != EnvironmentCommand.ENV_EXITED: + return + # Drain the step queue to make sure all workers are paused and we have found all concurrent errors. + # Pausing all training is needed since we need to reset all pending training steps as they could be corrupted. + other_failures: Dict[int, Exception] = self._drain_step_queue() + # TODO: Once we use python 3.9 switch to using the | operator to combine dicts. + failures: Dict[int, Exception] = { + **{first_failure.worker_id: first_failure.payload}, + **other_failures, + } + for worker_id, ex in failures.items(): + self._assert_worker_can_restart(worker_id, ex) + logger.warning(f"Restarting worker[{worker_id}] after '{ex}'") + self.recent_restart_timestamps[worker_id].append(datetime.datetime.now()) + self.restart_counts[worker_id] += 1 + self.env_workers[worker_id] = self.create_worker( + worker_id, self.step_queue, self.env_factory, self.run_options + ) + # The restarts were successful, clear all the existing training trajectories so we don't use corrupted or + # outdated data. + self.reset(self.env_parameters) + + def _drain_step_queue(self) -> Dict[int, Exception]: + """ + Drains all steps out of the step queue and returns all exceptions from crashed workers. + This will effectively pause all workers so that they won't do anything until _queue_steps is called. + """ + all_failures = {} + workers_still_pending = {w.worker_id for w in self.env_workers if w.waiting} + deadline = datetime.datetime.now() + datetime.timedelta(minutes=1) + while workers_still_pending and deadline > datetime.datetime.now(): + try: + while True: + step: EnvironmentResponse = self.step_queue.get_nowait() + if step.cmd == EnvironmentCommand.ENV_EXITED: + workers_still_pending.add(step.worker_id) + all_failures[step.worker_id] = step.payload + else: + workers_still_pending.remove(step.worker_id) + self.env_workers[step.worker_id].waiting = False + except EmptyQueueException: + pass + if deadline < datetime.datetime.now(): + still_waiting = {w.worker_id for w in self.env_workers if w.waiting} + raise TimeoutError(f"Workers {still_waiting} stuck in waiting state") + return all_failures + + def _assert_worker_can_restart(self, worker_id: int, exception: Exception) -> None: + """ + Checks if we can recover from an exception from a worker. + If the restart limit is exceeded it will raise a UnityCommunicationException. + If the exception is not recoverable it re-raises the exception. + """ + if ( + isinstance(exception, UnityCommunicationException) + or isinstance(exception, UnityTimeOutException) + or isinstance(exception, UnityEnvironmentException) + or isinstance(exception, UnityCommunicatorStoppedException) + ): + if self._worker_has_restart_quota(worker_id): + return + else: + logger.error( + f"Worker {worker_id} exceeded the allowed number of restarts." 
+ ) + raise exception + raise exception + + def _worker_has_restart_quota(self, worker_id: int) -> bool: + self._drop_old_restart_timestamps(worker_id) + max_lifetime_restarts = self.run_options.env_settings.max_lifetime_restarts + max_limit_check = ( + max_lifetime_restarts == -1 + or self.restart_counts[worker_id] < max_lifetime_restarts + ) + + rate_limit_n = self.run_options.env_settings.restarts_rate_limit_n + rate_limit_check = ( + rate_limit_n == -1 + or len(self.recent_restart_timestamps[worker_id]) < rate_limit_n + ) + + return rate_limit_check and max_limit_check + + def _drop_old_restart_timestamps(self, worker_id: int) -> None: + """ + Drops environment restart timestamps that are outside of the current window. + """ + + def _filter(t: datetime.datetime) -> bool: + return t > datetime.datetime.now() - datetime.timedelta( + seconds=self.run_options.env_settings.restarts_rate_limit_period_s + ) + + self.recent_restart_timestamps[worker_id] = list( + filter(_filter, self.recent_restart_timestamps[worker_id]) + ) + + def _step(self) -> List[EnvironmentStep]: + # Queue steps for any workers which aren't in the "waiting" state. + self._queue_steps() + + worker_steps: List[EnvironmentResponse] = [] + step_workers: Set[int] = set() + # Poll the step queue for completed steps from environment workers until we retrieve + # 1 or more, which we will then return as StepInfos + while len(worker_steps) < 1: + try: + while True: + step: EnvironmentResponse = self.step_queue.get_nowait() + if step.cmd == EnvironmentCommand.ENV_EXITED: + # If even one env exits try to restart all envs that failed. + self._restart_failed_workers(step) + # Clear state and restart this function. + worker_steps.clear() + step_workers.clear() + self._queue_steps() + elif step.worker_id not in step_workers: + self.env_workers[step.worker_id].waiting = False + worker_steps.append(step) + step_workers.add(step.worker_id) + except EmptyQueueException: + pass + step_infos = self._postprocess_steps(worker_steps) + return step_infos + + def _reset_env(self, config: Optional[Dict] = None) -> List[EnvironmentStep]: + while any(ew.waiting for ew in self.env_workers): + if not self.step_queue.empty(): + step = self.step_queue.get_nowait() + self.env_workers[step.worker_id].waiting = False + # Send config to environment + self.set_env_parameters(config) + # First enqueue reset commands for all workers so that they reset in parallel + for ew in self.env_workers: + ew.send(EnvironmentCommand.RESET, config) + # Next (synchronously) collect the reset observations from each worker in sequence + for ew in self.env_workers: + ew.previous_step = EnvironmentStep(ew.recv().payload, ew.worker_id, {}, {}) + return list(map(lambda ew: ew.previous_step, self.env_workers)) + + def set_env_parameters(self, config: Dict = None) -> None: + """ + Sends environment parameter settings to C# via the + EnvironmentParametersSidehannel for each worker. + :param config: Dict of environment parameter keys and values + """ + self.env_parameters = config + for ew in self.env_workers: + ew.send(EnvironmentCommand.ENVIRONMENT_PARAMETERS, config) + + def on_training_started( + self, behavior_name: str, trainer_settings: TrainerSettings + ) -> None: + """ + Handle traing starting for a new behavior type. Generally nothing is necessary here. 
+ :param behavior_name: + :param trainer_settings: + :return: + """ + for ew in self.env_workers: + ew.send( + EnvironmentCommand.TRAINING_STARTED, (behavior_name, trainer_settings) + ) + + @property + def training_behaviors(self) -> Dict[BehaviorName, BehaviorSpec]: + result: Dict[BehaviorName, BehaviorSpec] = {} + for worker in self.env_workers: + worker.send(EnvironmentCommand.BEHAVIOR_SPECS) + result.update(worker.recv().payload) + return result + + def close(self) -> None: + logger.debug("SubprocessEnvManager closing.") + for env_worker in self.env_workers: + env_worker.request_close() + # Pull messages out of the queue until every worker has CLOSED or we time out. + deadline = time.time() + WORKER_SHUTDOWN_TIMEOUT_S + while self.workers_alive > 0 and time.time() < deadline: + try: + step: EnvironmentResponse = self.step_queue.get_nowait() + env_worker = self.env_workers[step.worker_id] + if step.cmd == EnvironmentCommand.CLOSED and not env_worker.closed: + env_worker.closed = True + self.workers_alive -= 1 + # Discard all other messages. + except EmptyQueueException: + pass + self.step_queue.close() + # Sanity check to kill zombie workers and report an issue if they occur. + if self.workers_alive > 0: + logger.error("SubprocessEnvManager had workers that didn't signal shutdown") + for env_worker in self.env_workers: + if not env_worker.closed and env_worker.process.is_alive(): + env_worker.process.terminate() + logger.error( + "A SubprocessEnvManager worker did not shut down correctly so it was forcefully terminated." + ) + self.step_queue.join_thread() + + def _postprocess_steps( + self, env_steps: List[EnvironmentResponse] + ) -> List[EnvironmentStep]: + step_infos = [] + timer_nodes = [] + for step in env_steps: + payload: StepResponse = step.payload + env_worker = self.env_workers[step.worker_id] + new_step = EnvironmentStep( + payload.all_step_result, + step.worker_id, + env_worker.previous_all_action_info, + payload.environment_stats, + ) + step_infos.append(new_step) + env_worker.previous_step = new_step + + if payload.timer_root: + timer_nodes.append(payload.timer_root) + + if timer_nodes: + with hierarchical_timer("workers") as main_timer_node: + for worker_timer_node in timer_nodes: + main_timer_node.merge( + worker_timer_node, root_name="worker_root", is_parallel=True + ) + + return step_infos + + @timed + def _take_step(self, last_step: EnvironmentStep) -> Dict[BehaviorName, ActionInfo]: + all_action_info: Dict[str, ActionInfo] = {} + for brain_name, step_tuple in last_step.current_all_step_result.items(): + if brain_name in self.policies: + all_action_info[brain_name] = self.policies[brain_name].get_action( + step_tuple[0], last_step.worker_id + ) + return all_action_info diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__init__.py b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0520a9f26c8efcbd85d38c83e8b360756c780db3 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/action_flattener.cpython-39.pyc 
b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/action_flattener.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..29b97d7cada3f6d989a0b9c4ff926828cdaedfab Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/action_flattener.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/action_log_probs.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/action_log_probs.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f93a96ce54a950ac103fd105a0ddf0861e9d010f Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/action_log_probs.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/action_model.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/action_model.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..520d19b853c791353a986042a30d7f4c177838ac Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/action_model.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/agent_action.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/agent_action.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9ac7a1e078bb65278c2dbcf60c08a67987804b44 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/agent_action.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/attention.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/attention.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2bebe18d90430fbd28b8613cea34b13e0b721bd1 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/attention.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/conditioning.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/conditioning.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..543a86397ca5dc7fdea7e7f72d572bfa5a719660 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/conditioning.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/decoders.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/decoders.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..de3de327b600f8c06184a6b276b224323e6f1590 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/decoders.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/distributions.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/distributions.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..525905043497d42e75a2b408cdc49fb6f6b6a271 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/distributions.cpython-39.pyc differ diff --git 
a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/encoders.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/encoders.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7e22dc2278e02cc0bce8b53b0a695ceefed95105 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/encoders.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/layers.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/layers.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5fafb38e71b2fc97601c8d15c5d26aa0aebbb00c Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/layers.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/model_serialization.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/model_serialization.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8071dd1776625198c18a5095e8ee888318743d8f Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/model_serialization.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/networks.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/networks.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..24fd5d6115a543bc496e58aa0f3b5aa600018840 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/networks.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/utils.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/utils.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2ed48ded864f83bc0fce6ca308eca335c001ce92 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/utils.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/action_flattener.py b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/action_flattener.py new file mode 100644 index 0000000000000000000000000000000000000000..beb529c96361ffc8cc0d679aad9771a3bd1cfed2 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/action_flattener.py @@ -0,0 +1,44 @@ +from typing import List +from mlagents.torch_utils import torch + +from mlagents_envs.base_env import ActionSpec +from mlagents.trainers.torch_entities.agent_action import AgentAction +from mlagents.trainers.torch_entities.utils import ModelUtils + + +class ActionFlattener: + def __init__(self, action_spec: ActionSpec): + """ + A torch module that creates the flattened form of an AgentAction object. + The flattened form is the continuous action concatenated with the + concatenated one hot encodings of the discrete actions. + :param action_spec: An ActionSpec that describes the action space dimensions + """ + self._specs = action_spec + + @property + def flattened_size(self) -> int: + """ + The flattened size is the continuous size plus the sum of the branch sizes + since discrete actions are encoded as one hots. 
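+ For example, with 2 continuous actions and discrete branches of sizes (3, 2), the flattened size is 2 + 3 + 2 = 7.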
+ """ + return self._specs.continuous_size + sum(self._specs.discrete_branches) + + def forward(self, action: AgentAction) -> torch.Tensor: + """ + Returns a tensor corresponding the flattened action + :param action: An AgentAction object + """ + action_list: List[torch.Tensor] = [] + if self._specs.continuous_size > 0: + action_list.append(action.continuous_tensor) + if self._specs.discrete_size > 0: + flat_discrete = torch.cat( + ModelUtils.actions_to_onehot( + torch.as_tensor(action.discrete_tensor, dtype=torch.long), + self._specs.discrete_branches, + ), + dim=1, + ) + action_list.append(flat_discrete) + return torch.cat(action_list, dim=1) diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/action_log_probs.py b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/action_log_probs.py new file mode 100644 index 0000000000000000000000000000000000000000..b72e7bb22358404f18a5a6c41ac0d93adacfc303 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/action_log_probs.py @@ -0,0 +1,118 @@ +from typing import List, Optional, NamedTuple +from mlagents.torch_utils import torch +import numpy as np + +from mlagents.trainers.torch_entities.utils import ModelUtils +from mlagents.trainers.buffer import AgentBuffer, BufferKey +from mlagents_envs.base_env import _ActionTupleBase + + +class LogProbsTuple(_ActionTupleBase): + """ + An object whose fields correspond to the log probs of actions of different types. + Continuous and discrete are numpy arrays + Dimensions are of (n_agents, continuous_size) and (n_agents, discrete_size), + respectively. Note, this also holds when continuous or discrete size is + zero. + """ + + @property + def discrete_dtype(self) -> np.dtype: + """ + The dtype of a discrete log probability. + """ + return np.float32 + + @staticmethod + def empty_log_probs() -> "LogProbsTuple": + """ + Generates a dummy LogProbsTuple + """ + return LogProbsTuple() + + +class ActionLogProbs(NamedTuple): + """ + A NamedTuple containing the tensor for continuous log probs and list of tensors for + discrete log probs of individual actions as well as all the log probs for an entire branch. + Utility functions provide numpy <=> tensor conversions to be used by the optimizers. + :param continuous_tensor: Torch tensor corresponding to log probs of continuous actions + :param discrete_list: List of Torch tensors each corresponding to log probs of the discrete actions that were + sampled. + :param all_discrete_list: List of Torch tensors each corresponding to all log probs of + a discrete action branch, even the discrete actions that were not sampled. all_discrete_list is a list of Tensors, + each Tensor corresponds to one discrete branch log probabilities. + """ + + continuous_tensor: torch.Tensor + discrete_list: Optional[List[torch.Tensor]] + all_discrete_list: Optional[List[torch.Tensor]] + + @property + def discrete_tensor(self): + """ + Returns the discrete log probs list as a stacked tensor + """ + return torch.stack(self.discrete_list, dim=-1) + + @property + def all_discrete_tensor(self): + """ + Returns the discrete log probs of each branch as a tensor + """ + return torch.cat(self.all_discrete_list, dim=1) + + def to_log_probs_tuple(self) -> LogProbsTuple: + """ + Returns a LogProbsTuple. Only adds if tensor is not None. Otherwise, + LogProbsTuple uses a default. 
+ """ + log_probs_tuple = LogProbsTuple() + if self.continuous_tensor is not None: + continuous = ModelUtils.to_numpy(self.continuous_tensor) + log_probs_tuple.add_continuous(continuous) + if self.discrete_list is not None: + discrete = ModelUtils.to_numpy(self.discrete_tensor) + log_probs_tuple.add_discrete(discrete) + return log_probs_tuple + + def _to_tensor_list(self) -> List[torch.Tensor]: + """ + Returns the tensors in the ActionLogProbs as a flat List of torch Tensors. This + is private and serves as a utility for self.flatten() + """ + tensor_list: List[torch.Tensor] = [] + if self.continuous_tensor is not None: + tensor_list.append(self.continuous_tensor) + if self.discrete_list is not None: + tensor_list.append(self.discrete_tensor) + return tensor_list + + def flatten(self) -> torch.Tensor: + """ + A utility method that returns all log probs in ActionLogProbs as a flattened tensor. + This is useful for algorithms like PPO which can treat all log probs in the same way. + """ + return torch.cat(self._to_tensor_list(), dim=1) + + @staticmethod + def from_buffer(buff: AgentBuffer) -> "ActionLogProbs": + """ + A static method that accesses continuous and discrete log probs fields in an AgentBuffer + and constructs the corresponding ActionLogProbs from the retrieved np arrays. + """ + continuous: torch.Tensor = None + discrete: List[torch.Tensor] = None # type: ignore + + if BufferKey.CONTINUOUS_LOG_PROBS in buff: + continuous = ModelUtils.list_to_tensor(buff[BufferKey.CONTINUOUS_LOG_PROBS]) + if BufferKey.DISCRETE_LOG_PROBS in buff: + discrete_tensor = ModelUtils.list_to_tensor( + buff[BufferKey.DISCRETE_LOG_PROBS] + ) + # This will keep discrete_list = None which enables flatten() + if discrete_tensor.shape[1] > 0: + discrete = [ + discrete_tensor[..., i] for i in range(discrete_tensor.shape[-1]) + ] + return ActionLogProbs(continuous, discrete, None) diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/action_model.py b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/action_model.py new file mode 100644 index 0000000000000000000000000000000000000000..7b88c0262d35c02286b3bae894569ce5330640f9 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/action_model.py @@ -0,0 +1,231 @@ +from typing import List, Tuple, NamedTuple, Optional +from mlagents.torch_utils import torch, nn +from mlagents.trainers.torch_entities.distributions import ( + DistInstance, + DiscreteDistInstance, + GaussianDistribution, + MultiCategoricalDistribution, +) +from mlagents.trainers.torch_entities.agent_action import AgentAction +from mlagents.trainers.torch_entities.action_log_probs import ActionLogProbs +from mlagents_envs.base_env import ActionSpec + + +EPSILON = 1e-7 # Small value to avoid divide by zero + + +class DistInstances(NamedTuple): + """ + A NamedTuple with fields corresponding the the DistInstance objects + output by continuous and discrete distributions, respectively. Discrete distributions + output a list of DistInstance objects whereas continuous distributions output a single + DistInstance object. + """ + + continuous: Optional[DistInstance] + discrete: Optional[List[DiscreteDistInstance]] + + +class ActionModel(nn.Module): + def __init__( + self, + hidden_size: int, + action_spec: ActionSpec, + conditional_sigma: bool = False, + tanh_squash: bool = False, + deterministic: bool = False, + ): + """ + A torch module that represents the action space of a policy. 
The ActionModel may contain + a continuous distribution, a discrete distribution or both where construction depends on + the action_spec. The ActionModel uses the encoded input of the network body to parameterize + these distributions. The forward method of this module outputs the action, log probs, + and entropies given the encoding from the network body. + :params hidden_size: Size of the input to the ActionModel. + :params action_spec: The ActionSpec defining the action space dimensions and distributions. + :params conditional_sigma: Whether or not the std of a Gaussian is conditioned on state. + :params tanh_squash: Whether to squash the output of a Gaussian with the tanh function. + :params deterministic: Whether to select actions deterministically in policy. + """ + super().__init__() + self.encoding_size = hidden_size + self.action_spec = action_spec + self._continuous_distribution = None + self._discrete_distribution = None + + if self.action_spec.continuous_size > 0: + self._continuous_distribution = GaussianDistribution( + self.encoding_size, + self.action_spec.continuous_size, + conditional_sigma=conditional_sigma, + tanh_squash=tanh_squash, + ) + + if self.action_spec.discrete_size > 0: + self._discrete_distribution = MultiCategoricalDistribution( + self.encoding_size, self.action_spec.discrete_branches + ) + + # During training, clipping is done in TorchPolicy, but we need to clip before ONNX + # export as well. + self.clip_action = not tanh_squash + self._deterministic = deterministic + + def _sample_action(self, dists: DistInstances) -> AgentAction: + """ + Samples actions from a DistInstances tuple + :params dists: The DistInstances tuple + :return: An AgentAction corresponding to the actions sampled from the DistInstances + """ + + continuous_action: Optional[torch.Tensor] = None + discrete_action: Optional[List[torch.Tensor]] = None + # This checks None because mypy complains otherwise + if dists.continuous is not None: + if self._deterministic: + continuous_action = dists.continuous.deterministic_sample() + else: + continuous_action = dists.continuous.sample() + if dists.discrete is not None: + discrete_action = [] + if self._deterministic: + for discrete_dist in dists.discrete: + discrete_action.append(discrete_dist.deterministic_sample()) + else: + for discrete_dist in dists.discrete: + discrete_action.append(discrete_dist.sample()) + return AgentAction(continuous_action, discrete_action) + + def _get_dists(self, inputs: torch.Tensor, masks: torch.Tensor) -> DistInstances: + """ + Creates a DistInstances tuple using the continuous and discrete distributions + :params inputs: The encoding from the network body + :params masks: Action masks for discrete actions + :return: A DistInstances tuple + """ + continuous_dist: Optional[DistInstance] = None + discrete_dist: Optional[List[DiscreteDistInstance]] = None + # This checks None because mypy complains otherwise + if self._continuous_distribution is not None: + continuous_dist = self._continuous_distribution(inputs) + if self._discrete_distribution is not None: + discrete_dist = self._discrete_distribution(inputs, masks) + return DistInstances(continuous_dist, discrete_dist) + + def _get_probs_and_entropy( + self, actions: AgentAction, dists: DistInstances + ) -> Tuple[ActionLogProbs, torch.Tensor]: + """ + Computes the log probabilites of the actions given distributions and entropies of + the given distributions. 
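+ The returned entropy tensor is the per-distribution entropies concatenated along dim 1 (continuous first, then one entry per discrete branch).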
+ :params actions: The AgentAction + :params dists: The DistInstances tuple + :return: An ActionLogProbs tuple and a torch tensor of the distribution entropies. + """ + entropies_list: List[torch.Tensor] = [] + continuous_log_prob: Optional[torch.Tensor] = None + discrete_log_probs: Optional[List[torch.Tensor]] = None + all_discrete_log_probs: Optional[List[torch.Tensor]] = None + # This checks None because mypy complains otherwise + if dists.continuous is not None: + continuous_log_prob = dists.continuous.log_prob(actions.continuous_tensor) + entropies_list.append(dists.continuous.entropy()) + if dists.discrete is not None: + discrete_log_probs = [] + all_discrete_log_probs = [] + for discrete_action, discrete_dist in zip( + actions.discrete_list, dists.discrete # type: ignore + ): + discrete_log_prob = discrete_dist.log_prob(discrete_action) + entropies_list.append(discrete_dist.entropy()) + discrete_log_probs.append(discrete_log_prob) + all_discrete_log_probs.append(discrete_dist.all_log_prob()) + action_log_probs = ActionLogProbs( + continuous_log_prob, discrete_log_probs, all_discrete_log_probs + ) + entropies = torch.cat(entropies_list, dim=1) + return action_log_probs, entropies + + def evaluate( + self, inputs: torch.Tensor, masks: torch.Tensor, actions: AgentAction + ) -> Tuple[ActionLogProbs, torch.Tensor]: + """ + Given actions and encoding from the network body, gets the distributions and + computes the log probabilites and entropies. + :params inputs: The encoding from the network body + :params masks: Action masks for discrete actions + :params actions: The AgentAction + :return: An ActionLogProbs tuple and a torch tensor of the distribution entropies. + """ + dists = self._get_dists(inputs, masks) + log_probs, entropies = self._get_probs_and_entropy(actions, dists) + # Use the sum of entropy across actions, not the mean + entropy_sum = torch.sum(entropies, dim=1) + return log_probs, entropy_sum + + def get_action_out(self, inputs: torch.Tensor, masks: torch.Tensor) -> torch.Tensor: + """ + Gets the tensors corresponding to the output of the policy network to be used for + inference. Called by the Actor's forward call. 
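+ When clip_action is set (i.e. tanh_squash is False), continuous outputs are clamped to [-3, 3] and rescaled into [-1, 1] before being returned.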
+ :params inputs: The encoding from the network body + :params masks: Action masks for discrete actions + :return: A tuple of torch tensors corresponding to the inference output + """ + dists = self._get_dists(inputs, masks) + continuous_out, discrete_out, action_out_deprecated = None, None, None + deterministic_continuous_out, deterministic_discrete_out = ( + None, + None, + ) # deterministic actions + if self.action_spec.continuous_size > 0 and dists.continuous is not None: + continuous_out = dists.continuous.exported_model_output() + action_out_deprecated = continuous_out + deterministic_continuous_out = dists.continuous.deterministic_sample() + if self.clip_action: + continuous_out = torch.clamp(continuous_out, -3, 3) / 3 + action_out_deprecated = continuous_out + deterministic_continuous_out = ( + torch.clamp(deterministic_continuous_out, -3, 3) / 3 + ) + if self.action_spec.discrete_size > 0 and dists.discrete is not None: + discrete_out_list = [ + discrete_dist.exported_model_output() + for discrete_dist in dists.discrete + ] + discrete_out = torch.cat(discrete_out_list, dim=1) + action_out_deprecated = torch.cat(discrete_out_list, dim=1) + deterministic_discrete_out_list = [ + discrete_dist.deterministic_sample() for discrete_dist in dists.discrete + ] + deterministic_discrete_out = torch.cat( + deterministic_discrete_out_list, dim=1 + ) + + # deprecated action field does not support hybrid action + if self.action_spec.continuous_size > 0 and self.action_spec.discrete_size > 0: + action_out_deprecated = None + return ( + continuous_out, + discrete_out, + action_out_deprecated, + deterministic_continuous_out, + deterministic_discrete_out, + ) + + def forward( + self, inputs: torch.Tensor, masks: torch.Tensor + ) -> Tuple[AgentAction, ActionLogProbs, torch.Tensor]: + """ + The forward method of this module. Outputs the action, log probs, + and entropies given the encoding from the network body. + :params inputs: The encoding from the network body + :params masks: Action masks for discrete actions + :return: Given the input, an AgentAction of the actions generated by the policy and the corresponding + ActionLogProbs and entropies. + """ + dists = self._get_dists(inputs, masks) + actions = self._sample_action(dists) + log_probs, entropies = self._get_probs_and_entropy(actions, dists) + # Use the sum of entropy across actions, not the mean + entropy_sum = torch.sum(entropies, dim=1) + return (actions, log_probs, entropy_sum) diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/agent_action.py b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/agent_action.py new file mode 100644 index 0000000000000000000000000000000000000000..1ecc995a55eec6d5237ee358ba264773605fbf55 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/agent_action.py @@ -0,0 +1,157 @@ +from typing import List, Optional, NamedTuple +import itertools +import numpy as np +from mlagents.torch_utils import torch + +from mlagents.trainers.buffer import AgentBuffer, BufferKey +from mlagents.trainers.torch_entities.utils import ModelUtils +from mlagents_envs.base_env import ActionTuple + + +class AgentAction(NamedTuple): + """ + A NamedTuple containing the tensor for continuous actions and list of tensors for + discrete actions. Utility functions provide numpy <=> tensor conversions to be + sent as actions to the environment manager as well as used by the optimizers. 
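+ Either field may be unused depending on the action space; discrete_tensor falls back to an empty tensor when no discrete actions are present.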
+ :param continuous_tensor: Torch tensor corresponding to continuous actions + :param discrete_list: List of Torch tensors each corresponding to discrete actions + """ + + continuous_tensor: torch.Tensor + discrete_list: Optional[List[torch.Tensor]] + + @property + def discrete_tensor(self) -> torch.Tensor: + """ + Returns the discrete action list as a stacked tensor + """ + if self.discrete_list is not None and len(self.discrete_list) > 0: + return torch.stack(self.discrete_list, dim=-1) + else: + return torch.empty(0) + + def slice(self, start: int, end: int) -> "AgentAction": + """ + Returns an AgentAction with the continuous and discrete tensors slices + from index start to index end. + """ + _cont = None + _disc_list = [] + if self.continuous_tensor is not None: + _cont = self.continuous_tensor[start:end] + if self.discrete_list is not None and len(self.discrete_list) > 0: + for _disc in self.discrete_list: + _disc_list.append(_disc[start:end]) + return AgentAction(_cont, _disc_list) + + def to_action_tuple(self, clip: bool = False) -> ActionTuple: + """ + Returns an ActionTuple + """ + action_tuple = ActionTuple() + if self.continuous_tensor is not None: + _continuous_tensor = self.continuous_tensor + if clip: + _continuous_tensor = torch.clamp(_continuous_tensor, -3, 3) / 3 + continuous = ModelUtils.to_numpy(_continuous_tensor) + action_tuple.add_continuous(continuous) + if self.discrete_list is not None: + discrete = ModelUtils.to_numpy(self.discrete_tensor[:, 0, :]) + action_tuple.add_discrete(discrete) + return action_tuple + + @staticmethod + def from_buffer(buff: AgentBuffer) -> "AgentAction": + """ + A static method that accesses continuous and discrete action fields in an AgentBuffer + and constructs the corresponding AgentAction from the retrieved np arrays. + """ + continuous: torch.Tensor = None + discrete: List[torch.Tensor] = None # type: ignore + if BufferKey.CONTINUOUS_ACTION in buff: + continuous = ModelUtils.list_to_tensor(buff[BufferKey.CONTINUOUS_ACTION]) + if BufferKey.DISCRETE_ACTION in buff: + discrete_tensor = ModelUtils.list_to_tensor( + buff[BufferKey.DISCRETE_ACTION], dtype=torch.long + ) + discrete = [ + discrete_tensor[..., i] for i in range(discrete_tensor.shape[-1]) + ] + return AgentAction(continuous, discrete) + + @staticmethod + def _group_agent_action_from_buffer( + buff: AgentBuffer, cont_action_key: BufferKey, disc_action_key: BufferKey + ) -> List["AgentAction"]: + """ + Extracts continuous and discrete groupmate actions, as specified by BufferKey, and + returns a List of AgentActions that correspond to the groupmate's actions. List will + be of length equal to the maximum number of groupmates in the buffer. Any spots where + there are less agents than maximum, the actions will be padded with 0's. 
+ """ + continuous_tensors: List[torch.Tensor] = [] + discrete_tensors: List[torch.Tensor] = [] + if cont_action_key in buff: + padded_batch = buff[cont_action_key].padded_to_batch() + continuous_tensors = [ + ModelUtils.list_to_tensor(arr) for arr in padded_batch + ] + if disc_action_key in buff: + padded_batch = buff[disc_action_key].padded_to_batch(dtype=np.long) + discrete_tensors = [ + ModelUtils.list_to_tensor(arr, dtype=torch.long) for arr in padded_batch + ] + + actions_list = [] + for _cont, _disc in itertools.zip_longest( + continuous_tensors, discrete_tensors, fillvalue=None + ): + if _disc is not None: + _disc = [_disc[..., i] for i in range(_disc.shape[-1])] + actions_list.append(AgentAction(_cont, _disc)) + return actions_list + + @staticmethod + def group_from_buffer(buff: AgentBuffer) -> List["AgentAction"]: + """ + A static method that accesses next group continuous and discrete action fields in an AgentBuffer + and constructs a padded List of AgentActions that represent the group agent actions. + The List is of length equal to max number of groupmate agents in the buffer, and the AgentBuffer iss + of the same length as the buffer. Empty spots (e.g. when agents die) are padded with 0. + :param buff: AgentBuffer of a batch or trajectory + :return: List of groupmate's AgentActions + """ + return AgentAction._group_agent_action_from_buffer( + buff, BufferKey.GROUP_CONTINUOUS_ACTION, BufferKey.GROUP_DISCRETE_ACTION + ) + + @staticmethod + def group_from_buffer_next(buff: AgentBuffer) -> List["AgentAction"]: + """ + A static method that accesses next group continuous and discrete action fields in an AgentBuffer + and constructs a padded List of AgentActions that represent the next group agent actions. + The List is of length equal to max number of groupmate agents in the buffer, and the AgentBuffer iss + of the same length as the buffer. Empty spots (e.g. when agents die) are padded with 0. + :param buff: AgentBuffer of a batch or trajectory + :return: List of groupmate's AgentActions + """ + return AgentAction._group_agent_action_from_buffer( + buff, BufferKey.GROUP_NEXT_CONT_ACTION, BufferKey.GROUP_NEXT_DISC_ACTION + ) + + def to_flat(self, discrete_branches: List[int]) -> torch.Tensor: + """ + Flatten this AgentAction into a single torch Tensor of dimension (batch, num_continuous + num_one_hot_discrete). + Discrete actions are converted into one-hot and concatenated with continuous actions. + :param discrete_branches: List of sizes for discrete actions. + :return: Tensor of flattened actions. 
+ """ + # if there are any discrete actions, create one-hot + if self.discrete_list is not None and len(self.discrete_list) > 0: + discrete_oh = ModelUtils.actions_to_onehot( + self.discrete_tensor, discrete_branches + ) + discrete_oh = torch.cat(discrete_oh, dim=1) + else: + discrete_oh = torch.empty(0) + return torch.cat([self.continuous_tensor, discrete_oh], dim=-1) diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/attention.py b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/attention.py new file mode 100644 index 0000000000000000000000000000000000000000..ba34e01995d5586da8942620b5d7ec4fd8ce5770 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/attention.py @@ -0,0 +1,293 @@ +from mlagents.torch_utils import torch +import warnings +from typing import Tuple, Optional, List +from mlagents.trainers.torch_entities.layers import ( + LinearEncoder, + Initialization, + linear_layer, + LayerNorm, +) +from mlagents.trainers.torch_entities.model_serialization import exporting_to_onnx +from mlagents.trainers.exception import UnityTrainerException + + +def get_zero_entities_mask(entities: List[torch.Tensor]) -> List[torch.Tensor]: + """ + Takes a List of Tensors and returns a List of mask Tensor with 1 if the input was + all zeros (on dimension 2) and 0 otherwise. This is used in the Attention + layer to mask the padding observations. + """ + with torch.no_grad(): + + if exporting_to_onnx.is_exporting(): + with warnings.catch_warnings(): + # We ignore a TracerWarning from PyTorch that warns that doing + # shape[n].item() will cause the trace to be incorrect (the trace might + # not generalize to other inputs) + # We ignore this warning because we know the model will always be + # run with inputs of the same shape + warnings.simplefilter("ignore") + # When exporting to ONNX, we want to transpose the entities. This is + # because ONNX only support input in NCHW (channel first) format. + # Barracuda also expect to get data in NCHW. + entities = [ + torch.transpose(obs, 2, 1).reshape( + -1, obs.shape[1].item(), obs.shape[2].item() + ) + for obs in entities + ] + + # Generate the masking tensors for each entities tensor (mask only if all zeros) + key_masks: List[torch.Tensor] = [ + (torch.sum(ent**2, axis=2) < 0.01).float() for ent in entities + ] + return key_masks + + +class MultiHeadAttention(torch.nn.Module): + + NEG_INF = -1e6 + + def __init__(self, embedding_size: int, num_heads: int): + """ + Multi Head Attention module. We do not use the regular Torch implementation since + Barracuda does not support some operators it uses. 
+ Takes as input to the forward method 3 tensors: + - query: of dimensions (batch_size, number_of_queries, embedding_size) + - key: of dimensions (batch_size, number_of_keys, embedding_size) + - value: of dimensions (batch_size, number_of_keys, embedding_size) + The forward method will return 2 tensors: + - The output: (batch_size, number_of_queries, embedding_size) + - The attention matrix: (batch_size, num_heads, number_of_queries, number_of_keys) + :param embedding_size: The size of the embeddings that will be generated (should be + dividable by the num_heads) + :param total_max_elements: The maximum total number of entities that can be passed to + the module + :param num_heads: The number of heads of the attention module + """ + super().__init__() + self.n_heads = num_heads + self.head_size: int = embedding_size // self.n_heads + self.embedding_size: int = self.head_size * self.n_heads + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + n_q: int, + n_k: int, + key_mask: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + b = -1 # the batch size + + query = query.reshape( + b, n_q, self.n_heads, self.head_size + ) # (b, n_q, h, emb / h) + key = key.reshape(b, n_k, self.n_heads, self.head_size) # (b, n_k, h, emb / h) + value = value.reshape( + b, n_k, self.n_heads, self.head_size + ) # (b, n_k, h, emb / h) + + query = query.permute([0, 2, 1, 3]) # (b, h, n_q, emb / h) + # The next few lines are equivalent to : key.permute([0, 2, 3, 1]) + # This is a hack, ONNX will compress two permute operations and + # Barracuda will not like seeing `permute([0,2,3,1])` + key = key.permute([0, 2, 1, 3]) # (b, h, emb / h, n_k) + key -= 1 + key += 1 + key = key.permute([0, 1, 3, 2]) # (b, h, emb / h, n_k) + + qk = torch.matmul(query, key) # (b, h, n_q, n_k) + + if key_mask is None: + qk = qk / (self.embedding_size**0.5) + else: + key_mask = key_mask.reshape(b, 1, 1, n_k) + qk = (1 - key_mask) * qk / ( + self.embedding_size**0.5 + ) + key_mask * self.NEG_INF + + att = torch.softmax(qk, dim=3) # (b, h, n_q, n_k) + + value = value.permute([0, 2, 1, 3]) # (b, h, n_k, emb / h) + value_attention = torch.matmul(att, value) # (b, h, n_q, emb / h) + + value_attention = value_attention.permute([0, 2, 1, 3]) # (b, n_q, h, emb / h) + value_attention = value_attention.reshape( + b, n_q, self.embedding_size + ) # (b, n_q, emb) + + return value_attention, att + + +class EntityEmbedding(torch.nn.Module): + """ + A module used to embed entities before passing them to a self-attention block. + Used in conjunction with ResidualSelfAttention to encode information about a self + and additional entities. Can also concatenate self to entities for ego-centric self- + attention. Inspired by architecture used in https://arxiv.org/pdf/1909.07528.pdf. + """ + + def __init__( + self, + entity_size: int, + entity_num_max_elements: Optional[int], + embedding_size: int, + ): + """ + Constructs an EntityEmbedding module. + :param x_self_size: Size of "self" entity. + :param entity_size: Size of other entities. + :param entity_num_max_elements: Maximum elements for a given entity, None for unrestricted. + Needs to be assigned in order for model to be exportable to ONNX and Barracuda. + :param embedding_size: Embedding size for the entity encoder. + :param concat_self: Whether to concatenate x_self to entities. Set True for ego-centric + self-attention. 
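+ Note: if add_self_embedding is called, the "self" encoding is concatenated to every entity before the entities are embedded (ego-centric attention).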
+ """ + super().__init__() + self.self_size: int = 0 + self.entity_size: int = entity_size + self.entity_num_max_elements: int = -1 + if entity_num_max_elements is not None: + self.entity_num_max_elements = entity_num_max_elements + self.embedding_size = embedding_size + # Initialization scheme from http://www.cs.toronto.edu/~mvolkovs/ICML2020_tfixup.pdf + self.self_ent_encoder = LinearEncoder( + self.entity_size, + 1, + self.embedding_size, + kernel_init=Initialization.Normal, + kernel_gain=(0.125 / self.embedding_size) ** 0.5, + ) + + def add_self_embedding(self, size: int) -> None: + self.self_size = size + self.self_ent_encoder = LinearEncoder( + self.self_size + self.entity_size, + 1, + self.embedding_size, + kernel_init=Initialization.Normal, + kernel_gain=(0.125 / self.embedding_size) ** 0.5, + ) + + def forward(self, x_self: torch.Tensor, entities: torch.Tensor) -> torch.Tensor: + num_entities = self.entity_num_max_elements + if num_entities < 0: + if exporting_to_onnx.is_exporting(): + raise UnityTrainerException( + "Trying to export an attention mechanism that doesn't have a set max \ + number of elements." + ) + num_entities = entities.shape[1] + + if exporting_to_onnx.is_exporting(): + # When exporting to ONNX, we want to transpose the entities. This is + # because ONNX only support input in NCHW (channel first) format. + # Barracuda also expect to get data in NCHW. + entities = torch.transpose(entities, 2, 1).reshape( + -1, num_entities, self.entity_size + ) + + if self.self_size > 0: + expanded_self = x_self.reshape(-1, 1, self.self_size) + expanded_self = torch.cat([expanded_self] * num_entities, dim=1) + # Concatenate all observations with self + entities = torch.cat([expanded_self, entities], dim=2) + # Encode entities + encoded_entities = self.self_ent_encoder(entities) + return encoded_entities + + +class ResidualSelfAttention(torch.nn.Module): + """ + Residual self attentioninspired from https://arxiv.org/pdf/1909.07528.pdf. Can be used + with an EntityEmbedding module, to apply multi head self attention to encode information + about a "Self" and a list of relevant "Entities". + """ + + EPSILON = 1e-7 + + def __init__( + self, + embedding_size: int, + entity_num_max_elements: Optional[int] = None, + num_heads: int = 4, + ): + """ + Constructs a ResidualSelfAttention module. + :param embedding_size: Embedding sizee for attention mechanism and + Q, K, V encoders. + :param entity_num_max_elements: A List of ints representing the maximum number + of elements in an entity sequence. Should be of length num_entities. Pass None to + not restrict the number of elements; however, this will make the module + unexportable to ONNX/Barracuda. 
+ :param num_heads: Number of heads for Multi Head Self-Attention + """ + super().__init__() + self.max_num_ent: Optional[int] = None + if entity_num_max_elements is not None: + self.max_num_ent = entity_num_max_elements + + self.attention = MultiHeadAttention( + num_heads=num_heads, embedding_size=embedding_size + ) + + # Initialization scheme from http://www.cs.toronto.edu/~mvolkovs/ICML2020_tfixup.pdf + self.fc_q = linear_layer( + embedding_size, + embedding_size, + kernel_init=Initialization.Normal, + kernel_gain=(0.125 / embedding_size) ** 0.5, + ) + self.fc_k = linear_layer( + embedding_size, + embedding_size, + kernel_init=Initialization.Normal, + kernel_gain=(0.125 / embedding_size) ** 0.5, + ) + self.fc_v = linear_layer( + embedding_size, + embedding_size, + kernel_init=Initialization.Normal, + kernel_gain=(0.125 / embedding_size) ** 0.5, + ) + self.fc_out = linear_layer( + embedding_size, + embedding_size, + kernel_init=Initialization.Normal, + kernel_gain=(0.125 / embedding_size) ** 0.5, + ) + self.embedding_norm = LayerNorm() + self.residual_norm = LayerNorm() + + def forward(self, inp: torch.Tensor, key_masks: List[torch.Tensor]) -> torch.Tensor: + # Gather the maximum number of entities information + mask = torch.cat(key_masks, dim=1) + + inp = self.embedding_norm(inp) + # Feed to self attention + query = self.fc_q(inp) # (b, n_q, emb) + key = self.fc_k(inp) # (b, n_k, emb) + value = self.fc_v(inp) # (b, n_k, emb) + + # Only use max num if provided + if self.max_num_ent is not None: + num_ent = self.max_num_ent + else: + num_ent = inp.shape[1] + if exporting_to_onnx.is_exporting(): + raise UnityTrainerException( + "Trying to export an attention mechanism that doesn't have a set max \ + number of elements." + ) + + output, _ = self.attention(query, key, value, num_ent, num_ent, mask) + # Residual + output = self.fc_out(output) + inp + output = self.residual_norm(output) + # Average Pooling + numerator = torch.sum(output * (1 - mask).reshape(-1, num_ent, 1), dim=1) + denominator = torch.sum(1 - mask, dim=1, keepdim=True) + self.EPSILON + output = numerator / denominator + return output diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/__init__.py b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b79947b9d024befc99baf8b355ecab9ad6aa937c Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/bc/__init__.py b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/bc/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/bc/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/bc/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..14730ab5cbb1b49a041a47a27c01d8a103a1bc4b Binary files /dev/null and 
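A minimal usage sketch (not part of the diff) for the attention classes defined in attention.py above: embed a zero-padded set of entity observations with EntityEmbedding, mark the padded slots with get_zero_entities_mask, and pool them with ResidualSelfAttention. The batch size, entity count, and feature sizes below are made-up illustration values, and the snippet assumes the vendored mlagents package added by this diff is importable.

from mlagents.torch_utils import torch
from mlagents.trainers.torch_entities.attention import (
    EntityEmbedding,
    ResidualSelfAttention,
    get_zero_entities_mask,
)

batch, max_entities, entity_size, emb = 4, 5, 6, 64
entities = torch.randn(batch, max_entities, entity_size)
entities[:, 3:, :] = 0.0  # zero-pad the last two entity slots

embedder = EntityEmbedding(entity_size, max_entities, emb)
attention = ResidualSelfAttention(emb, max_entities, num_heads=4)

masks = get_zero_entities_mask([entities])  # 1.0 where an entity slot is all zeros
encoded = embedder(None, entities)          # (batch, max_entities, emb)
pooled = attention(encoded, masks)          # (batch, emb), masked average over entities

Because add_self_embedding is never called here, self_size stays 0 and the x_self argument is ignored, which is why None can be passed for it in this sketch.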
b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/bc/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/bc/__pycache__/module.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/bc/__pycache__/module.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..caf76d070b46eb2218167cf72421091b8a06ee18 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/bc/__pycache__/module.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/bc/module.py b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/bc/module.py new file mode 100644 index 0000000000000000000000000000000000000000..ab454409c6111a59f659a1ab7c335524628c3d31 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/bc/module.py @@ -0,0 +1,186 @@ +from typing import Dict +import numpy as np +from mlagents.torch_utils import torch + +from mlagents.trainers.policy.torch_policy import TorchPolicy +from mlagents.trainers.demo_loader import demo_to_buffer +from mlagents.trainers.settings import BehavioralCloningSettings, ScheduleType +from mlagents.trainers.torch_entities.agent_action import AgentAction +from mlagents.trainers.torch_entities.action_log_probs import ActionLogProbs +from mlagents.trainers.torch_entities.utils import ModelUtils +from mlagents.trainers.trajectory import ObsUtil +from mlagents.trainers.buffer import AgentBuffer + + +class BCModule: + def __init__( + self, + policy: TorchPolicy, + settings: BehavioralCloningSettings, + policy_learning_rate: float, + default_batch_size: int, + default_num_epoch: int, + ): + """ + A BC trainer that can be used inline with RL. + :param policy: The policy of the learning model + :param settings: The settings for BehavioralCloning including LR strength, batch_size, + num_epochs, samples_per_update and LR annealing steps. + :param policy_learning_rate: The initial Learning Rate of the policy. Used to set an appropriate learning rate + for the pretrainer. + """ + self.policy = policy + self._anneal_steps = settings.steps + self.current_lr = policy_learning_rate * settings.strength + + learning_rate_schedule: ScheduleType = ( + ScheduleType.LINEAR if self._anneal_steps > 0 else ScheduleType.CONSTANT + ) + self.decay_learning_rate = ModelUtils.DecayedValue( + learning_rate_schedule, self.current_lr, 1e-10, self._anneal_steps + ) + params = self.policy.actor.parameters() + self.optimizer = torch.optim.Adam(params, lr=self.current_lr) + _, self.demonstration_buffer = demo_to_buffer( + settings.demo_path, policy.sequence_length, policy.behavior_spec + ) + self.batch_size = ( + settings.batch_size if settings.batch_size else default_batch_size + ) + self.num_epoch = settings.num_epoch if settings.num_epoch else default_num_epoch + self.n_sequences = max( + min(self.batch_size, self.demonstration_buffer.num_experiences) + // policy.sequence_length, + 1, + ) + + self.has_updated = False + self.use_recurrent = self.policy.use_recurrent + self.samples_per_update = settings.samples_per_update + + def update(self) -> Dict[str, np.ndarray]: + """ + Updates model using buffer. + :param max_batches: The maximum number of batches to use per update. + :return: The loss of the update. + """ + # Don't continue training if the learning rate has reached 0, to reduce training time. 
+ + decay_lr = self.decay_learning_rate.get_value(self.policy.get_current_step()) + if self.current_lr <= 1e-10: # Unlike in TF, this never actually reaches 0. + return {"Losses/Pretraining Loss": 0} + + batch_losses = [] + possible_demo_batches = ( + self.demonstration_buffer.num_experiences // self.n_sequences + ) + possible_batches = possible_demo_batches + + max_batches = self.samples_per_update // self.n_sequences + + n_epoch = self.num_epoch + for _ in range(n_epoch): + self.demonstration_buffer.shuffle( + sequence_length=self.policy.sequence_length + ) + if max_batches == 0: + num_batches = possible_batches + else: + num_batches = min(possible_batches, max_batches) + for i in range(num_batches // self.policy.sequence_length): + demo_update_buffer = self.demonstration_buffer + start = i * self.n_sequences * self.policy.sequence_length + end = (i + 1) * self.n_sequences * self.policy.sequence_length + mini_batch_demo = demo_update_buffer.make_mini_batch(start, end) + run_out = self._update_batch(mini_batch_demo, self.n_sequences) + loss = run_out["loss"] + batch_losses.append(loss) + + ModelUtils.update_learning_rate(self.optimizer, decay_lr) + self.current_lr = decay_lr + + self.has_updated = True + update_stats = {"Losses/Pretraining Loss": np.mean(batch_losses)} + return update_stats + + def _behavioral_cloning_loss( + self, + selected_actions: AgentAction, + log_probs: ActionLogProbs, + expert_actions: torch.Tensor, + ) -> torch.Tensor: + bc_loss = 0 + if self.policy.behavior_spec.action_spec.continuous_size > 0: + bc_loss += torch.nn.functional.mse_loss( + selected_actions.continuous_tensor, expert_actions.continuous_tensor + ) + if self.policy.behavior_spec.action_spec.discrete_size > 0: + one_hot_expert_actions = ModelUtils.actions_to_onehot( + expert_actions.discrete_tensor, + self.policy.behavior_spec.action_spec.discrete_branches, + ) + log_prob_branches = ModelUtils.break_into_branches( + log_probs.all_discrete_tensor, + self.policy.behavior_spec.action_spec.discrete_branches, + ) + bc_loss += torch.mean( + torch.stack( + [ + torch.sum( + -torch.nn.functional.log_softmax(log_prob_branch, dim=1) + * expert_actions_branch, + dim=1, + ) + for log_prob_branch, expert_actions_branch in zip( + log_prob_branches, one_hot_expert_actions + ) + ] + ) + ) + return bc_loss + + def _update_batch( + self, mini_batch_demo: AgentBuffer, n_sequences: int + ) -> Dict[str, float]: + """ + Helper function for update_batch. 
+ """ + np_obs = ObsUtil.from_buffer( + mini_batch_demo, len(self.policy.behavior_spec.observation_specs) + ) + # Convert to tensors + tensor_obs = [ModelUtils.list_to_tensor(obs) for obs in np_obs] + act_masks = None + expert_actions = AgentAction.from_buffer(mini_batch_demo) + if self.policy.behavior_spec.action_spec.discrete_size > 0: + + act_masks = ModelUtils.list_to_tensor( + np.ones( + ( + self.n_sequences * self.policy.sequence_length, + sum(self.policy.behavior_spec.action_spec.discrete_branches), + ), + dtype=np.float32, + ) + ) + + memories = [] + if self.policy.use_recurrent: + memories = torch.zeros(1, self.n_sequences, self.policy.m_size) + + selected_actions, run_out, _ = self.policy.actor.get_action_and_stats( + tensor_obs, + masks=act_masks, + memories=memories, + sequence_length=self.policy.sequence_length, + ) + log_probs = run_out["log_probs"] + bc_loss = self._behavioral_cloning_loss( + selected_actions, log_probs, expert_actions + ) + self.optimizer.zero_grad() + bc_loss.backward() + + self.optimizer.step() + run_out = {"loss": bc_loss.item()} + return run_out diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/__init__.py b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..696d978b8ab3e1f34c26e6fecc57b8670ab7659a --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/__init__.py @@ -0,0 +1,18 @@ +from mlagents.trainers.torch_entities.components.reward_providers.base_reward_provider import ( # noqa F401 + BaseRewardProvider, +) +from mlagents.trainers.torch_entities.components.reward_providers.extrinsic_reward_provider import ( # noqa F401 + ExtrinsicRewardProvider, +) +from mlagents.trainers.torch_entities.components.reward_providers.curiosity_reward_provider import ( # noqa F401 + CuriosityRewardProvider, +) +from mlagents.trainers.torch_entities.components.reward_providers.gail_reward_provider import ( # noqa F401 + GAILRewardProvider, +) +from mlagents.trainers.torch_entities.components.reward_providers.rnd_reward_provider import ( # noqa F401 + RNDRewardProvider, +) +from mlagents.trainers.torch_entities.components.reward_providers.reward_provider_factory import ( # noqa F401 + create_reward_provider, +) diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d77416629f9de718768c04f75825b3b35af1f8b6 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/__pycache__/base_reward_provider.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/__pycache__/base_reward_provider.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..144f8da8f8759c7248dc4488c3a504ea7ef7d8ad Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/__pycache__/base_reward_provider.cpython-39.pyc differ diff --git 
a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/__pycache__/curiosity_reward_provider.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/__pycache__/curiosity_reward_provider.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..109f485c0adaac980c9df2328127c945a7b78218 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/__pycache__/curiosity_reward_provider.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/__pycache__/extrinsic_reward_provider.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/__pycache__/extrinsic_reward_provider.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cddb7572f12675b1df004a4cfc9d189c81e1a9f2 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/__pycache__/extrinsic_reward_provider.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/__pycache__/gail_reward_provider.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/__pycache__/gail_reward_provider.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..91e7e830174a1f548e28d175a02fcfb8addb2349 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/__pycache__/gail_reward_provider.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/__pycache__/reward_provider_factory.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/__pycache__/reward_provider_factory.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..21abac81911a0dc82038b3357b6568e8e31f4cdb Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/__pycache__/reward_provider_factory.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/__pycache__/rnd_reward_provider.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/__pycache__/rnd_reward_provider.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cd0736695a4e127fd9b5bdf86a452fadac542b54 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/__pycache__/rnd_reward_provider.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/base_reward_provider.py b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/base_reward_provider.py new file mode 100644 index 0000000000000000000000000000000000000000..4e258657b37a9e2c0b9ad0340b91c98bdb7c7f59 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/base_reward_provider.py @@ -0,0 +1,81 @@ +import numpy as np +from mlagents.torch_utils import torch +from abc import ABC, abstractmethod +from typing import Dict + +from mlagents.trainers.buffer import AgentBuffer +from mlagents.trainers.settings import RewardSignalSettings +from 
mlagents_envs.base_env import BehaviorSpec + + +class BaseRewardProvider(ABC): + def __init__(self, specs: BehaviorSpec, settings: RewardSignalSettings) -> None: + self._policy_specs = specs + self._gamma = settings.gamma + self._strength = settings.strength + self._ignore_done = False + + @property + def gamma(self) -> float: + """ + The discount factor for the reward signal + """ + return self._gamma + + @property + def strength(self) -> float: + """ + The strength multiplier of the reward provider + """ + return self._strength + + @property + def name(self) -> str: + """ + The name of the reward provider. Is used for reporting and identification + """ + class_name = self.__class__.__name__ + return class_name.replace("RewardProvider", "") + + @property + def ignore_done(self) -> bool: + """ + If true, when the agent is done, the rewards of the next episode must be + used to calculate the return of the current episode. + Is used to mitigate the positive bias in rewards with no natural end. + """ + return self._ignore_done + + @abstractmethod + def evaluate(self, mini_batch: AgentBuffer) -> np.ndarray: + """ + Evaluates the reward for the data present in the Dict mini_batch. Use this when evaluating a reward + function drawn straight from a Buffer. + :param mini_batch: A Dict of numpy arrays (the format used by our Buffer) + when drawing from the update buffer. + :return: a np.ndarray of rewards generated by the reward provider + """ + raise NotImplementedError( + "The reward provider's evaluate method has not been implemented " + ) + + @abstractmethod + def update(self, mini_batch: AgentBuffer) -> Dict[str, np.ndarray]: + """ + Update the reward for the data present in the Dict mini_batch. Use this when updating a reward + function drawn straight from a Buffer. + :param mini_batch: A Dict of numpy arrays (the format used by our Buffer) + when drawing from the update buffer. + :return: A dictionary from string to stats values + """ + raise NotImplementedError( + "The reward provider's update method has not been implemented " + ) + + def get_modules(self) -> Dict[str, torch.nn.Module]: + """ + Returns a dictionary of string identifiers to the torch.nn.Modules used by + the reward providers. This method is used for loading and saving the weights + of the reward providers. 
+ """ + return {} diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/curiosity_reward_provider.py b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/curiosity_reward_provider.py new file mode 100644 index 0000000000000000000000000000000000000000..b4cbf34dd92a847e1051f8e190fab81044a4e660 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/curiosity_reward_provider.py @@ -0,0 +1,239 @@ +import numpy as np +from typing import Dict, NamedTuple +from mlagents.torch_utils import torch, default_device + +from mlagents.trainers.buffer import AgentBuffer, BufferKey +from mlagents.trainers.torch_entities.components.reward_providers.base_reward_provider import ( + BaseRewardProvider, +) +from mlagents.trainers.settings import CuriositySettings + +from mlagents_envs.base_env import BehaviorSpec +from mlagents_envs import logging_util +from mlagents.trainers.torch_entities.agent_action import AgentAction +from mlagents.trainers.torch_entities.action_flattener import ActionFlattener +from mlagents.trainers.torch_entities.utils import ModelUtils +from mlagents.trainers.torch_entities.networks import NetworkBody +from mlagents.trainers.torch_entities.layers import LinearEncoder, linear_layer +from mlagents.trainers.trajectory import ObsUtil + +logger = logging_util.get_logger(__name__) + + +class ActionPredictionTuple(NamedTuple): + continuous: torch.Tensor + discrete: torch.Tensor + + +class CuriosityRewardProvider(BaseRewardProvider): + beta = 0.2 # Forward vs Inverse loss weight + loss_multiplier = 10.0 # Loss multiplier + + def __init__(self, specs: BehaviorSpec, settings: CuriositySettings) -> None: + super().__init__(specs, settings) + self._ignore_done = True + self._network = CuriosityNetwork(specs, settings) + self._network.to(default_device()) + + self.optimizer = torch.optim.Adam( + self._network.parameters(), lr=settings.learning_rate + ) + self._has_updated_once = False + + def evaluate(self, mini_batch: AgentBuffer) -> np.ndarray: + with torch.no_grad(): + rewards = ModelUtils.to_numpy(self._network.compute_reward(mini_batch)) + rewards = np.minimum(rewards, 1.0 / self.strength) + return rewards * self._has_updated_once + + def update(self, mini_batch: AgentBuffer) -> Dict[str, np.ndarray]: + self._has_updated_once = True + forward_loss = self._network.compute_forward_loss(mini_batch) + inverse_loss = self._network.compute_inverse_loss(mini_batch) + + loss = self.loss_multiplier * ( + self.beta * forward_loss + (1.0 - self.beta) * inverse_loss + ) + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + return { + "Losses/Curiosity Forward Loss": forward_loss.item(), + "Losses/Curiosity Inverse Loss": inverse_loss.item(), + } + + def get_modules(self): + return {f"Module:{self.name}": self._network} + + +class CuriosityNetwork(torch.nn.Module): + EPSILON = 1e-10 + + def __init__(self, specs: BehaviorSpec, settings: CuriositySettings) -> None: + super().__init__() + self._action_spec = specs.action_spec + + state_encoder_settings = settings.network_settings + if state_encoder_settings.memory is not None: + state_encoder_settings.memory = None + logger.warning( + "memory was specified in network_settings but is not supported by Curiosity. It is being ignored." 
+ ) + + self._state_encoder = NetworkBody( + specs.observation_specs, state_encoder_settings + ) + + self._action_flattener = ActionFlattener(self._action_spec) + + self.inverse_model_action_encoding = torch.nn.Sequential( + LinearEncoder(2 * state_encoder_settings.hidden_units, 1, 256) + ) + + if self._action_spec.continuous_size > 0: + self.continuous_action_prediction = linear_layer( + 256, self._action_spec.continuous_size + ) + if self._action_spec.discrete_size > 0: + self.discrete_action_prediction = linear_layer( + 256, sum(self._action_spec.discrete_branches) + ) + + self.forward_model_next_state_prediction = torch.nn.Sequential( + LinearEncoder( + state_encoder_settings.hidden_units + + self._action_flattener.flattened_size, + 1, + 256, + ), + linear_layer(256, state_encoder_settings.hidden_units), + ) + + def get_current_state(self, mini_batch: AgentBuffer) -> torch.Tensor: + """ + Extracts the current state embedding from a mini_batch. + """ + n_obs = len(self._state_encoder.processors) + np_obs = ObsUtil.from_buffer(mini_batch, n_obs) + # Convert to tensors + tensor_obs = [ModelUtils.list_to_tensor(obs) for obs in np_obs] + + hidden, _ = self._state_encoder.forward(tensor_obs) + return hidden + + def get_next_state(self, mini_batch: AgentBuffer) -> torch.Tensor: + """ + Extracts the next state embedding from a mini_batch. + """ + n_obs = len(self._state_encoder.processors) + np_obs = ObsUtil.from_buffer_next(mini_batch, n_obs) + # Convert to tensors + tensor_obs = [ModelUtils.list_to_tensor(obs) for obs in np_obs] + + hidden, _ = self._state_encoder.forward(tensor_obs) + return hidden + + def predict_action(self, mini_batch: AgentBuffer) -> ActionPredictionTuple: + """ + In the continuous case, returns the predicted action. + In the discrete case, returns the logits. + """ + inverse_model_input = torch.cat( + (self.get_current_state(mini_batch), self.get_next_state(mini_batch)), dim=1 + ) + + continuous_pred = None + discrete_pred = None + hidden = self.inverse_model_action_encoding(inverse_model_input) + if self._action_spec.continuous_size > 0: + continuous_pred = self.continuous_action_prediction(hidden) + if self._action_spec.discrete_size > 0: + raw_discrete_pred = self.discrete_action_prediction(hidden) + branches = ModelUtils.break_into_branches( + raw_discrete_pred, self._action_spec.discrete_branches + ) + branches = [torch.softmax(b, dim=1) for b in branches] + discrete_pred = torch.cat(branches, dim=1) + return ActionPredictionTuple(continuous_pred, discrete_pred) + + def predict_next_state(self, mini_batch: AgentBuffer) -> torch.Tensor: + """ + Uses the current state embedding and the action of the mini_batch to predict + the next state embedding. + """ + actions = AgentAction.from_buffer(mini_batch) + flattened_action = self._action_flattener.forward(actions) + forward_model_input = torch.cat( + (self.get_current_state(mini_batch), flattened_action), dim=1 + ) + + return self.forward_model_next_state_prediction(forward_model_input) + + def compute_inverse_loss(self, mini_batch: AgentBuffer) -> torch.Tensor: + """ + Computes the inverse loss for a mini_batch. Corresponds to the error on the + action prediction (given the current and next state). 
+ """ + predicted_action = self.predict_action(mini_batch) + actions = AgentAction.from_buffer(mini_batch) + _inverse_loss = 0 + if self._action_spec.continuous_size > 0: + sq_difference = ( + actions.continuous_tensor - predicted_action.continuous + ) ** 2 + sq_difference = torch.sum(sq_difference, dim=1) + _inverse_loss += torch.mean( + ModelUtils.dynamic_partition( + sq_difference, + ModelUtils.list_to_tensor( + mini_batch[BufferKey.MASKS], dtype=torch.float + ), + 2, + )[1] + ) + if self._action_spec.discrete_size > 0: + true_action = torch.cat( + ModelUtils.actions_to_onehot( + actions.discrete_tensor, self._action_spec.discrete_branches + ), + dim=1, + ) + cross_entropy = torch.sum( + -torch.log(predicted_action.discrete + self.EPSILON) * true_action, + dim=1, + ) + _inverse_loss += torch.mean( + ModelUtils.dynamic_partition( + cross_entropy, + ModelUtils.list_to_tensor( + mini_batch[BufferKey.MASKS], dtype=torch.float + ), # use masks not action_masks + 2, + )[1] + ) + return _inverse_loss + + def compute_reward(self, mini_batch: AgentBuffer) -> torch.Tensor: + """ + Calculates the curiosity reward for the mini_batch. Corresponds to the error + between the predicted and actual next state. + """ + predicted_next_state = self.predict_next_state(mini_batch) + target = self.get_next_state(mini_batch) + sq_difference = 0.5 * (target - predicted_next_state) ** 2 + sq_difference = torch.sum(sq_difference, dim=1) + return sq_difference + + def compute_forward_loss(self, mini_batch: AgentBuffer) -> torch.Tensor: + """ + Computes the loss for the next state prediction + """ + return torch.mean( + ModelUtils.dynamic_partition( + self.compute_reward(mini_batch), + ModelUtils.list_to_tensor( + mini_batch[BufferKey.MASKS], dtype=torch.float + ), + 2, + )[1] + ) diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/extrinsic_reward_provider.py b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/extrinsic_reward_provider.py new file mode 100644 index 0000000000000000000000000000000000000000..b0b847463c78ef052e6dbb6a93d9af2321a5ae02 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/extrinsic_reward_provider.py @@ -0,0 +1,43 @@ +import numpy as np +from typing import Dict + +from mlagents.trainers.buffer import AgentBuffer, BufferKey +from mlagents.trainers.torch_entities.components.reward_providers.base_reward_provider import ( + BaseRewardProvider, +) +from mlagents_envs.base_env import BehaviorSpec +from mlagents.trainers.settings import RewardSignalSettings + + +class ExtrinsicRewardProvider(BaseRewardProvider): + """ + Evaluates extrinsic reward. For single-agent, this equals the individual reward + given to the agent. For the POCA algorithm, we want not only the individual reward + but also the team and the individual rewards of the other agents. 
+ """ + + def __init__(self, specs: BehaviorSpec, settings: RewardSignalSettings) -> None: + super().__init__(specs, settings) + self.add_groupmate_rewards = False + + def evaluate(self, mini_batch: AgentBuffer) -> np.ndarray: + indiv_rewards = np.array( + mini_batch[BufferKey.ENVIRONMENT_REWARDS], dtype=np.float32 + ) + total_rewards = indiv_rewards + if BufferKey.GROUPMATE_REWARDS in mini_batch and self.add_groupmate_rewards: + groupmate_rewards_list = mini_batch[BufferKey.GROUPMATE_REWARDS] + groupmate_rewards_sum = np.array( + [sum(_rew) for _rew in groupmate_rewards_list], dtype=np.float32 + ) + total_rewards += groupmate_rewards_sum + if BufferKey.GROUP_REWARD in mini_batch: + group_rewards = np.array( + mini_batch[BufferKey.GROUP_REWARD], dtype=np.float32 + ) + # Add all the group rewards to the individual rewards + total_rewards += group_rewards + return total_rewards + + def update(self, mini_batch: AgentBuffer) -> Dict[str, np.ndarray]: + return {} diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/gail_reward_provider.py b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/gail_reward_provider.py new file mode 100644 index 0000000000000000000000000000000000000000..0ae77ba1434d0666420cde871775297fd62d72e4 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/gail_reward_provider.py @@ -0,0 +1,260 @@ +from typing import Optional, Dict, List +import numpy as np +from mlagents.torch_utils import torch, default_device + +from mlagents.trainers.buffer import AgentBuffer, BufferKey +from mlagents.trainers.torch_entities.components.reward_providers.base_reward_provider import ( + BaseRewardProvider, +) +from mlagents.trainers.settings import GAILSettings +from mlagents_envs.base_env import BehaviorSpec +from mlagents_envs import logging_util +from mlagents.trainers.torch_entities.utils import ModelUtils +from mlagents.trainers.torch_entities.agent_action import AgentAction +from mlagents.trainers.torch_entities.action_flattener import ActionFlattener +from mlagents.trainers.torch_entities.networks import NetworkBody +from mlagents.trainers.torch_entities.layers import linear_layer, Initialization +from mlagents.trainers.demo_loader import demo_to_buffer +from mlagents.trainers.trajectory import ObsUtil + +logger = logging_util.get_logger(__name__) + + +class GAILRewardProvider(BaseRewardProvider): + def __init__(self, specs: BehaviorSpec, settings: GAILSettings) -> None: + super().__init__(specs, settings) + self._ignore_done = False + self._discriminator_network = DiscriminatorNetwork(specs, settings) + self._discriminator_network.to(default_device()) + _, self._demo_buffer = demo_to_buffer( + settings.demo_path, 1, specs + ) # This is supposed to be the sequence length but we do not have access here + params = list(self._discriminator_network.parameters()) + self.optimizer = torch.optim.Adam(params, lr=settings.learning_rate) + + def evaluate(self, mini_batch: AgentBuffer) -> np.ndarray: + with torch.no_grad(): + estimates, _ = self._discriminator_network.compute_estimate( + mini_batch, use_vail_noise=False + ) + return ModelUtils.to_numpy( + -torch.log( + 1.0 + - estimates.squeeze(dim=1) + * (1.0 - self._discriminator_network.EPSILON) + ) + ) + + def update(self, mini_batch: AgentBuffer) -> Dict[str, np.ndarray]: + + expert_batch = self._demo_buffer.sample_mini_batch( + mini_batch.num_experiences, 1 + ) + 
self._discriminator_network.encoder.update_normalization(expert_batch) + + loss, stats_dict = self._discriminator_network.compute_loss( + mini_batch, expert_batch + ) + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + return stats_dict + + def get_modules(self): + return {f"Module:{self.name}": self._discriminator_network} + + +class DiscriminatorNetwork(torch.nn.Module): + gradient_penalty_weight = 10.0 + z_size = 128 + alpha = 0.0005 + mutual_information = 0.5 + EPSILON = 1e-7 + initial_beta = 0.0 + + def __init__(self, specs: BehaviorSpec, settings: GAILSettings) -> None: + super().__init__() + self._use_vail = settings.use_vail + self._settings = settings + + encoder_settings = settings.network_settings + if encoder_settings.memory is not None: + encoder_settings.memory = None + logger.warning( + "memory was specified in network_settings but is not supported by GAIL. It is being ignored." + ) + + self._action_flattener = ActionFlattener(specs.action_spec) + unencoded_size = ( + self._action_flattener.flattened_size + 1 if settings.use_actions else 0 + ) # +1 is for dones + self.encoder = NetworkBody( + specs.observation_specs, encoder_settings, unencoded_size + ) + + estimator_input_size = encoder_settings.hidden_units + if settings.use_vail: + estimator_input_size = self.z_size + self._z_sigma = torch.nn.Parameter( + torch.ones((self.z_size), dtype=torch.float), requires_grad=True + ) + self._z_mu_layer = linear_layer( + encoder_settings.hidden_units, + self.z_size, + kernel_init=Initialization.KaimingHeNormal, + kernel_gain=0.1, + ) + self._beta = torch.nn.Parameter( + torch.tensor(self.initial_beta, dtype=torch.float), requires_grad=False + ) + + self._estimator = torch.nn.Sequential( + linear_layer(estimator_input_size, 1, kernel_gain=0.2), torch.nn.Sigmoid() + ) + + def get_action_input(self, mini_batch: AgentBuffer) -> torch.Tensor: + """ + Creates the action Tensor. In continuous case, corresponds to the action. In + the discrete case, corresponds to the concatenation of one hot action Tensors. + """ + return self._action_flattener.forward(AgentAction.from_buffer(mini_batch)) + + def get_state_inputs(self, mini_batch: AgentBuffer) -> List[torch.Tensor]: + """ + Creates the observation input. + """ + n_obs = len(self.encoder.processors) + np_obs = ObsUtil.from_buffer(mini_batch, n_obs) + # Convert to tensors + tensor_obs = [ModelUtils.list_to_tensor(obs) for obs in np_obs] + return tensor_obs + + def compute_estimate( + self, mini_batch: AgentBuffer, use_vail_noise: bool = False + ) -> torch.Tensor: + """ + Given a mini_batch, computes the estimate (How much the discriminator believes + the data was sampled from the demonstration data). + :param mini_batch: The AgentBuffer of data + :param use_vail_noise: Only when using VAIL : If true, will sample the code, if + false, will return the mean of the code. 
+ """ + inputs = self.get_state_inputs(mini_batch) + if self._settings.use_actions: + actions = self.get_action_input(mini_batch) + dones = torch.as_tensor( + mini_batch[BufferKey.DONE], dtype=torch.float + ).unsqueeze(1) + action_inputs = torch.cat([actions, dones], dim=1) + hidden, _ = self.encoder(inputs, action_inputs) + else: + hidden, _ = self.encoder(inputs) + z_mu: Optional[torch.Tensor] = None + if self._settings.use_vail: + z_mu = self._z_mu_layer(hidden) + hidden = z_mu + torch.randn_like(z_mu) * self._z_sigma * use_vail_noise + estimate = self._estimator(hidden) + return estimate, z_mu + + def compute_loss( + self, policy_batch: AgentBuffer, expert_batch: AgentBuffer + ) -> torch.Tensor: + """ + Given a policy mini_batch and an expert mini_batch, computes the loss of the discriminator. + """ + total_loss = torch.zeros(1) + stats_dict: Dict[str, np.ndarray] = {} + policy_estimate, policy_mu = self.compute_estimate( + policy_batch, use_vail_noise=True + ) + expert_estimate, expert_mu = self.compute_estimate( + expert_batch, use_vail_noise=True + ) + stats_dict["Policy/GAIL Policy Estimate"] = policy_estimate.mean().item() + stats_dict["Policy/GAIL Expert Estimate"] = expert_estimate.mean().item() + discriminator_loss = -( + torch.log(expert_estimate + self.EPSILON) + + torch.log(1.0 - policy_estimate + self.EPSILON) + ).mean() + stats_dict["Losses/GAIL Loss"] = discriminator_loss.item() + total_loss += discriminator_loss + if self._settings.use_vail: + # KL divergence loss (encourage latent representation to be normal) + kl_loss = torch.mean( + -torch.sum( + 1 + + (self._z_sigma**2).log() + - 0.5 * expert_mu**2 + - 0.5 * policy_mu**2 + - (self._z_sigma**2), + dim=1, + ) + ) + vail_loss = self._beta * (kl_loss - self.mutual_information) + with torch.no_grad(): + self._beta.data = torch.max( + self._beta + self.alpha * (kl_loss - self.mutual_information), + torch.tensor(0.0), + ) + total_loss += vail_loss + stats_dict["Policy/GAIL Beta"] = self._beta.item() + stats_dict["Losses/GAIL KL Loss"] = kl_loss.item() + if self.gradient_penalty_weight > 0.0: + gradient_magnitude_loss = ( + self.gradient_penalty_weight + * self.compute_gradient_magnitude(policy_batch, expert_batch) + ) + stats_dict["Policy/GAIL Grad Mag Loss"] = gradient_magnitude_loss.item() + total_loss += gradient_magnitude_loss + return total_loss, stats_dict + + def compute_gradient_magnitude( + self, policy_batch: AgentBuffer, expert_batch: AgentBuffer + ) -> torch.Tensor: + """ + Gradient penalty from https://arxiv.org/pdf/1704.00028. Adds stability esp. + for off-policy. Compute gradients w.r.t randomly interpolated input. 
+ """ + policy_inputs = self.get_state_inputs(policy_batch) + expert_inputs = self.get_state_inputs(expert_batch) + interp_inputs = [] + for policy_input, expert_input in zip(policy_inputs, expert_inputs): + obs_epsilon = torch.rand(policy_input.shape) + interp_input = obs_epsilon * policy_input + (1 - obs_epsilon) * expert_input + interp_input.requires_grad = True # For gradient calculation + interp_inputs.append(interp_input) + if self._settings.use_actions: + policy_action = self.get_action_input(policy_batch) + expert_action = self.get_action_input(expert_batch) + action_epsilon = torch.rand(policy_action.shape) + policy_dones = torch.as_tensor( + policy_batch[BufferKey.DONE], dtype=torch.float + ).unsqueeze(1) + expert_dones = torch.as_tensor( + expert_batch[BufferKey.DONE], dtype=torch.float + ).unsqueeze(1) + dones_epsilon = torch.rand(policy_dones.shape) + action_inputs = torch.cat( + [ + action_epsilon * policy_action + + (1 - action_epsilon) * expert_action, + dones_epsilon * policy_dones + (1 - dones_epsilon) * expert_dones, + ], + dim=1, + ) + action_inputs.requires_grad = True + hidden, _ = self.encoder(interp_inputs, action_inputs) + encoder_input = tuple(interp_inputs + [action_inputs]) + else: + hidden, _ = self.encoder(interp_inputs) + encoder_input = tuple(interp_inputs) + if self._settings.use_vail: + use_vail_noise = True + z_mu = self._z_mu_layer(hidden) + hidden = z_mu + torch.randn_like(z_mu) * self._z_sigma * use_vail_noise + estimate = self._estimator(hidden).squeeze(1).sum() + gradient = torch.autograd.grad(estimate, encoder_input, create_graph=True)[0] + # Norm's gradient could be NaN at 0. Use our own safe_norm + safe_norm = (torch.sum(gradient**2, dim=1) + self.EPSILON).sqrt() + gradient_mag = torch.mean((safe_norm - 1) ** 2) + return gradient_mag diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/reward_provider_factory.py b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/reward_provider_factory.py new file mode 100644 index 0000000000000000000000000000000000000000..825fc49006e4577cfab9ecaf4886b6b1b8c89cef --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/reward_provider_factory.py @@ -0,0 +1,47 @@ +from typing import Dict, Type +from mlagents.trainers.exception import UnityTrainerException + +from mlagents.trainers.settings import RewardSignalSettings, RewardSignalType + +from mlagents.trainers.torch_entities.components.reward_providers.base_reward_provider import ( + BaseRewardProvider, +) +from mlagents.trainers.torch_entities.components.reward_providers.extrinsic_reward_provider import ( + ExtrinsicRewardProvider, +) +from mlagents.trainers.torch_entities.components.reward_providers.curiosity_reward_provider import ( + CuriosityRewardProvider, +) +from mlagents.trainers.torch_entities.components.reward_providers.gail_reward_provider import ( + GAILRewardProvider, +) +from mlagents.trainers.torch_entities.components.reward_providers.rnd_reward_provider import ( + RNDRewardProvider, +) + +from mlagents_envs.base_env import BehaviorSpec + +NAME_TO_CLASS: Dict[RewardSignalType, Type[BaseRewardProvider]] = { + RewardSignalType.EXTRINSIC: ExtrinsicRewardProvider, + RewardSignalType.CURIOSITY: CuriosityRewardProvider, + RewardSignalType.GAIL: GAILRewardProvider, + RewardSignalType.RND: RNDRewardProvider, +} + + +def create_reward_provider( + name: RewardSignalType, specs: BehaviorSpec, settings: RewardSignalSettings +) -> 
BaseRewardProvider: + """ + Creates a reward provider class based on the name and config entry provided as a dict. + :param name: The name of the reward signal + :param specs: The BehaviorSpecs of the policy + :param settings: The RewardSignalSettings for that reward signal + :return: The reward signal class instantiated + """ + rcls = NAME_TO_CLASS.get(name) + if not rcls: + raise UnityTrainerException(f"Unknown reward signal type {name}") + + class_inst = rcls(specs, settings) + return class_inst diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/rnd_reward_provider.py b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/rnd_reward_provider.py new file mode 100644 index 0000000000000000000000000000000000000000..bda1424ab5c07776d10aafd5a6fb33a6c3043553 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/rnd_reward_provider.py @@ -0,0 +1,80 @@ +import numpy as np +from typing import Dict +from mlagents.torch_utils import torch + +from mlagents.trainers.buffer import AgentBuffer +from mlagents.trainers.torch_entities.components.reward_providers.base_reward_provider import ( + BaseRewardProvider, +) +from mlagents.trainers.settings import RNDSettings + +from mlagents_envs.base_env import BehaviorSpec +from mlagents_envs import logging_util +from mlagents.trainers.torch_entities.utils import ModelUtils +from mlagents.trainers.torch_entities.networks import NetworkBody +from mlagents.trainers.trajectory import ObsUtil + +logger = logging_util.get_logger(__name__) + + +class RNDRewardProvider(BaseRewardProvider): + """ + Implementation of Random Network Distillation : https://arxiv.org/pdf/1810.12894.pdf + """ + + def __init__(self, specs: BehaviorSpec, settings: RNDSettings) -> None: + super().__init__(specs, settings) + self._ignore_done = True + self._random_network = RNDNetwork(specs, settings) + self._training_network = RNDNetwork(specs, settings) + self.optimizer = torch.optim.Adam( + self._training_network.parameters(), lr=settings.learning_rate + ) + + def evaluate(self, mini_batch: AgentBuffer) -> np.ndarray: + with torch.no_grad(): + target = self._random_network(mini_batch) + prediction = self._training_network(mini_batch) + rewards = torch.sum((prediction - target) ** 2, dim=1) + return rewards.detach().cpu().numpy() + + def update(self, mini_batch: AgentBuffer) -> Dict[str, np.ndarray]: + with torch.no_grad(): + target = self._random_network(mini_batch) + prediction = self._training_network(mini_batch) + loss = torch.mean(torch.sum((prediction - target) ** 2, dim=1)) + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + return {"Losses/RND Loss": loss.detach().cpu().numpy()} + + def get_modules(self): + return { + f"Module:{self.name}-pred": self._training_network, + f"Module:{self.name}-target": self._random_network, + } + + +class RNDNetwork(torch.nn.Module): + EPSILON = 1e-10 + + def __init__(self, specs: BehaviorSpec, settings: RNDSettings) -> None: + super().__init__() + state_encoder_settings = settings.network_settings + if state_encoder_settings.memory is not None: + state_encoder_settings.memory = None + logger.warning( + "memory was specified in network_settings but is not supported by RND. It is being ignored." 
+ ) + + self._encoder = NetworkBody(specs.observation_specs, state_encoder_settings) + + def forward(self, mini_batch: AgentBuffer) -> torch.Tensor: + n_obs = len(self._encoder.processors) + np_obs = ObsUtil.from_buffer(mini_batch, n_obs) + # Convert to tensors + tensor_obs = [ModelUtils.list_to_tensor(obs) for obs in np_obs] + + hidden, _ = self._encoder.forward(tensor_obs) + self._encoder.update_normalization(mini_batch) + return hidden diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/conditioning.py b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/conditioning.py new file mode 100644 index 0000000000000000000000000000000000000000..65f622eba38846658f4bc44423883a28463777ff --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/conditioning.py @@ -0,0 +1,133 @@ +from mlagents.torch_utils import torch +from typing import List +import math + +from mlagents.trainers.torch_entities.layers import ( + linear_layer, + Swish, + Initialization, + LayerNorm, +) + + +class ConditionalEncoder(torch.nn.Module): + def __init__( + self, + input_size: int, + goal_size: int, + hidden_size: int, + num_layers: int, + num_conditional_layers: int, + kernel_init: Initialization = Initialization.KaimingHeNormal, + kernel_gain: float = 1.0, + ): + """ + ConditionalEncoder module. A fully connected network of which some of the + weights are generated by a goal conditioning. Uses the HyperNetwork module to + generate the weights of the network. Only the weights of the last + "num_conditional_layers" layers will be generated by HyperNetworks, the others + will use regular parameters. + :param input_size: The size of the input of the encoder + :param goal_size: The size of the goal tensor that will condition the encoder + :param hidden_size: The number of hidden units in the encoder + :param num_layers: The total number of layers of the encoder (both regular and + generated by HyperNetwork) + :param num_conditional_layers: The number of layers generated with hypernetworks + :param kernel_init: The Initialization to use for the weights of the layer + :param kernel_gain: The multiplier for the weights of the kernel. + """ + super().__init__() + layers: List[torch.nn.Module] = [] + prev_size = input_size + for i in range(num_layers): + if num_layers - i <= num_conditional_layers: + # This means layer i is a conditional layer since the conditional + # leyers are the last num_conditional_layers + layers.append( + HyperNetwork(prev_size, hidden_size, goal_size, hidden_size, 2) + ) + else: + layers.append( + linear_layer( + prev_size, + hidden_size, + kernel_init=kernel_init, + kernel_gain=kernel_gain, + ) + ) + layers.append(Swish()) + prev_size = hidden_size + self.layers = torch.nn.ModuleList(layers) + + def forward( + self, input_tensor: torch.Tensor, goal_tensor: torch.Tensor + ) -> torch.Tensor: # type: ignore + activation = input_tensor + for layer in self.layers: + if isinstance(layer, HyperNetwork): + activation = layer(activation, goal_tensor) + else: + activation = layer(activation) + return activation + + +class HyperNetwork(torch.nn.Module): + def __init__( + self, input_size, output_size, hyper_input_size, layer_size, num_layers + ): + """ + Hyper Network module. This module will use the hyper_input tensor to generate + the weights of the main network. The main network is a single fully connected + layer. 
+ :param input_size: The size of the input of the main network + :param output_size: The size of the output of the main network + :param hyper_input_size: The size of the input of the hypernetwork that will + generate the main network. + :param layer_size: The number of hidden units in the layers of the hypernetwork + :param num_layers: The number of layers of the hypernetwork + """ + super().__init__() + self.input_size = input_size + self.output_size = output_size + + layer_in_size = hyper_input_size + layers = [] + for _ in range(num_layers): + layers.append( + linear_layer( + layer_in_size, + layer_size, + kernel_init=Initialization.KaimingHeNormal, + kernel_gain=1.0, + bias_init=Initialization.Zero, + ) + ) + layers.append(Swish()) + layer_in_size = layer_size + flat_output = linear_layer( + layer_size, + input_size * output_size, + kernel_init=Initialization.KaimingHeNormal, + kernel_gain=0.1, + bias_init=Initialization.Zero, + ) + + # Re-initializing the weights of the last layer of the hypernetwork + bound = math.sqrt(1 / (layer_size * self.input_size)) + flat_output.weight.data.uniform_(-bound, bound) + + self.hypernet = torch.nn.Sequential(*layers, LayerNorm(), flat_output) + + # The hypernetwork will not generate the bias of the main network layer + self.bias = torch.nn.Parameter(torch.zeros(output_size)) + + def forward(self, input_activation, hyper_input): + output_weights = self.hypernet(hyper_input) + + output_weights = output_weights.view(-1, self.input_size, self.output_size) + + result = ( + torch.bmm(input_activation.unsqueeze(1), output_weights).squeeze(1) + + self.bias + ) + return result diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/decoders.py b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/decoders.py new file mode 100644 index 0000000000000000000000000000000000000000..30f196a455614754b627888ba7e9f380d53aff0d --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/decoders.py @@ -0,0 +1,22 @@ +from typing import List, Dict + +from mlagents.torch_utils import torch, nn +from mlagents.trainers.torch_entities.layers import linear_layer + + +class ValueHeads(nn.Module): + def __init__(self, stream_names: List[str], input_size: int, output_size: int = 1): + super().__init__() + self.stream_names = stream_names + _value_heads = {} + + for name in stream_names: + value = linear_layer(input_size, output_size) + _value_heads[name] = value + self.value_heads = nn.ModuleDict(_value_heads) + + def forward(self, hidden: torch.Tensor) -> Dict[str, torch.Tensor]: + value_outputs = {} + for stream_name, head in self.value_heads.items(): + value_outputs[stream_name] = head(hidden).squeeze(-1) + return value_outputs diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/distributions.py b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/distributions.py new file mode 100644 index 0000000000000000000000000000000000000000..47fd0d0847e2ca0b2a1b45030e2c07dab5861dc2 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/distributions.py @@ -0,0 +1,248 @@ +import abc +from typing import List +from mlagents.torch_utils import torch, nn +import numpy as np +import math +from mlagents.trainers.torch_entities.layers import linear_layer, Initialization + +EPSILON = 1e-7 # Small value to avoid divide by zero + + +class DistInstance(nn.Module, abc.ABC): + @abc.abstractmethod + def sample(self) -> torch.Tensor: + """ + Return a sample from this distribution. 
+ """ + pass + + @abc.abstractmethod + def deterministic_sample(self) -> torch.Tensor: + """ + Return the most probable sample from this distribution. + """ + pass + + @abc.abstractmethod + def log_prob(self, value: torch.Tensor) -> torch.Tensor: + """ + Returns the log probabilities of a particular value. + :param value: A value sampled from the distribution. + :returns: Log probabilities of the given value. + """ + pass + + @abc.abstractmethod + def entropy(self) -> torch.Tensor: + """ + Returns the entropy of this distribution. + """ + pass + + @abc.abstractmethod + def exported_model_output(self) -> torch.Tensor: + """ + Returns the tensor to be exported to ONNX for the distribution + """ + pass + + +class DiscreteDistInstance(DistInstance): + @abc.abstractmethod + def all_log_prob(self) -> torch.Tensor: + """ + Returns the log probabilities of all actions represented by this distribution. + """ + pass + + +class GaussianDistInstance(DistInstance): + def __init__(self, mean, std): + super().__init__() + self.mean = mean + self.std = std + + def sample(self): + sample = self.mean + torch.randn_like(self.mean) * self.std + return sample + + def deterministic_sample(self): + return self.mean + + def log_prob(self, value): + var = self.std**2 + log_scale = torch.log(self.std + EPSILON) + return ( + -((value - self.mean) ** 2) / (2 * var + EPSILON) + - log_scale + - math.log(math.sqrt(2 * math.pi)) + ) + + def pdf(self, value): + log_prob = self.log_prob(value) + return torch.exp(log_prob) + + def entropy(self): + return torch.mean( + 0.5 * torch.log(2 * math.pi * math.e * self.std**2 + EPSILON), + dim=1, + keepdim=True, + ) # Use equivalent behavior to TF + + def exported_model_output(self): + return self.sample() + + +class TanhGaussianDistInstance(GaussianDistInstance): + def __init__(self, mean, std): + super().__init__(mean, std) + self.transform = torch.distributions.transforms.TanhTransform(cache_size=1) + + def sample(self): + unsquashed_sample = super().sample() + squashed = self.transform(unsquashed_sample) + return squashed + + def _inverse_tanh(self, value): + capped_value = torch.clamp(value, -1 + EPSILON, 1 - EPSILON) + return 0.5 * torch.log((1 + capped_value) / (1 - capped_value) + EPSILON) + + def log_prob(self, value): + unsquashed = self.transform.inv(value) + return super().log_prob(unsquashed) - self.transform.log_abs_det_jacobian( + unsquashed, value + ) + + +class CategoricalDistInstance(DiscreteDistInstance): + def __init__(self, logits): + super().__init__() + self.logits = logits + self.probs = torch.softmax(self.logits, dim=-1) + + def sample(self): + return torch.multinomial(self.probs, 1) + + def deterministic_sample(self): + return torch.argmax(self.probs, dim=1, keepdim=True) + + def pdf(self, value): + # This function is equivalent to torch.diag(self.probs.T[value.flatten().long()]), + # but torch.diag is not supported by ONNX export. 
+ idx = torch.arange(start=0, end=len(value)).unsqueeze(-1) + return torch.gather( + self.probs.permute(1, 0)[value.flatten().long()], -1, idx + ).squeeze(-1) + + def log_prob(self, value): + return torch.log(self.pdf(value) + EPSILON) + + def all_log_prob(self): + return torch.log(self.probs + EPSILON) + + def entropy(self): + return -torch.sum( + self.probs * torch.log(self.probs + EPSILON), dim=-1 + ).unsqueeze(-1) + + def exported_model_output(self): + return self.sample() + + +class GaussianDistribution(nn.Module): + def __init__( + self, + hidden_size: int, + num_outputs: int, + conditional_sigma: bool = False, + tanh_squash: bool = False, + ): + super().__init__() + self.conditional_sigma = conditional_sigma + self.mu = linear_layer( + hidden_size, + num_outputs, + kernel_init=Initialization.KaimingHeNormal, + kernel_gain=0.2, + bias_init=Initialization.Zero, + ) + self.tanh_squash = tanh_squash + if conditional_sigma: + self.log_sigma = linear_layer( + hidden_size, + num_outputs, + kernel_init=Initialization.KaimingHeNormal, + kernel_gain=0.2, + bias_init=Initialization.Zero, + ) + else: + self.log_sigma = nn.Parameter( + torch.zeros(1, num_outputs, requires_grad=True) + ) + + def forward(self, inputs: torch.Tensor) -> List[DistInstance]: + mu = self.mu(inputs) + if self.conditional_sigma: + log_sigma = torch.clamp(self.log_sigma(inputs), min=-20, max=2) + else: + # Expand so that entropy matches batch size. Note that we're using + # mu*0 here to get the batch size implicitly since Barracuda 1.2.1 + # throws error on runtime broadcasting due to unknown reason. We + # use this to replace torch.expand() becuase it is not supported in + # the verified version of Barracuda (1.0.X). + log_sigma = mu * 0 + self.log_sigma + if self.tanh_squash: + return TanhGaussianDistInstance(mu, torch.exp(log_sigma)) + else: + return GaussianDistInstance(mu, torch.exp(log_sigma)) + + +class MultiCategoricalDistribution(nn.Module): + def __init__(self, hidden_size: int, act_sizes: List[int]): + super().__init__() + self.act_sizes = act_sizes + self.branches = self._create_policy_branches(hidden_size) + + def _create_policy_branches(self, hidden_size: int) -> nn.ModuleList: + branches = [] + for size in self.act_sizes: + branch_output_layer = linear_layer( + hidden_size, + size, + kernel_init=Initialization.KaimingHeNormal, + kernel_gain=0.1, + bias_init=Initialization.Zero, + ) + branches.append(branch_output_layer) + return nn.ModuleList(branches) + + def _mask_branch( + self, logits: torch.Tensor, allow_mask: torch.Tensor + ) -> torch.Tensor: + # Zero out masked logits, then subtract a large value. Technique mentionend here: + # https://arxiv.org/abs/2006.14171. Our implementation is ONNX and Barracuda-friendly. 
+ block_mask = -1.0 * allow_mask + 1.0 + # We do -1 * tensor + constant instead of constant - tensor because it seems + # Barracuda might swap the inputs of a "Sub" operation + logits = logits * allow_mask - 1e8 * block_mask + + return logits + + def _split_masks(self, masks: torch.Tensor) -> List[torch.Tensor]: + split_masks = [] + for idx, _ in enumerate(self.act_sizes): + start = int(np.sum(self.act_sizes[:idx])) + end = int(np.sum(self.act_sizes[: idx + 1])) + split_masks.append(masks[:, start:end]) + return split_masks + + def forward(self, inputs: torch.Tensor, masks: torch.Tensor) -> List[DistInstance]: + # Todo - Support multiple branches in mask code + branch_distributions = [] + masks = self._split_masks(masks) + for idx, branch in enumerate(self.branches): + logits = branch(inputs) + norm_logits = self._mask_branch(logits, masks[idx]) + distribution = CategoricalDistInstance(norm_logits) + branch_distributions.append(distribution) + return branch_distributions diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/encoders.py b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/encoders.py new file mode 100644 index 0000000000000000000000000000000000000000..32b944ddfc7e47c9203eaa70f3fd607855f1ec00 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/encoders.py @@ -0,0 +1,298 @@ +from typing import Tuple, Optional, Union + +from mlagents.trainers.torch_entities.layers import linear_layer, Initialization, Swish + +from mlagents.torch_utils import torch, nn +from mlagents.trainers.torch_entities.model_serialization import exporting_to_onnx + + +class Normalizer(nn.Module): + def __init__(self, vec_obs_size: int): + super().__init__() + self.register_buffer("normalization_steps", torch.tensor(1)) + self.register_buffer("running_mean", torch.zeros(vec_obs_size)) + self.register_buffer("running_variance", torch.ones(vec_obs_size)) + + def forward(self, inputs: torch.Tensor) -> torch.Tensor: + normalized_state = torch.clamp( + (inputs - self.running_mean) + / torch.sqrt(self.running_variance / self.normalization_steps), + -5, + 5, + ) + return normalized_state + + def update(self, vector_input: torch.Tensor) -> None: + with torch.no_grad(): + steps_increment = vector_input.size()[0] + total_new_steps = self.normalization_steps + steps_increment + + input_to_old_mean = vector_input - self.running_mean + new_mean: torch.Tensor = self.running_mean + ( + input_to_old_mean / total_new_steps + ).sum(0) + + input_to_new_mean = vector_input - new_mean + new_variance = self.running_variance + ( + input_to_new_mean * input_to_old_mean + ).sum(0) + # Update references. This is much faster than in-place data update. + self.running_mean: torch.Tensor = new_mean + self.running_variance: torch.Tensor = new_variance + self.normalization_steps: torch.Tensor = total_new_steps + + def copy_from(self, other_normalizer: "Normalizer") -> None: + self.normalization_steps.data.copy_(other_normalizer.normalization_steps.data) + self.running_mean.data.copy_(other_normalizer.running_mean.data) + self.running_variance.copy_(other_normalizer.running_variance.data) + + +def conv_output_shape( + h_w: Tuple[int, int], + kernel_size: Union[int, Tuple[int, int]] = 1, + stride: int = 1, + padding: int = 0, + dilation: int = 1, +) -> Tuple[int, int]: + """ + Calculates the output shape (height and width) of the output of a convolution layer. 
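# ---- Example sketch (standalone, arbitrary numbers): the branch-masking scheme
# above zeroes the logits of disallowed actions and shifts them by -1e8, so a
# softmax assigns them effectively zero probability while staying export-friendly:
import torch

logits = torch.tensor([[2.0, 0.5, -1.0]])
allow_mask = torch.tensor([[1.0, 0.0, 1.0]])  # middle action is masked out

block_mask = -1.0 * allow_mask + 1.0
masked_logits = logits * allow_mask - 1e8 * block_mask
probs = torch.softmax(masked_logits, dim=-1)
assert probs[0, 1].item() < 1e-6           # masked action gets ~0 probability
assert abs(probs.sum().item() - 1.0) < 1e-6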
+ kernel_size, stride, padding and dilation correspond to the inputs of the + torch.nn.Conv2d layer (https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html) + :param h_w: The height and width of the input. + :param kernel_size: The size of the kernel of the convolution (can be an int or a + tuple [width, height]) + :param stride: The stride of the convolution + :param padding: The padding of the convolution + :param dilation: The dilation of the convolution + """ + from math import floor + + if not isinstance(kernel_size, tuple): + kernel_size = (int(kernel_size), int(kernel_size)) + h = floor( + ((h_w[0] + (2 * padding) - (dilation * (kernel_size[0] - 1)) - 1) / stride) + 1 + ) + w = floor( + ((h_w[1] + (2 * padding) - (dilation * (kernel_size[1] - 1)) - 1) / stride) + 1 + ) + return h, w + + +def pool_out_shape(h_w: Tuple[int, int], kernel_size: int) -> Tuple[int, int]: + """ + Calculates the output shape (height and width) of the output of a max pooling layer. + kernel_size corresponds to the inputs of the + torch.nn.MaxPool2d layer (https://pytorch.org/docs/stable/generated/torch.nn.MaxPool2d.html) + :param kernel_size: The size of the kernel of the convolution + """ + height = (h_w[0] - kernel_size) // 2 + 1 + width = (h_w[1] - kernel_size) // 2 + 1 + return height, width + + +class VectorInput(nn.Module): + def __init__(self, input_size: int, normalize: bool = False): + super().__init__() + self.normalizer: Optional[Normalizer] = None + if normalize: + self.normalizer = Normalizer(input_size) + + def forward(self, inputs: torch.Tensor) -> torch.Tensor: + if self.normalizer is not None: + inputs = self.normalizer(inputs) + return inputs + + def copy_normalization(self, other_input: "VectorInput") -> None: + if self.normalizer is not None and other_input.normalizer is not None: + self.normalizer.copy_from(other_input.normalizer) + + def update_normalization(self, inputs: torch.Tensor) -> None: + if self.normalizer is not None: + self.normalizer.update(inputs) + + +class FullyConnectedVisualEncoder(nn.Module): + def __init__( + self, height: int, width: int, initial_channels: int, output_size: int + ): + super().__init__() + self.output_size = output_size + self.input_size = height * width * initial_channels + self.dense = nn.Sequential( + linear_layer( + self.input_size, + self.output_size, + kernel_init=Initialization.KaimingHeNormal, + kernel_gain=1.41, # Use ReLU gain + ), + nn.LeakyReLU(), + ) + + def forward(self, visual_obs: torch.Tensor) -> torch.Tensor: + if not exporting_to_onnx.is_exporting(): + visual_obs = visual_obs.permute([0, 3, 1, 2]) + hidden = visual_obs.reshape(-1, self.input_size) + return self.dense(hidden) + + +class SmallVisualEncoder(nn.Module): + """ + CNN architecture used by King in their Candy Crush predictor + https://www.researchgate.net/publication/328307928_Human-Like_Playtesting_with_Deep_Learning + """ + + def __init__( + self, height: int, width: int, initial_channels: int, output_size: int + ): + super().__init__() + self.h_size = output_size + conv_1_hw = conv_output_shape((height, width), 3, 1) + conv_2_hw = conv_output_shape(conv_1_hw, 3, 1) + self.final_flat = conv_2_hw[0] * conv_2_hw[1] * 144 + + self.conv_layers = nn.Sequential( + nn.Conv2d(initial_channels, 35, [3, 3], [1, 1]), + nn.LeakyReLU(), + nn.Conv2d(35, 144, [3, 3], [1, 1]), + nn.LeakyReLU(), + ) + self.dense = nn.Sequential( + linear_layer( + self.final_flat, + self.h_size, + kernel_init=Initialization.KaimingHeNormal, + kernel_gain=1.41, # Use ReLU gain + ), + nn.LeakyReLU(), 
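# ---- Example sketch (standalone, plain torch): conv_output_shape above mirrors the
# standard Conv2d size formula. Checking it against an actual nn.Conv2d with
# kernel 8, stride 4, no padding on an 84x84 input (which should give 20x20):
import math
import torch
from torch import nn

h_w, kernel, stride, padding, dilation = (84, 84), 8, 4, 0, 1
h = math.floor((h_w[0] + 2 * padding - dilation * (kernel - 1) - 1) / stride + 1)
w = math.floor((h_w[1] + 2 * padding - dilation * (kernel - 1) - 1) / stride + 1)

out = nn.Conv2d(3, 16, kernel_size=8, stride=4)(torch.zeros(1, 3, 84, 84))
assert (h, w) == (20, 20) == tuple(out.shape[-2:])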
+ ) + + def forward(self, visual_obs: torch.Tensor) -> torch.Tensor: + if not exporting_to_onnx.is_exporting(): + visual_obs = visual_obs.permute([0, 3, 1, 2]) + hidden = self.conv_layers(visual_obs) + hidden = hidden.reshape(-1, self.final_flat) + return self.dense(hidden) + + +class SimpleVisualEncoder(nn.Module): + def __init__( + self, height: int, width: int, initial_channels: int, output_size: int + ): + super().__init__() + self.h_size = output_size + conv_1_hw = conv_output_shape((height, width), 8, 4) + conv_2_hw = conv_output_shape(conv_1_hw, 4, 2) + self.final_flat = conv_2_hw[0] * conv_2_hw[1] * 32 + + self.conv_layers = nn.Sequential( + nn.Conv2d(initial_channels, 16, [8, 8], [4, 4]), + nn.LeakyReLU(), + nn.Conv2d(16, 32, [4, 4], [2, 2]), + nn.LeakyReLU(), + ) + self.dense = nn.Sequential( + linear_layer( + self.final_flat, + self.h_size, + kernel_init=Initialization.KaimingHeNormal, + kernel_gain=1.41, # Use ReLU gain + ), + nn.LeakyReLU(), + ) + + def forward(self, visual_obs: torch.Tensor) -> torch.Tensor: + if not exporting_to_onnx.is_exporting(): + visual_obs = visual_obs.permute([0, 3, 1, 2]) + hidden = self.conv_layers(visual_obs) + hidden = hidden.reshape(-1, self.final_flat) + return self.dense(hidden) + + +class NatureVisualEncoder(nn.Module): + def __init__( + self, height: int, width: int, initial_channels: int, output_size: int + ): + super().__init__() + self.h_size = output_size + conv_1_hw = conv_output_shape((height, width), 8, 4) + conv_2_hw = conv_output_shape(conv_1_hw, 4, 2) + conv_3_hw = conv_output_shape(conv_2_hw, 3, 1) + self.final_flat = conv_3_hw[0] * conv_3_hw[1] * 64 + + self.conv_layers = nn.Sequential( + nn.Conv2d(initial_channels, 32, [8, 8], [4, 4]), + nn.LeakyReLU(), + nn.Conv2d(32, 64, [4, 4], [2, 2]), + nn.LeakyReLU(), + nn.Conv2d(64, 64, [3, 3], [1, 1]), + nn.LeakyReLU(), + ) + self.dense = nn.Sequential( + linear_layer( + self.final_flat, + self.h_size, + kernel_init=Initialization.KaimingHeNormal, + kernel_gain=1.41, # Use ReLU gain + ), + nn.LeakyReLU(), + ) + + def forward(self, visual_obs: torch.Tensor) -> torch.Tensor: + if not exporting_to_onnx.is_exporting(): + visual_obs = visual_obs.permute([0, 3, 1, 2]) + hidden = self.conv_layers(visual_obs) + hidden = hidden.reshape([-1, self.final_flat]) + return self.dense(hidden) + + +class ResNetBlock(nn.Module): + def __init__(self, channel: int): + """ + Creates a ResNet Block. 
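# ---- Example sketch (standalone, plain torch, assuming a classic 84x84x3 input):
# for that input size the Nature CNN stack above flattens to 7 * 7 * 64 = 3136
# features before the dense layer. Reproducing just the shape arithmetic:
import torch
from torch import nn

convs = nn.Sequential(
    nn.Conv2d(3, 32, 8, 4), nn.LeakyReLU(),
    nn.Conv2d(32, 64, 4, 2), nn.LeakyReLU(),
    nn.Conv2d(64, 64, 3, 1), nn.LeakyReLU(),
)
hidden = convs(torch.zeros(1, 3, 84, 84))
assert hidden.shape == (1, 64, 7, 7)
assert hidden.reshape(1, -1).shape[1] == 3136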
+ :param channel: The number of channels in the input (and output) tensors of the + convolutions + """ + super().__init__() + self.layers = nn.Sequential( + Swish(), + nn.Conv2d(channel, channel, [3, 3], [1, 1], padding=1), + Swish(), + nn.Conv2d(channel, channel, [3, 3], [1, 1], padding=1), + ) + + def forward(self, input_tensor: torch.Tensor) -> torch.Tensor: + return input_tensor + self.layers(input_tensor) + + +class ResNetVisualEncoder(nn.Module): + def __init__( + self, height: int, width: int, initial_channels: int, output_size: int + ): + super().__init__() + n_channels = [16, 32, 32] # channel for each stack + n_blocks = 2 # number of residual blocks + layers = [] + last_channel = initial_channels + for _, channel in enumerate(n_channels): + layers.append(nn.Conv2d(last_channel, channel, [3, 3], [1, 1], padding=1)) + layers.append(nn.MaxPool2d([3, 3], [2, 2])) + height, width = pool_out_shape((height, width), 3) + for _ in range(n_blocks): + layers.append(ResNetBlock(channel)) + last_channel = channel + layers.append(Swish()) + self.final_flat_size = n_channels[-1] * height * width + self.dense = linear_layer( + self.final_flat_size, + output_size, + kernel_init=Initialization.KaimingHeNormal, + kernel_gain=1.41, # Use ReLU gain + ) + self.sequential = nn.Sequential(*layers) + + def forward(self, visual_obs: torch.Tensor) -> torch.Tensor: + if not exporting_to_onnx.is_exporting(): + visual_obs = visual_obs.permute([0, 3, 1, 2]) + hidden = self.sequential(visual_obs) + before_out = hidden.reshape(-1, self.final_flat_size) + return torch.relu(self.dense(before_out)) diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/layers.py b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..e5a598edf64baa072941acb6a917d9dccef179a5 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/layers.py @@ -0,0 +1,225 @@ +from mlagents.torch_utils import torch +import abc +from typing import Tuple +from enum import Enum +from mlagents.trainers.torch_entities.model_serialization import exporting_to_onnx + + +class Swish(torch.nn.Module): + def forward(self, data: torch.Tensor) -> torch.Tensor: + return torch.mul(data, torch.sigmoid(data)) + + +class Initialization(Enum): + Zero = 0 + XavierGlorotNormal = 1 + XavierGlorotUniform = 2 + KaimingHeNormal = 3 # also known as Variance scaling + KaimingHeUniform = 4 + Normal = 5 + + +_init_methods = { + Initialization.Zero: torch.zero_, + Initialization.XavierGlorotNormal: torch.nn.init.xavier_normal_, + Initialization.XavierGlorotUniform: torch.nn.init.xavier_uniform_, + Initialization.KaimingHeNormal: torch.nn.init.kaiming_normal_, + Initialization.KaimingHeUniform: torch.nn.init.kaiming_uniform_, + Initialization.Normal: torch.nn.init.normal_, +} + + +def linear_layer( + input_size: int, + output_size: int, + kernel_init: Initialization = Initialization.XavierGlorotUniform, + kernel_gain: float = 1.0, + bias_init: Initialization = Initialization.Zero, +) -> torch.nn.Module: + """ + Creates a torch.nn.Linear module and initializes its weights. + :param input_size: The size of the input tensor + :param output_size: The size of the output tensor + :param kernel_init: The Initialization to use for the weights of the layer + :param kernel_gain: The multiplier for the weights of the kernel. Note that in + TensorFlow, the gain is square-rooted. 
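# ---- Example sketch (standalone): the Swish activation above is x * sigmoid(x),
# which recent PyTorch releases also expose as F.silu; a quick equivalence check:
import torch
import torch.nn.functional as F

x = torch.randn(4, 8)
swish = torch.mul(x, torch.sigmoid(x))
assert torch.allclose(swish, F.silu(x), atol=1e-6)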
Therefore calling with scale 0.01 is equivalent to calling + KaimingHeNormal with kernel_gain of 0.1 + :param bias_init: The Initialization to use for the weights of the bias layer + """ + layer = torch.nn.Linear(input_size, output_size) + if ( + kernel_init == Initialization.KaimingHeNormal + or kernel_init == Initialization.KaimingHeUniform + ): + _init_methods[kernel_init](layer.weight.data, nonlinearity="linear") + else: + _init_methods[kernel_init](layer.weight.data) + layer.weight.data *= kernel_gain + _init_methods[bias_init](layer.bias.data) + return layer + + +def lstm_layer( + input_size: int, + hidden_size: int, + num_layers: int = 1, + batch_first: bool = True, + forget_bias: float = 1.0, + kernel_init: Initialization = Initialization.XavierGlorotUniform, + bias_init: Initialization = Initialization.Zero, +) -> torch.nn.Module: + """ + Creates a torch.nn.LSTM and initializes its weights and biases. Provides a + forget_bias offset like is done in TensorFlow. + """ + lstm = torch.nn.LSTM(input_size, hidden_size, num_layers, batch_first=batch_first) + # Add forget_bias to forget gate bias + for name, param in lstm.named_parameters(): + # Each weight and bias is a concatenation of 4 matrices + if "weight" in name: + for idx in range(4): + block_size = param.shape[0] // 4 + _init_methods[kernel_init]( + param.data[idx * block_size : (idx + 1) * block_size] + ) + if "bias" in name: + for idx in range(4): + block_size = param.shape[0] // 4 + _init_methods[bias_init]( + param.data[idx * block_size : (idx + 1) * block_size] + ) + if idx == 1: + param.data[idx * block_size : (idx + 1) * block_size].add_( + forget_bias + ) + return lstm + + +class MemoryModule(torch.nn.Module): + @abc.abstractproperty + def memory_size(self) -> int: + """ + Size of memory that is required at the start of a sequence. + """ + pass + + @abc.abstractmethod + def forward( + self, input_tensor: torch.Tensor, memories: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Pass a sequence to the memory module. + :input_tensor: Tensor of shape (batch_size, seq_length, size) that represents the input. + :memories: Tensor of initial memories. + :return: Tuple of output, final memories. + """ + pass + + +class LayerNorm(torch.nn.Module): + """ + A vanilla implementation of layer normalization https://arxiv.org/pdf/1607.06450.pdf + norm_x = (x - mean) / sqrt((x - mean) ^ 2) + This does not include the trainable parameters gamma and beta for performance speed. + Typically, this is norm_x * gamma + beta + """ + + def forward(self, layer_activations: torch.Tensor) -> torch.Tensor: + mean = torch.mean(layer_activations, dim=-1, keepdim=True) + var = torch.mean((layer_activations - mean) ** 2, dim=-1, keepdim=True) + return (layer_activations - mean) / (torch.sqrt(var + 1e-5)) + + +class LinearEncoder(torch.nn.Module): + """ + Linear layers. 
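# ---- Example sketch (standalone, arbitrary input): the hand-rolled LayerNorm above
# (biased variance, eps 1e-5, no learnable gamma/beta) matches F.layer_norm when
# weight and bias are omitted:
import torch
import torch.nn.functional as F

x = torch.randn(2, 5)
mean = torch.mean(x, dim=-1, keepdim=True)
var = torch.mean((x - mean) ** 2, dim=-1, keepdim=True)
manual = (x - mean) / torch.sqrt(var + 1e-5)
assert torch.allclose(manual, F.layer_norm(x, normalized_shape=(5,)), atol=1e-5)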
+ """ + + def __init__( + self, + input_size: int, + num_layers: int, + hidden_size: int, + kernel_init: Initialization = Initialization.KaimingHeNormal, + kernel_gain: float = 1.0, + ): + super().__init__() + self.layers = [ + linear_layer( + input_size, + hidden_size, + kernel_init=kernel_init, + kernel_gain=kernel_gain, + ) + ] + self.layers.append(Swish()) + for _ in range(num_layers - 1): + self.layers.append( + linear_layer( + hidden_size, + hidden_size, + kernel_init=kernel_init, + kernel_gain=kernel_gain, + ) + ) + self.layers.append(Swish()) + self.seq_layers = torch.nn.Sequential(*self.layers) + + def forward(self, input_tensor: torch.Tensor) -> torch.Tensor: + return self.seq_layers(input_tensor) + + +class LSTM(MemoryModule): + """ + Memory module that implements LSTM. + """ + + def __init__( + self, + input_size: int, + memory_size: int, + num_layers: int = 1, + forget_bias: float = 1.0, + kernel_init: Initialization = Initialization.XavierGlorotUniform, + bias_init: Initialization = Initialization.Zero, + ): + super().__init__() + # We set hidden size to half of memory_size since the initial memory + # will be divided between the hidden state and initial cell state. + self.hidden_size = memory_size // 2 + self.lstm = lstm_layer( + input_size, + self.hidden_size, + num_layers, + True, + forget_bias, + kernel_init, + bias_init, + ) + + @property + def memory_size(self) -> int: + return 2 * self.hidden_size + + def forward( + self, input_tensor: torch.Tensor, memories: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + + if exporting_to_onnx.is_exporting(): + # This transpose is needed both at input and output of the LSTM when + # exporting because ONNX will expect (sequence_len, batch, memory_size) + # instead of (batch, sequence_len, memory_size) + memories = torch.transpose(memories, 0, 1) + + # We don't use torch.split here since it is not supported by Barracuda + h0 = memories[:, :, : self.hidden_size].contiguous() + c0 = memories[:, :, self.hidden_size :].contiguous() + + hidden = (h0, c0) + lstm_out, hidden_out = self.lstm(input_tensor, hidden) + output_mem = torch.cat(hidden_out, dim=-1) + + if exporting_to_onnx.is_exporting(): + output_mem = torch.transpose(output_mem, 0, 1) + + return lstm_out, output_mem diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/model_serialization.py b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/model_serialization.py new file mode 100644 index 0000000000000000000000000000000000000000..f204b52445e5247bc8b921a4eb4207218a9d2747 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/model_serialization.py @@ -0,0 +1,173 @@ +from typing import Tuple +import threading +from mlagents.torch_utils import torch + +from mlagents_envs.logging_util import get_logger +from mlagents.trainers.settings import SerializationSettings + + +logger = get_logger(__name__) + + +class exporting_to_onnx: + """ + Set this context by calling + ``` + with exporting_to_onnx(): + ``` + Within this context, the variable exporting_to_onnx.is_exporting() will be true. + This implementation is thread safe. 
+ """ + + # local is_exporting flag for each thread + _local_data = threading.local() + _local_data._is_exporting = False + + # global lock shared among all threads, to make sure only one thread is exporting at a time + _lock = threading.Lock() + + def __enter__(self): + self._lock.acquire() + self._local_data._is_exporting = True + + def __exit__(self, *args): + self._local_data._is_exporting = False + self._lock.release() + + @staticmethod + def is_exporting(): + if not hasattr(exporting_to_onnx._local_data, "_is_exporting"): + return False + return exporting_to_onnx._local_data._is_exporting + + +class TensorNames: + batch_size_placeholder = "batch_size" + sequence_length_placeholder = "sequence_length" + vector_observation_placeholder = "vector_observation" + recurrent_in_placeholder = "recurrent_in" + visual_observation_placeholder_prefix = "visual_observation_" + observation_placeholder_prefix = "obs_" + previous_action_placeholder = "prev_action" + action_mask_placeholder = "action_masks" + random_normal_epsilon_placeholder = "epsilon" + + value_estimate_output = "value_estimate" + recurrent_output = "recurrent_out" + memory_size = "memory_size" + version_number = "version_number" + + continuous_action_output_shape = "continuous_action_output_shape" + discrete_action_output_shape = "discrete_action_output_shape" + continuous_action_output = "continuous_actions" + discrete_action_output = "discrete_actions" + deterministic_continuous_action_output = "deterministic_continuous_actions" + deterministic_discrete_action_output = "deterministic_discrete_actions" + + # Deprecated TensorNames entries for backward compatibility + is_continuous_control_deprecated = "is_continuous_control" + action_output_deprecated = "action" + action_output_shape_deprecated = "action_output_shape" + + @staticmethod + def get_visual_observation_name(index: int) -> str: + """ + Returns the name of the visual observation with a given index + """ + return TensorNames.visual_observation_placeholder_prefix + str(index) + + @staticmethod + def get_observation_name(index: int) -> str: + """ + Returns the name of the observation with a given index + """ + return TensorNames.observation_placeholder_prefix + str(index) + + +class ModelSerializer: + def __init__(self, policy): + # ONNX only support input in NCHW (channel first) format. + # Barracuda also expect to get data in NCHW. + # Any multi-dimentional input should follow that otherwise will + # cause problem to barracuda import. 
+ self.policy = policy + observation_specs = self.policy.behavior_spec.observation_specs + batch_dim = [1] + seq_len_dim = [1] + num_obs = len(observation_specs) + + dummy_obs = [ + torch.zeros( + batch_dim + list(ModelSerializer._get_onnx_shape(obs_spec.shape)) + ) + for obs_spec in observation_specs + ] + + dummy_masks = torch.ones( + batch_dim + [sum(self.policy.behavior_spec.action_spec.discrete_branches)] + ) + dummy_memories = torch.zeros( + batch_dim + seq_len_dim + [self.policy.export_memory_size] + ) + + self.dummy_input = (dummy_obs, dummy_masks, dummy_memories) + + self.input_names = [TensorNames.get_observation_name(i) for i in range(num_obs)] + self.input_names += [ + TensorNames.action_mask_placeholder, + TensorNames.recurrent_in_placeholder, + ] + + self.dynamic_axes = {name: {0: "batch"} for name in self.input_names} + + self.output_names = [TensorNames.version_number, TensorNames.memory_size] + if self.policy.behavior_spec.action_spec.continuous_size > 0: + self.output_names += [ + TensorNames.continuous_action_output, + TensorNames.continuous_action_output_shape, + TensorNames.deterministic_continuous_action_output, + ] + self.dynamic_axes.update( + {TensorNames.continuous_action_output: {0: "batch"}} + ) + if self.policy.behavior_spec.action_spec.discrete_size > 0: + self.output_names += [ + TensorNames.discrete_action_output, + TensorNames.discrete_action_output_shape, + TensorNames.deterministic_discrete_action_output, + ] + self.dynamic_axes.update({TensorNames.discrete_action_output: {0: "batch"}}) + + if self.policy.export_memory_size > 0: + self.output_names += [TensorNames.recurrent_output] + + @staticmethod + def _get_onnx_shape(shape: Tuple[int, ...]) -> Tuple[int, ...]: + """ + Converts the shape of an observation to be compatible with the NCHW format + of ONNX + """ + if len(shape) == 3: + return shape[2], shape[0], shape[1] + return shape + + def export_policy_model(self, output_filepath: str) -> None: + """ + Exports a Torch model for a Policy to .onnx format for Unity embedding. 
+ + :param output_filepath: file path to output the model (without file suffix) + """ + onnx_output_path = f"{output_filepath}.onnx" + logger.debug(f"Converting to {onnx_output_path}") + + with exporting_to_onnx(): + torch.onnx.export( + self.policy.actor, + self.dummy_input, + onnx_output_path, + opset_version=SerializationSettings.onnx_opset, + input_names=self.input_names, + output_names=self.output_names, + dynamic_axes=self.dynamic_axes, + ) + logger.info(f"Exported {onnx_output_path}") diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/networks.py b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/networks.py new file mode 100644 index 0000000000000000000000000000000000000000..555268075c90d6e111f5511f16290fd632be88b3 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/networks.py @@ -0,0 +1,767 @@ +from typing import Callable, List, Dict, Tuple, Optional, Union, Any +import abc + +from mlagents.torch_utils import torch, nn + +from mlagents_envs.base_env import ActionSpec, ObservationSpec, ObservationType +from mlagents.trainers.torch_entities.action_model import ActionModel +from mlagents.trainers.torch_entities.agent_action import AgentAction +from mlagents.trainers.settings import NetworkSettings, EncoderType, ConditioningType +from mlagents.trainers.torch_entities.utils import ModelUtils +from mlagents.trainers.torch_entities.decoders import ValueHeads +from mlagents.trainers.torch_entities.layers import LSTM, LinearEncoder +from mlagents.trainers.torch_entities.encoders import VectorInput +from mlagents.trainers.buffer import AgentBuffer +from mlagents.trainers.trajectory import ObsUtil +from mlagents.trainers.torch_entities.conditioning import ConditionalEncoder +from mlagents.trainers.torch_entities.attention import ( + EntityEmbedding, + ResidualSelfAttention, + get_zero_entities_mask, +) +from mlagents.trainers.exception import UnityTrainerException + + +ActivationFunction = Callable[[torch.Tensor], torch.Tensor] +EncoderFunction = Callable[ + [torch.Tensor, int, ActivationFunction, int, str, bool], torch.Tensor +] + +EPSILON = 1e-7 + + +class ObservationEncoder(nn.Module): + ATTENTION_EMBEDDING_SIZE = 128 # The embedding size of attention is fixed + + def __init__( + self, + observation_specs: List[ObservationSpec], + h_size: int, + vis_encode_type: EncoderType, + normalize: bool = False, + ): + """ + Returns an ObservationEncoder that can process and encode a set of observations. + Will use an RSA if needed for variable length observations. + """ + super().__init__() + self.processors, self.embedding_sizes = ModelUtils.create_input_processors( + observation_specs, + h_size, + vis_encode_type, + self.ATTENTION_EMBEDDING_SIZE, + normalize=normalize, + ) + self.rsa, self.x_self_encoder = ModelUtils.create_residual_self_attention( + self.processors, self.embedding_sizes, self.ATTENTION_EMBEDDING_SIZE + ) + if self.rsa is not None: + total_enc_size = sum(self.embedding_sizes) + self.ATTENTION_EMBEDDING_SIZE + else: + total_enc_size = sum(self.embedding_sizes) + self.normalize = normalize + self._total_enc_size = total_enc_size + + self._total_goal_enc_size = 0 + self._goal_processor_indices: List[int] = [] + for i in range(len(observation_specs)): + if observation_specs[i].observation_type == ObservationType.GOAL_SIGNAL: + self._total_goal_enc_size += self.embedding_sizes[i] + self._goal_processor_indices.append(i) + + @property + def total_enc_size(self) -> int: + """ + Returns the total encoding size for this ObservationEncoder. 
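# ---- Example sketch (standalone): the export call above follows the standard
# torch.onnx.export pattern -- dummy inputs, explicit input/output names, and
# dynamic batch axes. A miniature with a toy model; the file path, tensor names,
# and opset below are arbitrary choices for illustration, not the trainer's settings:
import os
import tempfile
import torch
from torch import nn

toy_model = nn.Linear(4, 2)
dummy_input = torch.zeros(1, 4)
out_path = os.path.join(tempfile.mkdtemp(), "toy_policy.onnx")

torch.onnx.export(
    toy_model,
    dummy_input,
    out_path,
    opset_version=11,
    input_names=["obs_0"],
    output_names=["continuous_actions"],
    dynamic_axes={"obs_0": {0: "batch"}, "continuous_actions": {0: "batch"}},
)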
+ """ + return self._total_enc_size + + @property + def total_goal_enc_size(self) -> int: + """ + Returns the total goal encoding size for this ObservationEncoder. + """ + return self._total_goal_enc_size + + def update_normalization(self, buffer: AgentBuffer) -> None: + obs = ObsUtil.from_buffer(buffer, len(self.processors)) + for vec_input, enc in zip(obs, self.processors): + if isinstance(enc, VectorInput): + enc.update_normalization(torch.as_tensor(vec_input.to_ndarray())) + + def copy_normalization(self, other_encoder: "ObservationEncoder") -> None: + if self.normalize: + for n1, n2 in zip(self.processors, other_encoder.processors): + if isinstance(n1, VectorInput) and isinstance(n2, VectorInput): + n1.copy_normalization(n2) + + def forward(self, inputs: List[torch.Tensor]) -> torch.Tensor: + """ + Encode observations using a list of processors and an RSA. + :param inputs: List of Tensors corresponding to a set of obs. + """ + encodes = [] + var_len_processor_inputs: List[Tuple[nn.Module, torch.Tensor]] = [] + + for idx, processor in enumerate(self.processors): + if not isinstance(processor, EntityEmbedding): + # The input can be encoded without having to process other inputs + obs_input = inputs[idx] + processed_obs = processor(obs_input) + encodes.append(processed_obs) + else: + var_len_processor_inputs.append((processor, inputs[idx])) + if len(encodes) != 0: + encoded_self = torch.cat(encodes, dim=1) + input_exist = True + else: + input_exist = False + if len(var_len_processor_inputs) > 0 and self.rsa is not None: + # Some inputs need to be processed with a variable length encoder + masks = get_zero_entities_mask([p_i[1] for p_i in var_len_processor_inputs]) + embeddings: List[torch.Tensor] = [] + processed_self = ( + self.x_self_encoder(encoded_self) + if input_exist and self.x_self_encoder is not None + else None + ) + for processor, var_len_input in var_len_processor_inputs: + embeddings.append(processor(processed_self, var_len_input)) + qkv = torch.cat(embeddings, dim=1) + attention_embedding = self.rsa(qkv, masks) + if not input_exist: + encoded_self = torch.cat([attention_embedding], dim=1) + input_exist = True + else: + encoded_self = torch.cat([encoded_self, attention_embedding], dim=1) + + if not input_exist: + raise UnityTrainerException( + "The trainer was unable to process any of the provided inputs. " + "Make sure the trained agents has at least one sensor attached to them." + ) + + return encoded_self + + def get_goal_encoding(self, inputs: List[torch.Tensor]) -> torch.Tensor: + """ + Encode observations corresponding to goals using a list of processors. + :param inputs: List of Tensors corresponding to a set of obs. + """ + encodes = [] + for idx in self._goal_processor_indices: + processor = self.processors[idx] + if not isinstance(processor, EntityEmbedding): + # The input can be encoded without having to process other inputs + obs_input = inputs[idx] + processed_obs = processor(obs_input) + encodes.append(processed_obs) + else: + raise UnityTrainerException( + "The one of the goals uses variable length observations. This use " + "case is not supported." + ) + if len(encodes) != 0: + encoded = torch.cat(encodes, dim=1) + else: + raise UnityTrainerException( + "Trainer was unable to process any of the goals provided as input." 
+ ) + return encoded + + +class NetworkBody(nn.Module): + def __init__( + self, + observation_specs: List[ObservationSpec], + network_settings: NetworkSettings, + encoded_act_size: int = 0, + ): + super().__init__() + self.normalize = network_settings.normalize + self.use_lstm = network_settings.memory is not None + self.h_size = network_settings.hidden_units + self.m_size = ( + network_settings.memory.memory_size + if network_settings.memory is not None + else 0 + ) + self.observation_encoder = ObservationEncoder( + observation_specs, + self.h_size, + network_settings.vis_encode_type, + self.normalize, + ) + self.processors = self.observation_encoder.processors + total_enc_size = self.observation_encoder.total_enc_size + total_enc_size += encoded_act_size + + if ( + self.observation_encoder.total_goal_enc_size > 0 + and network_settings.goal_conditioning_type == ConditioningType.HYPER + ): + self._body_endoder = ConditionalEncoder( + total_enc_size, + self.observation_encoder.total_goal_enc_size, + self.h_size, + network_settings.num_layers, + 1, + ) + else: + self._body_endoder = LinearEncoder( + total_enc_size, network_settings.num_layers, self.h_size + ) + + if self.use_lstm: + self.lstm = LSTM(self.h_size, self.m_size) + else: + self.lstm = None # type: ignore + + def update_normalization(self, buffer: AgentBuffer) -> None: + self.observation_encoder.update_normalization(buffer) + + def copy_normalization(self, other_network: "NetworkBody") -> None: + self.observation_encoder.copy_normalization(other_network.observation_encoder) + + @property + def memory_size(self) -> int: + return self.lstm.memory_size if self.use_lstm else 0 + + def forward( + self, + inputs: List[torch.Tensor], + actions: Optional[torch.Tensor] = None, + memories: Optional[torch.Tensor] = None, + sequence_length: int = 1, + ) -> Tuple[torch.Tensor, torch.Tensor]: + encoded_self = self.observation_encoder(inputs) + if actions is not None: + encoded_self = torch.cat([encoded_self, actions], dim=1) + if isinstance(self._body_endoder, ConditionalEncoder): + goal = self.observation_encoder.get_goal_encoding(inputs) + encoding = self._body_endoder(encoded_self, goal) + else: + encoding = self._body_endoder(encoded_self) + + if self.use_lstm: + # Resize to (batch, sequence length, encoding size) + encoding = encoding.reshape([-1, sequence_length, self.h_size]) + encoding, memories = self.lstm(encoding, memories) + encoding = encoding.reshape([-1, self.m_size // 2]) + return encoding, memories + + +class MultiAgentNetworkBody(torch.nn.Module): + """ + A network body that uses a self attention layer to handle state + and action input from a potentially variable number of agents that + share the same observation and action space. 
+ """ + + def __init__( + self, + observation_specs: List[ObservationSpec], + network_settings: NetworkSettings, + action_spec: ActionSpec, + ): + super().__init__() + self.normalize = network_settings.normalize + self.use_lstm = network_settings.memory is not None + self.h_size = network_settings.hidden_units + self.m_size = ( + network_settings.memory.memory_size + if network_settings.memory is not None + else 0 + ) + self.action_spec = action_spec + self.observation_encoder = ObservationEncoder( + observation_specs, + self.h_size, + network_settings.vis_encode_type, + self.normalize, + ) + self.processors = self.observation_encoder.processors + + # Modules for multi-agent self-attention + obs_only_ent_size = self.observation_encoder.total_enc_size + q_ent_size = ( + obs_only_ent_size + + sum(self.action_spec.discrete_branches) + + self.action_spec.continuous_size + ) + + attention_embeding_size = self.h_size + self.obs_encoder = EntityEmbedding( + obs_only_ent_size, None, attention_embeding_size + ) + self.obs_action_encoder = EntityEmbedding( + q_ent_size, None, attention_embeding_size + ) + + self.self_attn = ResidualSelfAttention(attention_embeding_size) + + self.linear_encoder = LinearEncoder( + attention_embeding_size, + network_settings.num_layers, + self.h_size, + kernel_gain=(0.125 / self.h_size) ** 0.5, + ) + + if self.use_lstm: + self.lstm = LSTM(self.h_size, self.m_size) + else: + self.lstm = None # type: ignore + self._current_max_agents = torch.nn.Parameter( + torch.as_tensor(1), requires_grad=False + ) + + @property + def memory_size(self) -> int: + return self.lstm.memory_size if self.use_lstm else 0 + + def update_normalization(self, buffer: AgentBuffer) -> None: + self.observation_encoder.update_normalization(buffer) + + def copy_normalization(self, other_network: "MultiAgentNetworkBody") -> None: + self.observation_encoder.copy_normalization(other_network.observation_encoder) + + def _get_masks_from_nans(self, obs_tensors: List[torch.Tensor]) -> torch.Tensor: + """ + Get attention masks by grabbing an arbitrary obs across all the agents + Since these are raw obs, the padded values are still NaN + """ + only_first_obs = [_all_obs[0] for _all_obs in obs_tensors] + # Just get the first element in each obs regardless of its dimension. This will speed up + # searching for NaNs. + only_first_obs_flat = torch.stack( + [_obs.flatten(start_dim=1)[:, 0] for _obs in only_first_obs], dim=1 + ) + # Get the mask from NaNs + attn_mask = only_first_obs_flat.isnan().float() + return attn_mask + + def _copy_and_remove_nans_from_obs( + self, all_obs: List[List[torch.Tensor]], attention_mask: torch.Tensor + ) -> List[List[torch.Tensor]]: + """ + Helper function to remove NaNs from observations using an attention mask. + """ + obs_with_no_nans = [] + for i_agent, single_agent_obs in enumerate(all_obs): + no_nan_obs = [] + for obs in single_agent_obs: + new_obs = obs.clone() + new_obs[attention_mask.bool()[:, i_agent], ::] = 0.0 # Remove NaNs fast + no_nan_obs.append(new_obs) + obs_with_no_nans.append(no_nan_obs) + return obs_with_no_nans + + def forward( + self, + obs_only: List[List[torch.Tensor]], + obs: List[List[torch.Tensor]], + actions: List[AgentAction], + memories: Optional[torch.Tensor] = None, + sequence_length: int = 1, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Returns sampled actions. + If memory is enabled, return the memories as well. + :param obs_only: Observations to be processed that do not have corresponding actions. + These are encoded with the obs_encoder. 
+ :param obs: Observations to be processed that do have corresponding actions. + After concatenation with actions, these are processed with obs_action_encoder. + :param actions: After concatenation with obs, these are processed with obs_action_encoder. + :param memories: If using memory, a Tensor of initial memories. + :param sequence_length: If using memory, the sequence length. + """ + self_attn_masks = [] + self_attn_inputs = [] + concat_f_inp = [] + if obs: + obs_attn_mask = self._get_masks_from_nans(obs) + obs = self._copy_and_remove_nans_from_obs(obs, obs_attn_mask) + for inputs, action in zip(obs, actions): + encoded = self.observation_encoder(inputs) + cat_encodes = [ + encoded, + action.to_flat(self.action_spec.discrete_branches), + ] + concat_f_inp.append(torch.cat(cat_encodes, dim=1)) + f_inp = torch.stack(concat_f_inp, dim=1) + self_attn_masks.append(obs_attn_mask) + self_attn_inputs.append(self.obs_action_encoder(None, f_inp)) + + concat_encoded_obs = [] + if obs_only: + obs_only_attn_mask = self._get_masks_from_nans(obs_only) + obs_only = self._copy_and_remove_nans_from_obs(obs_only, obs_only_attn_mask) + for inputs in obs_only: + encoded = self.observation_encoder(inputs) + concat_encoded_obs.append(encoded) + g_inp = torch.stack(concat_encoded_obs, dim=1) + self_attn_masks.append(obs_only_attn_mask) + self_attn_inputs.append(self.obs_encoder(None, g_inp)) + + encoded_entity = torch.cat(self_attn_inputs, dim=1) + encoded_state = self.self_attn(encoded_entity, self_attn_masks) + + flipped_masks = 1 - torch.cat(self_attn_masks, dim=1) + num_agents = torch.sum(flipped_masks, dim=1, keepdim=True) + if torch.max(num_agents).item() > self._current_max_agents: + self._current_max_agents = torch.nn.Parameter( + torch.as_tensor(torch.max(num_agents).item()), requires_grad=False + ) + + # num_agents will be -1 for a single agent and +1 when the current maximum is reached + num_agents = num_agents * 2.0 / self._current_max_agents - 1 + + encoding = self.linear_encoder(encoded_state) + if self.use_lstm: + # Resize to (batch, sequence length, encoding size) + encoding = encoding.reshape([-1, sequence_length, self.h_size]) + encoding, memories = self.lstm(encoding, memories) + encoding = encoding.reshape([-1, self.m_size // 2]) + encoding = torch.cat([encoding, num_agents], dim=1) + return encoding, memories + + +class Critic(abc.ABC): + @abc.abstractmethod + def update_normalization(self, buffer: AgentBuffer) -> None: + """ + Updates normalization of Actor based on the provided List of vector obs. + :param vector_obs: A List of vector obs as tensors. + """ + pass + + def critic_pass( + self, + inputs: List[torch.Tensor], + memories: Optional[torch.Tensor] = None, + sequence_length: int = 1, + ) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]: + """ + Get value outputs for the given obs. + :param inputs: List of inputs as tensors. + :param memories: Tensor of memories, if using memory. Otherwise, None. + :returns: Dict of reward stream to output tensor for values. 
+ """ + pass + + +class ValueNetwork(nn.Module, Critic): + def __init__( + self, + stream_names: List[str], + observation_specs: List[ObservationSpec], + network_settings: NetworkSettings, + encoded_act_size: int = 0, + outputs_per_stream: int = 1, + ): + + # This is not a typo, we want to call __init__ of nn.Module + nn.Module.__init__(self) + self.network_body = NetworkBody( + observation_specs, network_settings, encoded_act_size=encoded_act_size + ) + if network_settings.memory is not None: + encoding_size = network_settings.memory.memory_size // 2 + else: + encoding_size = network_settings.hidden_units + self.value_heads = ValueHeads(stream_names, encoding_size, outputs_per_stream) + + def update_normalization(self, buffer: AgentBuffer) -> None: + self.network_body.update_normalization(buffer) + + @property + def memory_size(self) -> int: + return self.network_body.memory_size + + def critic_pass( + self, + inputs: List[torch.Tensor], + memories: Optional[torch.Tensor] = None, + sequence_length: int = 1, + ) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]: + value_outputs, critic_mem_out = self.forward( + inputs, memories=memories, sequence_length=sequence_length + ) + return value_outputs, critic_mem_out + + def forward( + self, + inputs: List[torch.Tensor], + actions: Optional[torch.Tensor] = None, + memories: Optional[torch.Tensor] = None, + sequence_length: int = 1, + ) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]: + encoding, memories = self.network_body( + inputs, actions, memories, sequence_length + ) + output = self.value_heads(encoding) + return output, memories + + +class Actor(abc.ABC): + @abc.abstractmethod + def update_normalization(self, buffer: AgentBuffer) -> None: + """ + Updates normalization of Actor based on the provided List of vector obs. + :param vector_obs: A List of vector obs as tensors. + """ + pass + + def get_action_and_stats( + self, + inputs: List[torch.Tensor], + masks: Optional[torch.Tensor] = None, + memories: Optional[torch.Tensor] = None, + sequence_length: int = 1, + ) -> Tuple[AgentAction, Dict[str, Any], torch.Tensor]: + """ + Returns sampled actions. + If memory is enabled, return the memories as well. + :param inputs: A List of inputs as tensors. + :param masks: If using discrete actions, a Tensor of action masks. + :param memories: If using memory, a Tensor of initial memories. + :param sequence_length: If using memory, the sequence length. + :return: A Tuple of AgentAction, ActionLogProbs, entropies, and memories. + Memories will be None if not using memory. + """ + pass + + def get_stats( + self, + inputs: List[torch.Tensor], + actions: AgentAction, + masks: Optional[torch.Tensor] = None, + memories: Optional[torch.Tensor] = None, + sequence_length: int = 1, + ) -> Dict[str, Any]: + """ + Returns log_probs for actions and entropies. + If memory is enabled, return the memories as well. + :param inputs: A List of inputs as tensors. + :param actions: AgentAction of actions. + :param masks: If using discrete actions, a Tensor of action masks. + :param memories: If using memory, a Tensor of initial memories. + :param sequence_length: If using memory, the sequence length. + :return: A Tuple of AgentAction, ActionLogProbs, entropies, and memories. + Memories will be None if not using memory. 
+ """ + + pass + + @abc.abstractmethod + def forward( + self, + inputs: List[torch.Tensor], + masks: Optional[torch.Tensor] = None, + memories: Optional[torch.Tensor] = None, + ) -> Tuple[Union[int, torch.Tensor], ...]: + """ + Forward pass of the Actor for inference. This is required for export to ONNX, and + the inputs and outputs of this method should not be changed without a respective change + in the ONNX export code. + """ + pass + + +class SimpleActor(nn.Module, Actor): + MODEL_EXPORT_VERSION = 3 # Corresponds to ModelApiVersion.MLAgents2_0 + + def __init__( + self, + observation_specs: List[ObservationSpec], + network_settings: NetworkSettings, + action_spec: ActionSpec, + conditional_sigma: bool = False, + tanh_squash: bool = False, + ): + super().__init__() + self.action_spec = action_spec + self.version_number = torch.nn.Parameter( + torch.Tensor([self.MODEL_EXPORT_VERSION]), requires_grad=False + ) + self.is_continuous_int_deprecated = torch.nn.Parameter( + torch.Tensor([int(self.action_spec.is_continuous())]), requires_grad=False + ) + self.continuous_act_size_vector = torch.nn.Parameter( + torch.Tensor([int(self.action_spec.continuous_size)]), requires_grad=False + ) + self.discrete_act_size_vector = torch.nn.Parameter( + torch.Tensor([self.action_spec.discrete_branches]), requires_grad=False + ) + self.act_size_vector_deprecated = torch.nn.Parameter( + torch.Tensor( + [ + self.action_spec.continuous_size + + sum(self.action_spec.discrete_branches) + ] + ), + requires_grad=False, + ) + self.network_body = NetworkBody(observation_specs, network_settings) + if network_settings.memory is not None: + self.encoding_size = network_settings.memory.memory_size // 2 + else: + self.encoding_size = network_settings.hidden_units + self.memory_size_vector = torch.nn.Parameter( + torch.Tensor([int(self.network_body.memory_size)]), requires_grad=False + ) + + self.action_model = ActionModel( + self.encoding_size, + action_spec, + conditional_sigma=conditional_sigma, + tanh_squash=tanh_squash, + deterministic=network_settings.deterministic, + ) + + @property + def memory_size(self) -> int: + return self.network_body.memory_size + + def update_normalization(self, buffer: AgentBuffer) -> None: + self.network_body.update_normalization(buffer) + + def get_action_and_stats( + self, + inputs: List[torch.Tensor], + masks: Optional[torch.Tensor] = None, + memories: Optional[torch.Tensor] = None, + sequence_length: int = 1, + ) -> Tuple[AgentAction, Dict[str, Any], torch.Tensor]: + + encoding, memories = self.network_body( + inputs, memories=memories, sequence_length=sequence_length + ) + action, log_probs, entropies = self.action_model(encoding, masks) + run_out = {} + # This is the clipped action which is not saved to the buffer + # but is exclusively sent to the environment. 
+ run_out["env_action"] = action.to_action_tuple( + clip=self.action_model.clip_action + ) + run_out["log_probs"] = log_probs + run_out["entropy"] = entropies + + return action, run_out, memories + + def get_stats( + self, + inputs: List[torch.Tensor], + actions: AgentAction, + masks: Optional[torch.Tensor] = None, + memories: Optional[torch.Tensor] = None, + sequence_length: int = 1, + ) -> Dict[str, Any]: + encoding, actor_mem_outs = self.network_body( + inputs, memories=memories, sequence_length=sequence_length + ) + + log_probs, entropies = self.action_model.evaluate(encoding, masks, actions) + run_out = {} + run_out["log_probs"] = log_probs + run_out["entropy"] = entropies + return run_out + + def forward( + self, + inputs: List[torch.Tensor], + masks: Optional[torch.Tensor] = None, + memories: Optional[torch.Tensor] = None, + ) -> Tuple[Union[int, torch.Tensor], ...]: + """ + Note: This forward() method is required for exporting to ONNX. Don't modify the inputs and outputs. + + At this moment, torch.onnx.export() doesn't accept None as tensor to be exported, + so the size of return tuple varies with action spec. + """ + encoding, memories_out = self.network_body( + inputs, memories=memories, sequence_length=1 + ) + + ( + cont_action_out, + disc_action_out, + action_out_deprecated, + deterministic_cont_action_out, + deterministic_disc_action_out, + ) = self.action_model.get_action_out(encoding, masks) + export_out = [self.version_number, self.memory_size_vector] + if self.action_spec.continuous_size > 0: + export_out += [ + cont_action_out, + self.continuous_act_size_vector, + deterministic_cont_action_out, + ] + if self.action_spec.discrete_size > 0: + export_out += [ + disc_action_out, + self.discrete_act_size_vector, + deterministic_disc_action_out, + ] + if self.network_body.memory_size > 0: + export_out += [memories_out] + return tuple(export_out) + + +class SharedActorCritic(SimpleActor, Critic): + def __init__( + self, + observation_specs: List[ObservationSpec], + network_settings: NetworkSettings, + action_spec: ActionSpec, + stream_names: List[str], + conditional_sigma: bool = False, + tanh_squash: bool = False, + ): + self.use_lstm = network_settings.memory is not None + super().__init__( + observation_specs, + network_settings, + action_spec, + conditional_sigma, + tanh_squash, + ) + self.stream_names = stream_names + self.value_heads = ValueHeads(stream_names, self.encoding_size) + + def critic_pass( + self, + inputs: List[torch.Tensor], + memories: Optional[torch.Tensor] = None, + sequence_length: int = 1, + ) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]: + encoding, memories_out = self.network_body( + inputs, memories=memories, sequence_length=sequence_length + ) + return self.value_heads(encoding), memories_out + + +class GlobalSteps(nn.Module): + def __init__(self): + super().__init__() + self.__global_step = nn.Parameter( + torch.Tensor([0]).to(torch.int64), requires_grad=False + ) + + @property + def current_step(self): + return int(self.__global_step.item()) + + @current_step.setter + def current_step(self, value): + self.__global_step[:] = value + + def increment(self, value): + self.__global_step += value + + +class LearningRate(nn.Module): + def __init__(self, lr): + # Todo: add learning rate decay + super().__init__() + self.learning_rate = torch.Tensor([lr]) diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/utils.py b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/utils.py new file mode 100644 index 
0000000000000000000000000000000000000000..048ce8b59174c1b9ac5a2520b4b36f858754f9ac --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/utils.py @@ -0,0 +1,452 @@ +from typing import List, Optional, Tuple, Dict +from mlagents.torch_utils import torch, nn +from mlagents.trainers.torch_entities.layers import LinearEncoder, Initialization +import numpy as np + +from mlagents.trainers.torch_entities.encoders import ( + SimpleVisualEncoder, + ResNetVisualEncoder, + NatureVisualEncoder, + SmallVisualEncoder, + FullyConnectedVisualEncoder, + VectorInput, +) +from mlagents.trainers.settings import EncoderType, ScheduleType +from mlagents.trainers.torch_entities.attention import ( + EntityEmbedding, + ResidualSelfAttention, +) +from mlagents.trainers.exception import UnityTrainerException +from mlagents_envs.base_env import ObservationSpec, DimensionProperty + + +class ModelUtils: + # Minimum supported side for each encoder type. If refactoring an encoder, please + # adjust these also. + MIN_RESOLUTION_FOR_ENCODER = { + EncoderType.FULLY_CONNECTED: 1, + EncoderType.MATCH3: 5, + EncoderType.SIMPLE: 20, + EncoderType.NATURE_CNN: 36, + EncoderType.RESNET: 15, + } + + VALID_VISUAL_PROP = frozenset( + [ + ( + DimensionProperty.TRANSLATIONAL_EQUIVARIANCE, + DimensionProperty.TRANSLATIONAL_EQUIVARIANCE, + DimensionProperty.NONE, + ), + (DimensionProperty.UNSPECIFIED,) * 3, + ] + ) + + VALID_VECTOR_PROP = frozenset( + [(DimensionProperty.NONE,), (DimensionProperty.UNSPECIFIED,)] + ) + + VALID_VAR_LEN_PROP = frozenset( + [(DimensionProperty.VARIABLE_SIZE, DimensionProperty.NONE)] + ) + + @staticmethod + def update_learning_rate(optim: torch.optim.Optimizer, lr: float) -> None: + """ + Apply a learning rate to a torch optimizer. + :param optim: Optimizer + :param lr: Learning rate + """ + for param_group in optim.param_groups: + param_group["lr"] = lr + + class DecayedValue: + def __init__( + self, + schedule: ScheduleType, + initial_value: float, + min_value: float, + max_step: int, + ): + """ + Object that represnets value of a parameter that should be decayed, assuming it is a function of + global_step. + :param schedule: Type of learning rate schedule. + :param initial_value: Initial value before decay. + :param min_value: Decay value to this value by max_step. + :param max_step: The final step count where the return value should equal min_value. + :param global_step: The current step count. + :return: The value. + """ + self.schedule = schedule + self.initial_value = initial_value + self.min_value = min_value + self.max_step = max_step + + def get_value(self, global_step: int) -> float: + """ + Get the value at a given global step. + :param global_step: Step count. + :returns: Decayed value at this global step. + """ + if self.schedule == ScheduleType.CONSTANT: + return self.initial_value + elif self.schedule == ScheduleType.LINEAR: + return ModelUtils.polynomial_decay( + self.initial_value, self.min_value, self.max_step, global_step + ) + else: + raise UnityTrainerException(f"The schedule {self.schedule} is invalid.") + + @staticmethod + def polynomial_decay( + initial_value: float, + min_value: float, + max_step: int, + global_step: int, + power: float = 1.0, + ) -> float: + """ + Get a decayed value based on a polynomial schedule, with respect to the current global step. + :param initial_value: Initial value before decay. + :param min_value: Decay value to this value by max_step. + :param max_step: The final step count where the return value should equal min_value. 
+ :param global_step: The current step count. + :param power: Power of polynomial decay. 1.0 (default) is a linear decay. + :return: The current decayed value. + """ + global_step = min(global_step, max_step) + decayed_value = (initial_value - min_value) * ( + 1 - float(global_step) / max_step + ) ** (power) + min_value + return decayed_value + + @staticmethod + def get_encoder_for_type(encoder_type: EncoderType) -> nn.Module: + ENCODER_FUNCTION_BY_TYPE = { + EncoderType.SIMPLE: SimpleVisualEncoder, + EncoderType.NATURE_CNN: NatureVisualEncoder, + EncoderType.RESNET: ResNetVisualEncoder, + EncoderType.MATCH3: SmallVisualEncoder, + EncoderType.FULLY_CONNECTED: FullyConnectedVisualEncoder, + } + return ENCODER_FUNCTION_BY_TYPE.get(encoder_type) + + @staticmethod + def _check_resolution_for_encoder( + height: int, width: int, vis_encoder_type: EncoderType + ) -> None: + min_res = ModelUtils.MIN_RESOLUTION_FOR_ENCODER[vis_encoder_type] + if height < min_res or width < min_res: + raise UnityTrainerException( + f"Visual observation resolution ({width}x{height}) is too small for" + f"the provided EncoderType ({vis_encoder_type.value}). The min dimension is {min_res}" + ) + + @staticmethod + def get_encoder_for_obs( + obs_spec: ObservationSpec, + normalize: bool, + h_size: int, + attention_embedding_size: int, + vis_encode_type: EncoderType, + ) -> Tuple[nn.Module, int]: + """ + Returns the encoder and the size of the appropriate encoder. + :param shape: Tuples that represent the observation dimension. + :param normalize: Normalize all vector inputs. + :param h_size: Number of hidden units per layer excluding attention layers. + :param attention_embedding_size: Number of hidden units per attention layer. + :param vis_encode_type: Type of visual encoder to use. + """ + shape = obs_spec.shape + dim_prop = obs_spec.dimension_property + + # VISUAL + if dim_prop in ModelUtils.VALID_VISUAL_PROP: + visual_encoder_class = ModelUtils.get_encoder_for_type(vis_encode_type) + ModelUtils._check_resolution_for_encoder( + shape[0], shape[1], vis_encode_type + ) + return (visual_encoder_class(shape[0], shape[1], shape[2], h_size), h_size) + # VECTOR + if dim_prop in ModelUtils.VALID_VECTOR_PROP: + return (VectorInput(shape[0], normalize), shape[0]) + # VARIABLE LENGTH + if dim_prop in ModelUtils.VALID_VAR_LEN_PROP: + return ( + EntityEmbedding( + entity_size=shape[1], + entity_num_max_elements=shape[0], + embedding_size=attention_embedding_size, + ), + 0, + ) + # OTHER + raise UnityTrainerException(f"Unsupported Sensor with specs {obs_spec}") + + @staticmethod + def create_input_processors( + observation_specs: List[ObservationSpec], + h_size: int, + vis_encode_type: EncoderType, + attention_embedding_size: int, + normalize: bool = False, + ) -> Tuple[nn.ModuleList, List[int]]: + """ + Creates visual and vector encoders, along with their normalizers. + :param observation_specs: List of ObservationSpec that represent the observation dimensions. + :param action_size: Number of additional un-normalized inputs to each vector encoder. Used for + conditioning network on other values (e.g. actions for a Q function) + :param h_size: Number of hidden units per layer excluding attention layers. + :param attention_embedding_size: Number of hidden units per attention layer. + :param vis_encode_type: Type of visual encoder to use. + :param unnormalized_inputs: Vector inputs that should not be normalized, and added to the vector + obs. + :param normalize: Normalize all vector inputs. 
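# ---- Example sketch (standalone re-statement of the formula above, sample numbers
# only): with power=1.0 the decay is a linear interpolation from initial_value to
# min_value over max_step, clamped once max_step is passed:
def linear_decay(initial_value, min_value, max_step, global_step, power=1.0):
    global_step = min(global_step, max_step)
    return (initial_value - min_value) * (1 - global_step / max_step) ** power + min_value

assert abs(linear_decay(3e-4, 1e-5, 1000, 0) - 3e-4) < 1e-12
assert abs(linear_decay(3e-4, 1e-5, 1000, 500) - 1.55e-4) < 1e-12
assert abs(linear_decay(3e-4, 1e-5, 1000, 5000) - 1e-5) < 1e-12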
+ :return: Tuple of : + - ModuleList of the encoders + - A list of embedding sizes (0 if the input requires to be processed with a variable length + observation encoder) + """ + encoders: List[nn.Module] = [] + embedding_sizes: List[int] = [] + for obs_spec in observation_specs: + encoder, embedding_size = ModelUtils.get_encoder_for_obs( + obs_spec, normalize, h_size, attention_embedding_size, vis_encode_type + ) + encoders.append(encoder) + embedding_sizes.append(embedding_size) + + x_self_size = sum(embedding_sizes) # The size of the "self" embedding + if x_self_size > 0: + for enc in encoders: + if isinstance(enc, EntityEmbedding): + enc.add_self_embedding(attention_embedding_size) + return (nn.ModuleList(encoders), embedding_sizes) + + @staticmethod + def list_to_tensor( + ndarray_list: List[np.ndarray], dtype: Optional[torch.dtype] = torch.float32 + ) -> torch.Tensor: + """ + Converts a list of numpy arrays into a tensor. MUCH faster than + calling as_tensor on the list directly. + """ + return torch.as_tensor(np.asanyarray(ndarray_list), dtype=dtype) + + @staticmethod + def list_to_tensor_list( + ndarray_list: List[np.ndarray], dtype: Optional[torch.dtype] = torch.float32 + ) -> torch.Tensor: + """ + Converts a list of numpy arrays into a list of tensors. MUCH faster than + calling as_tensor on the list directly. + """ + return [ + torch.as_tensor(np.asanyarray(_arr), dtype=dtype) for _arr in ndarray_list + ] + + @staticmethod + def to_numpy(tensor: torch.Tensor) -> np.ndarray: + """ + Converts a Torch Tensor to a numpy array. If the Tensor is on the GPU, it will + be brought to the CPU. + """ + return tensor.detach().cpu().numpy() + + @staticmethod + def break_into_branches( + concatenated_logits: torch.Tensor, action_size: List[int] + ) -> List[torch.Tensor]: + """ + Takes a concatenated set of logits that represent multiple discrete action branches + and breaks it up into one Tensor per branch. + :param concatenated_logits: Tensor that represents the concatenated action branches + :param action_size: List of ints containing the number of possible actions for each branch. + :return: A List of Tensors containing one tensor per branch. + """ + action_idx = [0] + list(np.cumsum(action_size)) + branched_logits = [ + concatenated_logits[:, action_idx[i] : action_idx[i + 1]] + for i in range(len(action_size)) + ] + return branched_logits + + @staticmethod + def actions_to_onehot( + discrete_actions: torch.Tensor, action_size: List[int] + ) -> List[torch.Tensor]: + """ + Takes a tensor of discrete actions and turns it into a List of onehot encoding for each + action. + :param discrete_actions: Actions in integer form. + :param action_size: List of branch sizes. Should be of same size as discrete_actions' + last dimension. + :return: List of one-hot tensors, one representing each branch. + """ + onehot_branches = [ + torch.nn.functional.one_hot(_act.T, action_size[i]).float() + for i, _act in enumerate(discrete_actions.long().T) + ] + return onehot_branches + + @staticmethod + def dynamic_partition( + data: torch.Tensor, partitions: torch.Tensor, num_partitions: int + ) -> List[torch.Tensor]: + """ + Torch implementation of dynamic_partition : + https://www.tensorflow.org/api_docs/python/tf/dynamic_partition + Splits the data Tensor input into num_partitions Tensors according to the indices in + partitions. + :param data: The Tensor data that will be split into partitions. + :param partitions: An indices tensor that determines in which partition each element + of data will be in. 
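# ---- Example sketch (standalone, branch sizes [2, 3] chosen for illustration):
# break_into_branches above slices a concatenated logit tensor back into per-branch
# tensors using cumulative branch sizes, and actions_to_onehot does the analogous
# conversion for integer actions:
import numpy as np
import torch

action_size = [2, 3]
concatenated_logits = torch.arange(10.0).reshape(2, 5)   # batch of 2, 2 + 3 logits

action_idx = [0] + list(np.cumsum(action_size))
branches = [
    concatenated_logits[:, action_idx[i]: action_idx[i + 1]]
    for i in range(len(action_size))
]
assert [tuple(b.shape) for b in branches] == [(2, 2), (2, 3)]

discrete_actions = torch.tensor([[1, 2], [0, 1]])        # one column per branch
onehots = [
    torch.nn.functional.one_hot(col, action_size[i]).float()
    for i, col in enumerate(discrete_actions.long().T)
]
assert [tuple(o.shape) for o in onehots] == [(2, 2), (2, 3)]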
+ :param num_partitions: The number of partitions to output. Corresponds to the + maximum possible index in the partitions argument. + :return: A list of Tensor partitions (Their indices correspond to their partition index). + """ + res: List[torch.Tensor] = [] + for i in range(num_partitions): + res += [data[(partitions == i).nonzero().squeeze(1)]] + return res + + @staticmethod + def masked_mean(tensor: torch.Tensor, masks: torch.Tensor) -> torch.Tensor: + """ + Returns the mean of the tensor but ignoring the values specified by masks. + Used for masking out loss functions. + :param tensor: Tensor which needs mean computation. + :param masks: Boolean tensor of masks with same dimension as tensor. + """ + if tensor.ndim == 0: + return (tensor * masks).sum() / torch.clamp( + (torch.ones_like(tensor) * masks).float().sum(), min=1.0 + ) + else: + return ( + tensor.permute(*torch.arange(tensor.ndim - 1, -1, -1)) * masks + ).sum() / torch.clamp( + ( + torch.ones_like( + tensor.permute(*torch.arange(tensor.ndim - 1, -1, -1)) + ) + * masks + ) + .float() + .sum(), + min=1.0, + ) + + @staticmethod + def soft_update(source: nn.Module, target: nn.Module, tau: float) -> None: + """ + Performs an in-place polyak update of the target module based on the source, + by a ratio of tau. Note that source and target modules must have the same + parameters, where: + target = tau * source + (1-tau) * target + :param source: Source module whose parameters will be used. + :param target: Target module whose parameters will be updated. + :param tau: Percentage of source parameters to use in average. Setting tau to + 1 will copy the source parameters to the target. + """ + with torch.no_grad(): + for source_param, target_param in zip( + source.parameters(), target.parameters() + ): + target_param.data.mul_(1.0 - tau) + torch.add( + target_param.data, + source_param.data, + alpha=tau, + out=target_param.data, + ) + + @staticmethod + def create_residual_self_attention( + input_processors: nn.ModuleList, embedding_sizes: List[int], hidden_size: int + ) -> Tuple[Optional[ResidualSelfAttention], Optional[LinearEncoder]]: + """ + Creates an RSA if there are variable length observations found in the input processors. + :param input_processors: A ModuleList of input processors as returned by the function + create_input_processors(). + :param embedding sizes: A List of embedding sizes as returned by create_input_processors(). + :param hidden_size: The hidden size to use for the RSA. + :returns: A Tuple of the RSA itself, a self encoder, and the embedding size after the RSA. + Returns None for the RSA and encoder if no var len inputs are detected. 
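The in-place update in soft_update is standard Polyak averaging; an equivalent minimal sketch using mul_/add_ in place of torch.add(..., out=...) (tensor values illustrative):

import torch

tau = 0.005
source_param = torch.tensor([1.0, 2.0])
target_param = torch.tensor([0.0, 0.0])

# target <- tau * source + (1 - tau) * target, applied parameter by parameter.
with torch.no_grad():
    target_param.mul_(1.0 - tau).add_(source_param, alpha=tau)

assert torch.allclose(target_param, tau * source_param)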
+ """ + rsa, x_self_encoder = None, None + entity_num_max: int = 0 + var_processors = [p for p in input_processors if isinstance(p, EntityEmbedding)] + for processor in var_processors: + entity_max: int = processor.entity_num_max_elements + # Only adds entity max if it was known at construction + if entity_max > 0: + entity_num_max += entity_max + if len(var_processors) > 0: + if sum(embedding_sizes): + x_self_encoder = LinearEncoder( + sum(embedding_sizes), + 1, + hidden_size, + kernel_init=Initialization.Normal, + kernel_gain=(0.125 / hidden_size) ** 0.5, + ) + rsa = ResidualSelfAttention(hidden_size, entity_num_max) + return rsa, x_self_encoder + + @staticmethod + def trust_region_value_loss( + values: Dict[str, torch.Tensor], + old_values: Dict[str, torch.Tensor], + returns: Dict[str, torch.Tensor], + epsilon: float, + loss_masks: torch.Tensor, + ) -> torch.Tensor: + """ + Evaluates value loss, clipping to stay within a trust region of old value estimates. + Used for PPO and POCA. + :param values: Value output of the current network. + :param old_values: Value stored with experiences in buffer. + :param returns: Computed returns. + :param epsilon: Clipping value for value estimate. + :param loss_mask: Mask for losses. Used with LSTM to ignore 0'ed out experiences. + """ + value_losses = [] + for name, head in values.items(): + old_val_tensor = old_values[name] + returns_tensor = returns[name] + clipped_value_estimate = old_val_tensor + torch.clamp( + head - old_val_tensor, -1 * epsilon, epsilon + ) + v_opt_a = (returns_tensor - head) ** 2 + v_opt_b = (returns_tensor - clipped_value_estimate) ** 2 + value_loss = ModelUtils.masked_mean(torch.max(v_opt_a, v_opt_b), loss_masks) + value_losses.append(value_loss) + value_loss = torch.mean(torch.stack(value_losses)) + return value_loss + + @staticmethod + def trust_region_policy_loss( + advantages: torch.Tensor, + log_probs: torch.Tensor, + old_log_probs: torch.Tensor, + loss_masks: torch.Tensor, + epsilon: float, + ) -> torch.Tensor: + """ + Evaluate policy loss clipped to stay within a trust region. Used for PPO and POCA. + :param advantages: Computed advantages. + :param log_probs: Current policy probabilities + :param old_log_probs: Past policy probabilities + :param loss_masks: Mask for losses. Used with LSTM to ignore 0'ed out experiences. 
+ """ + advantage = advantages.unsqueeze(-1) + r_theta = torch.exp(log_probs - old_log_probs) + p_opt_a = r_theta * advantage + p_opt_b = torch.clamp(r_theta, 1.0 - epsilon, 1.0 + epsilon) * advantage + policy_loss = -1 * ModelUtils.masked_mean( + torch.min(p_opt_a, p_opt_b), loss_masks + ) + return policy_loss diff --git a/MLPY/Lib/site-packages/mlagents/trainers/trainer/__init__.py b/MLPY/Lib/site-packages/mlagents/trainers/trainer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..620008bdb17ab1dc338f57ec88cd1432c8ef09bd --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/trainer/__init__.py @@ -0,0 +1,2 @@ +from mlagents.trainers.trainer.trainer import Trainer # noqa +from mlagents.trainers.trainer.trainer_factory import TrainerFactory # noqa diff --git a/MLPY/Lib/site-packages/mlagents/trainers/trainer/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/trainer/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1f79ce0260d5d4eabf3e6b08c5510884f74fe6a4 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/trainer/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/trainer/__pycache__/off_policy_trainer.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/trainer/__pycache__/off_policy_trainer.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c801ab4ce9e1d7f549c81257c8bcc7aec39606fe Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/trainer/__pycache__/off_policy_trainer.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/trainer/__pycache__/on_policy_trainer.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/trainer/__pycache__/on_policy_trainer.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7064bbff51c55fdfee95b61780aefe2af55ffeb9 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/trainer/__pycache__/on_policy_trainer.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/trainer/__pycache__/rl_trainer.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/trainer/__pycache__/rl_trainer.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1ca252fd18aad88c4c7dee1ced18f8d696826432 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/trainer/__pycache__/rl_trainer.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/trainer/__pycache__/trainer.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/trainer/__pycache__/trainer.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5239e5cfad8df43f8a28775669086284998a8da3 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/trainer/__pycache__/trainer.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/trainer/__pycache__/trainer_factory.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/trainer/__pycache__/trainer_factory.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..04f0743965819ded3d1695491f3118e541e67d65 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/trainer/__pycache__/trainer_factory.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/trainer/__pycache__/trainer_utils.cpython-39.pyc 
b/MLPY/Lib/site-packages/mlagents/trainers/trainer/__pycache__/trainer_utils.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..53c37c5080176463f2c939aaddc4fedf7fe67470 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/trainer/__pycache__/trainer_utils.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/trainer/off_policy_trainer.py b/MLPY/Lib/site-packages/mlagents/trainers/trainer/off_policy_trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..92ae4966928ab45b780815e94e64668fca7e1eb9 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/trainer/off_policy_trainer.py @@ -0,0 +1,263 @@ +# ## ML-Agent Learning (SAC) +# Contains an implementation of SAC as described in https://arxiv.org/abs/1801.01290 +# and implemented in https://github.com/hill-a/stable-baselines + +from collections import defaultdict +from typing import Dict, cast +import os + +import numpy as np +from mlagents.trainers.policy.checkpoint_manager import ModelCheckpoint + +from mlagents_envs.logging_util import get_logger +from mlagents_envs.timers import timed +from mlagents.trainers.buffer import RewardSignalUtil +from mlagents.trainers.policy import Policy +from mlagents.trainers.optimizer.torch_optimizer import TorchOptimizer +from mlagents.trainers.trainer.rl_trainer import RLTrainer +from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers +from mlagents.trainers.settings import TrainerSettings, OffPolicyHyperparamSettings + +logger = get_logger(__name__) + +BUFFER_TRUNCATE_PERCENT = 0.8 + + +class OffPolicyTrainer(RLTrainer): + """ + The SACTrainer is an implementation of the SAC algorithm, with support + for discrete actions and recurrent networks. + """ + + def __init__( + self, + behavior_name: str, + reward_buff_cap: int, + trainer_settings: TrainerSettings, + training: bool, + load: bool, + seed: int, + artifact_path: str, + ): + """ + Responsible for collecting experiences and training an off-policy model. + :param behavior_name: The name of the behavior associated with trainer config + :param reward_buff_cap: Max reward history to track in the reward buffer + :param trainer_settings: The parameters for the trainer. + :param training: Whether the trainer is set for training. + :param load: Whether the model should be loaded. + :param seed: The seed the model will be initialized with + :param artifact_path: The directory within which to store artifacts from this trainer. + """ + super().__init__( + behavior_name, + trainer_settings, + training, + load, + artifact_path, + reward_buff_cap, + ) + + self.seed = seed + self.policy: Policy = None # type: ignore + self.optimizer: TorchOptimizer = None # type: ignore + self.hyperparameters: OffPolicyHyperparamSettings = cast( + OffPolicyHyperparamSettings, trainer_settings.hyperparameters + ) + + self._step = 0 + + # Don't divide by zero + self.update_steps = 1 + self.reward_signal_update_steps = 1 + + self.steps_per_update = self.hyperparameters.steps_per_update + self.reward_signal_steps_per_update = ( + self.hyperparameters.reward_signal_steps_per_update + ) + + self.checkpoint_replay_buffer = self.hyperparameters.save_replay_buffer + + def _checkpoint(self) -> ModelCheckpoint: + """ + Writes a checkpoint model to memory + Overrides the default to save the replay buffer. 
+ """ + ckpt = super()._checkpoint() + if self.checkpoint_replay_buffer: + self.save_replay_buffer() + return ckpt + + def save_model(self) -> None: + """ + Saves the final training model to memory + Overrides the default to save the replay buffer. + """ + super().save_model() + if self.checkpoint_replay_buffer: + self.save_replay_buffer() + + def save_replay_buffer(self) -> None: + """ + Save the training buffer's update buffer to a pickle file. + """ + filename = os.path.join(self.artifact_path, "last_replay_buffer.hdf5") + logger.info(f"Saving Experience Replay Buffer to {filename}...") + with open(filename, "wb") as file_object: + self.update_buffer.save_to_file(file_object) + logger.info( + f"Saved Experience Replay Buffer ({os.path.getsize(filename)} bytes)." + ) + + def load_replay_buffer(self) -> None: + """ + Loads the last saved replay buffer from a file. + """ + filename = os.path.join(self.artifact_path, "last_replay_buffer.hdf5") + logger.info(f"Loading Experience Replay Buffer from {filename}...") + with open(filename, "rb+") as file_object: + self.update_buffer.load_from_file(file_object) + logger.debug( + "Experience replay buffer has {} experiences.".format( + self.update_buffer.num_experiences + ) + ) + + def _is_ready_update(self) -> bool: + """ + Returns whether or not the trainer has enough elements to run update model + :return: A boolean corresponding to whether or not _update_policy() can be run + """ + return ( + self.update_buffer.num_experiences >= self.hyperparameters.batch_size + and self._step >= self.hyperparameters.buffer_init_steps + ) + + def maybe_load_replay_buffer(self): + # Load the replay buffer if load + if self.load and self.checkpoint_replay_buffer: + try: + self.load_replay_buffer() + except (AttributeError, FileNotFoundError): + logger.warning( + "Replay buffer was unable to load, starting from scratch." + ) + logger.debug( + "Loaded update buffer with {} sequences".format( + self.update_buffer.num_experiences + ) + ) + + def add_policy( + self, parsed_behavior_id: BehaviorIdentifiers, policy: Policy + ) -> None: + """ + Adds policy to trainer. + """ + if self.policy: + logger.warning( + "Your environment contains multiple teams, but {} doesn't support adversarial games. Enable self-play to \ + train adversarial games.".format( + self.__class__.__name__ + ) + ) + self.policy = policy + self.policies[parsed_behavior_id.behavior_id] = policy + self.optimizer = self.create_optimizer() + for _reward_signal in self.optimizer.reward_signals.keys(): + self.collected_rewards[_reward_signal] = defaultdict(lambda: 0) + + self.model_saver.register(self.policy) + self.model_saver.register(self.optimizer) + self.model_saver.initialize_or_load() + + # Needed to resume loads properly + self._step = policy.get_current_step() + # Assume steps were updated at the correct ratio before + self.update_steps = int(max(1, self._step / self.steps_per_update)) + self.reward_signal_update_steps = int( + max(1, self._step / self.reward_signal_steps_per_update) + ) + + @timed + def _update_policy(self) -> bool: + """ + Uses update_buffer to update the policy. We sample the update_buffer and update + until the steps_per_update ratio is met. 
+ """ + has_updated = False + self.cumulative_returns_since_policy_update.clear() + n_sequences = max( + int(self.hyperparameters.batch_size / self.policy.sequence_length), 1 + ) + + batch_update_stats: Dict[str, list] = defaultdict(list) + while ( + self._step - self.hyperparameters.buffer_init_steps + ) / self.update_steps > self.steps_per_update: + logger.debug(f"Updating SAC policy at step {self._step}") + buffer = self.update_buffer + if self.update_buffer.num_experiences >= self.hyperparameters.batch_size: + sampled_minibatch = buffer.sample_mini_batch( + self.hyperparameters.batch_size, + sequence_length=self.policy.sequence_length, + ) + # Get rewards for each reward + for name, signal in self.optimizer.reward_signals.items(): + sampled_minibatch[RewardSignalUtil.rewards_key(name)] = ( + signal.evaluate(sampled_minibatch) * signal.strength + ) + + update_stats = self.optimizer.update(sampled_minibatch, n_sequences) + for stat_name, value in update_stats.items(): + batch_update_stats[stat_name].append(value) + + self.update_steps += 1 + + for stat, stat_list in batch_update_stats.items(): + self._stats_reporter.add_stat(stat, np.mean(stat_list)) + has_updated = True + + if self.optimizer.bc_module: + update_stats = self.optimizer.bc_module.update() + for stat, val in update_stats.items(): + self._stats_reporter.add_stat(stat, val) + + # Truncate update buffer if neccessary. Truncate more than we need to to avoid truncating + # a large buffer at each update. + if self.update_buffer.num_experiences > self.hyperparameters.buffer_size: + self.update_buffer.truncate( + int(self.hyperparameters.buffer_size * BUFFER_TRUNCATE_PERCENT) + ) + # TODO: revisit this update + self._update_reward_signals() + return has_updated + + def _update_reward_signals(self) -> None: + """ + Iterate through the reward signals and update them. Unlike in PPO, + do it separate from the policy so that it can be done at a different + interval. + This function should only be used to simulate + http://arxiv.org/abs/1809.02925 and similar papers, where the policy is updated + N times, then the reward signals are updated N times. Normally, the reward signal + and policy are updated in parallel. 
+ """ + buffer = self.update_buffer + batch_update_stats: Dict[str, list] = defaultdict(list) + while ( + self._step - self.hyperparameters.buffer_init_steps + ) / self.reward_signal_update_steps > self.reward_signal_steps_per_update: + # Get minibatches for reward signal update if needed + minibatch = buffer.sample_mini_batch( + self.hyperparameters.batch_size, + sequence_length=self.policy.sequence_length, + ) + update_stats = self.optimizer.update_reward_signals(minibatch) + + for stat_name, value in update_stats.items(): + batch_update_stats[stat_name].append(value) + self.reward_signal_update_steps += 1 + + for stat, stat_list in batch_update_stats.items(): + self._stats_reporter.add_stat(stat, np.mean(stat_list)) diff --git a/MLPY/Lib/site-packages/mlagents/trainers/trainer/on_policy_trainer.py b/MLPY/Lib/site-packages/mlagents/trainers/trainer/on_policy_trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..879640a0e5f95f2594d9f30ab8aed2cec9eb722d --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/trainer/on_policy_trainer.py @@ -0,0 +1,144 @@ +# # Unity ML-Agents Toolkit +# ## ML-Agent Learning (PPO) +# Contains an implementation of PPO as described in: https://arxiv.org/abs/1707.06347 + +from collections import defaultdict +from typing import cast + +import numpy as np + +from mlagents_envs.logging_util import get_logger +from mlagents.trainers.buffer import BufferKey +from mlagents.trainers.trainer.rl_trainer import RLTrainer +from mlagents.trainers.policy import Policy +from mlagents.trainers.optimizer.torch_optimizer import TorchOptimizer +from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers +from mlagents.trainers.settings import TrainerSettings, OnPolicyHyperparamSettings + +logger = get_logger(__name__) + + +class OnPolicyTrainer(RLTrainer): + """The PPOTrainer is an implementation of the PPO algorithm.""" + + def __init__( + self, + behavior_name: str, + reward_buff_cap: int, + trainer_settings: TrainerSettings, + training: bool, + load: bool, + seed: int, + artifact_path: str, + ): + """ + Responsible for collecting experiences and training an on-policy model. + :param behavior_name: The name of the behavior associated with trainer config + :param reward_buff_cap: Max reward history to track in the reward buffer + :param trainer_settings: The parameters for the trainer. + :param training: Whether the trainer is set for training. + :param load: Whether the model should be loaded. + :param seed: The seed the model will be initialized with + :param artifact_path: The directory within which to store artifacts from this trainer. + """ + super().__init__( + behavior_name, + trainer_settings, + training, + load, + artifact_path, + reward_buff_cap, + ) + self.hyperparameters = cast( + OnPolicyHyperparamSettings, self.trainer_settings.hyperparameters + ) + self.seed = seed + self.policy: Policy = None # type: ignore + self.optimizer: TorchOptimizer = None # type: ignore + + def _is_ready_update(self): + """ + Returns whether or not the trainer has enough elements to run update model + :return: A boolean corresponding to whether or not update_model() can be run + """ + size_of_buffer = self.update_buffer.num_experiences + return size_of_buffer > self.hyperparameters.buffer_size + + def _update_policy(self): + """ + Uses demonstration_buffer to update the policy. + The reward signal generators must be updated in this method at their own pace. 
+ """ + buffer_length = self.update_buffer.num_experiences + self.cumulative_returns_since_policy_update.clear() + + # Make sure batch_size is a multiple of sequence length. During training, we + # will need to reshape the data into a batch_size x sequence_length tensor. + batch_size = ( + self.hyperparameters.batch_size + - self.hyperparameters.batch_size % self.policy.sequence_length + ) + # Make sure there is at least one sequence + batch_size = max(batch_size, self.policy.sequence_length) + + n_sequences = max( + int(self.hyperparameters.batch_size / self.policy.sequence_length), 1 + ) + + advantages = np.array( + self.update_buffer[BufferKey.ADVANTAGES].get_batch(), dtype=np.float32 + ) + self.update_buffer[BufferKey.ADVANTAGES].set( + (advantages - advantages.mean()) / (advantages.std() + 1e-10) + ) + num_epoch = self.hyperparameters.num_epoch + batch_update_stats = defaultdict(list) + for _ in range(num_epoch): + self.update_buffer.shuffle(sequence_length=self.policy.sequence_length) + buffer = self.update_buffer + max_num_batch = buffer_length // batch_size + for i in range(0, max_num_batch * batch_size, batch_size): + minibatch = buffer.make_mini_batch(i, i + batch_size) + update_stats = self.optimizer.update(minibatch, n_sequences) + update_stats.update(self.optimizer.update_reward_signals(minibatch)) + for stat_name, value in update_stats.items(): + batch_update_stats[stat_name].append(value) + + for stat, stat_list in batch_update_stats.items(): + self._stats_reporter.add_stat(stat, np.mean(stat_list)) + + if self.optimizer.bc_module: + update_stats = self.optimizer.bc_module.update() + for stat, val in update_stats.items(): + self._stats_reporter.add_stat(stat, val) + self._clear_update_buffer() + return True + + def add_policy( + self, parsed_behavior_id: BehaviorIdentifiers, policy: Policy + ) -> None: + """ + Adds policy to trainer. + :param parsed_behavior_id: Behavior identifiers that the policy should belong to. + :param policy: Policy to associate with name_behavior_id. + """ + if self.policy: + logger.warning( + "Your environment contains multiple teams, but {} doesn't support adversarial games. 
Enable self-play to \ + train adversarial games.".format( + self.__class__.__name__ + ) + ) + self.policy = policy + self.policies[parsed_behavior_id.behavior_id] = policy + + self.optimizer = self.create_optimizer() + for _reward_signal in self.optimizer.reward_signals.keys(): + self.collected_rewards[_reward_signal] = defaultdict(lambda: 0) + + self.model_saver.register(self.policy) + self.model_saver.register(self.optimizer) + self.model_saver.initialize_or_load() + + # Needed to resume loads properly + self._step = policy.get_current_step() diff --git a/MLPY/Lib/site-packages/mlagents/trainers/trainer/rl_trainer.py b/MLPY/Lib/site-packages/mlagents/trainers/trainer/rl_trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..57454900a0b61a4af4c728d01373eb63147f485b --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/trainer/rl_trainer.py @@ -0,0 +1,305 @@ +# # Unity ML-Agents Toolkit +from typing import Dict, List, Optional +from collections import defaultdict +import abc +import time +import attr +import numpy as np +from mlagents_envs.side_channel.stats_side_channel import StatsAggregationMethod + +from mlagents.trainers.policy.checkpoint_manager import ( + ModelCheckpoint, + ModelCheckpointManager, +) +from mlagents_envs.logging_util import get_logger +from mlagents_envs.timers import timed +from mlagents.trainers.optimizer import Optimizer +from mlagents.trainers.optimizer.torch_optimizer import TorchOptimizer +from mlagents.trainers.buffer import AgentBuffer, BufferKey +from mlagents.trainers.trainer import Trainer +from mlagents.trainers.torch_entities.components.reward_providers.base_reward_provider import ( + BaseRewardProvider, +) +from mlagents_envs.timers import hierarchical_timer +from mlagents.trainers.model_saver.torch_model_saver import TorchModelSaver +from mlagents.trainers.agent_processor import AgentManagerQueue +from mlagents.trainers.trajectory import Trajectory +from mlagents.trainers.settings import TrainerSettings +from mlagents.trainers.stats import StatsPropertyType +from mlagents.trainers.model_saver.model_saver import BaseModelSaver + + +logger = get_logger(__name__) + + +class RLTrainer(Trainer): + """ + This class is the base class for trainers that use Reward Signals. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # collected_rewards is a dictionary from name of reward signal to a dictionary of agent_id to cumulative reward + # used for reporting only. We always want to report the environment reward to Tensorboard, regardless + # of what reward signals are actually present. + self.cumulative_returns_since_policy_update: List[float] = [] + self.collected_rewards: Dict[str, Dict[str, int]] = { + "environment": defaultdict(lambda: 0) + } + self.update_buffer: AgentBuffer = AgentBuffer() + self._stats_reporter.add_property( + StatsPropertyType.HYPERPARAMETERS, self.trainer_settings.as_dict() + ) + + self._next_save_step = 0 + self._next_summary_step = 0 + self.model_saver = self.create_model_saver( + self.trainer_settings, self.artifact_path, self.load + ) + self._has_warned_group_rewards = False + + def end_episode(self) -> None: + """ + A signal that the Episode has ended. The buffer must be reset. + Get only called when the academy resets. 
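The on-policy update above first rounds batch_size down to a whole number of sequences, then standardizes advantages before the epoch loop; a small sketch of that arithmetic (numbers illustrative):

import numpy as np

sequence_length = 64
requested_batch = 1_000

batch_size = requested_batch - requested_batch % sequence_length  # 960
batch_size = max(batch_size, sequence_length)                     # keep at least one sequence

advantages = np.array([1.0, 2.0, 3.0], dtype=np.float32)
normalized = (advantages - advantages.mean()) / (advantages.std() + 1e-10)
assert abs(normalized.mean()) < 1e-6 and abs(normalized.std() - 1.0) < 1e-4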
+ """ + for rewards in self.collected_rewards.values(): + for agent_id in rewards: + rewards[agent_id] = 0 + + def _update_end_episode_stats(self, agent_id: str, optimizer: Optimizer) -> None: + for name, rewards in self.collected_rewards.items(): + if name == "environment": + self.stats_reporter.add_stat( + "Environment/Cumulative Reward", + rewards.get(agent_id, 0), + aggregation=StatsAggregationMethod.HISTOGRAM, + ) + self.cumulative_returns_since_policy_update.append( + rewards.get(agent_id, 0) + ) + self.reward_buffer.appendleft(rewards.get(agent_id, 0)) + rewards[agent_id] = 0 + else: + if isinstance(optimizer.reward_signals[name], BaseRewardProvider): + self.stats_reporter.add_stat( + f"Policy/{optimizer.reward_signals[name].name.capitalize()} Reward", + rewards.get(agent_id, 0), + ) + else: + self.stats_reporter.add_stat( + optimizer.reward_signals[name].stat_name, + rewards.get(agent_id, 0), + ) + rewards[agent_id] = 0 + + def _clear_update_buffer(self) -> None: + """ + Clear the buffers that have been built up during inference. + """ + self.update_buffer.reset_agent() + + @abc.abstractmethod + def _is_ready_update(self): + """ + Returns whether or not the trainer has enough elements to run update model + :return: A boolean corresponding to wether or not update_model() can be run + """ + return False + + @abc.abstractmethod + def create_optimizer(self) -> TorchOptimizer: + """ + Creates an Optimizer object + """ + pass + + @staticmethod + def create_model_saver( + trainer_settings: TrainerSettings, model_path: str, load: bool + ) -> BaseModelSaver: + model_saver = TorchModelSaver( # type: ignore + trainer_settings, model_path, load + ) + return model_saver + + def _policy_mean_reward(self) -> Optional[float]: + """Returns the mean episode reward for the current policy.""" + rewards = self.cumulative_returns_since_policy_update + if len(rewards) == 0: + return None + else: + return sum(rewards) / len(rewards) + + @timed + def _checkpoint(self) -> ModelCheckpoint: + """ + Checkpoints the policy associated with this trainer. + """ + n_policies = len(self.policies.keys()) + if n_policies > 1: + logger.warning( + "Trainer has multiple policies, but default behavior only saves the first." + ) + export_path, auxillary_paths = self.model_saver.save_checkpoint( + self.brain_name, self._step + ) + new_checkpoint = ModelCheckpoint( + int(self._step), + export_path, + self._policy_mean_reward(), + time.time(), + auxillary_file_paths=auxillary_paths, + ) + ModelCheckpointManager.add_checkpoint( + self.brain_name, new_checkpoint, self.trainer_settings.keep_checkpoints + ) + return new_checkpoint + + def save_model(self) -> None: + """ + Saves the policy associated with this trainer. + """ + n_policies = len(self.policies.keys()) + if n_policies > 1: + logger.warning( + "Trainer has multiple policies, but default behavior only saves the first." + ) + elif n_policies == 0: + logger.warning("Trainer has no policies, not saving anything.") + return + + model_checkpoint = self._checkpoint() + self.model_saver.copy_final_model(model_checkpoint.file_path) + export_ext = "onnx" + final_checkpoint = attr.evolve( + model_checkpoint, file_path=f"{self.model_saver.model_path}.{export_ext}" + ) + ModelCheckpointManager.track_final_checkpoint(self.brain_name, final_checkpoint) + + @abc.abstractmethod + def _update_policy(self) -> bool: + """ + Uses demonstration_buffer to update model. + :return: Whether or not the policy was updated. 
+ """ + pass + + def _increment_step(self, n_steps: int, name_behavior_id: str) -> None: + """ + Increment the step count of the trainer + :param n_steps: number of steps to increment the step count by + """ + self._step += n_steps + self._next_summary_step = self._get_next_interval_step(self.summary_freq) + self._next_save_step = self._get_next_interval_step( + self.trainer_settings.checkpoint_interval + ) + p = self.get_policy(name_behavior_id) + if p: + p.increment_step(n_steps) + self.stats_reporter.set_stat("Step", float(self.get_step)) + + def _get_next_interval_step(self, interval: int) -> int: + """ + Get the next step count that should result in an action. + :param interval: The interval between actions. + """ + return self._step + (interval - self._step % interval) + + def _write_summary(self, step: int) -> None: + """ + Saves training statistics to Tensorboard. + """ + self.stats_reporter.add_stat("Is Training", float(self.should_still_train)) + self.stats_reporter.write_stats(int(step)) + + @abc.abstractmethod + def _process_trajectory(self, trajectory: Trajectory) -> None: + """ + Takes a trajectory and processes it, putting it into the update buffer. + :param trajectory: The Trajectory tuple containing the steps to be processed. + """ + self._maybe_write_summary(self.get_step + len(trajectory.steps)) + self._maybe_save_model(self.get_step + len(trajectory.steps)) + self._increment_step(len(trajectory.steps), trajectory.behavior_id) + + def _maybe_write_summary(self, step_after_process: int) -> None: + """ + If processing the trajectory will make the step exceed the next summary write, + write the summary. This logic ensures summaries are written on the update step and not in between. + :param step_after_process: the step count after processing the next trajectory. + """ + if self._next_summary_step == 0: # Don't write out the first one + self._next_summary_step = self._get_next_interval_step(self.summary_freq) + if step_after_process >= self._next_summary_step and self.get_step != 0: + self._write_summary(self._next_summary_step) + + def _append_to_update_buffer(self, agentbuffer_trajectory: AgentBuffer) -> None: + """ + Append an AgentBuffer to the update buffer. If the trainer isn't training, + don't update to avoid a memory leak. + """ + if self.should_still_train: + seq_len = ( + self.trainer_settings.network_settings.memory.sequence_length + if self.trainer_settings.network_settings.memory is not None + else 1 + ) + agentbuffer_trajectory.resequence_and_append( + self.update_buffer, training_length=seq_len + ) + + def _maybe_save_model(self, step_after_process: int) -> None: + """ + If processing the trajectory will make the step exceed the next model write, + save the model. This logic ensures models are written on the update step and not in between. + :param step_after_process: the step count after processing the next trajectory. + """ + if self._next_save_step == 0: # Don't save the first one + self._next_save_step = self._get_next_interval_step( + self.trainer_settings.checkpoint_interval + ) + if step_after_process >= self._next_save_step and self.get_step != 0: + self._checkpoint() + + def _warn_if_group_reward(self, buffer: AgentBuffer) -> None: + """ + Warn if the trainer receives a Group Reward but isn't a multiagent trainer (e.g. POCA). + """ + if not self._has_warned_group_rewards: + if np.any(buffer[BufferKey.GROUP_REWARD]): + logger.warning( + "An agent recieved a Group Reward, but you are not using a multi-agent trainer. 
" + "Please use the POCA trainer for best results." + ) + self._has_warned_group_rewards = True + + def advance(self) -> None: + """ + Steps the trainer, taking in trajectories and updates if ready. + Will block and wait briefly if there are no trajectories. + """ + with hierarchical_timer("process_trajectory"): + for traj_queue in self.trajectory_queues: + # We grab at most the maximum length of the queue. + # This ensures that even if the queue is being filled faster than it is + # being emptied, the trajectories in the queue are on-policy. + _queried = False + for _ in range(traj_queue.qsize()): + _queried = True + try: + t = traj_queue.get_nowait() + self._process_trajectory(t) + except AgentManagerQueue.Empty: + break + if self.threaded and not _queried: + # Yield thread to avoid busy-waiting + time.sleep(0.0001) + if self.should_still_train: + if self._is_ready_update(): + with hierarchical_timer("_update_policy"): + if self._update_policy(): + for q in self.policy_queues: + # Get policies that correspond to the policy queue in question + q.put(self.get_policy(q.behavior_id)) diff --git a/MLPY/Lib/site-packages/mlagents/trainers/trainer/trainer.py b/MLPY/Lib/site-packages/mlagents/trainers/trainer/trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..58a339efd2b4e5089d8564bbe51d55d005b2b27b --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/trainer/trainer.py @@ -0,0 +1,183 @@ +# # Unity ML-Agents Toolkit +from typing import List, Deque, Dict +import abc +from collections import deque + +from mlagents_envs.logging_util import get_logger +from mlagents_envs.base_env import BehaviorSpec +from mlagents.trainers.stats import StatsReporter +from mlagents.trainers.trajectory import Trajectory +from mlagents.trainers.agent_processor import AgentManagerQueue +from mlagents.trainers.policy import Policy +from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers +from mlagents.trainers.settings import TrainerSettings + + +logger = get_logger(__name__) + + +class Trainer(abc.ABC): + """This class is the base class for the mlagents_envs.trainers""" + + def __init__( + self, + brain_name: str, + trainer_settings: TrainerSettings, + training: bool, + load: bool, + artifact_path: str, + reward_buff_cap: int = 1, + ): + """ + Responsible for collecting experiences and training a neural network model. + :param brain_name: Brain name of brain to be trained. + :param trainer_settings: The parameters for the trainer (dictionary). + :param training: Whether the trainer is set for training. + :param artifact_path: The directory within which to store artifacts from this trainer + :param reward_buff_cap: + """ + self.brain_name = brain_name + self.trainer_settings = trainer_settings + self._threaded = trainer_settings.threaded + self._stats_reporter = StatsReporter(brain_name) + self.is_training = training + self.load = load + self._reward_buffer: Deque[float] = deque(maxlen=reward_buff_cap) + self.policy_queues: List[AgentManagerQueue[Policy]] = [] + self.trajectory_queues: List[AgentManagerQueue[Trajectory]] = [] + self._step: int = 0 + self.artifact_path = artifact_path + self.summary_freq = self.trainer_settings.summary_freq + self.policies: Dict[str, Policy] = {} + + @property + def stats_reporter(self): + """ + Returns the stats reporter associated with this Trainer. + """ + return self._stats_reporter + + @property + def parameters(self) -> TrainerSettings: + """ + Returns the trainer parameters of the trainer. 
+ """ + return self.trainer_settings + + @property + def get_max_steps(self) -> int: + """ + Returns the maximum number of steps. Is used to know when the trainer should be stopped. + :return: The maximum number of steps of the trainer + """ + return self.trainer_settings.max_steps + + @property + def get_step(self) -> int: + """ + Returns the number of steps the trainer has performed + :return: the step count of the trainer + """ + return self._step + + @property + def threaded(self) -> bool: + """ + Whether or not to run the trainer in a thread. True allows the trainer to + update the policy while the environment is taking steps. Set to False to + enforce strict on-policy updates (i.e. don't update the policy when taking steps.) + """ + return self._threaded + + @property + def should_still_train(self) -> bool: + """ + Returns whether or not the trainer should train. A Trainer could + stop training if it wasn't training to begin with, or if max_steps + is reached. + """ + return self.is_training and self.get_step <= self.get_max_steps + + @property + def reward_buffer(self) -> Deque[float]: + """ + Returns the reward buffer. The reward buffer contains the cumulative + rewards of the most recent episodes completed by agents using this + trainer. + :return: the reward buffer. + """ + return self._reward_buffer + + @abc.abstractmethod + def save_model(self) -> None: + """ + Saves model file(s) for the policy or policies associated with this trainer. + """ + pass + + @abc.abstractmethod + def end_episode(self): + """ + A signal that the Episode has ended. The buffer must be reset. + Get only called when the academy resets. + """ + pass + + @abc.abstractmethod + def create_policy( + self, + parsed_behavior_id: BehaviorIdentifiers, + behavior_spec: BehaviorSpec, + ) -> Policy: + """ + Creates a Policy object + """ + pass + + @abc.abstractmethod + def add_policy( + self, parsed_behavior_id: BehaviorIdentifiers, policy: Policy + ) -> None: + """ + Adds policy to trainer. + """ + pass + + def get_policy(self, name_behavior_id: str) -> Policy: + """ + Gets policy associated with name_behavior_id + :param name_behavior_id: Fully qualified behavior name + :return: Policy associated with name_behavior_id + """ + return self.policies[name_behavior_id] + + @abc.abstractmethod + def advance(self) -> None: + """ + Advances the trainer. Typically, this means grabbing trajectories + from all subscribed trajectory queues (self.trajectory_queues), and updating + a policy using the steps in them, and if needed pushing a new policy onto the right + policy queues (self.policy_queues). + """ + pass + + def publish_policy_queue(self, policy_queue: AgentManagerQueue[Policy]) -> None: + """ + Adds a policy queue to the list of queues to publish to when this Trainer + makes a policy update + :param policy_queue: Policy queue to publish to. + """ + self.policy_queues.append(policy_queue) + + def subscribe_trajectory_queue( + self, trajectory_queue: AgentManagerQueue[Trajectory] + ) -> None: + """ + Adds a trajectory queue to the list of queues for the trainer to ingest Trajectories from. + :param trajectory_queue: Trajectory queue to read from. 
+ """ + self.trajectory_queues.append(trajectory_queue) + + @staticmethod + def get_trainer_name() -> str: + raise NotImplementedError diff --git a/MLPY/Lib/site-packages/mlagents/trainers/trainer/trainer_factory.py b/MLPY/Lib/site-packages/mlagents/trainers/trainer/trainer_factory.py new file mode 100644 index 0000000000000000000000000000000000000000..ffb774151336a5a3e790e25843c62895562ad57d --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/trainer/trainer_factory.py @@ -0,0 +1,131 @@ +import os +from typing import Dict + +from mlagents_envs.logging_util import get_logger +from mlagents.trainers.environment_parameter_manager import EnvironmentParameterManager +from mlagents.trainers.exception import TrainerConfigError +from mlagents.trainers.trainer import Trainer +from mlagents.trainers.ghost.trainer import GhostTrainer +from mlagents.trainers.ghost.controller import GhostController +from mlagents.trainers.settings import TrainerSettings +from mlagents.plugins import all_trainer_types + + +logger = get_logger(__name__) + + +class TrainerFactory: + def __init__( + self, + trainer_config: Dict[str, TrainerSettings], + output_path: str, + train_model: bool, + load_model: bool, + seed: int, + param_manager: EnvironmentParameterManager, + init_path: str = None, + multi_gpu: bool = False, + ): + """ + The TrainerFactory generates the Trainers based on the configuration passed as + input. + :param trainer_config: A dictionary from behavior name to TrainerSettings + :param output_path: The path to the directory where the artifacts generated by + the trainer will be saved. + :param train_model: If True, the Trainers will train the model and if False, + only perform inference. + :param load_model: If True, the Trainer will load neural networks weights from + the previous run. + :param seed: The seed of the Trainers. Dictates how the neural networks will be + initialized. + :param param_manager: The EnvironmentParameterManager that will dictate when/if + the EnvironmentParameters must change. + :param init_path: Path from which to load model. + :param multi_gpu: If True, multi-gpu will be used. (currently not available) + """ + self.trainer_config = trainer_config + self.output_path = output_path + self.init_path = init_path + self.train_model = train_model + self.load_model = load_model + self.seed = seed + self.param_manager = param_manager + self.multi_gpu = multi_gpu + self.ghost_controller = GhostController() + + def generate(self, behavior_name: str) -> Trainer: + trainer_settings = self.trainer_config[behavior_name] + return TrainerFactory._initialize_trainer( + trainer_settings, + behavior_name, + self.output_path, + self.train_model, + self.load_model, + self.ghost_controller, + self.seed, + self.param_manager, + self.multi_gpu, + ) + + @staticmethod + def _initialize_trainer( + trainer_settings: TrainerSettings, + brain_name: str, + output_path: str, + train_model: bool, + load_model: bool, + ghost_controller: GhostController, + seed: int, + param_manager: EnvironmentParameterManager, + multi_gpu: bool = False, + ) -> Trainer: + """ + Initializes a trainer given a provided trainer configuration and brain parameters, as well as + some general training session options. 
+ + :param trainer_settings: Original trainer configuration loaded from YAML + :param brain_name: Name of the brain to be associated with trainer + :param output_path: Path to save the model and summary statistics + :param keep_checkpoints: How many model checkpoints to keep + :param train_model: Whether to train the model (vs. run inference) + :param load_model: Whether to load the model or randomly initialize + :param ghost_controller: The object that coordinates ghost trainers + :param seed: The random seed to use + :param param_manager: EnvironmentParameterManager, used to determine a reward buffer length for PPOTrainer + :return: + """ + trainer_artifact_path = os.path.join(output_path, brain_name) + + min_lesson_length = param_manager.get_minimum_reward_buffer_size(brain_name) + + trainer: Trainer = None # type: ignore # will be set to one of these, or raise + + try: + trainer_type = all_trainer_types[trainer_settings.trainer_type] + trainer = trainer_type( + brain_name, + min_lesson_length, + trainer_settings, + train_model, + load_model, + seed, + trainer_artifact_path, + ) + + except KeyError: + raise TrainerConfigError( + f"The trainer config contains an unknown trainer type " + f"{trainer_settings.trainer_type} for brain {brain_name}" + ) + + if trainer_settings.self_play is not None: + trainer = GhostTrainer( + trainer, + brain_name, + ghost_controller, + min_lesson_length, + trainer_settings, + train_model, + trainer_artifact_path, + ) + return trainer diff --git a/MLPY/Lib/site-packages/mlagents/trainers/trainer/trainer_utils.py b/MLPY/Lib/site-packages/mlagents/trainers/trainer/trainer_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..ad94bd35f066fe37780555d2b3402662f62d4ca5 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/trainer/trainer_utils.py @@ -0,0 +1,45 @@ +import numpy as np + + +def discount_rewards(r, gamma=0.99, value_next=0.0): + """ + Computes discounted sum of future rewards for use in updating value estimate. + :param r: List of rewards. + :param gamma: Discount factor. + :param value_next: T+1 value estimate for returns calculation. + :return: discounted sum of future rewards as list. + """ + discounted_r = np.zeros_like(r) + running_add = value_next + for t in reversed(range(0, r.size)): + running_add = running_add * gamma + r[t] + discounted_r[t] = running_add + return discounted_r + + +def get_gae(rewards, value_estimates, value_next=0.0, gamma=0.99, lambd=0.95): + """ + Computes generalized advantage estimate for use in updating policy. + :param rewards: list of rewards for time-steps t to T. + :param value_next: Value estimate for time-step T+1. + :param value_estimates: list of value estimates for time-steps t to T. + :param gamma: Discount factor. + :param lambd: GAE weighing factor. + :return: list of advantage estimates for time-steps t to T. 
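trainer_utils.discount_rewards above is small enough to check by hand; a worked example, assuming the vendored package is importable under the path shown in the diff:

import numpy as np
from mlagents.trainers.trainer.trainer_utils import discount_rewards

# gamma=0.5, no bootstrap value, walking backwards through the rewards:
#   t=2: 0.0*0.5 + 1 = 1.0
#   t=1: 1.0*0.5 + 1 = 1.5
#   t=0: 1.5*0.5 + 1 = 1.75
returns = discount_rewards(np.array([1.0, 1.0, 1.0]), gamma=0.5, value_next=0.0)
assert np.allclose(returns, [1.75, 1.5, 1.0])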
+ """ + value_estimates = np.append(value_estimates, value_next) + delta_t = rewards + gamma * value_estimates[1:] - value_estimates[:-1] + advantage = discount_rewards(r=delta_t, gamma=gamma * lambd) + return advantage + + +def lambda_return(r, value_estimates, gamma=0.99, lambd=0.8, value_next=0.0): + returns = np.zeros_like(r) + returns[-1] = r[-1] + gamma * value_next + for t in reversed(range(0, r.size - 1)): + returns[t] = ( + gamma * lambd * returns[t + 1] + + r[t] + + (1 - lambd) * gamma * value_estimates[t + 1] + ) + return returns diff --git a/MLPY/Lib/site-packages/mlagents/trainers/trainer_controller.py b/MLPY/Lib/site-packages/mlagents/trainers/trainer_controller.py new file mode 100644 index 0000000000000000000000000000000000000000..69da1e569496d043e1d4c167a2cce963e9fd69d9 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/trainer_controller.py @@ -0,0 +1,297 @@ +# # Unity ML-Agents Toolkit +# ## ML-Agent Learning +"""Launches trainers for each External Brains in a Unity Environment.""" + +import os +import threading +from typing import Dict, Set, List +from collections import defaultdict + +import numpy as np + +from mlagents_envs.logging_util import get_logger +from mlagents.trainers.env_manager import EnvManager, EnvironmentStep +from mlagents_envs.exception import ( + UnityEnvironmentException, + UnityCommunicationException, + UnityCommunicatorStoppedException, +) +from mlagents_envs.timers import ( + hierarchical_timer, + timed, + get_timer_stack_for_thread, + merge_gauges, +) +from mlagents.trainers.trainer import Trainer +from mlagents.trainers.environment_parameter_manager import EnvironmentParameterManager +from mlagents.trainers.trainer import TrainerFactory +from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers +from mlagents.trainers.agent_processor import AgentManager +from mlagents import torch_utils +from mlagents.torch_utils.globals import get_rank + + +class TrainerController: + def __init__( + self, + trainer_factory: TrainerFactory, + output_path: str, + run_id: str, + param_manager: EnvironmentParameterManager, + train: bool, + training_seed: int, + ): + """ + :param output_path: Path to save the model. + :param summaries_dir: Folder to save training summaries. + :param run_id: The sub-directory name for model and summary statistics + :param param_manager: EnvironmentParameterManager object which stores information about all + environment parameters. + :param train: Whether to train model, or only run inference. + :param training_seed: Seed to use for Numpy and Torch random number generation. + :param threaded: Whether or not to run trainers in a separate thread. Disable for testing/debugging. + """ + self.trainers: Dict[str, Trainer] = {} + self.brain_name_to_identifier: Dict[str, Set] = defaultdict(set) + self.trainer_factory = trainer_factory + self.output_path = output_path + self.logger = get_logger(__name__) + self.run_id = run_id + self.train_model = train + self.param_manager = param_manager + self.ghost_controller = self.trainer_factory.ghost_controller + self.registered_behavior_ids: Set[str] = set() + + self.trainer_threads: List[threading.Thread] = [] + self.kill_trainers = False + np.random.seed(training_seed) + torch_utils.torch.manual_seed(training_seed) + self.rank = get_rank() + + @timed + def _save_models(self): + """ + Saves current model to checkpoint folder. 
+ """ + if self.rank is not None and self.rank != 0: + return + + for brain_name in self.trainers.keys(): + self.trainers[brain_name].save_model() + self.logger.debug("Saved Model") + + @staticmethod + def _create_output_path(output_path): + try: + if not os.path.exists(output_path): + os.makedirs(output_path) + except Exception: + raise UnityEnvironmentException( + f"The folder {output_path} containing the " + "generated model could not be " + "accessed. Please make sure the " + "permissions are set correctly." + ) + + @timed + def _reset_env(self, env_manager: EnvManager) -> None: + """Resets the environment. + + Returns: + A Data structure corresponding to the initial reset state of the + environment. + """ + new_config = self.param_manager.get_current_samplers() + env_manager.reset(config=new_config) + # Register any new behavior ids that were generated on the reset. + self._register_new_behaviors(env_manager, env_manager.first_step_infos) + + def _not_done_training(self) -> bool: + return ( + any(t.should_still_train for t in self.trainers.values()) + or not self.train_model + ) or len(self.trainers) == 0 + + def _create_trainer_and_manager( + self, env_manager: EnvManager, name_behavior_id: str + ) -> None: + + parsed_behavior_id = BehaviorIdentifiers.from_name_behavior_id(name_behavior_id) + brain_name = parsed_behavior_id.brain_name + trainerthread = None + if brain_name in self.trainers: + trainer = self.trainers[brain_name] + else: + trainer = self.trainer_factory.generate(brain_name) + self.trainers[brain_name] = trainer + if trainer.threaded: + # Only create trainer thread for new trainers + trainerthread = threading.Thread( + target=self.trainer_update_func, args=(trainer,), daemon=True + ) + self.trainer_threads.append(trainerthread) + env_manager.on_training_started( + brain_name, self.trainer_factory.trainer_config[brain_name] + ) + + policy = trainer.create_policy( + parsed_behavior_id, + env_manager.training_behaviors[name_behavior_id], + ) + trainer.add_policy(parsed_behavior_id, policy) + + agent_manager = AgentManager( + policy, + name_behavior_id, + trainer.stats_reporter, + trainer.parameters.time_horizon, + threaded=trainer.threaded, + ) + env_manager.set_agent_manager(name_behavior_id, agent_manager) + env_manager.set_policy(name_behavior_id, policy) + self.brain_name_to_identifier[brain_name].add(name_behavior_id) + + trainer.publish_policy_queue(agent_manager.policy_queue) + trainer.subscribe_trajectory_queue(agent_manager.trajectory_queue) + + # Only start new trainers + if trainerthread is not None: + trainerthread.start() + + def _create_trainers_and_managers( + self, env_manager: EnvManager, behavior_ids: Set[str] + ) -> None: + for behavior_id in behavior_ids: + self._create_trainer_and_manager(env_manager, behavior_id) + + @timed + def start_learning(self, env_manager: EnvManager) -> None: + self._create_output_path(self.output_path) + try: + # Initial reset + self._reset_env(env_manager) + self.param_manager.log_current_lesson() + while self._not_done_training(): + n_steps = self.advance(env_manager) + for _ in range(n_steps): + self.reset_env_if_ready(env_manager) + # Stop advancing trainers + self.join_threads() + except ( + KeyboardInterrupt, + UnityCommunicationException, + UnityEnvironmentException, + UnityCommunicatorStoppedException, + ) as ex: + self.join_threads() + self.logger.info( + "Learning was interrupted. Please wait while the graph is generated." 
+ ) + if isinstance(ex, KeyboardInterrupt) or isinstance( + ex, UnityCommunicatorStoppedException + ): + pass + else: + # If the environment failed, we want to make sure to raise + # the exception so we exit the process with an return code of 1. + raise ex + finally: + if self.train_model: + self._save_models() + + def end_trainer_episodes(self) -> None: + # Reward buffers reset takes place only for curriculum learning + # else no reset. + for trainer in self.trainers.values(): + trainer.end_episode() + + def reset_env_if_ready(self, env: EnvManager) -> None: + # Get the sizes of the reward buffers. + reward_buff = {k: list(t.reward_buffer) for (k, t) in self.trainers.items()} + curr_step = {k: int(t.get_step) for (k, t) in self.trainers.items()} + max_step = {k: int(t.get_max_steps) for (k, t) in self.trainers.items()} + # Attempt to increment the lessons of the brains who + # were ready. + updated, param_must_reset = self.param_manager.update_lessons( + curr_step, max_step, reward_buff + ) + if updated: + for trainer in self.trainers.values(): + trainer.reward_buffer.clear() + # If ghost trainer swapped teams + ghost_controller_reset = self.ghost_controller.should_reset() + if param_must_reset or ghost_controller_reset: + self._reset_env(env) # This reset also sends the new config to env + self.end_trainer_episodes() + elif updated: + env.set_env_parameters(self.param_manager.get_current_samplers()) + + @timed + def advance(self, env_manager: EnvManager) -> int: + # Get steps + with hierarchical_timer("env_step"): + new_step_infos = env_manager.get_steps() + self._register_new_behaviors(env_manager, new_step_infos) + num_steps = env_manager.process_steps(new_step_infos) + + # Report current lesson for each environment parameter + for ( + param_name, + lesson_number, + ) in self.param_manager.get_current_lesson_number().items(): + for trainer in self.trainers.values(): + trainer.stats_reporter.set_stat( + f"Environment/Lesson Number/{param_name}", lesson_number + ) + + for trainer in self.trainers.values(): + if not trainer.threaded: + with hierarchical_timer("trainer_advance"): + trainer.advance() + + return num_steps + + def _register_new_behaviors( + self, env_manager: EnvManager, step_infos: List[EnvironmentStep] + ) -> None: + """ + Handle registration (adding trainers and managers) of new behaviors ids. + :param env_manager: + :param step_infos: + :return: + """ + step_behavior_ids: Set[str] = set() + for s in step_infos: + step_behavior_ids |= set(s.name_behavior_ids) + new_behavior_ids = step_behavior_ids - self.registered_behavior_ids + self._create_trainers_and_managers(env_manager, new_behavior_ids) + self.registered_behavior_ids |= step_behavior_ids + + def join_threads(self, timeout_seconds: float = 1.0) -> None: + """ + Wait for threads to finish, and merge their timer information into the main thread. 
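_register_new_behaviors reduces to a set difference over behavior ids; a small sketch with illustrative ids:

# Behavior ids seen this step minus those already registered gives the new ones.
registered_behavior_ids = {"Walker?team=0"}
step_behavior_ids = {"Walker?team=0", "Walker?team=1"}

new_behavior_ids = step_behavior_ids - registered_behavior_ids   # {"Walker?team=1"}
registered_behavior_ids |= step_behavior_ids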
+ :param timeout_seconds: + :return: + """ + self.kill_trainers = True + for t in self.trainer_threads: + try: + t.join(timeout_seconds) + except Exception: + pass + + with hierarchical_timer("trainer_threads") as main_timer_node: + for trainer_thread in self.trainer_threads: + thread_timer_stack = get_timer_stack_for_thread(trainer_thread) + if thread_timer_stack: + main_timer_node.merge( + thread_timer_stack.root, + root_name="thread_root", + is_parallel=True, + ) + merge_gauges(thread_timer_stack.gauges) + + def trainer_update_func(self, trainer: Trainer) -> None: + while not self.kill_trainers: + with hierarchical_timer("trainer_advance"): + trainer.advance() diff --git a/MLPY/Lib/site-packages/mlagents/trainers/training_analytics_side_channel.py b/MLPY/Lib/site-packages/mlagents/trainers/training_analytics_side_channel.py new file mode 100644 index 0000000000000000000000000000000000000000..84edad717534342d943feffe6a3c44de270a1f61 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/training_analytics_side_channel.py @@ -0,0 +1,188 @@ +import copy +import json +import hmac +import hashlib +import sys +from typing import Optional, Dict +import mlagents_envs +import mlagents.trainers +from mlagents import torch_utils +from mlagents.trainers.settings import RewardSignalType +from mlagents_envs.exception import UnityCommunicationException +from mlagents_envs.side_channel import ( + IncomingMessage, + OutgoingMessage, + DefaultTrainingAnalyticsSideChannel, +) +from mlagents_envs.communicator_objects.training_analytics_pb2 import ( + TrainingEnvironmentInitialized, + TrainingBehaviorInitialized, +) +from google.protobuf.any_pb2 import Any + +from mlagents.trainers.settings import TrainerSettings, RunOptions + + +class TrainingAnalyticsSideChannel(DefaultTrainingAnalyticsSideChannel): + """ + Side channel that sends information about the training to the Unity environment so it can be logged. + """ + + __vendorKey: str = "unity.ml-agents" + + def __init__(self) -> None: + # >>> uuid.uuid5(uuid.NAMESPACE_URL, "com.unity.ml-agents/TrainingAnalyticsSideChannel") + # UUID('b664a4a9-d86f-5a5f-95cb-e8353a7e8356') + # Use the same uuid as the parent side channel + super().__init__() + self.run_options: Optional[RunOptions] = None + + @classmethod + def _hash(cls, data: str) -> str: + res = hmac.new( + cls.__vendorKey.encode("utf-8"), data.encode("utf-8"), hashlib.sha256 + ).hexdigest() + return res + + def on_message_received(self, msg: IncomingMessage) -> None: + raise UnityCommunicationException( + "The TrainingAnalyticsSideChannel received a message from Unity, " + "this should not have happened." 
+ ) + + @classmethod + def _sanitize_run_options(cls, config: RunOptions) -> Dict[str, Any]: + res = copy.deepcopy(config.as_dict()) + + # Filter potentially PII behavior names + if "behaviors" in res and res["behaviors"]: + res["behaviors"] = {cls._hash(k): v for (k, v) in res["behaviors"].items()} + for (k, v) in res["behaviors"].items(): + if "init_path" in v and v["init_path"] is not None: + hashed_path = cls._hash(v["init_path"]) + res["behaviors"][k]["init_path"] = hashed_path + if "demo_path" in v and v["demo_path"] is not None: + hashed_path = cls._hash(v["demo_path"]) + res["behaviors"][k]["demo_path"] = hashed_path + + # Filter potentially PII curriculum and behavior names from Checkpoint Settings + if "environment_parameters" in res and res["environment_parameters"]: + res["environment_parameters"] = { + cls._hash(k): v for (k, v) in res["environment_parameters"].items() + } + for (curriculumName, curriculum) in res["environment_parameters"].items(): + updated_lessons = [] + for lesson in curriculum["curriculum"]: + new_lesson = copy.deepcopy(lesson) + if "name" in lesson: + new_lesson["name"] = cls._hash(lesson["name"]) + if ( + "completion_criteria" in lesson + and lesson["completion_criteria"] is not None + ): + new_lesson["completion_criteria"]["behavior"] = cls._hash( + new_lesson["completion_criteria"]["behavior"] + ) + updated_lessons.append(new_lesson) + res["environment_parameters"][curriculumName][ + "curriculum" + ] = updated_lessons + + # Filter potentially PII filenames from Checkpoint Settings + if "checkpoint_settings" in res and res["checkpoint_settings"] is not None: + if ( + "initialize_from" in res["checkpoint_settings"] + and res["checkpoint_settings"]["initialize_from"] is not None + ): + res["checkpoint_settings"]["initialize_from"] = cls._hash( + res["checkpoint_settings"]["initialize_from"] + ) + if ( + "results_dir" in res["checkpoint_settings"] + and res["checkpoint_settings"]["results_dir"] is not None + ): + res["checkpoint_settings"]["results_dir"] = hash( + res["checkpoint_settings"]["results_dir"] + ) + + return res + + def environment_initialized(self, run_options: RunOptions) -> None: + self.run_options = run_options + # Tuple of (major, minor, patch) + vi = sys.version_info + env_params = run_options.environment_parameters + sanitized_run_options = self._sanitize_run_options(run_options) + + msg = TrainingEnvironmentInitialized( + python_version=f"{vi[0]}.{vi[1]}.{vi[2]}", + mlagents_version=mlagents.trainers.__version__, + mlagents_envs_version=mlagents_envs.__version__, + torch_version=torch_utils.torch.__version__, + torch_device_type=torch_utils.default_device().type, + num_envs=run_options.env_settings.num_envs, + num_environment_parameters=len(env_params) if env_params else 0, + run_options=json.dumps(sanitized_run_options), + ) + + any_message = Any() + any_message.Pack(msg) + + env_init_msg = OutgoingMessage() + env_init_msg.set_raw_bytes(any_message.SerializeToString()) + super().queue_message_to_send(env_init_msg) + + @classmethod + def _sanitize_trainer_settings(cls, config: TrainerSettings) -> Dict[str, Any]: + config_dict = copy.deepcopy(config.as_dict()) + if "init_path" in config_dict and config_dict["init_path"] is not None: + hashed_path = cls._hash(config_dict["init_path"]) + config_dict["init_path"] = hashed_path + if "demo_path" in config_dict and config_dict["demo_path"] is not None: + hashed_path = cls._hash(config_dict["demo_path"]) + config_dict["demo_path"] = hashed_path + return config_dict + + def 
training_started(self, behavior_name: str, config: TrainerSettings) -> None: + raw_config = self._sanitize_trainer_settings(config) + msg = TrainingBehaviorInitialized( + behavior_name=self._hash(behavior_name), + trainer_type=config.trainer_type, + extrinsic_reward_enabled=( + RewardSignalType.EXTRINSIC in config.reward_signals + ), + gail_reward_enabled=(RewardSignalType.GAIL in config.reward_signals), + curiosity_reward_enabled=( + RewardSignalType.CURIOSITY in config.reward_signals + ), + rnd_reward_enabled=(RewardSignalType.RND in config.reward_signals), + behavioral_cloning_enabled=config.behavioral_cloning is not None, + recurrent_enabled=config.network_settings.memory is not None, + visual_encoder=config.network_settings.vis_encode_type.value, + num_network_layers=config.network_settings.num_layers, + num_network_hidden_units=config.network_settings.hidden_units, + trainer_threaded=config.threaded, + self_play_enabled=config.self_play is not None, + curriculum_enabled=self._behavior_uses_curriculum(behavior_name), + config=json.dumps(raw_config), + ) + + any_message = Any() + any_message.Pack(msg) + + training_start_msg = OutgoingMessage() + training_start_msg.set_raw_bytes(any_message.SerializeToString()) + + super().queue_message_to_send(training_start_msg) + + def _behavior_uses_curriculum(self, behavior_name: str) -> bool: + if not self.run_options or not self.run_options.environment_parameters: + return False + + for param_settings in self.run_options.environment_parameters.values(): + for lesson in param_settings.curriculum: + cc = lesson.completion_criteria + if cc and cc.behavior == behavior_name: + return True + + return False diff --git a/MLPY/Lib/site-packages/mlagents/trainers/training_status.py b/MLPY/Lib/site-packages/mlagents/trainers/training_status.py new file mode 100644 index 0000000000000000000000000000000000000000..06bd73cd23af6e838675f815d859cc6e184a8f2b --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/training_status.py @@ -0,0 +1,118 @@ +from typing import Dict, Any +from enum import Enum +from collections import defaultdict +import json +import attr +import cattr + +from mlagents.torch_utils import torch +from mlagents_envs.logging_util import get_logger +from mlagents.trainers import __version__ +from mlagents.trainers.exception import TrainerError + +logger = get_logger(__name__) + +STATUS_FORMAT_VERSION = "0.3.0" + + +class StatusType(Enum): + LESSON_NUM = "lesson_num" + STATS_METADATA = "metadata" + CHECKPOINTS = "checkpoints" + FINAL_CHECKPOINT = "final_checkpoint" + ELO = "elo" + + +@attr.s(auto_attribs=True) +class StatusMetaData: + stats_format_version: str = STATUS_FORMAT_VERSION + mlagents_version: str = __version__ + torch_version: str = torch.__version__ + + def to_dict(self) -> Dict[str, str]: + return cattr.unstructure(self) + + @staticmethod + def from_dict(import_dict: Dict[str, str]) -> "StatusMetaData": + return cattr.structure(import_dict, StatusMetaData) + + def check_compatibility(self, other: "StatusMetaData") -> None: + """ + Check compatibility with a loaded StatsMetaData and warn the user + if versions mismatch. This is used for resuming from old checkpoints. + """ + # This should cover all stats version mismatches as well. + if self.mlagents_version != other.mlagents_version: + logger.warning( + "Checkpoint was loaded from a different version of ML-Agents. Some things may not resume properly." 
+ ) + if self.torch_version != other.torch_version: + logger.warning( + "PyTorch checkpoint was saved with a different version of PyTorch. Model may not resume properly." + ) + + +class GlobalTrainingStatus: + """ + GlobalTrainingStatus class that contains static methods to save global training status and + load it on a resume. These are values that might be needed for the training resume that + cannot/should not be captured in a model checkpoint, such as curriclum lesson. + """ + + saved_state: Dict[str, Dict[str, Any]] = defaultdict(lambda: {}) + + @staticmethod + def load_state(path: str) -> None: + """ + Load a JSON file that contains saved state. + :param path: Path to the JSON file containing the state. + """ + try: + with open(path) as f: + loaded_dict = json.load(f) + # Compare the metadata + _metadata = loaded_dict[StatusType.STATS_METADATA.value] + StatusMetaData.from_dict(_metadata).check_compatibility(StatusMetaData()) + # Update saved state. + GlobalTrainingStatus.saved_state.update(loaded_dict) + except FileNotFoundError: + logger.warning( + "Training status file not found. Not all functions will resume properly." + ) + except KeyError: + raise TrainerError( + "Metadata not found, resuming from an incompatible version of ML-Agents." + ) + + @staticmethod + def save_state(path: str) -> None: + """ + Save a JSON file that contains saved state. + :param path: Path to the JSON file containing the state. + """ + GlobalTrainingStatus.saved_state[ + StatusType.STATS_METADATA.value + ] = StatusMetaData().to_dict() + with open(path, "w") as f: + json.dump(GlobalTrainingStatus.saved_state, f, indent=4) + + @staticmethod + def set_parameter_state(category: str, key: StatusType, value: Any) -> None: + """ + Stores an arbitrary-named parameter in the global saved state. + :param category: The category (usually behavior name) of the parameter. + :param key: The parameter, e.g. lesson number. + :param value: The value. + """ + GlobalTrainingStatus.saved_state[category][key.value] = value + + @staticmethod + def get_parameter_state(category: str, key: StatusType) -> Any: + """ + Loads an arbitrary-named parameter from training_status.json. + If not found, returns None. + :param category: The category (usually behavior name) of the parameter. + :param key: The statistic, e.g. lesson number. + :param value: The value. + """ + return GlobalTrainingStatus.saved_state[category].get(key.value, None) diff --git a/MLPY/Lib/site-packages/mlagents/trainers/trajectory.py b/MLPY/Lib/site-packages/mlagents/trainers/trajectory.py new file mode 100644 index 0000000000000000000000000000000000000000..0a08bc24b497585357b4273b7d048a4218161ac0 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/trajectory.py @@ -0,0 +1,313 @@ +from typing import List, NamedTuple +import numpy as np + +from mlagents.trainers.buffer import ( + AgentBuffer, + ObservationKeyPrefix, + AgentBufferKey, + BufferKey, +) +from mlagents_envs.base_env import ActionTuple +from mlagents.trainers.torch_entities.action_log_probs import LogProbsTuple + + +class AgentStatus(NamedTuple): + """ + Stores observation, action, and reward for an agent. Does not have additional + fields that are present in AgentExperience. + """ + + obs: List[np.ndarray] + reward: float + action: ActionTuple + done: bool + + +class AgentExperience(NamedTuple): + """ + Stores the full amount of data for an agent in one timestep. Includes + the status' of group mates and the group reward, as well as the probabilities + outputted by the policy. 
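+
+    Illustrative construction sketch; shapes are placeholders and
+    ``LogProbsTuple()`` is assumed to accept empty construction in the same way
+    as ``ActionTuple``::
+
+        import numpy as np
+        exp = AgentExperience(
+            obs=[np.zeros(8, dtype=np.float32)], reward=0.0, done=False,
+            action=ActionTuple(continuous=np.zeros((1, 2), dtype=np.float32)),
+            action_probs=LogProbsTuple(), action_mask=None,
+            prev_action=np.zeros(2, dtype=np.float32), interrupted=False,
+            memory=np.zeros(0, dtype=np.float32), group_status=[], group_reward=0.0,
+        )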
+ """ + + obs: List[np.ndarray] + reward: float + done: bool + action: ActionTuple + action_probs: LogProbsTuple + action_mask: np.ndarray + prev_action: np.ndarray + interrupted: bool + memory: np.ndarray + group_status: List[AgentStatus] + group_reward: float + + +class ObsUtil: + @staticmethod + def get_name_at(index: int) -> AgentBufferKey: + """ + returns the name of the observation given the index of the observation + """ + return ObservationKeyPrefix.OBSERVATION, index + + @staticmethod + def get_name_at_next(index: int) -> AgentBufferKey: + """ + returns the name of the next observation given the index of the observation + """ + return ObservationKeyPrefix.NEXT_OBSERVATION, index + + @staticmethod + def from_buffer(batch: AgentBuffer, num_obs: int) -> List[np.array]: + """ + Creates the list of observations from an AgentBuffer + """ + result: List[np.array] = [] + for i in range(num_obs): + result.append(batch[ObsUtil.get_name_at(i)]) + return result + + @staticmethod + def from_buffer_next(batch: AgentBuffer, num_obs: int) -> List[np.array]: + """ + Creates the list of next observations from an AgentBuffer + """ + result = [] + for i in range(num_obs): + result.append(batch[ObsUtil.get_name_at_next(i)]) + return result + + +class GroupObsUtil: + @staticmethod + def get_name_at(index: int) -> AgentBufferKey: + """ + returns the name of the observation given the index of the observation + """ + return ObservationKeyPrefix.GROUP_OBSERVATION, index + + @staticmethod + def get_name_at_next(index: int) -> AgentBufferKey: + """ + returns the name of the next team observation given the index of the observation + """ + return ObservationKeyPrefix.NEXT_GROUP_OBSERVATION, index + + @staticmethod + def _transpose_list_of_lists( + list_list: List[List[np.ndarray]], + ) -> List[List[np.ndarray]]: + return list(map(list, zip(*list_list))) + + @staticmethod + def from_buffer(batch: AgentBuffer, num_obs: int) -> List[np.array]: + """ + Creates the list of observations from an AgentBuffer + """ + separated_obs: List[np.array] = [] + for i in range(num_obs): + separated_obs.append( + batch[GroupObsUtil.get_name_at(i)].padded_to_batch(pad_value=np.nan) + ) + # separated_obs contains a List(num_obs) of Lists(num_agents), we want to flip + # that and get a List(num_agents) of Lists(num_obs) + result = GroupObsUtil._transpose_list_of_lists(separated_obs) + return result + + @staticmethod + def from_buffer_next(batch: AgentBuffer, num_obs: int) -> List[np.array]: + """ + Creates the list of observations from an AgentBuffer + """ + separated_obs: List[np.array] = [] + for i in range(num_obs): + separated_obs.append( + batch[GroupObsUtil.get_name_at_next(i)].padded_to_batch( + pad_value=np.nan + ) + ) + # separated_obs contains a List(num_obs) of Lists(num_agents), we want to flip + # that and get a List(num_agents) of Lists(num_obs) + result = GroupObsUtil._transpose_list_of_lists(separated_obs) + return result + + +class Trajectory(NamedTuple): + steps: List[AgentExperience] + next_obs: List[ + np.ndarray + ] # Observation following the trajectory, for bootstrapping + next_group_obs: List[List[np.ndarray]] + agent_id: str + behavior_id: str + + def to_agentbuffer(self) -> AgentBuffer: + """ + Converts a Trajectory to an AgentBuffer + :param trajectory: A Trajectory + :returns: AgentBuffer. Note that the length of the AgentBuffer will be one + less than the trajectory, as the next observation need to be populated from the last + step of the trajectory. 
+ """ + agent_buffer_trajectory = AgentBuffer() + obs = self.steps[0].obs + for step, exp in enumerate(self.steps): + is_last_step = step == len(self.steps) - 1 + if not is_last_step: + next_obs = self.steps[step + 1].obs + else: + next_obs = self.next_obs + + num_obs = len(obs) + for i in range(num_obs): + agent_buffer_trajectory[ObsUtil.get_name_at(i)].append(obs[i]) + agent_buffer_trajectory[ObsUtil.get_name_at_next(i)].append(next_obs[i]) + + # Take care of teammate obs and actions + teammate_continuous_actions, teammate_discrete_actions, teammate_rewards = ( + [], + [], + [], + ) + for group_status in exp.group_status: + teammate_rewards.append(group_status.reward) + teammate_continuous_actions.append(group_status.action.continuous) + teammate_discrete_actions.append(group_status.action.discrete) + + # Team actions + agent_buffer_trajectory[BufferKey.GROUP_CONTINUOUS_ACTION].append( + teammate_continuous_actions + ) + agent_buffer_trajectory[BufferKey.GROUP_DISCRETE_ACTION].append( + teammate_discrete_actions + ) + agent_buffer_trajectory[BufferKey.GROUPMATE_REWARDS].append( + teammate_rewards + ) + agent_buffer_trajectory[BufferKey.GROUP_REWARD].append(exp.group_reward) + + # Next actions + teammate_cont_next_actions = [] + teammate_disc_next_actions = [] + if not is_last_step: + next_exp = self.steps[step + 1] + for group_status in next_exp.group_status: + teammate_cont_next_actions.append(group_status.action.continuous) + teammate_disc_next_actions.append(group_status.action.discrete) + else: + for group_status in exp.group_status: + teammate_cont_next_actions.append(group_status.action.continuous) + teammate_disc_next_actions.append(group_status.action.discrete) + + agent_buffer_trajectory[BufferKey.GROUP_NEXT_CONT_ACTION].append( + teammate_cont_next_actions + ) + agent_buffer_trajectory[BufferKey.GROUP_NEXT_DISC_ACTION].append( + teammate_disc_next_actions + ) + + for i in range(num_obs): + ith_group_obs = [] + for _group_status in exp.group_status: + # Assume teammates have same obs space + ith_group_obs.append(_group_status.obs[i]) + agent_buffer_trajectory[GroupObsUtil.get_name_at(i)].append( + ith_group_obs + ) + + ith_group_obs_next = [] + if is_last_step: + for _obs in self.next_group_obs: + ith_group_obs_next.append(_obs[i]) + else: + next_group_status = self.steps[step + 1].group_status + for _group_status in next_group_status: + # Assume teammates have same obs space + ith_group_obs_next.append(_group_status.obs[i]) + agent_buffer_trajectory[GroupObsUtil.get_name_at_next(i)].append( + ith_group_obs_next + ) + + if exp.memory is not None: + agent_buffer_trajectory[BufferKey.MEMORY].append(exp.memory) + + agent_buffer_trajectory[BufferKey.MASKS].append(1.0) + agent_buffer_trajectory[BufferKey.DONE].append(exp.done) + agent_buffer_trajectory[BufferKey.GROUP_DONES].append( + [_status.done for _status in exp.group_status] + ) + + # Adds the log prob and action of continuous/discrete separately + agent_buffer_trajectory[BufferKey.CONTINUOUS_ACTION].append( + exp.action.continuous + ) + agent_buffer_trajectory[BufferKey.DISCRETE_ACTION].append( + exp.action.discrete + ) + + if not is_last_step: + next_action = self.steps[step + 1].action + cont_next_actions = next_action.continuous + disc_next_actions = next_action.discrete + else: + cont_next_actions = np.zeros_like(exp.action.continuous) + disc_next_actions = np.zeros_like(exp.action.discrete) + + agent_buffer_trajectory[BufferKey.NEXT_CONT_ACTION].append( + cont_next_actions + ) + 
agent_buffer_trajectory[BufferKey.NEXT_DISC_ACTION].append( + disc_next_actions + ) + + agent_buffer_trajectory[BufferKey.CONTINUOUS_LOG_PROBS].append( + exp.action_probs.continuous + ) + agent_buffer_trajectory[BufferKey.DISCRETE_LOG_PROBS].append( + exp.action_probs.discrete + ) + + # Store action masks if necessary. Note that 1 means active, while + # in AgentExperience False means active. + if exp.action_mask is not None: + mask = 1 - np.concatenate(exp.action_mask) + agent_buffer_trajectory[BufferKey.ACTION_MASK].append( + mask, padding_value=1 + ) + else: + # This should never be needed unless the environment somehow doesn't supply the + # action mask in a discrete space. + + action_shape = exp.action.discrete.shape + agent_buffer_trajectory[BufferKey.ACTION_MASK].append( + np.ones(action_shape, dtype=np.float32), padding_value=1 + ) + agent_buffer_trajectory[BufferKey.PREV_ACTION].append(exp.prev_action) + agent_buffer_trajectory[BufferKey.ENVIRONMENT_REWARDS].append(exp.reward) + + # Store the next visual obs as the current + obs = next_obs + return agent_buffer_trajectory + + @property + def done_reached(self) -> bool: + """ + Returns true if trajectory is terminated with a Done. + """ + return self.steps[-1].done + + @property + def all_group_dones_reached(self) -> bool: + """ + Returns true if all other agents in this trajectory are done at the end of the trajectory. + Combine with done_reached to check if the whole team is done. + """ + return all(_status.done for _status in self.steps[-1].group_status) + + @property + def interrupted(self) -> bool: + """ + Returns true if trajectory was terminated because max steps was reached. + """ + return self.steps[-1].interrupted diff --git a/MLPY/Lib/site-packages/mlagents/trainers/upgrade_config.py b/MLPY/Lib/site-packages/mlagents/trainers/upgrade_config.py new file mode 100644 index 0000000000000000000000000000000000000000..d07ce0016d11076a8dad6d8ea206248ff1bacc4b --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/upgrade_config.py @@ -0,0 +1,251 @@ +# NOTE: This upgrade script is a temporary measure for the transition between the old-format +# configuration file and the new format. It will be marked for deprecation once the +# Python CLI and configuration files are finalized, and removed the following release. + +import attr +import cattr +import yaml +from typing import Dict, Any, Optional +import argparse +from mlagents.trainers.settings import TrainerSettings, NetworkSettings +from mlagents.trainers.cli_utils import load_config +from mlagents.trainers.exception import TrainerConfigError +from mlagents.plugins import all_trainer_settings + + +# Take an existing trainer config (e.g. trainer_config.yaml) and turn it into the new format. +def convert_behaviors(old_trainer_config: Dict[str, Any]) -> Dict[str, Any]: + all_behavior_config_dict = {} + default_config = old_trainer_config.get("default", {}) + for behavior_name, config in old_trainer_config.items(): + if behavior_name != "default": + config = default_config.copy() + config.update(old_trainer_config[behavior_name]) + + # Convert to split TrainerSettings, Hyperparameters, NetworkSettings + # Set trainer_type and get appropriate hyperparameter settings + try: + trainer_type = config["trainer"] + except KeyError: + raise TrainerConfigError( + "Config doesn't specify a trainer type. " + "Please specify trainer: in your config." 
+ ) + new_config = {} + new_config["trainer_type"] = trainer_type + hyperparam_cls = all_trainer_settings[trainer_type] + # Try to absorb as much as possible into the hyperparam_cls + new_config["hyperparameters"] = cattr.structure(config, hyperparam_cls) + + # Try to absorb as much as possible into the network settings + new_config["network_settings"] = cattr.structure(config, NetworkSettings) + # Deal with recurrent + try: + if config["use_recurrent"]: + new_config[ + "network_settings" + ].memory = NetworkSettings.MemorySettings( + sequence_length=config["sequence_length"], + memory_size=config["memory_size"], + ) + except KeyError: + raise TrainerConfigError( + "Config doesn't specify use_recurrent. " + "Please specify true or false for use_recurrent in your config." + ) + # Absorb the rest into the base TrainerSettings + for key, val in config.items(): + if key in attr.fields_dict(TrainerSettings): + new_config[key] = val + + # Structure the whole thing + all_behavior_config_dict[behavior_name] = cattr.structure( + new_config, TrainerSettings + ) + return all_behavior_config_dict + + +def write_to_yaml_file(unstructed_config: Dict[str, Any], output_config: str) -> None: + with open(output_config, "w") as f: + try: + yaml.dump(unstructed_config, f, sort_keys=False) + except TypeError: # Older versions of pyyaml don't support sort_keys + yaml.dump(unstructed_config, f) + + +def remove_nones(config: Dict[Any, Any]) -> Dict[str, Any]: + new_config = {} + for key, val in config.items(): + if isinstance(val, dict): + new_config[key] = remove_nones(val) + elif val is not None: + new_config[key] = val + return new_config + + +# Take a sampler from the old format and convert to new sampler structure +def convert_samplers(old_sampler_config: Dict[str, Any]) -> Dict[str, Any]: + new_sampler_config: Dict[str, Any] = {} + for parameter, parameter_config in old_sampler_config.items(): + if parameter == "resampling-interval": + print( + "resampling-interval is no longer necessary for parameter randomization and is being ignored." + ) + continue + new_sampler_config[parameter] = {} + new_sampler_config[parameter]["sampler_type"] = parameter_config["sampler-type"] + new_samp_parameters = dict(parameter_config) # Copy dict + new_samp_parameters.pop("sampler-type") + new_sampler_config[parameter]["sampler_parameters"] = new_samp_parameters + return new_sampler_config + + +def convert_samplers_and_curriculum( + parameter_dict: Dict[str, Any], curriculum: Dict[str, Any] +) -> Dict[str, Any]: + for key, sampler in parameter_dict.items(): + if "sampler_parameters" not in sampler: + parameter_dict[key]["sampler_parameters"] = {} + for argument in [ + "seed", + "min_value", + "max_value", + "mean", + "st_dev", + "intervals", + ]: + if argument in sampler: + parameter_dict[key]["sampler_parameters"][argument] = sampler[argument] + parameter_dict[key].pop(argument) + param_set = set(parameter_dict.keys()) + for behavior_name, behavior_dict in curriculum.items(): + measure = behavior_dict["measure"] + min_lesson_length = behavior_dict.get("min_lesson_length", 1) + signal_smoothing = behavior_dict.get("signal_smoothing", False) + thresholds = behavior_dict["thresholds"] + num_lessons = len(thresholds) + 1 + parameters = behavior_dict["parameters"] + for param_name in parameters.keys(): + if param_name in param_set: + print( + f"The parameter {param_name} has both a sampler and a curriculum. 
Will ignore curriculum" + ) + else: + param_set.add(param_name) + parameter_dict[param_name] = {"curriculum": []} + for lesson_index in range(num_lessons - 1): + parameter_dict[param_name]["curriculum"].append( + { + f"Lesson{lesson_index}": { + "completion_criteria": { + "measure": measure, + "behavior": behavior_name, + "signal_smoothing": signal_smoothing, + "min_lesson_length": min_lesson_length, + "threshold": thresholds[lesson_index], + }, + "value": parameters[param_name][lesson_index], + } + } + ) + lesson_index += 1 # This is the last lesson + parameter_dict[param_name]["curriculum"].append( + { + f"Lesson{lesson_index}": { + "value": parameters[param_name][lesson_index] + } + } + ) + return parameter_dict + + +def parse_args(): + argparser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + argparser.add_argument( + "trainer_config_path", + help="Path to old format (<=0.18.X) trainer configuration YAML.", + ) + argparser.add_argument( + "--curriculum", + help="Path to old format (<=0.16.X) curriculum configuration YAML.", + default=None, + ) + argparser.add_argument( + "--sampler", + help="Path to old format (<=0.16.X) parameter randomization configuration YAML.", + default=None, + ) + argparser.add_argument( + "output_config_path", help="Path to write converted YAML file." + ) + args = argparser.parse_args() + return args + + +def convert( + config: Dict[str, Any], + old_curriculum: Optional[Dict[str, Any]], + old_param_random: Optional[Dict[str, Any]], +) -> Dict[str, Any]: + if "behaviors" not in config: + print("Config file format version : version <= 0.16.X") + behavior_config_dict = convert_behaviors(config) + full_config = {"behaviors": behavior_config_dict} + + # Convert curriculum and sampler. note that we don't validate these; if it was correct + # before it should be correct now. + if old_curriculum is not None: + full_config["curriculum"] = old_curriculum + + if old_param_random is not None: + sampler_config_dict = convert_samplers(old_param_random) + full_config["parameter_randomization"] = sampler_config_dict + + # Convert config to dict + config = cattr.unstructure(full_config) + if "curriculum" in config or "parameter_randomization" in config: + print("Config file format version : 0.16.X < version <= 0.18.X") + full_config = {"behaviors": config["behaviors"]} + + param_randomization = config.get("parameter_randomization", {}) + if "resampling-interval" in param_randomization: + param_randomization.pop("resampling-interval") + if len(param_randomization) > 0: + # check if we use the old format sampler-type vs sampler_type + if ( + "sampler-type" + in param_randomization[list(param_randomization.keys())[0]] + ): + param_randomization = convert_samplers(param_randomization) + + full_config["environment_parameters"] = convert_samplers_and_curriculum( + param_randomization, config.get("curriculum", {}) + ) + + # Convert config to dict + config = cattr.unstructure(full_config) + return config + + +def main() -> None: + args = parse_args() + print( + f"Converting {args.trainer_config_path} and saving to {args.output_config_path}." 
+ ) + + old_config = load_config(args.trainer_config_path) + curriculum_config_dict = None + old_sampler_config_dict = None + if args.curriculum is not None: + curriculum_config_dict = load_config(args.curriculum) + if args.sampler is not None: + old_sampler_config_dict = load_config(args.sampler) + new_config = convert(old_config, curriculum_config_dict, old_sampler_config_dict) + unstructed_config = remove_nones(new_config) + write_to_yaml_file(unstructed_config, args.output_config_path) + + +if __name__ == "__main__": + main() diff --git a/MLPY/Lib/site-packages/mlagents_envs/__init__.py b/MLPY/Lib/site-packages/mlagents_envs/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8476df42fa19998794ea2bcbe8c61682f45f5df5 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/__init__.py @@ -0,0 +1,5 @@ +# Version of the library that will be used to upload to pypi +__version__ = "0.30.0" + +# Git tag that will be checked to determine whether to trigger upload to pypi +__release_tag__ = "release_20" diff --git a/MLPY/Lib/site-packages/mlagents_envs/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d414c60900f0bf5969f19f90bfafa2d42d6a25fd Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/__pycache__/base_env.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/__pycache__/base_env.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a661c51e6eafcad8c6a9f627d6dabf73e4d65010 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/__pycache__/base_env.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/__pycache__/communicator.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/__pycache__/communicator.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..58704ce2657ef36a367919582913a9c3497cd756 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/__pycache__/communicator.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/__pycache__/env_utils.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/__pycache__/env_utils.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e6fe8671cc9f90efa7b736f754e17e17cd8e5b34 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/__pycache__/env_utils.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/__pycache__/environment.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/__pycache__/environment.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..47a4ef70a910b67fdb2896104d3fe6b8ed10de45 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/__pycache__/environment.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/__pycache__/exception.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/__pycache__/exception.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d2635a0f569a3be6c49005d9a936279b674163ba Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/__pycache__/exception.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/__pycache__/logging_util.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/__pycache__/logging_util.cpython-39.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..8d5c448e5ff9a6aee53675e0da116ee27df94ec5 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/__pycache__/logging_util.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/__pycache__/mock_communicator.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/__pycache__/mock_communicator.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c735ea63d0e503ea575e30b347a019060f23d471 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/__pycache__/mock_communicator.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/__pycache__/rpc_communicator.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/__pycache__/rpc_communicator.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..837666d9f77e9434408e60c9137aeb344cef8fbf Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/__pycache__/rpc_communicator.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/__pycache__/rpc_utils.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/__pycache__/rpc_utils.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2822e620b37ea65d8ff251f34ec866facdece9c2 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/__pycache__/rpc_utils.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/__pycache__/timers.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/__pycache__/timers.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..92d5cebde9fae7c7add491d217b42ccc57b47b16 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/__pycache__/timers.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/base_env.py b/MLPY/Lib/site-packages/mlagents_envs/base_env.py new file mode 100644 index 0000000000000000000000000000000000000000..a993d8a7c509c34d1a2621195c840e4c9e14a099 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/base_env.py @@ -0,0 +1,616 @@ +""" +Python Environment API for the ML-Agents Toolkit +The aim of this API is to expose Agents evolving in a simulation +to perform reinforcement learning on. +This API supports multi-agent scenarios and groups similar Agents (same +observations, actions spaces and behavior) together. These groups of Agents are +identified by their BehaviorName. +For performance reasons, the data of each group of agents is processed in a +batched manner. Agents are identified by a unique AgentId identifier that +allows tracking of Agents across simulation steps. Note that there is no +guarantee that the number or order of the Agents in the state will be +consistent across simulation steps. +A simulation steps corresponds to moving the simulation forward until at least +one agent in the simulation sends its observations to Python again. Since +Agents can request decisions at different frequencies, a simulation step does +not necessarily correspond to a fixed simulation time increment. +""" + +from abc import ABC, abstractmethod +from collections.abc import Mapping +from typing import ( + List, + NamedTuple, + Tuple, + Optional, + Dict, + Iterator, + Any, + Mapping as MappingType, +) +from enum import IntFlag, Enum +import numpy as np + +from mlagents_envs.exception import UnityActionException + +AgentId = int +GroupId = int +BehaviorName = str + + +class DecisionStep(NamedTuple): + """ + Contains the data a single Agent collected since the last + simulation step. 
+ - obs is a list of numpy arrays observations collected by the agent. + - reward is a float. Corresponds to the rewards collected by the agent + since the last simulation step. + - agent_id is an int and an unique identifier for the corresponding Agent. + - action_mask is an optional list of one dimensional array of booleans. + Only available when using multi-discrete actions. + Each array corresponds to an action branch. Each array contains a mask + for each action of the branch. If true, the action is not available for + the agent during this simulation step. + """ + + obs: List[np.ndarray] + reward: float + agent_id: AgentId + action_mask: Optional[List[np.ndarray]] + group_id: int + group_reward: float + + +class DecisionSteps(Mapping): + """ + Contains the data a batch of similar Agents collected since the last + simulation step. Note that all Agents do not necessarily have new + information to send at each simulation step. Therefore, the ordering of + agents and the batch size of the DecisionSteps are not fixed across + simulation steps. + - obs is a list of numpy arrays observations collected by the batch of + agent. Each obs has one extra dimension compared to DecisionStep: the + first dimension of the array corresponds to the batch size of the batch. + - reward is a float vector of length batch size. Corresponds to the + rewards collected by each agent since the last simulation step. + - agent_id is an int vector of length batch size containing unique + identifier for the corresponding Agent. This is used to track Agents + across simulation steps. + - action_mask is an optional list of two dimensional array of booleans. + Only available when using multi-discrete actions. + Each array corresponds to an action branch. The first dimension of each + array is the batch size and the second contains a mask for each action of + the branch. If true, the action is not available for the agent during + this simulation step. + """ + + def __init__(self, obs, reward, agent_id, action_mask, group_id, group_reward): + self.obs: List[np.ndarray] = obs + self.reward: np.ndarray = reward + self.agent_id: np.ndarray = agent_id + self.action_mask: Optional[List[np.ndarray]] = action_mask + self.group_id: np.ndarray = group_id + self.group_reward: np.ndarray = group_reward + self._agent_id_to_index: Optional[Dict[AgentId, int]] = None + + @property + def agent_id_to_index(self) -> Dict[AgentId, int]: + """ + :returns: A Dict that maps agent_id to the index of those agents in + this DecisionSteps. + """ + if self._agent_id_to_index is None: + self._agent_id_to_index = {} + for a_idx, a_id in enumerate(self.agent_id): + self._agent_id_to_index[a_id] = a_idx + return self._agent_id_to_index + + def __len__(self) -> int: + return len(self.agent_id) + + def __getitem__(self, agent_id: AgentId) -> DecisionStep: + """ + returns the DecisionStep for a specific agent. 
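+
+        Illustrative sketch; ``decision_steps`` is assumed to be a non-empty
+        DecisionSteps returned by ``BaseEnv.get_steps``. Lookup is keyed by
+        AgentId, not by positional index::
+
+            first_id = decision_steps.agent_id[0]
+            single_step = decision_steps[first_id]
+            print(single_step.reward, single_step.group_id)
+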
+ :param agent_id: The id of the agent + :returns: The DecisionStep + """ + if agent_id not in self.agent_id_to_index: + raise KeyError(f"agent_id {agent_id} is not present in the DecisionSteps") + agent_index = self._agent_id_to_index[agent_id] # type: ignore + agent_obs = [] + for batched_obs in self.obs: + agent_obs.append(batched_obs[agent_index]) + agent_mask = None + if self.action_mask is not None: + agent_mask = [] + for mask in self.action_mask: + agent_mask.append(mask[agent_index]) + group_id = self.group_id[agent_index] + return DecisionStep( + obs=agent_obs, + reward=self.reward[agent_index], + agent_id=agent_id, + action_mask=agent_mask, + group_id=group_id, + group_reward=self.group_reward[agent_index], + ) + + def __iter__(self) -> Iterator[Any]: + yield from self.agent_id + + @staticmethod + def empty(spec: "BehaviorSpec") -> "DecisionSteps": + """ + Returns an empty DecisionSteps. + :param spec: The BehaviorSpec for the DecisionSteps + """ + obs: List[np.ndarray] = [] + for sen_spec in spec.observation_specs: + obs += [np.zeros((0,) + sen_spec.shape, dtype=np.float32)] + return DecisionSteps( + obs=obs, + reward=np.zeros(0, dtype=np.float32), + agent_id=np.zeros(0, dtype=np.int32), + action_mask=None, + group_id=np.zeros(0, dtype=np.int32), + group_reward=np.zeros(0, dtype=np.float32), + ) + + +class TerminalStep(NamedTuple): + """ + Contains the data a single Agent collected when its episode ended. + - obs is a list of numpy arrays observations collected by the agent. + - reward is a float. Corresponds to the rewards collected by the agent + since the last simulation step. + - interrupted is a bool. Is true if the Agent was interrupted since the last + decision step. For example, if the Agent reached the maximum number of steps for + the episode. + - agent_id is an int and an unique identifier for the corresponding Agent. + """ + + obs: List[np.ndarray] + reward: float + interrupted: bool + agent_id: AgentId + group_id: GroupId + group_reward: float + + +class TerminalSteps(Mapping): + """ + Contains the data a batch of Agents collected when their episode + terminated. All Agents present in the TerminalSteps have ended their + episode. + - obs is a list of numpy arrays observations collected by the batch of + agent. Each obs has one extra dimension compared to DecisionStep: the + first dimension of the array corresponds to the batch size of the batch. + - reward is a float vector of length batch size. Corresponds to the + rewards collected by each agent since the last simulation step. + - interrupted is an array of booleans of length batch size. Is true if the + associated Agent was interrupted since the last decision step. For example, if the + Agent reached the maximum number of steps for the episode. + - agent_id is an int vector of length batch size containing unique + identifier for the corresponding Agent. This is used to track Agents + across simulation steps. + """ + + def __init__(self, obs, reward, interrupted, agent_id, group_id, group_reward): + self.obs: List[np.ndarray] = obs + self.reward: np.ndarray = reward + self.interrupted: np.ndarray = interrupted + self.agent_id: np.ndarray = agent_id + self.group_id: np.ndarray = group_id + self.group_reward: np.ndarray = group_reward + self._agent_id_to_index: Optional[Dict[AgentId, int]] = None + + @property + def agent_id_to_index(self) -> Dict[AgentId, int]: + """ + :returns: A Dict that maps agent_id to the index of those agents in + this TerminalSteps. 
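+
+        Illustrative sketch; ``terminal_steps`` and ``some_agent_id`` are assumed
+        placeholders for a populated TerminalSteps and an agent id it contains::
+
+            row = terminal_steps.agent_id_to_index[some_agent_id]
+            final_reward = terminal_steps.reward[row]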
+ """ + if self._agent_id_to_index is None: + self._agent_id_to_index = {} + for a_idx, a_id in enumerate(self.agent_id): + self._agent_id_to_index[a_id] = a_idx + return self._agent_id_to_index + + def __len__(self) -> int: + return len(self.agent_id) + + def __getitem__(self, agent_id: AgentId) -> TerminalStep: + """ + returns the TerminalStep for a specific agent. + :param agent_id: The id of the agent + :returns: obs, reward, done, agent_id and optional action mask for a + specific agent + """ + if agent_id not in self.agent_id_to_index: + raise KeyError(f"agent_id {agent_id} is not present in the TerminalSteps") + agent_index = self._agent_id_to_index[agent_id] # type: ignore + agent_obs = [] + for batched_obs in self.obs: + agent_obs.append(batched_obs[agent_index]) + group_id = self.group_id[agent_index] + return TerminalStep( + obs=agent_obs, + reward=self.reward[agent_index], + interrupted=self.interrupted[agent_index], + agent_id=agent_id, + group_id=group_id, + group_reward=self.group_reward[agent_index], + ) + + def __iter__(self) -> Iterator[Any]: + yield from self.agent_id + + @staticmethod + def empty(spec: "BehaviorSpec") -> "TerminalSteps": + """ + Returns an empty TerminalSteps. + :param spec: The BehaviorSpec for the TerminalSteps + """ + obs: List[np.ndarray] = [] + for sen_spec in spec.observation_specs: + obs += [np.zeros((0,) + sen_spec.shape, dtype=np.float32)] + return TerminalSteps( + obs=obs, + reward=np.zeros(0, dtype=np.float32), + interrupted=np.zeros(0, dtype=bool), + agent_id=np.zeros(0, dtype=np.int32), + group_id=np.zeros(0, dtype=np.int32), + group_reward=np.zeros(0, dtype=np.float32), + ) + + +class _ActionTupleBase(ABC): + """ + An object whose fields correspond to action data of continuous and discrete + spaces. Dimensions are of (n_agents, continuous_size) and (n_agents, discrete_size), + respectively. Note, this also holds when continuous or discrete size is + zero. + """ + + def __init__( + self, + continuous: Optional[np.ndarray] = None, + discrete: Optional[np.ndarray] = None, + ): + self._continuous: Optional[np.ndarray] = None + self._discrete: Optional[np.ndarray] = None + if continuous is not None: + self.add_continuous(continuous) + if discrete is not None: + self.add_discrete(discrete) + + @property + def continuous(self) -> np.ndarray: + return self._continuous + + @property + def discrete(self) -> np.ndarray: + return self._discrete + + def add_continuous(self, continuous: np.ndarray) -> None: + if continuous.dtype != np.float32: + continuous = continuous.astype(np.float32, copy=False) + if self._discrete is None: + self._discrete = np.zeros( + (continuous.shape[0], 0), dtype=self.discrete_dtype + ) + self._continuous = continuous + + def add_discrete(self, discrete: np.ndarray) -> None: + if discrete.dtype != self.discrete_dtype: + discrete = discrete.astype(self.discrete_dtype, copy=False) + if self._continuous is None: + self._continuous = np.zeros((discrete.shape[0], 0), dtype=np.float32) + self._discrete = discrete + + @property + @abstractmethod + def discrete_dtype(self) -> np.dtype: + pass + + +class ActionTuple(_ActionTupleBase): + """ + An object whose fields correspond to actions of different types. + Continuous and discrete actions are numpy arrays of type float32 and + int32, respectively and are type checked on construction. + Dimensions are of (n_agents, continuous_size) and (n_agents, discrete_size), + respectively. Note, this also holds when continuous or discrete size is + zero. 
+ """ + + @property + def discrete_dtype(self) -> np.dtype: + """ + The dtype of a discrete action. + """ + return np.int32 + + +class ActionSpec(NamedTuple): + """ + A NamedTuple containing utility functions and information about the action spaces + for a group of Agents under the same behavior. + - num_continuous_actions is an int corresponding to the number of floats which + constitute the action. + - discrete_branch_sizes is a Tuple of int where each int corresponds to + the number of discrete actions available to the agent on an independent action branch. + """ + + continuous_size: int + discrete_branches: Tuple[int, ...] + + def __eq__(self, other): + return ( + self.continuous_size == other.continuous_size + and self.discrete_branches == other.discrete_branches + ) + + def __str__(self): + return f"Continuous: {self.continuous_size}, Discrete: {self.discrete_branches}" + + # For backwards compatibility + def is_discrete(self) -> bool: + """ + Returns true if this Behavior uses discrete actions + """ + return self.discrete_size > 0 and self.continuous_size == 0 + + # For backwards compatibility + def is_continuous(self) -> bool: + """ + Returns true if this Behavior uses continuous actions + """ + return self.discrete_size == 0 and self.continuous_size > 0 + + @property + def discrete_size(self) -> int: + """ + Returns a an int corresponding to the number of discrete branches. + """ + return len(self.discrete_branches) + + def empty_action(self, n_agents: int) -> ActionTuple: + """ + Generates ActionTuple corresponding to an empty action (all zeros) + for a number of agents. + :param n_agents: The number of agents that will have actions generated + """ + _continuous = np.zeros((n_agents, self.continuous_size), dtype=np.float32) + _discrete = np.zeros((n_agents, self.discrete_size), dtype=np.int32) + return ActionTuple(continuous=_continuous, discrete=_discrete) + + def random_action(self, n_agents: int) -> ActionTuple: + """ + Generates ActionTuple corresponding to a random action (either discrete + or continuous) for a number of agents. + :param n_agents: The number of agents that will have actions generated + """ + _continuous = np.random.uniform( + low=-1.0, high=1.0, size=(n_agents, self.continuous_size) + ) + _discrete = np.zeros((n_agents, self.discrete_size), dtype=np.int32) + if self.discrete_size > 0: + _discrete = np.column_stack( + [ + np.random.randint( + 0, + self.discrete_branches[i], # type: ignore + size=(n_agents), + dtype=np.int32, + ) + for i in range(self.discrete_size) + ] + ) + return ActionTuple(continuous=_continuous, discrete=_discrete) + + def _validate_action( + self, actions: ActionTuple, n_agents: int, name: str + ) -> ActionTuple: + """ + Validates that action has the correct action dim + for the correct number of agents and ensures the type. 
+ """ + _expected_shape = (n_agents, self.continuous_size) + if actions.continuous.shape != _expected_shape: + raise UnityActionException( + f"The behavior {name} needs a continuous input of dimension " + f"{_expected_shape} for (, ) but " + f"received input of dimension {actions.continuous.shape}" + ) + _expected_shape = (n_agents, self.discrete_size) + if actions.discrete.shape != _expected_shape: + raise UnityActionException( + f"The behavior {name} needs a discrete input of dimension " + f"{_expected_shape} for (, ) but " + f"received input of dimension {actions.discrete.shape}" + ) + return actions + + @staticmethod + def create_continuous(continuous_size: int) -> "ActionSpec": + """ + Creates an ActionSpec that is homogenously continuous + """ + return ActionSpec(continuous_size, ()) + + @staticmethod + def create_discrete(discrete_branches: Tuple[int]) -> "ActionSpec": + """ + Creates an ActionSpec that is homogenously discrete + """ + return ActionSpec(0, discrete_branches) + + +class DimensionProperty(IntFlag): + """ + The dimension property of a dimension of an observation. + """ + + UNSPECIFIED = 0 + """ + No properties specified. + """ + + NONE = 1 + """ + No Property of the observation in that dimension. Observation can be processed with + Fully connected networks. + """ + + TRANSLATIONAL_EQUIVARIANCE = 2 + """ + Means it is suitable to do a convolution in this dimension. + """ + + VARIABLE_SIZE = 4 + """ + Means that there can be a variable number of observations in this dimension. + The observations are unordered. + """ + + +class ObservationType(Enum): + """ + An Enum which defines the type of information carried in the observation + of the agent. + """ + + DEFAULT = 0 + """ + Observation information is generic. + """ + + GOAL_SIGNAL = 1 + """ + Observation contains goal information for current task. + """ + + +class ObservationSpec(NamedTuple): + """ + A NamedTuple containing information about the observation of Agents. + - shape is a Tuple of int : It corresponds to the shape of + an observation's dimensions. + - dimension_property is a Tuple of DimensionProperties flag, one flag for each + dimension. + - observation_type is an enum of ObservationType. + """ + + shape: Tuple[int, ...] + dimension_property: Tuple[DimensionProperty, ...] + observation_type: ObservationType + + # Optional name. For observations coming from com.unity.ml-agents, this + # will be the ISensor name. + name: str + + +class BehaviorSpec(NamedTuple): + """ + A NamedTuple containing information about the observation and action + spaces for a group of Agents under the same behavior. + - observation_specs is a List of ObservationSpec NamedTuple containing + information about the information of the Agent's observations such as their shapes. + The order of the ObservationSpec is the same as the order of the observations of an + agent. + - action_spec is an ActionSpec NamedTuple. + """ + + observation_specs: List[ObservationSpec] + action_spec: ActionSpec + + +class BehaviorMapping(Mapping): + def __init__(self, specs: Dict[BehaviorName, BehaviorSpec]): + self._dict = specs + + def __len__(self) -> int: + return len(self._dict) + + def __getitem__(self, behavior: BehaviorName) -> BehaviorSpec: + return self._dict[behavior] + + def __iter__(self) -> Iterator[Any]: + yield from self._dict + + +class BaseEnv(ABC): + @abstractmethod + def step(self) -> None: + """ + Signals the environment that it must move the simulation forward + by one step. 
+ """ + + @abstractmethod + def reset(self) -> None: + """ + Signals the environment that it must reset the simulation. + """ + + @abstractmethod + def close(self) -> None: + """ + Signals the environment that it must close. + """ + + @property + @abstractmethod + def behavior_specs(self) -> MappingType[str, BehaviorSpec]: + """ + Returns a Mapping from behavior names to behavior specs. + Agents grouped under the same behavior name have the same action and + observation specs, and are expected to behave similarly in the + environment. + Note that new keys can be added to this mapping as new policies are instantiated. + """ + + @abstractmethod + def set_actions(self, behavior_name: BehaviorName, action: ActionTuple) -> None: + """ + Sets the action for all of the agents in the simulation for the next + step. The Actions must be in the same order as the order received in + the DecisionSteps. + :param behavior_name: The name of the behavior the agents are part of + :param action: ActionTuple tuple of continuous and/or discrete action. + Actions are np.arrays with dimensions (n_agents, continuous_size) and + (n_agents, discrete_size), respectively. + """ + + @abstractmethod + def set_action_for_agent( + self, behavior_name: BehaviorName, agent_id: AgentId, action: ActionTuple + ) -> None: + """ + Sets the action for one of the agents in the simulation for the next + step. + :param behavior_name: The name of the behavior the agent is part of + :param agent_id: The id of the agent the action is set for + :param action: ActionTuple tuple of continuous and/or discrete action + Actions are np.arrays with dimensions (1, continuous_size) and + (1, discrete_size), respectively. Note, this initial dimensions of 1 is because + this action is meant for a single agent. + """ + + @abstractmethod + def get_steps( + self, behavior_name: BehaviorName + ) -> Tuple[DecisionSteps, TerminalSteps]: + """ + Retrieves the steps of the agents that requested a step in the + simulation. + :param behavior_name: The name of the behavior the agents are part of + :return: A tuple containing : + - A DecisionSteps NamedTuple containing the observations, + the rewards, the agent ids and the action masks for the Agents + of the specified behavior. These Agents need an action this step. + - A TerminalSteps NamedTuple containing the observations, + rewards, agent ids and interrupted flags of the agents that had their + episode terminated last step. + """ diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator.py b/MLPY/Lib/site-packages/mlagents_envs/communicator.py new file mode 100644 index 0000000000000000000000000000000000000000..2223f34d3a4b4177b69867125b3e449be76f920d --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/communicator.py @@ -0,0 +1,43 @@ +from typing import Callable, Optional +from mlagents_envs.communicator_objects.unity_output_pb2 import UnityOutputProto +from mlagents_envs.communicator_objects.unity_input_pb2 import UnityInputProto + + +# Function to call while waiting for a connection timeout. +# This should raise an exception if it needs to break from waiting for the timeout. +PollCallback = Callable[[], None] + + +class Communicator: + def __init__(self, worker_id=0, base_port=5005): + """ + Python side of the communication. Must be used in pair with the right Unity Communicator equivalent. + + :int worker_id: Offset from base_port. Used for training multiple environments simultaneously. + :int base_port: Baseline port number to connect to Unity environment over. 
worker_id increments over this. + """ + + def initialize( + self, inputs: UnityInputProto, poll_callback: Optional[PollCallback] = None + ) -> UnityOutputProto: + """ + Used to exchange initialization parameters between Python and the Environment + :param inputs: The initialization input that will be sent to the environment. + :param poll_callback: Optional callback to be used while polling the connection. + :return: UnityOutput: The initialization output sent by Unity + """ + + def exchange( + self, inputs: UnityInputProto, poll_callback: Optional[PollCallback] = None + ) -> Optional[UnityOutputProto]: + """ + Used to send an input and receive an output from the Environment + :param inputs: The UnityInput that needs to be sent the Environment + :param poll_callback: Optional callback to be used while polling the connection. + :return: The UnityOutputs generated by the Environment + """ + + def close(self): + """ + Sends a shutdown signal to the unity environment, and closes the connection. + """ diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__init__.py b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e7038764d34f752991ce2d03ee964f55c9e90680 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/agent_action_pb2.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/agent_action_pb2.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..24add860faa1a29adf6e63c67a5264983f74bec2 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/agent_action_pb2.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/agent_info_action_pair_pb2.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/agent_info_action_pair_pb2.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..17af007ee4fcb18e37c22dd50e7d467c9d8d74ff Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/agent_info_action_pair_pb2.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/agent_info_pb2.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/agent_info_pb2.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4040c697d8c86f47a3fdeb35577de18d297bf591 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/agent_info_pb2.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/brain_parameters_pb2.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/brain_parameters_pb2.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..311ca4cd58ecbf93064cc036bb379f3443a24a16 Binary files /dev/null and 
b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/brain_parameters_pb2.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/capabilities_pb2.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/capabilities_pb2.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ed1b8882830eeae229036ec3cffc9637f7ea9ec7 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/capabilities_pb2.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/command_pb2.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/command_pb2.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9e95b23c2bf79e6c9bd8120348d2d28617832f94 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/command_pb2.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/custom_reset_parameters_pb2.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/custom_reset_parameters_pb2.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8f22821ca4a4014f62b0c3cca3baa4146f46bf61 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/custom_reset_parameters_pb2.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/demonstration_meta_pb2.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/demonstration_meta_pb2.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..83fd188b2d713866d5e8249bea43050b1065d3b0 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/demonstration_meta_pb2.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/engine_configuration_pb2.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/engine_configuration_pb2.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b512a56d7d63e3e6fb8332354c021fce6e4c7b7f Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/engine_configuration_pb2.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/header_pb2.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/header_pb2.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cb8038479bca969620525ed4da3e1ca6bd786af8 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/header_pb2.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/observation_pb2.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/observation_pb2.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fb68da15d1d4c4c4b90c3acbf4017e3624dc0ffc Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/observation_pb2.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/space_type_pb2.cpython-39.pyc 
b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/space_type_pb2.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7aa02e6acb319065909c57da144e84e0c0b02479 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/space_type_pb2.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/training_analytics_pb2.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/training_analytics_pb2.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a9a4879449b759e061e6545b8441939a6159fb85 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/training_analytics_pb2.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/unity_input_pb2.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/unity_input_pb2.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5409679a8464be7f18eb514c900e5cca489df03a Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/unity_input_pb2.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/unity_message_pb2.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/unity_message_pb2.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3b381db243df8c117b6cd4f8d7600aa66f0e7ec9 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/unity_message_pb2.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/unity_output_pb2.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/unity_output_pb2.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..88a2cb04996da7eeaa0c1288ab63fb1c70b2dfb2 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/unity_output_pb2.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/unity_rl_initialization_input_pb2.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/unity_rl_initialization_input_pb2.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..41ce1337547d8db9c636e5b5516059a3510d9953 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/unity_rl_initialization_input_pb2.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/unity_rl_initialization_output_pb2.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/unity_rl_initialization_output_pb2.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fb69cd615142586a4c979a38c9ecf7a1c494cad4 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/unity_rl_initialization_output_pb2.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/unity_rl_input_pb2.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/unity_rl_input_pb2.cpython-39.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..97a4657773a88ffeef3db1a13024eef35640d3cf Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/unity_rl_input_pb2.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/unity_rl_output_pb2.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/unity_rl_output_pb2.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3271602d1ee8044734d32768adf268a545d506af Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/unity_rl_output_pb2.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/unity_to_external_pb2.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/unity_to_external_pb2.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5a4b6ad090eecc3f169a795d36b7b9cacd113120 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/unity_to_external_pb2.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/unity_to_external_pb2_grpc.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/unity_to_external_pb2_grpc.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..17843953c51c87445026ca700680e44408aa584b Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/unity_to_external_pb2_grpc.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/agent_action_pb2.py b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/agent_action_pb2.py new file mode 100644 index 0000000000000000000000000000000000000000..7c57d8b4cb179db52449f32c638f841f4a2146a2 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/agent_action_pb2.py @@ -0,0 +1,92 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! 
+# source: mlagents_envs/communicator_objects/agent_action.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='mlagents_envs/communicator_objects/agent_action.proto', + package='communicator_objects', + syntax='proto3', + serialized_pb=_b('\n5mlagents_envs/communicator_objects/agent_action.proto\x12\x14\x63ommunicator_objects\"\x8c\x01\n\x10\x41gentActionProto\x12!\n\x19vector_actions_deprecated\x18\x01 \x03(\x02\x12\r\n\x05value\x18\x04 \x01(\x02\x12\x1a\n\x12\x63ontinuous_actions\x18\x06 \x03(\x02\x12\x18\n\x10\x64iscrete_actions\x18\x07 \x03(\x05J\x04\x08\x02\x10\x03J\x04\x08\x03\x10\x04J\x04\x08\x05\x10\x06\x42%\xaa\x02\"Unity.MLAgents.CommunicatorObjectsb\x06proto3') +) + + + + +_AGENTACTIONPROTO = _descriptor.Descriptor( + name='AgentActionProto', + full_name='communicator_objects.AgentActionProto', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='vector_actions_deprecated', full_name='communicator_objects.AgentActionProto.vector_actions_deprecated', index=0, + number=1, type=2, cpp_type=6, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='value', full_name='communicator_objects.AgentActionProto.value', index=1, + number=4, type=2, cpp_type=6, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='continuous_actions', full_name='communicator_objects.AgentActionProto.continuous_actions', index=2, + number=6, type=2, cpp_type=6, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='discrete_actions', full_name='communicator_objects.AgentActionProto.discrete_actions', index=3, + number=7, type=5, cpp_type=1, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=80, + serialized_end=220, +) + +DESCRIPTOR.message_types_by_name['AgentActionProto'] = _AGENTACTIONPROTO +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + +AgentActionProto = _reflection.GeneratedProtocolMessageType('AgentActionProto', (_message.Message,), dict( + DESCRIPTOR = _AGENTACTIONPROTO, + __module__ = 'mlagents_envs.communicator_objects.agent_action_pb2' + # @@protoc_insertion_point(class_scope:communicator_objects.AgentActionProto) + )) +_sym_db.RegisterMessage(AgentActionProto) + + +DESCRIPTOR.has_options = True +DESCRIPTOR._options = 
_descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\252\002\"Unity.MLAgents.CommunicatorObjects')) +# @@protoc_insertion_point(module_scope) diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/agent_info_action_pair_pb2.py b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/agent_info_action_pair_pb2.py new file mode 100644 index 0000000000000000000000000000000000000000..667b91a13dcc95d76f754920fa79c2e69a65a791 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/agent_info_action_pair_pb2.py @@ -0,0 +1,83 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! +# source: mlagents_envs/communicator_objects/agent_info_action_pair.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +from mlagents_envs.communicator_objects import agent_info_pb2 as mlagents__envs_dot_communicator__objects_dot_agent__info__pb2 +from mlagents_envs.communicator_objects import agent_action_pb2 as mlagents__envs_dot_communicator__objects_dot_agent__action__pb2 + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='mlagents_envs/communicator_objects/agent_info_action_pair.proto', + package='communicator_objects', + syntax='proto3', + serialized_pb=_b('\n?mlagents_envs/communicator_objects/agent_info_action_pair.proto\x12\x14\x63ommunicator_objects\x1a\x33mlagents_envs/communicator_objects/agent_info.proto\x1a\x35mlagents_envs/communicator_objects/agent_action.proto\"\x91\x01\n\x18\x41gentInfoActionPairProto\x12\x38\n\nagent_info\x18\x01 \x01(\x0b\x32$.communicator_objects.AgentInfoProto\x12;\n\x0b\x61\x63tion_info\x18\x02 \x01(\x0b\x32&.communicator_objects.AgentActionProtoB%\xaa\x02\"Unity.MLAgents.CommunicatorObjectsb\x06proto3') + , + dependencies=[mlagents__envs_dot_communicator__objects_dot_agent__info__pb2.DESCRIPTOR,mlagents__envs_dot_communicator__objects_dot_agent__action__pb2.DESCRIPTOR,]) + + + + +_AGENTINFOACTIONPAIRPROTO = _descriptor.Descriptor( + name='AgentInfoActionPairProto', + full_name='communicator_objects.AgentInfoActionPairProto', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='agent_info', full_name='communicator_objects.AgentInfoActionPairProto.agent_info', index=0, + number=1, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='action_info', full_name='communicator_objects.AgentInfoActionPairProto.action_info', index=1, + number=2, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=198, + serialized_end=343, +) + +_AGENTINFOACTIONPAIRPROTO.fields_by_name['agent_info'].message_type = 
mlagents__envs_dot_communicator__objects_dot_agent__info__pb2._AGENTINFOPROTO +_AGENTINFOACTIONPAIRPROTO.fields_by_name['action_info'].message_type = mlagents__envs_dot_communicator__objects_dot_agent__action__pb2._AGENTACTIONPROTO +DESCRIPTOR.message_types_by_name['AgentInfoActionPairProto'] = _AGENTINFOACTIONPAIRPROTO +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + +AgentInfoActionPairProto = _reflection.GeneratedProtocolMessageType('AgentInfoActionPairProto', (_message.Message,), dict( + DESCRIPTOR = _AGENTINFOACTIONPAIRPROTO, + __module__ = 'mlagents_envs.communicator_objects.agent_info_action_pair_pb2' + # @@protoc_insertion_point(class_scope:communicator_objects.AgentInfoActionPairProto) + )) +_sym_db.RegisterMessage(AgentInfoActionPairProto) + + +DESCRIPTOR.has_options = True +DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\252\002\"Unity.MLAgents.CommunicatorObjects')) +# @@protoc_insertion_point(module_scope) diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/agent_info_pb2.py b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/agent_info_pb2.py new file mode 100644 index 0000000000000000000000000000000000000000..57bb77aa575e0bfc5187b14cc3aec946046678e3 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/agent_info_pb2.py @@ -0,0 +1,123 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! +# source: mlagents_envs/communicator_objects/agent_info.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +from mlagents_envs.communicator_objects import observation_pb2 as mlagents__envs_dot_communicator__objects_dot_observation__pb2 + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='mlagents_envs/communicator_objects/agent_info.proto', + package='communicator_objects', + syntax='proto3', + serialized_pb=_b('\n3mlagents_envs/communicator_objects/agent_info.proto\x12\x14\x63ommunicator_objects\x1a\x34mlagents_envs/communicator_objects/observation.proto\"\xf9\x01\n\x0e\x41gentInfoProto\x12\x0e\n\x06reward\x18\x07 \x01(\x02\x12\x0c\n\x04\x64one\x18\x08 \x01(\x08\x12\x18\n\x10max_step_reached\x18\t \x01(\x08\x12\n\n\x02id\x18\n \x01(\x05\x12\x13\n\x0b\x61\x63tion_mask\x18\x0b \x03(\x08\x12<\n\x0cobservations\x18\r \x03(\x0b\x32&.communicator_objects.ObservationProto\x12\x10\n\x08group_id\x18\x0e \x01(\x05\x12\x14\n\x0cgroup_reward\x18\x0f \x01(\x02J\x04\x08\x01\x10\x02J\x04\x08\x02\x10\x03J\x04\x08\x03\x10\x04J\x04\x08\x04\x10\x05J\x04\x08\x05\x10\x06J\x04\x08\x06\x10\x07J\x04\x08\x0c\x10\rB%\xaa\x02\"Unity.MLAgents.CommunicatorObjectsb\x06proto3') + , + dependencies=[mlagents__envs_dot_communicator__objects_dot_observation__pb2.DESCRIPTOR,]) + + + + +_AGENTINFOPROTO = _descriptor.Descriptor( + name='AgentInfoProto', + full_name='communicator_objects.AgentInfoProto', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='reward', full_name='communicator_objects.AgentInfoProto.reward', index=0, + number=7, type=2, cpp_type=6, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + 
is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='done', full_name='communicator_objects.AgentInfoProto.done', index=1, + number=8, type=8, cpp_type=7, label=1, + has_default_value=False, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='max_step_reached', full_name='communicator_objects.AgentInfoProto.max_step_reached', index=2, + number=9, type=8, cpp_type=7, label=1, + has_default_value=False, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='id', full_name='communicator_objects.AgentInfoProto.id', index=3, + number=10, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='action_mask', full_name='communicator_objects.AgentInfoProto.action_mask', index=4, + number=11, type=8, cpp_type=7, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='observations', full_name='communicator_objects.AgentInfoProto.observations', index=5, + number=13, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='group_id', full_name='communicator_objects.AgentInfoProto.group_id', index=6, + number=14, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='group_reward', full_name='communicator_objects.AgentInfoProto.group_reward', index=7, + number=15, type=2, cpp_type=6, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=132, + serialized_end=381, +) + +_AGENTINFOPROTO.fields_by_name['observations'].message_type = mlagents__envs_dot_communicator__objects_dot_observation__pb2._OBSERVATIONPROTO +DESCRIPTOR.message_types_by_name['AgentInfoProto'] = _AGENTINFOPROTO +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + +AgentInfoProto = _reflection.GeneratedProtocolMessageType('AgentInfoProto', (_message.Message,), dict( + DESCRIPTOR = _AGENTINFOPROTO, + __module__ = 'mlagents_envs.communicator_objects.agent_info_pb2' + # @@protoc_insertion_point(class_scope:communicator_objects.AgentInfoProto) + )) +_sym_db.RegisterMessage(AgentInfoProto) + + +DESCRIPTOR.has_options = True +DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\252\002\"Unity.MLAgents.CommunicatorObjects')) +# @@protoc_insertion_point(module_scope) diff --git 
a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/brain_parameters_pb2.py b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/brain_parameters_pb2.py new file mode 100644 index 0000000000000000000000000000000000000000..ccd9140d3cc7b7f8ef9a4092480572fbd04d911e --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/brain_parameters_pb2.py @@ -0,0 +1,170 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! +# source: mlagents_envs/communicator_objects/brain_parameters.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +from mlagents_envs.communicator_objects import space_type_pb2 as mlagents__envs_dot_communicator__objects_dot_space__type__pb2 + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='mlagents_envs/communicator_objects/brain_parameters.proto', + package='communicator_objects', + syntax='proto3', + serialized_pb=_b('\n9mlagents_envs/communicator_objects/brain_parameters.proto\x12\x14\x63ommunicator_objects\x1a\x33mlagents_envs/communicator_objects/space_type.proto\"\x8b\x01\n\x0f\x41\x63tionSpecProto\x12\x1e\n\x16num_continuous_actions\x18\x01 \x01(\x05\x12\x1c\n\x14num_discrete_actions\x18\x02 \x01(\x05\x12\x1d\n\x15\x64iscrete_branch_sizes\x18\x03 \x03(\x05\x12\x1b\n\x13\x61\x63tion_descriptions\x18\x04 \x03(\t\"\xb6\x02\n\x14\x42rainParametersProto\x12%\n\x1dvector_action_size_deprecated\x18\x03 \x03(\x05\x12-\n%vector_action_descriptions_deprecated\x18\x05 \x03(\t\x12Q\n#vector_action_space_type_deprecated\x18\x06 \x01(\x0e\x32$.communicator_objects.SpaceTypeProto\x12\x12\n\nbrain_name\x18\x07 \x01(\t\x12\x13\n\x0bis_training\x18\x08 \x01(\x08\x12:\n\x0b\x61\x63tion_spec\x18\t \x01(\x0b\x32%.communicator_objects.ActionSpecProtoJ\x04\x08\x01\x10\x02J\x04\x08\x02\x10\x03J\x04\x08\x04\x10\x05\x42%\xaa\x02\"Unity.MLAgents.CommunicatorObjectsb\x06proto3') + , + dependencies=[mlagents__envs_dot_communicator__objects_dot_space__type__pb2.DESCRIPTOR,]) + + + + +_ACTIONSPECPROTO = _descriptor.Descriptor( + name='ActionSpecProto', + full_name='communicator_objects.ActionSpecProto', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='num_continuous_actions', full_name='communicator_objects.ActionSpecProto.num_continuous_actions', index=0, + number=1, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='num_discrete_actions', full_name='communicator_objects.ActionSpecProto.num_discrete_actions', index=1, + number=2, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='discrete_branch_sizes', full_name='communicator_objects.ActionSpecProto.discrete_branch_sizes', index=2, + number=3, type=5, cpp_type=1, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, 
+ is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='action_descriptions', full_name='communicator_objects.ActionSpecProto.action_descriptions', index=3, + number=4, type=9, cpp_type=9, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=137, + serialized_end=276, +) + + +_BRAINPARAMETERSPROTO = _descriptor.Descriptor( + name='BrainParametersProto', + full_name='communicator_objects.BrainParametersProto', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='vector_action_size_deprecated', full_name='communicator_objects.BrainParametersProto.vector_action_size_deprecated', index=0, + number=3, type=5, cpp_type=1, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='vector_action_descriptions_deprecated', full_name='communicator_objects.BrainParametersProto.vector_action_descriptions_deprecated', index=1, + number=5, type=9, cpp_type=9, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='vector_action_space_type_deprecated', full_name='communicator_objects.BrainParametersProto.vector_action_space_type_deprecated', index=2, + number=6, type=14, cpp_type=8, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='brain_name', full_name='communicator_objects.BrainParametersProto.brain_name', index=3, + number=7, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='is_training', full_name='communicator_objects.BrainParametersProto.is_training', index=4, + number=8, type=8, cpp_type=7, label=1, + has_default_value=False, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='action_spec', full_name='communicator_objects.BrainParametersProto.action_spec', index=5, + number=9, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=279, + serialized_end=589, +) + +_BRAINPARAMETERSPROTO.fields_by_name['vector_action_space_type_deprecated'].enum_type = mlagents__envs_dot_communicator__objects_dot_space__type__pb2._SPACETYPEPROTO 
+_BRAINPARAMETERSPROTO.fields_by_name['action_spec'].message_type = _ACTIONSPECPROTO +DESCRIPTOR.message_types_by_name['ActionSpecProto'] = _ACTIONSPECPROTO +DESCRIPTOR.message_types_by_name['BrainParametersProto'] = _BRAINPARAMETERSPROTO +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + +ActionSpecProto = _reflection.GeneratedProtocolMessageType('ActionSpecProto', (_message.Message,), dict( + DESCRIPTOR = _ACTIONSPECPROTO, + __module__ = 'mlagents_envs.communicator_objects.brain_parameters_pb2' + # @@protoc_insertion_point(class_scope:communicator_objects.ActionSpecProto) + )) +_sym_db.RegisterMessage(ActionSpecProto) + +BrainParametersProto = _reflection.GeneratedProtocolMessageType('BrainParametersProto', (_message.Message,), dict( + DESCRIPTOR = _BRAINPARAMETERSPROTO, + __module__ = 'mlagents_envs.communicator_objects.brain_parameters_pb2' + # @@protoc_insertion_point(class_scope:communicator_objects.BrainParametersProto) + )) +_sym_db.RegisterMessage(BrainParametersProto) + + +DESCRIPTOR.has_options = True +DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\252\002\"Unity.MLAgents.CommunicatorObjects')) +# @@protoc_insertion_point(module_scope) diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/capabilities_pb2.py b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/capabilities_pb2.py new file mode 100644 index 0000000000000000000000000000000000000000..35b8fbdef15752e0c75bdcd329afbc391d68e6f4 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/capabilities_pb2.py @@ -0,0 +1,113 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! +# source: mlagents_envs/communicator_objects/capabilities.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='mlagents_envs/communicator_objects/capabilities.proto', + package='communicator_objects', + syntax='proto3', + serialized_pb=_b('\n5mlagents_envs/communicator_objects/capabilities.proto\x12\x14\x63ommunicator_objects\"\xec\x01\n\x18UnityRLCapabilitiesProto\x12\x1a\n\x12\x62\x61seRLCapabilities\x18\x01 \x01(\x08\x12#\n\x1b\x63oncatenatedPngObservations\x18\x02 \x01(\x08\x12 \n\x18\x63ompressedChannelMapping\x18\x03 \x01(\x08\x12\x15\n\rhybridActions\x18\x04 \x01(\x08\x12\x19\n\x11trainingAnalytics\x18\x05 \x01(\x08\x12!\n\x19variableLengthObservation\x18\x06 \x01(\x08\x12\x18\n\x10multiAgentGroups\x18\x07 \x01(\x08\x42%\xaa\x02\"Unity.MLAgents.CommunicatorObjectsb\x06proto3') +) + + + + +_UNITYRLCAPABILITIESPROTO = _descriptor.Descriptor( + name='UnityRLCapabilitiesProto', + full_name='communicator_objects.UnityRLCapabilitiesProto', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='baseRLCapabilities', full_name='communicator_objects.UnityRLCapabilitiesProto.baseRLCapabilities', index=0, + number=1, type=8, cpp_type=7, label=1, + has_default_value=False, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + 
name='concatenatedPngObservations', full_name='communicator_objects.UnityRLCapabilitiesProto.concatenatedPngObservations', index=1, + number=2, type=8, cpp_type=7, label=1, + has_default_value=False, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='compressedChannelMapping', full_name='communicator_objects.UnityRLCapabilitiesProto.compressedChannelMapping', index=2, + number=3, type=8, cpp_type=7, label=1, + has_default_value=False, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='hybridActions', full_name='communicator_objects.UnityRLCapabilitiesProto.hybridActions', index=3, + number=4, type=8, cpp_type=7, label=1, + has_default_value=False, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='trainingAnalytics', full_name='communicator_objects.UnityRLCapabilitiesProto.trainingAnalytics', index=4, + number=5, type=8, cpp_type=7, label=1, + has_default_value=False, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='variableLengthObservation', full_name='communicator_objects.UnityRLCapabilitiesProto.variableLengthObservation', index=5, + number=6, type=8, cpp_type=7, label=1, + has_default_value=False, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='multiAgentGroups', full_name='communicator_objects.UnityRLCapabilitiesProto.multiAgentGroups', index=6, + number=7, type=8, cpp_type=7, label=1, + has_default_value=False, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=80, + serialized_end=316, +) + +DESCRIPTOR.message_types_by_name['UnityRLCapabilitiesProto'] = _UNITYRLCAPABILITIESPROTO +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + +UnityRLCapabilitiesProto = _reflection.GeneratedProtocolMessageType('UnityRLCapabilitiesProto', (_message.Message,), dict( + DESCRIPTOR = _UNITYRLCAPABILITIESPROTO, + __module__ = 'mlagents_envs.communicator_objects.capabilities_pb2' + # @@protoc_insertion_point(class_scope:communicator_objects.UnityRLCapabilitiesProto) + )) +_sym_db.RegisterMessage(UnityRLCapabilitiesProto) + + +DESCRIPTOR.has_options = True +DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\252\002\"Unity.MLAgents.CommunicatorObjects')) +# @@protoc_insertion_point(module_scope) diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/command_pb2.py b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/command_pb2.py new file mode 100644 index 0000000000000000000000000000000000000000..397f31098025a193a2365d150057ad1c3fc90c36 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/command_pb2.py @@ -0,0 +1,64 @@ +# 
Generated by the protocol buffer compiler. DO NOT EDIT! +# source: mlagents_envs/communicator_objects/command.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf.internal import enum_type_wrapper +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='mlagents_envs/communicator_objects/command.proto', + package='communicator_objects', + syntax='proto3', + serialized_pb=_b('\n0mlagents_envs/communicator_objects/command.proto\x12\x14\x63ommunicator_objects*-\n\x0c\x43ommandProto\x12\x08\n\x04STEP\x10\x00\x12\t\n\x05RESET\x10\x01\x12\x08\n\x04QUIT\x10\x02\x42%\xaa\x02\"Unity.MLAgents.CommunicatorObjectsb\x06proto3') +) + +_COMMANDPROTO = _descriptor.EnumDescriptor( + name='CommandProto', + full_name='communicator_objects.CommandProto', + filename=None, + file=DESCRIPTOR, + values=[ + _descriptor.EnumValueDescriptor( + name='STEP', index=0, number=0, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='RESET', index=1, number=1, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='QUIT', index=2, number=2, + options=None, + type=None), + ], + containing_type=None, + options=None, + serialized_start=74, + serialized_end=119, +) +_sym_db.RegisterEnumDescriptor(_COMMANDPROTO) + +CommandProto = enum_type_wrapper.EnumTypeWrapper(_COMMANDPROTO) +STEP = 0 +RESET = 1 +QUIT = 2 + + +DESCRIPTOR.enum_types_by_name['CommandProto'] = _COMMANDPROTO +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + + +DESCRIPTOR.has_options = True +DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\252\002\"Unity.MLAgents.CommunicatorObjects')) +# @@protoc_insertion_point(module_scope) diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/custom_reset_parameters_pb2.py b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/custom_reset_parameters_pb2.py new file mode 100644 index 0000000000000000000000000000000000000000..f4300d1016aa7e7dc6dcdea0f27fb21d641abb39 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/custom_reset_parameters_pb2.py @@ -0,0 +1,64 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! 
+# source: mlagents_envs/communicator_objects/custom_reset_parameters.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='mlagents_envs/communicator_objects/custom_reset_parameters.proto', + package='communicator_objects', + syntax='proto3', + serialized_pb=_b('\n@mlagents_envs/communicator_objects/custom_reset_parameters.proto\x12\x14\x63ommunicator_objects\"\x1c\n\x1a\x43ustomResetParametersProtoB%\xaa\x02\"Unity.MLAgents.CommunicatorObjectsb\x06proto3') +) + + + + +_CUSTOMRESETPARAMETERSPROTO = _descriptor.Descriptor( + name='CustomResetParametersProto', + full_name='communicator_objects.CustomResetParametersProto', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=90, + serialized_end=118, +) + +DESCRIPTOR.message_types_by_name['CustomResetParametersProto'] = _CUSTOMRESETPARAMETERSPROTO +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + +CustomResetParametersProto = _reflection.GeneratedProtocolMessageType('CustomResetParametersProto', (_message.Message,), dict( + DESCRIPTOR = _CUSTOMRESETPARAMETERSPROTO, + __module__ = 'mlagents_envs.communicator_objects.custom_reset_parameters_pb2' + # @@protoc_insertion_point(class_scope:communicator_objects.CustomResetParametersProto) + )) +_sym_db.RegisterMessage(CustomResetParametersProto) + + +DESCRIPTOR.has_options = True +DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\252\002\"Unity.MLAgents.CommunicatorObjects')) +# @@protoc_insertion_point(module_scope) diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/demonstration_meta_pb2.py b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/demonstration_meta_pb2.py new file mode 100644 index 0000000000000000000000000000000000000000..3b86fee455033d420e151518070a1e5174c1cd0a --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/demonstration_meta_pb2.py @@ -0,0 +1,99 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! 
+# source: mlagents_envs/communicator_objects/demonstration_meta.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='mlagents_envs/communicator_objects/demonstration_meta.proto', + package='communicator_objects', + syntax='proto3', + serialized_pb=_b('\n;mlagents_envs/communicator_objects/demonstration_meta.proto\x12\x14\x63ommunicator_objects\"\x8d\x01\n\x16\x44\x65monstrationMetaProto\x12\x13\n\x0b\x61pi_version\x18\x01 \x01(\x05\x12\x1a\n\x12\x64\x65monstration_name\x18\x02 \x01(\t\x12\x14\n\x0cnumber_steps\x18\x03 \x01(\x05\x12\x17\n\x0fnumber_episodes\x18\x04 \x01(\x05\x12\x13\n\x0bmean_reward\x18\x05 \x01(\x02\x42%\xaa\x02\"Unity.MLAgents.CommunicatorObjectsb\x06proto3') +) + + + + +_DEMONSTRATIONMETAPROTO = _descriptor.Descriptor( + name='DemonstrationMetaProto', + full_name='communicator_objects.DemonstrationMetaProto', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='api_version', full_name='communicator_objects.DemonstrationMetaProto.api_version', index=0, + number=1, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='demonstration_name', full_name='communicator_objects.DemonstrationMetaProto.demonstration_name', index=1, + number=2, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='number_steps', full_name='communicator_objects.DemonstrationMetaProto.number_steps', index=2, + number=3, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='number_episodes', full_name='communicator_objects.DemonstrationMetaProto.number_episodes', index=3, + number=4, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='mean_reward', full_name='communicator_objects.DemonstrationMetaProto.mean_reward', index=4, + number=5, type=2, cpp_type=6, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=86, + serialized_end=227, +) + +DESCRIPTOR.message_types_by_name['DemonstrationMetaProto'] = _DEMONSTRATIONMETAPROTO +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + +DemonstrationMetaProto = 
_reflection.GeneratedProtocolMessageType('DemonstrationMetaProto', (_message.Message,), dict( + DESCRIPTOR = _DEMONSTRATIONMETAPROTO, + __module__ = 'mlagents_envs.communicator_objects.demonstration_meta_pb2' + # @@protoc_insertion_point(class_scope:communicator_objects.DemonstrationMetaProto) + )) +_sym_db.RegisterMessage(DemonstrationMetaProto) + + +DESCRIPTOR.has_options = True +DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\252\002\"Unity.MLAgents.CommunicatorObjects')) +# @@protoc_insertion_point(module_scope) diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/engine_configuration_pb2.py b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/engine_configuration_pb2.py new file mode 100644 index 0000000000000000000000000000000000000000..9f4656ca8cb6048231b50892373ec94968b12622 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/engine_configuration_pb2.py @@ -0,0 +1,106 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! +# source: mlagents_envs/communicator_objects/engine_configuration.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='mlagents_envs/communicator_objects/engine_configuration.proto', + package='communicator_objects', + syntax='proto3', + serialized_pb=_b('\n=mlagents_envs/communicator_objects/engine_configuration.proto\x12\x14\x63ommunicator_objects\"\x95\x01\n\x18\x45ngineConfigurationProto\x12\r\n\x05width\x18\x01 \x01(\x05\x12\x0e\n\x06height\x18\x02 \x01(\x05\x12\x15\n\rquality_level\x18\x03 \x01(\x05\x12\x12\n\ntime_scale\x18\x04 \x01(\x02\x12\x19\n\x11target_frame_rate\x18\x05 \x01(\x05\x12\x14\n\x0cshow_monitor\x18\x06 \x01(\x08\x42%\xaa\x02\"Unity.MLAgents.CommunicatorObjectsb\x06proto3') +) + + + + +_ENGINECONFIGURATIONPROTO = _descriptor.Descriptor( + name='EngineConfigurationProto', + full_name='communicator_objects.EngineConfigurationProto', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='width', full_name='communicator_objects.EngineConfigurationProto.width', index=0, + number=1, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='height', full_name='communicator_objects.EngineConfigurationProto.height', index=1, + number=2, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='quality_level', full_name='communicator_objects.EngineConfigurationProto.quality_level', index=2, + number=3, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='time_scale', 
full_name='communicator_objects.EngineConfigurationProto.time_scale', index=3, + number=4, type=2, cpp_type=6, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='target_frame_rate', full_name='communicator_objects.EngineConfigurationProto.target_frame_rate', index=4, + number=5, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='show_monitor', full_name='communicator_objects.EngineConfigurationProto.show_monitor', index=5, + number=6, type=8, cpp_type=7, label=1, + has_default_value=False, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=88, + serialized_end=237, +) + +DESCRIPTOR.message_types_by_name['EngineConfigurationProto'] = _ENGINECONFIGURATIONPROTO +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + +EngineConfigurationProto = _reflection.GeneratedProtocolMessageType('EngineConfigurationProto', (_message.Message,), dict( + DESCRIPTOR = _ENGINECONFIGURATIONPROTO, + __module__ = 'mlagents_envs.communicator_objects.engine_configuration_pb2' + # @@protoc_insertion_point(class_scope:communicator_objects.EngineConfigurationProto) + )) +_sym_db.RegisterMessage(EngineConfigurationProto) + + +DESCRIPTOR.has_options = True +DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\252\002\"Unity.MLAgents.CommunicatorObjects')) +# @@protoc_insertion_point(module_scope) diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/header_pb2.py b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/header_pb2.py new file mode 100644 index 0000000000000000000000000000000000000000..5aa8eb89545b4c97e4eea1f3fd4c92e5cad2b463 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/header_pb2.py @@ -0,0 +1,78 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! 
+# source: mlagents_envs/communicator_objects/header.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='mlagents_envs/communicator_objects/header.proto', + package='communicator_objects', + syntax='proto3', + serialized_pb=_b('\n/mlagents_envs/communicator_objects/header.proto\x12\x14\x63ommunicator_objects\".\n\x0bHeaderProto\x12\x0e\n\x06status\x18\x01 \x01(\x05\x12\x0f\n\x07message\x18\x02 \x01(\tB%\xaa\x02\"Unity.MLAgents.CommunicatorObjectsb\x06proto3') +) + + + + +_HEADERPROTO = _descriptor.Descriptor( + name='HeaderProto', + full_name='communicator_objects.HeaderProto', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='status', full_name='communicator_objects.HeaderProto.status', index=0, + number=1, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='message', full_name='communicator_objects.HeaderProto.message', index=1, + number=2, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=73, + serialized_end=119, +) + +DESCRIPTOR.message_types_by_name['HeaderProto'] = _HEADERPROTO +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + +HeaderProto = _reflection.GeneratedProtocolMessageType('HeaderProto', (_message.Message,), dict( + DESCRIPTOR = _HEADERPROTO, + __module__ = 'mlagents_envs.communicator_objects.header_pb2' + # @@protoc_insertion_point(class_scope:communicator_objects.HeaderProto) + )) +_sym_db.RegisterMessage(HeaderProto) + + +DESCRIPTOR.has_options = True +DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\252\002\"Unity.MLAgents.CommunicatorObjects')) +# @@protoc_insertion_point(module_scope) diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/observation_pb2.py b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/observation_pb2.py new file mode 100644 index 0000000000000000000000000000000000000000..838ca1d87dabe02d7355fc627fe5b131196ab8bf --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/observation_pb2.py @@ -0,0 +1,224 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! 
+# source: mlagents_envs/communicator_objects/observation.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf.internal import enum_type_wrapper +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='mlagents_envs/communicator_objects/observation.proto', + package='communicator_objects', + syntax='proto3', + serialized_pb=_b('\n4mlagents_envs/communicator_objects/observation.proto\x12\x14\x63ommunicator_objects\"\x8f\x03\n\x10ObservationProto\x12\r\n\x05shape\x18\x01 \x03(\x05\x12\x44\n\x10\x63ompression_type\x18\x02 \x01(\x0e\x32*.communicator_objects.CompressionTypeProto\x12\x19\n\x0f\x63ompressed_data\x18\x03 \x01(\x0cH\x00\x12\x46\n\nfloat_data\x18\x04 \x01(\x0b\x32\x30.communicator_objects.ObservationProto.FloatDataH\x00\x12\"\n\x1a\x63ompressed_channel_mapping\x18\x05 \x03(\x05\x12\x1c\n\x14\x64imension_properties\x18\x06 \x03(\x05\x12\x44\n\x10observation_type\x18\x07 \x01(\x0e\x32*.communicator_objects.ObservationTypeProto\x12\x0c\n\x04name\x18\x08 \x01(\t\x1a\x19\n\tFloatData\x12\x0c\n\x04\x64\x61ta\x18\x01 \x03(\x02\x42\x12\n\x10observation_data*)\n\x14\x43ompressionTypeProto\x12\x08\n\x04NONE\x10\x00\x12\x07\n\x03PNG\x10\x01*@\n\x14ObservationTypeProto\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\x0f\n\x0bGOAL_SIGNAL\x10\x01\"\x04\x08\x02\x10\x02\"\x04\x08\x03\x10\x03\x42%\xaa\x02\"Unity.MLAgents.CommunicatorObjectsb\x06proto3') +) + +_COMPRESSIONTYPEPROTO = _descriptor.EnumDescriptor( + name='CompressionTypeProto', + full_name='communicator_objects.CompressionTypeProto', + filename=None, + file=DESCRIPTOR, + values=[ + _descriptor.EnumValueDescriptor( + name='NONE', index=0, number=0, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PNG', index=1, number=1, + options=None, + type=None), + ], + containing_type=None, + options=None, + serialized_start=480, + serialized_end=521, +) +_sym_db.RegisterEnumDescriptor(_COMPRESSIONTYPEPROTO) + +CompressionTypeProto = enum_type_wrapper.EnumTypeWrapper(_COMPRESSIONTYPEPROTO) +_OBSERVATIONTYPEPROTO = _descriptor.EnumDescriptor( + name='ObservationTypeProto', + full_name='communicator_objects.ObservationTypeProto', + filename=None, + file=DESCRIPTOR, + values=[ + _descriptor.EnumValueDescriptor( + name='DEFAULT', index=0, number=0, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='GOAL_SIGNAL', index=1, number=1, + options=None, + type=None), + ], + containing_type=None, + options=None, + serialized_start=523, + serialized_end=587, +) +_sym_db.RegisterEnumDescriptor(_OBSERVATIONTYPEPROTO) + +ObservationTypeProto = enum_type_wrapper.EnumTypeWrapper(_OBSERVATIONTYPEPROTO) +NONE = 0 +PNG = 1 +DEFAULT = 0 +GOAL_SIGNAL = 1 + + + +_OBSERVATIONPROTO_FLOATDATA = _descriptor.Descriptor( + name='FloatData', + full_name='communicator_objects.ObservationProto.FloatData', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='data', full_name='communicator_objects.ObservationProto.FloatData.data', index=0, + number=1, type=2, cpp_type=6, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, 
+ is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=433, + serialized_end=458, +) + +_OBSERVATIONPROTO = _descriptor.Descriptor( + name='ObservationProto', + full_name='communicator_objects.ObservationProto', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='shape', full_name='communicator_objects.ObservationProto.shape', index=0, + number=1, type=5, cpp_type=1, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='compression_type', full_name='communicator_objects.ObservationProto.compression_type', index=1, + number=2, type=14, cpp_type=8, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='compressed_data', full_name='communicator_objects.ObservationProto.compressed_data', index=2, + number=3, type=12, cpp_type=9, label=1, + has_default_value=False, default_value=_b(""), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='float_data', full_name='communicator_objects.ObservationProto.float_data', index=3, + number=4, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='compressed_channel_mapping', full_name='communicator_objects.ObservationProto.compressed_channel_mapping', index=4, + number=5, type=5, cpp_type=1, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='dimension_properties', full_name='communicator_objects.ObservationProto.dimension_properties', index=5, + number=6, type=5, cpp_type=1, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='observation_type', full_name='communicator_objects.ObservationProto.observation_type', index=6, + number=7, type=14, cpp_type=8, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='name', full_name='communicator_objects.ObservationProto.name', index=7, + number=8, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[_OBSERVATIONPROTO_FLOATDATA, ], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + 
_descriptor.OneofDescriptor( + name='observation_data', full_name='communicator_objects.ObservationProto.observation_data', + index=0, containing_type=None, fields=[]), + ], + serialized_start=79, + serialized_end=478, +) + +_OBSERVATIONPROTO_FLOATDATA.containing_type = _OBSERVATIONPROTO +_OBSERVATIONPROTO.fields_by_name['compression_type'].enum_type = _COMPRESSIONTYPEPROTO +_OBSERVATIONPROTO.fields_by_name['float_data'].message_type = _OBSERVATIONPROTO_FLOATDATA +_OBSERVATIONPROTO.fields_by_name['observation_type'].enum_type = _OBSERVATIONTYPEPROTO +_OBSERVATIONPROTO.oneofs_by_name['observation_data'].fields.append( + _OBSERVATIONPROTO.fields_by_name['compressed_data']) +_OBSERVATIONPROTO.fields_by_name['compressed_data'].containing_oneof = _OBSERVATIONPROTO.oneofs_by_name['observation_data'] +_OBSERVATIONPROTO.oneofs_by_name['observation_data'].fields.append( + _OBSERVATIONPROTO.fields_by_name['float_data']) +_OBSERVATIONPROTO.fields_by_name['float_data'].containing_oneof = _OBSERVATIONPROTO.oneofs_by_name['observation_data'] +DESCRIPTOR.message_types_by_name['ObservationProto'] = _OBSERVATIONPROTO +DESCRIPTOR.enum_types_by_name['CompressionTypeProto'] = _COMPRESSIONTYPEPROTO +DESCRIPTOR.enum_types_by_name['ObservationTypeProto'] = _OBSERVATIONTYPEPROTO +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + +ObservationProto = _reflection.GeneratedProtocolMessageType('ObservationProto', (_message.Message,), dict( + + FloatData = _reflection.GeneratedProtocolMessageType('FloatData', (_message.Message,), dict( + DESCRIPTOR = _OBSERVATIONPROTO_FLOATDATA, + __module__ = 'mlagents_envs.communicator_objects.observation_pb2' + # @@protoc_insertion_point(class_scope:communicator_objects.ObservationProto.FloatData) + )) + , + DESCRIPTOR = _OBSERVATIONPROTO, + __module__ = 'mlagents_envs.communicator_objects.observation_pb2' + # @@protoc_insertion_point(class_scope:communicator_objects.ObservationProto) + )) +_sym_db.RegisterMessage(ObservationProto) +_sym_db.RegisterMessage(ObservationProto.FloatData) + + +DESCRIPTOR.has_options = True +DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\252\002\"Unity.MLAgents.CommunicatorObjects')) +# @@protoc_insertion_point(module_scope) diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/space_type_pb2.py b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/space_type_pb2.py new file mode 100644 index 0000000000000000000000000000000000000000..df62c23319c2be5890aa5d021aad9f8590820dc3 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/space_type_pb2.py @@ -0,0 +1,59 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! 
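For reference, a minimal sketch of how the ObservationProto message generated above is typically built and round-tripped with the standard protobuf Python API. The import path follows the __module__ recorded in the generated file; the shape, data, and sensor name values are purely illustrative.

# Illustrative only; assumes the generated mlagents_envs package is importable.
from mlagents_envs.communicator_objects import observation_pb2

obs = observation_pb2.ObservationProto()
obs.shape.extend([84, 84, 3])                 # repeated int32 'shape' field
obs.compression_type = observation_pb2.PNG    # module-level enum constant (PNG = 1)
obs.compressed_data = b"\x89PNG..."           # selects the 'observation_data' oneof; 'float_data' is the other branch
obs.name = "CameraSensor"                     # illustrative sensor name

wire_bytes = obs.SerializeToString()
decoded = observation_pb2.ObservationProto()
decoded.ParseFromString(wire_bytes)
assert decoded.WhichOneof("observation_data") == "compressed_data"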
+# source: mlagents_envs/communicator_objects/space_type.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf.internal import enum_type_wrapper +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='mlagents_envs/communicator_objects/space_type.proto', + package='communicator_objects', + syntax='proto3', + serialized_pb=_b('\n3mlagents_envs/communicator_objects/space_type.proto\x12\x14\x63ommunicator_objects*.\n\x0eSpaceTypeProto\x12\x0c\n\x08\x64iscrete\x10\x00\x12\x0e\n\ncontinuous\x10\x01\x42%\xaa\x02\"Unity.MLAgents.CommunicatorObjectsb\x06proto3') +) + +_SPACETYPEPROTO = _descriptor.EnumDescriptor( + name='SpaceTypeProto', + full_name='communicator_objects.SpaceTypeProto', + filename=None, + file=DESCRIPTOR, + values=[ + _descriptor.EnumValueDescriptor( + name='discrete', index=0, number=0, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='continuous', index=1, number=1, + options=None, + type=None), + ], + containing_type=None, + options=None, + serialized_start=77, + serialized_end=123, +) +_sym_db.RegisterEnumDescriptor(_SPACETYPEPROTO) + +SpaceTypeProto = enum_type_wrapper.EnumTypeWrapper(_SPACETYPEPROTO) +discrete = 0 +continuous = 1 + + +DESCRIPTOR.enum_types_by_name['SpaceTypeProto'] = _SPACETYPEPROTO +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + + +DESCRIPTOR.has_options = True +DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\252\002\"Unity.MLAgents.CommunicatorObjects')) +# @@protoc_insertion_point(module_scope) diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/training_analytics_pb2.py b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/training_analytics_pb2.py new file mode 100644 index 0000000000000000000000000000000000000000..2701dac8586bf004fd61430e2c0aa55a9f412691 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/training_analytics_pb2.py @@ -0,0 +1,257 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! 
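The space_type module above declares only the SpaceTypeProto enum. A minimal sketch of how such a generated enum wrapper is typically queried; this is the standard protobuf EnumTypeWrapper API, nothing here is specific to ML-Agents.

# Illustrative only; assumes the generated module above is importable.
from mlagents_envs.communicator_objects import space_type_pb2

assert space_type_pb2.discrete == 0            # module-level constants set by the generated file
assert space_type_pb2.continuous == 1
assert space_type_pb2.SpaceTypeProto.Name(1) == "continuous"
assert space_type_pb2.SpaceTypeProto.Value("discrete") == 0
assert set(space_type_pb2.SpaceTypeProto.keys()) == {"discrete", "continuous"}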
+# source: mlagents_envs/communicator_objects/training_analytics.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='mlagents_envs/communicator_objects/training_analytics.proto', + package='communicator_objects', + syntax='proto3', + serialized_pb=_b('\n;mlagents_envs/communicator_objects/training_analytics.proto\x12\x14\x63ommunicator_objects\"\xee\x01\n\x1eTrainingEnvironmentInitialized\x12\x18\n\x10mlagents_version\x18\x01 \x01(\t\x12\x1d\n\x15mlagents_envs_version\x18\x02 \x01(\t\x12\x16\n\x0epython_version\x18\x03 \x01(\t\x12\x15\n\rtorch_version\x18\x04 \x01(\t\x12\x19\n\x11torch_device_type\x18\x05 \x01(\t\x12\x10\n\x08num_envs\x18\x06 \x01(\x05\x12\"\n\x1anum_environment_parameters\x18\x07 \x01(\x05\x12\x13\n\x0brun_options\x18\x08 \x01(\t\"\xbd\x03\n\x1bTrainingBehaviorInitialized\x12\x15\n\rbehavior_name\x18\x01 \x01(\t\x12\x14\n\x0ctrainer_type\x18\x02 \x01(\t\x12 \n\x18\x65xtrinsic_reward_enabled\x18\x03 \x01(\x08\x12\x1b\n\x13gail_reward_enabled\x18\x04 \x01(\x08\x12 \n\x18\x63uriosity_reward_enabled\x18\x05 \x01(\x08\x12\x1a\n\x12rnd_reward_enabled\x18\x06 \x01(\x08\x12\"\n\x1a\x62\x65havioral_cloning_enabled\x18\x07 \x01(\x08\x12\x19\n\x11recurrent_enabled\x18\x08 \x01(\x08\x12\x16\n\x0evisual_encoder\x18\t \x01(\t\x12\x1a\n\x12num_network_layers\x18\n \x01(\x05\x12 \n\x18num_network_hidden_units\x18\x0b \x01(\x05\x12\x18\n\x10trainer_threaded\x18\x0c \x01(\x08\x12\x19\n\x11self_play_enabled\x18\r \x01(\x08\x12\x1a\n\x12\x63urriculum_enabled\x18\x0e \x01(\x08\x12\x0e\n\x06\x63onfig\x18\x0f \x01(\tB%\xaa\x02\"Unity.MLAgents.CommunicatorObjectsb\x06proto3') +) + + + + +_TRAININGENVIRONMENTINITIALIZED = _descriptor.Descriptor( + name='TrainingEnvironmentInitialized', + full_name='communicator_objects.TrainingEnvironmentInitialized', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='mlagents_version', full_name='communicator_objects.TrainingEnvironmentInitialized.mlagents_version', index=0, + number=1, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='mlagents_envs_version', full_name='communicator_objects.TrainingEnvironmentInitialized.mlagents_envs_version', index=1, + number=2, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='python_version', full_name='communicator_objects.TrainingEnvironmentInitialized.python_version', index=2, + number=3, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='torch_version', 
full_name='communicator_objects.TrainingEnvironmentInitialized.torch_version', index=3, + number=4, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='torch_device_type', full_name='communicator_objects.TrainingEnvironmentInitialized.torch_device_type', index=4, + number=5, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='num_envs', full_name='communicator_objects.TrainingEnvironmentInitialized.num_envs', index=5, + number=6, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='num_environment_parameters', full_name='communicator_objects.TrainingEnvironmentInitialized.num_environment_parameters', index=6, + number=7, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='run_options', full_name='communicator_objects.TrainingEnvironmentInitialized.run_options', index=7, + number=8, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=86, + serialized_end=324, +) + + +_TRAININGBEHAVIORINITIALIZED = _descriptor.Descriptor( + name='TrainingBehaviorInitialized', + full_name='communicator_objects.TrainingBehaviorInitialized', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='behavior_name', full_name='communicator_objects.TrainingBehaviorInitialized.behavior_name', index=0, + number=1, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='trainer_type', full_name='communicator_objects.TrainingBehaviorInitialized.trainer_type', index=1, + number=2, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='extrinsic_reward_enabled', full_name='communicator_objects.TrainingBehaviorInitialized.extrinsic_reward_enabled', index=2, + number=3, type=8, cpp_type=7, label=1, + has_default_value=False, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='gail_reward_enabled', 
full_name='communicator_objects.TrainingBehaviorInitialized.gail_reward_enabled', index=3, + number=4, type=8, cpp_type=7, label=1, + has_default_value=False, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='curiosity_reward_enabled', full_name='communicator_objects.TrainingBehaviorInitialized.curiosity_reward_enabled', index=4, + number=5, type=8, cpp_type=7, label=1, + has_default_value=False, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='rnd_reward_enabled', full_name='communicator_objects.TrainingBehaviorInitialized.rnd_reward_enabled', index=5, + number=6, type=8, cpp_type=7, label=1, + has_default_value=False, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='behavioral_cloning_enabled', full_name='communicator_objects.TrainingBehaviorInitialized.behavioral_cloning_enabled', index=6, + number=7, type=8, cpp_type=7, label=1, + has_default_value=False, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='recurrent_enabled', full_name='communicator_objects.TrainingBehaviorInitialized.recurrent_enabled', index=7, + number=8, type=8, cpp_type=7, label=1, + has_default_value=False, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='visual_encoder', full_name='communicator_objects.TrainingBehaviorInitialized.visual_encoder', index=8, + number=9, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='num_network_layers', full_name='communicator_objects.TrainingBehaviorInitialized.num_network_layers', index=9, + number=10, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='num_network_hidden_units', full_name='communicator_objects.TrainingBehaviorInitialized.num_network_hidden_units', index=10, + number=11, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='trainer_threaded', full_name='communicator_objects.TrainingBehaviorInitialized.trainer_threaded', index=11, + number=12, type=8, cpp_type=7, label=1, + has_default_value=False, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='self_play_enabled', full_name='communicator_objects.TrainingBehaviorInitialized.self_play_enabled', index=12, + number=13, type=8, cpp_type=7, 
label=1, + has_default_value=False, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='curriculum_enabled', full_name='communicator_objects.TrainingBehaviorInitialized.curriculum_enabled', index=13, + number=14, type=8, cpp_type=7, label=1, + has_default_value=False, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='config', full_name='communicator_objects.TrainingBehaviorInitialized.config', index=14, + number=15, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=327, + serialized_end=772, +) + +DESCRIPTOR.message_types_by_name['TrainingEnvironmentInitialized'] = _TRAININGENVIRONMENTINITIALIZED +DESCRIPTOR.message_types_by_name['TrainingBehaviorInitialized'] = _TRAININGBEHAVIORINITIALIZED +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + +TrainingEnvironmentInitialized = _reflection.GeneratedProtocolMessageType('TrainingEnvironmentInitialized', (_message.Message,), dict( + DESCRIPTOR = _TRAININGENVIRONMENTINITIALIZED, + __module__ = 'mlagents_envs.communicator_objects.training_analytics_pb2' + # @@protoc_insertion_point(class_scope:communicator_objects.TrainingEnvironmentInitialized) + )) +_sym_db.RegisterMessage(TrainingEnvironmentInitialized) + +TrainingBehaviorInitialized = _reflection.GeneratedProtocolMessageType('TrainingBehaviorInitialized', (_message.Message,), dict( + DESCRIPTOR = _TRAININGBEHAVIORINITIALIZED, + __module__ = 'mlagents_envs.communicator_objects.training_analytics_pb2' + # @@protoc_insertion_point(class_scope:communicator_objects.TrainingBehaviorInitialized) + )) +_sym_db.RegisterMessage(TrainingBehaviorInitialized) + + +DESCRIPTOR.has_options = True +DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\252\002\"Unity.MLAgents.CommunicatorObjects')) +# @@protoc_insertion_point(module_scope) diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/unity_input_pb2.py b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/unity_input_pb2.py new file mode 100644 index 0000000000000000000000000000000000000000..6c4a24f3922f209ed928119b97c0f67ea38e7b15 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/unity_input_pb2.py @@ -0,0 +1,83 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! 
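For reference, a minimal sketch of how the two analytics messages generated above (TrainingEnvironmentInitialized and TrainingBehaviorInitialized) are typically populated. The version strings and the behavior name are illustrative placeholders, not values taken from this package.

# Illustrative only; field names come from the descriptors above.
from mlagents_envs.communicator_objects import training_analytics_pb2

env_init = training_analytics_pb2.TrainingEnvironmentInitialized()
env_init.mlagents_version = "x.y.z"         # placeholder version string
env_init.python_version = "3.9"             # placeholder version string
env_init.num_envs = 1

behavior_init = training_analytics_pb2.TrainingBehaviorInitialized()
behavior_init.behavior_name = "MyBehavior"  # hypothetical behavior name
behavior_init.trainer_type = "ppo"
behavior_init.extrinsic_reward_enabled = True

payload = env_init.SerializeToString()      # serialized bytes, ready to hand to the communicator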
+# source: mlagents_envs/communicator_objects/unity_input.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +from mlagents_envs.communicator_objects import unity_rl_input_pb2 as mlagents__envs_dot_communicator__objects_dot_unity__rl__input__pb2 +from mlagents_envs.communicator_objects import unity_rl_initialization_input_pb2 as mlagents__envs_dot_communicator__objects_dot_unity__rl__initialization__input__pb2 + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='mlagents_envs/communicator_objects/unity_input.proto', + package='communicator_objects', + syntax='proto3', + serialized_pb=_b('\n4mlagents_envs/communicator_objects/unity_input.proto\x12\x14\x63ommunicator_objects\x1a\x37mlagents_envs/communicator_objects/unity_rl_input.proto\x1a\x46mlagents_envs/communicator_objects/unity_rl_initialization_input.proto\"\xa4\x01\n\x0fUnityInputProto\x12\x39\n\x08rl_input\x18\x01 \x01(\x0b\x32\'.communicator_objects.UnityRLInputProto\x12V\n\x17rl_initialization_input\x18\x02 \x01(\x0b\x32\x35.communicator_objects.UnityRLInitializationInputProtoB%\xaa\x02\"Unity.MLAgents.CommunicatorObjectsb\x06proto3') + , + dependencies=[mlagents__envs_dot_communicator__objects_dot_unity__rl__input__pb2.DESCRIPTOR,mlagents__envs_dot_communicator__objects_dot_unity__rl__initialization__input__pb2.DESCRIPTOR,]) + + + + +_UNITYINPUTPROTO = _descriptor.Descriptor( + name='UnityInputProto', + full_name='communicator_objects.UnityInputProto', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='rl_input', full_name='communicator_objects.UnityInputProto.rl_input', index=0, + number=1, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='rl_initialization_input', full_name='communicator_objects.UnityInputProto.rl_initialization_input', index=1, + number=2, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=208, + serialized_end=372, +) + +_UNITYINPUTPROTO.fields_by_name['rl_input'].message_type = mlagents__envs_dot_communicator__objects_dot_unity__rl__input__pb2._UNITYRLINPUTPROTO +_UNITYINPUTPROTO.fields_by_name['rl_initialization_input'].message_type = mlagents__envs_dot_communicator__objects_dot_unity__rl__initialization__input__pb2._UNITYRLINITIALIZATIONINPUTPROTO +DESCRIPTOR.message_types_by_name['UnityInputProto'] = _UNITYINPUTPROTO +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + +UnityInputProto = _reflection.GeneratedProtocolMessageType('UnityInputProto', (_message.Message,), dict( + DESCRIPTOR = _UNITYINPUTPROTO, + __module__ = 'mlagents_envs.communicator_objects.unity_input_pb2' + # 
@@protoc_insertion_point(class_scope:communicator_objects.UnityInputProto) + )) +_sym_db.RegisterMessage(UnityInputProto) + + +DESCRIPTOR.has_options = True +DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\252\002\"Unity.MLAgents.CommunicatorObjects')) +# @@protoc_insertion_point(module_scope) diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/unity_message_pb2.py b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/unity_message_pb2.py new file mode 100644 index 0000000000000000000000000000000000000000..e01b52298f8b05f477802c53ea5ab5a9082f9b0e --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/unity_message_pb2.py @@ -0,0 +1,92 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! +# source: mlagents_envs/communicator_objects/unity_message.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +from mlagents_envs.communicator_objects import unity_output_pb2 as mlagents__envs_dot_communicator__objects_dot_unity__output__pb2 +from mlagents_envs.communicator_objects import unity_input_pb2 as mlagents__envs_dot_communicator__objects_dot_unity__input__pb2 +from mlagents_envs.communicator_objects import header_pb2 as mlagents__envs_dot_communicator__objects_dot_header__pb2 + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='mlagents_envs/communicator_objects/unity_message.proto', + package='communicator_objects', + syntax='proto3', + serialized_pb=_b('\n6mlagents_envs/communicator_objects/unity_message.proto\x12\x14\x63ommunicator_objects\x1a\x35mlagents_envs/communicator_objects/unity_output.proto\x1a\x34mlagents_envs/communicator_objects/unity_input.proto\x1a/mlagents_envs/communicator_objects/header.proto\"\xc0\x01\n\x11UnityMessageProto\x12\x31\n\x06header\x18\x01 \x01(\x0b\x32!.communicator_objects.HeaderProto\x12<\n\x0cunity_output\x18\x02 \x01(\x0b\x32&.communicator_objects.UnityOutputProto\x12:\n\x0bunity_input\x18\x03 \x01(\x0b\x32%.communicator_objects.UnityInputProtoB%\xaa\x02\"Unity.MLAgents.CommunicatorObjectsb\x06proto3') + , + dependencies=[mlagents__envs_dot_communicator__objects_dot_unity__output__pb2.DESCRIPTOR,mlagents__envs_dot_communicator__objects_dot_unity__input__pb2.DESCRIPTOR,mlagents__envs_dot_communicator__objects_dot_header__pb2.DESCRIPTOR,]) + + + + +_UNITYMESSAGEPROTO = _descriptor.Descriptor( + name='UnityMessageProto', + full_name='communicator_objects.UnityMessageProto', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='header', full_name='communicator_objects.UnityMessageProto.header', index=0, + number=1, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='unity_output', full_name='communicator_objects.UnityMessageProto.unity_output', index=1, + number=2, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, 
extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='unity_input', full_name='communicator_objects.UnityMessageProto.unity_input', index=2, + number=3, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=239, + serialized_end=431, +) + +_UNITYMESSAGEPROTO.fields_by_name['header'].message_type = mlagents__envs_dot_communicator__objects_dot_header__pb2._HEADERPROTO +_UNITYMESSAGEPROTO.fields_by_name['unity_output'].message_type = mlagents__envs_dot_communicator__objects_dot_unity__output__pb2._UNITYOUTPUTPROTO +_UNITYMESSAGEPROTO.fields_by_name['unity_input'].message_type = mlagents__envs_dot_communicator__objects_dot_unity__input__pb2._UNITYINPUTPROTO +DESCRIPTOR.message_types_by_name['UnityMessageProto'] = _UNITYMESSAGEPROTO +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + +UnityMessageProto = _reflection.GeneratedProtocolMessageType('UnityMessageProto', (_message.Message,), dict( + DESCRIPTOR = _UNITYMESSAGEPROTO, + __module__ = 'mlagents_envs.communicator_objects.unity_message_pb2' + # @@protoc_insertion_point(class_scope:communicator_objects.UnityMessageProto) + )) +_sym_db.RegisterMessage(UnityMessageProto) + + +DESCRIPTOR.has_options = True +DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\252\002\"Unity.MLAgents.CommunicatorObjects')) +# @@protoc_insertion_point(module_scope) diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/unity_output_pb2.py b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/unity_output_pb2.py new file mode 100644 index 0000000000000000000000000000000000000000..400fdb00f736a6b6a496b270b20bb65bdcc304f6 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/unity_output_pb2.py @@ -0,0 +1,83 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! 
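A minimal sketch of how the UnityMessageProto envelope generated just above composes the input messages. HeaderProto comes from header_pb2, which is not part of this hunk, so the 'status' field used below is an assumption; the seed and num_areas fields are defined in unity_rl_initialization_input_pb2 further below, and their values here are illustrative.

# Illustrative only; 'header.status' is assumed from header_pb2 (not shown in this diff).
from mlagents_envs.communicator_objects import unity_message_pb2

msg = unity_message_pb2.UnityMessageProto()
msg.header.status = 200                             # assumed HeaderProto field
msg.unity_input.rl_initialization_input.seed = 42   # nested messages are created on first write
msg.unity_input.rl_initialization_input.num_areas = 1
wire_bytes = msg.SerializeToString()

decoded = unity_message_pb2.UnityMessageProto()
decoded.ParseFromString(wire_bytes)
assert decoded.HasField("unity_input")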
+# source: mlagents_envs/communicator_objects/unity_output.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +from mlagents_envs.communicator_objects import unity_rl_output_pb2 as mlagents__envs_dot_communicator__objects_dot_unity__rl__output__pb2 +from mlagents_envs.communicator_objects import unity_rl_initialization_output_pb2 as mlagents__envs_dot_communicator__objects_dot_unity__rl__initialization__output__pb2 + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='mlagents_envs/communicator_objects/unity_output.proto', + package='communicator_objects', + syntax='proto3', + serialized_pb=_b('\n5mlagents_envs/communicator_objects/unity_output.proto\x12\x14\x63ommunicator_objects\x1a\x38mlagents_envs/communicator_objects/unity_rl_output.proto\x1aGmlagents_envs/communicator_objects/unity_rl_initialization_output.proto\"\xa9\x01\n\x10UnityOutputProto\x12;\n\trl_output\x18\x01 \x01(\x0b\x32(.communicator_objects.UnityRLOutputProto\x12X\n\x18rl_initialization_output\x18\x02 \x01(\x0b\x32\x36.communicator_objects.UnityRLInitializationOutputProtoB%\xaa\x02\"Unity.MLAgents.CommunicatorObjectsb\x06proto3') + , + dependencies=[mlagents__envs_dot_communicator__objects_dot_unity__rl__output__pb2.DESCRIPTOR,mlagents__envs_dot_communicator__objects_dot_unity__rl__initialization__output__pb2.DESCRIPTOR,]) + + + + +_UNITYOUTPUTPROTO = _descriptor.Descriptor( + name='UnityOutputProto', + full_name='communicator_objects.UnityOutputProto', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='rl_output', full_name='communicator_objects.UnityOutputProto.rl_output', index=0, + number=1, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='rl_initialization_output', full_name='communicator_objects.UnityOutputProto.rl_initialization_output', index=1, + number=2, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=211, + serialized_end=380, +) + +_UNITYOUTPUTPROTO.fields_by_name['rl_output'].message_type = mlagents__envs_dot_communicator__objects_dot_unity__rl__output__pb2._UNITYRLOUTPUTPROTO +_UNITYOUTPUTPROTO.fields_by_name['rl_initialization_output'].message_type = mlagents__envs_dot_communicator__objects_dot_unity__rl__initialization__output__pb2._UNITYRLINITIALIZATIONOUTPUTPROTO +DESCRIPTOR.message_types_by_name['UnityOutputProto'] = _UNITYOUTPUTPROTO +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + +UnityOutputProto = _reflection.GeneratedProtocolMessageType('UnityOutputProto', (_message.Message,), dict( + DESCRIPTOR = _UNITYOUTPUTPROTO, + __module__ = 'mlagents_envs.communicator_objects.unity_output_pb2' + # 
@@protoc_insertion_point(class_scope:communicator_objects.UnityOutputProto) + )) +_sym_db.RegisterMessage(UnityOutputProto) + + +DESCRIPTOR.has_options = True +DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\252\002\"Unity.MLAgents.CommunicatorObjects')) +# @@protoc_insertion_point(module_scope) diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/unity_rl_initialization_input_pb2.py b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/unity_rl_initialization_input_pb2.py new file mode 100644 index 0000000000000000000000000000000000000000..d111397adaa25c1d73a00291092fb98358ef0ff7 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/unity_rl_initialization_input_pb2.py @@ -0,0 +1,102 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! +# source: mlagents_envs/communicator_objects/unity_rl_initialization_input.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +from mlagents_envs.communicator_objects import capabilities_pb2 as mlagents__envs_dot_communicator__objects_dot_capabilities__pb2 + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='mlagents_envs/communicator_objects/unity_rl_initialization_input.proto', + package='communicator_objects', + syntax='proto3', + serialized_pb=_b('\nFmlagents_envs/communicator_objects/unity_rl_initialization_input.proto\x12\x14\x63ommunicator_objects\x1a\x35mlagents_envs/communicator_objects/capabilities.proto\"\xc0\x01\n\x1fUnityRLInitializationInputProto\x12\x0c\n\x04seed\x18\x01 \x01(\x05\x12\x1d\n\x15\x63ommunication_version\x18\x02 \x01(\t\x12\x17\n\x0fpackage_version\x18\x03 \x01(\t\x12\x44\n\x0c\x63\x61pabilities\x18\x04 \x01(\x0b\x32..communicator_objects.UnityRLCapabilitiesProto\x12\x11\n\tnum_areas\x18\x05 \x01(\x05\x42%\xaa\x02\"Unity.MLAgents.CommunicatorObjectsb\x06proto3') + , + dependencies=[mlagents__envs_dot_communicator__objects_dot_capabilities__pb2.DESCRIPTOR,]) + + + + +_UNITYRLINITIALIZATIONINPUTPROTO = _descriptor.Descriptor( + name='UnityRLInitializationInputProto', + full_name='communicator_objects.UnityRLInitializationInputProto', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='seed', full_name='communicator_objects.UnityRLInitializationInputProto.seed', index=0, + number=1, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='communication_version', full_name='communicator_objects.UnityRLInitializationInputProto.communication_version', index=1, + number=2, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='package_version', full_name='communicator_objects.UnityRLInitializationInputProto.package_version', index=2, + number=3, type=9, cpp_type=9, label=1, + has_default_value=False, 
default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='capabilities', full_name='communicator_objects.UnityRLInitializationInputProto.capabilities', index=3, + number=4, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='num_areas', full_name='communicator_objects.UnityRLInitializationInputProto.num_areas', index=4, + number=5, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=152, + serialized_end=344, +) + +_UNITYRLINITIALIZATIONINPUTPROTO.fields_by_name['capabilities'].message_type = mlagents__envs_dot_communicator__objects_dot_capabilities__pb2._UNITYRLCAPABILITIESPROTO +DESCRIPTOR.message_types_by_name['UnityRLInitializationInputProto'] = _UNITYRLINITIALIZATIONINPUTPROTO +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + +UnityRLInitializationInputProto = _reflection.GeneratedProtocolMessageType('UnityRLInitializationInputProto', (_message.Message,), dict( + DESCRIPTOR = _UNITYRLINITIALIZATIONINPUTPROTO, + __module__ = 'mlagents_envs.communicator_objects.unity_rl_initialization_input_pb2' + # @@protoc_insertion_point(class_scope:communicator_objects.UnityRLInitializationInputProto) + )) +_sym_db.RegisterMessage(UnityRLInitializationInputProto) + + +DESCRIPTOR.has_options = True +DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\252\002\"Unity.MLAgents.CommunicatorObjects')) +# @@protoc_insertion_point(module_scope) diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/unity_rl_initialization_output_pb2.py b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/unity_rl_initialization_output_pb2.py new file mode 100644 index 0000000000000000000000000000000000000000..155af96d825b5ee5ae1f1df3adaa0abf35c65db2 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/unity_rl_initialization_output_pb2.py @@ -0,0 +1,111 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! 
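For reference, a minimal sketch against the UnityRLInitializationOutputProto message that the file beginning here defines (its descriptor follows below). The name and version strings are illustrative, and the brain_parameters entries are BrainParametersProto messages from brain_parameters_pb2, which is not part of this hunk, so the entry is left empty.

# Illustrative only; BrainParametersProto field names are not shown in this hunk.
from mlagents_envs.communicator_objects import unity_rl_initialization_output_pb2

init_out = unity_rl_initialization_output_pb2.UnityRLInitializationOutputProto()
init_out.name = "Academy"                 # illustrative
init_out.communication_version = "1.5.0"  # illustrative
init_out.package_version = "0.0.0"        # illustrative
init_out.brain_parameters.add()           # appends an empty BrainParametersProto
assert len(init_out.brain_parameters) == 1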
+# source: mlagents_envs/communicator_objects/unity_rl_initialization_output.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +from mlagents_envs.communicator_objects import capabilities_pb2 as mlagents__envs_dot_communicator__objects_dot_capabilities__pb2 +from mlagents_envs.communicator_objects import brain_parameters_pb2 as mlagents__envs_dot_communicator__objects_dot_brain__parameters__pb2 + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='mlagents_envs/communicator_objects/unity_rl_initialization_output.proto', + package='communicator_objects', + syntax='proto3', + serialized_pb=_b('\nGmlagents_envs/communicator_objects/unity_rl_initialization_output.proto\x12\x14\x63ommunicator_objects\x1a\x35mlagents_envs/communicator_objects/capabilities.proto\x1a\x39mlagents_envs/communicator_objects/brain_parameters.proto\"\x8c\x02\n UnityRLInitializationOutputProto\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1d\n\x15\x63ommunication_version\x18\x02 \x01(\t\x12\x10\n\x08log_path\x18\x03 \x01(\t\x12\x44\n\x10\x62rain_parameters\x18\x05 \x03(\x0b\x32*.communicator_objects.BrainParametersProto\x12\x17\n\x0fpackage_version\x18\x07 \x01(\t\x12\x44\n\x0c\x63\x61pabilities\x18\x08 \x01(\x0b\x32..communicator_objects.UnityRLCapabilitiesProtoJ\x04\x08\x06\x10\x07\x42%\xaa\x02\"Unity.MLAgents.CommunicatorObjectsb\x06proto3') + , + dependencies=[mlagents__envs_dot_communicator__objects_dot_capabilities__pb2.DESCRIPTOR,mlagents__envs_dot_communicator__objects_dot_brain__parameters__pb2.DESCRIPTOR,]) + + + + +_UNITYRLINITIALIZATIONOUTPUTPROTO = _descriptor.Descriptor( + name='UnityRLInitializationOutputProto', + full_name='communicator_objects.UnityRLInitializationOutputProto', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='name', full_name='communicator_objects.UnityRLInitializationOutputProto.name', index=0, + number=1, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='communication_version', full_name='communicator_objects.UnityRLInitializationOutputProto.communication_version', index=1, + number=2, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='log_path', full_name='communicator_objects.UnityRLInitializationOutputProto.log_path', index=2, + number=3, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='brain_parameters', full_name='communicator_objects.UnityRLInitializationOutputProto.brain_parameters', index=3, + number=5, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + 
message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='package_version', full_name='communicator_objects.UnityRLInitializationOutputProto.package_version', index=4, + number=7, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='capabilities', full_name='communicator_objects.UnityRLInitializationOutputProto.capabilities', index=5, + number=8, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=212, + serialized_end=480, +) + +_UNITYRLINITIALIZATIONOUTPUTPROTO.fields_by_name['brain_parameters'].message_type = mlagents__envs_dot_communicator__objects_dot_brain__parameters__pb2._BRAINPARAMETERSPROTO +_UNITYRLINITIALIZATIONOUTPUTPROTO.fields_by_name['capabilities'].message_type = mlagents__envs_dot_communicator__objects_dot_capabilities__pb2._UNITYRLCAPABILITIESPROTO +DESCRIPTOR.message_types_by_name['UnityRLInitializationOutputProto'] = _UNITYRLINITIALIZATIONOUTPUTPROTO +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + +UnityRLInitializationOutputProto = _reflection.GeneratedProtocolMessageType('UnityRLInitializationOutputProto', (_message.Message,), dict( + DESCRIPTOR = _UNITYRLINITIALIZATIONOUTPUTPROTO, + __module__ = 'mlagents_envs.communicator_objects.unity_rl_initialization_output_pb2' + # @@protoc_insertion_point(class_scope:communicator_objects.UnityRLInitializationOutputProto) + )) +_sym_db.RegisterMessage(UnityRLInitializationOutputProto) + + +DESCRIPTOR.has_options = True +DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\252\002\"Unity.MLAgents.CommunicatorObjects')) +# @@protoc_insertion_point(module_scope) diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/unity_rl_input_pb2.py b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/unity_rl_input_pb2.py new file mode 100644 index 0000000000000000000000000000000000000000..69e14f3fd291f3cedd03446926fadfd2f4e589b8 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/unity_rl_input_pb2.py @@ -0,0 +1,179 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! 
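A minimal sketch of the map-of-actions pattern used by the UnityRLInputProto message that the file beginning here defines (descriptor below). The behavior-name key is a hypothetical example, and the AgentActionProto entries are left empty because agent_action_pb2 is not part of this hunk.

# Illustrative only; the map key is a hypothetical behavior name.
from mlagents_envs.communicator_objects import unity_rl_input_pb2

rl_input = unity_rl_input_pb2.UnityRLInputProto()
# agent_actions is a map<string, ListAgentActionProto>; indexing with a new key
# creates the entry, and value.add() appends an (empty) AgentActionProto.
behavior_actions = rl_input.agent_actions["MyBehavior?team=0"]
behavior_actions.value.add()
rl_input.side_channel = b""

assert len(rl_input.agent_actions) == 1
assert len(rl_input.agent_actions["MyBehavior?team=0"].value) == 1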
+# source: mlagents_envs/communicator_objects/unity_rl_input.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +from mlagents_envs.communicator_objects import agent_action_pb2 as mlagents__envs_dot_communicator__objects_dot_agent__action__pb2 +from mlagents_envs.communicator_objects import command_pb2 as mlagents__envs_dot_communicator__objects_dot_command__pb2 + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='mlagents_envs/communicator_objects/unity_rl_input.proto', + package='communicator_objects', + syntax='proto3', + serialized_pb=_b('\n7mlagents_envs/communicator_objects/unity_rl_input.proto\x12\x14\x63ommunicator_objects\x1a\x35mlagents_envs/communicator_objects/agent_action.proto\x1a\x30mlagents_envs/communicator_objects/command.proto\"\xfe\x02\n\x11UnityRLInputProto\x12P\n\ragent_actions\x18\x01 \x03(\x0b\x32\x39.communicator_objects.UnityRLInputProto.AgentActionsEntry\x12\x33\n\x07\x63ommand\x18\x04 \x01(\x0e\x32\".communicator_objects.CommandProto\x12\x14\n\x0cside_channel\x18\x05 \x01(\x0c\x1aM\n\x14ListAgentActionProto\x12\x35\n\x05value\x18\x01 \x03(\x0b\x32&.communicator_objects.AgentActionProto\x1aq\n\x11\x41gentActionsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12K\n\x05value\x18\x02 \x01(\x0b\x32<.communicator_objects.UnityRLInputProto.ListAgentActionProto:\x02\x38\x01J\x04\x08\x02\x10\x03J\x04\x08\x03\x10\x04\x42%\xaa\x02\"Unity.MLAgents.CommunicatorObjectsb\x06proto3') + , + dependencies=[mlagents__envs_dot_communicator__objects_dot_agent__action__pb2.DESCRIPTOR,mlagents__envs_dot_communicator__objects_dot_command__pb2.DESCRIPTOR,]) + + + + +_UNITYRLINPUTPROTO_LISTAGENTACTIONPROTO = _descriptor.Descriptor( + name='ListAgentActionProto', + full_name='communicator_objects.UnityRLInputProto.ListAgentActionProto', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='value', full_name='communicator_objects.UnityRLInputProto.ListAgentActionProto.value', index=0, + number=1, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=365, + serialized_end=442, +) + +_UNITYRLINPUTPROTO_AGENTACTIONSENTRY = _descriptor.Descriptor( + name='AgentActionsEntry', + full_name='communicator_objects.UnityRLInputProto.AgentActionsEntry', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='key', full_name='communicator_objects.UnityRLInputProto.AgentActionsEntry.key', index=0, + number=1, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='value', full_name='communicator_objects.UnityRLInputProto.AgentActionsEntry.value', index=1, + number=2, 
type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=_descriptor._ParseOptions(descriptor_pb2.MessageOptions(), _b('8\001')), + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=444, + serialized_end=557, +) + +_UNITYRLINPUTPROTO = _descriptor.Descriptor( + name='UnityRLInputProto', + full_name='communicator_objects.UnityRLInputProto', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='agent_actions', full_name='communicator_objects.UnityRLInputProto.agent_actions', index=0, + number=1, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='command', full_name='communicator_objects.UnityRLInputProto.command', index=1, + number=4, type=14, cpp_type=8, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='side_channel', full_name='communicator_objects.UnityRLInputProto.side_channel', index=2, + number=5, type=12, cpp_type=9, label=1, + has_default_value=False, default_value=_b(""), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[_UNITYRLINPUTPROTO_LISTAGENTACTIONPROTO, _UNITYRLINPUTPROTO_AGENTACTIONSENTRY, ], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=187, + serialized_end=569, +) + +_UNITYRLINPUTPROTO_LISTAGENTACTIONPROTO.fields_by_name['value'].message_type = mlagents__envs_dot_communicator__objects_dot_agent__action__pb2._AGENTACTIONPROTO +_UNITYRLINPUTPROTO_LISTAGENTACTIONPROTO.containing_type = _UNITYRLINPUTPROTO +_UNITYRLINPUTPROTO_AGENTACTIONSENTRY.fields_by_name['value'].message_type = _UNITYRLINPUTPROTO_LISTAGENTACTIONPROTO +_UNITYRLINPUTPROTO_AGENTACTIONSENTRY.containing_type = _UNITYRLINPUTPROTO +_UNITYRLINPUTPROTO.fields_by_name['agent_actions'].message_type = _UNITYRLINPUTPROTO_AGENTACTIONSENTRY +_UNITYRLINPUTPROTO.fields_by_name['command'].enum_type = mlagents__envs_dot_communicator__objects_dot_command__pb2._COMMANDPROTO +DESCRIPTOR.message_types_by_name['UnityRLInputProto'] = _UNITYRLINPUTPROTO +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + +UnityRLInputProto = _reflection.GeneratedProtocolMessageType('UnityRLInputProto', (_message.Message,), dict( + + ListAgentActionProto = _reflection.GeneratedProtocolMessageType('ListAgentActionProto', (_message.Message,), dict( + DESCRIPTOR = _UNITYRLINPUTPROTO_LISTAGENTACTIONPROTO, + __module__ = 'mlagents_envs.communicator_objects.unity_rl_input_pb2' + # @@protoc_insertion_point(class_scope:communicator_objects.UnityRLInputProto.ListAgentActionProto) + )) + , + + AgentActionsEntry = _reflection.GeneratedProtocolMessageType('AgentActionsEntry', (_message.Message,), dict( + DESCRIPTOR = _UNITYRLINPUTPROTO_AGENTACTIONSENTRY, + __module__ = 'mlagents_envs.communicator_objects.unity_rl_input_pb2' + # 
@@protoc_insertion_point(class_scope:communicator_objects.UnityRLInputProto.AgentActionsEntry) + )) + , + DESCRIPTOR = _UNITYRLINPUTPROTO, + __module__ = 'mlagents_envs.communicator_objects.unity_rl_input_pb2' + # @@protoc_insertion_point(class_scope:communicator_objects.UnityRLInputProto) + )) +_sym_db.RegisterMessage(UnityRLInputProto) +_sym_db.RegisterMessage(UnityRLInputProto.ListAgentActionProto) +_sym_db.RegisterMessage(UnityRLInputProto.AgentActionsEntry) + + +DESCRIPTOR.has_options = True +DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\252\002\"Unity.MLAgents.CommunicatorObjects')) +_UNITYRLINPUTPROTO_AGENTACTIONSENTRY.has_options = True +_UNITYRLINPUTPROTO_AGENTACTIONSENTRY._options = _descriptor._ParseOptions(descriptor_pb2.MessageOptions(), _b('8\001')) +# @@protoc_insertion_point(module_scope) diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/unity_rl_output_pb2.py b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/unity_rl_output_pb2.py new file mode 100644 index 0000000000000000000000000000000000000000..099d0b0040e98418950442adc34410e908a52a08 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/unity_rl_output_pb2.py @@ -0,0 +1,170 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! +# source: mlagents_envs/communicator_objects/unity_rl_output.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +from mlagents_envs.communicator_objects import agent_info_pb2 as mlagents__envs_dot_communicator__objects_dot_agent__info__pb2 + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='mlagents_envs/communicator_objects/unity_rl_output.proto', + package='communicator_objects', + syntax='proto3', + serialized_pb=_b('\n8mlagents_envs/communicator_objects/unity_rl_output.proto\x12\x14\x63ommunicator_objects\x1a\x33mlagents_envs/communicator_objects/agent_info.proto\"\xb9\x02\n\x12UnityRLOutputProto\x12L\n\nagentInfos\x18\x02 \x03(\x0b\x32\x38.communicator_objects.UnityRLOutputProto.AgentInfosEntry\x12\x14\n\x0cside_channel\x18\x03 \x01(\x0c\x1aI\n\x12ListAgentInfoProto\x12\x33\n\x05value\x18\x01 \x03(\x0b\x32$.communicator_objects.AgentInfoProto\x1an\n\x0f\x41gentInfosEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12J\n\x05value\x18\x02 \x01(\x0b\x32;.communicator_objects.UnityRLOutputProto.ListAgentInfoProto:\x02\x38\x01J\x04\x08\x01\x10\x02\x42%\xaa\x02\"Unity.MLAgents.CommunicatorObjectsb\x06proto3') + , + dependencies=[mlagents__envs_dot_communicator__objects_dot_agent__info__pb2.DESCRIPTOR,]) + + + + +_UNITYRLOUTPUTPROTO_LISTAGENTINFOPROTO = _descriptor.Descriptor( + name='ListAgentInfoProto', + full_name='communicator_objects.UnityRLOutputProto.ListAgentInfoProto', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='value', full_name='communicator_objects.UnityRLOutputProto.ListAgentInfoProto.value', index=0, + number=1, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ 
+ ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=258, + serialized_end=331, +) + +_UNITYRLOUTPUTPROTO_AGENTINFOSENTRY = _descriptor.Descriptor( + name='AgentInfosEntry', + full_name='communicator_objects.UnityRLOutputProto.AgentInfosEntry', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='key', full_name='communicator_objects.UnityRLOutputProto.AgentInfosEntry.key', index=0, + number=1, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='value', full_name='communicator_objects.UnityRLOutputProto.AgentInfosEntry.value', index=1, + number=2, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=_descriptor._ParseOptions(descriptor_pb2.MessageOptions(), _b('8\001')), + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=333, + serialized_end=443, +) + +_UNITYRLOUTPUTPROTO = _descriptor.Descriptor( + name='UnityRLOutputProto', + full_name='communicator_objects.UnityRLOutputProto', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='agentInfos', full_name='communicator_objects.UnityRLOutputProto.agentInfos', index=0, + number=2, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='side_channel', full_name='communicator_objects.UnityRLOutputProto.side_channel', index=1, + number=3, type=12, cpp_type=9, label=1, + has_default_value=False, default_value=_b(""), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[_UNITYRLOUTPUTPROTO_LISTAGENTINFOPROTO, _UNITYRLOUTPUTPROTO_AGENTINFOSENTRY, ], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=136, + serialized_end=449, +) + +_UNITYRLOUTPUTPROTO_LISTAGENTINFOPROTO.fields_by_name['value'].message_type = mlagents__envs_dot_communicator__objects_dot_agent__info__pb2._AGENTINFOPROTO +_UNITYRLOUTPUTPROTO_LISTAGENTINFOPROTO.containing_type = _UNITYRLOUTPUTPROTO +_UNITYRLOUTPUTPROTO_AGENTINFOSENTRY.fields_by_name['value'].message_type = _UNITYRLOUTPUTPROTO_LISTAGENTINFOPROTO +_UNITYRLOUTPUTPROTO_AGENTINFOSENTRY.containing_type = _UNITYRLOUTPUTPROTO +_UNITYRLOUTPUTPROTO.fields_by_name['agentInfos'].message_type = _UNITYRLOUTPUTPROTO_AGENTINFOSENTRY +DESCRIPTOR.message_types_by_name['UnityRLOutputProto'] = _UNITYRLOUTPUTPROTO +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + +UnityRLOutputProto = _reflection.GeneratedProtocolMessageType('UnityRLOutputProto', (_message.Message,), dict( + + ListAgentInfoProto = _reflection.GeneratedProtocolMessageType('ListAgentInfoProto', (_message.Message,), dict( + DESCRIPTOR = 
_UNITYRLOUTPUTPROTO_LISTAGENTINFOPROTO, + __module__ = 'mlagents_envs.communicator_objects.unity_rl_output_pb2' + # @@protoc_insertion_point(class_scope:communicator_objects.UnityRLOutputProto.ListAgentInfoProto) + )) + , + + AgentInfosEntry = _reflection.GeneratedProtocolMessageType('AgentInfosEntry', (_message.Message,), dict( + DESCRIPTOR = _UNITYRLOUTPUTPROTO_AGENTINFOSENTRY, + __module__ = 'mlagents_envs.communicator_objects.unity_rl_output_pb2' + # @@protoc_insertion_point(class_scope:communicator_objects.UnityRLOutputProto.AgentInfosEntry) + )) + , + DESCRIPTOR = _UNITYRLOUTPUTPROTO, + __module__ = 'mlagents_envs.communicator_objects.unity_rl_output_pb2' + # @@protoc_insertion_point(class_scope:communicator_objects.UnityRLOutputProto) + )) +_sym_db.RegisterMessage(UnityRLOutputProto) +_sym_db.RegisterMessage(UnityRLOutputProto.ListAgentInfoProto) +_sym_db.RegisterMessage(UnityRLOutputProto.AgentInfosEntry) + + +DESCRIPTOR.has_options = True +DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\252\002\"Unity.MLAgents.CommunicatorObjects')) +_UNITYRLOUTPUTPROTO_AGENTINFOSENTRY.has_options = True +_UNITYRLOUTPUTPROTO_AGENTINFOSENTRY._options = _descriptor._ParseOptions(descriptor_pb2.MessageOptions(), _b('8\001')) +# @@protoc_insertion_point(module_scope) diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/unity_to_external_pb2.py b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/unity_to_external_pb2.py new file mode 100644 index 0000000000000000000000000000000000000000..6053ccc46a4b550649b8f20238b15117f8748e88 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/unity_to_external_pb2.py @@ -0,0 +1,58 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! 
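As a quick orientation to the generated map types above, here is a minimal sketch (the behavior name and field values are illustrative only, not taken from this codebase) of how UnityRLOutputProto's agentInfos map can be populated and read back with the standard protobuf Python API:

from mlagents_envs.communicator_objects.unity_rl_output_pb2 import UnityRLOutputProto
from mlagents_envs.communicator_objects.agent_info_pb2 import AgentInfoProto

out = UnityRLOutputProto()
# Map entries are created on first access; "MyBehavior?team=0" is a hypothetical behavior name.
out.agentInfos["MyBehavior?team=0"].value.append(AgentInfoProto(id=7, reward=1.0))
payload = out.SerializeToString()
decoded = UnityRLOutputProto.FromString(payload)
for behavior, infos in decoded.agentInfos.items():
    print(behavior, [agent.id for agent in infos.value])

In practice these messages are produced by the C# side and only deserialized here.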
+# source: mlagents_envs/communicator_objects/unity_to_external.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +from mlagents_envs.communicator_objects import unity_message_pb2 as mlagents__envs_dot_communicator__objects_dot_unity__message__pb2 + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='mlagents_envs/communicator_objects/unity_to_external.proto', + package='communicator_objects', + syntax='proto3', + serialized_pb=_b('\n:mlagents_envs/communicator_objects/unity_to_external.proto\x12\x14\x63ommunicator_objects\x1a\x36mlagents_envs/communicator_objects/unity_message.proto2v\n\x14UnityToExternalProto\x12^\n\x08\x45xchange\x12\'.communicator_objects.UnityMessageProto\x1a\'.communicator_objects.UnityMessageProto\"\x00\x42%\xaa\x02\"Unity.MLAgents.CommunicatorObjectsb\x06proto3') + , + dependencies=[mlagents__envs_dot_communicator__objects_dot_unity__message__pb2.DESCRIPTOR,]) + + + +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + + +DESCRIPTOR.has_options = True +DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\252\002\"Unity.MLAgents.CommunicatorObjects')) + +_UNITYTOEXTERNALPROTO = _descriptor.ServiceDescriptor( + name='UnityToExternalProto', + full_name='communicator_objects.UnityToExternalProto', + file=DESCRIPTOR, + index=0, + options=None, + serialized_start=140, + serialized_end=258, + methods=[ + _descriptor.MethodDescriptor( + name='Exchange', + full_name='communicator_objects.UnityToExternalProto.Exchange', + index=0, + containing_service=None, + input_type=mlagents__envs_dot_communicator__objects_dot_unity__message__pb2._UNITYMESSAGEPROTO, + output_type=mlagents__envs_dot_communicator__objects_dot_unity__message__pb2._UNITYMESSAGEPROTO, + options=None, + ), +]) +_sym_db.RegisterServiceDescriptor(_UNITYTOEXTERNALPROTO) + +DESCRIPTOR.services_by_name['UnityToExternalProto'] = _UNITYTOEXTERNALPROTO + +# @@protoc_insertion_point(module_scope) diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/unity_to_external_pb2_grpc.py b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/unity_to_external_pb2_grpc.py new file mode 100644 index 0000000000000000000000000000000000000000..bedeb100594c7ba9728c2118b526a18de5f2421d --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/unity_to_external_pb2_grpc.py @@ -0,0 +1,46 @@ +# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! +import grpc + +from mlagents_envs.communicator_objects import unity_message_pb2 as mlagents__envs_dot_communicator__objects_dot_unity__message__pb2 + + +class UnityToExternalProtoStub(object): + # missing associated documentation comment in .proto file + pass + + def __init__(self, channel): + """Constructor. + + Args: + channel: A grpc.Channel. 
+ """ + self.Exchange = channel.unary_unary( + '/communicator_objects.UnityToExternalProto/Exchange', + request_serializer=mlagents__envs_dot_communicator__objects_dot_unity__message__pb2.UnityMessageProto.SerializeToString, + response_deserializer=mlagents__envs_dot_communicator__objects_dot_unity__message__pb2.UnityMessageProto.FromString, + ) + + +class UnityToExternalProtoServicer(object): + # missing associated documentation comment in .proto file + pass + + def Exchange(self, request, context): + """Sends the academy parameters + """ + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + + +def add_UnityToExternalProtoServicer_to_server(servicer, server): + rpc_method_handlers = { + 'Exchange': grpc.unary_unary_rpc_method_handler( + servicer.Exchange, + request_deserializer=mlagents__envs_dot_communicator__objects_dot_unity__message__pb2.UnityMessageProto.FromString, + response_serializer=mlagents__envs_dot_communicator__objects_dot_unity__message__pb2.UnityMessageProto.SerializeToString, + ), + } + generic_handler = grpc.method_handlers_generic_handler( + 'communicator_objects.UnityToExternalProto', rpc_method_handlers) + server.add_generic_rpc_handlers((generic_handler,)) diff --git a/MLPY/Lib/site-packages/mlagents_envs/env_utils.py b/MLPY/Lib/site-packages/mlagents_envs/env_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..73393d945617a2e3ac0c3489939ae9575d5d8d80 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/env_utils.py @@ -0,0 +1,130 @@ +import glob +import os +import subprocess +from sys import platform +from typing import Optional, List +from mlagents_envs.logging_util import get_logger, DEBUG +from mlagents_envs.exception import UnityEnvironmentException + + +logger = get_logger(__name__) + + +def get_platform(): + """ + returns the platform of the operating system : linux, darwin or win32 + """ + return platform + + +def validate_environment_path(env_path: str) -> Optional[str]: + """ + Strip out executable extensions of the env_path + :param env_path: The path to the executable + """ + env_path = ( + env_path.strip() + .replace(".app", "") + .replace(".exe", "") + .replace(".x86_64", "") + .replace(".x86", "") + ) + true_filename = os.path.basename(os.path.normpath(env_path)) + logger.debug(f"The true file name is {true_filename}") + + if not (glob.glob(env_path) or glob.glob(env_path + ".*")): + return None + + cwd = os.getcwd() + launch_string = None + true_filename = os.path.basename(os.path.normpath(env_path)) + if get_platform() == "linux" or get_platform() == "linux2": + candidates = glob.glob(os.path.join(cwd, env_path) + ".x86_64") + if len(candidates) == 0: + candidates = glob.glob(os.path.join(cwd, env_path) + ".x86") + if len(candidates) == 0: + candidates = glob.glob(env_path + ".x86_64") + if len(candidates) == 0: + candidates = glob.glob(env_path + ".x86") + if len(candidates) == 0: + if os.path.isfile(env_path): + candidates = [env_path] + if len(candidates) > 0: + launch_string = candidates[0] + + elif get_platform() == "darwin": + candidates = glob.glob( + os.path.join(cwd, env_path + ".app", "Contents", "MacOS", true_filename) + ) + if len(candidates) == 0: + candidates = glob.glob( + os.path.join(env_path + ".app", "Contents", "MacOS", true_filename) + ) + if len(candidates) == 0: + candidates = glob.glob( + os.path.join(cwd, env_path + ".app", "Contents", "MacOS", "*") + ) + if len(candidates) == 0: + candidates = 
glob.glob( + os.path.join(env_path + ".app", "Contents", "MacOS", "*") + ) + if len(candidates) > 0: + launch_string = candidates[0] + elif get_platform() == "win32": + candidates = glob.glob(os.path.join(cwd, env_path + ".exe")) + if len(candidates) == 0: + candidates = glob.glob(env_path + ".exe") + if len(candidates) == 0: + # Look for e.g. 3DBall\UnityEnvironment.exe + crash_handlers = set( + glob.glob(os.path.join(cwd, env_path, "UnityCrashHandler*.exe")) + ) + candidates = [ + c + for c in glob.glob(os.path.join(cwd, env_path, "*.exe")) + if c not in crash_handlers + ] + if len(candidates) > 0: + launch_string = candidates[0] + return launch_string + + +def launch_executable(file_name: str, args: List[str]) -> subprocess.Popen: + """ + Launches a Unity executable and returns the process handle for it. + :param file_name: the name of the executable + :param args: List of string that will be passed as command line arguments + when launching the executable. + """ + launch_string = validate_environment_path(file_name) + if launch_string is None: + raise UnityEnvironmentException( + f"Couldn't launch the {file_name} environment. Provided filename does not match any environments." + ) + else: + logger.debug(f"The launch string is {launch_string}") + logger.debug(f"Running with args {args}") + # Launch Unity environment + subprocess_args = [launch_string] + args + # std_out_option = DEVNULL means the outputs will not be displayed on terminal. + # std_out_option = None is default behavior: the outputs are displayed on terminal. + std_out_option = subprocess.DEVNULL if logger.level > DEBUG else None + try: + return subprocess.Popen( + subprocess_args, + # start_new_session=True means that signals to the parent python process + # (e.g. SIGINT from keyboard interrupt) will not be sent to the new process on POSIX platforms. + # This is generally good since we want the environment to have a chance to shutdown, + # but may be undesirable in come cases; if so, we'll add a command-line toggle. + # Note that on Windows, the CTRL_C signal will still be sent. + start_new_session=True, + stdout=std_out_option, + stderr=std_out_option, + ) + except PermissionError as perm: + # This is likely due to missing read or execute permissions on file. + raise UnityEnvironmentException( + f"Error when trying to launch environment - make sure " + f"permissions are set correctly. 
For example " + f'"chmod -R 755 {launch_string}"' + ) from perm diff --git a/MLPY/Lib/site-packages/mlagents_envs/environment.py b/MLPY/Lib/site-packages/mlagents_envs/environment.py new file mode 100644 index 0000000000000000000000000000000000000000..18731a20bb198c4be456d5fe8772fb2dae3e4574 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/environment.py @@ -0,0 +1,518 @@ +import atexit +from distutils.version import StrictVersion + +import numpy as np +import os +import subprocess +from typing import Dict, List, Optional, Tuple, Mapping as MappingType + +import mlagents_envs + +from mlagents_envs.logging_util import get_logger +from mlagents_envs.side_channel.side_channel import SideChannel +from mlagents_envs.side_channel import DefaultTrainingAnalyticsSideChannel +from mlagents_envs.side_channel.side_channel_manager import SideChannelManager +from mlagents_envs import env_utils + +from mlagents_envs.base_env import ( + BaseEnv, + DecisionSteps, + TerminalSteps, + BehaviorSpec, + ActionTuple, + BehaviorName, + AgentId, + BehaviorMapping, +) +from mlagents_envs.timers import timed, hierarchical_timer +from mlagents_envs.exception import ( + UnityEnvironmentException, + UnityActionException, + UnityTimeOutException, + UnityCommunicatorStoppedException, +) + +from mlagents_envs.communicator_objects.command_pb2 import STEP, RESET +from mlagents_envs.rpc_utils import behavior_spec_from_proto, steps_from_proto + +from mlagents_envs.communicator_objects.unity_rl_input_pb2 import UnityRLInputProto +from mlagents_envs.communicator_objects.unity_rl_output_pb2 import UnityRLOutputProto +from mlagents_envs.communicator_objects.agent_action_pb2 import AgentActionProto +from mlagents_envs.communicator_objects.unity_output_pb2 import UnityOutputProto +from mlagents_envs.communicator_objects.capabilities_pb2 import UnityRLCapabilitiesProto +from mlagents_envs.communicator_objects.unity_rl_initialization_input_pb2 import ( + UnityRLInitializationInputProto, +) + +from mlagents_envs.communicator_objects.unity_input_pb2 import UnityInputProto + +from .rpc_communicator import RpcCommunicator +import signal + +logger = get_logger(__name__) + + +class UnityEnvironment(BaseEnv): + # Communication protocol version. + # When connecting to C#, this must be compatible with Academy.k_ApiVersion. + # We follow semantic versioning on the communication version, so existing + # functionality will work as long the major versions match. + # This should be changed whenever a change is made to the communication protocol. + # Revision history: + # * 1.0.0 - initial version + # * 1.1.0 - support concatenated PNGs for compressed observations. + # * 1.2.0 - support compression mapping for stacked compressed observations. + # * 1.3.0 - support action spaces with both continuous and discrete actions. + # * 1.4.0 - support training analytics sent from python trainer to the editor. + # * 1.5.0 - support variable length observation training and multi-agent groups. + API_VERSION = "1.5.0" + + # Default port that the editor listens on. If an environment executable + # isn't specified, this port will be used. + DEFAULT_EDITOR_PORT = 5004 + + # Default base port for environments. Each environment will be offset from this + # by it's worker_id. + BASE_ENVIRONMENT_PORT = 5005 + + # Command line argument used to pass the port to the executable environment. 
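A minimal sketch of driving env_utils directly, assuming a hypothetical build named "3DBall" reachable from the working directory; in normal use UnityEnvironment (below) calls launch_executable itself and manages the process lifetime:

from mlagents_envs import env_utils

launch_string = env_utils.validate_environment_path("3DBall")  # None if no matching executable is found
if launch_string is not None:
    proc = env_utils.launch_executable("3DBall", ["-nographics", "-batchmode"])
    # ... connect over gRPC, then shut the process down when finished
    proc.kill()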
+ _PORT_COMMAND_LINE_ARG = "--mlagents-port" + + @staticmethod + def _raise_version_exception(unity_com_ver: str) -> None: + raise UnityEnvironmentException( + f"The communication API version is not compatible between Unity and python. " + f"Python API: {UnityEnvironment.API_VERSION}, Unity API: {unity_com_ver}.\n " + f"Please find the versions that work best together from our release page.\n" + "https://github.com/Unity-Technologies/ml-agents/releases" + ) + + @staticmethod + def _check_communication_compatibility( + unity_com_ver: str, python_api_version: str, unity_package_version: str + ) -> bool: + unity_communicator_version = StrictVersion(unity_com_ver) + api_version = StrictVersion(python_api_version) + if unity_communicator_version.version[0] == 0: + if ( + unity_communicator_version.version[0] != api_version.version[0] + or unity_communicator_version.version[1] != api_version.version[1] + ): + # Minor beta versions differ. + return False + elif unity_communicator_version.version[0] != api_version.version[0]: + # Major versions mismatch. + return False + else: + # Major versions match, so either: + # 1) The versions are identical, in which case there's no compatibility issues + # 2) The Unity version is newer, in which case we'll warn or fail on the Unity side if trying to use + # unsupported features + # 3) The trainer version is newer, in which case new trainer features might be available but unused by C# + # In any of the cases, there's no reason to warn about mismatch here. + logger.info( + f"Connected to Unity environment with package version {unity_package_version} " + f"and communication version {unity_com_ver}" + ) + return True + + @staticmethod + def _get_capabilities_proto() -> UnityRLCapabilitiesProto: + capabilities = UnityRLCapabilitiesProto() + capabilities.baseRLCapabilities = True + capabilities.concatenatedPngObservations = True + capabilities.compressedChannelMapping = True + capabilities.hybridActions = True + capabilities.trainingAnalytics = True + capabilities.variableLengthObservation = True + capabilities.multiAgentGroups = True + return capabilities + + @staticmethod + def _warn_csharp_base_capabilities( + caps: UnityRLCapabilitiesProto, unity_package_ver: str, python_package_ver: str + ) -> None: + if not caps.baseRLCapabilities: + logger.warning( + "WARNING: The Unity process is not running with the expected base Reinforcement Learning" + " capabilities. Please be sure upgrade the Unity Package to a version that is compatible with this " + "python package.\n" + f"Python package version: {python_package_ver}, C# package version: {unity_package_ver}" + f"Please find the versions that work best together from our release page.\n" + "https://github.com/Unity-Technologies/ml-agents/releases" + ) + + def __init__( + self, + file_name: Optional[str] = None, + worker_id: int = 0, + base_port: Optional[int] = None, + seed: int = 0, + no_graphics: bool = False, + timeout_wait: int = 60, + additional_args: Optional[List[str]] = None, + side_channels: Optional[List[SideChannel]] = None, + log_folder: Optional[str] = None, + num_areas: int = 1, + ): + """ + Starts a new unity environment and establishes a connection with the environment. + Notice: Currently communication between Unity and Python takes place over an open socket without authentication. + Ensure that the network where training takes place is secure. + + :string file_name: Name of Unity environment binary. + :int base_port: Baseline port number to connect to Unity environment over. 
worker_id increments over this. + If no environment is specified (i.e. file_name is None), the DEFAULT_EDITOR_PORT will be used. + :int worker_id: Offset from base_port. Used for training multiple environments simultaneously. + :bool no_graphics: Whether to run the Unity simulator in no-graphics mode + :int timeout_wait: Time (in seconds) to wait for connection from environment. + :list args: Addition Unity command line arguments + :list side_channels: Additional side channel for no-rl communication with Unity + :str log_folder: Optional folder to write the Unity Player log file into. Requires absolute path. + """ + atexit.register(self._close) + self._additional_args = additional_args or [] + self._no_graphics = no_graphics + # If base port is not specified, use BASE_ENVIRONMENT_PORT if we have + # an environment, otherwise DEFAULT_EDITOR_PORT + if base_port is None: + base_port = ( + self.BASE_ENVIRONMENT_PORT if file_name else self.DEFAULT_EDITOR_PORT + ) + self._port = base_port + worker_id + self._buffer_size = 12000 + # If true, this means the environment was successfully loaded + self._loaded = False + # The process that is started. If None, no process was started + self._process: Optional[subprocess.Popen] = None + self._timeout_wait: int = timeout_wait + self._communicator = self._get_communicator(worker_id, base_port, timeout_wait) + self._worker_id = worker_id + if side_channels is None: + side_channels = [] + default_training_side_channel: Optional[ + DefaultTrainingAnalyticsSideChannel + ] = None + if DefaultTrainingAnalyticsSideChannel.CHANNEL_ID not in [ + _.channel_id for _ in side_channels + ]: + default_training_side_channel = DefaultTrainingAnalyticsSideChannel() + side_channels.append(default_training_side_channel) + self._side_channel_manager = SideChannelManager(side_channels) + self._log_folder = log_folder + self.academy_capabilities: UnityRLCapabilitiesProto = None # type: ignore + + # If the environment name is None, a new environment will not be launched + # and the communicator will directly try to connect to an existing unity environment. + # If the worker-id is not 0 and the environment name is None, an error is thrown + if file_name is None and worker_id != 0: + raise UnityEnvironmentException( + "If the environment name is None, " + "the worker-id must be 0 in order to connect with the Editor." + ) + if file_name is not None: + try: + self._process = env_utils.launch_executable( + file_name, self._executable_args() + ) + except UnityEnvironmentException: + self._close(0) + raise + else: + logger.info( + f"Listening on port {self._port}. " + f"Start training by pressing the Play button in the Unity Editor." 
+ ) + self._loaded = True + + rl_init_parameters_in = UnityRLInitializationInputProto( + seed=seed, + communication_version=self.API_VERSION, + package_version=mlagents_envs.__version__, + capabilities=UnityEnvironment._get_capabilities_proto(), + num_areas=num_areas, + ) + try: + aca_output = self._send_academy_parameters(rl_init_parameters_in) + aca_params = aca_output.rl_initialization_output + except UnityTimeOutException: + self._close(0) + raise + + if not UnityEnvironment._check_communication_compatibility( + aca_params.communication_version, + UnityEnvironment.API_VERSION, + aca_params.package_version, + ): + self._close(0) + UnityEnvironment._raise_version_exception(aca_params.communication_version) + + UnityEnvironment._warn_csharp_base_capabilities( + aca_params.capabilities, + aca_params.package_version, + UnityEnvironment.API_VERSION, + ) + + self._env_state: Dict[str, Tuple[DecisionSteps, TerminalSteps]] = {} + self._env_specs: Dict[str, BehaviorSpec] = {} + self._env_actions: Dict[str, ActionTuple] = {} + self._is_first_message = True + self._update_behavior_specs(aca_output) + self.academy_capabilities = aca_params.capabilities + if default_training_side_channel is not None: + default_training_side_channel.environment_initialized() + + @staticmethod + def _get_communicator(worker_id, base_port, timeout_wait): + return RpcCommunicator(worker_id, base_port, timeout_wait) + + def _executable_args(self) -> List[str]: + args: List[str] = [] + if self._no_graphics: + args += ["-nographics", "-batchmode"] + args += [UnityEnvironment._PORT_COMMAND_LINE_ARG, str(self._port)] + + # If the logfile arg isn't already set in the env args, + # try to set it to an output directory + logfile_set = "-logfile" in (arg.lower() for arg in self._additional_args) + if self._log_folder and not logfile_set: + log_file_path = os.path.join( + self._log_folder, f"Player-{self._worker_id}.log" + ) + args += ["-logFile", log_file_path] + # Add in arguments passed explicitly by the user. + args += self._additional_args + return args + + def _update_behavior_specs(self, output: UnityOutputProto) -> None: + init_output = output.rl_initialization_output + for brain_param in init_output.brain_parameters: + # Each BrainParameter in the rl_initialization_output should have at least one AgentInfo + # Get that agent, because we need some of its observations. + agent_infos = output.rl_output.agentInfos[brain_param.brain_name] + if agent_infos.value: + agent = agent_infos.value[0] + new_spec = behavior_spec_from_proto(brain_param, agent) + self._env_specs[brain_param.brain_name] = new_spec + logger.info(f"Connected new brain: {brain_param.brain_name}") + + def _update_state(self, output: UnityRLOutputProto) -> None: + """ + Collects experience information from all external brains in environment at current step. 
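A minimal driver loop against the UnityEnvironment API defined here, assuming file_name=None so the communicator attaches to an Editor in Play mode; empty_action simply sends all-zero actions, the same fallback step() uses below for behaviors with no action set:

from mlagents_envs.environment import UnityEnvironment

env = UnityEnvironment(file_name=None)  # press Play in the Unity Editor to connect
env.reset()
behavior_name = list(env.behavior_specs)[0]
spec = env.behavior_specs[behavior_name]
for _ in range(10):
    decision_steps, terminal_steps = env.get_steps(behavior_name)
    env.set_actions(behavior_name, spec.action_spec.empty_action(len(decision_steps)))
    env.step()
env.close()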
+ """ + for brain_name in self._env_specs.keys(): + if brain_name in output.agentInfos: + agent_info_list = output.agentInfos[brain_name].value + self._env_state[brain_name] = steps_from_proto( + agent_info_list, self._env_specs[brain_name] + ) + else: + self._env_state[brain_name] = ( + DecisionSteps.empty(self._env_specs[brain_name]), + TerminalSteps.empty(self._env_specs[brain_name]), + ) + self._side_channel_manager.process_side_channel_message(output.side_channel) + + def reset(self) -> None: + if self._loaded: + outputs = self._communicator.exchange( + self._generate_reset_input(), self._poll_process + ) + if outputs is None: + raise UnityCommunicatorStoppedException("Communicator has exited.") + self._update_behavior_specs(outputs) + rl_output = outputs.rl_output + self._update_state(rl_output) + self._is_first_message = False + self._env_actions.clear() + else: + raise UnityEnvironmentException("No Unity environment is loaded.") + + @timed + def step(self) -> None: + if self._is_first_message: + return self.reset() + if not self._loaded: + raise UnityEnvironmentException("No Unity environment is loaded.") + # fill the blanks for missing actions + for group_name in self._env_specs: + if group_name not in self._env_actions: + n_agents = 0 + if group_name in self._env_state: + n_agents = len(self._env_state[group_name][0]) + self._env_actions[group_name] = self._env_specs[ + group_name + ].action_spec.empty_action(n_agents) + step_input = self._generate_step_input(self._env_actions) + with hierarchical_timer("communicator.exchange"): + outputs = self._communicator.exchange(step_input, self._poll_process) + if outputs is None: + raise UnityCommunicatorStoppedException("Communicator has exited.") + self._update_behavior_specs(outputs) + rl_output = outputs.rl_output + self._update_state(rl_output) + self._env_actions.clear() + + @property + def behavior_specs(self) -> MappingType[str, BehaviorSpec]: + return BehaviorMapping(self._env_specs) + + def _assert_behavior_exists(self, behavior_name: str) -> None: + if behavior_name not in self._env_specs: + raise UnityActionException( + f"The group {behavior_name} does not correspond to an existing " + f"agent group in the environment" + ) + + def set_actions(self, behavior_name: BehaviorName, action: ActionTuple) -> None: + self._assert_behavior_exists(behavior_name) + if behavior_name not in self._env_state: + return + action_spec = self._env_specs[behavior_name].action_spec + num_agents = len(self._env_state[behavior_name][0]) + action = action_spec._validate_action(action, num_agents, behavior_name) + self._env_actions[behavior_name] = action + + def set_action_for_agent( + self, behavior_name: BehaviorName, agent_id: AgentId, action: ActionTuple + ) -> None: + self._assert_behavior_exists(behavior_name) + if behavior_name not in self._env_state: + return + action_spec = self._env_specs[behavior_name].action_spec + action = action_spec._validate_action(action, 1, behavior_name) + if behavior_name not in self._env_actions: + num_agents = len(self._env_state[behavior_name][0]) + self._env_actions[behavior_name] = action_spec.empty_action(num_agents) + try: + index = np.where(self._env_state[behavior_name][0].agent_id == agent_id)[0][ + 0 + ] + except IndexError as ie: + raise IndexError( + "agent_id {} is did not request a decision at the previous step".format( + agent_id + ) + ) from ie + if action_spec.continuous_size > 0: + self._env_actions[behavior_name].continuous[index] = action.continuous[0, :] + if action_spec.discrete_size > 0: + 
self._env_actions[behavior_name].discrete[index] = action.discrete[0, :] + + def get_steps( + self, behavior_name: BehaviorName + ) -> Tuple[DecisionSteps, TerminalSteps]: + self._assert_behavior_exists(behavior_name) + return self._env_state[behavior_name] + + def _poll_process(self) -> None: + """ + Check the status of the subprocess. If it has exited, raise a UnityEnvironmentException + :return: None + """ + if not self._process: + return + poll_res = self._process.poll() + if poll_res is not None: + exc_msg = self._returncode_to_env_message(self._process.returncode) + raise UnityEnvironmentException(exc_msg) + + def close(self): + """ + Sends a shutdown signal to the unity environment, and closes the socket connection. + """ + if self._loaded: + self._close() + else: + raise UnityEnvironmentException("No Unity environment is loaded.") + + def _close(self, timeout: Optional[int] = None) -> None: + """ + Close the communicator and environment subprocess (if necessary). + + :int timeout: [Optional] Number of seconds to wait for the environment to shut down before + force-killing it. Defaults to `self.timeout_wait`. + """ + if timeout is None: + timeout = self._timeout_wait + self._loaded = False + self._communicator.close() + if self._process is not None: + # Wait a bit for the process to shutdown, but kill it if it takes too long + try: + self._process.wait(timeout=timeout) + logger.debug(self._returncode_to_env_message(self._process.returncode)) + except subprocess.TimeoutExpired: + logger.warning("Environment timed out shutting down. Killing...") + self._process.kill() + # Set to None so we don't try to close multiple times. + self._process = None + + @timed + def _generate_step_input( + self, vector_action: Dict[str, ActionTuple] + ) -> UnityInputProto: + rl_in = UnityRLInputProto() + for b in vector_action: + n_agents = len(self._env_state[b][0]) + if n_agents == 0: + continue + for i in range(n_agents): + action = AgentActionProto() + if vector_action[b].continuous is not None: + action.vector_actions_deprecated.extend( + vector_action[b].continuous[i] + ) + action.continuous_actions.extend(vector_action[b].continuous[i]) + if vector_action[b].discrete is not None: + action.vector_actions_deprecated.extend( + vector_action[b].discrete[i] + ) + action.discrete_actions.extend(vector_action[b].discrete[i]) + rl_in.agent_actions[b].value.extend([action]) + rl_in.command = STEP + rl_in.side_channel = bytes( + self._side_channel_manager.generate_side_channel_messages() + ) + return self._wrap_unity_input(rl_in) + + def _generate_reset_input(self) -> UnityInputProto: + rl_in = UnityRLInputProto() + rl_in.command = RESET + rl_in.side_channel = bytes( + self._side_channel_manager.generate_side_channel_messages() + ) + return self._wrap_unity_input(rl_in) + + def _send_academy_parameters( + self, init_parameters: UnityRLInitializationInputProto + ) -> UnityOutputProto: + inputs = UnityInputProto() + inputs.rl_initialization_input.CopyFrom(init_parameters) + return self._communicator.initialize(inputs, self._poll_process) + + @staticmethod + def _wrap_unity_input(rl_input: UnityRLInputProto) -> UnityInputProto: + result = UnityInputProto() + result.rl_input.CopyFrom(rl_input) + return result + + @staticmethod + def _returncode_to_signal_name(returncode: int) -> Optional[str]: + """ + Try to convert return codes into their corresponding signal name. + E.g. 
returncode_to_signal_name(-2) -> "SIGINT" + """ + try: + # A negative value -N indicates that the child was terminated by signal N (POSIX only). + s = signal.Signals(-returncode) + return s.name + except Exception: + # Should generally be a ValueError, but catch everything just in case. + return None + + @staticmethod + def _returncode_to_env_message(returncode: int) -> str: + signal_name = UnityEnvironment._returncode_to_signal_name(returncode) + signal_name = f" ({signal_name})" if signal_name else "" + return f"Environment shut down with return code {returncode}{signal_name}." diff --git a/MLPY/Lib/site-packages/mlagents_envs/envs/__init__.py b/MLPY/Lib/site-packages/mlagents_envs/envs/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9d7a4f89414563d505de11853ace56ff470a83ed --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/envs/__init__.py @@ -0,0 +1,15 @@ +from mlagents_envs.registry import default_registry +from mlagents_envs.envs.pettingzoo_env_factory import logger, PettingZooEnvFactory + +# Register each environment in default_registry as a PettingZooEnv +for key in default_registry: + env_name = key + if key[0].isdigit(): + env_name = key.replace("3", "Three") + if not env_name.isidentifier(): + logger.warning( + f"Environment id {env_name} can not be registered since it is" + f"not a valid identifier name." + ) + continue + locals()[env_name] = PettingZooEnvFactory(key) diff --git a/MLPY/Lib/site-packages/mlagents_envs/envs/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/envs/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..20ac2010ce326f37cb4212607df016800ebe6198 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/envs/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/envs/__pycache__/env_helpers.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/envs/__pycache__/env_helpers.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cb7a0abd9f9da9f26fc4f36c61c53f594e873690 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/envs/__pycache__/env_helpers.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/envs/__pycache__/pettingzoo_env_factory.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/envs/__pycache__/pettingzoo_env_factory.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..da7a1625e561a98f90998cfe6571f704a7ae93f7 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/envs/__pycache__/pettingzoo_env_factory.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/envs/__pycache__/unity_aec_env.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/envs/__pycache__/unity_aec_env.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aecd0f8aaca4ceb9f1306977e6e33cd9779d538a Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/envs/__pycache__/unity_aec_env.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/envs/__pycache__/unity_gym_env.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/envs/__pycache__/unity_gym_env.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9cf77cfe2ded45ad576b640bc727c72304571899 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/envs/__pycache__/unity_gym_env.cpython-39.pyc differ diff --git 
a/MLPY/Lib/site-packages/mlagents_envs/envs/__pycache__/unity_parallel_env.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/envs/__pycache__/unity_parallel_env.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c1b490835f74435c12feda4010d7b54da8292a1d Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/envs/__pycache__/unity_parallel_env.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/envs/__pycache__/unity_pettingzoo_base_env.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/envs/__pycache__/unity_pettingzoo_base_env.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c786d28d769100f6caa71a140aa61f4dea6e8855 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/envs/__pycache__/unity_pettingzoo_base_env.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/envs/env_helpers.py b/MLPY/Lib/site-packages/mlagents_envs/envs/env_helpers.py new file mode 100644 index 0000000000000000000000000000000000000000..768e6706038b5cb1030d26f8f0efc40cfe695435 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/envs/env_helpers.py @@ -0,0 +1,76 @@ +from urllib.parse import urlparse, parse_qs + + +def _behavior_to_agent_id(behavior_name: str, unique_id: int) -> str: + return f"{behavior_name}?agent_id={unique_id}" + + +def _agent_id_to_behavior(agent_id: str) -> str: + return agent_id.split("?agent_id=")[0] + + +def _unwrap_batch_steps(batch_steps, behavior_name): + decision_batch, termination_batch = batch_steps + decision_id = [ + _behavior_to_agent_id(behavior_name, i) for i in decision_batch.agent_id + ] + termination_id = [ + _behavior_to_agent_id(behavior_name, i) for i in termination_batch.agent_id + ] + agents = decision_id + termination_id + obs = { + agent_id: [batch_obs[i] for batch_obs in termination_batch.obs] + for i, agent_id in enumerate(termination_id) + } + if decision_batch.action_mask is not None: + obs.update( + { + agent_id: { + "observation": [batch_obs[i] for batch_obs in decision_batch.obs], + "action_mask": [mask[i] for mask in decision_batch.action_mask], + } + for i, agent_id in enumerate(decision_id) + } + ) + else: + obs.update( + { + agent_id: [batch_obs[i] for batch_obs in decision_batch.obs] + for i, agent_id in enumerate(decision_id) + } + ) + obs = {k: v if len(v) > 1 else v[0] for k, v in obs.items()} + dones = {agent_id: True for agent_id in termination_id} + dones.update({agent_id: False for agent_id in decision_id}) + rewards = { + agent_id: termination_batch.reward[i] + for i, agent_id in enumerate(termination_id) + } + rewards.update( + {agent_id: decision_batch.reward[i] for i, agent_id in enumerate(decision_id)} + ) + cumulative_rewards = {k: v for k, v in rewards.items()} + infos = {} + for i, agent_id in enumerate(decision_id): + infos[agent_id] = {} + infos[agent_id]["behavior_name"] = behavior_name + infos[agent_id]["group_id"] = decision_batch.group_id[i] + infos[agent_id]["group_reward"] = decision_batch.group_reward[i] + for i, agent_id in enumerate(termination_id): + infos[agent_id] = {} + infos[agent_id]["behavior_name"] = behavior_name + infos[agent_id]["group_id"] = termination_batch.group_id[i] + infos[agent_id]["group_reward"] = termination_batch.group_reward[i] + infos[agent_id]["interrupted"] = termination_batch.interrupted[i] + id_map = {agent_id: i for i, agent_id in enumerate(decision_id)} + return agents, obs, dones, rewards, cumulative_rewards, infos, id_map + + +def _parse_behavior(full_behavior): + 
parsed = urlparse(full_behavior) + name = parsed.path + ids = parse_qs(parsed.query) + team_id: int = 0 + if "team" in ids: + team_id = int(ids["team"][0]) + return name, team_id diff --git a/MLPY/Lib/site-packages/mlagents_envs/envs/pettingzoo_env_factory.py b/MLPY/Lib/site-packages/mlagents_envs/envs/pettingzoo_env_factory.py new file mode 100644 index 0000000000000000000000000000000000000000..aae82d36e81ef1d4e67913b673b63edeaec2dff9 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/envs/pettingzoo_env_factory.py @@ -0,0 +1,50 @@ +from typing import Optional, Union, List + +from mlagents_envs import logging_util +from mlagents_envs.exception import UnityWorkerInUseException +from mlagents_envs.registry import default_registry +from mlagents_envs.side_channel.engine_configuration_channel import ( + EngineConfigurationChannel, +) +from mlagents_envs.side_channel.environment_parameters_channel import ( + EnvironmentParametersChannel, +) +from mlagents_envs.side_channel.stats_side_channel import StatsSideChannel +from mlagents_envs.envs.unity_aec_env import UnityAECEnv + +logger = logging_util.get_logger(__name__) + + +class PettingZooEnvFactory: + def __init__(self, env_id: str) -> None: + self.env_id = env_id + + def env( + self, seed: Optional[int] = None, **kwargs: Union[List, int, bool, None] + ) -> UnityAECEnv: + """ + Creates the environment with env_id from unity's default_registry and wraps it in a UnityToPettingZooWrapper + :param seed: The seed for the action spaces of the agents. + :param kwargs: Any argument accepted by `UnityEnvironment`class except file_name + """ + # If not side_channels specified, add the followings + if "side_channels" not in kwargs: + kwargs["side_channels"] = [ + EngineConfigurationChannel(), + EnvironmentParametersChannel(), + StatsSideChannel(), + ] + _env = None + # If no base port argument is provided, try ports starting at 6000 until one is free + if "base_port" not in kwargs: + port = 6000 + while _env is None: + try: + kwargs["base_port"] = port + _env = default_registry[self.env_id].make(**kwargs) + except UnityWorkerInUseException: + port += 1 + pass + else: + _env = default_registry[self.env_id].make(**kwargs) + return UnityAECEnv(_env, seed) diff --git a/MLPY/Lib/site-packages/mlagents_envs/envs/unity_aec_env.py b/MLPY/Lib/site-packages/mlagents_envs/envs/unity_aec_env.py new file mode 100644 index 0000000000000000000000000000000000000000..4bb6fdf390998755b9443c6e0d687175cb41f7cc --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/envs/unity_aec_env.py @@ -0,0 +1,72 @@ +from typing import Any, Optional +from gym import error +from mlagents_envs.base_env import BaseEnv +from pettingzoo import AECEnv + +from mlagents_envs.envs.unity_pettingzoo_base_env import UnityPettingzooBaseEnv + + +class UnityAECEnv(UnityPettingzooBaseEnv, AECEnv): + """ + Unity AEC (PettingZoo) environment wrapper. + """ + + def __init__(self, env: BaseEnv, seed: Optional[int] = None): + """ + Initializes a Unity AEC environment wrapper. + + :param env: The UnityEnvironment that is being wrapped. + :param seed: The seed for the action spaces of the agents. + """ + super().__init__(env, seed) + + def step(self, action: Any) -> None: + """ + Sets the action of the active agent and get the observation, reward, done + and info of the next agent. 
+ :param action: The action for the active agent + """ + self._assert_loaded() + if len(self._live_agents) <= 0: + raise error.Error( + "You must reset the environment before you can perform a step" + ) + + # Process action + current_agent = self._agents[self._agent_index] + self._process_action(current_agent, action) + + self._agent_index += 1 + # Reset reward + for k in self._rewards.keys(): + self._rewards[k] = 0 + + if self._agent_index >= len(self._agents) and self.num_agents > 0: + # The index is too high, time to set the action for the agents we have + self._step() + self._live_agents.sort() # unnecessary, only for passing API test + + def observe(self, agent_id): + """ + Returns the observation an agent currently can make. `last()` calls this function. + """ + return ( + self._observations[agent_id], + self._cumm_rewards[agent_id], + self._dones[agent_id], + self._infos[agent_id], + ) + + def last(self, observe=True): + """ + returns observation, cumulative reward, done, info for the current agent (specified by self.agent_selection) + """ + obs, reward, done, info = self.observe(self._agents[self._agent_index]) + return obs if observe else None, reward, done, info + + @property + def agent_selection(self): + if not self._live_agents: + # If we had an agent finish then return that agent even though it isn't alive. + return self._agents[0] + return self._agents[self._agent_index] diff --git a/MLPY/Lib/site-packages/mlagents_envs/envs/unity_gym_env.py b/MLPY/Lib/site-packages/mlagents_envs/envs/unity_gym_env.py new file mode 100644 index 0000000000000000000000000000000000000000..df29a95c9ab79d01389a2e66f9ebfcabc72c77ec --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/envs/unity_gym_env.py @@ -0,0 +1,360 @@ +import itertools + +import numpy as np +from typing import Any, Dict, List, Optional, Tuple, Union + +import gym +from gym import error, spaces + +from mlagents_envs.base_env import ActionTuple, BaseEnv +from mlagents_envs.base_env import DecisionSteps, TerminalSteps +from mlagents_envs import logging_util + + +class UnityGymException(error.Error): + """ + Any error related to the gym wrapper of ml-agents. + """ + + pass + + +logger = logging_util.get_logger(__name__) +GymStepResult = Tuple[np.ndarray, float, bool, Dict] + + +class UnityToGymWrapper(gym.Env): + """ + Provides Gym wrapper for Unity Learning Environments. + """ + + def __init__( + self, + unity_env: BaseEnv, + uint8_visual: bool = False, + flatten_branched: bool = False, + allow_multiple_obs: bool = False, + action_space_seed: Optional[int] = None, + ): + """ + Environment initialization + :param unity_env: The Unity BaseEnv to be wrapped in the gym. Will be closed when the UnityToGymWrapper closes. + :param uint8_visual: Return visual observations as uint8 (0-255) matrices instead of float (0.0-1.0). + :param flatten_branched: If True, turn branched discrete action spaces into a Discrete space rather than + MultiDiscrete. + :param allow_multiple_obs: If True, return a list of np.ndarrays as observations with the first elements + containing the visual observations and the last element containing the array of vector observations. + If False, returns a single np.ndarray containing either only a single visual observation or the array of + vector observations. + :param action_space_seed: If non-None, will be used to set the random seed on created gym.Space instances. 
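A minimal usage sketch for the gym wrapper, assuming a single-agent environment (either the Editor via file_name=None, or a build whose name you would supply yourself):

from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.envs.unity_gym_env import UnityToGymWrapper

unity_env = UnityEnvironment(file_name=None)  # or a single-agent build of your own
env = UnityToGymWrapper(unity_env, uint8_visual=False, allow_multiple_obs=False)
obs = env.reset()
obs, reward, done, info = env.step(env.action_space.sample())
env.close()  # also closes the wrapped UnityEnvironment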
+ """ + self._env = unity_env + + # Take a single step so that the brain information will be sent over + if not self._env.behavior_specs: + self._env.step() + + self.visual_obs = None + + # Save the step result from the last time all Agents requested decisions. + self._previous_decision_step: Optional[DecisionSteps] = None + self._flattener = None + # Hidden flag used by Atari environments to determine if the game is over + self.game_over = False + self._allow_multiple_obs = allow_multiple_obs + + # Check brain configuration + if len(self._env.behavior_specs) != 1: + raise UnityGymException( + "There can only be one behavior in a UnityEnvironment " + "if it is wrapped in a gym." + ) + + self.name = list(self._env.behavior_specs.keys())[0] + self.group_spec = self._env.behavior_specs[self.name] + + if self._get_n_vis_obs() == 0 and self._get_vec_obs_size() == 0: + raise UnityGymException( + "There are no observations provided by the environment." + ) + + if not self._get_n_vis_obs() >= 1 and uint8_visual: + logger.warning( + "uint8_visual was set to true, but visual observations are not in use. " + "This setting will not have any effect." + ) + else: + self.uint8_visual = uint8_visual + if ( + self._get_n_vis_obs() + self._get_vec_obs_size() >= 2 + and not self._allow_multiple_obs + ): + logger.warning( + "The environment contains multiple observations. " + "You must define allow_multiple_obs=True to receive them all. " + "Otherwise, only the first visual observation (or vector observation if" + "there are no visual observations) will be provided in the observation." + ) + + # Check for number of agents in scene. + self._env.reset() + decision_steps, _ = self._env.get_steps(self.name) + self._check_agents(len(decision_steps)) + self._previous_decision_step = decision_steps + + # Set action spaces + if self.group_spec.action_spec.is_discrete(): + self.action_size = self.group_spec.action_spec.discrete_size + branches = self.group_spec.action_spec.discrete_branches + if self.group_spec.action_spec.discrete_size == 1: + self._action_space = spaces.Discrete(branches[0]) + else: + if flatten_branched: + self._flattener = ActionFlattener(branches) + self._action_space = self._flattener.action_space + else: + self._action_space = spaces.MultiDiscrete(branches) + + elif self.group_spec.action_spec.is_continuous(): + if flatten_branched: + logger.warning( + "The environment has a non-discrete action space. It will " + "not be flattened." + ) + + self.action_size = self.group_spec.action_spec.continuous_size + high = np.array([1] * self.group_spec.action_spec.continuous_size) + self._action_space = spaces.Box(-high, high, dtype=np.float32) + else: + raise UnityGymException( + "The gym wrapper does not provide explicit support for both discrete " + "and continuous actions." 
+ ) + + if action_space_seed is not None: + self._action_space.seed(action_space_seed) + + # Set observations space + list_spaces: List[gym.Space] = [] + shapes = self._get_vis_obs_shape() + for shape in shapes: + if uint8_visual: + list_spaces.append(spaces.Box(0, 255, dtype=np.uint8, shape=shape)) + else: + list_spaces.append(spaces.Box(0, 1, dtype=np.float32, shape=shape)) + if self._get_vec_obs_size() > 0: + # vector observation is last + high = np.array([np.inf] * self._get_vec_obs_size()) + list_spaces.append(spaces.Box(-high, high, dtype=np.float32)) + if self._allow_multiple_obs: + self._observation_space = spaces.Tuple(list_spaces) + else: + self._observation_space = list_spaces[0] # only return the first one + + def reset(self) -> Union[List[np.ndarray], np.ndarray]: + """Resets the state of the environment and returns an initial observation. + Returns: observation (object/list): the initial observation of the + space. + """ + self._env.reset() + decision_step, _ = self._env.get_steps(self.name) + n_agents = len(decision_step) + self._check_agents(n_agents) + self.game_over = False + + res: GymStepResult = self._single_step(decision_step) + return res[0] + + def step(self, action: List[Any]) -> GymStepResult: + """Run one timestep of the environment's dynamics. When end of + episode is reached, you are responsible for calling `reset()` + to reset this environment's state. + Accepts an action and returns a tuple (observation, reward, done, info). + Args: + action (object/list): an action provided by the environment + Returns: + observation (object/list): agent's observation of the current environment + reward (float/list) : amount of reward returned after previous action + done (boolean/list): whether the episode has ended. + info (dict): contains auxiliary diagnostic information. + """ + if self.game_over: + raise UnityGymException( + "You are calling 'step()' even though this environment has already " + "returned done = True. You must always call 'reset()' once you " + "receive 'done = True'." 
+ ) + if self._flattener is not None: + # Translate action into list + action = self._flattener.lookup_action(action) + + action = np.array(action).reshape((1, self.action_size)) + + action_tuple = ActionTuple() + if self.group_spec.action_spec.is_continuous(): + action_tuple.add_continuous(action) + else: + action_tuple.add_discrete(action) + self._env.set_actions(self.name, action_tuple) + + self._env.step() + decision_step, terminal_step = self._env.get_steps(self.name) + self._check_agents(max(len(decision_step), len(terminal_step))) + if len(terminal_step) != 0: + # The agent is done + self.game_over = True + return self._single_step(terminal_step) + else: + return self._single_step(decision_step) + + def _single_step(self, info: Union[DecisionSteps, TerminalSteps]) -> GymStepResult: + if self._allow_multiple_obs: + visual_obs = self._get_vis_obs_list(info) + visual_obs_list = [] + for obs in visual_obs: + visual_obs_list.append(self._preprocess_single(obs[0])) + default_observation = visual_obs_list + if self._get_vec_obs_size() >= 1: + default_observation.append(self._get_vector_obs(info)[0, :]) + else: + if self._get_n_vis_obs() >= 1: + visual_obs = self._get_vis_obs_list(info) + default_observation = self._preprocess_single(visual_obs[0][0]) + else: + default_observation = self._get_vector_obs(info)[0, :] + + if self._get_n_vis_obs() >= 1: + visual_obs = self._get_vis_obs_list(info) + self.visual_obs = self._preprocess_single(visual_obs[0][0]) + + done = isinstance(info, TerminalSteps) + + return (default_observation, info.reward[0], done, {"step": info}) + + def _preprocess_single(self, single_visual_obs: np.ndarray) -> np.ndarray: + if self.uint8_visual: + return (255.0 * single_visual_obs).astype(np.uint8) + else: + return single_visual_obs + + def _get_n_vis_obs(self) -> int: + result = 0 + for obs_spec in self.group_spec.observation_specs: + if len(obs_spec.shape) == 3: + result += 1 + return result + + def _get_vis_obs_shape(self) -> List[Tuple]: + result: List[Tuple] = [] + for obs_spec in self.group_spec.observation_specs: + if len(obs_spec.shape) == 3: + result.append(obs_spec.shape) + return result + + def _get_vis_obs_list( + self, step_result: Union[DecisionSteps, TerminalSteps] + ) -> List[np.ndarray]: + result: List[np.ndarray] = [] + for obs in step_result.obs: + if len(obs.shape) == 4: + result.append(obs) + return result + + def _get_vector_obs( + self, step_result: Union[DecisionSteps, TerminalSteps] + ) -> np.ndarray: + result: List[np.ndarray] = [] + for obs in step_result.obs: + if len(obs.shape) == 2: + result.append(obs) + return np.concatenate(result, axis=1) + + def _get_vec_obs_size(self) -> int: + result = 0 + for obs_spec in self.group_spec.observation_specs: + if len(obs_spec.shape) == 1: + result += obs_spec.shape[0] + return result + + def render(self, mode="rgb_array"): + """ + Return the latest visual observations. + Note that it will not render a new frame of the environment. + """ + return self.visual_obs + + def close(self) -> None: + """Override _close in your subclass to perform any necessary cleanup. + Environments will automatically close() themselves when + garbage collected or when the program exits. + """ + self._env.close() + + def seed(self, seed: Any = None) -> None: + """Sets the seed for this env's random number generator(s). + Currently not implemented. 
+ """ + logger.warning("Could not seed environment %s", self.name) + return + + @staticmethod + def _check_agents(n_agents: int) -> None: + if n_agents > 1: + raise UnityGymException( + f"There can only be one Agent in the environment but {n_agents} were detected." + ) + + @property + def metadata(self): + return {"render.modes": ["rgb_array"]} + + @property + def reward_range(self) -> Tuple[float, float]: + return -float("inf"), float("inf") + + @property + def action_space(self) -> gym.Space: + return self._action_space + + @property + def observation_space(self): + return self._observation_space + + +class ActionFlattener: + """ + Flattens branched discrete action spaces into single-branch discrete action spaces. + """ + + def __init__(self, branched_action_space): + """ + Initialize the flattener. + :param branched_action_space: A List containing the sizes of each branch of the action + space, e.g. [2,3,3] for three branches with size 2, 3, and 3 respectively. + """ + self._action_shape = branched_action_space + self.action_lookup = self._create_lookup(self._action_shape) + self.action_space = spaces.Discrete(len(self.action_lookup)) + + @classmethod + def _create_lookup(self, branched_action_space): + """ + Creates a Dict that maps discrete actions (scalars) to branched actions (lists). + Each key in the Dict maps to one unique set of branched actions, and each value + contains the List of branched actions. + """ + possible_vals = [range(_num) for _num in branched_action_space] + all_actions = [list(_action) for _action in itertools.product(*possible_vals)] + # Dict should be faster than List for large action spaces + action_lookup = { + _scalar: _action for (_scalar, _action) in enumerate(all_actions) + } + return action_lookup + + def lookup_action(self, action): + """ + Convert a scalar discrete action into a unique set of branched actions. + :param action: A scalar value representing one of the discrete actions. + :returns: The List containing the branched actions. + """ + return self.action_lookup[action] diff --git a/MLPY/Lib/site-packages/mlagents_envs/envs/unity_parallel_env.py b/MLPY/Lib/site-packages/mlagents_envs/envs/unity_parallel_env.py new file mode 100644 index 0000000000000000000000000000000000000000..09398d27fa8369d3af63629d18b93fca8dc218a7 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/envs/unity_parallel_env.py @@ -0,0 +1,53 @@ +from typing import Optional, Dict, Any, Tuple +from gym import error +from mlagents_envs.base_env import BaseEnv +from pettingzoo import ParallelEnv + +from mlagents_envs.envs.unity_pettingzoo_base_env import UnityPettingzooBaseEnv + + +class UnityParallelEnv(UnityPettingzooBaseEnv, ParallelEnv): + """ + Unity Parallel (PettingZoo) environment wrapper. + """ + + def __init__(self, env: BaseEnv, seed: Optional[int] = None): + """ + Initializes a Unity Parallel environment wrapper. + + :param env: The UnityEnvironment that is being wrapped. + :param seed: The seed for the action spaces of the agents. + """ + super().__init__(env, seed) + + def reset(self) -> Dict[str, Any]: + """ + Resets the environment. + """ + super().reset() + + return self._observations + + def step(self, actions: Dict[str, Any]) -> Tuple: + self._assert_loaded() + if len(self._live_agents) <= 0 and actions: + raise error.Error( + "You must reset the environment before you can perform a step." 
+ ) + + # Process actions + for current_agent, action in actions.items(): + self._process_action(current_agent, action) + + # Reset reward + for k in self._rewards.keys(): + self._rewards[k] = 0 + + # Step environment + self._step() + + # Agent cleanup and sorting + self._cleanup_agents() + self._live_agents.sort() # unnecessary, only for passing API test + + return self._observations, self._rewards, self._dones, self._infos diff --git a/MLPY/Lib/site-packages/mlagents_envs/envs/unity_pettingzoo_base_env.py b/MLPY/Lib/site-packages/mlagents_envs/envs/unity_pettingzoo_base_env.py new file mode 100644 index 0000000000000000000000000000000000000000..3457f18c882643346c020fc1057a7d0c194e51f9 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/envs/unity_pettingzoo_base_env.py @@ -0,0 +1,323 @@ +import atexit +from typing import Optional, List, Set, Dict, Any, Tuple +import numpy as np +from gym import error, spaces +from mlagents_envs.base_env import BaseEnv, ActionTuple +from mlagents_envs.envs.env_helpers import _agent_id_to_behavior, _unwrap_batch_steps + + +class UnityPettingzooBaseEnv: + """ + Unity Petting Zoo base environment. + """ + + def __init__( + self, env: BaseEnv, seed: Optional[int] = None, metadata: Optional[dict] = None + ): + super().__init__() + atexit.register(self.close) + self._env = env + self.metadata = metadata + self._assert_loaded() + + self._agent_index = 0 + self._seed = seed + self._side_channel_dict = { + type(v).__name__: v + for v in self._env._side_channel_manager._side_channels_dict.values() # type: ignore + } + + self._live_agents: List[str] = [] # agent id for agents alive + self._agents: List[str] = [] # all agent id in current step + self._possible_agents: Set[str] = set() # all agents that have ever appear + self._agent_id_to_index: Dict[str, int] = {} # agent_id: index in decision step + self._observations: Dict[str, np.ndarray] = {} # agent_id: obs + self._dones: Dict[str, bool] = {} # agent_id: done + self._rewards: Dict[str, float] = {} # agent_id: reward + self._cumm_rewards: Dict[str, float] = {} # agent_id: reward + self._infos: Dict[str, Dict] = {} # agent_id: info + self._action_spaces: Dict[str, spaces.Space] = {} # behavior_name: action_space + self._observation_spaces: Dict[ + str, spaces.Space + ] = {} # behavior_name: obs_space + self._current_action: Dict[str, ActionTuple] = {} # behavior_name: ActionTuple + # Take a single step so that the brain information will be sent over + if not self._env.behavior_specs: + self._env.step() + for behavior_name in self._env.behavior_specs.keys(): + _, _, _ = self._batch_update(behavior_name) + self._update_observation_spaces() + self._update_action_spaces() + + def _assert_loaded(self) -> None: + if self._env is None: + raise error.Error("No environment loaded") + + @property + def observation_spaces(self) -> Dict[str, spaces.Space]: + """ + Return the observation spaces of all the agents. + """ + return { + agent_id: self._observation_spaces[_agent_id_to_behavior(agent_id)] + for agent_id in self._possible_agents + } + + def observation_space(self, agent: str) -> Optional[spaces.Space]: + """ + The observation space of the current agent. 
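A hedged sketch of a typical interaction loop with the UnityParallelEnv wrapper above; the executable path is a placeholder and actions are sampled at random from each live agent's action space.

from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.envs.unity_parallel_env import UnityParallelEnv

unity_env = UnityEnvironment(file_name="path/to/MyUnityBuild")  # placeholder path
env = UnityParallelEnv(unity_env, seed=1)

observations = env.reset()
for _ in range(100):
    # One action per live agent, drawn from that agent's space.
    actions = {agent: env.action_space(agent).sample() for agent in env.agents}
    observations, rewards, dones, infos = env.step(actions)
    if not env.agents:                 # all agents finished their episodes
        observations = env.reset()
env.close()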
+ """ + behavior_name = _agent_id_to_behavior(agent) + return self._observation_spaces[behavior_name] + + def _update_observation_spaces(self) -> None: + self._assert_loaded() + for behavior_name in self._env.behavior_specs.keys(): + if behavior_name not in self._observation_spaces: + obs_spec = self._env.behavior_specs[behavior_name].observation_specs + obs_spaces = tuple( + spaces.Box( + low=-np.float32(np.inf), + high=np.float32(np.inf), + shape=spec.shape, + dtype=np.float32, + ) + for spec in obs_spec + ) + if len(obs_spaces) == 1: + self._observation_spaces[behavior_name] = obs_spaces[0] + else: + self._observation_spaces[behavior_name] = spaces.Tuple(obs_spaces) + + @property + def action_spaces(self) -> Dict[str, spaces.Space]: + """ + Return the action spaces of all the agents. + """ + return { + agent_id: self._action_spaces[_agent_id_to_behavior(agent_id)] + for agent_id in self._possible_agents + } + + def action_space(self, agent: str) -> Optional[spaces.Space]: + """ + The action space of the current agent. + """ + behavior_name = _agent_id_to_behavior(agent) + return self._action_spaces[behavior_name] + + def _update_action_spaces(self) -> None: + self._assert_loaded() + for behavior_name in self._env.behavior_specs.keys(): + if behavior_name not in self._action_spaces: + act_spec = self._env.behavior_specs[behavior_name].action_spec + if ( + act_spec.continuous_size == 0 + and len(act_spec.discrete_branches) == 0 + ): + raise error.Error("No actions found") + if act_spec.discrete_size == 1: + d_space = spaces.Discrete(act_spec.discrete_branches[0]) + if self._seed is not None: + d_space.seed(self._seed) + if act_spec.continuous_size == 0: + self._action_spaces[behavior_name] = d_space + continue + if act_spec.discrete_size > 0: + d_space = spaces.MultiDiscrete(act_spec.discrete_branches) + if self._seed is not None: + d_space.seed(self._seed) + if act_spec.continuous_size == 0: + self._action_spaces[behavior_name] = d_space + continue + if act_spec.continuous_size > 0: + c_space = spaces.Box( + -1, 1, (act_spec.continuous_size,), dtype=np.int32 + ) + if self._seed is not None: + c_space.seed(self._seed) + if len(act_spec.discrete_branches) == 0: + self._action_spaces[behavior_name] = c_space + continue + self._action_spaces[behavior_name] = spaces.Tuple((c_space, d_space)) + + def _process_action(self, current_agent, action): + current_action_space = self.action_space(current_agent) + # Convert actions + if action is not None: + if isinstance(action, Tuple): + action = tuple(np.array(a) for a in action) + else: + action = self._action_to_np(current_action_space, action) + if not current_action_space.contains(action): # type: ignore + raise error.Error( + f"Invalid action, got {action} but was expecting action from {self.action_space}" + ) + if isinstance(current_action_space, spaces.Tuple): + action = ActionTuple(action[0], action[1]) + elif isinstance(current_action_space, spaces.MultiDiscrete): + action = ActionTuple(None, action) + elif isinstance(current_action_space, spaces.Discrete): + action = ActionTuple(None, np.array(action).reshape(1, 1)) + else: + action = ActionTuple(action, None) + + if not self._dones[current_agent]: + current_behavior = _agent_id_to_behavior(current_agent) + current_index = self._agent_id_to_index[current_agent] + if action.continuous is not None: + self._current_action[current_behavior].continuous[ + current_index + ] = action.continuous[0] + if action.discrete is not None: + self._current_action[current_behavior].discrete[ + current_index 
+ ] = action.discrete[0] + else: + self._live_agents.remove(current_agent) + del self._observations[current_agent] + del self._dones[current_agent] + del self._rewards[current_agent] + del self._cumm_rewards[current_agent] + del self._infos[current_agent] + + def _step(self): + for behavior_name, actions in self._current_action.items(): + self._env.set_actions(behavior_name, actions) + self._env.step() + self._reset_states() + for behavior_name in self._env.behavior_specs.keys(): + dones, rewards, cumulative_rewards = self._batch_update(behavior_name) + self._dones.update(dones) + self._rewards.update(rewards) + self._cumm_rewards.update(cumulative_rewards) + self._agent_index = 0 + + def _cleanup_agents(self): + for current_agent, done in self.dones.items(): + if done: + self._live_agents.remove(current_agent) + + @property + def side_channel(self) -> Dict[str, Any]: + """ + The side channels of the environment. You can access the side channels + of an environment with `env.side_channel[]`. + """ + self._assert_loaded() + return self._side_channel_dict + + @staticmethod + def _action_to_np(current_action_space, action): + return np.array(action, dtype=current_action_space.dtype) + + def _create_empty_actions(self, behavior_name, num_agents): + a_spec = self._env.behavior_specs[behavior_name].action_spec + return ActionTuple( + np.zeros((num_agents, a_spec.continuous_size), dtype=np.float32), + np.zeros((num_agents, len(a_spec.discrete_branches)), dtype=np.int32), + ) + + @property + def _cumulative_rewards(self): + return self._cumm_rewards + + def _reset_states(self): + self._live_agents = [] + self._agents = [] + self._observations = {} + self._dones = {} + self._rewards = {} + self._cumm_rewards = {} + self._infos = {} + self._agent_id_to_index = {} + + def reset(self): + """ + Resets the environment. + """ + self._assert_loaded() + self._agent_index = 0 + self._reset_states() + self._possible_agents = set() + self._env.reset() + for behavior_name in self._env.behavior_specs.keys(): + _, _, _ = self._batch_update(behavior_name) + self._live_agents.sort() # unnecessary, only for passing API test + self._dones = {agent: False for agent in self._agents} + self._rewards = {agent: 0 for agent in self._agents} + self._cumm_rewards = {agent: 0 for agent in self._agents} + + def _batch_update(self, behavior_name): + current_batch = self._env.get_steps(behavior_name) + self._current_action[behavior_name] = self._create_empty_actions( + behavior_name, len(current_batch[0]) + ) + ( + agents, + obs, + dones, + rewards, + cumulative_rewards, + infos, + id_map, + ) = _unwrap_batch_steps(current_batch, behavior_name) + self._live_agents += agents + self._agents += agents + self._observations.update(obs) + self._infos.update(infos) + self._agent_id_to_index.update(id_map) + self._possible_agents.update(agents) + return dones, rewards, cumulative_rewards + + def seed(self, seed=None): + """ + Reseeds the environment (making the resulting environment deterministic). + `reset()` must be called after `seed()`, and before `step()`. + """ + self._seed = seed + + def render(self, mode="human"): + """ + NOT SUPPORTED. + + Displays a rendered frame from the environment, if supported. + Alternate render modes in the default environments are `'rgb_array'` + which returns a numpy array and is supported by all environments outside of classic, + and `'ansi'` which returns the strings printed (specific to classic environments). 
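The space-type branching in _process_action above can be summarised in isolation as follows; this is a sketch of the same conversion, not part of the wrapper itself.

import numpy as np
from gym import spaces
from mlagents_envs.base_env import ActionTuple

def to_action_tuple(space: spaces.Space, action) -> ActionTuple:
    # Mirrors the branching used by the PettingZoo wrapper above.
    if isinstance(space, spaces.Tuple):          # hybrid: (continuous, discrete)
        return ActionTuple(np.array(action[0]), np.array(action[1]))
    if isinstance(space, spaces.MultiDiscrete):  # one value per discrete branch
        return ActionTuple(None, np.array(action, dtype=np.int32))
    if isinstance(space, spaces.Discrete):       # single branch, reshaped to (1, 1)
        return ActionTuple(None, np.array(action, dtype=np.int32).reshape(1, 1))
    return ActionTuple(np.array(action, dtype=np.float32), None)  # Box: continuous

print(to_action_tuple(spaces.Discrete(3), 2).discrete.shape)  # (1, 1)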
+ """ + pass + + @property + def dones(self): + return dict(self._dones) + + @property + def agents(self): + return sorted(self._live_agents) + + @property + def rewards(self): + return dict(self._rewards) + + @property + def infos(self): + return dict(self._infos) + + @property + def possible_agents(self): + return sorted(self._possible_agents) + + def close(self) -> None: + """ + Close the environment. + """ + if self._env is not None: + self._env.close() + self._env = None # type: ignore + + def __del__(self) -> None: + self.close() + + def state(self): + pass diff --git a/MLPY/Lib/site-packages/mlagents_envs/exception.py b/MLPY/Lib/site-packages/mlagents_envs/exception.py new file mode 100644 index 0000000000000000000000000000000000000000..324cdd0969984da52f6736674edda66ee5a628d8 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/exception.py @@ -0,0 +1,86 @@ +class UnityException(Exception): + """ + Any error related to ml-agents environment. + """ + + pass + + +class UnityEnvironmentException(UnityException): + """ + Related to errors starting and closing environment. + """ + + pass + + +class UnityCommunicationException(UnityException): + """ + Related to errors with the communicator. + """ + + pass + + +class UnityCommunicatorStoppedException(UnityException): + """ + Raised when communicator has stopped gracefully. + """ + + pass + + +class UnityObservationException(UnityException): + """ + Related to errors with receiving observations. + """ + + pass + + +class UnityActionException(UnityException): + """ + Related to errors with sending actions. + """ + + pass + + +class UnityTimeOutException(UnityException): + """ + Related to errors with communication timeouts. + """ + + pass + + +class UnitySideChannelException(UnityException): + """ + Related to errors with side channels. + """ + + pass + + +class UnityWorkerInUseException(UnityException): + """ + This error occurs when the port for a certain worker ID is already reserved. + """ + + MESSAGE_TEMPLATE = ( + "Couldn't start socket communication because worker number {} is still in use. " + "You may need to manually close a previously opened environment " + "or use a different worker number." + ) + + def __init__(self, worker_id): + message = self.MESSAGE_TEMPLATE.format(str(worker_id)) + super().__init__(message) + + +class UnityPolicyException(UnityException): + """ + Related to errors with the Trainer. + """ + + pass diff --git a/MLPY/Lib/site-packages/mlagents_envs/logging_util.py b/MLPY/Lib/site-packages/mlagents_envs/logging_util.py new file mode 100644 index 0000000000000000000000000000000000000000..ddd4e3b3cf56eed10afe4208b68ce89543e2377b --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/logging_util.py @@ -0,0 +1,63 @@ +import logging # noqa I251 +import sys + +CRITICAL = logging.CRITICAL +FATAL = logging.FATAL +ERROR = logging.ERROR +WARNING = logging.WARNING +INFO = logging.INFO +DEBUG = logging.DEBUG +NOTSET = logging.NOTSET + +_loggers = set() +_log_level = NOTSET +DATE_FORMAT = "%Y-%m-%d %H:%M:%S" +DEBUG_LOG_FORMAT = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" +LOG_FORMAT = "[%(levelname)s] %(message)s" + + +def get_logger(name: str) -> logging.Logger: + """ + Create a logger with the specified name. 
The logger will use the log level + specified by set_log_level() + """ + logger = logging.getLogger(name=name) + + if _log_level == DEBUG: + formatter = logging.Formatter(fmt=DEBUG_LOG_FORMAT, datefmt=DATE_FORMAT) + else: + formatter = logging.Formatter(fmt=LOG_FORMAT) + handler = logging.StreamHandler(stream=sys.stdout) + handler.setFormatter(formatter) + logger.addHandler(handler) + + # If we've already set the log level, make sure new loggers use it + if _log_level != NOTSET: + logger.setLevel(_log_level) + + # Keep track of this logger so that we can change the log level later + _loggers.add(logger) + return logger + + +def set_log_level(log_level: int) -> None: + """ + Set the ML-Agents logging level. This will also configure the logging format (if it hasn't already been set). + """ + global _log_level + _log_level = log_level + + for logger in _loggers: + logger.setLevel(log_level) + + if log_level == DEBUG: + formatter = logging.Formatter(fmt=DEBUG_LOG_FORMAT, datefmt=DATE_FORMAT) + else: + formatter = logging.Formatter(LOG_FORMAT) + _set_formatter_for_all_loggers(formatter) + + +def _set_formatter_for_all_loggers(formatter: logging.Formatter) -> None: + for logger in _loggers: + for handler in logger.handlers[:]: + handler.setFormatter(formatter) diff --git a/MLPY/Lib/site-packages/mlagents_envs/mock_communicator.py b/MLPY/Lib/site-packages/mlagents_envs/mock_communicator.py new file mode 100644 index 0000000000000000000000000000000000000000..0e425e2759e354d80b16f19b350743ea61acb7e8 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/mock_communicator.py @@ -0,0 +1,111 @@ +from typing import Optional + +from .communicator import Communicator, PollCallback +from .environment import UnityEnvironment +from mlagents_envs.communicator_objects.unity_rl_output_pb2 import UnityRLOutputProto +from mlagents_envs.communicator_objects.brain_parameters_pb2 import ( + BrainParametersProto, + ActionSpecProto, +) +from mlagents_envs.communicator_objects.unity_rl_initialization_output_pb2 import ( + UnityRLInitializationOutputProto, +) +from mlagents_envs.communicator_objects.unity_input_pb2 import UnityInputProto +from mlagents_envs.communicator_objects.unity_output_pb2 import UnityOutputProto +from mlagents_envs.communicator_objects.agent_info_pb2 import AgentInfoProto +from mlagents_envs.communicator_objects.observation_pb2 import ( + ObservationProto, + NONE as COMPRESSION_TYPE_NONE, + PNG as COMPRESSION_TYPE_PNG, +) + + +class MockCommunicator(Communicator): + def __init__( + self, + discrete_action=False, + visual_inputs=0, + num_agents=3, + brain_name="RealFakeBrain", + vec_obs_size=3, + ): + """ + Python side of the grpc communication. 
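A small sketch of how these helpers fit together; switching to DEBUG also switches every logger created through get_logger() to the verbose format with timestamps, file names and line numbers.

from mlagents_envs import logging_util

logging_util.set_log_level(logging_util.INFO)
logger = logging_util.get_logger(__name__)
logger.info("Starting run")        # emitted as "[INFO] Starting run"

logging_util.set_log_level(logging_util.DEBUG)
logger.debug("Now visible, with the detailed format")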
Python is the client and Unity the server + """ + super().__init__() + self.is_discrete = discrete_action + self.steps = 0 + self.visual_inputs = visual_inputs + self.has_been_closed = False + self.num_agents = num_agents + self.brain_name = brain_name + self.vec_obs_size = vec_obs_size + + def initialize( + self, inputs: UnityInputProto, poll_callback: Optional[PollCallback] = None + ) -> UnityOutputProto: + if self.is_discrete: + action_spec = ActionSpecProto( + num_discrete_actions=2, discrete_branch_sizes=[3, 2] + ) + else: + action_spec = ActionSpecProto(num_continuous_actions=2) + bp = BrainParametersProto( + brain_name=self.brain_name, is_training=True, action_spec=action_spec + ) + rl_init = UnityRLInitializationOutputProto( + name="RealFakeAcademy", + communication_version=UnityEnvironment.API_VERSION, + package_version="mock_package_version", + log_path="", + brain_parameters=[bp], + ) + output = UnityRLOutputProto(agentInfos=self._get_agent_infos()) + return UnityOutputProto(rl_initialization_output=rl_init, rl_output=output) + + def _get_agent_infos(self): + dict_agent_info = {} + list_agent_info = [] + vector_obs = [1, 2, 3] + + observations = [ + ObservationProto( + compressed_data=None, + shape=[30, 40, 3], + compression_type=COMPRESSION_TYPE_PNG, + ) + for _ in range(self.visual_inputs) + ] + vector_obs_proto = ObservationProto( + float_data=ObservationProto.FloatData(data=vector_obs), + shape=[len(vector_obs)], + compression_type=COMPRESSION_TYPE_NONE, + ) + observations.append(vector_obs_proto) + + for i in range(self.num_agents): + list_agent_info.append( + AgentInfoProto( + reward=1, + done=(i == 2), + max_step_reached=False, + id=i, + observations=observations, + ) + ) + dict_agent_info["RealFakeBrain"] = UnityRLOutputProto.ListAgentInfoProto( + value=list_agent_info + ) + return dict_agent_info + + def exchange( + self, inputs: UnityInputProto, poll_callback: Optional[PollCallback] = None + ) -> UnityOutputProto: + result = UnityRLOutputProto(agentInfos=self._get_agent_infos()) + return UnityOutputProto(rl_output=result) + + def close(self): + """ + Sends a shutdown signal to the unity environment, and closes the grpc connection. 
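A sketch of how the mock might be exercised in a test; it only relies on fields constructed by the code above, and it is an illustration rather than an official test helper.

from mlagents_envs.mock_communicator import MockCommunicator
from mlagents_envs.communicator_objects.unity_input_pb2 import UnityInputProto

comm = MockCommunicator(discrete_action=True, visual_inputs=1, num_agents=3)
init_output = comm.initialize(UnityInputProto())
print(init_output.rl_initialization_output.brain_parameters[0].brain_name)  # RealFakeBrain

step_output = comm.exchange(UnityInputProto())
agent_infos = step_output.rl_output.agentInfos["RealFakeBrain"].value
print(len(agent_infos))  # 3 agents, the third flagged as done
comm.close()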
+ """ + self.has_been_closed = True diff --git a/MLPY/Lib/site-packages/mlagents_envs/registry/__init__.py b/MLPY/Lib/site-packages/mlagents_envs/registry/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..74c0e8a7090fbf045dedf89409459e69263794ef --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/registry/__init__.py @@ -0,0 +1,4 @@ +from mlagents_envs.registry.unity_env_registry import ( # noqa F401 + default_registry, + UnityEnvRegistry, +) diff --git a/MLPY/Lib/site-packages/mlagents_envs/registry/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/registry/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..56a19980c23960888efa8c46cdad6cbefd46f41f Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/registry/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/registry/__pycache__/base_registry_entry.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/registry/__pycache__/base_registry_entry.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..63a47e9bb0c2b40769c0eb88b76786abccaf3e42 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/registry/__pycache__/base_registry_entry.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/registry/__pycache__/binary_utils.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/registry/__pycache__/binary_utils.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..77dd5e8a06462e026f0a46e6583e9e5f29136590 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/registry/__pycache__/binary_utils.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/registry/__pycache__/remote_registry_entry.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/registry/__pycache__/remote_registry_entry.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d22903f8d94c0cfe5fb46d27cf17504091f3f7b4 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/registry/__pycache__/remote_registry_entry.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/registry/__pycache__/unity_env_registry.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/registry/__pycache__/unity_env_registry.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1d9d9e30860e115eacadf669feb9da4bc1aaf6d8 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/registry/__pycache__/unity_env_registry.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/registry/base_registry_entry.py b/MLPY/Lib/site-packages/mlagents_envs/registry/base_registry_entry.py new file mode 100644 index 0000000000000000000000000000000000000000..f72009f2a950749e199cacf800fa7cbce9a95e33 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/registry/base_registry_entry.py @@ -0,0 +1,56 @@ +from abc import abstractmethod +from typing import Any, Optional +from mlagents_envs.base_env import BaseEnv + + +class BaseRegistryEntry: + def __init__( + self, + identifier: str, + expected_reward: Optional[float], + description: Optional[str], + ): + """ + BaseRegistryEntry allows launching a Unity Environment with its make method. + :param identifier: The name of the Unity Environment. + :param expected_reward: The cumulative reward that an Agent must receive + for the task to be considered solved. 
+ :param description: A description of the Unity Environment. Contains human + readable information about potential special arguments that the make method can + take as well as information regarding the observation, reward, actions, + behaviors and number of agents in the Environment. + """ + self._identifier = identifier + self._expected_reward = expected_reward + self._description = description + + @property + def identifier(self) -> str: + """ + The unique identifier of the entry + """ + return self._identifier + + @property + def expected_reward(self) -> Optional[float]: + """ + The cumulative reward that an Agent must receive for the task to be considered + solved. + """ + return self._expected_reward + + @property + def description(self) -> Optional[str]: + """ + A description of the Unity Environment the entry can make. + """ + return self._description + + @abstractmethod + def make(self, **kwargs: Any) -> BaseEnv: + """ + This method creates a Unity BaseEnv (usually a UnityEnvironment). + """ + raise NotImplementedError( + f"The make() method not implemented for entry {self.identifier}" + ) diff --git a/MLPY/Lib/site-packages/mlagents_envs/registry/binary_utils.py b/MLPY/Lib/site-packages/mlagents_envs/registry/binary_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..41078a46795261c433e25c0327100469166af1c5 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/registry/binary_utils.py @@ -0,0 +1,259 @@ +import urllib.request +import tempfile +import os +import uuid +import shutil +import glob + +import yaml +import hashlib + +from zipfile import ZipFile +from sys import platform +from typing import Tuple, Optional, Dict, Any + +from filelock import FileLock + +from mlagents_envs.env_utils import validate_environment_path + +from mlagents_envs.logging_util import get_logger + +logger = get_logger(__name__) + +# The default logical block size is 8192 bytes (8 KB) for UFS file systems. +BLOCK_SIZE = 8192 + + +def get_local_binary_path(name: str, url: str, tmp_dir: Optional[str] = None) -> str: + """ + Returns the path to the executable previously downloaded with the name argument. If + None is found, the executable at the url argument will be downloaded and stored + under name for future uses. + :param name: The name that will be given to the folder containing the extracted data + :param url: The URL of the zip file + :param: tmp_dir: Optional override for the temporary directory to save binaries and zips in. + """ + NUMBER_ATTEMPTS = 5 + tmp_dir = tmp_dir or tempfile.gettempdir() + lock = FileLock(os.path.join(tmp_dir, name + ".lock")) + with lock: + path = get_local_binary_path_if_exists(name, url, tmp_dir=tmp_dir) + if path is None: + logger.debug( + f"Local environment {name} not found, downloading environment from {url}" + ) + for attempt in range( + NUMBER_ATTEMPTS + ): # Perform 5 attempts at downloading the file + if path is not None: + break + try: + download_and_extract_zip(url, name, tmp_dir=tmp_dir) + except Exception: + if attempt + 1 < NUMBER_ATTEMPTS: + logger.warning( + f"Attempt {attempt + 1} / {NUMBER_ATTEMPTS}" + ": Failed to download and extract binary." 
+ ) + else: + raise + path = get_local_binary_path_if_exists(name, url, tmp_dir=tmp_dir) + + if path is None: + raise FileNotFoundError( + f"Binary not found, make sure {url} is a valid url to " + "a zip folder containing a valid Unity executable" + ) + return path + + +def get_local_binary_path_if_exists(name: str, url: str, tmp_dir: str) -> Optional[str]: + """ + Recursively searches for a Unity executable in the extracted files folders. This is + platform dependent : It will only return a Unity executable compatible with the + computer's OS. If no executable is found, None will be returned. + :param name: The name/identifier of the executable + :param url: The url the executable was downloaded from (for verification) + :param: tmp_dir: Optional override for the temporary directory to save binaries and zips in. + """ + _, bin_dir = get_tmp_dirs(tmp_dir) + extension = None + + if platform == "linux" or platform == "linux2": + extension = "*.x86_64" + if platform == "darwin": + extension = "*.app" + if platform == "win32": + extension = "*.exe" + if extension is None: + raise NotImplementedError("No extensions found for this platform.") + url_hash = "-" + hashlib.md5(url.encode()).hexdigest() + path = os.path.join(bin_dir, name + url_hash, "**", extension) + candidates = glob.glob(path, recursive=True) + if len(candidates) == 0: + return None + else: + for c in candidates: + # Unity sometimes produces another .exe file that we must filter out + if "UnityCrashHandler64" not in c: + # If the file is not valid, return None and delete faulty directory + if validate_environment_path(c) is None: + shutil.rmtree(c) + return None + return c + return None + + +def _get_tmp_dir_helper(tmp_dir: Optional[str] = None) -> Tuple[str, str]: + tmp_dir = tmp_dir or ("/tmp" if platform == "darwin" else tempfile.gettempdir()) + MLAGENTS = "ml-agents-binaries" + TMP_FOLDER_NAME = "tmp" + BINARY_FOLDER_NAME = "binaries" + mla_directory = os.path.join(tmp_dir, MLAGENTS) + if not os.path.exists(mla_directory): + os.makedirs(mla_directory) + os.chmod(mla_directory, 16877) + zip_directory = os.path.join(tmp_dir, MLAGENTS, TMP_FOLDER_NAME) + if not os.path.exists(zip_directory): + os.makedirs(zip_directory) + os.chmod(zip_directory, 16877) + bin_directory = os.path.join(tmp_dir, MLAGENTS, BINARY_FOLDER_NAME) + if not os.path.exists(bin_directory): + os.makedirs(bin_directory) + os.chmod(bin_directory, 16877) + return zip_directory, bin_directory + + +def get_tmp_dirs(tmp_dir: Optional[str] = None) -> Tuple[str, str]: + """ + Returns the path to the folder containing the downloaded zip files and the extracted + binaries. If these folders do not exist, they will be created. + :retrun: Tuple containing path to : (zip folder, extracted files folder) + """ + # TODO: Once we don't use python 3.7 we should just use exists_ok=True when creating the dirs to avoid this. + # Should only be able to error out 3 times (once for each subdir). + for _attempt in range(3): + try: + return _get_tmp_dir_helper(tmp_dir) + except FileExistsError: + continue + return _get_tmp_dir_helper(tmp_dir) + + +def download_and_extract_zip( + url: str, name: str, tmp_dir: Optional[str] = None +) -> None: + """ + Downloads a zip file under a URL, extracts its contents into a folder with the name + argument and gives chmod 755 to all the files it contains. Files are downloaded and + extracted into special folders in the temp folder of the machine. 
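A sketch of the intended call pattern, with a placeholder name and URL: the helper caches the extracted build under the machine's temp directory and returns the path to the platform-specific executable, which can then be passed to UnityEnvironment.

from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.registry.binary_utils import get_local_binary_path

# Placeholder name and URL for a zipped Unity build.
exe_path = get_local_binary_path("MyEnv", "https://example.com/MyEnv.zip")
env = UnityEnvironment(file_name=exe_path)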
+ :param url: The URL of the zip file + :param name: The name that will be given to the folder containing the extracted data + :param: tmp_dir: Optional override for the temporary directory to save binaries and zips in. + """ + zip_dir, bin_dir = get_tmp_dirs(tmp_dir) + url_hash = "-" + hashlib.md5(url.encode()).hexdigest() + binary_path = os.path.join(bin_dir, name + url_hash) + if os.path.exists(binary_path): + shutil.rmtree(binary_path) + + # Download zip + try: + request = urllib.request.urlopen(url, timeout=30) + except urllib.error.HTTPError as e: # type: ignore + e.reason = f"{e.reason} {url}" + raise + zip_size = int(request.headers["content-length"]) + zip_file_path = os.path.join(zip_dir, str(uuid.uuid4()) + ".zip") + with open(zip_file_path, "wb") as zip_file: + downloaded = 0 + while True: + buffer = request.read(BLOCK_SIZE) + if not buffer: + # There is nothing more to read + break + downloaded += len(buffer) + zip_file.write(buffer) + downloaded_percent = downloaded / zip_size * 100 + print_progress(f" Downloading {name}", downloaded_percent) + print("") + + # Extraction + with ZipFileWithProgress(zip_file_path, "r") as zip_ref: + zip_ref.extract_zip(f" Extracting {name}", binary_path) # type: ignore + print("") + + # Clean up zip + print_progress(f" Cleaning up {name}", 0) + os.remove(zip_file_path) + + # Give permission + for f in glob.glob(binary_path + "/**/*", recursive=True): + # 16877 is octal 40755, which denotes a directory with permissions 755 + os.chmod(f, 16877) + print_progress(f" Cleaning up {name}", 100) + print("") + + +def print_progress(prefix: str, percent: float) -> None: + """ + Displays a single progress bar in the terminal with value percent. + :param prefix: The string that will precede the progress bar. + :param percent: The percent progression of the bar (min is 0, max is 100) + """ + BAR_LEN = 20 + percent = min(100, max(0, percent)) + bar_progress = min(int(percent / 100 * BAR_LEN), BAR_LEN) + bar = "|" + "\u2588" * bar_progress + " " * (BAR_LEN - bar_progress) + "|" + str_percent = "%3.0f%%" % percent + print(f"{prefix} : {bar} {str_percent} \r", end="", flush=True) + + +def load_remote_manifest(url: str) -> Dict[str, Any]: + """ + Converts a remote yaml file into a Python dictionary + """ + tmp_dir, _ = get_tmp_dirs() + try: + request = urllib.request.urlopen(url, timeout=30) + except urllib.error.HTTPError as e: # type: ignore + e.reason = f"{e.reason} {url}" + raise + manifest_path = os.path.join(tmp_dir, str(uuid.uuid4()) + ".yaml") + with open(manifest_path, "wb") as manifest: + while True: + buffer = request.read(BLOCK_SIZE) + if not buffer: + # There is nothing more to read + break + manifest.write(buffer) + try: + result = load_local_manifest(manifest_path) + finally: + os.remove(manifest_path) + return result + + +def load_local_manifest(path: str) -> Dict[str, Any]: + """ + Converts a local yaml file into a Python dictionary + """ + with open(path) as data_file: + return yaml.safe_load(data_file) + + +class ZipFileWithProgress(ZipFile): + """ + This is a helper class inheriting from ZipFile that allows to display a progress + bar while the files are being extracted. 
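print_progress simply redraws a single text bar in place, so it can be reused for any long-running loop; a trivial sketch:

import time
from mlagents_envs.registry.binary_utils import print_progress

total = 50
for done in range(total + 1):
    print_progress(" Copying files", done / total * 100)
    time.sleep(0.01)
print("")  # move past the carriage-return based bar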
+ """ + + def extract_zip(self, prefix: str, path: str) -> None: + members = self.namelist() + path = os.fspath(path) + total = len(members) + n = 0 + for zipinfo in members: + self.extract(zipinfo, path, None) # type: ignore + n += 1 + print_progress(prefix, n / total * 100) diff --git a/MLPY/Lib/site-packages/mlagents_envs/registry/remote_registry_entry.py b/MLPY/Lib/site-packages/mlagents_envs/registry/remote_registry_entry.py new file mode 100644 index 0000000000000000000000000000000000000000..816d7331ba70e0faf26fbd7957ec888670cc123c --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/registry/remote_registry_entry.py @@ -0,0 +1,75 @@ +from sys import platform +from typing import Optional, Any, List +from mlagents_envs.environment import UnityEnvironment +from mlagents_envs.base_env import BaseEnv +from mlagents_envs.registry.binary_utils import get_local_binary_path +from mlagents_envs.registry.base_registry_entry import BaseRegistryEntry + + +class RemoteRegistryEntry(BaseRegistryEntry): + def __init__( + self, + identifier: str, + expected_reward: Optional[float], + description: Optional[str], + linux_url: Optional[str], + darwin_url: Optional[str], + win_url: Optional[str], + additional_args: Optional[List[str]] = None, + tmp_dir: Optional[str] = None, + ): + """ + A RemoteRegistryEntry is an implementation of BaseRegistryEntry that uses a + Unity executable downloaded from the internet to launch a UnityEnvironment. + __Note__: The url provided must be a link to a `.zip` file containing a single + compressed folder with the executable inside. There can only be one executable + in the folder and it must be at the root of the folder. + :param identifier: The name of the Unity Environment. + :param expected_reward: The cumulative reward that an Agent must receive + for the task to be considered solved. + :param description: A description of the Unity Environment. Contains human + readable information about potential special arguments that the make method can + take as well as information regarding the observation, reward, actions, + behaviors and number of agents in the Environment. + :param linux_url: The url of the Unity executable for the Linux platform + :param darwin_url: The url of the Unity executable for the OSX platform + :param win_url: The url of the Unity executable for the Windows platform + """ + super().__init__(identifier, expected_reward, description) + self._linux_url = linux_url + self._darwin_url = darwin_url + self._win_url = win_url + self._add_args = additional_args + self._tmp_dir_override = tmp_dir + + def make(self, **kwargs: Any) -> BaseEnv: + """ + Returns the UnityEnvironment that corresponds to the Unity executable found at + the provided url. 
The arguments passed to this method will be passed to the + constructor of the UnityEnvironment (except for the file_name argument) + """ + url = None + if platform == "linux" or platform == "linux2": + url = self._linux_url + if platform == "darwin": + url = self._darwin_url + if platform == "win32": + url = self._win_url + if url is None: + raise FileNotFoundError( + f"The entry {self.identifier} does not contain a valid url for this " + "platform" + ) + path = get_local_binary_path( + self.identifier, url, tmp_dir=self._tmp_dir_override + ) + if "file_name" in kwargs: + kwargs.pop("file_name") + args: List[str] = [] + if "additional_args" in kwargs: + if kwargs["additional_args"] is not None: + args += kwargs["additional_args"] + if self._add_args is not None: + args += self._add_args + kwargs["additional_args"] = args + return UnityEnvironment(file_name=path, **kwargs) diff --git a/MLPY/Lib/site-packages/mlagents_envs/registry/unity_env_registry.py b/MLPY/Lib/site-packages/mlagents_envs/registry/unity_env_registry.py new file mode 100644 index 0000000000000000000000000000000000000000..639f85794774c9271cda215e2aaddbf3a5552852 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/registry/unity_env_registry.py @@ -0,0 +1,125 @@ +from typing import Dict, Iterator, Any, List +from collections.abc import Mapping +from mlagents_envs.registry.base_registry_entry import BaseRegistryEntry +from mlagents_envs.registry.binary_utils import ( + load_local_manifest, + load_remote_manifest, +) +from mlagents_envs.registry.remote_registry_entry import RemoteRegistryEntry + + +class UnityEnvRegistry(Mapping): + """ + ### UnityEnvRegistry + Provides a library of Unity environments that can be launched without the need + of downloading the Unity Editor. + The UnityEnvRegistry implements a Map, to access an entry of the Registry, use: + ```python + registry = UnityEnvRegistry() + entry = registry[] + ``` + An entry has the following properties : + * `identifier` : Uniquely identifies this environment + * `expected_reward` : Corresponds to the reward an agent must obtained for the task + to be considered completed. + * `description` : A human readable description of the environment. + + To launch a Unity environment from a registry entry, use the `make` method: + ```python + registry = UnityEnvRegistry() + env = registry[].make() + ``` + """ + + def __init__(self): + self._REGISTERED_ENVS: Dict[str, BaseRegistryEntry] = {} + self._manifests: List[str] = [] + self._sync = True + + def register(self, new_entry: BaseRegistryEntry) -> None: + """ + Registers a new BaseRegistryEntry to the registry. The + BaseRegistryEntry.identifier value will be used as indexing key. + If two are more environments are registered under the same key, the most + recentry added will replace the others. + """ + self._REGISTERED_ENVS[new_entry.identifier] = new_entry + + def register_from_yaml(self, path_to_yaml: str) -> None: + """ + Registers the environments listed in a yaml file (either local or remote). Note + that the entries are registered lazily: the registration will only happen when + an environment is accessed. + The yaml file must have the following format : + ```yaml + environments: + - : + expected_reward: + description: | + + linux_url: + darwin_url: + win_url: + + - : + expected_reward: + description: | + + linux_url: + darwin_url: + win_url: + + - ... 
+ ``` + :param path_to_yaml: A local path or url to the yaml file + """ + self._manifests.append(path_to_yaml) + self._sync = False + + def _load_all_manifests(self) -> None: + if not self._sync: + for path_to_yaml in self._manifests: + if path_to_yaml[:4] == "http": + manifest = load_remote_manifest(path_to_yaml) + else: + manifest = load_local_manifest(path_to_yaml) + for env in manifest["environments"]: + remote_entry_args = list(env.values())[0] + remote_entry_args["identifier"] = list(env.keys())[0] + self.register(RemoteRegistryEntry(**remote_entry_args)) + self._manifests = [] + self._sync = True + + def clear(self) -> None: + """ + Deletes all entries in the registry. + """ + self._REGISTERED_ENVS.clear() + self._manifests = [] + self._sync = True + + def __getitem__(self, identifier: str) -> BaseRegistryEntry: + """ + Returns the BaseRegistryEntry with the provided identifier. BaseRegistryEntry + can then be used to make a Unity Environment. + :param identifier: The identifier of the BaseRegistryEntry + :returns: The associated BaseRegistryEntry + """ + self._load_all_manifests() + if identifier not in self._REGISTERED_ENVS: + raise KeyError(f"The entry {identifier} is not present in the registry.") + return self._REGISTERED_ENVS[identifier] + + def __len__(self) -> int: + self._load_all_manifests() + return len(self._REGISTERED_ENVS) + + def __iter__(self) -> Iterator[Any]: + self._load_all_manifests() + yield from self._REGISTERED_ENVS + + +default_registry = UnityEnvRegistry() +default_registry.register_from_yaml( + "https://storage.googleapis.com/mlagents-test-environments/1.0.0/manifest.yaml" +) # noqa E501 diff --git a/MLPY/Lib/site-packages/mlagents_envs/rpc_communicator.py b/MLPY/Lib/site-packages/mlagents_envs/rpc_communicator.py new file mode 100644 index 0000000000000000000000000000000000000000..13c0df08b1c36241dd3858c47dc82366746c87e9 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/rpc_communicator.py @@ -0,0 +1,158 @@ +import grpc +from typing import Optional + +from multiprocessing import Pipe +from sys import platform +import socket +import time +from concurrent.futures import ThreadPoolExecutor + +from .communicator import Communicator, PollCallback +from mlagents_envs.communicator_objects.unity_to_external_pb2_grpc import ( + UnityToExternalProtoServicer, + add_UnityToExternalProtoServicer_to_server, +) +from mlagents_envs.communicator_objects.unity_message_pb2 import UnityMessageProto +from mlagents_envs.communicator_objects.unity_input_pb2 import UnityInputProto +from mlagents_envs.communicator_objects.unity_output_pb2 import UnityOutputProto +from .exception import UnityTimeOutException, UnityWorkerInUseException + + +class UnityToExternalServicerImplementation(UnityToExternalProtoServicer): + def __init__(self): + self.parent_conn, self.child_conn = Pipe() + + def Initialize(self, request, context): + self.child_conn.send(request) + return self.child_conn.recv() + + def Exchange(self, request, context): + self.child_conn.send(request) + return self.child_conn.recv() + + +class RpcCommunicator(Communicator): + def __init__(self, worker_id=0, base_port=5005, timeout_wait=30): + """ + Python side of the grpc communication. Python is the server and Unity the client + + + :int base_port: Baseline port number to connect to Unity environment over. worker_id increments over this. + :int worker_id: Offset from base_port. Used for training multiple environments simultaneously. + :int timeout_wait: Timeout (in seconds) to wait for a response before exiting. 
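A sketch of browsing the registry above as the Mapping it implements; the identifiers come from the remote manifest, so the one used for make() is left as whatever the loop last yielded rather than a hard-coded name.

from mlagents_envs.registry import default_registry

# The registry behaves like a read-only mapping of identifier -> entry.
for identifier in default_registry:
    entry = default_registry[identifier]
    print(identifier, entry.expected_reward, entry.description)

env = default_registry[identifier].make()   # pick any identifier listed above
env.close()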
+ """ + super().__init__(worker_id, base_port) + self.port = base_port + worker_id + self.worker_id = worker_id + self.timeout_wait = timeout_wait + self.server = None + self.unity_to_external = None + self.is_open = False + self.create_server() + + def create_server(self): + """ + Creates the GRPC server. + """ + self.check_port(self.port) + + try: + # Establish communication grpc + self.server = grpc.server( + thread_pool=ThreadPoolExecutor(max_workers=10), + options=(("grpc.so_reuseport", 1),), + ) + self.unity_to_external = UnityToExternalServicerImplementation() + add_UnityToExternalProtoServicer_to_server( + self.unity_to_external, self.server + ) + # Using unspecified address, which means that grpc is communicating on all IPs + # This is so that the docker container can connect. + self.server.add_insecure_port("[::]:" + str(self.port)) + self.server.start() + self.is_open = True + except Exception: + raise UnityWorkerInUseException(self.worker_id) + + def check_port(self, port): + """ + Attempts to bind to the requested communicator port, checking if it is already in use. + """ + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + if platform == "linux" or platform == "linux2": + # On linux, the port remains unusable for TIME_WAIT=60 seconds after closing + # SO_REUSEADDR frees the port right after closing the environment + s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + try: + s.bind(("localhost", port)) + except OSError: + raise UnityWorkerInUseException(self.worker_id) + finally: + s.close() + + def poll_for_timeout(self, poll_callback: Optional[PollCallback] = None) -> None: + """ + Polls the GRPC parent connection for data, to be used before calling recv. This prevents + us from hanging indefinitely in the case where the environment process has died or was not + launched. + + Additionally, a callback can be passed to periodically check the state of the environment. + This is used to detect the case when the environment dies without cleaning up the connection, + so that we can stop sooner and raise a more appropriate error. + """ + deadline = time.monotonic() + self.timeout_wait + callback_timeout_wait = self.timeout_wait // 10 + while time.monotonic() < deadline: + if self.unity_to_external.parent_conn.poll(callback_timeout_wait): + # Got an acknowledgment from the connection + return + if poll_callback: + # Fire the callback - if it detects something wrong, it should raise an exception. + poll_callback() + + # Got this far without reading any data from the connection, so it must be dead. + raise UnityTimeOutException( + "The Unity environment took too long to respond. Make sure that :\n" + "\t The environment does not need user interaction to launch\n" + '\t The Agents\' Behavior Parameters > Behavior Type is set to "Default"\n' + "\t The environment and the Python interface have compatible versions.\n" + "\t If you're running on a headless server without graphics support, turn off display " + "by either passing --no-graphics option or build your Unity executable as server build." 
+ ) + + def initialize( + self, inputs: UnityInputProto, poll_callback: Optional[PollCallback] = None + ) -> UnityOutputProto: + self.poll_for_timeout(poll_callback) + aca_param = self.unity_to_external.parent_conn.recv().unity_output + message = UnityMessageProto() + message.header.status = 200 + message.unity_input.CopyFrom(inputs) + self.unity_to_external.parent_conn.send(message) + self.unity_to_external.parent_conn.recv() + return aca_param + + def exchange( + self, inputs: UnityInputProto, poll_callback: Optional[PollCallback] = None + ) -> Optional[UnityOutputProto]: + message = UnityMessageProto() + message.header.status = 200 + message.unity_input.CopyFrom(inputs) + self.unity_to_external.parent_conn.send(message) + self.poll_for_timeout(poll_callback) + output = self.unity_to_external.parent_conn.recv() + if output.header.status != 200: + return None + return output.unity_output + + def close(self): + """ + Sends a shutdown signal to the unity environment, and closes the grpc connection. + """ + if self.is_open: + message_input = UnityMessageProto() + message_input.header.status = 400 + self.unity_to_external.parent_conn.send(message_input) + self.unity_to_external.parent_conn.close() + self.server.stop(False) + self.is_open = False diff --git a/MLPY/Lib/site-packages/mlagents_envs/rpc_utils.py b/MLPY/Lib/site-packages/mlagents_envs/rpc_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..f2e3d1d4684d998434d714092b0010bdd12653cb --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/rpc_utils.py @@ -0,0 +1,431 @@ +from mlagents_envs.base_env import ( + ActionSpec, + ObservationSpec, + DimensionProperty, + BehaviorSpec, + DecisionSteps, + TerminalSteps, + ObservationType, +) +from mlagents_envs.exception import UnityObservationException +from mlagents_envs.timers import hierarchical_timer, timed +from mlagents_envs.communicator_objects.agent_info_pb2 import AgentInfoProto +from mlagents_envs.communicator_objects.observation_pb2 import ( + ObservationProto, + NONE as COMPRESSION_TYPE_NONE, +) +from mlagents_envs.communicator_objects.brain_parameters_pb2 import BrainParametersProto +import numpy as np +import io +from typing import cast, List, Tuple, Collection, Optional, Iterable +from PIL import Image + + +PNG_HEADER = b"\x89PNG\r\n\x1a\n" + + +def behavior_spec_from_proto( + brain_param_proto: BrainParametersProto, agent_info: AgentInfoProto +) -> BehaviorSpec: + """ + Converts brain parameter and agent info proto to BehaviorSpec object. + :param brain_param_proto: protobuf object. + :param agent_info: protobuf object. + :return: BehaviorSpec object. 
+ """ + observation_specs = [] + for obs in agent_info.observations: + observation_specs.append( + ObservationSpec( + name=obs.name, + shape=tuple(obs.shape), + observation_type=ObservationType(obs.observation_type), + dimension_property=tuple( + DimensionProperty(dim) for dim in obs.dimension_properties + ) + if len(obs.dimension_properties) > 0 + else (DimensionProperty.UNSPECIFIED,) * len(obs.shape), + ) + ) + + # proto from communicator < v1.3 does not set action spec, use deprecated fields instead + if ( + brain_param_proto.action_spec.num_continuous_actions == 0 + and brain_param_proto.action_spec.num_discrete_actions == 0 + ): + if brain_param_proto.vector_action_space_type_deprecated == 1: + action_spec = ActionSpec( + brain_param_proto.vector_action_size_deprecated[0], () + ) + else: + action_spec = ActionSpec( + 0, tuple(brain_param_proto.vector_action_size_deprecated) + ) + else: + action_spec_proto = brain_param_proto.action_spec + action_spec = ActionSpec( + action_spec_proto.num_continuous_actions, + tuple(branch for branch in action_spec_proto.discrete_branch_sizes), + ) + return BehaviorSpec(observation_specs, action_spec) + + +class OffsetBytesIO: + """ + Simple file-like class that wraps a bytes, and allows moving its "start" + position in the bytes. This is only used for reading concatenated PNGs, + because Pillow always calls seek(0) at the start of reading. + """ + + __slots__ = ["fp", "offset"] + + def __init__(self, data: bytes): + self.fp = io.BytesIO(data) + self.offset = 0 + + def seek(self, offset: int, whence: int = io.SEEK_SET) -> int: + if whence == io.SEEK_SET: + res = self.fp.seek(offset + self.offset) + return res - self.offset + raise NotImplementedError() + + def tell(self) -> int: + return self.fp.tell() - self.offset + + def read(self, size: int = -1) -> bytes: + return self.fp.read(size) + + def original_tell(self) -> int: + """ + Returns the offset into the original byte array + """ + return self.fp.tell() + + +@timed +def process_pixels( + image_bytes: bytes, expected_channels: int, mappings: Optional[List[int]] = None +) -> np.ndarray: + """ + Converts byte array observation image into numpy array, re-sizes it, + and optionally converts it to grey scale + :param image_bytes: input byte array corresponding to image + :param expected_channels: Expected output channels + :return: processed numpy array of observation from environment + """ + image_fp = OffsetBytesIO(image_bytes) + + image_arrays = [] + # Read the images back from the bytes (without knowing the sizes). + while True: + with hierarchical_timer("image_decompress"): + image = Image.open(image_fp) + # Normally Image loads lazily, load() forces it to do loading in the timer scope. + image.load() + image_arrays.append(np.array(image, dtype=np.float32) / 255.0) + + # Look for the next header, starting from the current stream location + try: + new_offset = image_bytes.index(PNG_HEADER, image_fp.original_tell()) + image_fp.offset = new_offset + except ValueError: + # Didn't find the header, so must be at the end. + break + + if mappings is not None and len(mappings) > 0: + return _process_images_mapping(image_arrays, mappings) + else: + return _process_images_num_channels(image_arrays, expected_channels) + + +def _process_images_mapping(image_arrays, mappings): + """ + Helper function for processing decompressed images with compressed channel mappings. 
+ """ + image_arrays = np.concatenate(image_arrays, axis=2).transpose((2, 0, 1)) + + if len(mappings) != len(image_arrays): + raise UnityObservationException( + f"Compressed observation and its mapping had different number of channels - " + f"observation had {len(image_arrays)} channels but its mapping had {len(mappings)} channels" + ) + if len({m for m in mappings if m > -1}) != max(mappings) + 1: + raise UnityObservationException( + f"Invalid Compressed Channel Mapping: the mapping {mappings} does not have the correct format." + ) + if max(mappings) >= len(image_arrays): + raise UnityObservationException( + f"Invalid Compressed Channel Mapping: the mapping has index larger than the total " + f"number of channels in observation - mapping index {max(mappings)} is" + f"invalid for input observation with {len(image_arrays)} channels." + ) + + processed_image_arrays: List[np.array] = [[] for _ in range(max(mappings) + 1)] + for mapping_idx, img in zip(mappings, image_arrays): + if mapping_idx > -1: + processed_image_arrays[mapping_idx].append(img) + + for i, img_array in enumerate(processed_image_arrays): + processed_image_arrays[i] = np.mean(img_array, axis=0) + img = np.stack(processed_image_arrays, axis=2) + return img + + +def _process_images_num_channels(image_arrays, expected_channels): + """ + Helper function for processing decompressed images with number of expected channels. + This is for old API without mapping provided. Use the first n channel, n=expected_channels. + """ + if expected_channels == 1: + # Convert to grayscale + img = np.mean(image_arrays[0], axis=2) + img = np.reshape(img, [img.shape[0], img.shape[1], 1]) + else: + img = np.concatenate(image_arrays, axis=2) + # We can drop additional channels since they may need to be added to include + # numbers of observation channels not divisible by 3. + actual_channels = list(img.shape)[2] + if actual_channels > expected_channels: + img = img[..., 0:expected_channels] + return img + + +def _check_observations_match_spec( + obs_index: int, + observation_spec: ObservationSpec, + agent_info_list: Collection[AgentInfoProto], +) -> None: + """ + Check that all the observations match the expected size. + This gives a nicer error than a cryptic numpy error later. + """ + expected_obs_shape = tuple(observation_spec.shape) + for agent_info in agent_info_list: + agent_obs_shape = tuple(agent_info.observations[obs_index].shape) + if expected_obs_shape != agent_obs_shape: + raise UnityObservationException( + f"Observation at index={obs_index} for agent with " + f"id={agent_info.id} didn't match the ObservationSpec. " + f"Expected shape {expected_obs_shape} but got {agent_obs_shape}." + ) + + +@timed +def _observation_to_np_array( + obs: ObservationProto, expected_shape: Optional[Iterable[int]] = None +) -> np.ndarray: + """ + Converts observation proto into numpy array of the appropriate size. + :param obs: observation proto to be converted + :param expected_shape: optional shape information, used for sanity checks. 
+ :return: processed numpy array of observation from environment + """ + if expected_shape is not None: + if list(obs.shape) != list(expected_shape): + raise UnityObservationException( + f"Observation did not have the expected shape - got {obs.shape} but expected {expected_shape}" + ) + expected_channels = obs.shape[2] + if obs.compression_type == COMPRESSION_TYPE_NONE: + img = np.array(obs.float_data.data, dtype=np.float32) + img = np.reshape(img, obs.shape) + return img + else: + img = process_pixels( + obs.compressed_data, expected_channels, list(obs.compressed_channel_mapping) + ) + # Compare decompressed image size to observation shape and make sure they match + if list(obs.shape) != list(img.shape): + raise UnityObservationException( + f"Decompressed observation did not have the expected shape - " + f"decompressed had {img.shape} but expected {obs.shape}" + ) + return img + + +@timed +def _process_maybe_compressed_observation( + obs_index: int, + observation_spec: ObservationSpec, + agent_info_list: Collection[AgentInfoProto], +) -> np.ndarray: + shape = cast(Tuple[int, int, int], observation_spec.shape) + if len(agent_info_list) == 0: + return np.zeros((0, shape[0], shape[1], shape[2]), dtype=np.float32) + + try: + batched_visual = [ + _observation_to_np_array(agent_obs.observations[obs_index], shape) + for agent_obs in agent_info_list + ] + except ValueError: + # Try to get a more useful error message + _check_observations_match_spec(obs_index, observation_spec, agent_info_list) + # If that didn't raise anything, raise the original error + raise + return np.array(batched_visual, dtype=np.float32) + + +def _raise_on_nan_and_inf(data: np.array, source: str) -> np.array: + # Check for NaNs or Infinite values in the observation or reward data. + # If there's a NaN in the observations, the np.mean() result will be NaN + # If there's an Infinite value (either sign) then the result will be Inf + # See https://stackoverflow.com/questions/6736590/fast-check-for-nan-in-numpy for background + # Note that a very large values (larger than sqrt(float_max)) will result in an Inf value here + # Raise a Runtime error in the case that NaNs or Infinite values make it into the data. 
+ if data.size == 0: + return data + + d = np.mean(data) + has_nan = np.isnan(d) + has_inf = not np.isfinite(d) + + if has_nan: + raise RuntimeError(f"The {source} provided had NaN values.") + if has_inf: + raise RuntimeError(f"The {source} provided had Infinite values.") + + +@timed +def _process_rank_one_or_two_observation( + obs_index: int, + observation_spec: ObservationSpec, + agent_info_list: Collection[AgentInfoProto], +) -> np.ndarray: + if len(agent_info_list) == 0: + return np.zeros((0,) + observation_spec.shape, dtype=np.float32) + try: + np_obs = np.array( + [ + agent_obs.observations[obs_index].float_data.data + for agent_obs in agent_info_list + ], + dtype=np.float32, + ).reshape((len(agent_info_list),) + observation_spec.shape) + except ValueError: + # Try to get a more useful error message + _check_observations_match_spec(obs_index, observation_spec, agent_info_list) + # If that didn't raise anything, raise the original error + raise + _raise_on_nan_and_inf(np_obs, "observations") + return np_obs + + +@timed +def steps_from_proto( + agent_info_list: Collection[AgentInfoProto], behavior_spec: BehaviorSpec +) -> Tuple[DecisionSteps, TerminalSteps]: + decision_agent_info_list = [ + agent_info for agent_info in agent_info_list if not agent_info.done + ] + terminal_agent_info_list = [ + agent_info for agent_info in agent_info_list if agent_info.done + ] + decision_obs_list: List[np.ndarray] = [] + terminal_obs_list: List[np.ndarray] = [] + for obs_index, observation_spec in enumerate(behavior_spec.observation_specs): + is_visual = len(observation_spec.shape) == 3 + if is_visual: + decision_obs_list.append( + _process_maybe_compressed_observation( + obs_index, observation_spec, decision_agent_info_list + ) + ) + terminal_obs_list.append( + _process_maybe_compressed_observation( + obs_index, observation_spec, terminal_agent_info_list + ) + ) + else: + decision_obs_list.append( + _process_rank_one_or_two_observation( + obs_index, observation_spec, decision_agent_info_list + ) + ) + terminal_obs_list.append( + _process_rank_one_or_two_observation( + obs_index, observation_spec, terminal_agent_info_list + ) + ) + decision_rewards = np.array( + [agent_info.reward for agent_info in decision_agent_info_list], dtype=np.float32 + ) + terminal_rewards = np.array( + [agent_info.reward for agent_info in terminal_agent_info_list], dtype=np.float32 + ) + + decision_group_rewards = np.array( + [agent_info.group_reward for agent_info in decision_agent_info_list], + dtype=np.float32, + ) + terminal_group_rewards = np.array( + [agent_info.group_reward for agent_info in terminal_agent_info_list], + dtype=np.float32, + ) + + _raise_on_nan_and_inf(decision_rewards, "rewards") + _raise_on_nan_and_inf(terminal_rewards, "rewards") + _raise_on_nan_and_inf(decision_group_rewards, "group_rewards") + _raise_on_nan_and_inf(terminal_group_rewards, "group_rewards") + + decision_group_id = [agent_info.group_id for agent_info in decision_agent_info_list] + terminal_group_id = [agent_info.group_id for agent_info in terminal_agent_info_list] + + max_step = np.array( + [agent_info.max_step_reached for agent_info in terminal_agent_info_list], + dtype=bool, + ) + decision_agent_id = np.array( + [agent_info.id for agent_info in decision_agent_info_list], dtype=np.int32 + ) + terminal_agent_id = np.array( + [agent_info.id for agent_info in terminal_agent_info_list], dtype=np.int32 + ) + action_mask = None + if behavior_spec.action_spec.discrete_size > 0: + if any( + [agent_info.action_mask is not None] + for agent_info 
in decision_agent_info_list + ): + n_agents = len(decision_agent_info_list) + a_size = np.sum(behavior_spec.action_spec.discrete_branches) + mask_matrix = np.ones((n_agents, a_size), dtype=bool) + for agent_index, agent_info in enumerate(decision_agent_info_list): + if agent_info.action_mask is not None: + if len(agent_info.action_mask) == a_size: + mask_matrix[agent_index, :] = [ + False if agent_info.action_mask[k] else True + for k in range(a_size) + ] + action_mask = (1 - mask_matrix).astype(bool) + indices = _generate_split_indices( + behavior_spec.action_spec.discrete_branches + ) + action_mask = np.split(action_mask, indices, axis=1) + return ( + DecisionSteps( + decision_obs_list, + decision_rewards, + decision_agent_id, + action_mask, + decision_group_id, + decision_group_rewards, + ), + TerminalSteps( + terminal_obs_list, + terminal_rewards, + max_step, + terminal_agent_id, + terminal_group_id, + terminal_group_rewards, + ), + ) + + +def _generate_split_indices(dims): + if len(dims) <= 1: + return () + result = (dims[0],) + for i in range(len(dims) - 2): + result += (dims[i + 1] + result[i],) + return result diff --git a/MLPY/Lib/site-packages/mlagents_envs/side_channel/__init__.py b/MLPY/Lib/site-packages/mlagents_envs/side_channel/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c9a1f5f0f71d0b0971214921d0514538cb3c3a89 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/side_channel/__init__.py @@ -0,0 +1,7 @@ +from mlagents_envs.side_channel.incoming_message import IncomingMessage # noqa +from mlagents_envs.side_channel.outgoing_message import OutgoingMessage # noqa + +from mlagents_envs.side_channel.side_channel import SideChannel # noqa +from mlagents_envs.side_channel.default_training_analytics_side_channel import ( # noqa + DefaultTrainingAnalyticsSideChannel, # noqa +) # noqa diff --git a/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4e59f51d70ab642acfa1f2c21a15aedca8a73c23 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/default_training_analytics_side_channel.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/default_training_analytics_side_channel.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..491514935e40567c595a985f553f2f2900be94d4 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/default_training_analytics_side_channel.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/engine_configuration_channel.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/engine_configuration_channel.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f7f1867eb9320b7c8cc163f361ce1e6a7d38d2f9 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/engine_configuration_channel.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/environment_parameters_channel.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/environment_parameters_channel.cpython-39.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..2f399e5528e9be32428b321484e46d89236379e8 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/environment_parameters_channel.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/float_properties_channel.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/float_properties_channel.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..775bd587b43e541b9ebc83c0c5cddee75ca5285d Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/float_properties_channel.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/incoming_message.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/incoming_message.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7b0fc669efb973d01d846586a51d5b353f021370 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/incoming_message.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/outgoing_message.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/outgoing_message.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3ea39198def92a5fe76c947d90e0e36eb94ab95b Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/outgoing_message.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/raw_bytes_channel.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/raw_bytes_channel.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..723d1a503b29d6929465face5c1bca2f28e3b6ae Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/raw_bytes_channel.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/side_channel.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/side_channel.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ca45af506c507e50b44342f51714d00905858cdd Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/side_channel.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/side_channel_manager.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/side_channel_manager.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a79f31616c47668d7f4d47a99c026d447241455b Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/side_channel_manager.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/stats_side_channel.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/stats_side_channel.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e2888cf935d27e9e44062d10407cf881f7bce39d Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/stats_side_channel.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/side_channel/default_training_analytics_side_channel.py 
b/MLPY/Lib/site-packages/mlagents_envs/side_channel/default_training_analytics_side_channel.py new file mode 100644 index 0000000000000000000000000000000000000000..a53e686709e3876bf3444bc1027c552d53dd076b --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/side_channel/default_training_analytics_side_channel.py @@ -0,0 +1,49 @@ +import sys +import uuid +import mlagents_envs + +from mlagents_envs.exception import UnityCommunicationException +from mlagents_envs.side_channel import SideChannel, IncomingMessage, OutgoingMessage +from mlagents_envs.communicator_objects.training_analytics_pb2 import ( + TrainingEnvironmentInitialized, +) +from google.protobuf.any_pb2 import Any + + +class DefaultTrainingAnalyticsSideChannel(SideChannel): + """ + Side channel that sends information about the training to the Unity environment so it can be logged. + """ + + CHANNEL_ID = uuid.UUID("b664a4a9-d86f-5a5f-95cb-e8353a7e8356") + + def __init__(self) -> None: + # >>> uuid.uuid5(uuid.NAMESPACE_URL, "com.unity.ml-agents/TrainingAnalyticsSideChannel") + # UUID('b664a4a9-d86f-5a5f-95cb-e8353a7e8356') + # We purposefully use the SAME side channel as the TrainingAnalyticsSideChannel + + super().__init__(DefaultTrainingAnalyticsSideChannel.CHANNEL_ID) + + def on_message_received(self, msg: IncomingMessage) -> None: + raise UnityCommunicationException( + "The DefaultTrainingAnalyticsSideChannel received a message from Unity, " + + "this should not have happened." + ) + + def environment_initialized(self) -> None: + # Tuple of (major, minor, patch) + vi = sys.version_info + + msg = TrainingEnvironmentInitialized( + python_version=f"{vi[0]}.{vi[1]}.{vi[2]}", + mlagents_version="Custom", + mlagents_envs_version=mlagents_envs.__version__, + torch_version="Unknown", + torch_device_type="Unknown", + ) + any_message = Any() + any_message.Pack(msg) + + env_init_msg = OutgoingMessage() + env_init_msg.set_raw_bytes(any_message.SerializeToString()) # type: ignore + super().queue_message_to_send(env_init_msg) diff --git a/MLPY/Lib/site-packages/mlagents_envs/side_channel/engine_configuration_channel.py b/MLPY/Lib/site-packages/mlagents_envs/side_channel/engine_configuration_channel.py new file mode 100644 index 0000000000000000000000000000000000000000..ce1715ba07349fc377fb846dc5e050855b140965 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/side_channel/engine_configuration_channel.py @@ -0,0 +1,127 @@ +from mlagents_envs.side_channel import SideChannel, OutgoingMessage, IncomingMessage +from mlagents_envs.exception import ( + UnityCommunicationException, + UnitySideChannelException, +) +import uuid +from typing import NamedTuple, Optional +from enum import IntEnum + + +class EngineConfig(NamedTuple): + width: Optional[int] + height: Optional[int] + quality_level: Optional[int] + time_scale: Optional[float] + target_frame_rate: Optional[int] + capture_frame_rate: Optional[int] + + @staticmethod + def default_config(): + return EngineConfig(80, 80, 1, 20.0, -1, 60) + + +class EngineConfigurationChannel(SideChannel): + """ + This is the SideChannel for engine configuration exchange. 
The data in the + engine configuration is as follows : + - int width; + - int height; + - int qualityLevel; + - float timeScale; + - int targetFrameRate; + - int captureFrameRate; + """ + + class ConfigurationType(IntEnum): + SCREEN_RESOLUTION = 0 + QUALITY_LEVEL = 1 + TIME_SCALE = 2 + TARGET_FRAME_RATE = 3 + CAPTURE_FRAME_RATE = 4 + + def __init__(self) -> None: + super().__init__(uuid.UUID("e951342c-4f7e-11ea-b238-784f4387d1f7")) + + def on_message_received(self, msg: IncomingMessage) -> None: + """ + Is called by the environment to the side channel. Can be called + multiple times per step if multiple messages are meant for that + SideChannel. + Note that Python should never receive an engine configuration from + Unity + """ + raise UnityCommunicationException( + "The EngineConfigurationChannel received a message from Unity, " + + "this should not have happened." + ) + + def set_configuration_parameters( + self, + width: Optional[int] = None, + height: Optional[int] = None, + quality_level: Optional[int] = None, + time_scale: Optional[float] = None, + target_frame_rate: Optional[int] = None, + capture_frame_rate: Optional[int] = None, + ) -> None: + """ + Sets the engine configuration. Takes as input the configurations of the + engine. + :param width: Defines the width of the display. (Must be set alongside height) + :param height: Defines the height of the display. (Must be set alongside width) + :param quality_level: Defines the quality level of the simulation. + :param time_scale: Defines the multiplier for the deltatime in the + simulation. If set to a higher value, time will pass faster in the + simulation but the physics might break. + :param target_frame_rate: Instructs simulation to try to render at a + specified frame rate. + :param capture_frame_rate: Instructs the simulation to consider time between + updates to always be constant, regardless of the actual frame rate. + """ + + if (width is None and height is not None) or ( + width is not None and height is None + ): + raise UnitySideChannelException( + "You cannot set the width/height of the screen resolution without also setting the height/width" + ) + + if width is not None and height is not None: + screen_msg = OutgoingMessage() + screen_msg.write_int32(self.ConfigurationType.SCREEN_RESOLUTION) + screen_msg.write_int32(width) + screen_msg.write_int32(height) + super().queue_message_to_send(screen_msg) + + if quality_level is not None: + quality_level_msg = OutgoingMessage() + quality_level_msg.write_int32(self.ConfigurationType.QUALITY_LEVEL) + quality_level_msg.write_int32(quality_level) + super().queue_message_to_send(quality_level_msg) + + if time_scale is not None: + time_scale_msg = OutgoingMessage() + time_scale_msg.write_int32(self.ConfigurationType.TIME_SCALE) + time_scale_msg.write_float32(time_scale) + super().queue_message_to_send(time_scale_msg) + + if target_frame_rate is not None: + target_frame_rate_msg = OutgoingMessage() + target_frame_rate_msg.write_int32(self.ConfigurationType.TARGET_FRAME_RATE) + target_frame_rate_msg.write_int32(target_frame_rate) + super().queue_message_to_send(target_frame_rate_msg) + + if capture_frame_rate is not None: + capture_frame_rate_msg = OutgoingMessage() + capture_frame_rate_msg.write_int32( + self.ConfigurationType.CAPTURE_FRAME_RATE + ) + capture_frame_rate_msg.write_int32(capture_frame_rate) + super().queue_message_to_send(capture_frame_rate_msg) + + def set_configuration(self, config: EngineConfig) -> None: + """ + Sets the engine configuration. 
Takes as input an EngineConfig. + """ + self.set_configuration_parameters(**config._asdict()) diff --git a/MLPY/Lib/site-packages/mlagents_envs/side_channel/environment_parameters_channel.py b/MLPY/Lib/site-packages/mlagents_envs/side_channel/environment_parameters_channel.py new file mode 100644 index 0000000000000000000000000000000000000000..ff516a5eb586355ee3699d4065e79c5e57340c1d --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/side_channel/environment_parameters_channel.py @@ -0,0 +1,100 @@ +from mlagents_envs.side_channel import SideChannel, IncomingMessage, OutgoingMessage +from mlagents_envs.exception import UnityCommunicationException +import uuid +from enum import IntEnum +from typing import List, Tuple + + +class EnvironmentParametersChannel(SideChannel): + """ + This is the SideChannel for sending environment parameters to Unity. + You can send parameters to an environment with the command + set_float_parameter. + """ + + class EnvironmentDataTypes(IntEnum): + FLOAT = 0 + SAMPLER = 1 + + class SamplerTypes(IntEnum): + UNIFORM = 0 + GAUSSIAN = 1 + MULTIRANGEUNIFORM = 2 + + def __init__(self) -> None: + channel_id = uuid.UUID("534c891e-810f-11ea-a9d0-822485860400") + super().__init__(channel_id) + + def on_message_received(self, msg: IncomingMessage) -> None: + raise UnityCommunicationException( + "The EnvironmentParametersChannel received a message from Unity, " + + "this should not have happened." + ) + + def set_float_parameter(self, key: str, value: float) -> None: + """ + Sets a float environment parameter in the Unity Environment. + :param key: The string identifier of the parameter. + :param value: The float value of the parameter. + """ + msg = OutgoingMessage() + msg.write_string(key) + msg.write_int32(self.EnvironmentDataTypes.FLOAT) + msg.write_float32(value) + super().queue_message_to_send(msg) + + def set_uniform_sampler_parameters( + self, key: str, min_value: float, max_value: float, seed: int + ) -> None: + """ + Sets a uniform environment parameter sampler. + :param key: The string identifier of the parameter. + :param min_value: The minimum of the sampling distribution. + :param max_value: The maximum of the sampling distribution. + :param seed: The random seed to initialize the sampler. + """ + msg = OutgoingMessage() + msg.write_string(key) + msg.write_int32(self.EnvironmentDataTypes.SAMPLER) + msg.write_int32(seed) + msg.write_int32(self.SamplerTypes.UNIFORM) + msg.write_float32(min_value) + msg.write_float32(max_value) + super().queue_message_to_send(msg) + + def set_gaussian_sampler_parameters( + self, key: str, mean: float, st_dev: float, seed: int + ) -> None: + """ + Sets a gaussian environment parameter sampler. + :param key: The string identifier of the parameter. + :param mean: The mean of the sampling distribution. + :param st_dev: The standard deviation of the sampling distribution. + :param seed: The random seed to initialize the sampler. + """ + msg = OutgoingMessage() + msg.write_string(key) + msg.write_int32(self.EnvironmentDataTypes.SAMPLER) + msg.write_int32(seed) + msg.write_int32(self.SamplerTypes.GAUSSIAN) + msg.write_float32(mean) + msg.write_float32(st_dev) + super().queue_message_to_send(msg) + + def set_multirangeuniform_sampler_parameters( + self, key: str, intervals: List[Tuple[float, float]], seed: int + ) -> None: + """ + Sets a multirangeuniform environment parameter sampler. + :param key: The string identifier of the parameter. + :param intervals: The lists of min and max that define each uniform distribution. 
+ :param seed: The random seed to initialize the sampler. + """ + msg = OutgoingMessage() + msg.write_string(key) + msg.write_int32(self.EnvironmentDataTypes.SAMPLER) + msg.write_int32(seed) + msg.write_int32(self.SamplerTypes.MULTIRANGEUNIFORM) + flattened_intervals = [value for interval in intervals for value in interval] + msg.write_float32_list(flattened_intervals) + super().queue_message_to_send(msg) diff --git a/MLPY/Lib/site-packages/mlagents_envs/side_channel/float_properties_channel.py b/MLPY/Lib/site-packages/mlagents_envs/side_channel/float_properties_channel.py new file mode 100644 index 0000000000000000000000000000000000000000..6c6701129f4f3103db238371552a9d6384ca3ab8 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/side_channel/float_properties_channel.py @@ -0,0 +1,62 @@ +from mlagents_envs.side_channel import SideChannel, IncomingMessage, OutgoingMessage +import uuid +from typing import Dict, Optional, List + + +class FloatPropertiesChannel(SideChannel): + """ + This is the SideChannel for float properties shared with Unity. + You can modify the float properties of an environment with the commands + set_property, get_property and list_properties. + """ + + def __init__(self, channel_id: uuid.UUID = None) -> None: + self._float_properties: Dict[str, float] = {} + if channel_id is None: + channel_id = uuid.UUID("60ccf7d0-4f7e-11ea-b238-784f4387d1f7") + super().__init__(channel_id) + + def on_message_received(self, msg: IncomingMessage) -> None: + """ + Is called by the environment to the side channel. Can be called + multiple times per step if multiple messages are meant for that + SideChannel. + """ + k = msg.read_string() + v = msg.read_float32() + self._float_properties[k] = v + + def set_property(self, key: str, value: float) -> None: + """ + Sets a property in the Unity Environment. + :param key: The string identifier of the property. + :param value: The float value of the property. + """ + self._float_properties[key] = value + msg = OutgoingMessage() + msg.write_string(key) + msg.write_float32(value) + super().queue_message_to_send(msg) + + def get_property(self, key: str) -> Optional[float]: + """ + Gets a property in the Unity Environment. If the property was not + found, will return None. + :param key: The string identifier of the property. + :return: The float value of the property or None. + """ + return self._float_properties.get(key) + + def list_properties(self) -> List[str]: + """ + Returns a list of all the string identifiers of the properties + currently present in the Unity Environment. + """ + return list(self._float_properties.keys()) + + def get_property_dict_copy(self) -> Dict[str, float]: + """ + Returns a copy of the float properties. + :return: + """ + return dict(self._float_properties) diff --git a/MLPY/Lib/site-packages/mlagents_envs/side_channel/incoming_message.py b/MLPY/Lib/site-packages/mlagents_envs/side_channel/incoming_message.py new file mode 100644 index 0000000000000000000000000000000000000000..6c00f252868f072f239462581278225c2336b2c8 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/side_channel/incoming_message.py @@ -0,0 +1,93 @@ +from typing import List +import struct + + +class IncomingMessage: + """ + Utility class for reading the message written to a SideChannel. + Values must be read in the order they were written. + """ + + def __init__(self, buffer: bytes, offset: int = 0): + """ + Create a new IncomingMessage from the bytes. 
+ """ + self.buffer = buffer + self.offset = offset + + def read_bool(self, default_value: bool = False) -> bool: + """ + Read a boolean value from the message buffer. + :param default_value: Default value to use if the end of the message is reached. + :return: The value read from the message, or the default value if the end was reached. + """ + if self._at_end_of_buffer(): + return default_value + + val = struct.unpack_from(" int: + """ + Read an integer value from the message buffer. + :param default_value: Default value to use if the end of the message is reached. + :return: The value read from the message, or the default value if the end was reached. + """ + if self._at_end_of_buffer(): + return default_value + + val = struct.unpack_from(" float: + """ + Read a float value from the message buffer. + :param default_value: Default value to use if the end of the message is reached. + :return: The value read from the message, or the default value if the end was reached. + """ + if self._at_end_of_buffer(): + return default_value + + val = struct.unpack_from(" List[float]: + """ + Read a list of float values from the message buffer. + :param default_value: Default value to use if the end of the message is reached. + :return: The value read from the message, or the default value if the end was reached. + """ + if self._at_end_of_buffer(): + return [] if default_value is None else default_value + + list_len = self.read_int32() + output = [] + for _ in range(list_len): + output.append(self.read_float32()) + return output + + def read_string(self, default_value: str = "") -> str: + """ + Read a string value from the message buffer. + :param default_value: Default value to use if the end of the message is reached. + :return: The value read from the message, or the default value if the end was reached. + """ + if self._at_end_of_buffer(): + return default_value + + encoded_str_len = self.read_int32() + val = self.buffer[self.offset : self.offset + encoded_str_len].decode("ascii") + self.offset += encoded_str_len + return val + + def get_raw_bytes(self) -> bytes: + """ + Get a copy of the internal bytes used by the message. + """ + return bytearray(self.buffer) + + def _at_end_of_buffer(self) -> bool: + return self.offset >= len(self.buffer) diff --git a/MLPY/Lib/site-packages/mlagents_envs/side_channel/outgoing_message.py b/MLPY/Lib/site-packages/mlagents_envs/side_channel/outgoing_message.py new file mode 100644 index 0000000000000000000000000000000000000000..83bbe3446cb207e8c9eeffb8146f243eae9e28a2 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/side_channel/outgoing_message.py @@ -0,0 +1,66 @@ +from typing import List +import struct + +from mlagents_envs.logging_util import get_logger + +logger = get_logger(__name__) + + +class OutgoingMessage: + """ + Utility class for forming the message that is written to a SideChannel. + All data is written in little-endian format using the struct module. + """ + + def __init__(self): + """ + Create an OutgoingMessage with an empty buffer. + """ + self.buffer = bytearray() + + def write_bool(self, b: bool) -> None: + """ + Append a boolean value. + """ + self.buffer += struct.pack(" None: + """ + Append an integer value. + """ + self.buffer += struct.pack(" None: + """ + Append a float value. It will be truncated to 32-bit precision. + """ + self.buffer += struct.pack(" None: + """ + Append a list of float values. They will be truncated to 32-bit precision. 
+ """ + self.write_int32(len(float_list)) + for f in float_list: + self.write_float32(f) + + def write_string(self, s: str) -> None: + """ + Append a string value. Internally, it will be encoded to ascii, and the + encoded length will also be written to the message. + """ + encoded_key = s.encode("ascii") + self.write_int32(len(encoded_key)) + self.buffer += encoded_key + + def set_raw_bytes(self, buffer: bytearray) -> None: + """ + Set the internal buffer to a new bytearray. This will overwrite any existing data. + :param buffer: + :return: + """ + if self.buffer: + logger.warning( + "Called set_raw_bytes but the message already has been written to. This will overwrite data." + ) + self.buffer = bytearray(buffer) diff --git a/MLPY/Lib/site-packages/mlagents_envs/side_channel/raw_bytes_channel.py b/MLPY/Lib/site-packages/mlagents_envs/side_channel/raw_bytes_channel.py new file mode 100644 index 0000000000000000000000000000000000000000..d9f748f4fb3bd3360300cfd8810860fffe2ce411 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/side_channel/raw_bytes_channel.py @@ -0,0 +1,39 @@ +from mlagents_envs.side_channel import SideChannel, IncomingMessage, OutgoingMessage +from typing import List +import uuid + + +class RawBytesChannel(SideChannel): + """ + This is an example of what the SideChannel for raw bytes exchange would + look like. Is meant to be used for general research purpose. + """ + + def __init__(self, channel_id: uuid.UUID): + self._received_messages: List[bytes] = [] + super().__init__(channel_id) + + def on_message_received(self, msg: IncomingMessage) -> None: + """ + Is called by the environment to the side channel. Can be called + multiple times per step if multiple messages are meant for that + SideChannel. + """ + self._received_messages.append(msg.get_raw_bytes()) + + def get_and_clear_received_messages(self) -> List[bytes]: + """ + returns a list of bytearray received from the environment. + """ + result = list(self._received_messages) + self._received_messages = [] + return result + + def send_raw_data(self, data: bytearray) -> None: + """ + Queues a message to be sent by the environment at the next call to + step. + """ + msg = OutgoingMessage() + msg.set_raw_bytes(data) + super().queue_message_to_send(msg) diff --git a/MLPY/Lib/site-packages/mlagents_envs/side_channel/side_channel.py b/MLPY/Lib/site-packages/mlagents_envs/side_channel/side_channel.py new file mode 100644 index 0000000000000000000000000000000000000000..469cb51eabc684194f26b206e65c6fdafd08a29e --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/side_channel/side_channel.py @@ -0,0 +1,46 @@ +from abc import ABC, abstractmethod +from typing import List +import uuid + +from mlagents_envs.side_channel import IncomingMessage, OutgoingMessage +from mlagents_envs.logging_util import get_logger + +logger = get_logger(__name__) + + +class SideChannel(ABC): + """ + The side channel just get access to a bytes buffer that will be shared + between C# and Python. For example, We will create a specific side channel + for properties that will be a list of string (fixed size) to float number, + that can be modified by both C# and Python. All side channels are passed + to the Env object at construction. + """ + + def __init__(self, channel_id: uuid.UUID): + self._channel_id: uuid.UUID = channel_id + self.message_queue: List[bytearray] = [] + + def queue_message_to_send(self, msg: OutgoingMessage) -> None: + """ + Queues a message to be sent by the environment at the next call to + step. 
+ """ + self.message_queue.append(msg.buffer) + + @abstractmethod + def on_message_received(self, msg: IncomingMessage) -> None: + """ + Is called by the environment to the side channel. Can be called + multiple times per step if multiple messages are meant for that + SideChannel. + """ + pass + + @property + def channel_id(self) -> uuid.UUID: + """ + :return:The type of side channel used. Will influence how the data is + processed in the environment. + """ + return self._channel_id diff --git a/MLPY/Lib/site-packages/mlagents_envs/side_channel/side_channel_manager.py b/MLPY/Lib/site-packages/mlagents_envs/side_channel/side_channel_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..e5356c26f86a3eaad95d2a2d5a07e0bfab29d70c --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/side_channel/side_channel_manager.py @@ -0,0 +1,81 @@ +import uuid +import struct +from typing import Dict, Optional, List +from mlagents_envs.side_channel import SideChannel, IncomingMessage +from mlagents_envs.exception import UnityEnvironmentException +from mlagents_envs.logging_util import get_logger + + +class SideChannelManager: + def __init__(self, side_channels=Optional[List[SideChannel]]): + self._side_channels_dict = self._get_side_channels_dict(side_channels) + + def process_side_channel_message(self, data: bytes) -> None: + """ + Separates the data received from Python into individual messages for each + registered side channel and calls on_message_received on them. + :param data: The packed message sent by Unity + """ + offset = 0 + while offset < len(data): + try: + channel_id = uuid.UUID(bytes_le=bytes(data[offset : offset + 16])) + offset += 16 + (message_len,) = struct.unpack_from(" bytearray: + """ + Gathers the messages that the registered side channels will send to Unity + and combines them into a single message ready to be sent. + """ + result = bytearray() + for channel_id, channel in self._side_channels_dict.items(): + for message in channel.message_queue: + result += channel_id.bytes_le + result += struct.pack(" Dict[uuid.UUID, SideChannel]: + """ + Converts a list of side channels into a dictionary of channel_id to SideChannel + :param side_channels: The list of side channels. + """ + side_channels_dict: Dict[uuid.UUID, SideChannel] = {} + if side_channels is not None: + for _sc in side_channels: + if _sc.channel_id in side_channels_dict: + raise UnityEnvironmentException( + f"There cannot be two side channels with " + f"the same channel id {_sc.channel_id}." + ) + side_channels_dict[_sc.channel_id] = _sc + return side_channels_dict diff --git a/MLPY/Lib/site-packages/mlagents_envs/side_channel/stats_side_channel.py b/MLPY/Lib/site-packages/mlagents_envs/side_channel/stats_side_channel.py new file mode 100644 index 0000000000000000000000000000000000000000..9fbbfb23d9f650b87892b8cd6317417dcb82e5ea --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/side_channel/stats_side_channel.py @@ -0,0 +1,62 @@ +import uuid +from typing import Tuple, List, Mapping +from enum import Enum +from collections import defaultdict + +from mlagents_envs.side_channel import SideChannel, IncomingMessage + + +# Determines the behavior of how multiple stats within the same summary period are combined. +class StatsAggregationMethod(Enum): + # Values within the summary period are averaged before reporting. + AVERAGE = 0 + + # Only the most recent value is reported. + MOST_RECENT = 1 + + # Values within the summary period are summed up before reporting. 
+ SUM = 2 + + # All values within a summary period are reported as a histogram. + HISTOGRAM = 3 + + +StatList = List[Tuple[float, StatsAggregationMethod]] +EnvironmentStats = Mapping[str, StatList] + + +class StatsSideChannel(SideChannel): + """ + Side channel that receives (string, float) pairs from the environment, so that they can eventually + be passed to a StatsReporter. + """ + + def __init__(self) -> None: + # >>> uuid.uuid5(uuid.NAMESPACE_URL, "com.unity.ml-agents/StatsSideChannel") + # UUID('a1d8f7b7-cec8-50f9-b78b-d3e165a78520') + super().__init__(uuid.UUID("a1d8f7b7-cec8-50f9-b78b-d3e165a78520")) + + self.stats: EnvironmentStats = defaultdict(list) + + def on_message_received(self, msg: IncomingMessage) -> None: + """ + Receive the message from the environment, and save it for later retrieval. + + :param msg: + :return: + """ + key = msg.read_string() + val = msg.read_float32() + agg_type = StatsAggregationMethod(msg.read_int32()) + + self.stats[key].append((val, agg_type)) + + def get_and_reset_stats(self) -> EnvironmentStats: + """ + Returns the current stats, and resets the internal storage of the stats. + + :return: + """ + s = self.stats + self.stats = defaultdict(list) + return s diff --git a/MLPY/Lib/site-packages/mlagents_envs/timers.py b/MLPY/Lib/site-packages/mlagents_envs/timers.py new file mode 100644 index 0000000000000000000000000000000000000000..32b8602c9ccc1b17926d5eb45dd04fde50bf07bf --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/timers.py @@ -0,0 +1,362 @@ +""" +Lightweight, hierarchical timers for profiling sections of code. + +Example: + +@timed +def foo(t): + time.sleep(t) + +def main(): + for i in range(3): + foo(i + 1) + with hierarchical_timer("context"): + foo(1) + + print(get_timer_tree()) + +This would produce a timer tree like + (root) + "foo" + "context" + "foo" + +The total time and counts are tracked for each block of code; in this example "foo" and "context.foo" are considered +distinct blocks, and are tracked separately. + +The decorator and contextmanager are equivalent; the context manager may be more useful if you want more control +over the timer name, or are splitting up multiple sections of a large function. +""" + +import math +import sys +import time +import threading + +from contextlib import contextmanager +from typing import Any, Callable, Dict, Generator, Optional, TypeVar + +TIMER_FORMAT_VERSION = "0.1.0" + + +class TimerNode: + """ + Represents the time spent in a block of code. + """ + + __slots__ = ["children", "total", "count", "is_parallel"] + + def __init__(self): + # Note that since dictionary keys are the node names, we don't explicitly store the name on the TimerNode. + self.children: Dict[str, TimerNode] = {} + self.total: float = 0.0 + self.count: int = 0 + self.is_parallel = False + + def get_child(self, name: str) -> "TimerNode": + """ + Get the child node corresponding to the name (and create if it doesn't already exist). + """ + child = self.children.get(name) + if child is None: + child = TimerNode() + self.children[name] = child + return child + + def add_time(self, elapsed: float) -> None: + """ + Accumulate the time spent in the node (and increment the count). + """ + self.total += elapsed + self.count += 1 + + def merge( + self, other: "TimerNode", root_name: str = None, is_parallel: bool = True + ) -> None: + """ + Add the other node to this node, then do the same recursively on its children. + :param other: The other node to merge + :param root_name: Optional name of the root node being merged. 
+ :param is_parallel: Whether or not the code block was executed in parallel. + :return: + """ + if root_name: + node = self.get_child(root_name) + else: + node = self + + node.total += other.total + node.count += other.count + node.is_parallel |= is_parallel + for other_child_name, other_child_node in other.children.items(): + child = node.get_child(other_child_name) + child.merge(other_child_node, is_parallel=is_parallel) + + +class GaugeNode: + """ + Tracks the most recent value of a metric. This is analogous to gauges in statsd. + """ + + __slots__ = ["value", "min_value", "max_value", "count", "_timestamp"] + + def __init__(self, value: float): + self.value = value + self.min_value = value + self.max_value = value + self.count = 1 + # Internal timestamp so we can determine priority. + self._timestamp = time.time() + + def update(self, new_value: float) -> None: + self.min_value = min(self.min_value, new_value) + self.max_value = max(self.max_value, new_value) + self.value = new_value + self.count += 1 + self._timestamp = time.time() + + def merge(self, other: "GaugeNode") -> None: + if self._timestamp < other._timestamp: + # Keep the "later" value + self.value = other.value + self._timestamp = other._timestamp + self.min_value = min(self.min_value, other.min_value) + self.max_value = max(self.max_value, other.max_value) + self.count += other.count + + def as_dict(self) -> Dict[str, float]: + return { + "value": self.value, + "min": self.min_value, + "max": self.max_value, + "count": self.count, + } + + +class TimerStack: + """ + Tracks all the time spent. Users shouldn't use this directly, they should use the contextmanager below to make + sure that pushes and pops are already matched. + """ + + __slots__ = ["root", "stack", "start_time", "gauges", "metadata"] + + def __init__(self): + self.root = TimerNode() + self.stack = [self.root] + self.start_time = time.perf_counter() + self.gauges: Dict[str, GaugeNode] = {} + self.metadata: Dict[str, str] = {} + self._add_default_metadata() + + def reset(self): + self.root = TimerNode() + self.stack = [self.root] + self.start_time = time.perf_counter() + self.gauges: Dict[str, GaugeNode] = {} + self.metadata: Dict[str, str] = {} + self._add_default_metadata() + + def push(self, name: str) -> TimerNode: + """ + Called when entering a new block of code that is timed (e.g. with a contextmanager). + """ + current_node: TimerNode = self.stack[-1] + next_node = current_node.get_child(name) + self.stack.append(next_node) + return next_node + + def pop(self) -> None: + """ + Called when exiting a new block of code that is timed (e.g. with a contextmanager). + """ + self.stack.pop() + + def get_root(self) -> TimerNode: + """ + Update the total time and count of the root name, and return it. + """ + root = self.root + root.total = time.perf_counter() - self.start_time + root.count = 1 + return root + + def get_timing_tree(self, node: TimerNode = None) -> Dict[str, Any]: + """ + Recursively build a tree of timings, suitable for output/archiving. 
+ """ + res: Dict[str, Any] = {} + if node is None: + # Special case the root - total is time since it was created, and count is 1 + node = self.get_root() + res["name"] = "root" + + # Only output gauges at top level + if self.gauges: + res["gauges"] = self._get_gauges() + + if self.metadata: + self.metadata["end_time_seconds"] = str(int(time.time())) + res["metadata"] = self.metadata + + res["total"] = node.total + res["count"] = node.count + + if node.is_parallel: + # Note when the block ran in parallel, so that it's less confusing that a timer is less that its children. + res["is_parallel"] = True + + child_total = 0.0 + child_dict = {} + for child_name, child_node in node.children.items(): + child_res: Dict[str, Any] = self.get_timing_tree(child_node) + child_dict[child_name] = child_res + child_total += child_res["total"] + + # "self" time is total time minus all time spent on children + res["self"] = max(0.0, node.total - child_total) + if child_dict: + res["children"] = child_dict + + return res + + def set_gauge(self, name: str, value: float) -> None: + if math.isnan(value): + return + gauge_node = self.gauges.get(name) + if gauge_node: + gauge_node.update(value) + else: + self.gauges[name] = GaugeNode(value) + + def add_metadata(self, key: str, value: str) -> None: + self.metadata[key] = value + + def _get_gauges(self) -> Dict[str, Dict[str, float]]: + gauges = {} + for gauge_name, gauge_node in self.gauges.items(): + gauges[gauge_name] = gauge_node.as_dict() + return gauges + + def _add_default_metadata(self): + self.metadata["timer_format_version"] = TIMER_FORMAT_VERSION + self.metadata["start_time_seconds"] = str(int(time.time())) + self.metadata["python_version"] = sys.version + self.metadata["command_line_arguments"] = " ".join(sys.argv) + + +# Maintain a separate "global" timer per thread, so that they don't accidentally conflict with each other. +_thread_timer_stacks: Dict[int, TimerStack] = {} + + +def _get_thread_timer() -> TimerStack: + ident = threading.get_ident() + if ident not in _thread_timer_stacks: + timer_stack = TimerStack() + _thread_timer_stacks[ident] = timer_stack + return _thread_timer_stacks[ident] + + +def get_timer_stack_for_thread(t: threading.Thread) -> Optional[TimerStack]: + if t.ident is None: + # Thread hasn't started, shouldn't ever happen + return None + return _thread_timer_stacks.get(t.ident) + + +@contextmanager +def hierarchical_timer(name: str, timer_stack: TimerStack = None) -> Generator: + """ + Creates a scoped timer around a block of code. This time spent will automatically be incremented when + the context manager exits. + """ + timer_stack = timer_stack or _get_thread_timer() + timer_node = timer_stack.push(name) + start_time = time.perf_counter() + + try: + # The wrapped code block will run here. + yield timer_node + finally: + # This will trigger either when the context manager exits, or an exception is raised. + # We'll accumulate the time, and the exception (if any) gets raised automatically. + elapsed = time.perf_counter() - start_time + timer_node.add_time(elapsed) + timer_stack.pop() + + +# This is used to ensure the signature of the decorated function is preserved +# See also https://github.com/python/mypy/issues/3157 +FuncT = TypeVar("FuncT", bound=Callable[..., Any]) + + +def timed(func: FuncT) -> FuncT: + """ + Decorator for timing a function or method. The name of the timer will be the qualified name of the function. 
+ Usage: + @timed + def my_func(x, y): + return x + y + Note that because this doesn't take arguments, the global timer stack is always used. + """ + + def wrapped(*args, **kwargs): + with hierarchical_timer(func.__qualname__): + return func(*args, **kwargs) + + return wrapped # type: ignore + + +def set_gauge(name: str, value: float, timer_stack: TimerStack = None) -> None: + """ + Updates the value of the gauge (or creates it if it hasn't been set before). + """ + timer_stack = timer_stack or _get_thread_timer() + timer_stack.set_gauge(name, value) + + +def merge_gauges(gauges: Dict[str, GaugeNode], timer_stack: TimerStack = None) -> None: + """ + Merge the gauges from another TimerStack with the provided one (or the + current thread's stack if none is provided). + :param gauges: + :param timer_stack: + :return: + """ + timer_stack = timer_stack or _get_thread_timer() + for n, g in gauges.items(): + if n in timer_stack.gauges: + timer_stack.gauges[n].merge(g) + else: + timer_stack.gauges[n] = g + + +def add_metadata(key: str, value: str, timer_stack: TimerStack = None) -> None: + timer_stack = timer_stack or _get_thread_timer() + timer_stack.add_metadata(key, value) + + +def get_timer_tree(timer_stack: TimerStack = None) -> Dict[str, Any]: + """ + Return the tree of timings from the TimerStack as a dictionary (or the + current thread's stack if none is provided) + """ + timer_stack = timer_stack or _get_thread_timer() + return timer_stack.get_timing_tree() + + +def get_timer_root(timer_stack: TimerStack = None) -> TimerNode: + """ + Get the root TimerNode of the timer_stack (or the current thread's + TimerStack if not specified) + """ + timer_stack = timer_stack or _get_thread_timer() + return timer_stack.get_root() + + +def reset_timers(timer_stack: TimerStack = None) -> None: + """ + Reset the timer_stack (or the current thread's TimerStack if not specified) + """ + timer_stack = timer_stack or _get_thread_timer() + timer_stack.reset()
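
Usage sketch (not part of the diff above): the short example below exercises the profiling API added in mlagents_envs/timers.py — the timed decorator, the hierarchical_timer context manager, set_gauge, and get_timer_tree — following the pattern described in that module's own docstring. The function simulate_step and its sleep durations are made-up stand-ins for real work.

import json
import time

from mlagents_envs.timers import (
    hierarchical_timer,
    timed,
    set_gauge,
    get_timer_tree,
    reset_timers,
)


@timed
def simulate_step(duration: float) -> None:
    # Stand-in for real work; the timer records the wall-clock time spent here.
    time.sleep(duration)


def run() -> None:
    reset_timers()
    for i in range(3):
        simulate_step(0.01 * (i + 1))
    with hierarchical_timer("evaluation"):
        # "simulate_step" is tracked separately here, nested under "evaluation".
        simulate_step(0.02)
        set_gauge("evaluation.reward", 1.5)
    # Dump the accumulated timer tree (totals, counts, gauges) for inspection.
    print(json.dumps(get_timer_tree(), indent=2))


if __name__ == "__main__":
    run()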
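
Usage sketch (also not part of the diff): a minimal illustration of wiring the side channels added earlier in this diff (EngineConfigurationChannel and EnvironmentParametersChannel) into an environment. It assumes the standard mlagents_envs.environment.UnityEnvironment entry point, which is defined outside this section, and the build path "path/to/UnityBuild" is a placeholder.

from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.side_channel.engine_configuration_channel import (
    EngineConfigurationChannel,
)
from mlagents_envs.side_channel.environment_parameters_channel import (
    EnvironmentParametersChannel,
)

engine_channel = EngineConfigurationChannel()
param_channel = EnvironmentParametersChannel()

# Side channels must be passed to the environment at construction time.
# "path/to/UnityBuild" is a placeholder for a real Unity build.
env = UnityEnvironment(
    file_name="path/to/UnityBuild",
    side_channels=[engine_channel, param_channel],
)

# Speed up the simulation and lower rendering quality, e.g. for headless training.
engine_channel.set_configuration_parameters(time_scale=20.0, quality_level=0)

# Send a float parameter and a uniform sampler definition to the environment;
# the parameter names here are illustrative and must match what the Unity scene reads.
param_channel.set_float_parameter("gravity", -9.81)
param_channel.set_uniform_sampler_parameters("obstacle_height", 0.5, 2.0, seed=42)

env.reset()
env.close()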