diff --git a/MLPY/Lib/site-packages/mlagents/__init__.py b/MLPY/Lib/site-packages/mlagents/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/MLPY/Lib/site-packages/mlagents/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e1fc331dcaa50d42d451399c27253555544bff6e Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/plugins/__init__.py b/MLPY/Lib/site-packages/mlagents/plugins/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b63a39732d8027e1ab6c893725b3c6912af161d3 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/plugins/__init__.py @@ -0,0 +1,8 @@ +from typing import Dict, Any + +ML_AGENTS_STATS_WRITER = "mlagents.stats_writer" +ML_AGENTS_TRAINER_TYPE = "mlagents.trainer_type" + +# TODO: the real type is Dict[str, HyperparamSettings] +all_trainer_types: Dict[str, Any] = {} +all_trainer_settings: Dict[str, Any] = {} diff --git a/MLPY/Lib/site-packages/mlagents/plugins/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/plugins/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a459379f72ef09fd864d2a2e2c1b54b40d2695a5 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/plugins/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/plugins/__pycache__/stats_writer.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/plugins/__pycache__/stats_writer.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e18dfb26ce23cb88d269c779720c333794ba811e Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/plugins/__pycache__/stats_writer.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/plugins/__pycache__/trainer_type.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/plugins/__pycache__/trainer_type.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..286875aa9a2ce89acbea53f6e8a6e20b6f49dba6 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/plugins/__pycache__/trainer_type.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/plugins/stats_writer.py b/MLPY/Lib/site-packages/mlagents/plugins/stats_writer.py new file mode 100644 index 0000000000000000000000000000000000000000..17acefd32e25f0a9e49b3f89fc4f90733f8a9495 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/plugins/stats_writer.py @@ -0,0 +1,72 @@ +import sys +from typing import List + +# importlib.metadata is new in python3.8 +# We use the backport for older python versions. 
+if sys.version_info < (3, 8): + import importlib_metadata +else: + import importlib.metadata as importlib_metadata # pylint: disable=E0611 + +from mlagents.trainers.stats import StatsWriter + +from mlagents_envs import logging_util +from mlagents.plugins import ML_AGENTS_STATS_WRITER +from mlagents.trainers.settings import RunOptions +from mlagents.trainers.stats import TensorboardWriter, GaugeWriter, ConsoleWriter + + +logger = logging_util.get_logger(__name__) + + +def get_default_stats_writers(run_options: RunOptions) -> List[StatsWriter]: + """ + The StatsWriters that mlagents-learn always uses: + * A TensorboardWriter to write information to TensorBoard + * A GaugeWriter to record our internal stats + * A ConsoleWriter to output to stdout. + """ + checkpoint_settings = run_options.checkpoint_settings + return [ + TensorboardWriter( + checkpoint_settings.write_path, + clear_past_data=not checkpoint_settings.resume, + hidden_keys=["Is Training", "Step"], + ), + GaugeWriter(), + ConsoleWriter(), + ] + + +def register_stats_writer_plugins(run_options: RunOptions) -> List[StatsWriter]: + """ + Registers all StatsWriter plugins (including the default one), + and evaluates them, and returns the list of all the StatsWriter implementations. + """ + all_stats_writers: List[StatsWriter] = [] + if ML_AGENTS_STATS_WRITER not in importlib_metadata.entry_points(): + logger.warning( + f"Unable to find any entry points for {ML_AGENTS_STATS_WRITER}, even the default ones. " + "Uninstalling and reinstalling ml-agents via pip should resolve. " + "Using default plugins for now." + ) + return get_default_stats_writers(run_options) + + entry_points = importlib_metadata.entry_points()[ML_AGENTS_STATS_WRITER] + + for entry_point in entry_points: + + try: + logger.debug(f"Initializing StatsWriter plugins: {entry_point.name}") + plugin_func = entry_point.load() + plugin_stats_writers = plugin_func(run_options) + logger.debug( + f"Found {len(plugin_stats_writers)} StatsWriters for plugin {entry_point.name}" + ) + all_stats_writers += plugin_stats_writers + except BaseException: + # Catch all exceptions from setting up the plugin, so that bad user code doesn't break things. + logger.exception( + f"Error initializing StatsWriter plugins for {entry_point.name}. This plugin will not be used." + ) + return all_stats_writers diff --git a/MLPY/Lib/site-packages/mlagents/plugins/trainer_type.py b/MLPY/Lib/site-packages/mlagents/plugins/trainer_type.py new file mode 100644 index 0000000000000000000000000000000000000000..2766368863caca5e74182c02a94429f09ba3c148 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/plugins/trainer_type.py @@ -0,0 +1,80 @@ +import sys +from typing import Dict, Tuple, Any + +# importlib.metadata is new in python3.8 +# We use the backport for older python versions. 
+if sys.version_info < (3, 8): + import importlib_metadata +else: + import importlib.metadata as importlib_metadata # pylint: disable=E0611 + + +from mlagents_envs import logging_util +from mlagents.plugins import ML_AGENTS_TRAINER_TYPE +from mlagents.trainers.ppo.trainer import PPOTrainer +from mlagents.trainers.sac.trainer import SACTrainer +from mlagents.trainers.poca.trainer import POCATrainer +from mlagents.trainers.ppo.optimizer_torch import PPOSettings +from mlagents.trainers.sac.optimizer_torch import SACSettings +from mlagents.trainers.poca.optimizer_torch import POCASettings +from mlagents import plugins as mla_plugins + +logger = logging_util.get_logger(__name__) + + +def get_default_trainer_types() -> Tuple[Dict[str, Any], Dict[str, Any]]: + """ + The Trainers that mlagents-learn always uses: + """ + + mla_plugins.all_trainer_types.update( + { + PPOTrainer.get_trainer_name(): PPOTrainer, + SACTrainer.get_trainer_name(): SACTrainer, + POCATrainer.get_trainer_name(): POCATrainer, + } + ) + # global all_trainer_settings + mla_plugins.all_trainer_settings.update( + { + PPOTrainer.get_trainer_name(): PPOSettings, + SACTrainer.get_trainer_name(): SACSettings, + POCATrainer.get_trainer_name(): POCASettings, + } + ) + + return mla_plugins.all_trainer_types, mla_plugins.all_trainer_settings + + +def register_trainer_plugins() -> Tuple[Dict[str, Any], Dict[str, Any]]: + """ + Registers all Trainer plugins (including the default one), + and evaluates them, and returns the list of all the Trainer implementations. + """ + if ML_AGENTS_TRAINER_TYPE not in importlib_metadata.entry_points(): + logger.warning( + f"Unable to find any entry points for {ML_AGENTS_TRAINER_TYPE}, even the default ones. " + "Uninstalling and reinstalling ml-agents via pip should resolve. " + "Using default plugins for now." + ) + return get_default_trainer_types() + + entry_points = importlib_metadata.entry_points()[ML_AGENTS_TRAINER_TYPE] + + for entry_point in entry_points: + + try: + logger.debug(f"Initializing Trainer plugins: {entry_point.name}") + plugin_func = entry_point.load() + plugin_trainer_types, plugin_trainer_settings = plugin_func() + logger.debug( + f"Found {len(plugin_trainer_types)} Trainers for plugin {entry_point.name}" + ) + mla_plugins.all_trainer_types.update(plugin_trainer_types) + mla_plugins.all_trainer_settings.update(plugin_trainer_settings) + except BaseException: + # Catch all exceptions from setting up the plugin, so that bad user code doesn't break things. + logger.exception( + f"Error initializing Trainer plugins for {entry_point.name}. This plugin will not be used." 
+ ) + return mla_plugins.all_trainer_types, mla_plugins.all_trainer_settings diff --git a/MLPY/Lib/site-packages/mlagents/torch_utils/__init__.py b/MLPY/Lib/site-packages/mlagents/torch_utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0acc96997d94d8d9968950e8db21c651484b8cf3 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/torch_utils/__init__.py @@ -0,0 +1,4 @@ +from mlagents.torch_utils.torch import torch as torch # noqa +from mlagents.torch_utils.torch import nn # noqa +from mlagents.torch_utils.torch import set_torch_config # noqa +from mlagents.torch_utils.torch import default_device # noqa diff --git a/MLPY/Lib/site-packages/mlagents/torch_utils/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/torch_utils/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f7235c1375171c5f9647619ff49438ee14d50c4f Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/torch_utils/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/torch_utils/__pycache__/cpu_utils.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/torch_utils/__pycache__/cpu_utils.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3446f50f861006bb3be343b899dc27c687b12119 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/torch_utils/__pycache__/cpu_utils.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/torch_utils/__pycache__/globals.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/torch_utils/__pycache__/globals.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..515fc26b31cb944de9173b14744f98fdab276230 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/torch_utils/__pycache__/globals.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/torch_utils/__pycache__/torch.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/torch_utils/__pycache__/torch.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..62fd647d9b47ccf7aac470754be1ee92fb125e19 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/torch_utils/__pycache__/torch.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/torch_utils/cpu_utils.py b/MLPY/Lib/site-packages/mlagents/torch_utils/cpu_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..7f577375703e0066d596cef16f5d4660881b55f9 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/torch_utils/cpu_utils.py @@ -0,0 +1,41 @@ +from typing import Optional + +import os + + +def get_num_threads_to_use() -> Optional[int]: + """ + Gets the number of threads to use. For most problems, 4 is all you + need, but for smaller machines, we'd like to scale to less than that. + By default, PyTorch uses 1/2 of the available cores. + """ + num_cpus = _get_num_available_cpus() + return max(min(num_cpus // 2, 4), 1) if num_cpus is not None else None + + +def _get_num_available_cpus() -> Optional[int]: + """ + Returns number of CPUs using cgroups if possible. This accounts + for Docker containers that are limited in cores. 
+ """ + period = _read_in_integer_file("/sys/fs/cgroup/cpu/cpu.cfs_period_us") + quota = _read_in_integer_file("/sys/fs/cgroup/cpu/cpu.cfs_quota_us") + share = _read_in_integer_file("/sys/fs/cgroup/cpu/cpu.shares") + is_kubernetes = os.getenv("KUBERNETES_SERVICE_HOST") is not None + + if period > 0 and quota > 0: + return int(quota // period) + elif period > 0 and share > 0 and is_kubernetes: + # In kubernetes, each requested CPU is 1024 CPU shares + # https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#how-pods-with-resource-limits-are-run + return int(share // 1024) + else: + return os.cpu_count() + + +def _read_in_integer_file(filename: str) -> int: + try: + with open(filename) as f: + return int(f.read().rstrip()) + except FileNotFoundError: + return -1 diff --git a/MLPY/Lib/site-packages/mlagents/torch_utils/globals.py b/MLPY/Lib/site-packages/mlagents/torch_utils/globals.py new file mode 100644 index 0000000000000000000000000000000000000000..99705b1067305890316036cb700845af626f9a59 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/torch_utils/globals.py @@ -0,0 +1,13 @@ +from typing import Optional + +_rank: Optional[int] = None + + +def get_rank() -> Optional[int]: + """ + Returns the rank (in the MPI sense) of the current node. + For local training, this will always be None. + If this needs to be used, it should be done from outside ml-agents. + :return: + """ + return _rank diff --git a/MLPY/Lib/site-packages/mlagents/torch_utils/torch.py b/MLPY/Lib/site-packages/mlagents/torch_utils/torch.py new file mode 100644 index 0000000000000000000000000000000000000000..24dc45cca3ff5ca99300fff7d86ffc27c0fc49b9 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/torch_utils/torch.py @@ -0,0 +1,68 @@ +import os + +from distutils.version import LooseVersion +import pkg_resources +from mlagents.torch_utils import cpu_utils +from mlagents.trainers.settings import TorchSettings +from mlagents_envs.logging_util import get_logger + + +logger = get_logger(__name__) + + +def assert_torch_installed(): + # Check that torch version 1.6.0 or later has been installed. If not, refer + # user to the PyTorch webpage for install instructions. + torch_pkg = None + try: + torch_pkg = pkg_resources.get_distribution("torch") + except pkg_resources.DistributionNotFound: + pass + assert torch_pkg is not None and LooseVersion(torch_pkg.version) >= LooseVersion( + "1.6.0" + ), ( + "A compatible version of PyTorch was not installed. Please visit the PyTorch homepage " + + "(https://pytorch.org/get-started/locally/) and follow the instructions to install. " + + "Version 1.6.0 and later are supported." + ) + + +assert_torch_installed() + +# This should be the only place that we import torch directly. 
+# Everywhere else is caught by the banned-modules setting for flake8 +import torch # noqa I201 + + +torch.set_num_threads(cpu_utils.get_num_threads_to_use()) +os.environ["KMP_BLOCKTIME"] = "0" + + +_device = torch.device("cpu") + + +def set_torch_config(torch_settings: TorchSettings) -> None: + global _device + + if torch_settings.device is None: + device_str = "cuda" if torch.cuda.is_available() else "cpu" + else: + device_str = torch_settings.device + + _device = torch.device(device_str) + + if _device.type == "cuda": + torch.set_default_tensor_type(torch.cuda.FloatTensor) + else: + torch.set_default_tensor_type(torch.FloatTensor) + logger.debug(f"default Torch device: {_device}") + + +# Initialize to default settings +set_torch_config(TorchSettings(device=None)) + +nn = torch.nn + + +def default_device(): + return _device diff --git a/MLPY/Lib/site-packages/mlagents/trainers/__init__.py b/MLPY/Lib/site-packages/mlagents/trainers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8476df42fa19998794ea2bcbe8c61682f45f5df5 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/__init__.py @@ -0,0 +1,5 @@ +# Version of the library that will be used to upload to pypi +__version__ = "0.30.0" + +# Git tag that will be checked to determine whether to trigger upload to pypi +__release_tag__ = "release_20" diff --git a/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..92fcaded558ee2337e7c708d6581b2390239b53c Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/action_info.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/action_info.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..335cd4c1c9ea86f970eae15c8f914dc45c390cea Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/action_info.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/agent_processor.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/agent_processor.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1e172529a6e1e7ec4771d30ae304ffb4bb0826df Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/agent_processor.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/behavior_id_utils.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/behavior_id_utils.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d4ca00581e6ad646ea4bac8c71e0f568a76df96c Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/behavior_id_utils.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/buffer.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/buffer.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cc366be6891c31a25958da748dcfa1b1491b6288 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/buffer.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/cli_utils.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/cli_utils.cpython-39.pyc new file mode 
100644 index 0000000000000000000000000000000000000000..1d040950c5e69fd25c8d931e37bc07b0b8f6ab9c Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/cli_utils.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/demo_loader.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/demo_loader.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..92d4e5593951b1cab2ee2e2d85a2710b34559ce0 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/demo_loader.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/directory_utils.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/directory_utils.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1d60eea7e059dca67f6bc64ebbb72f19596eb54d Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/directory_utils.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/env_manager.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/env_manager.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cbdbdbd4865c896d0e37fa2cc936dc45642a8bb1 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/env_manager.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/environment_parameter_manager.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/environment_parameter_manager.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..154850531d958f4f58ba44dd7e853ccef8864f86 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/environment_parameter_manager.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/exception.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/exception.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5afbf16ca335670aa26551916143ab1ddb6c413a Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/exception.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/learn.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/learn.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bb25f1b64c5a0df78e195cf834c81384166609dc Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/learn.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/run_experiment.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/run_experiment.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4956e2b0b085fd5b517dd41bd744a36312b96600 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/run_experiment.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/settings.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/settings.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6d79cc905dc8546cd7e38fe83a948d02e5f31df4 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/settings.cpython-39.pyc differ diff --git 
a/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/simple_env_manager.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/simple_env_manager.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eca690e9d68e59d12c319c68a9cf7603f7b6d7bf Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/simple_env_manager.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/stats.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/stats.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..38dab601aa83e3d4113ad3cf2d9aff78402c42a5 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/stats.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/subprocess_env_manager.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/subprocess_env_manager.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d2aa86d7bd7840e34c9bf8b19b77fab7e617951c Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/subprocess_env_manager.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/trainer_controller.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/trainer_controller.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..40cbcc954fc21678fd4fca10a38be9ef7fbc8934 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/trainer_controller.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/training_analytics_side_channel.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/training_analytics_side_channel.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c443c32390d6263ba3c81a5b388398aeb9fe222b Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/training_analytics_side_channel.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/training_status.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/training_status.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d60965cfd69a9615653cfddf8ec47f406a8342b9 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/training_status.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/trajectory.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/trajectory.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..50bedcdf4fa8a07cee4c6cee2f2dfae5ba137dd6 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/trajectory.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/upgrade_config.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/upgrade_config.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..38487e42cc76f6f3851293d9248d8ac8ccfd24cc Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/__pycache__/upgrade_config.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/action_info.py b/MLPY/Lib/site-packages/mlagents/trainers/action_info.py new file mode 100644 index 
0000000000000000000000000000000000000000..c0ec02327116119c3115891c4acea4227d3791f3 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/action_info.py @@ -0,0 +1,25 @@ +from typing import NamedTuple, Any, Dict, List +import numpy as np +from mlagents_envs.base_env import AgentId + +ActionInfoOutputs = Dict[str, np.ndarray] + + +class ActionInfo(NamedTuple): + """ + A NamedTuple containing actions and related quantities to the policy forward + pass. Additionally contains the agent ids in the corresponding DecisionStep + :param action: The action output of the policy + :param env_action: The possibly clipped action to be executed in the environment + :param outputs: Dict of all quantities associated with the policy forward pass + :param agent_ids: List of int agent ids in DecisionStep + """ + + action: Any + env_action: Any + outputs: ActionInfoOutputs + agent_ids: List[AgentId] + + @staticmethod + def empty() -> "ActionInfo": + return ActionInfo([], [], {}, []) diff --git a/MLPY/Lib/site-packages/mlagents/trainers/agent_processor.py b/MLPY/Lib/site-packages/mlagents/trainers/agent_processor.py new file mode 100644 index 0000000000000000000000000000000000000000..720f3d14bdc9060b8def61b5b8fbead30bbf375e --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/agent_processor.py @@ -0,0 +1,469 @@ +import sys +import numpy as np +from typing import List, Dict, TypeVar, Generic, Tuple, Any, Union +from collections import defaultdict, Counter +import queue +from mlagents.torch_utils import torch + +from mlagents_envs.base_env import ( + ActionTuple, + DecisionSteps, + DecisionStep, + TerminalSteps, + TerminalStep, +) +from mlagents_envs.side_channel.stats_side_channel import ( + StatsAggregationMethod, + EnvironmentStats, +) +from mlagents.trainers.exception import UnityTrainerException +from mlagents.trainers.trajectory import AgentStatus, Trajectory, AgentExperience +from mlagents.trainers.policy import Policy +from mlagents.trainers.action_info import ActionInfo, ActionInfoOutputs +from mlagents.trainers.stats import StatsReporter +from mlagents.trainers.behavior_id_utils import ( + get_global_agent_id, + get_global_group_id, + GlobalAgentId, + GlobalGroupId, +) +from mlagents.trainers.torch_entities.action_log_probs import LogProbsTuple +from mlagents.trainers.torch_entities.utils import ModelUtils + +T = TypeVar("T") + + +class AgentProcessor: + """ + AgentProcessor contains a dictionary per-agent trajectory buffers. The buffers are indexed by agent_id. + Buffer also contains an update_buffer that corresponds to the buffer used when updating the model. + One AgentProcessor should be created per agent group. + """ + + def __init__( + self, + policy: Policy, + behavior_id: str, + stats_reporter: StatsReporter, + max_trajectory_length: int = sys.maxsize, + ): + """ + Create an AgentProcessor. + + :param trainer: Trainer instance connected to this AgentProcessor. Trainer is given trajectory + when it is finished. + :param policy: Policy instance associated with this AgentProcessor. + :param max_trajectory_length: Maximum length of a trajectory before it is added to the trainer. + :param stats_category: The category under which to write the stats. Usually, this comes from the Trainer. + """ + self._experience_buffers: Dict[ + GlobalAgentId, List[AgentExperience] + ] = defaultdict(list) + self._last_step_result: Dict[GlobalAgentId, Tuple[DecisionStep, int]] = {} + # current_group_obs is used to collect the current (i.e. 
the most recently seen) + # obs of all the agents in the same group, and assemble the group obs. + # It is a dictionary of GlobalGroupId to dictionaries of GlobalAgentId to observation. + self._current_group_obs: Dict[ + GlobalGroupId, Dict[GlobalAgentId, List[np.ndarray]] + ] = defaultdict(lambda: defaultdict(list)) + # group_status is used to collect the current, most recently seen + # group status of all the agents in the same group, and assemble the group's status. + # It is a dictionary of GlobalGroupId to dictionaries of GlobalAgentId to AgentStatus. + self._group_status: Dict[ + GlobalGroupId, Dict[GlobalAgentId, AgentStatus] + ] = defaultdict(lambda: defaultdict(None)) + # last_take_action_outputs stores the action a_t taken before the current observation s_(t+1), while + # grabbing previous_action from the policy grabs the action PRIOR to that, a_(t-1). + self._last_take_action_outputs: Dict[GlobalAgentId, ActionInfoOutputs] = {} + + self._episode_steps: Counter = Counter() + self._episode_rewards: Dict[GlobalAgentId, float] = defaultdict(float) + self._stats_reporter = stats_reporter + self._max_trajectory_length = max_trajectory_length + self._trajectory_queues: List[AgentManagerQueue[Trajectory]] = [] + self._behavior_id = behavior_id + + # Note: In the future this policy reference will be the policy of the env_manager and not the trainer. + # We can in that case just grab the action from the policy rather than having it passed in. + self.policy = policy + + def add_experiences( + self, + decision_steps: DecisionSteps, + terminal_steps: TerminalSteps, + worker_id: int, + previous_action: ActionInfo, + ) -> None: + """ + Adds experiences to each agent's experience history. + :param decision_steps: current DecisionSteps. + :param terminal_steps: current TerminalSteps. + :param previous_action: The outputs of the Policy's get_action method. + """ + take_action_outputs = previous_action.outputs + if take_action_outputs: + try: + for _entropy in take_action_outputs["entropy"]: + if isinstance(_entropy, torch.Tensor): + _entropy = ModelUtils.to_numpy(_entropy) + self._stats_reporter.add_stat("Policy/Entropy", _entropy) + except KeyError: + pass + + # Make unique agent_ids that are global across workers + action_global_agent_ids = [ + get_global_agent_id(worker_id, ag_id) for ag_id in previous_action.agent_ids + ] + for global_id in action_global_agent_ids: + if global_id in self._last_step_result: # Don't store if agent just reset + self._last_take_action_outputs[global_id] = take_action_outputs + + # Iterate over all the terminal steps, first gather all the group obs + # and then create the AgentExperiences/Trajectories. _add_to_group_status + # stores Group statuses in a common data structure self.group_status + for terminal_step in terminal_steps.values(): + self._add_group_status_and_obs(terminal_step, worker_id) + for terminal_step in terminal_steps.values(): + local_id = terminal_step.agent_id + global_id = get_global_agent_id(worker_id, local_id) + self._process_step( + terminal_step, worker_id, terminal_steps.agent_id_to_index[local_id] + ) + + # Iterate over all the decision steps, first gather all the group obs + # and then create the trajectories. 
_add_to_group_status + # stores Group statuses in a common data structure self.group_status + for ongoing_step in decision_steps.values(): + self._add_group_status_and_obs(ongoing_step, worker_id) + for ongoing_step in decision_steps.values(): + local_id = ongoing_step.agent_id + self._process_step( + ongoing_step, worker_id, decision_steps.agent_id_to_index[local_id] + ) + # Clear the last seen group obs when agents die, but only after all of the group + # statuses were added to the trajectory. + for terminal_step in terminal_steps.values(): + local_id = terminal_step.agent_id + global_id = get_global_agent_id(worker_id, local_id) + self._clear_group_status_and_obs(global_id) + + for _gid in action_global_agent_ids: + # If the ID doesn't have a last step result, the agent just reset, + # don't store the action. + if _gid in self._last_step_result: + if "action" in take_action_outputs: + self.policy.save_previous_action( + [_gid], take_action_outputs["action"] + ) + + def _add_group_status_and_obs( + self, step: Union[TerminalStep, DecisionStep], worker_id: int + ) -> None: + """ + Takes a TerminalStep or DecisionStep and adds the information in it + to self.group_status. This information can then be retrieved + when constructing trajectories to get the status of group mates. Also stores the current + observation into current_group_obs, to be used to get the next group observations + for bootstrapping. + :param step: TerminalStep or DecisionStep + :param worker_id: Worker ID of this particular environment. Used to generate a + global group id. + """ + global_agent_id = get_global_agent_id(worker_id, step.agent_id) + stored_decision_step, idx = self._last_step_result.get( + global_agent_id, (None, None) + ) + stored_take_action_outputs = self._last_take_action_outputs.get( + global_agent_id, None + ) + if stored_decision_step is not None and stored_take_action_outputs is not None: + # 0, the default group_id, means that the agent doesn't belong to an agent group. + # If 0, don't add any groupmate information. + if step.group_id > 0: + global_group_id = get_global_group_id(worker_id, step.group_id) + stored_actions = stored_take_action_outputs["action"] + action_tuple = ActionTuple( + continuous=stored_actions.continuous[idx], + discrete=stored_actions.discrete[idx], + ) + group_status = AgentStatus( + obs=stored_decision_step.obs, + reward=step.reward, + action=action_tuple, + done=isinstance(step, TerminalStep), + ) + self._group_status[global_group_id][global_agent_id] = group_status + self._current_group_obs[global_group_id][global_agent_id] = step.obs + + def _clear_group_status_and_obs(self, global_id: GlobalAgentId) -> None: + """ + Clears an agent from self._group_status and self._current_group_obs. 
+ """ + self._delete_in_nested_dict(self._current_group_obs, global_id) + self._delete_in_nested_dict(self._group_status, global_id) + + def _delete_in_nested_dict(self, nested_dict: Dict[str, Any], key: str) -> None: + for _manager_id in list(nested_dict.keys()): + _team_group = nested_dict[_manager_id] + self._safe_delete(_team_group, key) + if not _team_group: # if dict is empty + self._safe_delete(nested_dict, _manager_id) + + def _process_step( + self, step: Union[TerminalStep, DecisionStep], worker_id: int, index: int + ) -> None: + terminated = isinstance(step, TerminalStep) + global_agent_id = get_global_agent_id(worker_id, step.agent_id) + global_group_id = get_global_group_id(worker_id, step.group_id) + stored_decision_step, idx = self._last_step_result.get( + global_agent_id, (None, None) + ) + stored_take_action_outputs = self._last_take_action_outputs.get( + global_agent_id, None + ) + if not terminated: + # Index is needed to grab from last_take_action_outputs + self._last_step_result[global_agent_id] = (step, index) + + # This state is the consequence of a past action + if stored_decision_step is not None and stored_take_action_outputs is not None: + obs = stored_decision_step.obs + if self.policy.use_recurrent: + memory = self.policy.retrieve_previous_memories([global_agent_id])[0, :] + else: + memory = None + done = terminated # Since this is an ongoing step + interrupted = step.interrupted if terminated else False + # Add the outputs of the last eval + stored_actions = stored_take_action_outputs["action"] + action_tuple = ActionTuple( + continuous=stored_actions.continuous[idx], + discrete=stored_actions.discrete[idx], + ) + try: + stored_action_probs = stored_take_action_outputs["log_probs"] + if not isinstance(stored_action_probs, LogProbsTuple): + stored_action_probs = stored_action_probs.to_log_probs_tuple() + log_probs_tuple = LogProbsTuple( + continuous=stored_action_probs.continuous[idx], + discrete=stored_action_probs.discrete[idx], + ) + except KeyError: + log_probs_tuple = LogProbsTuple.empty_log_probs() + + action_mask = stored_decision_step.action_mask + prev_action = self.policy.retrieve_previous_action([global_agent_id])[0, :] + + # Assemble teammate_obs. If none saved, then it will be an empty list. 
+ group_statuses = [] + for _id, _mate_status in self._group_status[global_group_id].items(): + if _id != global_agent_id: + group_statuses.append(_mate_status) + + experience = AgentExperience( + obs=obs, + reward=step.reward, + done=done, + action=action_tuple, + action_probs=log_probs_tuple, + action_mask=action_mask, + prev_action=prev_action, + interrupted=interrupted, + memory=memory, + group_status=group_statuses, + group_reward=step.group_reward, + ) + # Add the value outputs if needed + self._experience_buffers[global_agent_id].append(experience) + self._episode_rewards[global_agent_id] += step.reward + if not terminated: + self._episode_steps[global_agent_id] += 1 + + # Add a trajectory segment to the buffer if terminal or the length has reached the time horizon + if ( + len(self._experience_buffers[global_agent_id]) + >= self._max_trajectory_length + or terminated + ): + next_obs = step.obs + next_group_obs = [] + for _id, _obs in self._current_group_obs[global_group_id].items(): + if _id != global_agent_id: + next_group_obs.append(_obs) + + trajectory = Trajectory( + steps=self._experience_buffers[global_agent_id], + agent_id=global_agent_id, + next_obs=next_obs, + next_group_obs=next_group_obs, + behavior_id=self._behavior_id, + ) + for traj_queue in self._trajectory_queues: + traj_queue.put(trajectory) + self._experience_buffers[global_agent_id] = [] + if terminated: + # Record episode length. + self._stats_reporter.add_stat( + "Environment/Episode Length", + self._episode_steps.get(global_agent_id, 0), + ) + self._clean_agent_data(global_agent_id) + + def _clean_agent_data(self, global_id: GlobalAgentId) -> None: + """ + Removes the data for an Agent. + """ + self._safe_delete(self._experience_buffers, global_id) + self._safe_delete(self._last_take_action_outputs, global_id) + self._safe_delete(self._last_step_result, global_id) + self._safe_delete(self._episode_steps, global_id) + self._safe_delete(self._episode_rewards, global_id) + self.policy.remove_previous_action([global_id]) + self.policy.remove_memories([global_id]) + + def _safe_delete(self, my_dictionary: Dict[Any, Any], key: Any) -> None: + """ + Safe removes data from a dictionary. If not found, + don't delete. + """ + if key in my_dictionary: + del my_dictionary[key] + + def publish_trajectory_queue( + self, trajectory_queue: "AgentManagerQueue[Trajectory]" + ) -> None: + """ + Adds a trajectory queue to the list of queues to publish to when this AgentProcessor + assembles a Trajectory + :param trajectory_queue: Trajectory queue to publish to. + """ + self._trajectory_queues.append(trajectory_queue) + + def end_episode(self) -> None: + """ + Ends the episode, terminating the current trajectory and stopping stats collection for that + episode. Used for forceful reset (e.g. in curriculum or generalization training.) + """ + all_gids = list(self._experience_buffers.keys()) # Need to make copy + for _gid in all_gids: + self._clean_agent_data(_gid) + + +class AgentManagerQueue(Generic[T]): + """ + Queue used by the AgentManager. Note that we make our own class here because in most implementations + deque is sufficient and faster. However, if we want to switch to multiprocessing, we'll need to change + out this implementation. + """ + + class Empty(Exception): + """ + Exception for when the queue is empty. + """ + + pass + + def __init__(self, behavior_id: str, maxlen: int = 0): + """ + Initializes an AgentManagerQueue. 
Note that we can give it a behavior_id so that it can be identified + separately from an AgentManager. + """ + self._maxlen: int = maxlen + self._queue: queue.Queue = queue.Queue(maxsize=maxlen) + self._behavior_id = behavior_id + + @property + def maxlen(self): + """ + The maximum length of the queue. + :return: Maximum length of the queue. + """ + return self._maxlen + + @property + def behavior_id(self): + """ + The Behavior ID of this queue. + :return: Behavior ID associated with the queue. + """ + return self._behavior_id + + def qsize(self) -> int: + """ + Returns the approximate size of the queue. Note that values may differ + depending on the underlying queue implementation. + """ + return self._queue.qsize() + + def empty(self) -> bool: + return self._queue.empty() + + def get_nowait(self) -> T: + """ + Gets the next item from the queue, throwing an AgentManagerQueue.Empty exception + if the queue is empty. + """ + try: + return self._queue.get_nowait() + except queue.Empty: + raise self.Empty("The AgentManagerQueue is empty.") + + def put(self, item: T) -> None: + self._queue.put(item) + + +class AgentManager(AgentProcessor): + """ + An AgentManager is an AgentProcessor that also holds a single trajectory and policy queue. + Note: this leaves room for adding AgentProcessors that publish multiple trajectory queues. + """ + + def __init__( + self, + policy: Policy, + behavior_id: str, + stats_reporter: StatsReporter, + max_trajectory_length: int = sys.maxsize, + threaded: bool = True, + ): + super().__init__(policy, behavior_id, stats_reporter, max_trajectory_length) + trajectory_queue_len = 20 if threaded else 0 + self.trajectory_queue: AgentManagerQueue[Trajectory] = AgentManagerQueue( + self._behavior_id, maxlen=trajectory_queue_len + ) + # NOTE: we make policy queues of infinite length to avoid lockups of the trainers. + # In the environment manager, we make sure to empty the policy queue before continuing to produce steps. + self.policy_queue: AgentManagerQueue[Policy] = AgentManagerQueue( + self._behavior_id, maxlen=0 + ) + self.publish_trajectory_queue(self.trajectory_queue) + + def record_environment_stats( + self, env_stats: EnvironmentStats, worker_id: int + ) -> None: + """ + Pass stats from the environment to the StatsReporter. + Depending on the StatsAggregationMethod, either StatsReporter.add_stat or StatsReporter.set_stat is used. + The worker_id is used to determine whether StatsReporter.set_stat should be used. + + :param env_stats: + :param worker_id: + :return: + """ + for stat_name, value_list in env_stats.items(): + for val, agg_type in value_list: + if agg_type == StatsAggregationMethod.AVERAGE: + self._stats_reporter.add_stat(stat_name, val, agg_type) + elif agg_type == StatsAggregationMethod.SUM: + self._stats_reporter.add_stat(stat_name, val, agg_type) + elif agg_type == StatsAggregationMethod.HISTOGRAM: + self._stats_reporter.add_stat(stat_name, val, agg_type) + elif agg_type == StatsAggregationMethod.MOST_RECENT: + # In order to prevent conflicts between multiple environments, + # only stats from the first environment are recorded. + if worker_id == 0: + self._stats_reporter.set_stat(stat_name, val) + else: + raise UnityTrainerException( + f"Unknown StatsAggregationMethod encountered. 
{agg_type}" + ) diff --git a/MLPY/Lib/site-packages/mlagents/trainers/behavior_id_utils.py b/MLPY/Lib/site-packages/mlagents/trainers/behavior_id_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..c23069161b8996c2a148b87ca884589cc0c7e8e3 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/behavior_id_utils.py @@ -0,0 +1,64 @@ +from typing import NamedTuple +from urllib.parse import urlparse, parse_qs +from mlagents_envs.base_env import AgentId, GroupId + +GlobalGroupId = str +GlobalAgentId = str + + +class BehaviorIdentifiers(NamedTuple): + """ + BehaviorIdentifiers is a named tuple of the identifiers that uniquely distinguish + an agent encountered in the trainer_controller. The named tuple consists of the + fully qualified behavior name, the name of the brain name (corresponds to trainer + in the trainer controller) and the team id. In the future, this can be extended + to support further identifiers. + """ + + behavior_id: str + brain_name: str + team_id: int + + @staticmethod + def from_name_behavior_id(name_behavior_id: str) -> "BehaviorIdentifiers": + """ + Parses a name_behavior_id of the form name?team=0 + into a BehaviorIdentifiers NamedTuple. + This allows you to access the brain name and team id of an agent + :param name_behavior_id: String of behavior params in HTTP format. + :returns: A BehaviorIdentifiers object. + """ + + parsed = urlparse(name_behavior_id) + name = parsed.path + ids = parse_qs(parsed.query) + team_id: int = 0 + if "team" in ids: + team_id = int(ids["team"][0]) + return BehaviorIdentifiers( + behavior_id=name_behavior_id, brain_name=name, team_id=team_id + ) + + +def create_name_behavior_id(name: str, team_id: int) -> str: + """ + Reconstructs fully qualified behavior name from name and team_id + :param name: brain name + :param team_id: team ID + :return: name_behavior_id + """ + return name + "?team=" + str(team_id) + + +def get_global_agent_id(worker_id: int, agent_id: AgentId) -> GlobalAgentId: + """ + Create an agent id that is unique across environment workers using the worker_id. + """ + return f"agent_{worker_id}-{agent_id}" + + +def get_global_group_id(worker_id: int, group_id: GroupId) -> GlobalGroupId: + """ + Create a group id that is unique across environment workers when using the worker_id. + """ + return f"group_{worker_id}-{group_id}" diff --git a/MLPY/Lib/site-packages/mlagents/trainers/buffer.py b/MLPY/Lib/site-packages/mlagents/trainers/buffer.py new file mode 100644 index 0000000000000000000000000000000000000000..ea6a2d51114906d9c755f4b7d0a411cdd345846e --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/buffer.py @@ -0,0 +1,521 @@ +from collections import defaultdict +from collections.abc import MutableMapping +import enum +import itertools +from typing import BinaryIO, DefaultDict, List, Tuple, Union, Optional + +import numpy as np +import h5py + +from mlagents_envs.exception import UnityException + +# Elements in the buffer can be np.ndarray, or in the case of teammate obs, actions, rewards, +# a List of np.ndarray. This is done so that we don't have duplicated np.ndarrays, only references. +BufferEntry = Union[np.ndarray, List[np.ndarray]] + + +class BufferException(UnityException): + """ + Related to errors with the Buffer. 
+ """ + + pass + + +class BufferKey(enum.Enum): + ACTION_MASK = "action_mask" + CONTINUOUS_ACTION = "continuous_action" + NEXT_CONT_ACTION = "next_continuous_action" + CONTINUOUS_LOG_PROBS = "continuous_log_probs" + DISCRETE_ACTION = "discrete_action" + NEXT_DISC_ACTION = "next_discrete_action" + DISCRETE_LOG_PROBS = "discrete_log_probs" + DONE = "done" + ENVIRONMENT_REWARDS = "environment_rewards" + MASKS = "masks" + MEMORY = "memory" + CRITIC_MEMORY = "critic_memory" + BASELINE_MEMORY = "poca_baseline_memory" + PREV_ACTION = "prev_action" + + ADVANTAGES = "advantages" + DISCOUNTED_RETURNS = "discounted_returns" + + GROUP_DONES = "group_dones" + GROUPMATE_REWARDS = "groupmate_reward" + GROUP_REWARD = "group_reward" + GROUP_CONTINUOUS_ACTION = "group_continuous_action" + GROUP_DISCRETE_ACTION = "group_discrete_aaction" + GROUP_NEXT_CONT_ACTION = "group_next_cont_action" + GROUP_NEXT_DISC_ACTION = "group_next_disc_action" + + +class ObservationKeyPrefix(enum.Enum): + OBSERVATION = "obs" + NEXT_OBSERVATION = "next_obs" + + GROUP_OBSERVATION = "group_obs" + NEXT_GROUP_OBSERVATION = "next_group_obs" + + +class RewardSignalKeyPrefix(enum.Enum): + # Reward signals + REWARDS = "rewards" + VALUE_ESTIMATES = "value_estimates" + RETURNS = "returns" + ADVANTAGE = "advantage" + BASELINES = "baselines" + + +AgentBufferKey = Union[ + BufferKey, Tuple[ObservationKeyPrefix, int], Tuple[RewardSignalKeyPrefix, str] +] + + +class RewardSignalUtil: + @staticmethod + def rewards_key(name: str) -> AgentBufferKey: + return RewardSignalKeyPrefix.REWARDS, name + + @staticmethod + def value_estimates_key(name: str) -> AgentBufferKey: + return RewardSignalKeyPrefix.RETURNS, name + + @staticmethod + def returns_key(name: str) -> AgentBufferKey: + return RewardSignalKeyPrefix.RETURNS, name + + @staticmethod + def advantage_key(name: str) -> AgentBufferKey: + return RewardSignalKeyPrefix.ADVANTAGE, name + + @staticmethod + def baseline_estimates_key(name: str) -> AgentBufferKey: + return RewardSignalKeyPrefix.BASELINES, name + + +class AgentBufferField(list): + """ + AgentBufferField is a list of numpy arrays, or List[np.ndarray] for group entries. + When an agent collects a field, you can add it to its AgentBufferField with the append method. + """ + + def __init__(self, *args, **kwargs): + self.padding_value = 0 + super().__init__(*args, **kwargs) + + def __str__(self) -> str: + return f"AgentBufferField: {super().__str__()}" + + def __getitem__(self, index): + return_data = super().__getitem__(index) + if isinstance(return_data, list): + return AgentBufferField(return_data) + else: + return return_data + + @property + def contains_lists(self) -> bool: + """ + Checks whether this AgentBufferField contains List[np.ndarray]. + """ + return len(self) > 0 and isinstance(self[0], list) + + def append(self, element: BufferEntry, padding_value: float = 0.0) -> None: + """ + Adds an element to this list. Also lets you change the padding + type, so that it can be set on append (e.g. action_masks should + be padded with 1.) + :param element: The element to append to the list. + :param padding_value: The value used to pad when get_batch is called. + """ + super().append(element) + self.padding_value = padding_value + + def set(self, data: List[BufferEntry]) -> None: + """ + Sets the list of BufferEntry to the input data + :param data: The BufferEntry list to be set. 
+ """ + self[:] = data + + def get_batch( + self, + batch_size: int = None, + training_length: Optional[int] = 1, + sequential: bool = True, + ) -> List[BufferEntry]: + """ + Retrieve the last batch_size elements of length training_length + from the list of np.array + :param batch_size: The number of elements to retrieve. If None: + All elements will be retrieved. + :param training_length: The length of the sequence to be retrieved. If + None: only takes one element. + :param sequential: If true and training_length is not None: the elements + will not repeat in the sequence. [a,b,c,d,e] with training_length = 2 and + sequential=True gives [[0,a],[b,c],[d,e]]. If sequential=False gives + [[a,b],[b,c],[c,d],[d,e]] + """ + if training_length is None: + training_length = 1 + if sequential: + # The sequences will not have overlapping elements (this involves padding) + leftover = len(self) % training_length + # leftover is the number of elements in the first sequence (this sequence might need 0 padding) + if batch_size is None: + # retrieve the maximum number of elements + batch_size = len(self) // training_length + 1 * (leftover != 0) + # The maximum number of sequences taken from a list of length len(self) without overlapping + # with padding is equal to batch_size + if batch_size > (len(self) // training_length + 1 * (leftover != 0)): + raise BufferException( + "The batch size and training length requested for get_batch where" + " too large given the current number of data points." + ) + if batch_size * training_length > len(self): + if self.contains_lists: + padding = [] + else: + # We want to duplicate the last value in the array, multiplied by the padding_value. + padding = np.array(self[-1], dtype=np.float32) * self.padding_value + return self[:] + [padding] * (training_length - leftover) + + else: + return self[len(self) - batch_size * training_length :] + else: + # The sequences will have overlapping elements + if batch_size is None: + # retrieve the maximum number of elements + batch_size = len(self) - training_length + 1 + # The number of sequences of length training_length taken from a list of len(self) elements + # with overlapping is equal to batch_size + if (len(self) - training_length + 1) < batch_size: + raise BufferException( + "The batch size and training length requested for get_batch where" + " too large given the current number of data points." + ) + tmp_list: List[np.ndarray] = [] + for end in range(len(self) - batch_size + 1, len(self) + 1): + tmp_list += self[end - training_length : end] + return tmp_list + + def reset_field(self) -> None: + """ + Resets the AgentBufferField + """ + self[:] = [] + + def padded_to_batch( + self, pad_value: np.float = 0, dtype: np.dtype = np.float32 + ) -> Union[np.ndarray, List[np.ndarray]]: + """ + Converts this AgentBufferField (which is a List[BufferEntry]) into a numpy array + with first dimension equal to the length of this AgentBufferField. If this AgentBufferField + contains a List[List[BufferEntry]] (i.e., in the case of group observations), return a List + containing numpy arrays or tensors, of length equal to the maximum length of an entry. Missing + For entries with less than that length, the array will be padded with pad_value. + :param pad_value: Value to pad List AgentBufferFields, when there are less than the maximum + number of agents present. + :param dtype: Dtype of output numpy array. 
+ :return: Numpy array or List of numpy arrays representing this AgentBufferField, where the first + dimension is equal to the length of the AgentBufferField. + """ + if len(self) > 0 and not isinstance(self[0], list): + return np.asanyarray(self, dtype=dtype) + + shape = None + for _entry in self: + # _entry could be an empty list if there are no group agents in this + # step. Find the first non-empty list and use that shape. + if _entry: + shape = _entry[0].shape + break + # If there were no groupmate agents in the entire batch, return an empty List. + if shape is None: + return [] + + # Convert to numpy array while padding with 0's + new_list = list( + map( + lambda x: np.asanyarray(x, dtype=dtype), + itertools.zip_longest(*self, fillvalue=np.full(shape, pad_value)), + ) + ) + return new_list + + def to_ndarray(self): + """ + Returns the AgentBufferField which is a list of numpy ndarrays (or List[np.ndarray]) as an ndarray. + """ + return np.array(self) + + +class AgentBuffer(MutableMapping): + """ + AgentBuffer contains a dictionary of AgentBufferFields. Each agent has his own AgentBuffer. + The keys correspond to the name of the field. Example: state, action + """ + + # Whether or not to validate the types of keys at runtime + # This should be off for training, but enabled for testing + CHECK_KEY_TYPES_AT_RUNTIME = False + + def __init__(self): + self.last_brain_info = None + self.last_take_action_outputs = None + self._fields: DefaultDict[AgentBufferKey, AgentBufferField] = defaultdict( + AgentBufferField + ) + + def __str__(self): + return ", ".join([f"'{k}' : {str(self[k])}" for k in self._fields.keys()]) + + def reset_agent(self) -> None: + """ + Resets the AgentBuffer + """ + for f in self._fields.values(): + f.reset_field() + self.last_brain_info = None + self.last_take_action_outputs = None + + @staticmethod + def _check_key(key): + if isinstance(key, BufferKey): + return + if isinstance(key, tuple): + key0, key1 = key + if isinstance(key0, ObservationKeyPrefix): + if isinstance(key1, int): + return + raise KeyError(f"{key} has type ({type(key0)}, {type(key1)})") + if isinstance(key0, RewardSignalKeyPrefix): + if isinstance(key1, str): + return + raise KeyError(f"{key} has type ({type(key0)}, {type(key1)})") + raise KeyError(f"{key} is a {type(key)}") + + @staticmethod + def _encode_key(key: AgentBufferKey) -> str: + """ + Convert the key to a string representation so that it can be used for serialization. + """ + if isinstance(key, BufferKey): + return key.value + prefix, suffix = key + return f"{prefix.value}:{suffix}" + + @staticmethod + def _decode_key(encoded_key: str) -> AgentBufferKey: + """ + Convert the string representation back to a key after serialization. 
+ """ + # Simple case: convert the string directly to a BufferKey + try: + return BufferKey(encoded_key) + except ValueError: + pass + + # Not a simple key, so split into two parts + prefix_str, _, suffix_str = encoded_key.partition(":") + + # See if it's an ObservationKeyPrefix first + try: + return ObservationKeyPrefix(prefix_str), int(suffix_str) + except ValueError: + pass + + # If not, it had better be a RewardSignalKeyPrefix + try: + return RewardSignalKeyPrefix(prefix_str), suffix_str + except ValueError: + raise ValueError(f"Unable to convert {encoded_key} to an AgentBufferKey") + + def __getitem__(self, key: AgentBufferKey) -> AgentBufferField: + if self.CHECK_KEY_TYPES_AT_RUNTIME: + self._check_key(key) + return self._fields[key] + + def __setitem__(self, key: AgentBufferKey, value: AgentBufferField) -> None: + if self.CHECK_KEY_TYPES_AT_RUNTIME: + self._check_key(key) + self._fields[key] = value + + def __delitem__(self, key: AgentBufferKey) -> None: + if self.CHECK_KEY_TYPES_AT_RUNTIME: + self._check_key(key) + self._fields.__delitem__(key) + + def __iter__(self): + return self._fields.__iter__() + + def __len__(self) -> int: + return self._fields.__len__() + + def __contains__(self, key): + if self.CHECK_KEY_TYPES_AT_RUNTIME: + self._check_key(key) + return self._fields.__contains__(key) + + def check_length(self, key_list: List[AgentBufferKey]) -> bool: + """ + Some methods will require that some fields have the same length. + check_length will return true if the fields in key_list + have the same length. + :param key_list: The fields which length will be compared + """ + if self.CHECK_KEY_TYPES_AT_RUNTIME: + for k in key_list: + self._check_key(k) + + if len(key_list) < 2: + return True + length = None + for key in key_list: + if key not in self._fields: + return False + if (length is not None) and (length != len(self[key])): + return False + length = len(self[key]) + return True + + def shuffle( + self, sequence_length: int, key_list: List[AgentBufferKey] = None + ) -> None: + """ + Shuffles the fields in key_list in a consistent way: The reordering will + be the same across fields. + :param key_list: The fields that must be shuffled. + """ + if key_list is None: + key_list = list(self._fields.keys()) + if not self.check_length(key_list): + raise BufferException( + "Unable to shuffle if the fields are not of same length" + ) + s = np.arange(len(self[key_list[0]]) // sequence_length) + np.random.shuffle(s) + for key in key_list: + buffer_field = self[key] + tmp: List[np.ndarray] = [] + for i in s: + tmp += buffer_field[i * sequence_length : (i + 1) * sequence_length] + buffer_field.set(tmp) + + def make_mini_batch(self, start: int, end: int) -> "AgentBuffer": + """ + Creates a mini-batch from buffer. + :param start: Starting index of buffer. + :param end: Ending index of buffer. + :return: Dict of mini batch. + """ + mini_batch = AgentBuffer() + for key, field in self._fields.items(): + # slicing AgentBufferField returns a List[Any} + mini_batch[key] = field[start:end] # type: ignore + return mini_batch + + def sample_mini_batch( + self, batch_size: int, sequence_length: int = 1 + ) -> "AgentBuffer": + """ + Creates a mini-batch from a random start and end. + :param batch_size: number of elements to withdraw. + :param sequence_length: Length of sequences to sample. + Number of sequences to sample will be batch_size/sequence_length. 
+ """ + num_seq_to_sample = batch_size // sequence_length + mini_batch = AgentBuffer() + buff_len = self.num_experiences + num_sequences_in_buffer = buff_len // sequence_length + start_idxes = ( + np.random.randint(num_sequences_in_buffer, size=num_seq_to_sample) + * sequence_length + ) # Sample random sequence starts + for key in self: + buffer_field = self[key] + mb_list = (buffer_field[i : i + sequence_length] for i in start_idxes) + # See comparison of ways to make a list from a list of lists here: + # https://stackoverflow.com/questions/952914/how-to-make-a-flat-list-out-of-list-of-lists + mini_batch[key].set(list(itertools.chain.from_iterable(mb_list))) + return mini_batch + + def save_to_file(self, file_object: BinaryIO) -> None: + """ + Saves the AgentBuffer to a file-like object. + """ + with h5py.File(file_object, "w") as write_file: + for key, data in self.items(): + write_file.create_dataset( + self._encode_key(key), data=data, dtype="f", compression="gzip" + ) + + def load_from_file(self, file_object: BinaryIO) -> None: + """ + Loads the AgentBuffer from a file-like object. + """ + with h5py.File(file_object, "r") as read_file: + for key in list(read_file.keys()): + decoded_key = self._decode_key(key) + self[decoded_key] = AgentBufferField() + # extend() will convert the numpy array's first dimension into list + self[decoded_key].extend(read_file[key][()]) + + def truncate(self, max_length: int, sequence_length: int = 1) -> None: + """ + Truncates the buffer to a certain length. + + This can be slow for large buffers. We compensate by cutting further than we need to, so that + we're not truncating at each update. Note that we must truncate an integer number of sequence_lengths + param: max_length: The length at which to truncate the buffer. + """ + current_length = self.num_experiences + # make max_length an integer number of sequence_lengths + max_length -= max_length % sequence_length + if current_length > max_length: + for _key in self.keys(): + self[_key][:] = self[_key][current_length - max_length :] + + def resequence_and_append( + self, + target_buffer: "AgentBuffer", + key_list: List[AgentBufferKey] = None, + batch_size: int = None, + training_length: int = None, + ) -> None: + """ + Takes in a batch size and training length (sequence length), and appends this AgentBuffer to target_buffer + properly padded for LSTM use. Optionally, use key_list to restrict which fields are inserted into the new + buffer. + :param target_buffer: The buffer which to append the samples to. + :param key_list: The fields that must be added. If None: all fields will be appended. + :param batch_size: The number of elements that must be appended. If None: All of them will be. + :param training_length: The length of the samples that must be appended. If None: only takes one element. + """ + if key_list is None: + key_list = list(self.keys()) + if not self.check_length(key_list): + raise BufferException( + f"The length of the fields {key_list} were not of same length" + ) + for field_key in key_list: + target_buffer[field_key].extend( + self[field_key].get_batch( + batch_size=batch_size, training_length=training_length + ) + ) + + @property + def num_experiences(self) -> int: + """ + The number of agent experiences in the AgentBuffer, i.e. the length of the buffer. + + An experience consists of one element across all of the fields of this AgentBuffer. + Note that these all have to be the same length, otherwise shuffle and append_to_update_buffer + will fail. 
+ """ + if self.values(): + return len(next(iter(self.values()))) + else: + return 0 diff --git a/MLPY/Lib/site-packages/mlagents/trainers/cli_utils.py b/MLPY/Lib/site-packages/mlagents/trainers/cli_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..de420c42a42c5519f196c502f77ad36f80346abb --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/cli_utils.py @@ -0,0 +1,331 @@ +from typing import Set, Dict, Any, TextIO +import os +import yaml +from mlagents.trainers.exception import TrainerConfigError +from mlagents_envs.environment import UnityEnvironment +import argparse +from mlagents_envs import logging_util + +logger = logging_util.get_logger(__name__) + + +class RaiseRemovedWarning(argparse.Action): + """ + Internal custom Action to raise warning when argument is called. + """ + + def __init__(self, nargs=0, **kwargs): + super().__init__(nargs=nargs, **kwargs) + + def __call__(self, arg_parser, namespace, values, option_string=None): + logger.warning(f"The command line argument {option_string} was removed.") + + +class DetectDefault(argparse.Action): + """ + Internal custom Action to help detect arguments that aren't default. + """ + + non_default_args: Set[str] = set() + + def __call__(self, arg_parser, namespace, values, option_string=None): + setattr(namespace, self.dest, values) + DetectDefault.non_default_args.add(self.dest) + + +class DetectDefaultStoreTrue(DetectDefault): + """ + Internal class to help detect arguments that aren't default. + Used for store_true arguments. + """ + + def __init__(self, nargs=0, **kwargs): + super().__init__(nargs=nargs, **kwargs) + + def __call__(self, arg_parser, namespace, values, option_string=None): + super().__call__(arg_parser, namespace, True, option_string) + + +class StoreConfigFile(argparse.Action): + """ + Custom Action to store the config file location not as part of the CLI args. + This is because we want to maintain an equivalence between the config file's + contents and the args themselves. + """ + + trainer_config_path: str + + def __call__(self, arg_parser, namespace, values, option_string=None): + delattr(namespace, self.dest) + StoreConfigFile.trainer_config_path = values + + +def _create_parser() -> argparse.ArgumentParser: + argparser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + argparser.add_argument( + "trainer_config_path", action=StoreConfigFile, nargs="?", default=None + ) + argparser.add_argument( + "--env", + default=None, + dest="env_path", + help="Path to the Unity executable to train", + action=DetectDefault, + ) + argparser.add_argument( + "--load", + default=False, + dest="load_model", + action=DetectDefaultStoreTrue, + help=argparse.SUPPRESS, # Deprecated but still usable for now. + ) + argparser.add_argument( + "--resume", + default=False, + dest="resume", + action=DetectDefaultStoreTrue, + help="Whether to resume training from a checkpoint. Specify a --run-id to use this option. " + "If set, the training code loads an already trained model to initialize the neural network " + "before resuming training. This option is only valid when the models exist, and have the same " + "behavior names as the current agents in your scene.", + ) + argparser.add_argument( + "--deterministic", + default=False, + dest="deterministic", + action=DetectDefaultStoreTrue, + help="Whether to select actions deterministically in policy. 
`dist.mean` for continuous action " + "space, and `dist.argmax` for discrete action space ", + ) + argparser.add_argument( + "--force", + default=False, + dest="force", + action=DetectDefaultStoreTrue, + help="Whether to force-overwrite this run-id's existing summary and model data. (Without " + "this flag, attempting to train a model with a run-id that has been used before will throw " + "an error.)", + ) + argparser.add_argument( + "--run-id", + default="ppo", + help="The identifier for the training run. This identifier is used to name the " + "subdirectories in which the trained model and summary statistics are saved as well " + "as the saved model itself. If you use TensorBoard to view the training statistics, " + "always set a unique run-id for each training run. (The statistics for all runs with the " + "same id are combined as if they were produced by the same session.)", + action=DetectDefault, + ) + argparser.add_argument( + "--initialize-from", + metavar="RUN_ID", + default=None, + help="Specify a previously saved run ID from which to initialize the model. " + "This can be used, for instance, to fine-tune an existing model on a new environment. " + "Note that the previously saved models must have the same behavior parameters as your " + "current environment.", + action=DetectDefault, + ) + argparser.add_argument( + "--seed", + default=-1, + type=int, + help="A number to use as a seed for the random number generator used by the training code", + action=DetectDefault, + ) + argparser.add_argument( + "--train", + default=False, + dest="train_model", + action=DetectDefaultStoreTrue, + help=argparse.SUPPRESS, + ) + argparser.add_argument( + "--inference", + default=False, + dest="inference", + action=DetectDefaultStoreTrue, + help="Whether to run in Python inference mode (i.e. no training). Use with --resume to load " + "a model trained with an existing run ID.", + ) + argparser.add_argument( + "--base-port", + default=UnityEnvironment.BASE_ENVIRONMENT_PORT, + type=int, + help="The starting port for environment communication. Each concurrent Unity environment " + "instance will get assigned a port sequentially, starting from the base-port. Each instance " + "will use the port (base_port + worker_id), where the worker_id is a sequential ID given to " + "each instance from 0 to (num_envs - 1). Note that when training using the Editor rather " + "than an executable, the base port will be ignored.", + action=DetectDefault, + ) + argparser.add_argument( + "--num-envs", + default=1, + type=int, + help="The number of concurrent Unity environment instances to collect experiences " + "from when training", + action=DetectDefault, + ) + + argparser.add_argument( + "--num-areas", + default=1, + type=int, + help="The number of parallel training areas in each Unity environment instance.", + action=DetectDefault, + ) + + argparser.add_argument( + "--debug", + default=False, + action=DetectDefaultStoreTrue, + help="Whether to enable debug-level logging for some parts of the code", + ) + argparser.add_argument( + "--env-args", + default=None, + nargs=argparse.REMAINDER, + help="Arguments passed to the Unity executable. Be aware that the standalone build will also " + "process these as Unity Command Line Arguments. You should choose different argument names if " + "you want to create environment-specific arguments. 
All arguments after this flag will be " + "passed to the executable.", + action=DetectDefault, + ) + argparser.add_argument( + "--max-lifetime-restarts", + default=10, + help="The max number of times a single Unity executable can crash over its lifetime before ml-agents exits. " + "Can be set to -1 if no limit is desired.", + action=DetectDefault, + ) + argparser.add_argument( + "--restarts-rate-limit-n", + default=1, + help="The maximum number of times a single Unity executable can crash over a period of time (period set in " + "restarts-rate-limit-period-s). Can be set to -1 to not use rate limiting with restarts.", + action=DetectDefault, + ) + argparser.add_argument( + "--restarts-rate-limit-period-s", + default=60, + help="The period of time --restarts-rate-limit-n applies to.", + action=DetectDefault, + ) + argparser.add_argument( + "--torch", + default=False, + action=RaiseRemovedWarning, + help="(Removed) Use the PyTorch framework.", + ) + argparser.add_argument( + "--tensorflow", + default=False, + action=RaiseRemovedWarning, + help="(Removed) Use the TensorFlow framework.", + ) + argparser.add_argument( + "--results-dir", + default="results", + action=DetectDefault, + help="Results base directory", + ) + + eng_conf = argparser.add_argument_group(title="Engine Configuration") + eng_conf.add_argument( + "--width", + default=84, + type=int, + help="The width of the executable window of the environment(s) in pixels " + "(ignored for editor training).", + action=DetectDefault, + ) + eng_conf.add_argument( + "--height", + default=84, + type=int, + help="The height of the executable window of the environment(s) in pixels " + "(ignored for editor training)", + action=DetectDefault, + ) + eng_conf.add_argument( + "--quality-level", + default=5, + type=int, + help="The quality level of the environment(s). Equivalent to calling " + "QualitySettings.SetQualityLevel in Unity.", + action=DetectDefault, + ) + eng_conf.add_argument( + "--time-scale", + default=20, + type=float, + help="The time scale of the Unity environment(s). Equivalent to setting " + "Time.timeScale in Unity.", + action=DetectDefault, + ) + eng_conf.add_argument( + "--target-frame-rate", + default=-1, + type=int, + help="The target frame rate of the Unity environment(s). Equivalent to setting " + "Application.targetFrameRate in Unity.", + action=DetectDefault, + ) + eng_conf.add_argument( + "--capture-frame-rate", + default=60, + type=int, + help="The capture frame rate of the Unity environment(s). Equivalent to setting " + "Time.captureFramerate in Unity.", + action=DetectDefault, + ) + eng_conf.add_argument( + "--no-graphics", + default=False, + action=DetectDefaultStoreTrue, + help="Whether to run the Unity executable in no-graphics mode (i.e. without initializing " + "the graphics driver. 
Use this only if your agents don't use visual observations.", + ) + + torch_conf = argparser.add_argument_group(title="Torch Configuration") + torch_conf.add_argument( + "--torch-device", + default=None, + dest="device", + action=DetectDefault, + help='Settings for the default torch.device used in training, for example, "cpu", "cuda", or "cuda:0"', + ) + return argparser + + +def load_config(config_path: str) -> Dict[str, Any]: + try: + with open(config_path) as data_file: + return _load_config(data_file) + except OSError: + abs_path = os.path.abspath(config_path) + raise TrainerConfigError(f"Config file could not be found at {abs_path}.") + except UnicodeDecodeError: + raise TrainerConfigError( + f"There was an error decoding Config file from {config_path}. " + f"Make sure your file is save using UTF-8" + ) + + +def _load_config(fp: TextIO) -> Dict[str, Any]: + """ + Load the yaml config from the file-like object. + """ + try: + return yaml.safe_load(fp) + except yaml.parser.ParserError as e: + raise TrainerConfigError( + "Error parsing yaml file. Please check for formatting errors. " + "A tool such as http://www.yamllint.com/ can be helpful with this." + ) from e + + +parser = _create_parser() diff --git a/MLPY/Lib/site-packages/mlagents/trainers/demo_loader.py b/MLPY/Lib/site-packages/mlagents/trainers/demo_loader.py new file mode 100644 index 0000000000000000000000000000000000000000..eba0200b8830ce884ec08e674c3be26f8e0e3ea7 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/demo_loader.py @@ -0,0 +1,246 @@ +import os +from typing import List, Tuple +import numpy as np +from mlagents.trainers.buffer import AgentBuffer, BufferKey +from mlagents_envs.communicator_objects.agent_info_action_pair_pb2 import ( + AgentInfoActionPairProto, +) +from mlagents.trainers.trajectory import ObsUtil +from mlagents_envs.rpc_utils import behavior_spec_from_proto, steps_from_proto +from mlagents_envs.base_env import BehaviorSpec +from mlagents_envs.communicator_objects.brain_parameters_pb2 import BrainParametersProto +from mlagents_envs.communicator_objects.demonstration_meta_pb2 import ( + DemonstrationMetaProto, +) +from mlagents_envs.timers import timed, hierarchical_timer +from google.protobuf.internal.decoder import _DecodeVarint32 # type: ignore +from google.protobuf.internal.encoder import _EncodeVarint # type: ignore + + +INITIAL_POS = 33 +SUPPORTED_DEMONSTRATION_VERSIONS = frozenset([0, 1]) + + +@timed +def make_demo_buffer( + pair_infos: List[AgentInfoActionPairProto], + behavior_spec: BehaviorSpec, + sequence_length: int, +) -> AgentBuffer: + # Create and populate buffer using experiences + demo_raw_buffer = AgentBuffer() + demo_processed_buffer = AgentBuffer() + for idx, current_pair_info in enumerate(pair_infos): + if idx > len(pair_infos) - 2: + break + next_pair_info = pair_infos[idx + 1] + current_decision_step, current_terminal_step = steps_from_proto( + [current_pair_info.agent_info], behavior_spec + ) + next_decision_step, next_terminal_step = steps_from_proto( + [next_pair_info.agent_info], behavior_spec + ) + previous_action = ( + np.array( + pair_infos[idx].action_info.vector_actions_deprecated, dtype=np.float32 + ) + * 0 + ) + if idx > 0: + previous_action = np.array( + pair_infos[idx - 1].action_info.vector_actions_deprecated, + dtype=np.float32, + ) + + next_done = len(next_terminal_step) == 1 + next_reward = 0 + if len(next_terminal_step) == 1: + next_reward = next_terminal_step.reward[0] + else: + next_reward = next_decision_step.reward[0] + current_obs = None + if 
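# A hypothetical example of what _load_config returns: a plain dict mirroring
# the YAML file. The trainer configuration shown is illustrative, not a
# complete or validated settings file.
import io
from mlagents.trainers.cli_utils import _load_config

sample_yaml = """
behaviors:
  3DBall:
    trainer_type: ppo
    hyperparameters:
      batch_size: 64
      learning_rate: 3.0e-4
    max_steps: 500000
"""
config = _load_config(io.StringIO(sample_yaml))
assert config["behaviors"]["3DBall"]["hyperparameters"]["batch_size"] == 64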
len(current_terminal_step) == 1: + current_obs = list(current_terminal_step.values())[0].obs + else: + current_obs = list(current_decision_step.values())[0].obs + + demo_raw_buffer[BufferKey.DONE].append(next_done) + demo_raw_buffer[BufferKey.ENVIRONMENT_REWARDS].append(next_reward) + for i, obs in enumerate(current_obs): + demo_raw_buffer[ObsUtil.get_name_at(i)].append(obs) + if ( + len(current_pair_info.action_info.continuous_actions) == 0 + and len(current_pair_info.action_info.discrete_actions) == 0 + ): + if behavior_spec.action_spec.continuous_size > 0: + demo_raw_buffer[BufferKey.CONTINUOUS_ACTION].append( + current_pair_info.action_info.vector_actions_deprecated + ) + else: + demo_raw_buffer[BufferKey.DISCRETE_ACTION].append( + current_pair_info.action_info.vector_actions_deprecated + ) + else: + if behavior_spec.action_spec.continuous_size > 0: + demo_raw_buffer[BufferKey.CONTINUOUS_ACTION].append( + current_pair_info.action_info.continuous_actions + ) + if behavior_spec.action_spec.discrete_size > 0: + demo_raw_buffer[BufferKey.DISCRETE_ACTION].append( + current_pair_info.action_info.discrete_actions + ) + demo_raw_buffer[BufferKey.PREV_ACTION].append(previous_action) + if next_done: + demo_raw_buffer.resequence_and_append( + demo_processed_buffer, batch_size=None, training_length=sequence_length + ) + demo_raw_buffer.reset_agent() + demo_raw_buffer.resequence_and_append( + demo_processed_buffer, batch_size=None, training_length=sequence_length + ) + return demo_processed_buffer + + +@timed +def demo_to_buffer( + file_path: str, sequence_length: int, expected_behavior_spec: BehaviorSpec = None +) -> Tuple[BehaviorSpec, AgentBuffer]: + """ + Loads demonstration file and uses it to fill training buffer. + :param file_path: Location of demonstration file (.demo). + :param sequence_length: Length of trajectories to fill buffer. + :return: + """ + behavior_spec, info_action_pair, _ = load_demonstration(file_path) + demo_buffer = make_demo_buffer(info_action_pair, behavior_spec, sequence_length) + if expected_behavior_spec: + # check action dimensions in demonstration match + if behavior_spec.action_spec != expected_behavior_spec.action_spec: + raise RuntimeError( + "The actions {} in demonstration do not match the policy's {}.".format( + behavior_spec.action_spec, expected_behavior_spec.action_spec + ) + ) + # check observations match + if len(behavior_spec.observation_specs) != len( + expected_behavior_spec.observation_specs + ): + raise RuntimeError( + "The demonstrations do not have the same number of observations as the policy." + ) + else: + for i, (demo_obs, policy_obs) in enumerate( + zip( + behavior_spec.observation_specs, + expected_behavior_spec.observation_specs, + ) + ): + if demo_obs.shape != policy_obs.shape: + raise RuntimeError( + f"The shape {demo_obs} for observation {i} in demonstration \ + do not match the policy's {policy_obs}." + ) + return behavior_spec, demo_buffer + + +def get_demo_files(path: str) -> List[str]: + """ + Retrieves the demonstration file(s) from a path. + :param path: Path of demonstration file or directory. + :return: List of demonstration files + + Raises errors if |path| is invalid. 
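# A hypothetical call into demo_to_buffer defined above: load a recorded .demo
# file and turn it into an AgentBuffer of length-1 sequences (e.g. for
# behavioral cloning or GAIL). The path is a placeholder.
from mlagents.trainers.demo_loader import demo_to_buffer

behavior_spec, demo_buffer = demo_to_buffer(
    "Demos/ExpertPolicy.demo",  # placeholder path to a demonstration file
    sequence_length=1,
)
print(behavior_spec.action_spec, demo_buffer.num_experiences)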
+ """ + if os.path.isfile(path): + if not path.endswith(".demo"): + raise ValueError("The path provided is not a '.demo' file.") + return [path] + elif os.path.isdir(path): + paths = [ + os.path.join(path, name) + for name in os.listdir(path) + if name.endswith(".demo") + ] + if not paths: + raise ValueError("There are no '.demo' files in the provided directory.") + return paths + else: + raise FileNotFoundError( + f"The demonstration file or directory {path} does not exist." + ) + + +@timed +def load_demonstration( + file_path: str, +) -> Tuple[BehaviorSpec, List[AgentInfoActionPairProto], int]: + """ + Loads and parses a demonstration file. + :param file_path: Location of demonstration file (.demo). + :return: BrainParameter and list of AgentInfoActionPairProto containing demonstration data. + """ + + # First 32 bytes of file dedicated to meta-data. + file_paths = get_demo_files(file_path) + behavior_spec = None + brain_param_proto = None + info_action_pairs = [] + total_expected = 0 + for _file_path in file_paths: + with open(_file_path, "rb") as fp: + with hierarchical_timer("read_file"): + data = fp.read() + next_pos, pos, obs_decoded = 0, 0, 0 + while pos < len(data): + next_pos, pos = _DecodeVarint32(data, pos) + if obs_decoded == 0: + meta_data_proto = DemonstrationMetaProto() + meta_data_proto.ParseFromString(data[pos : pos + next_pos]) + if ( + meta_data_proto.api_version + not in SUPPORTED_DEMONSTRATION_VERSIONS + ): + raise RuntimeError( + f"Can't load Demonstration data from an unsupported version ({meta_data_proto.api_version})" + ) + total_expected += meta_data_proto.number_steps + pos = INITIAL_POS + if obs_decoded == 1: + brain_param_proto = BrainParametersProto() + brain_param_proto.ParseFromString(data[pos : pos + next_pos]) + pos += next_pos + if obs_decoded > 1: + agent_info_action = AgentInfoActionPairProto() + agent_info_action.ParseFromString(data[pos : pos + next_pos]) + if behavior_spec is None: + behavior_spec = behavior_spec_from_proto( + brain_param_proto, agent_info_action.agent_info + ) + info_action_pairs.append(agent_info_action) + if len(info_action_pairs) == total_expected: + break + pos += next_pos + obs_decoded += 1 + if not behavior_spec: + raise RuntimeError( + f"No BrainParameters found in demonstration file at {file_path}." 
+ ) + return behavior_spec, info_action_pairs, total_expected + + +def write_delimited(f, message): + msg_string = message.SerializeToString() + msg_size = len(msg_string) + _EncodeVarint(f.write, msg_size) + f.write(msg_string) + + +def write_demo(demo_path, meta_data_proto, brain_param_proto, agent_info_protos): + with open(demo_path, "wb") as f: + # write metadata + write_delimited(f, meta_data_proto) + f.seek(INITIAL_POS) + write_delimited(f, brain_param_proto) + + for agent in agent_info_protos: + write_delimited(f, agent) diff --git a/MLPY/Lib/site-packages/mlagents/trainers/directory_utils.py b/MLPY/Lib/site-packages/mlagents/trainers/directory_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..80379d81e998743d6bc5e3759f8f7a3b71e3c970 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/directory_utils.py @@ -0,0 +1,76 @@ +import os +from mlagents.trainers.exception import UnityTrainerException +from mlagents.trainers.settings import TrainerSettings +from mlagents.trainers.model_saver.torch_model_saver import DEFAULT_CHECKPOINT_NAME + + +def validate_existing_directories( + output_path: str, resume: bool, force: bool, init_path: str = None +) -> None: + """ + Validates that if the run_id model exists, we do not overwrite it unless --force is specified. + Throws an exception if resume isn't specified and run_id exists. Throws an exception + if --resume is specified and run-id was not found. + :param model_path: The model path specified. + :param summary_path: The summary path to be used. + :param resume: Whether or not the --resume flag was passed. + :param force: Whether or not the --force flag was passed. + :param init_path: Path to run-id dir to initialize from + """ + + output_path_exists = os.path.isdir(output_path) + + if output_path_exists: + if not resume and not force: + raise UnityTrainerException( + "Previous data from this run ID was found. " + "Either specify a new run ID, use --resume to resume this run, " + "or use the --force parameter to overwrite existing data." + ) + else: + if resume: + raise UnityTrainerException( + "Previous data from this run ID was not found. " + "Train a new run by removing the --resume flag." + ) + + # Verify init path if specified. + if init_path is not None: + if not os.path.isdir(init_path): + raise UnityTrainerException( + "Could not initialize from {}. " + "Make sure models have already been saved with that run ID.".format( + init_path + ) + ) + + +def setup_init_path( + behaviors: TrainerSettings.DefaultTrainerDict, init_dir: str +) -> None: + """ + For each behavior, setup full init_path to checkpoint file to initialize policy from + :param behaviors: mapping from behavior_name to TrainerSettings + :param init_dir: Path to run-id dir to initialize from + """ + for behavior_name, ts in behaviors.items(): + if ts.init_path is None: + # set default if None + ts.init_path = os.path.join( + init_dir, behavior_name, DEFAULT_CHECKPOINT_NAME + ) + elif not os.path.dirname(ts.init_path): + # update to full path if just the file name + ts.init_path = os.path.join(init_dir, behavior_name, ts.init_path) + _validate_init_full_path(ts.init_path) + + +def _validate_init_full_path(init_file: str) -> None: + """ + Validate initialization path to be a .pt file + :param init_file: full path to initialization checkpoint file + """ + if not (os.path.isfile(init_file) and init_file.endswith(".pt")): + raise UnityTrainerException( + f"Could not initialize from {init_file}. 
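# A minimal, hypothetical round trip for the varint length-prefixed framing
# used by write_delimited and the load_demonstration parsing loop above, with
# a DemonstrationMetaProto as the payload.
import io
from google.protobuf.internal.decoder import _DecodeVarint32  # type: ignore
from mlagents_envs.communicator_objects.demonstration_meta_pb2 import (
    DemonstrationMetaProto,
)
from mlagents.trainers.demo_loader import write_delimited

meta = DemonstrationMetaProto()
meta.number_steps = 10

stream = io.BytesIO()
write_delimited(stream, meta)

data = stream.getvalue()
size, pos = _DecodeVarint32(data, 0)  # read the length prefix
decoded = DemonstrationMetaProto()
decoded.ParseFromString(data[pos : pos + size])
assert decoded.number_steps == 10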
File does not exist or is not a `.pt` file" + ) diff --git a/MLPY/Lib/site-packages/mlagents/trainers/env_manager.py b/MLPY/Lib/site-packages/mlagents/trainers/env_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..a4a90fdc709d4829112a11d9dce1ee5dd428ff0e --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/env_manager.py @@ -0,0 +1,157 @@ +from abc import ABC, abstractmethod + +from typing import List, Dict, NamedTuple, Iterable, Tuple +from mlagents_envs.base_env import ( + DecisionSteps, + TerminalSteps, + BehaviorSpec, + BehaviorName, +) +from mlagents_envs.side_channel.stats_side_channel import EnvironmentStats + +from mlagents.trainers.policy import Policy +from mlagents.trainers.agent_processor import AgentManager, AgentManagerQueue +from mlagents.trainers.action_info import ActionInfo +from mlagents.trainers.settings import TrainerSettings +from mlagents_envs.logging_util import get_logger + +AllStepResult = Dict[BehaviorName, Tuple[DecisionSteps, TerminalSteps]] +AllGroupSpec = Dict[BehaviorName, BehaviorSpec] + +logger = get_logger(__name__) + + +class EnvironmentStep(NamedTuple): + current_all_step_result: AllStepResult + worker_id: int + brain_name_to_action_info: Dict[BehaviorName, ActionInfo] + environment_stats: EnvironmentStats + + @property + def name_behavior_ids(self) -> Iterable[BehaviorName]: + return self.current_all_step_result.keys() + + @staticmethod + def empty(worker_id: int) -> "EnvironmentStep": + return EnvironmentStep({}, worker_id, {}, {}) + + +class EnvManager(ABC): + def __init__(self): + self.policies: Dict[BehaviorName, Policy] = {} + self.agent_managers: Dict[BehaviorName, AgentManager] = {} + self.first_step_infos: List[EnvironmentStep] = [] + + def set_policy(self, brain_name: BehaviorName, policy: Policy) -> None: + self.policies[brain_name] = policy + if brain_name in self.agent_managers: + self.agent_managers[brain_name].policy = policy + + def set_agent_manager( + self, brain_name: BehaviorName, manager: AgentManager + ) -> None: + self.agent_managers[brain_name] = manager + + @abstractmethod + def _step(self) -> List[EnvironmentStep]: + pass + + @abstractmethod + def _reset_env(self, config: Dict = None) -> List[EnvironmentStep]: + pass + + def reset(self, config: Dict = None) -> int: + for manager in self.agent_managers.values(): + manager.end_episode() + # Save the first step infos, after the reset. + # They will be processed on the first advance(). + self.first_step_infos = self._reset_env(config) + return len(self.first_step_infos) + + @abstractmethod + def set_env_parameters(self, config: Dict = None) -> None: + """ + Sends environment parameter settings to C# via the + EnvironmentParametersSideChannel. + :param config: Dict of environment parameter keys and values + """ + pass + + def on_training_started( + self, behavior_name: str, trainer_settings: TrainerSettings + ) -> None: + """ + Handle training starting for a new behavior type. Generally nothing is necessary here. + :param behavior_name: + :param trainer_settings: + :return: + """ + pass + + @property + @abstractmethod + def training_behaviors(self) -> Dict[BehaviorName, BehaviorSpec]: + pass + + @abstractmethod + def close(self): + pass + + def get_steps(self) -> List[EnvironmentStep]: + """ + Updates the policies, steps the environments, and returns the step information from the environments. + Calling code should pass the returned EnvironmentSteps to process_steps() after calling this. 
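# A hypothetical driver loop over the EnvManager interface defined above:
# reset once, then repeatedly pull EnvironmentSteps with get_steps() and hand
# them to the AgentManagers via process_steps(). `env_manager` stands in for
# any concrete implementation (e.g. SubprocessEnvManager).
from mlagents.trainers.env_manager import EnvManager

def drive_environment(env_manager: EnvManager, num_iterations: int) -> int:
    total_steps = 0
    env_manager.reset(config=None)
    for _ in range(num_iterations):
        step_infos = env_manager.get_steps()
        total_steps += env_manager.process_steps(step_infos)
    env_manager.close()
    return total_steps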
+ :return: The list of EnvironmentSteps + """ + # If we had just reset, process the first EnvironmentSteps. + # Note that we do it here instead of in reset() so that on the very first reset(), + # we can create the needed AgentManagers before calling advance() and processing the EnvironmentSteps. + if self.first_step_infos: + self._process_step_infos(self.first_step_infos) + self.first_step_infos = [] + # Get new policies if found. Always get the latest policy. + for brain_name in self.agent_managers.keys(): + _policy = None + try: + # We make sure to empty the policy queue before continuing to produce steps. + # This halts the trainers until the policy queue is empty. + while True: + _policy = self.agent_managers[brain_name].policy_queue.get_nowait() + except AgentManagerQueue.Empty: + if _policy is not None: + self.set_policy(brain_name, _policy) + # Step the environments + new_step_infos = self._step() + return new_step_infos + + def process_steps(self, new_step_infos: List[EnvironmentStep]) -> int: + # Add to AgentProcessor + num_step_infos = self._process_step_infos(new_step_infos) + return num_step_infos + + def _process_step_infos(self, step_infos: List[EnvironmentStep]) -> int: + for step_info in step_infos: + for name_behavior_id in step_info.name_behavior_ids: + if name_behavior_id not in self.agent_managers: + logger.warning( + "Agent manager was not created for behavior id {}.".format( + name_behavior_id + ) + ) + continue + decision_steps, terminal_steps = step_info.current_all_step_result[ + name_behavior_id + ] + self.agent_managers[name_behavior_id].add_experiences( + decision_steps, + terminal_steps, + step_info.worker_id, + step_info.brain_name_to_action_info.get( + name_behavior_id, ActionInfo.empty() + ), + ) + + self.agent_managers[name_behavior_id].record_environment_stats( + step_info.environment_stats, step_info.worker_id + ) + return len(step_infos) diff --git a/MLPY/Lib/site-packages/mlagents/trainers/environment_parameter_manager.py b/MLPY/Lib/site-packages/mlagents/trainers/environment_parameter_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..7dd00f98a04a29067a1cd4f5a83c319a237add38 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/environment_parameter_manager.py @@ -0,0 +1,186 @@ +from typing import Dict, List, Tuple, Optional +from mlagents.trainers.settings import ( + EnvironmentParameterSettings, + ParameterRandomizationSettings, +) +from collections import defaultdict +from mlagents.trainers.training_status import GlobalTrainingStatus, StatusType + +from mlagents_envs.logging_util import get_logger + +logger = get_logger(__name__) + + +class EnvironmentParameterManager: + def __init__( + self, + settings: Optional[Dict[str, EnvironmentParameterSettings]] = None, + run_seed: int = -1, + restore: bool = False, + ): + """ + EnvironmentParameterManager manages all the environment parameters of a training + session. It determines when parameters should change and gives access to the + current sampler of each parameter. + :param settings: A dictionary from environment parameter to + EnvironmentParameterSettings. + :param run_seed: When the seed is not provided for an environment parameter, + this seed will be used instead. + :param restore: If true, the EnvironmentParameterManager will use the + GlobalTrainingStatus to try and reload the lesson status of each environment + parameter. 
+ """ + if settings is None: + settings = {} + self._dict_settings = settings + for parameter_name in self._dict_settings.keys(): + initial_lesson = GlobalTrainingStatus.get_parameter_state( + parameter_name, StatusType.LESSON_NUM + ) + if initial_lesson is None or not restore: + GlobalTrainingStatus.set_parameter_state( + parameter_name, StatusType.LESSON_NUM, 0 + ) + self._smoothed_values: Dict[str, float] = defaultdict(float) + for key in self._dict_settings.keys(): + self._smoothed_values[key] = 0.0 + # Update the seeds of the samplers + self._set_sampler_seeds(run_seed) + + def _set_sampler_seeds(self, seed): + """ + Sets the seeds for the samplers (if no seed was already present). Note that + using the provided seed. + """ + offset = 0 + for settings in self._dict_settings.values(): + for lesson in settings.curriculum: + if lesson.value.seed == -1: + lesson.value.seed = seed + offset + offset += 1 + + def get_minimum_reward_buffer_size(self, behavior_name: str) -> int: + """ + Calculates the minimum size of the reward buffer a behavior must use. This + method uses the 'min_lesson_length' sampler_parameter to determine this value. + :param behavior_name: The name of the behavior the minimum reward buffer + size corresponds to. + """ + result = 1 + for settings in self._dict_settings.values(): + for lesson in settings.curriculum: + if lesson.completion_criteria is not None: + if lesson.completion_criteria.behavior == behavior_name: + result = max( + result, lesson.completion_criteria.min_lesson_length + ) + return result + + def get_current_samplers(self) -> Dict[str, ParameterRandomizationSettings]: + """ + Creates a dictionary from environment parameter name to their corresponding + ParameterRandomizationSettings. If curriculum is used, the + ParameterRandomizationSettings corresponds to the sampler of the current lesson. + """ + samplers: Dict[str, ParameterRandomizationSettings] = {} + for param_name, settings in self._dict_settings.items(): + lesson_num = GlobalTrainingStatus.get_parameter_state( + param_name, StatusType.LESSON_NUM + ) + lesson = settings.curriculum[lesson_num] + samplers[param_name] = lesson.value + return samplers + + def get_current_lesson_number(self) -> Dict[str, int]: + """ + Creates a dictionary from environment parameter to the current lesson number. + If not using curriculum, this number is always 0 for that environment parameter. + """ + result: Dict[str, int] = {} + for parameter_name in self._dict_settings.keys(): + result[parameter_name] = GlobalTrainingStatus.get_parameter_state( + parameter_name, StatusType.LESSON_NUM + ) + return result + + def log_current_lesson(self, parameter_name: Optional[str] = None) -> None: + """ + Logs the current lesson number and sampler value of the parameter with name + parameter_name. If no parameter_name is provided, the values and lesson + numbers of all parameters will be displayed. + """ + if parameter_name is not None: + settings = self._dict_settings[parameter_name] + lesson_number = GlobalTrainingStatus.get_parameter_state( + parameter_name, StatusType.LESSON_NUM + ) + lesson_name = settings.curriculum[lesson_number].name + lesson_value = settings.curriculum[lesson_number].value + logger.info( + f"Parameter '{parameter_name}' is in lesson '{lesson_name}' " + f"and has value '{lesson_value}'." 
+ ) + else: + for parameter_name, settings in self._dict_settings.items(): + lesson_number = GlobalTrainingStatus.get_parameter_state( + parameter_name, StatusType.LESSON_NUM + ) + lesson_name = settings.curriculum[lesson_number].name + lesson_value = settings.curriculum[lesson_number].value + logger.info( + f"Parameter '{parameter_name}' is in lesson '{lesson_name}' " + f"and has value '{lesson_value}'." + ) + + def update_lessons( + self, + trainer_steps: Dict[str, int], + trainer_max_steps: Dict[str, int], + trainer_reward_buffer: Dict[str, List[float]], + ) -> Tuple[bool, bool]: + """ + Given progress metrics, calculates if at least one environment parameter is + in a new lesson and if at least one environment parameter requires the env + to reset. + :param trainer_steps: A dictionary from behavior_name to the number of training + steps this behavior's trainer has performed. + :param trainer_max_steps: A dictionary from behavior_name to the maximum number + of training steps this behavior's trainer has performed. + :param trainer_reward_buffer: A dictionary from behavior_name to the list of + the most recent episode returns for this behavior's trainer. + :returns: A tuple of two booleans : (True if any lesson has changed, True if + environment needs to reset) + """ + must_reset = False + updated = False + for param_name, settings in self._dict_settings.items(): + lesson_num = GlobalTrainingStatus.get_parameter_state( + param_name, StatusType.LESSON_NUM + ) + next_lesson_num = lesson_num + 1 + lesson = settings.curriculum[lesson_num] + if ( + lesson.completion_criteria is not None + and len(settings.curriculum) > next_lesson_num + ): + behavior_to_consider = lesson.completion_criteria.behavior + if behavior_to_consider in trainer_steps: + ( + must_increment, + new_smoothing, + ) = lesson.completion_criteria.need_increment( + float(trainer_steps[behavior_to_consider]) + / float(trainer_max_steps[behavior_to_consider]), + trainer_reward_buffer[behavior_to_consider], + self._smoothed_values[param_name], + ) + self._smoothed_values[param_name] = new_smoothing + if must_increment: + GlobalTrainingStatus.set_parameter_state( + param_name, StatusType.LESSON_NUM, next_lesson_num + ) + self.log_current_lesson(param_name) + updated = True + if lesson.completion_criteria.require_reset: + must_reset = True + return updated, must_reset diff --git a/MLPY/Lib/site-packages/mlagents/trainers/exception.py b/MLPY/Lib/site-packages/mlagents/trainers/exception.py new file mode 100644 index 0000000000000000000000000000000000000000..3c0742bcec7a45f96a7ccb4c320010f845320d4c --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/exception.py @@ -0,0 +1,75 @@ +""" +Contains exceptions for the trainers package. +""" + + +class TrainerError(Exception): + """ + Any error related to the trainers in the ML-Agents Toolkit. + """ + + pass + + +class TrainerConfigError(Exception): + """ + Any error related to the configuration of trainers in the ML-Agents Toolkit. + """ + + pass + + +class TrainerConfigWarning(Warning): + """ + Any warning related to the configuration of trainers in the ML-Agents Toolkit. + """ + + pass + + +class CurriculumError(TrainerError): + """ + Any error related to training with a curriculum. + """ + + pass + + +class CurriculumLoadingError(CurriculumError): + """ + Any error related to loading the Curriculum config file. + """ + + pass + + +class CurriculumConfigError(CurriculumError): + """ + Any error related to processing the Curriculum config file. 
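# A hypothetical call to update_lessons above. With no environment parameters
# configured, both returned flags stay False; the dictionary keys ("Striker")
# and values are illustrative placeholders for a behavior's progress metrics.
from mlagents.trainers.environment_parameter_manager import EnvironmentParameterManager

param_manager = EnvironmentParameterManager(settings=None, run_seed=0, restore=False)
updated, must_reset = param_manager.update_lessons(
    trainer_steps={"Striker": 250_000},
    trainer_max_steps={"Striker": 1_000_000},
    trainer_reward_buffer={"Striker": [0.8, 1.0, 0.9]},
)
# With a curriculum configured and its completion criteria met, `updated`
# becomes True and `must_reset` may request an environment reset using the
# samplers of the new lesson (see get_current_samplers above).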
+ """ + + pass + + +class MetaCurriculumError(TrainerError): + """ + Any error related to the configuration of a metacurriculum. + """ + + pass + + +class SamplerException(TrainerError): + """ + Related to errors with the sampler actions. + """ + + pass + + +class UnityTrainerException(TrainerError): + """ + Related to errors with the Trainer. + """ + + pass diff --git a/MLPY/Lib/site-packages/mlagents/trainers/ghost/__init__.py b/MLPY/Lib/site-packages/mlagents/trainers/ghost/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/MLPY/Lib/site-packages/mlagents/trainers/ghost/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/ghost/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dbedb9f414f336cb0a7312f4d8644c3dca2cc74a Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/ghost/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/ghost/__pycache__/controller.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/ghost/__pycache__/controller.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..52e60d362256e9209391803531b42358d52372cf Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/ghost/__pycache__/controller.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/ghost/__pycache__/trainer.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/ghost/__pycache__/trainer.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c9fff08fc583e976e6fad640ed58cd6b84f499a8 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/ghost/__pycache__/trainer.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/ghost/controller.py b/MLPY/Lib/site-packages/mlagents/trainers/ghost/controller.py new file mode 100644 index 0000000000000000000000000000000000000000..84901e14f6e57993317a1a5a3f94c7571c7c39d9 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/ghost/controller.py @@ -0,0 +1,103 @@ +from mlagents_envs.logging_util import get_logger +from typing import Deque, Dict +from collections import deque +from mlagents.trainers.ghost.trainer import GhostTrainer + +logger = get_logger(__name__) + + +class GhostController: + """ + GhostController contains a queue of team ids. GhostTrainers subscribe to the GhostController and query + it to get the current learning team. The GhostController cycles through team ids every 'swap_interval' + which corresponds to the number of trainer steps between changing learning teams. + The GhostController is a unique object and there can only be one per training run. + """ + + def __init__(self, maxlen: int = 10): + """ + Create a GhostController. + :param maxlen: Maximum number of GhostTrainers allowed in this GhostController + """ + + # Tracks last swap step for each learning team because trainer + # steps of all GhostTrainers do not increment together + self._queue: Deque[int] = deque(maxlen=maxlen) + self._learning_team: int = -1 + # Dict from team id to GhostTrainer for ELO calculation + self._ghost_trainers: Dict[int, GhostTrainer] = {} + # Signals to the trainer control to perform a hard change_training_team + self._changed_training_team = False + + @property + def get_learning_team(self) -> int: + """ + Returns the current learning team. 
+ :return: The learning team id + """ + return self._learning_team + + def should_reset(self) -> bool: + """ + Whether or not team change occurred. Causes full reset in trainer_controller + :return: The truth value of the team changing + """ + changed_team = self._changed_training_team + if self._changed_training_team: + self._changed_training_team = False + return changed_team + + def subscribe_team_id(self, team_id: int, trainer: GhostTrainer) -> None: + """ + Given a team_id and trainer, add to queue and trainers if not already. + The GhostTrainer is used later by the controller to get ELO ratings of agents. + :param team_id: The team_id of an agent managed by this GhostTrainer + :param trainer: A GhostTrainer that manages this team_id. + """ + if team_id not in self._ghost_trainers: + self._ghost_trainers[team_id] = trainer + if self._learning_team < 0: + self._learning_team = team_id + else: + self._queue.append(team_id) + + def change_training_team(self, step: int) -> None: + """ + The current learning team is added to the end of the queue and then updated with the + next in line. + :param step: The step of the trainer for debugging + """ + self._queue.append(self._learning_team) + self._learning_team = self._queue.popleft() + logger.debug(f"Learning team {self._learning_team} swapped on step {step}") + self._changed_training_team = True + + # Adapted from https://github.com/Unity-Technologies/ml-agents/pull/1975 and + # https://metinmediamath.wordpress.com/2013/11/27/how-to-calculate-the-elo-rating-including-example/ + # ELO calculation + # TODO : Generalize this to more than two teams + def compute_elo_rating_changes(self, rating: float, result: float) -> float: + """ + Calculates ELO. Given the rating of the learning team and result. The GhostController + queries the other GhostTrainers for the ELO of their agent that is currently being deployed. + Note, this could be the current agent or a past snapshot. + :param rating: Rating of the learning team. + :param result: Win, loss, or draw from the perspective of the learning team. + :return: The change in ELO. 
+ """ + opponent_rating: float = 0.0 + for team_id, trainer in self._ghost_trainers.items(): + if team_id != self._learning_team: + opponent_rating = trainer.get_opponent_elo() + r1 = pow(10, rating / 400) + r2 = pow(10, opponent_rating / 400) + + summed = r1 + r2 + e1 = r1 / summed + + change = result - e1 + for team_id, trainer in self._ghost_trainers.items(): + if team_id != self._learning_team: + trainer.change_opponent_elo(change) + + return change diff --git a/MLPY/Lib/site-packages/mlagents/trainers/ghost/trainer.py b/MLPY/Lib/site-packages/mlagents/trainers/ghost/trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..f49a643574b8eef7b1f64c28acd9c1a48b0ec0d7 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/ghost/trainer.py @@ -0,0 +1,480 @@ +# # Unity ML-Agents Toolkit +# ## ML-Agent Learning (Ghost Trainer) + +from collections import defaultdict +from typing import Deque, Dict, DefaultDict, List + +import numpy as np + +from mlagents_envs.logging_util import get_logger +from mlagents_envs.base_env import BehaviorSpec +from mlagents.trainers.policy import Policy + +from mlagents.trainers.trainer import Trainer +from mlagents.trainers.optimizer.torch_optimizer import TorchOptimizer +from mlagents.trainers.trajectory import Trajectory +from mlagents.trainers.agent_processor import AgentManagerQueue +from mlagents.trainers.stats import StatsPropertyType +from mlagents.trainers.behavior_id_utils import ( + BehaviorIdentifiers, + create_name_behavior_id, +) +from mlagents.trainers.training_status import GlobalTrainingStatus, StatusType + + +logger = get_logger(__name__) + + +class GhostTrainer(Trainer): + """ + The GhostTrainer trains agents in adversarial games (there are teams in opposition) using a self-play mechanism. + In adversarial settings with self-play, at any time, there is only a single learning team. The other team(s) is + "ghosted" which means that its agents are executing fixed policies and not learning. The GhostTrainer wraps + a standard RL trainer which trains the learning team and ensures that only the trajectories collected + by the learning team are used for training. The GhostTrainer also maintains past policy snapshots to be used + as the fixed policies when the team is not learning. The GhostTrainer is 1:1 with brain_names as the other + trainers, and is responsible for one or more teams. Note, a GhostTrainer can have only one team in + asymmetric games where there is only one team with a particular behavior i.e. Hide and Seek. + The GhostController manages high level coordination between multiple ghost trainers. The learning team id + is cycled throughout a training run. + """ + + def __init__( + self, + trainer, + brain_name, + controller, + reward_buff_cap, + trainer_settings, + training, + artifact_path, + ): + """ + Creates a GhostTrainer. + :param trainer: The trainer of the policy/policies being trained with self_play + :param brain_name: The name of the brain associated with trainer config + :param controller: GhostController that coordinates all ghost trainers and calculates ELO + :param reward_buff_cap: Max reward history to track in the reward buffer + :param trainer_settings: The parameters for the trainer. + :param training: Whether the trainer is set for training. + :param artifact_path: Path to store artifacts from this trainer. 
+ """ + + super().__init__( + brain_name, trainer_settings, training, artifact_path, reward_buff_cap + ) + + self.trainer = trainer + self.controller = controller + + self._internal_trajectory_queues: Dict[str, AgentManagerQueue[Trajectory]] = {} + self._internal_policy_queues: Dict[str, AgentManagerQueue[Policy]] = {} + + self._team_to_name_to_policy_queue: DefaultDict[ + int, Dict[str, AgentManagerQueue[Policy]] + ] = defaultdict(dict) + + self._name_to_parsed_behavior_id: Dict[str, BehaviorIdentifiers] = {} + + # assign ghost's stats collection to wrapped trainer's + self._stats_reporter = self.trainer.stats_reporter + # Set the logging to print ELO in the console + self._stats_reporter.add_property(StatsPropertyType.SELF_PLAY, True) + + self_play_parameters = trainer_settings.self_play + self.window = self_play_parameters.window + self.play_against_latest_model_ratio = ( + self_play_parameters.play_against_latest_model_ratio + ) + if ( + self.play_against_latest_model_ratio > 1.0 + or self.play_against_latest_model_ratio < 0.0 + ): + logger.warning( + "The play_against_latest_model_ratio is not between 0 and 1." + ) + + self.steps_between_save = self_play_parameters.save_steps + self.steps_between_swap = self_play_parameters.swap_steps + self.steps_to_train_team = self_play_parameters.team_change + if self.steps_to_train_team > self.get_max_steps: + logger.warning( + "The max steps of the GhostTrainer for behavior name {} is less than team change. This team will not face \ + opposition that has been trained if the opposition is managed by a different GhostTrainer as in an \ + asymmetric game.".format( + self.brain_name + ) + ) + + # Counts the number of steps of the ghost policies. Snapshot swapping + # depends on this counter whereas snapshot saving and team switching depends + # on the wrapped. This ensures that all teams train for the same number of trainer + # steps. + self.ghost_step: int = 0 + + # A list of dicts from brain name to a single snapshot for this trainer's policies + self.policy_snapshots: List[Dict[str, List[float]]] = [] + + # A dict from brain name to the current snapshot of this trainer's policies + self.current_policy_snapshot: Dict[str, List[float]] = {} + + self.snapshot_counter: int = 0 + + # wrapped_training_team and learning team need to be separate + # in the situation where new agents are created destroyed + # after learning team switches. These agents need to be added + # to trainers properly. + self._learning_team: int = None + self.wrapped_trainer_team: int = None + self.last_save: int = 0 + self.last_swap: int = 0 + self.last_team_change: int = 0 + + self.initial_elo = GlobalTrainingStatus.get_parameter_state( + self.brain_name, StatusType.ELO + ) + if self.initial_elo is None: + self.initial_elo = self_play_parameters.initial_elo + self.policy_elos: List[float] = [self.initial_elo] * ( + self.window + 1 + ) # for learning policy + self.current_opponent: int = 0 + + @property + def get_step(self) -> int: + """ + Returns the number of steps the wrapped trainer has performed + :return: the step count of the wrapped trainer + """ + return self.trainer.get_step + + @property + def reward_buffer(self) -> Deque[float]: + """ + Returns the reward buffer. The reward buffer contains the cumulative + rewards of the most recent episodes completed by agents using this + trainer. + :return: the reward buffer. 
+ """ + return self.trainer.reward_buffer + + @property + def current_elo(self) -> float: + """ + Gets ELO of current policy which is always last in the list + :return: ELO of current policy + """ + return self.policy_elos[-1] + + def change_current_elo(self, change: float) -> None: + """ + Changes elo of current policy which is always last in the list + :param change: Amount to change current elo by + """ + self.policy_elos[-1] += change + + def get_opponent_elo(self) -> float: + """ + Get elo of current opponent policy + :return: ELO of current opponent policy + """ + return self.policy_elos[self.current_opponent] + + def change_opponent_elo(self, change: float) -> None: + """ + Changes elo of current opponent policy + :param change: Amount to change current opponent elo by + """ + self.policy_elos[self.current_opponent] -= change + + def _process_trajectory(self, trajectory: Trajectory) -> None: + """ + Determines the final result of an episode and asks the GhostController + to calculate the ELO change. The GhostController changes the ELO + of the opponent policy since this may be in a different GhostTrainer + i.e. in asymmetric games. We assume the last reward determines the winner. + :param trajectory: Trajectory. + """ + if ( + trajectory.done_reached + and trajectory.all_group_dones_reached + and not trajectory.interrupted + ): + # Assumption is that final reward is >0/0/<0 for win/draw/loss + final_reward = ( + trajectory.steps[-1].reward + trajectory.steps[-1].group_reward + ) + result = 0.5 + if final_reward > 0: + result = 1.0 + elif final_reward < 0: + result = 0.0 + + change = self.controller.compute_elo_rating_changes( + self.current_elo, result + ) + self.change_current_elo(change) + self._stats_reporter.add_stat("Self-play/ELO", self.current_elo) + + def advance(self) -> None: + """ + Steps the trainer, passing trajectories to wrapped trainer and calling trainer advance + """ + for trajectory_queue in self.trajectory_queues: + parsed_behavior_id = self._name_to_parsed_behavior_id[ + trajectory_queue.behavior_id + ] + if parsed_behavior_id.team_id == self._learning_team: + # With a future multiagent trainer, this will be indexed by 'role' + internal_trajectory_queue = self._internal_trajectory_queues[ + parsed_behavior_id.brain_name + ] + try: + # We grab at most the maximum length of the queue. + # This ensures that even if the queue is being filled faster than it is + # being emptied, the trajectories in the queue are on-policy. + for _ in range(trajectory_queue.qsize()): + t = trajectory_queue.get_nowait() + # adds to wrapped trainers queue + internal_trajectory_queue.put(t) + self._process_trajectory(t) + except AgentManagerQueue.Empty: + pass + else: + # Dump trajectories from non-learning policy + try: + for _ in range(trajectory_queue.qsize()): + t = trajectory_queue.get_nowait() + # count ghost steps + self.ghost_step += len(t.steps) + except AgentManagerQueue.Empty: + pass + + self._next_summary_step = self.trainer._next_summary_step + self.trainer.advance() + if self.get_step - self.last_team_change > self.steps_to_train_team: + self.controller.change_training_team(self.get_step) + self.last_team_change = self.get_step + + next_learning_team = self.controller.get_learning_team + + # Case 1: No team change. The if statement just continues to push the policy + # into the correct queue (or not if not learning team). 
+ for brain_name in self._internal_policy_queues: + internal_policy_queue = self._internal_policy_queues[brain_name] + try: + policy = internal_policy_queue.get_nowait() + self.current_policy_snapshot[brain_name] = policy.get_weights() + except AgentManagerQueue.Empty: + continue + if ( + self._learning_team == next_learning_team + and next_learning_team in self._team_to_name_to_policy_queue + ): + name_to_policy_queue = self._team_to_name_to_policy_queue[ + next_learning_team + ] + if brain_name in name_to_policy_queue: + behavior_id = create_name_behavior_id( + brain_name, next_learning_team + ) + policy = self.get_policy(behavior_id) + policy.load_weights(self.current_policy_snapshot[brain_name]) + name_to_policy_queue[brain_name].put(policy) + + # CASE 2: Current learning team is managed by this GhostTrainer. + # If the learning team changes, the following loop over queues will push the + # new policy into the policy queue for the new learning agent if + # that policy is managed by this GhostTrainer. Otherwise, it will save the current snapshot. + # CASE 3: Current learning team is managed by a different GhostTrainer. + # If the learning team changes to a team managed by this GhostTrainer, this loop + # will push the current_snapshot into the correct queue. Otherwise, + # it will continue skipping and swap_snapshot will continue to handle + # pushing fixed snapshots + if ( + self._learning_team != next_learning_team + and next_learning_team in self._team_to_name_to_policy_queue + ): + name_to_policy_queue = self._team_to_name_to_policy_queue[ + next_learning_team + ] + for brain_name in name_to_policy_queue: + behavior_id = create_name_behavior_id(brain_name, next_learning_team) + policy = self.get_policy(behavior_id) + policy.load_weights(self.current_policy_snapshot[brain_name]) + name_to_policy_queue[brain_name].put(policy) + + # Note save and swap should be on different step counters. + # We don't want to save unless the policy is learning. + if self.get_step - self.last_save > self.steps_between_save: + self._save_snapshot() + self.last_save = self.get_step + + if ( + self._learning_team != next_learning_team + or self.ghost_step - self.last_swap > self.steps_between_swap + ): + self._learning_team = next_learning_team + self._swap_snapshots() + self.last_swap = self.ghost_step + + def end_episode(self): + """ + Forwarding call to wrapped trainers end_episode + """ + self.trainer.end_episode() + + def save_model(self) -> None: + """ + Forwarding call to wrapped trainers save_model. + """ + GlobalTrainingStatus.set_parameter_state( + self.brain_name, StatusType.ELO, self.current_elo + ) + self.trainer.save_model() + + def create_policy( + self, + parsed_behavior_id: BehaviorIdentifiers, + behavior_spec: BehaviorSpec, + ) -> Policy: + """ + Creates policy with the wrapped trainer's create_policy function + The first policy encountered sets the wrapped + trainer team. This is to ensure that all agents from the same multi-agent + team are grouped. All policies associated with this team are added to the + wrapped trainer to be trained. 
+ """ + policy = self.trainer.create_policy(parsed_behavior_id, behavior_spec) + team_id = parsed_behavior_id.team_id + self.controller.subscribe_team_id(team_id, self) + + # First policy or a new agent on the same team encountered + if self.wrapped_trainer_team is None or team_id == self.wrapped_trainer_team: + internal_trainer_policy = self.trainer.create_policy( + parsed_behavior_id, behavior_spec + ) + self.trainer.add_policy(parsed_behavior_id, internal_trainer_policy) + self.current_policy_snapshot[ + parsed_behavior_id.brain_name + ] = internal_trainer_policy.get_weights() + + policy.load_weights(internal_trainer_policy.get_weights()) + self._save_snapshot() # Need to save after trainer initializes policy + self._learning_team = self.controller.get_learning_team + self.wrapped_trainer_team = team_id + else: + # Load the weights of the ghost policy from the wrapped one + policy.load_weights( + self.trainer.get_policy(parsed_behavior_id).get_weights() + ) + return policy + + def create_optimizer(self) -> TorchOptimizer: + pass + + def add_policy( + self, parsed_behavior_id: BehaviorIdentifiers, policy: Policy + ) -> None: + """ + Adds policy to GhostTrainer. + :param parsed_behavior_id: Behavior ID that the policy should belong to. + :param policy: Policy to associate with name_behavior_id. + """ + name_behavior_id = parsed_behavior_id.behavior_id + self._name_to_parsed_behavior_id[name_behavior_id] = parsed_behavior_id + self.policies[name_behavior_id] = policy + + def _save_snapshot(self) -> None: + """ + Saves a snapshot of the current weights of the policy and maintains the policy_snapshots + according to the window size + """ + for brain_name in self.current_policy_snapshot: + current_snapshot_for_brain_name = self.current_policy_snapshot[brain_name] + + try: + self.policy_snapshots[self.snapshot_counter][ + brain_name + ] = current_snapshot_for_brain_name + except IndexError: + self.policy_snapshots.append( + {brain_name: current_snapshot_for_brain_name} + ) + self.policy_elos[self.snapshot_counter] = self.current_elo + self.snapshot_counter = (self.snapshot_counter + 1) % self.window + + def _swap_snapshots(self) -> None: + """ + Swaps the appropriate weight to the policy and pushes it to respective policy queues + """ + + for team_id in self._team_to_name_to_policy_queue: + if team_id == self._learning_team: + continue + elif np.random.uniform() < (1 - self.play_against_latest_model_ratio): + x = np.random.randint(len(self.policy_snapshots)) + snapshot = self.policy_snapshots[x] + else: + snapshot = self.current_policy_snapshot + x = "current" + + self.current_opponent = -1 if x == "current" else x + name_to_policy_queue = self._team_to_name_to_policy_queue[team_id] + for brain_name in self._team_to_name_to_policy_queue[team_id]: + behavior_id = create_name_behavior_id(brain_name, team_id) + policy = self.get_policy(behavior_id) + policy.load_weights(snapshot[brain_name]) + name_to_policy_queue[brain_name].put(policy) + logger.debug( + "Step {}: Swapping snapshot {} to id {} with team {} learning".format( + self.ghost_step, x, behavior_id, self._learning_team + ) + ) + + def publish_policy_queue(self, policy_queue: AgentManagerQueue[Policy]) -> None: + """ + Adds a policy queue for every member of the team to the list of queues to publish to when this Trainer + makes a policy update. Creates an internal policy queue for the wrapped + trainer to push to. The GhostTrainer pushes all policies to the env. + :param queue: Policy queue to publish to. 
+ """ + super().publish_policy_queue(policy_queue) + parsed_behavior_id = self._name_to_parsed_behavior_id[policy_queue.behavior_id] + self._team_to_name_to_policy_queue[parsed_behavior_id.team_id][ + parsed_behavior_id.brain_name + ] = policy_queue + if parsed_behavior_id.team_id == self.wrapped_trainer_team: + # With a future multiagent trainer, this will be indexed by 'role' + internal_policy_queue: AgentManagerQueue[Policy] = AgentManagerQueue( + parsed_behavior_id.brain_name + ) + + self._internal_policy_queues[ + parsed_behavior_id.brain_name + ] = internal_policy_queue + self.trainer.publish_policy_queue(internal_policy_queue) + + def subscribe_trajectory_queue( + self, trajectory_queue: AgentManagerQueue[Trajectory] + ) -> None: + """ + Adds a trajectory queue for every member of the team to the list of queues for the trainer + to ingest Trajectories from. Creates an internal trajectory queue to push trajectories from + the learning team. The wrapped trainer subscribes to this queue. + :param queue: Trajectory queue to publish to. + """ + super().subscribe_trajectory_queue(trajectory_queue) + parsed_behavior_id = self._name_to_parsed_behavior_id[ + trajectory_queue.behavior_id + ] + if parsed_behavior_id.team_id == self.wrapped_trainer_team: + # With a future multiagent trainer, this will be indexed by 'role' + internal_trajectory_queue: AgentManagerQueue[ + Trajectory + ] = AgentManagerQueue(parsed_behavior_id.brain_name) + + self._internal_trajectory_queues[ + parsed_behavior_id.brain_name + ] = internal_trajectory_queue + self.trainer.subscribe_trajectory_queue(internal_trajectory_queue) diff --git a/MLPY/Lib/site-packages/mlagents/trainers/learn.py b/MLPY/Lib/site-packages/mlagents/trainers/learn.py new file mode 100644 index 0000000000000000000000000000000000000000..69320920a585d8782c3596cea5e4b2fbefc24f69 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/learn.py @@ -0,0 +1,269 @@ +# # Unity ML-Agents Toolkit +from mlagents import torch_utils +import yaml + +import os +import numpy as np +import json + +from typing import Callable, Optional, List + +import mlagents.trainers +import mlagents_envs +from mlagents.trainers.trainer_controller import TrainerController +from mlagents.trainers.environment_parameter_manager import EnvironmentParameterManager +from mlagents.trainers.trainer import TrainerFactory +from mlagents.trainers.directory_utils import ( + validate_existing_directories, + setup_init_path, +) +from mlagents.trainers.stats import StatsReporter +from mlagents.trainers.cli_utils import parser +from mlagents_envs.environment import UnityEnvironment +from mlagents.trainers.settings import RunOptions + +from mlagents.trainers.training_status import GlobalTrainingStatus +from mlagents_envs.base_env import BaseEnv +from mlagents.trainers.subprocess_env_manager import SubprocessEnvManager +from mlagents_envs.side_channel.side_channel import SideChannel +from mlagents_envs.timers import ( + hierarchical_timer, + get_timer_tree, + add_metadata as add_timer_metadata, +) +from mlagents_envs import logging_util +from mlagents.plugins.stats_writer import register_stats_writer_plugins +from mlagents.plugins.trainer_type import register_trainer_plugins + +logger = logging_util.get_logger(__name__) + +TRAINING_STATUS_FILE_NAME = "training_status.json" + + +def get_version_string() -> str: + return f""" Version information: + ml-agents: {mlagents.trainers.__version__}, + ml-agents-envs: {mlagents_envs.__version__}, + Communicator API: {UnityEnvironment.API_VERSION}, + 
PyTorch: {torch_utils.torch.__version__}""" + + +def parse_command_line( + argv: Optional[List[str]] = None, +) -> RunOptions: + _, _ = register_trainer_plugins() + args = parser.parse_args(argv) + return RunOptions.from_argparse(args) + + +def run_training(run_seed: int, options: RunOptions, num_areas: int) -> None: + """ + Launches training session. + :param run_seed: Random seed used for training. + :param num_areas: Number of training areas to instantiate + :param options: parsed command line arguments + """ + with hierarchical_timer("run_training.setup"): + torch_utils.set_torch_config(options.torch_settings) + checkpoint_settings = options.checkpoint_settings + env_settings = options.env_settings + engine_settings = options.engine_settings + + run_logs_dir = checkpoint_settings.run_logs_dir + port: Optional[int] = env_settings.base_port + # Check if directory exists + validate_existing_directories( + checkpoint_settings.write_path, + checkpoint_settings.resume, + checkpoint_settings.force, + checkpoint_settings.maybe_init_path, + ) + # Make run logs directory + os.makedirs(run_logs_dir, exist_ok=True) + # Load any needed states in case of resume + if checkpoint_settings.resume: + GlobalTrainingStatus.load_state( + os.path.join(run_logs_dir, "training_status.json") + ) + # In case of initialization, set full init_path for all behaviors + elif checkpoint_settings.maybe_init_path is not None: + setup_init_path(options.behaviors, checkpoint_settings.maybe_init_path) + + # Configure Tensorboard Writers and StatsReporter + stats_writers = register_stats_writer_plugins(options) + for sw in stats_writers: + StatsReporter.add_writer(sw) + + if env_settings.env_path is None: + port = None + env_factory = create_environment_factory( + env_settings.env_path, + engine_settings.no_graphics, + run_seed, + num_areas, + port, + env_settings.env_args, + os.path.abspath(run_logs_dir), # Unity environment requires absolute path + ) + + env_manager = SubprocessEnvManager(env_factory, options, env_settings.num_envs) + env_parameter_manager = EnvironmentParameterManager( + options.environment_parameters, run_seed, restore=checkpoint_settings.resume + ) + + trainer_factory = TrainerFactory( + trainer_config=options.behaviors, + output_path=checkpoint_settings.write_path, + train_model=not checkpoint_settings.inference, + load_model=checkpoint_settings.resume, + seed=run_seed, + param_manager=env_parameter_manager, + init_path=checkpoint_settings.maybe_init_path, + multi_gpu=False, + ) + # Create controller and begin training. + tc = TrainerController( + trainer_factory, + checkpoint_settings.write_path, + checkpoint_settings.run_id, + env_parameter_manager, + not checkpoint_settings.inference, + run_seed, + ) + + # Begin training + try: + tc.start_learning(env_manager) + finally: + env_manager.close() + write_run_options(checkpoint_settings.write_path, options) + write_timing_tree(run_logs_dir) + write_training_status(run_logs_dir) + + +def write_run_options(output_dir: str, run_options: RunOptions) -> None: + run_options_path = os.path.join(output_dir, "configuration.yaml") + try: + with open(run_options_path, "w") as f: + try: + yaml.dump(run_options.as_dict(), f, sort_keys=False) + except TypeError: # Older versions of pyyaml don't support sort_keys + yaml.dump(run_options.as_dict(), f) + except FileNotFoundError: + logger.warning( + f"Unable to save configuration to {run_options_path}. 
Make sure the directory exists" + ) + + +def write_training_status(output_dir: str) -> None: + GlobalTrainingStatus.save_state(os.path.join(output_dir, TRAINING_STATUS_FILE_NAME)) + + +def write_timing_tree(output_dir: str) -> None: + timing_path = os.path.join(output_dir, "timers.json") + try: + with open(timing_path, "w") as f: + json.dump(get_timer_tree(), f, indent=4) + except FileNotFoundError: + logger.warning( + f"Unable to save to {timing_path}. Make sure the directory exists" + ) + + +def create_environment_factory( + env_path: Optional[str], + no_graphics: bool, + seed: int, + num_areas: int, + start_port: Optional[int], + env_args: Optional[List[str]], + log_folder: str, +) -> Callable[[int, List[SideChannel]], BaseEnv]: + def create_unity_environment( + worker_id: int, side_channels: List[SideChannel] + ) -> UnityEnvironment: + # Make sure that each environment gets a different seed + env_seed = seed + worker_id + return UnityEnvironment( + file_name=env_path, + worker_id=worker_id, + seed=env_seed, + num_areas=num_areas, + no_graphics=no_graphics, + base_port=start_port, + additional_args=env_args, + side_channels=side_channels, + log_folder=log_folder, + ) + + return create_unity_environment + + +def run_cli(options: RunOptions) -> None: + try: + print( + """ + ┐ ╖ + ╓╖╬│╡ ││╬╖╖ + ╓╖╬│││││┘ ╬│││││╬╖ + ╖╬│││││╬╜ ╙╬│││││╖╖ ╗╗╗ + ╬╬╬╬╖││╦╖ ╖╬││╗╣╣╣╬ ╟╣╣╬ ╟╣╣╣ ╜╜╜ ╟╣╣ + ╬╬╬╬╬╬╬╬╖│╬╖╖╓╬╪│╓╣╣╣╣╣╣╣╬ ╟╣╣╬ ╟╣╣╣ ╒╣╣╖╗╣╣╣╗ ╣╣╣ ╣╣╣╣╣╣ ╟╣╣╖ ╣╣╣ + ╬╬╬╬┐ ╙╬╬╬╬│╓╣╣╣╝╜ ╫╣╣╣╬ ╟╣╣╬ ╟╣╣╣ ╟╣╣╣╙ ╙╣╣╣ ╣╣╣ ╙╟╣╣╜╙ ╫╣╣ ╟╣╣ + ╬╬╬╬┐ ╙╬╬╣╣ ╫╣╣╣╬ ╟╣╣╬ ╟╣╣╣ ╟╣╣╬ ╣╣╣ ╣╣╣ ╟╣╣ ╣╣╣┌╣╣╜ + ╬╬╬╜ ╬╬╣╣ ╙╝╣╣╬ ╙╣╣╣╗╖╓╗╣╣╣╜ ╟╣╣╬ ╣╣╣ ╣╣╣ ╟╣╣╦╓ ╣╣╣╣╣ + ╙ ╓╦╖ ╬╬╣╣ ╓╗╗╖ ╙╝╣╣╣╣╝╜ ╘╝╝╜ ╝╝╝ ╝╝╝ ╙╣╣╣ ╟╣╣╣ + ╩╬╬╬╬╬╬╦╦╬╬╣╣╗╣╣╣╣╣╣╣╝ ╫╣╣╣╣ + ╙╬╬╬╬╬╬╬╣╣╣╣╣╣╝╜ + ╙╬╬╬╣╣╣╜ + ╙ + """ + ) + except Exception: + print("\n\n\tUnity Technologies\n") + print(get_version_string()) + + if options.debug: + log_level = logging_util.DEBUG + else: + log_level = logging_util.INFO + + logging_util.set_log_level(log_level) + + logger.debug("Configuration for this run:") + logger.debug(json.dumps(options.as_dict(), indent=4)) + + # Options deprecation warnings + if options.checkpoint_settings.load_model: + logger.warning( + "The --load option has been deprecated. Please use the --resume option instead." + ) + if options.checkpoint_settings.train_model: + logger.warning( + "The --train option has been deprecated. Train mode is now the default. Use " + "--inference to run in inference mode." 
+ ) + + run_seed = options.env_settings.seed + num_areas = options.env_settings.num_areas + + # Add some timer metadata + add_timer_metadata("mlagents_version", mlagents.trainers.__version__) + add_timer_metadata("mlagents_envs_version", mlagents_envs.__version__) + add_timer_metadata("communication_protocol_version", UnityEnvironment.API_VERSION) + add_timer_metadata("pytorch_version", torch_utils.torch.__version__) + add_timer_metadata("numpy_version", np.__version__) + + if options.env_settings.seed == -1: + run_seed = np.random.randint(0, 10000) + logger.debug(f"run_seed set to {run_seed}") + run_training(run_seed, options, num_areas) + + +def main(): + run_cli(parse_command_line()) + + +# For python debugger to directly run this script +if __name__ == "__main__": + main() diff --git a/MLPY/Lib/site-packages/mlagents/trainers/model_saver/__init__.py b/MLPY/Lib/site-packages/mlagents/trainers/model_saver/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/MLPY/Lib/site-packages/mlagents/trainers/model_saver/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/model_saver/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d4f55d529f7761190dd1804e42dacd164dd2936c Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/model_saver/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/model_saver/__pycache__/model_saver.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/model_saver/__pycache__/model_saver.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6f496daddf5e3f6a50faeae8d141843831674105 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/model_saver/__pycache__/model_saver.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/model_saver/__pycache__/torch_model_saver.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/model_saver/__pycache__/torch_model_saver.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..677fa2db96ed631363e469e8ee4673f22f3536b4 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/model_saver/__pycache__/torch_model_saver.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/model_saver/model_saver.py b/MLPY/Lib/site-packages/mlagents/trainers/model_saver/model_saver.py new file mode 100644 index 0000000000000000000000000000000000000000..c1594ff08e79e251a8cb7ce6c074d929819787ff --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/model_saver/model_saver.py @@ -0,0 +1,69 @@ +# # Unity ML-Agents Toolkit +import abc +from typing import Any, Tuple, List + + +class BaseModelSaver(abc.ABC): + """This class is the base class for the ModelSaver""" + + def __init__(self): + pass + + @abc.abstractmethod + def register(self, module: Any) -> None: + """ + Register the modules to the ModelSaver. + The ModelSaver will store the module and include it in the saved files + when saving checkpoint/exporting graph. + :param module: the module to be registered + """ + pass + + def _register_policy(self, policy): + """ + Helper function for registering policy to the ModelSaver. + :param policy: the policy to be registered + """ + pass + + def _register_optimizer(self, optimizer): + """ + Helper function for registering optimizer to the ModelSaver. 
+ :param optimizer: the optimizer to be registered + """ + pass + + @abc.abstractmethod + def save_checkpoint(self, behavior_name: str, step: int) -> Tuple[str, List[str]]: + """ + Checkpoints the policy on disk. + :param checkpoint_path: filepath to write the checkpoint + :param behavior_name: Behavior name of bevavior to be trained + :return: A Tuple of the path to the exported file, as well as a List of any + auxillary files that were returned. For instance, an exported file would be Model.onnx, + and the auxillary files would be [Model.pt] for PyTorch + """ + pass + + @abc.abstractmethod + def export(self, output_filepath: str, behavior_name: str) -> None: + """ + Saves the serialized model, given a path and behavior name. + This method will save the policy graph to the given filepath. The path + should be provided without an extension as multiple serialized model formats + may be generated as a result. + :param output_filepath: path (without suffix) for the model file(s) + :param behavior_name: Behavior name of behavior to be trained. + """ + pass + + @abc.abstractmethod + def initialize_or_load(self, policy): + """ + Initialize/Load registered modules by default. + If given input argument policy, do with the input policy instead. + This argument is mainly for the initialization of the ghost trainer's fixed policy. + :param policy (optional): if given, perform the initializing/loading on this input policy. + Otherwise, do with the registered policy + """ + pass diff --git a/MLPY/Lib/site-packages/mlagents/trainers/model_saver/torch_model_saver.py b/MLPY/Lib/site-packages/mlagents/trainers/model_saver/torch_model_saver.py new file mode 100644 index 0000000000000000000000000000000000000000..70c3f19e431d6cdd9b346a311e79745326238ff5 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/model_saver/torch_model_saver.py @@ -0,0 +1,153 @@ +import os +import shutil +from mlagents.torch_utils import torch +from typing import Dict, Union, Optional, cast, Tuple, List +from mlagents_envs.exception import UnityPolicyException +from mlagents_envs.logging_util import get_logger +from mlagents.trainers.model_saver.model_saver import BaseModelSaver +from mlagents.trainers.settings import TrainerSettings, SerializationSettings +from mlagents.trainers.policy.torch_policy import TorchPolicy +from mlagents.trainers.optimizer.torch_optimizer import TorchOptimizer +from mlagents.trainers.torch_entities.model_serialization import ModelSerializer + + +logger = get_logger(__name__) +DEFAULT_CHECKPOINT_NAME = "checkpoint.pt" + + +class TorchModelSaver(BaseModelSaver): + """ + ModelSaver class for PyTorch + """ + + def __init__( + self, trainer_settings: TrainerSettings, model_path: str, load: bool = False + ): + super().__init__() + self.model_path = model_path + self.initialize_path = trainer_settings.init_path + self._keep_checkpoints = trainer_settings.keep_checkpoints + self.load = load + + self.policy: Optional[TorchPolicy] = None + self.exporter: Optional[ModelSerializer] = None + self.modules: Dict[str, torch.nn.Modules] = {} + + def register(self, module: Union[TorchPolicy, TorchOptimizer]) -> None: + if isinstance(module, TorchPolicy) or isinstance(module, TorchOptimizer): + self.modules.update(module.get_modules()) # type: ignore + else: + raise UnityPolicyException( + "Registering Object of unsupported type {} to ModelSaver ".format( + type(module) + ) + ) + if self.policy is None and isinstance(module, TorchPolicy): + self.policy = module + self.exporter = ModelSerializer(self.policy) + + def 
save_checkpoint(self, behavior_name: str, step: int) -> Tuple[str, List[str]]: + if not os.path.exists(self.model_path): + os.makedirs(self.model_path) + checkpoint_path = os.path.join(self.model_path, f"{behavior_name}-{step}") + state_dict = { + name: module.state_dict() for name, module in self.modules.items() + } + pytorch_ckpt_path = f"{checkpoint_path}.pt" + export_ckpt_path = f"{checkpoint_path}.onnx" + torch.save(state_dict, f"{checkpoint_path}.pt") + torch.save(state_dict, os.path.join(self.model_path, DEFAULT_CHECKPOINT_NAME)) + self.export(checkpoint_path, behavior_name) + return export_ckpt_path, [pytorch_ckpt_path] + + def export(self, output_filepath: str, behavior_name: str) -> None: + if self.exporter is not None: + self.exporter.export_policy_model(output_filepath) + + def initialize_or_load(self, policy: Optional[TorchPolicy] = None) -> None: + # Initialize/Load registered self.policy by default. + # If given input argument policy, use the input policy instead. + # This argument is mainly for initialization of the ghost trainer's fixed policy. + reset_steps = not self.load + if self.initialize_path is not None: + logger.info(f"Initializing from {self.initialize_path}.") + self._load_model( + self.initialize_path, policy, reset_global_steps=reset_steps + ) + elif self.load: + logger.info(f"Resuming from {self.model_path}.") + self._load_model( + os.path.join(self.model_path, DEFAULT_CHECKPOINT_NAME), + policy, + reset_global_steps=reset_steps, + ) + + def _load_model( + self, + load_path: str, + policy: Optional[TorchPolicy] = None, + reset_global_steps: bool = False, + ) -> None: + saved_state_dict = torch.load(load_path) + if policy is None: + modules = self.modules + policy = self.policy + else: + modules = policy.get_modules() + policy = cast(TorchPolicy, policy) + + for name, mod in modules.items(): + try: + if isinstance(mod, torch.nn.Module): + missing_keys, unexpected_keys = mod.load_state_dict( + saved_state_dict[name], strict=False + ) + if missing_keys: + logger.warning( + f"Did not find these keys {missing_keys} in checkpoint. Initializing." + ) + if unexpected_keys: + logger.warning( + f"Did not expect these keys {unexpected_keys} in checkpoint. Ignoring." + ) + else: + # If module is not an nn.Module, try to load as one piece + mod.load_state_dict(saved_state_dict[name]) + + # KeyError is raised if the module was not present in the last run but is being + # accessed in the saved_state_dict. + # ValueError is raised by the optimizer's load_state_dict if the parameters have + # have changed. Note, the optimizer uses a completely different load_state_dict + # function because it is not an nn.Module. + # RuntimeError is raised by PyTorch if there is a size mismatch between modules + # of the same name. This will still partially assign values to those layers that + # have not changed shape. + except (KeyError, ValueError, RuntimeError) as err: + logger.warning(f"Failed to load for module {name}. Initializing") + logger.debug(f"Module loading error : {err}") + + if reset_global_steps: + policy.set_step(0) + logger.info( + "Starting training from step 0 and saving to {}.".format( + self.model_path + ) + ) + else: + logger.info(f"Resuming training from step {policy.get_current_step()}.") + + def copy_final_model(self, source_nn_path: str) -> None: + """ + Copy the .nn file at the given source to the destination. + Also copies the corresponding .onnx file if it exists. 
+ """ + final_model_name = os.path.splitext(source_nn_path)[0] + + if SerializationSettings.convert_to_onnx: + try: + source_path = f"{final_model_name}.onnx" + destination_path = f"{self.model_path}.onnx" + shutil.copyfile(source_path, destination_path) + logger.info(f"Copied {source_path} to {destination_path}.") + except OSError: + pass diff --git a/MLPY/Lib/site-packages/mlagents/trainers/optimizer/__init__.py b/MLPY/Lib/site-packages/mlagents/trainers/optimizer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ba7b9a71313c198fad246562c0e0f796053eb562 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/optimizer/__init__.py @@ -0,0 +1 @@ +from mlagents.trainers.optimizer.optimizer import Optimizer # noqa diff --git a/MLPY/Lib/site-packages/mlagents/trainers/optimizer/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/optimizer/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..91a2c96bb345fb41c0116e7d7c8c49951c04b917 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/optimizer/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/optimizer/__pycache__/optimizer.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/optimizer/__pycache__/optimizer.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6f3e62685ca86a1156dc09579a15965efbaeeafc Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/optimizer/__pycache__/optimizer.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/optimizer/__pycache__/torch_optimizer.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/optimizer/__pycache__/torch_optimizer.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b031ab29d01c2bd1b036eb57bec92df5ef621439 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/optimizer/__pycache__/torch_optimizer.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/optimizer/optimizer.py b/MLPY/Lib/site-packages/mlagents/trainers/optimizer/optimizer.py new file mode 100644 index 0000000000000000000000000000000000000000..d18e3c60ec292084127f28e77d144723804c4e9a --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/optimizer/optimizer.py @@ -0,0 +1,24 @@ +import abc +from typing import Dict + +from mlagents.trainers.buffer import AgentBuffer + + +class Optimizer(abc.ABC): + """ + Creates loss functions and auxillary networks (e.g. Q or Value) needed for training. + Provides methods to update the Policy. + """ + + def __init__(self): + self.reward_signals = {} + + @abc.abstractmethod + def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: + """ + Update the Policy based on the batch that was passed in. + :param batch: AgentBuffer that contains the minibatch of data used for this update. + :param num_sequences: Number of recurrent sequences found in the minibatch. + :return: A Dict containing statistics (name, value) from the update (e.g. 
loss) + """ + pass diff --git a/MLPY/Lib/site-packages/mlagents/trainers/optimizer/torch_optimizer.py b/MLPY/Lib/site-packages/mlagents/trainers/optimizer/torch_optimizer.py new file mode 100644 index 0000000000000000000000000000000000000000..8cb0a6ee8c0809ccbf7a15cdd474e92059575efc --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/optimizer/torch_optimizer.py @@ -0,0 +1,211 @@ +from typing import Dict, Optional, Tuple, List +from mlagents.torch_utils import torch +import numpy as np +from collections import defaultdict + +from mlagents.trainers.buffer import AgentBuffer, AgentBufferField +from mlagents.trainers.trajectory import ObsUtil +from mlagents.trainers.torch_entities.components.bc.module import BCModule +from mlagents.trainers.torch_entities.components.reward_providers import ( + create_reward_provider, +) + +from mlagents.trainers.policy.torch_policy import TorchPolicy +from mlagents.trainers.optimizer import Optimizer +from mlagents.trainers.settings import ( + TrainerSettings, + RewardSignalSettings, + RewardSignalType, +) +from mlagents.trainers.torch_entities.utils import ModelUtils + + +class TorchOptimizer(Optimizer): + def __init__(self, policy: TorchPolicy, trainer_settings: TrainerSettings): + super().__init__() + self.policy = policy + self.trainer_settings = trainer_settings + self.update_dict: Dict[str, torch.Tensor] = {} + self.value_heads: Dict[str, torch.Tensor] = {} + self.memory_in: torch.Tensor = None + self.memory_out: torch.Tensor = None + self.m_size: int = 0 + self.global_step = torch.tensor(0) + self.bc_module: Optional[BCModule] = None + self.create_reward_signals(trainer_settings.reward_signals) + self.critic_memory_dict: Dict[str, torch.Tensor] = {} + if trainer_settings.behavioral_cloning is not None: + self.bc_module = BCModule( + self.policy, + trainer_settings.behavioral_cloning, + policy_learning_rate=trainer_settings.hyperparameters.learning_rate, + default_batch_size=trainer_settings.hyperparameters.batch_size, + default_num_epoch=3, + ) + + @property + def critic(self): + raise NotImplementedError + + def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: + pass + + def create_reward_signals( + self, reward_signal_configs: Dict[RewardSignalType, RewardSignalSettings] + ) -> None: + """ + Create reward signals + :param reward_signal_configs: Reward signal config. + """ + for reward_signal, settings in reward_signal_configs.items(): + # Name reward signals by string in case we have duplicates later + self.reward_signals[reward_signal.value] = create_reward_provider( + reward_signal, self.policy.behavior_spec, settings + ) + + def _evaluate_by_sequence( + self, tensor_obs: List[torch.Tensor], initial_memory: torch.Tensor + ) -> Tuple[Dict[str, torch.Tensor], AgentBufferField, torch.Tensor]: + """ + Evaluate a trajectory sequence-by-sequence, assembling the result. This enables us to get the + intermediate memories for the critic. + :param tensor_obs: A List of tensors of shape (trajectory_len, ) that are the agent's + observations for this trajectory. + :param initial_memory: The memory that preceeds this trajectory. Of shape (1,1,), i.e. + what is returned as the output of a MemoryModules. + :return: A Tuple of the value estimates as a Dict of [name, tensor], an AgentBufferField of the initial + memories to be used during value function update, and the final memory at the end of the trajectory. 
+ """ + num_experiences = tensor_obs[0].shape[0] + all_next_memories = AgentBufferField() + # When using LSTM, we need to divide the trajectory into sequences of equal length. Sometimes, + # that division isn't even, and we must pad the leftover sequence. + # When it is added to the buffer, the last sequence will be padded. So if seq_len = 3 and + # trajectory is of length 10, the last sequence is [obs,pad,pad] once it is added to the buffer. + # Compute the number of elements in this sequence that will end up being padded. + leftover_seq_len = num_experiences % self.policy.sequence_length + + all_values: Dict[str, List[np.ndarray]] = defaultdict(list) + _mem = initial_memory + # Evaluate other trajectories, carrying over _mem after each + # trajectory + for seq_num in range(num_experiences // self.policy.sequence_length): + seq_obs = [] + for _ in range(self.policy.sequence_length): + all_next_memories.append(ModelUtils.to_numpy(_mem.squeeze())) + start = seq_num * self.policy.sequence_length + end = (seq_num + 1) * self.policy.sequence_length + + for _obs in tensor_obs: + seq_obs.append(_obs[start:end]) + values, _mem = self.critic.critic_pass( + seq_obs, _mem, sequence_length=self.policy.sequence_length + ) + for signal_name, _val in values.items(): + all_values[signal_name].append(_val) + + # Compute values for the potentially truncated last sequence. Note that this + # sequence isn't padded yet, but will be. + seq_obs = [] + + if leftover_seq_len > 0: + for _obs in tensor_obs: + last_seq_obs = _obs[-leftover_seq_len:] + seq_obs.append(last_seq_obs) + + # For the last sequence, the initial memory should be the one at the + # end of this trajectory. + for _ in range(leftover_seq_len): + all_next_memories.append(ModelUtils.to_numpy(_mem.squeeze())) + + last_values, _mem = self.critic.critic_pass( + seq_obs, _mem, sequence_length=leftover_seq_len + ) + for signal_name, _val in last_values.items(): + all_values[signal_name].append(_val) + + # Create one tensor per reward signal + all_value_tensors = { + signal_name: torch.cat(value_list, dim=0) + for signal_name, value_list in all_values.items() + } + next_mem = _mem + return all_value_tensors, all_next_memories, next_mem + + def update_reward_signals(self, batch: AgentBuffer) -> Dict[str, float]: + update_stats: Dict[str, float] = {} + for reward_provider in self.reward_signals.values(): + update_stats.update(reward_provider.update(batch)) + return update_stats + + def get_trajectory_value_estimates( + self, + batch: AgentBuffer, + next_obs: List[np.ndarray], + done: bool, + agent_id: str = "", + ) -> Tuple[Dict[str, np.ndarray], Dict[str, float], Optional[AgentBufferField]]: + """ + Get value estimates and memories for a trajectory, in batch form. + :param batch: An AgentBuffer that consists of a trajectory. + :param next_obs: the next observation (after the trajectory). Used for boostrapping + if this is not a termiinal trajectory. + :param done: Set true if this is a terminal trajectory. + :param agent_id: Agent ID of the agent that this trajectory belongs to. + :returns: A Tuple of the Value Estimates as a Dict of [name, np.ndarray(trajectory_len)], + the final value estimate as a Dict of [name, float], and optionally (if using memories) + an AgentBufferField of initial critic memories to be used during update. 
+ """ + n_obs = len(self.policy.behavior_spec.observation_specs) + + if agent_id in self.critic_memory_dict: + memory = self.critic_memory_dict[agent_id] + else: + memory = ( + torch.zeros((1, 1, self.critic.memory_size)) + if self.policy.use_recurrent + else None + ) + + # Convert to tensors + current_obs = [ + ModelUtils.list_to_tensor(obs) for obs in ObsUtil.from_buffer(batch, n_obs) + ] + next_obs = [ModelUtils.list_to_tensor(obs) for obs in next_obs] + + next_obs = [obs.unsqueeze(0) for obs in next_obs] + + # If we're using LSTM, we want to get all the intermediate memories. + all_next_memories: Optional[AgentBufferField] = None + + # To prevent memory leak and improve performance, evaluate with no_grad. + with torch.no_grad(): + if self.policy.use_recurrent: + ( + value_estimates, + all_next_memories, + next_memory, + ) = self._evaluate_by_sequence(current_obs, memory) + else: + value_estimates, next_memory = self.critic.critic_pass( + current_obs, memory, sequence_length=batch.num_experiences + ) + + # Store the memory for the next trajectory. This should NOT have a gradient. + self.critic_memory_dict[agent_id] = next_memory + + next_value_estimate, _ = self.critic.critic_pass( + next_obs, next_memory, sequence_length=1 + ) + + for name, estimate in value_estimates.items(): + value_estimates[name] = ModelUtils.to_numpy(estimate) + next_value_estimate[name] = ModelUtils.to_numpy(next_value_estimate[name]) + + if done: + for k in next_value_estimate: + if not self.reward_signals[k].ignore_done: + next_value_estimate[k] = 0.0 + if agent_id in self.critic_memory_dict: + self.critic_memory_dict.pop(agent_id) + return value_estimates, next_value_estimate, all_next_memories diff --git a/MLPY/Lib/site-packages/mlagents/trainers/poca/__init__.py b/MLPY/Lib/site-packages/mlagents/trainers/poca/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/MLPY/Lib/site-packages/mlagents/trainers/poca/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/poca/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f8d6e46d80918231d0c49c9e8a7660c96d9a60e1 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/poca/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/poca/__pycache__/optimizer_torch.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/poca/__pycache__/optimizer_torch.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..20f71e62e75ec4c80baa7f8b08b679380aa0dfec Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/poca/__pycache__/optimizer_torch.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/poca/__pycache__/trainer.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/poca/__pycache__/trainer.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1040c036b396c886d9209463f1410ad607a32b58 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/poca/__pycache__/trainer.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/poca/optimizer_torch.py b/MLPY/Lib/site-packages/mlagents/trainers/poca/optimizer_torch.py new file mode 100644 index 0000000000000000000000000000000000000000..4f77de4ebbf3f005fb7e2f4eb5ccda0f1bb640e0 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/poca/optimizer_torch.py @@ -0,0 +1,690 @@ +from typing 
import Dict, cast, List, Tuple, Optional +from collections import defaultdict +import attr + +from mlagents.trainers.torch_entities.components.reward_providers.extrinsic_reward_provider import ( + ExtrinsicRewardProvider, +) +import numpy as np +from mlagents.torch_utils import torch, default_device + +from mlagents.trainers.buffer import ( + AgentBuffer, + BufferKey, + RewardSignalUtil, + AgentBufferField, +) + +from mlagents_envs.timers import timed +from mlagents_envs.base_env import ObservationSpec, ActionSpec +from mlagents.trainers.policy.torch_policy import TorchPolicy +from mlagents.trainers.optimizer.torch_optimizer import TorchOptimizer +from mlagents.trainers.settings import ( + RewardSignalSettings, + RewardSignalType, + TrainerSettings, + NetworkSettings, + OnPolicyHyperparamSettings, + ScheduleType, +) +from mlagents.trainers.torch_entities.networks import Critic, MultiAgentNetworkBody +from mlagents.trainers.torch_entities.decoders import ValueHeads +from mlagents.trainers.torch_entities.agent_action import AgentAction +from mlagents.trainers.torch_entities.action_log_probs import ActionLogProbs +from mlagents.trainers.torch_entities.utils import ModelUtils +from mlagents.trainers.trajectory import ObsUtil, GroupObsUtil + +from mlagents_envs.logging_util import get_logger + +logger = get_logger(__name__) + + +@attr.s(auto_attribs=True) +class POCASettings(OnPolicyHyperparamSettings): + beta: float = 5.0e-3 + epsilon: float = 0.2 + lambd: float = 0.95 + num_epoch: int = 3 + learning_rate_schedule: ScheduleType = ScheduleType.LINEAR + beta_schedule: ScheduleType = ScheduleType.LINEAR + epsilon_schedule: ScheduleType = ScheduleType.LINEAR + + +class TorchPOCAOptimizer(TorchOptimizer): + class POCAValueNetwork(torch.nn.Module, Critic): + """ + The POCAValueNetwork uses the MultiAgentNetworkBody to compute the value + and POCA baseline for a variable number of agents in a group that all + share the same observation and action space. + """ + + def __init__( + self, + stream_names: List[str], + observation_specs: List[ObservationSpec], + network_settings: NetworkSettings, + action_spec: ActionSpec, + ): + torch.nn.Module.__init__(self) + self.network_body = MultiAgentNetworkBody( + observation_specs, network_settings, action_spec + ) + if network_settings.memory is not None: + encoding_size = network_settings.memory.memory_size // 2 + else: + encoding_size = network_settings.hidden_units + + self.value_heads = ValueHeads(stream_names, encoding_size + 1, 1) + # The + 1 is for the normalized number of agents + + @property + def memory_size(self) -> int: + return self.network_body.memory_size + + def update_normalization(self, buffer: AgentBuffer) -> None: + self.network_body.update_normalization(buffer) + + def baseline( + self, + obs_without_actions: List[torch.Tensor], + obs_with_actions: Tuple[List[List[torch.Tensor]], List[AgentAction]], + memories: Optional[torch.Tensor] = None, + sequence_length: int = 1, + ) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]: + """ + The POCA baseline marginalizes the action of the agent associated with self_obs. + It calls the forward pass of the MultiAgentNetworkBody with the state action + pairs of groupmates but just the state of the agent in question. + :param obs_without_actions: The obs of the agent for which to compute the baseline. + :param obs_with_actions: Tuple of observations and actions for all groupmates. + :param memories: If using memory, a Tensor of initial memories. 
+ :param sequence_length: If using memory, the sequence length. + + :return: A Tuple of Dict of reward stream to tensor and critic memories. + """ + (obs, actions) = obs_with_actions + encoding, memories = self.network_body( + obs_only=[obs_without_actions], + obs=obs, + actions=actions, + memories=memories, + sequence_length=sequence_length, + ) + + value_outputs, critic_mem_out = self.forward( + encoding, memories, sequence_length + ) + return value_outputs, critic_mem_out + + def critic_pass( + self, + obs: List[List[torch.Tensor]], + memories: Optional[torch.Tensor] = None, + sequence_length: int = 1, + ) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]: + """ + A centralized value function. It calls the forward pass of MultiAgentNetworkBody + with just the states of all agents. + :param obs: List of observations for all agents in group + :param memories: If using memory, a Tensor of initial memories. + :param sequence_length: If using memory, the sequence length. + :return: A Tuple of Dict of reward stream to tensor and critic memories. + """ + encoding, memories = self.network_body( + obs_only=obs, + obs=[], + actions=[], + memories=memories, + sequence_length=sequence_length, + ) + + value_outputs, critic_mem_out = self.forward( + encoding, memories, sequence_length + ) + return value_outputs, critic_mem_out + + def forward( + self, + encoding: torch.Tensor, + memories: Optional[torch.Tensor] = None, + sequence_length: int = 1, + ) -> Tuple[torch.Tensor, torch.Tensor]: + + output = self.value_heads(encoding) + return output, memories + + def __init__(self, policy: TorchPolicy, trainer_settings: TrainerSettings): + """ + Takes a Policy and a Dict of trainer parameters and creates an Optimizer around the policy. + :param policy: A TorchPolicy object that will be updated by this POCA Optimizer. + :param trainer_params: Trainer parameters dictionary that specifies the + properties of the trainer. + """ + # Create the graph here to give more granular control of the TF graph to the Optimizer. 
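# ---------------------------------------------------------------------------
# Illustrative aside (reviewer note, not part of the patch): the
# ModelUtils.DecayedValue objects created just below anneal the learning
# rate, epsilon and beta from their configured values down to a floor over
# max_steps. A minimal sketch of a LINEAR schedule, under the assumption that
# it is a straight interpolation clamped at max_steps; the real DecayedValue
# implementation may differ in details.
def linear_decay(initial: float, min_value: float, max_steps: int, step: int) -> float:
    """Interpolate from `initial` at step 0 down to `min_value` at `max_steps`."""
    frac = min(float(step) / float(max_steps), 1.0)
    return (initial - min_value) * (1.0 - frac) + min_value

# e.g. linear_decay(3e-4, 1e-10, 500_000, 250_000) is ~1.5e-4, i.e. the
# learning rate is halved midway through training under this assumption.
# ---------------------------------------------------------------------------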
+ + super().__init__(policy, trainer_settings) + reward_signal_configs = trainer_settings.reward_signals + reward_signal_names = [key.value for key, _ in reward_signal_configs.items()] + + self._critic = TorchPOCAOptimizer.POCAValueNetwork( + reward_signal_names, + policy.behavior_spec.observation_specs, + network_settings=trainer_settings.network_settings, + action_spec=policy.behavior_spec.action_spec, + ) + # Move to GPU if needed + self._critic.to(default_device()) + + params = list(self.policy.actor.parameters()) + list(self.critic.parameters()) + + self.hyperparameters: POCASettings = cast( + POCASettings, trainer_settings.hyperparameters + ) + + self.decay_learning_rate = ModelUtils.DecayedValue( + self.hyperparameters.learning_rate_schedule, + self.hyperparameters.learning_rate, + 1e-10, + self.trainer_settings.max_steps, + ) + self.decay_epsilon = ModelUtils.DecayedValue( + self.hyperparameters.epsilon_schedule, + self.hyperparameters.epsilon, + 0.1, + self.trainer_settings.max_steps, + ) + self.decay_beta = ModelUtils.DecayedValue( + self.hyperparameters.beta_schedule, + self.hyperparameters.beta, + 1e-5, + self.trainer_settings.max_steps, + ) + + self.optimizer = torch.optim.Adam( + params, lr=self.trainer_settings.hyperparameters.learning_rate + ) + self.stats_name_to_update_name = { + "Losses/Value Loss": "value_loss", + "Losses/Policy Loss": "policy_loss", + } + + self.stream_names = list(self.reward_signals.keys()) + self.value_memory_dict: Dict[str, torch.Tensor] = {} + self.baseline_memory_dict: Dict[str, torch.Tensor] = {} + + def create_reward_signals( + self, reward_signal_configs: Dict[RewardSignalType, RewardSignalSettings] + ) -> None: + """ + Create reward signals. Override default to provide warnings for Curiosity and + GAIL, and make sure Extrinsic adds team rewards. + :param reward_signal_configs: Reward signal config. + """ + for reward_signal in reward_signal_configs.keys(): + if reward_signal != RewardSignalType.EXTRINSIC: + logger.warning( + f"Reward signal {reward_signal.value.capitalize()} is not supported with the POCA trainer; " + "results may be unexpected." + ) + super().create_reward_signals(reward_signal_configs) + # Make sure we add the groupmate rewards in POCA, so agents learn how to help each + # other achieve individual rewards as well + for reward_provider in self.reward_signals.values(): + if isinstance(reward_provider, ExtrinsicRewardProvider): + reward_provider.add_groupmate_rewards = True + + @property + def critic(self): + return self._critic + + @timed + def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: + """ + Performs update on model. + :param batch: Batch of experiences. + :param num_sequences: Number of sequences to process. + :return: Results of update. 
+ """ + # Get decayed parameters + decay_lr = self.decay_learning_rate.get_value(self.policy.get_current_step()) + decay_eps = self.decay_epsilon.get_value(self.policy.get_current_step()) + decay_bet = self.decay_beta.get_value(self.policy.get_current_step()) + returns = {} + old_values = {} + old_baseline_values = {} + for name in self.reward_signals: + old_values[name] = ModelUtils.list_to_tensor( + batch[RewardSignalUtil.value_estimates_key(name)] + ) + returns[name] = ModelUtils.list_to_tensor( + batch[RewardSignalUtil.returns_key(name)] + ) + old_baseline_values[name] = ModelUtils.list_to_tensor( + batch[RewardSignalUtil.baseline_estimates_key(name)] + ) + + n_obs = len(self.policy.behavior_spec.observation_specs) + current_obs = ObsUtil.from_buffer(batch, n_obs) + # Convert to tensors + current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs] + groupmate_obs = GroupObsUtil.from_buffer(batch, n_obs) + groupmate_obs = [ + [ModelUtils.list_to_tensor(obs) for obs in _groupmate_obs] + for _groupmate_obs in groupmate_obs + ] + + act_masks = ModelUtils.list_to_tensor(batch[BufferKey.ACTION_MASK]) + actions = AgentAction.from_buffer(batch) + groupmate_actions = AgentAction.group_from_buffer(batch) + + memories = [ + ModelUtils.list_to_tensor(batch[BufferKey.MEMORY][i]) + for i in range(0, len(batch[BufferKey.MEMORY]), self.policy.sequence_length) + ] + if len(memories) > 0: + memories = torch.stack(memories).unsqueeze(0) + value_memories = [ + ModelUtils.list_to_tensor(batch[BufferKey.CRITIC_MEMORY][i]) + for i in range( + 0, len(batch[BufferKey.CRITIC_MEMORY]), self.policy.sequence_length + ) + ] + + baseline_memories = [ + ModelUtils.list_to_tensor(batch[BufferKey.BASELINE_MEMORY][i]) + for i in range( + 0, len(batch[BufferKey.BASELINE_MEMORY]), self.policy.sequence_length + ) + ] + + if len(value_memories) > 0: + value_memories = torch.stack(value_memories).unsqueeze(0) + baseline_memories = torch.stack(baseline_memories).unsqueeze(0) + + run_out = self.policy.actor.get_stats( + current_obs, + actions, + masks=act_masks, + memories=memories, + sequence_length=self.policy.sequence_length, + ) + + log_probs = run_out["log_probs"] + entropy = run_out["entropy"] + + all_obs = [current_obs] + groupmate_obs + values, _ = self.critic.critic_pass( + all_obs, + memories=value_memories, + sequence_length=self.policy.sequence_length, + ) + groupmate_obs_and_actions = (groupmate_obs, groupmate_actions) + baselines, _ = self.critic.baseline( + current_obs, + groupmate_obs_and_actions, + memories=baseline_memories, + sequence_length=self.policy.sequence_length, + ) + old_log_probs = ActionLogProbs.from_buffer(batch).flatten() + log_probs = log_probs.flatten() + loss_masks = ModelUtils.list_to_tensor(batch[BufferKey.MASKS], dtype=torch.bool) + + baseline_loss = ModelUtils.trust_region_value_loss( + baselines, old_baseline_values, returns, decay_eps, loss_masks + ) + value_loss = ModelUtils.trust_region_value_loss( + values, old_values, returns, decay_eps, loss_masks + ) + policy_loss = ModelUtils.trust_region_policy_loss( + ModelUtils.list_to_tensor(batch[BufferKey.ADVANTAGES]), + log_probs, + old_log_probs, + loss_masks, + decay_eps, + ) + + loss = ( + policy_loss + + 0.5 * (value_loss + 0.5 * baseline_loss) + - decay_bet * ModelUtils.masked_mean(entropy, loss_masks) + ) + + # Set optimizer learning rate + ModelUtils.update_learning_rate(self.optimizer, decay_lr) + self.optimizer.zero_grad() + loss.backward() + + self.optimizer.step() + update_stats = { + # NOTE: abs() is not technically 
correct, but matches the behavior in TensorFlow. + # TODO: After PyTorch is default, change to something more correct. + "Losses/Policy Loss": torch.abs(policy_loss).item(), + "Losses/Value Loss": value_loss.item(), + "Losses/Baseline Loss": baseline_loss.item(), + "Policy/Learning Rate": decay_lr, + "Policy/Epsilon": decay_eps, + "Policy/Beta": decay_bet, + } + + return update_stats + + def get_modules(self): + modules = {"Optimizer:adam": self.optimizer, "Optimizer:critic": self._critic} + for reward_provider in self.reward_signals.values(): + modules.update(reward_provider.get_modules()) + return modules + + def _evaluate_by_sequence_team( + self, + self_obs: List[torch.Tensor], + obs: List[List[torch.Tensor]], + actions: List[AgentAction], + init_value_mem: torch.Tensor, + init_baseline_mem: torch.Tensor, + ) -> Tuple[ + Dict[str, torch.Tensor], + Dict[str, torch.Tensor], + AgentBufferField, + AgentBufferField, + torch.Tensor, + torch.Tensor, + ]: + """ + Evaluate a trajectory sequence-by-sequence, assembling the result. This enables us to get the + intermediate memories for the critic. + :param tensor_obs: A List of tensors of shape (trajectory_len, ) that are the agent's + observations for this trajectory. + :param initial_memory: The memory that preceeds this trajectory. Of shape (1,1,), i.e. + what is returned as the output of a MemoryModules. + :return: A Tuple of the value estimates as a Dict of [name, tensor], an AgentBufferField of the initial + memories to be used during value function update, and the final memory at the end of the trajectory. + """ + num_experiences = self_obs[0].shape[0] + all_next_value_mem = AgentBufferField() + all_next_baseline_mem = AgentBufferField() + + # When using LSTM, we need to divide the trajectory into sequences of equal length. Sometimes, + # that division isn't even, and we must pad the leftover sequence. + # In the buffer, the last sequence are the ones that are padded. So if seq_len = 3 and + # trajectory is of length 10, the last sequence is [obs,pad,pad]. + # Compute the number of elements in this padded seq. 
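# ---------------------------------------------------------------------------
# Illustrative aside (reviewer note, not part of the patch): the modulo below
# splits the trajectory into full sequences plus one shorter leftover slice.
# A minimal sketch with a hypothetical helper, just to make the arithmetic in
# the comment above concrete:
def sequence_lengths(trajectory_len: int, seq_len: int) -> list:
    """Lengths of the full sequences plus any truncated leftover sequence."""
    full = [seq_len] * (trajectory_len // seq_len)
    leftover = trajectory_len % seq_len
    return full + ([leftover] if leftover else [])

# sequence_lengths(10, 3) -> [3, 3, 3, 1]: three full sequences are evaluated
# with sequence_length=3, and only the final 1-step slice (which the buffer
# later pads to [obs, pad, pad]) is evaluated with the shorter length.
# ---------------------------------------------------------------------------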
+ leftover_seq_len = num_experiences % self.policy.sequence_length + + all_values: Dict[str, List[np.ndarray]] = defaultdict(list) + all_baseline: Dict[str, List[np.ndarray]] = defaultdict(list) + _baseline_mem = init_baseline_mem + _value_mem = init_value_mem + + # Evaluate other trajectories, carrying over _mem after each + # trajectory + for seq_num in range(num_experiences // self.policy.sequence_length): + for _ in range(self.policy.sequence_length): + all_next_value_mem.append(ModelUtils.to_numpy(_value_mem.squeeze())) + all_next_baseline_mem.append( + ModelUtils.to_numpy(_baseline_mem.squeeze()) + ) + + start = seq_num * self.policy.sequence_length + end = (seq_num + 1) * self.policy.sequence_length + + self_seq_obs = [] + groupmate_seq_obs = [] + groupmate_seq_act = [] + seq_obs = [] + for _self_obs in self_obs: + seq_obs.append(_self_obs[start:end]) + self_seq_obs.append(seq_obs) + + for groupmate_obs, groupmate_action in zip(obs, actions): + seq_obs = [] + for _obs in groupmate_obs: + sliced_seq_obs = _obs[start:end] + seq_obs.append(sliced_seq_obs) + groupmate_seq_obs.append(seq_obs) + _act = groupmate_action.slice(start, end) + groupmate_seq_act.append(_act) + + all_seq_obs = self_seq_obs + groupmate_seq_obs + values, _value_mem = self.critic.critic_pass( + all_seq_obs, _value_mem, sequence_length=self.policy.sequence_length + ) + for signal_name, _val in values.items(): + all_values[signal_name].append(_val) + + groupmate_obs_and_actions = (groupmate_seq_obs, groupmate_seq_act) + baselines, _baseline_mem = self.critic.baseline( + self_seq_obs[0], + groupmate_obs_and_actions, + _baseline_mem, + sequence_length=self.policy.sequence_length, + ) + for signal_name, _val in baselines.items(): + all_baseline[signal_name].append(_val) + + # Compute values for the potentially truncated initial sequence + if leftover_seq_len > 0: + self_seq_obs = [] + groupmate_seq_obs = [] + groupmate_seq_act = [] + seq_obs = [] + for _self_obs in self_obs: + last_seq_obs = _self_obs[-leftover_seq_len:] + seq_obs.append(last_seq_obs) + self_seq_obs.append(seq_obs) + + for groupmate_obs, groupmate_action in zip(obs, actions): + seq_obs = [] + for _obs in groupmate_obs: + last_seq_obs = _obs[-leftover_seq_len:] + seq_obs.append(last_seq_obs) + groupmate_seq_obs.append(seq_obs) + _act = groupmate_action.slice(len(_obs) - leftover_seq_len, len(_obs)) + groupmate_seq_act.append(_act) + + # For the last sequence, the initial memory should be the one at the + # beginning of this trajectory. 
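# ---------------------------------------------------------------------------
# Illustrative aside (reviewer note, not part of the patch): the value and
# baseline estimates assembled by this method are consumed later by the POCA
# trainer (see trainer.py further down in this patch), which computes roughly
#
#     returns   = lambda_return(rewards, V(all agents' obs), gamma, lambd, value_next)
#     advantage = returns - baseline(own obs, groupmates' obs + actions)
#
# i.e. the centralized value bootstraps the lambda-return while the
# counterfactual baseline, which marginalizes this agent's own action, is
# subtracted to assign per-agent credit. This is a paraphrase of the trainer
# code, not an exact transcription.
# ---------------------------------------------------------------------------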
+ seq_obs = [] + for _ in range(leftover_seq_len): + all_next_value_mem.append(ModelUtils.to_numpy(_value_mem.squeeze())) + all_next_baseline_mem.append( + ModelUtils.to_numpy(_baseline_mem.squeeze()) + ) + + all_seq_obs = self_seq_obs + groupmate_seq_obs + last_values, _value_mem = self.critic.critic_pass( + all_seq_obs, _value_mem, sequence_length=leftover_seq_len + ) + for signal_name, _val in last_values.items(): + all_values[signal_name].append(_val) + groupmate_obs_and_actions = (groupmate_seq_obs, groupmate_seq_act) + last_baseline, _baseline_mem = self.critic.baseline( + self_seq_obs[0], + groupmate_obs_and_actions, + _baseline_mem, + sequence_length=leftover_seq_len, + ) + for signal_name, _val in last_baseline.items(): + all_baseline[signal_name].append(_val) + # Create one tensor per reward signal + all_value_tensors = { + signal_name: torch.cat(value_list, dim=0) + for signal_name, value_list in all_values.items() + } + all_baseline_tensors = { + signal_name: torch.cat(baseline_list, dim=0) + for signal_name, baseline_list in all_baseline.items() + } + next_value_mem = _value_mem + next_baseline_mem = _baseline_mem + return ( + all_value_tensors, + all_baseline_tensors, + all_next_value_mem, + all_next_baseline_mem, + next_value_mem, + next_baseline_mem, + ) + + def get_trajectory_value_estimates( + self, + batch: AgentBuffer, + next_obs: List[np.ndarray], + done: bool, + agent_id: str = "", + ) -> Tuple[Dict[str, np.ndarray], Dict[str, float], Optional[AgentBufferField]]: + """ + Override base class method. Unused in the trainer, but needed to make sure class heirarchy is maintained. + Assume that there are no group obs. + """ + ( + value_estimates, + _, + next_value_estimates, + all_next_value_mem, + _, + ) = self.get_trajectory_and_baseline_value_estimates( + batch, next_obs, [], done, agent_id + ) + + return value_estimates, next_value_estimates, all_next_value_mem + + def get_trajectory_and_baseline_value_estimates( + self, + batch: AgentBuffer, + next_obs: List[np.ndarray], + next_groupmate_obs: List[List[np.ndarray]], + done: bool, + agent_id: str = "", + ) -> Tuple[ + Dict[str, np.ndarray], + Dict[str, np.ndarray], + Dict[str, float], + Optional[AgentBufferField], + Optional[AgentBufferField], + ]: + """ + Get value estimates, baseline estimates, and memories for a trajectory, in batch form. + :param batch: An AgentBuffer that consists of a trajectory. + :param next_obs: the next observation (after the trajectory). Used for boostrapping + if this is not a termiinal trajectory. + :param next_groupmate_obs: the next observations from other members of the group. + :param done: Set true if this is a terminal trajectory. + :param agent_id: Agent ID of the agent that this trajectory belongs to. + :returns: A Tuple of the Value Estimates as a Dict of [name, np.ndarray(trajectory_len)], + the baseline estimates as a Dict, the final value estimate as a Dict of [name, float], and + optionally (if using memories) an AgentBufferField of initial critic and baseline memories to be used + during update. 
+ """ + + n_obs = len(self.policy.behavior_spec.observation_specs) + + current_obs = ObsUtil.from_buffer(batch, n_obs) + groupmate_obs = GroupObsUtil.from_buffer(batch, n_obs) + + current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs] + groupmate_obs = [ + [ModelUtils.list_to_tensor(obs) for obs in _groupmate_obs] + for _groupmate_obs in groupmate_obs + ] + + groupmate_actions = AgentAction.group_from_buffer(batch) + + next_obs = [ModelUtils.list_to_tensor(obs) for obs in next_obs] + next_obs = [obs.unsqueeze(0) for obs in next_obs] + + next_groupmate_obs = [ + ModelUtils.list_to_tensor_list(_list_obs) + for _list_obs in next_groupmate_obs + ] + # Expand dimensions of next critic obs + next_groupmate_obs = [ + [_obs.unsqueeze(0) for _obs in _list_obs] + for _list_obs in next_groupmate_obs + ] + + if agent_id in self.value_memory_dict: + # The agent_id should always be in both since they are added together + _init_value_mem = self.value_memory_dict[agent_id] + _init_baseline_mem = self.baseline_memory_dict[agent_id] + else: + _init_value_mem = ( + torch.zeros((1, 1, self.critic.memory_size)) + if self.policy.use_recurrent + else None + ) + _init_baseline_mem = ( + torch.zeros((1, 1, self.critic.memory_size)) + if self.policy.use_recurrent + else None + ) + + all_obs = ( + [current_obs] + groupmate_obs + if groupmate_obs is not None + else [current_obs] + ) + all_next_value_mem: Optional[AgentBufferField] = None + all_next_baseline_mem: Optional[AgentBufferField] = None + with torch.no_grad(): + if self.policy.use_recurrent: + ( + value_estimates, + baseline_estimates, + all_next_value_mem, + all_next_baseline_mem, + next_value_mem, + next_baseline_mem, + ) = self._evaluate_by_sequence_team( + current_obs, + groupmate_obs, + groupmate_actions, + _init_value_mem, + _init_baseline_mem, + ) + else: + value_estimates, next_value_mem = self.critic.critic_pass( + all_obs, _init_value_mem, sequence_length=batch.num_experiences + ) + groupmate_obs_and_actions = (groupmate_obs, groupmate_actions) + baseline_estimates, next_baseline_mem = self.critic.baseline( + current_obs, + groupmate_obs_and_actions, + _init_baseline_mem, + sequence_length=batch.num_experiences, + ) + # Store the memory for the next trajectory + self.value_memory_dict[agent_id] = next_value_mem + self.baseline_memory_dict[agent_id] = next_baseline_mem + + all_next_obs = ( + [next_obs] + next_groupmate_obs + if next_groupmate_obs is not None + else [next_obs] + ) + + next_value_estimates, _ = self.critic.critic_pass( + all_next_obs, next_value_mem, sequence_length=1 + ) + + for name, estimate in baseline_estimates.items(): + baseline_estimates[name] = ModelUtils.to_numpy(estimate) + + for name, estimate in value_estimates.items(): + value_estimates[name] = ModelUtils.to_numpy(estimate) + + # the base line and V shpuld not be on the same done flag + for name, estimate in next_value_estimates.items(): + next_value_estimates[name] = ModelUtils.to_numpy(estimate) + + if done: + for k in next_value_estimates: + if not self.reward_signals[k].ignore_done: + next_value_estimates[k][-1] = 0.0 + + return ( + value_estimates, + baseline_estimates, + next_value_estimates, + all_next_value_mem, + all_next_baseline_mem, + ) diff --git a/MLPY/Lib/site-packages/mlagents/trainers/poca/trainer.py b/MLPY/Lib/site-packages/mlagents/trainers/poca/trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..266a14932154fc7413c9949bef221a1c268c1d4f --- /dev/null +++ 
b/MLPY/Lib/site-packages/mlagents/trainers/poca/trainer.py @@ -0,0 +1,249 @@ +# # Unity ML-Agents Toolkit +# ## ML-Agents Learning (POCA) +# Contains an implementation of MA-POCA. + +from collections import defaultdict +from typing import cast, Dict, Union, Any, Type + +import numpy as np + +from mlagents_envs.side_channel.stats_side_channel import StatsAggregationMethod +from mlagents_envs.logging_util import get_logger +from mlagents_envs.base_env import BehaviorSpec +from mlagents.trainers.buffer import BufferKey, RewardSignalUtil +from mlagents.trainers.trainer.on_policy_trainer import OnPolicyTrainer +from mlagents.trainers.trainer.trainer_utils import lambda_return +from mlagents.trainers.policy import Policy +from mlagents.trainers.policy.torch_policy import TorchPolicy +from mlagents.trainers.poca.optimizer_torch import TorchPOCAOptimizer, POCASettings +from mlagents.trainers.trajectory import Trajectory +from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers +from mlagents.trainers.settings import TrainerSettings + +from mlagents.trainers.torch_entities.networks import SimpleActor, SharedActorCritic + +logger = get_logger(__name__) + +TRAINER_NAME = "poca" + + +class POCATrainer(OnPolicyTrainer): + """The POCATrainer is an implementation of the MA-POCA algorithm.""" + + def __init__( + self, + behavior_name: str, + reward_buff_cap: int, + trainer_settings: TrainerSettings, + training: bool, + load: bool, + seed: int, + artifact_path: str, + ): + """ + Responsible for collecting experiences and training POCA model. + :param behavior_name: The name of the behavior associated with trainer config + :param reward_buff_cap: Max reward history to track in the reward buffer + :param trainer_settings: The parameters for the trainer. + :param training: Whether the trainer is set for training. + :param load: Whether the model should be loaded. + :param seed: The seed the model will be initialized with + :param artifact_path: The directory within which to store artifacts from this trainer. + """ + super().__init__( + behavior_name, + reward_buff_cap, + trainer_settings, + training, + load, + seed, + artifact_path, + ) + self.hyperparameters: POCASettings = cast( + POCASettings, self.trainer_settings.hyperparameters + ) + self.seed = seed + self.policy: TorchPolicy = None # type: ignore + self.optimizer: TorchPOCAOptimizer = None # type: ignore + self.collected_group_rewards: Dict[str, int] = defaultdict(lambda: 0) + + def _process_trajectory(self, trajectory: Trajectory) -> None: + """ + Takes a trajectory and processes it, putting it into the update buffer. + Processing involves calculating value and advantage targets for model updating step. + :param trajectory: The Trajectory tuple containing the steps to be processed. 
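Illustrative, self-contained sketch of the advantage target computed here (plain NumPy; a
        standard TD(lambda) return recursion, not necessarily identical to the `lambda_return`
        helper imported above; gamma/lambd values are placeholders):

            import numpy as np

            def td_lambda_returns(rewards, values, value_next, gamma=0.99, lambd=0.95):
                # Backward recursion: G_t = r_t + gamma * ((1 - lambd) * V(s_{t+1}) + lambd * G_{t+1})
                returns = np.zeros_like(rewards, dtype=np.float32)
                g = value_next
                for t in reversed(range(len(rewards))):
                    v_next = values[t + 1] if t + 1 < len(values) else value_next
                    g = rewards[t] + gamma * ((1.0 - lambd) * v_next + lambd * g)
                    returns[t] = g
                return returns

            rewards = np.array([0.0, 0.0, 1.0], dtype=np.float32)
            values = np.array([0.5, 0.6, 0.9], dtype=np.float32)
            baselines = np.array([0.4, 0.5, 0.8], dtype=np.float32)
            # POCA-style advantage: lambda returns relative to the counterfactual baseline.
            advantage = td_lambda_returns(rewards, values, value_next=0.0) - baselines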
+ """ + super()._process_trajectory(trajectory) + agent_id = trajectory.agent_id # All the agents should have the same ID + + agent_buffer_trajectory = trajectory.to_agentbuffer() + # Update the normalization + if self.is_training: + self.policy.actor.update_normalization(agent_buffer_trajectory) + self.optimizer.critic.update_normalization(agent_buffer_trajectory) + + # Get all value estimates + ( + value_estimates, + baseline_estimates, + value_next, + value_memories, + baseline_memories, + ) = self.optimizer.get_trajectory_and_baseline_value_estimates( + agent_buffer_trajectory, + trajectory.next_obs, + trajectory.next_group_obs, + trajectory.all_group_dones_reached + and trajectory.done_reached + and not trajectory.interrupted, + ) + + if value_memories is not None and baseline_memories is not None: + agent_buffer_trajectory[BufferKey.CRITIC_MEMORY].set(value_memories) + agent_buffer_trajectory[BufferKey.BASELINE_MEMORY].set(baseline_memories) + + for name, v in value_estimates.items(): + agent_buffer_trajectory[RewardSignalUtil.value_estimates_key(name)].extend( + v + ) + agent_buffer_trajectory[ + RewardSignalUtil.baseline_estimates_key(name) + ].extend(baseline_estimates[name]) + self._stats_reporter.add_stat( + f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Baseline Estimate", + np.mean(baseline_estimates[name]), + ) + self._stats_reporter.add_stat( + f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Value Estimate", + np.mean(value_estimates[name]), + ) + + self.collected_rewards["environment"][agent_id] += np.sum( + agent_buffer_trajectory[BufferKey.ENVIRONMENT_REWARDS] + ) + self.collected_group_rewards[agent_id] += np.sum( + agent_buffer_trajectory[BufferKey.GROUP_REWARD] + ) + for name, reward_signal in self.optimizer.reward_signals.items(): + evaluate_result = ( + reward_signal.evaluate(agent_buffer_trajectory) * reward_signal.strength + ) + agent_buffer_trajectory[RewardSignalUtil.rewards_key(name)].extend( + evaluate_result + ) + # Report the reward signals + self.collected_rewards[name][agent_id] += np.sum(evaluate_result) + + # Compute lambda returns and advantage + tmp_advantages = [] + for name in self.optimizer.reward_signals: + + local_rewards = np.array( + agent_buffer_trajectory[RewardSignalUtil.rewards_key(name)].get_batch(), + dtype=np.float32, + ) + + baseline_estimate = agent_buffer_trajectory[ + RewardSignalUtil.baseline_estimates_key(name) + ].get_batch() + v_estimates = agent_buffer_trajectory[ + RewardSignalUtil.value_estimates_key(name) + ].get_batch() + + lambd_returns = lambda_return( + r=local_rewards, + value_estimates=v_estimates, + gamma=self.optimizer.reward_signals[name].gamma, + lambd=self.hyperparameters.lambd, + value_next=value_next[name], + ) + + local_advantage = np.array(lambd_returns) - np.array(baseline_estimate) + + agent_buffer_trajectory[RewardSignalUtil.returns_key(name)].set( + lambd_returns + ) + agent_buffer_trajectory[RewardSignalUtil.advantage_key(name)].set( + local_advantage + ) + tmp_advantages.append(local_advantage) + + # Get global advantages + global_advantages = list( + np.mean(np.array(tmp_advantages, dtype=np.float32), axis=0) + ) + agent_buffer_trajectory[BufferKey.ADVANTAGES].set(global_advantages) + + self._append_to_update_buffer(agent_buffer_trajectory) + + # If this was a terminal trajectory, append stats and reset reward collection + if trajectory.done_reached: + self._update_end_episode_stats(agent_id, self.optimizer) + # Remove dead agents from group reward recording + if not 
trajectory.all_group_dones_reached: + self.collected_group_rewards.pop(agent_id) + + # If the whole team is done, average the remaining group rewards. + if trajectory.all_group_dones_reached and trajectory.done_reached: + self.stats_reporter.add_stat( + "Environment/Group Cumulative Reward", + self.collected_group_rewards.get(agent_id, 0), + aggregation=StatsAggregationMethod.HISTOGRAM, + ) + self.collected_group_rewards.pop(agent_id) + + def _is_ready_update(self): + """ + Returns whether or not the trainer has enough elements to run update model + :return: A boolean corresponding to whether or not update_model() can be run + """ + size_of_buffer = self.update_buffer.num_experiences + return size_of_buffer > self.hyperparameters.buffer_size + + def end_episode(self) -> None: + """ + A signal that the Episode has ended. The buffer must be reset. + Get only called when the academy resets. For POCA, we should + also zero out the group rewards. + """ + super().end_episode() + self.collected_group_rewards.clear() + + def create_policy( + self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec + ) -> TorchPolicy: + """ + Creates a policy with a PyTorch backend and POCA hyperparameters + :param parsed_behavior_id: + :param behavior_spec: specifications for policy construction + :return policy + """ + actor_cls: Union[Type[SimpleActor], Type[SharedActorCritic]] = SimpleActor + actor_kwargs: Dict[str, Any] = { + "conditional_sigma": False, + "tanh_squash": False, + } + + policy = TorchPolicy( + self.seed, + behavior_spec, + self.trainer_settings.network_settings, + actor_cls, + actor_kwargs, + ) + return policy + + def create_optimizer(self) -> TorchPOCAOptimizer: + return TorchPOCAOptimizer(self.policy, self.trainer_settings) + + def get_policy(self, name_behavior_id: str) -> Policy: + """ + Gets policy from trainer associated with name_behavior_id + :param name_behavior_id: full identifier of policy + """ + + return self.policy + + @staticmethod + def get_trainer_name() -> str: + return TRAINER_NAME diff --git a/MLPY/Lib/site-packages/mlagents/trainers/policy/__init__.py b/MLPY/Lib/site-packages/mlagents/trainers/policy/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8348d8738b939b384661966836d61d4509b19f87 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/policy/__init__.py @@ -0,0 +1 @@ +from mlagents.trainers.policy.policy import Policy # noqa diff --git a/MLPY/Lib/site-packages/mlagents/trainers/policy/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/policy/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..da57693726e5894ec1ad99d7891a3364d6e2965b Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/policy/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/policy/__pycache__/checkpoint_manager.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/policy/__pycache__/checkpoint_manager.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d5674d230aa50ff60add6ec5c5ee4aa756da9622 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/policy/__pycache__/checkpoint_manager.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/policy/__pycache__/policy.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/policy/__pycache__/policy.cpython-39.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..68b66f6f868fa2e13d6e95619b153b8963989e92 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/policy/__pycache__/policy.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/policy/__pycache__/torch_policy.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/policy/__pycache__/torch_policy.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9ae17b2bdf44c825457634f1cb09b5b8c8c26cd0 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/policy/__pycache__/torch_policy.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/policy/checkpoint_manager.py b/MLPY/Lib/site-packages/mlagents/trainers/policy/checkpoint_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..5f3e2762b36357c97c6114d969a627d3b874a661 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/policy/checkpoint_manager.py @@ -0,0 +1,101 @@ +# # Unity ML-Agents Toolkit +from typing import Dict, Any, Optional, List +import os +import attr +from mlagents.trainers.training_status import GlobalTrainingStatus, StatusType +from mlagents_envs.logging_util import get_logger + +logger = get_logger(__name__) + + +@attr.s(auto_attribs=True) +class ModelCheckpoint: + steps: int + file_path: str + reward: Optional[float] + creation_time: float + auxillary_file_paths: List[str] = attr.ib(factory=list) + + +class ModelCheckpointManager: + @staticmethod + def get_checkpoints(behavior_name: str) -> List[Dict[str, Any]]: + checkpoint_list = GlobalTrainingStatus.get_parameter_state( + behavior_name, StatusType.CHECKPOINTS + ) + if not checkpoint_list: + checkpoint_list = [] + GlobalTrainingStatus.set_parameter_state( + behavior_name, StatusType.CHECKPOINTS, checkpoint_list + ) + return checkpoint_list + + @staticmethod + def remove_checkpoint(checkpoint: Dict[str, Any]) -> None: + """ + Removes a checkpoint stored in checkpoint_list. + If checkpoint cannot be found, no action is done. + + :param checkpoint: A checkpoint stored in checkpoint_list + """ + file_paths: List[str] = [checkpoint["file_path"]] + file_paths.extend(checkpoint["auxillary_file_paths"]) + for file_path in file_paths: + if os.path.exists(file_path): + os.remove(file_path) + logger.debug(f"Removed checkpoint model {file_path}.") + else: + logger.debug(f"Checkpoint at {file_path} could not be found.") + return + + @classmethod + def _cleanup_extra_checkpoints( + cls, checkpoints: List[Dict], keep_checkpoints: int + ) -> List[Dict]: + """ + Ensures that the number of checkpoints stored are within the number + of checkpoints the user defines. If the limit is hit, checkpoints are + removed to create room for the next checkpoint to be inserted. + + :param behavior_name: The behavior name whose checkpoints we will mange. + :param keep_checkpoints: Number of checkpoints to record (user-defined). + """ + while len(checkpoints) > keep_checkpoints: + if keep_checkpoints <= 0 or len(checkpoints) == 0: + break + ModelCheckpointManager.remove_checkpoint(checkpoints.pop(0)) + return checkpoints + + @classmethod + def add_checkpoint( + cls, behavior_name: str, new_checkpoint: ModelCheckpoint, keep_checkpoints: int + ) -> None: + """ + Make room for new checkpoint if needed and insert new checkpoint information. + :param behavior_name: Behavior name for the checkpoint. + :param new_checkpoint: The new checkpoint to be recorded. + :param keep_checkpoints: Number of checkpoints to record (user-defined). 
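Hypothetical usage sketch (paths and values are placeholders; assumes it runs inside a
        training session where GlobalTrainingStatus has been initialized):

            import time
            from mlagents.trainers.policy.checkpoint_manager import (
                ModelCheckpoint,
                ModelCheckpointManager,
            )

            ckpt = ModelCheckpoint(
                steps=50000,
                file_path="results/run_id/MyBehavior/MyBehavior-50000.onnx",
                reward=1.23,
                creation_time=time.time(),
            )
            # Keeps at most `keep_checkpoints` entries, removing the oldest checkpoint files first.
            ModelCheckpointManager.add_checkpoint("MyBehavior", ckpt, keep_checkpoints=5)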
+ """ + new_checkpoint_dict = attr.asdict(new_checkpoint) + checkpoints = cls.get_checkpoints(behavior_name) + checkpoints.append(new_checkpoint_dict) + cls._cleanup_extra_checkpoints(checkpoints, keep_checkpoints) + GlobalTrainingStatus.set_parameter_state( + behavior_name, StatusType.CHECKPOINTS, checkpoints + ) + + @classmethod + def track_final_checkpoint( + cls, behavior_name: str, final_checkpoint: ModelCheckpoint + ) -> None: + """ + Ensures number of checkpoints stored is within the max number of checkpoints + defined by the user and finally stores the information about the final + model (or intermediate model if training is interrupted). + :param behavior_name: Behavior name of the model. + :param final_checkpoint: Checkpoint information for the final model. + """ + final_model_dict = attr.asdict(final_checkpoint) + GlobalTrainingStatus.set_parameter_state( + behavior_name, StatusType.FINAL_CHECKPOINT, final_model_dict + ) diff --git a/MLPY/Lib/site-packages/mlagents/trainers/policy/policy.py b/MLPY/Lib/site-packages/mlagents/trainers/policy/policy.py new file mode 100644 index 0000000000000000000000000000000000000000..0c5e9f72472d071676b9fbaa9ba845d0f31ab40b --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/policy/policy.py @@ -0,0 +1,146 @@ +from abc import abstractmethod +from typing import Dict, List, Optional +import numpy as np + +from mlagents_envs.base_env import ActionTuple, BehaviorSpec, DecisionSteps +from mlagents_envs.exception import UnityException + +from mlagents.trainers.action_info import ActionInfo +from mlagents.trainers.settings import NetworkSettings +from mlagents.trainers.behavior_id_utils import GlobalAgentId + + +class UnityPolicyException(UnityException): + """ + Related to errors with the Trainer. + """ + + pass + + +class Policy: + def __init__( + self, + seed: int, + behavior_spec: BehaviorSpec, + network_settings: NetworkSettings, + ): + self.behavior_spec = behavior_spec + self.network_settings: NetworkSettings = network_settings + self.seed = seed + self.previous_action_dict: Dict[str, np.ndarray] = {} + self.previous_memory_dict: Dict[str, np.ndarray] = {} + self.memory_dict: Dict[str, np.ndarray] = {} + self.normalize = network_settings.normalize + self.use_recurrent = self.network_settings.memory is not None + self.m_size = 0 + self.sequence_length = 1 + if self.use_recurrent: + self.m_size = self.network_settings.memory.memory_size + self.sequence_length = self.network_settings.memory.sequence_length + + def make_empty_memory(self, num_agents): + """ + Creates empty memory for use with RNNs + :param num_agents: Number of agents. + :return: Numpy array of zeros. 
+ """ + return np.zeros((num_agents, self.m_size), dtype=np.float32) + + def save_memories( + self, agent_ids: List[GlobalAgentId], memory_matrix: Optional[np.ndarray] + ) -> None: + if memory_matrix is None: + return + + # Pass old memories into previous_memory_dict + for agent_id in agent_ids: + if agent_id in self.memory_dict: + self.previous_memory_dict[agent_id] = self.memory_dict[agent_id] + + for index, agent_id in enumerate(agent_ids): + self.memory_dict[agent_id] = memory_matrix[index, :] + + def retrieve_memories(self, agent_ids: List[GlobalAgentId]) -> np.ndarray: + memory_matrix = np.zeros((len(agent_ids), self.m_size), dtype=np.float32) + for index, agent_id in enumerate(agent_ids): + if agent_id in self.memory_dict: + memory_matrix[index, :] = self.memory_dict[agent_id] + return memory_matrix + + def retrieve_previous_memories(self, agent_ids: List[GlobalAgentId]) -> np.ndarray: + memory_matrix = np.zeros((len(agent_ids), self.m_size), dtype=np.float32) + for index, agent_id in enumerate(agent_ids): + if agent_id in self.previous_memory_dict: + memory_matrix[index, :] = self.previous_memory_dict[agent_id] + return memory_matrix + + def remove_memories(self, agent_ids: List[GlobalAgentId]) -> None: + for agent_id in agent_ids: + if agent_id in self.memory_dict: + self.memory_dict.pop(agent_id) + if agent_id in self.previous_memory_dict: + self.previous_memory_dict.pop(agent_id) + + def make_empty_previous_action(self, num_agents: int) -> np.ndarray: + """ + Creates empty previous action for use with RNNs and discrete control + :param num_agents: Number of agents. + :return: Numpy array of zeros. + """ + return np.zeros( + (num_agents, self.behavior_spec.action_spec.discrete_size), dtype=np.int32 + ) + + def save_previous_action( + self, agent_ids: List[GlobalAgentId], action_tuple: ActionTuple + ) -> None: + for index, agent_id in enumerate(agent_ids): + self.previous_action_dict[agent_id] = action_tuple.discrete[index, :] + + def retrieve_previous_action(self, agent_ids: List[GlobalAgentId]) -> np.ndarray: + action_matrix = self.make_empty_previous_action(len(agent_ids)) + for index, agent_id in enumerate(agent_ids): + if agent_id in self.previous_action_dict: + action_matrix[index, :] = self.previous_action_dict[agent_id] + return action_matrix + + def remove_previous_action(self, agent_ids: List[GlobalAgentId]) -> None: + for agent_id in agent_ids: + if agent_id in self.previous_action_dict: + self.previous_action_dict.pop(agent_id) + + def get_action( + self, decision_requests: DecisionSteps, worker_id: int = 0 + ) -> ActionInfo: + raise NotImplementedError + + @staticmethod + def check_nan_action(action: Optional[ActionTuple]) -> None: + # Fast NaN check on the action + # See https://stackoverflow.com/questions/6736590/fast-check-for-nan-in-numpy for background. 
+ if action is not None: + d = np.sum(action.continuous) + has_nan = np.isnan(d) + if has_nan: + raise RuntimeError("Continuous NaN action detected.") + + @abstractmethod + def increment_step(self, n_steps): + pass + + @abstractmethod + def get_current_step(self): + pass + + @abstractmethod + def load_weights(self, values: List[np.ndarray]) -> None: + pass + + @abstractmethod + def get_weights(self) -> List[np.ndarray]: + return [] + + @abstractmethod + def init_load_weights(self) -> None: + pass diff --git a/MLPY/Lib/site-packages/mlagents/trainers/policy/torch_policy.py b/MLPY/Lib/site-packages/mlagents/trainers/policy/torch_policy.py new file mode 100644 index 0000000000000000000000000000000000000000..fceacda6e95c8fb31a7dbeb837a65992af94b41f --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/policy/torch_policy.py @@ -0,0 +1,173 @@ +from typing import Any, Dict, List +import numpy as np +from mlagents.torch_utils import torch, default_device +import copy + +from mlagents.trainers.action_info import ActionInfo +from mlagents.trainers.behavior_id_utils import get_global_agent_id +from mlagents.trainers.policy import Policy +from mlagents_envs.base_env import DecisionSteps, BehaviorSpec +from mlagents_envs.timers import timed + +from mlagents.trainers.settings import NetworkSettings +from mlagents.trainers.torch_entities.networks import GlobalSteps + +from mlagents.trainers.torch_entities.utils import ModelUtils + +EPSILON = 1e-7 # Small value to avoid divide by zero + + +class TorchPolicy(Policy): + def __init__( + self, + seed: int, + behavior_spec: BehaviorSpec, + network_settings: NetworkSettings, + actor_cls: type, + actor_kwargs: Dict[str, Any], + ): + """ + Policy that uses a multilayer perceptron to map the observations to actions. Could + also use a CNN to encode visual input prior to the MLP. Supports discrete and + continuous actions, as well as recurrent networks. + :param seed: Random seed. + :param behavior_spec: Assigned BehaviorSpec object. + :param network_settings: Defined network parameters. + :param actor_cls: The type of Actor + :param actor_kwargs: Keyword args for the Actor class + """ + super().__init__(seed, behavior_spec, network_settings) + self.global_step = ( + GlobalSteps() + ) # could be much simpler if TorchPolicy is nn.Module + + self.stats_name_to_update_name = { + "Losses/Value Loss": "value_loss", + "Losses/Policy Loss": "policy_loss", + } + + self.actor = actor_cls( + observation_specs=self.behavior_spec.observation_specs, + network_settings=network_settings, + action_spec=behavior_spec.action_spec, + **actor_kwargs, + ) + + # Save the m_size needed for export + self._export_m_size = self.m_size + # m_size needed for training is determined by network, not trainer settings + self.m_size = self.actor.memory_size + + self.actor.to(default_device()) + + @property + def export_memory_size(self) -> int: + """ + Returns the memory size of the exported ONNX policy. This only includes the memory + of the Actor and not any auxillary networks. 
+ """ + return self._export_m_size + + def _extract_masks(self, decision_requests: DecisionSteps) -> np.ndarray: + mask = None + if self.behavior_spec.action_spec.discrete_size > 0: + num_discrete_flat = np.sum(self.behavior_spec.action_spec.discrete_branches) + mask = torch.ones([len(decision_requests), num_discrete_flat]) + if decision_requests.action_mask is not None: + mask = torch.as_tensor( + 1 - np.concatenate(decision_requests.action_mask, axis=1) + ) + return mask + + @timed + def evaluate( + self, decision_requests: DecisionSteps, global_agent_ids: List[str] + ) -> Dict[str, Any]: + """ + Evaluates policy for the agent experiences provided. + :param global_agent_ids: + :param decision_requests: DecisionStep object containing inputs. + :return: Outputs from network as defined by self.inference_dict. + """ + obs = decision_requests.obs + masks = self._extract_masks(decision_requests) + tensor_obs = [torch.as_tensor(np_ob) for np_ob in obs] + + memories = torch.as_tensor(self.retrieve_memories(global_agent_ids)).unsqueeze( + 0 + ) + with torch.no_grad(): + action, run_out, memories = self.actor.get_action_and_stats( + tensor_obs, masks=masks, memories=memories + ) + run_out["action"] = action.to_action_tuple() + if "log_probs" in run_out: + run_out["log_probs"] = run_out["log_probs"].to_log_probs_tuple() + if "entropy" in run_out: + run_out["entropy"] = ModelUtils.to_numpy(run_out["entropy"]) + if self.use_recurrent: + run_out["memory_out"] = ModelUtils.to_numpy(memories).squeeze(0) + return run_out + + def get_action( + self, decision_requests: DecisionSteps, worker_id: int = 0 + ) -> ActionInfo: + """ + Decides actions given observations information, and takes them in environment. + :param worker_id: + :param decision_requests: A dictionary of behavior names and DecisionSteps from environment. + :return: an ActionInfo containing action, memories, values and an object + to be passed to add experiences + """ + if len(decision_requests) == 0: + return ActionInfo.empty() + + global_agent_ids = [ + get_global_agent_id(worker_id, int(agent_id)) + for agent_id in decision_requests.agent_id + ] # For 1-D array, the iterator order is correct. + + run_out = self.evaluate(decision_requests, global_agent_ids) + self.save_memories(global_agent_ids, run_out.get("memory_out")) + self.check_nan_action(run_out.get("action")) + return ActionInfo( + action=run_out.get("action"), + env_action=run_out.get("env_action"), + outputs=run_out, + agent_ids=list(decision_requests.agent_id), + ) + + def get_current_step(self): + """ + Gets current model step. + :return: current model step. + """ + return self.global_step.current_step + + def set_step(self, step: int) -> int: + """ + Sets current model step to step without creating additional ops. + :param step: Step to set the current model step to. + :return: The step the model was set to. + """ + self.global_step.current_step = step + return step + + def increment_step(self, n_steps): + """ + Increments model step. 
+ """ + self.global_step.increment(n_steps) + return self.get_current_step() + + def load_weights(self, values: List[np.ndarray]) -> None: + self.actor.load_state_dict(values) + + def init_load_weights(self) -> None: + pass + + def get_weights(self) -> List[np.ndarray]: + return copy.deepcopy(self.actor.state_dict()) + + def get_modules(self): + return {"Policy": self.actor, "global_step": self.global_step} diff --git a/MLPY/Lib/site-packages/mlagents/trainers/ppo/__init__.py b/MLPY/Lib/site-packages/mlagents/trainers/ppo/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/MLPY/Lib/site-packages/mlagents/trainers/ppo/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/ppo/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..27701694d1965d32e08249fc92ce943bb5483cff Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/ppo/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/ppo/__pycache__/optimizer_torch.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/ppo/__pycache__/optimizer_torch.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..32cde79232d9c19fdd1582ba7c9edfde2fb50cc2 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/ppo/__pycache__/optimizer_torch.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/ppo/__pycache__/trainer.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/ppo/__pycache__/trainer.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f8bf6bf44fff1af85cc7242253cc84e5cfb4fb20 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/ppo/__pycache__/trainer.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/ppo/optimizer_torch.py b/MLPY/Lib/site-packages/mlagents/trainers/ppo/optimizer_torch.py new file mode 100644 index 0000000000000000000000000000000000000000..41a452c65cb114e81767d92ac90d198aa44ef6e9 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/ppo/optimizer_torch.py @@ -0,0 +1,207 @@ +from typing import Dict, cast +import attr + +from mlagents.torch_utils import torch, default_device + +from mlagents.trainers.buffer import AgentBuffer, BufferKey, RewardSignalUtil + +from mlagents_envs.timers import timed +from mlagents.trainers.policy.torch_policy import TorchPolicy +from mlagents.trainers.optimizer.torch_optimizer import TorchOptimizer +from mlagents.trainers.settings import ( + TrainerSettings, + OnPolicyHyperparamSettings, + ScheduleType, +) +from mlagents.trainers.torch_entities.networks import ValueNetwork +from mlagents.trainers.torch_entities.agent_action import AgentAction +from mlagents.trainers.torch_entities.action_log_probs import ActionLogProbs +from mlagents.trainers.torch_entities.utils import ModelUtils +from mlagents.trainers.trajectory import ObsUtil + + +@attr.s(auto_attribs=True) +class PPOSettings(OnPolicyHyperparamSettings): + beta: float = 5.0e-3 + epsilon: float = 0.2 + lambd: float = 0.95 + num_epoch: int = 3 + shared_critic: bool = False + learning_rate_schedule: ScheduleType = ScheduleType.LINEAR + beta_schedule: ScheduleType = ScheduleType.LINEAR + epsilon_schedule: ScheduleType = ScheduleType.LINEAR + + +class TorchPPOOptimizer(TorchOptimizer): + def __init__(self, policy: TorchPolicy, trainer_settings: TrainerSettings): + """ + Takes a Policy and a 
Dict of trainer parameters and creates an Optimizer around the policy. + The PPO optimizer has a value estimator and a loss function. + :param policy: A TorchPolicy object that will be updated by this PPO Optimizer. + :param trainer_params: Trainer parameters dictionary that specifies the + properties of the trainer. + """ + # Create the graph here to give more granular control of the TF graph to the Optimizer. + + super().__init__(policy, trainer_settings) + reward_signal_configs = trainer_settings.reward_signals + reward_signal_names = [key.value for key, _ in reward_signal_configs.items()] + + self.hyperparameters: PPOSettings = cast( + PPOSettings, trainer_settings.hyperparameters + ) + + params = list(self.policy.actor.parameters()) + if self.hyperparameters.shared_critic: + self._critic = policy.actor + else: + self._critic = ValueNetwork( + reward_signal_names, + policy.behavior_spec.observation_specs, + network_settings=trainer_settings.network_settings, + ) + self._critic.to(default_device()) + params += list(self._critic.parameters()) + + self.decay_learning_rate = ModelUtils.DecayedValue( + self.hyperparameters.learning_rate_schedule, + self.hyperparameters.learning_rate, + 1e-10, + self.trainer_settings.max_steps, + ) + self.decay_epsilon = ModelUtils.DecayedValue( + self.hyperparameters.epsilon_schedule, + self.hyperparameters.epsilon, + 0.1, + self.trainer_settings.max_steps, + ) + self.decay_beta = ModelUtils.DecayedValue( + self.hyperparameters.beta_schedule, + self.hyperparameters.beta, + 1e-5, + self.trainer_settings.max_steps, + ) + + self.optimizer = torch.optim.Adam( + params, lr=self.trainer_settings.hyperparameters.learning_rate + ) + self.stats_name_to_update_name = { + "Losses/Value Loss": "value_loss", + "Losses/Policy Loss": "policy_loss", + } + + self.stream_names = list(self.reward_signals.keys()) + + @property + def critic(self): + return self._critic + + @timed + def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: + """ + Performs update on model. + :param batch: Batch of experiences. + :param num_sequences: Number of sequences to process. + :return: Results of update. 
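Illustrative standalone sketch of the clipped surrogate term minimized in this update (not
        necessarily identical to ModelUtils.trust_region_policy_loss; epsilon stands in for the
        decayed epsilon used below):

            import torch

            def clipped_policy_loss(advantages, log_probs, old_log_probs, epsilon=0.2):
                # Standard PPO clipped surrogate objective, returned as a loss to minimize.
                ratio = torch.exp(log_probs - old_log_probs)
                unclipped = ratio * advantages
                clipped = torch.clamp(ratio, 1.0 - epsilon, 1.0 + epsilon) * advantages
                return -torch.min(unclipped, clipped).mean()

            adv = torch.tensor([1.0, -0.5])
            loss = clipped_policy_loss(adv, torch.tensor([-1.0, -1.2]), torch.tensor([-1.1, -1.0]))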
+ """ + # Get decayed parameters + decay_lr = self.decay_learning_rate.get_value(self.policy.get_current_step()) + decay_eps = self.decay_epsilon.get_value(self.policy.get_current_step()) + decay_bet = self.decay_beta.get_value(self.policy.get_current_step()) + returns = {} + old_values = {} + for name in self.reward_signals: + old_values[name] = ModelUtils.list_to_tensor( + batch[RewardSignalUtil.value_estimates_key(name)] + ) + returns[name] = ModelUtils.list_to_tensor( + batch[RewardSignalUtil.returns_key(name)] + ) + + n_obs = len(self.policy.behavior_spec.observation_specs) + current_obs = ObsUtil.from_buffer(batch, n_obs) + # Convert to tensors + current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs] + + act_masks = ModelUtils.list_to_tensor(batch[BufferKey.ACTION_MASK]) + actions = AgentAction.from_buffer(batch) + + memories = [ + ModelUtils.list_to_tensor(batch[BufferKey.MEMORY][i]) + for i in range(0, len(batch[BufferKey.MEMORY]), self.policy.sequence_length) + ] + if len(memories) > 0: + memories = torch.stack(memories).unsqueeze(0) + + # Get value memories + value_memories = [ + ModelUtils.list_to_tensor(batch[BufferKey.CRITIC_MEMORY][i]) + for i in range( + 0, len(batch[BufferKey.CRITIC_MEMORY]), self.policy.sequence_length + ) + ] + if len(value_memories) > 0: + value_memories = torch.stack(value_memories).unsqueeze(0) + + run_out = self.policy.actor.get_stats( + current_obs, + actions, + masks=act_masks, + memories=memories, + sequence_length=self.policy.sequence_length, + ) + + log_probs = run_out["log_probs"] + entropy = run_out["entropy"] + + values, _ = self.critic.critic_pass( + current_obs, + memories=value_memories, + sequence_length=self.policy.sequence_length, + ) + old_log_probs = ActionLogProbs.from_buffer(batch).flatten() + log_probs = log_probs.flatten() + loss_masks = ModelUtils.list_to_tensor(batch[BufferKey.MASKS], dtype=torch.bool) + value_loss = ModelUtils.trust_region_value_loss( + values, old_values, returns, decay_eps, loss_masks + ) + policy_loss = ModelUtils.trust_region_policy_loss( + ModelUtils.list_to_tensor(batch[BufferKey.ADVANTAGES]), + log_probs, + old_log_probs, + loss_masks, + decay_eps, + ) + loss = ( + policy_loss + + 0.5 * value_loss + - decay_bet * ModelUtils.masked_mean(entropy, loss_masks) + ) + + # Set optimizer learning rate + ModelUtils.update_learning_rate(self.optimizer, decay_lr) + self.optimizer.zero_grad() + loss.backward() + + self.optimizer.step() + update_stats = { + # NOTE: abs() is not technically correct, but matches the behavior in TensorFlow. + # TODO: After PyTorch is default, change to something more correct. 
+ "Losses/Policy Loss": torch.abs(policy_loss).item(), + "Losses/Value Loss": value_loss.item(), + "Policy/Learning Rate": decay_lr, + "Policy/Epsilon": decay_eps, + "Policy/Beta": decay_bet, + } + + return update_stats + + # TODO move module update into TorchOptimizer for reward_provider + def get_modules(self): + modules = { + "Optimizer:value_optimizer": self.optimizer, + "Optimizer:critic": self._critic, + } + for reward_provider in self.reward_signals.values(): + modules.update(reward_provider.get_modules()) + return modules diff --git a/MLPY/Lib/site-packages/mlagents/trainers/ppo/trainer.py b/MLPY/Lib/site-packages/mlagents/trainers/ppo/trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..e7421f0da155244058ac234edc7b49fedc3f046e --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/ppo/trainer.py @@ -0,0 +1,213 @@ +# # Unity ML-Agents Toolkit +# ## ML-Agent Learning (PPO) +# Contains an implementation of PPO as described in: https://arxiv.org/abs/1707.06347 + +from typing import cast, Type, Union, Dict, Any + +import numpy as np + +from mlagents_envs.base_env import BehaviorSpec +from mlagents_envs.logging_util import get_logger +from mlagents.trainers.buffer import BufferKey, RewardSignalUtil +from mlagents.trainers.trainer.on_policy_trainer import OnPolicyTrainer +from mlagents.trainers.policy.policy import Policy +from mlagents.trainers.trainer.trainer_utils import get_gae +from mlagents.trainers.optimizer.torch_optimizer import TorchOptimizer +from mlagents.trainers.policy.torch_policy import TorchPolicy +from mlagents.trainers.ppo.optimizer_torch import TorchPPOOptimizer, PPOSettings +from mlagents.trainers.trajectory import Trajectory +from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers +from mlagents.trainers.settings import TrainerSettings + +from mlagents.trainers.torch_entities.networks import SimpleActor, SharedActorCritic + +logger = get_logger(__name__) + +TRAINER_NAME = "ppo" + + +class PPOTrainer(OnPolicyTrainer): + """The PPOTrainer is an implementation of the PPO algorithm.""" + + def __init__( + self, + behavior_name: str, + reward_buff_cap: int, + trainer_settings: TrainerSettings, + training: bool, + load: bool, + seed: int, + artifact_path: str, + ): + """ + Responsible for collecting experiences and training PPO model. + :param behavior_name: The name of the behavior associated with trainer config + :param reward_buff_cap: Max reward history to track in the reward buffer + :param trainer_settings: The parameters for the trainer. + :param training: Whether the trainer is set for training. + :param load: Whether the model should be loaded. + :param seed: The seed the model will be initialized with + :param artifact_path: The directory within which to store artifacts from this trainer. + """ + super().__init__( + behavior_name, + reward_buff_cap, + trainer_settings, + training, + load, + seed, + artifact_path, + ) + self.hyperparameters: PPOSettings = cast( + PPOSettings, self.trainer_settings.hyperparameters + ) + self.seed = seed + self.shared_critic = self.hyperparameters.shared_critic + self.policy: TorchPolicy = None # type: ignore + + def _process_trajectory(self, trajectory: Trajectory) -> None: + """ + Takes a trajectory and processes it, putting it into the update buffer. + Processing involves calculating value and advantage targets for model updating step. + :param trajectory: The Trajectory tuple containing the steps to be processed. 
+ """ + super()._process_trajectory(trajectory) + agent_id = trajectory.agent_id # All the agents should have the same ID + + agent_buffer_trajectory = trajectory.to_agentbuffer() + # Check if we used group rewards, warn if so. + self._warn_if_group_reward(agent_buffer_trajectory) + + # Update the normalization + if self.is_training: + self.policy.actor.update_normalization(agent_buffer_trajectory) + self.optimizer.critic.update_normalization(agent_buffer_trajectory) + + # Get all value estimates + ( + value_estimates, + value_next, + value_memories, + ) = self.optimizer.get_trajectory_value_estimates( + agent_buffer_trajectory, + trajectory.next_obs, + trajectory.done_reached and not trajectory.interrupted, + ) + if value_memories is not None: + agent_buffer_trajectory[BufferKey.CRITIC_MEMORY].set(value_memories) + + for name, v in value_estimates.items(): + agent_buffer_trajectory[RewardSignalUtil.value_estimates_key(name)].extend( + v + ) + self._stats_reporter.add_stat( + f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Value Estimate", + np.mean(v), + ) + + # Evaluate all reward functions + self.collected_rewards["environment"][agent_id] += np.sum( + agent_buffer_trajectory[BufferKey.ENVIRONMENT_REWARDS] + ) + for name, reward_signal in self.optimizer.reward_signals.items(): + evaluate_result = ( + reward_signal.evaluate(agent_buffer_trajectory) * reward_signal.strength + ) + agent_buffer_trajectory[RewardSignalUtil.rewards_key(name)].extend( + evaluate_result + ) + # Report the reward signals + self.collected_rewards[name][agent_id] += np.sum(evaluate_result) + + # Compute GAE and returns + tmp_advantages = [] + tmp_returns = [] + for name in self.optimizer.reward_signals: + bootstrap_value = value_next[name] + + local_rewards = agent_buffer_trajectory[ + RewardSignalUtil.rewards_key(name) + ].get_batch() + local_value_estimates = agent_buffer_trajectory[ + RewardSignalUtil.value_estimates_key(name) + ].get_batch() + + local_advantage = get_gae( + rewards=local_rewards, + value_estimates=local_value_estimates, + value_next=bootstrap_value, + gamma=self.optimizer.reward_signals[name].gamma, + lambd=self.hyperparameters.lambd, + ) + local_return = local_advantage + local_value_estimates + # This is later use as target for the different value estimates + agent_buffer_trajectory[RewardSignalUtil.returns_key(name)].set( + local_return + ) + agent_buffer_trajectory[RewardSignalUtil.advantage_key(name)].set( + local_advantage + ) + tmp_advantages.append(local_advantage) + tmp_returns.append(local_return) + + # Get global advantages + global_advantages = list( + np.mean(np.array(tmp_advantages, dtype=np.float32), axis=0) + ) + global_returns = list(np.mean(np.array(tmp_returns, dtype=np.float32), axis=0)) + agent_buffer_trajectory[BufferKey.ADVANTAGES].set(global_advantages) + agent_buffer_trajectory[BufferKey.DISCOUNTED_RETURNS].set(global_returns) + + self._append_to_update_buffer(agent_buffer_trajectory) + + # If this was a terminal trajectory, append stats and reset reward collection + if trajectory.done_reached: + self._update_end_episode_stats(agent_id, self.optimizer) + + def create_optimizer(self) -> TorchOptimizer: + return TorchPPOOptimizer( # type: ignore + cast(TorchPolicy, self.policy), self.trainer_settings # type: ignore + ) # type: ignore + + def create_policy( + self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec + ) -> TorchPolicy: + """ + Creates a policy with a PyTorch backend and PPO hyperparameters + :param parsed_behavior_id: + 
:param behavior_spec: specifications for policy construction + :return policy + """ + actor_cls: Union[Type[SimpleActor], Type[SharedActorCritic]] = SimpleActor + actor_kwargs: Dict[str, Any] = { + "conditional_sigma": False, + "tanh_squash": False, + } + if self.shared_critic: + reward_signal_configs = self.trainer_settings.reward_signals + reward_signal_names = [ + key.value for key, _ in reward_signal_configs.items() + ] + actor_cls = SharedActorCritic + actor_kwargs.update({"stream_names": reward_signal_names}) + + policy = TorchPolicy( + self.seed, + behavior_spec, + self.trainer_settings.network_settings, + actor_cls, + actor_kwargs, + ) + return policy + + def get_policy(self, name_behavior_id: str) -> Policy: + """ + Gets policy from trainer associated with name_behavior_id + :param name_behavior_id: full identifier of policy + """ + + return self.policy + + @staticmethod + def get_trainer_name() -> str: + return TRAINER_NAME diff --git a/MLPY/Lib/site-packages/mlagents/trainers/run_experiment.py b/MLPY/Lib/site-packages/mlagents/trainers/run_experiment.py new file mode 100644 index 0000000000000000000000000000000000000000..8544b673bcf209ac1c42a4967788dec5ef45c94b --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/run_experiment.py @@ -0,0 +1,31 @@ +import argparse +from typing import Optional, List +from mlagents.trainers.learn import run_cli +from mlagents.trainers.settings import RunOptions +from mlagents.trainers.cli_utils import load_config + +from mlagents.plugins.trainer_type import register_trainer_plugins + + +def parse_command_line(argv: Optional[List[str]] = None) -> argparse.Namespace: + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument("experiment_config_path") + return parser.parse_args(argv) + + +def main(): + """ + Provides an alternative CLI interface to mlagents-learn, 'mlagents-run-experiment'. + Accepts a JSON/YAML formatted mlagents.trainers.learn.RunOptions object, and executes + the run loop as defined in mlagents.trainers.learn.run_cli. 
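Hypothetical invocation sketch (the config file name is a placeholder):

            # Shell usage:
            #   mlagents-run-experiment experiment_config.yaml
            # The argument parsing can also be exercised directly:
            from mlagents.trainers.run_experiment import parse_command_line

            args = parse_command_line(["experiment_config.yaml"])
            print(args.experiment_config_path)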
+ """ + args = parse_command_line() + expt_config = load_config(args.experiment_config_path) + _, _ = register_trainer_plugins() + run_cli(RunOptions.from_dict(expt_config)) + + +if __name__ == "__main__": + main() diff --git a/MLPY/Lib/site-packages/mlagents/trainers/sac/__init__.py b/MLPY/Lib/site-packages/mlagents/trainers/sac/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/MLPY/Lib/site-packages/mlagents/trainers/sac/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/sac/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fb216e3d08c3e4aa43b01e7776954288e37e66cc Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/sac/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/sac/__pycache__/optimizer_torch.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/sac/__pycache__/optimizer_torch.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8d9b022d73a26de28c08d1db78a6a4935f838547 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/sac/__pycache__/optimizer_torch.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/sac/__pycache__/trainer.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/sac/__pycache__/trainer.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..18b510bde245fb576c6a0aadc9a2126f5ad5b9cf Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/sac/__pycache__/trainer.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/sac/optimizer_torch.py b/MLPY/Lib/site-packages/mlagents/trainers/sac/optimizer_torch.py new file mode 100644 index 0000000000000000000000000000000000000000..a7d566859d07937c36d0b1cc7389c84b91155b9d --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/sac/optimizer_torch.py @@ -0,0 +1,655 @@ +import numpy as np +from typing import Dict, List, NamedTuple, cast, Tuple, Optional +import attr + +from mlagents.torch_utils import torch, nn, default_device + +from mlagents_envs.logging_util import get_logger +from mlagents.trainers.optimizer.torch_optimizer import TorchOptimizer +from mlagents.trainers.policy.torch_policy import TorchPolicy +from mlagents.trainers.settings import NetworkSettings +from mlagents.trainers.torch_entities.networks import ValueNetwork, SharedActorCritic +from mlagents.trainers.torch_entities.agent_action import AgentAction +from mlagents.trainers.torch_entities.action_log_probs import ActionLogProbs +from mlagents.trainers.torch_entities.utils import ModelUtils +from mlagents.trainers.buffer import AgentBuffer, BufferKey, RewardSignalUtil +from mlagents_envs.timers import timed +from mlagents_envs.base_env import ActionSpec, ObservationSpec +from mlagents.trainers.exception import UnityTrainerException +from mlagents.trainers.settings import TrainerSettings, OffPolicyHyperparamSettings +from contextlib import ExitStack +from mlagents.trainers.trajectory import ObsUtil + +EPSILON = 1e-6 # Small value to avoid divide by zero + +logger = get_logger(__name__) + + +@attr.s(auto_attribs=True) +class SACSettings(OffPolicyHyperparamSettings): + batch_size: int = 128 + buffer_size: int = 50000 + buffer_init_steps: int = 0 + tau: float = 0.005 + steps_per_update: float = 1 + save_replay_buffer: bool = False + init_entcoef: float = 1.0 + 
reward_signal_steps_per_update: float = attr.ib() + + @reward_signal_steps_per_update.default + def _reward_signal_steps_per_update_default(self): + return self.steps_per_update + + +class TorchSACOptimizer(TorchOptimizer): + class PolicyValueNetwork(nn.Module): + def __init__( + self, + stream_names: List[str], + observation_specs: List[ObservationSpec], + network_settings: NetworkSettings, + action_spec: ActionSpec, + ): + super().__init__() + num_value_outs = max(sum(action_spec.discrete_branches), 1) + num_action_ins = int(action_spec.continuous_size) + + self.q1_network = ValueNetwork( + stream_names, + observation_specs, + network_settings, + num_action_ins, + num_value_outs, + ) + self.q2_network = ValueNetwork( + stream_names, + observation_specs, + network_settings, + num_action_ins, + num_value_outs, + ) + + def forward( + self, + inputs: List[torch.Tensor], + actions: Optional[torch.Tensor] = None, + memories: Optional[torch.Tensor] = None, + sequence_length: int = 1, + q1_grad: bool = True, + q2_grad: bool = True, + ) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]]: + """ + Performs a forward pass on the value network, which consists of a Q1 and Q2 + network. Optionally does not evaluate gradients for either the Q1, Q2, or both. + :param inputs: List of observation tensors. + :param actions: For a continuous Q function (has actions), tensor of actions. + Otherwise, None. + :param memories: Initial memories if using memory. Otherwise, None. + :param sequence_length: Sequence length if using memory. + :param q1_grad: Whether or not to compute gradients for the Q1 network. + :param q2_grad: Whether or not to compute gradients for the Q2 network. + :return: Tuple of two dictionaries, which both map {reward_signal: Q} for Q1 and Q2, + respectively. 
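The optional-gradient behaviour relies on contextlib.ExitStack; a minimal standalone sketch
            of that pattern (names are illustrative, not part of this module):

                import torch
                from contextlib import ExitStack

                def call_with_optional_grad(module, x, compute_grad):
                    # Enter torch.no_grad() only when gradients are not wanted for this pass.
                    with ExitStack() as stack:
                        if not compute_grad:
                            stack.enter_context(torch.no_grad())
                        return module(x)

                layer = torch.nn.Linear(4, 2)
                out = call_with_optional_grad(layer, torch.ones(1, 4), compute_grad=False)
                assert not out.requires_grad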
+ """ + # ExitStack allows us to enter the torch.no_grad() context conditionally + with ExitStack() as stack: + if not q1_grad: + stack.enter_context(torch.no_grad()) + q1_out, _ = self.q1_network( + inputs, + actions=actions, + memories=memories, + sequence_length=sequence_length, + ) + with ExitStack() as stack: + if not q2_grad: + stack.enter_context(torch.no_grad()) + q2_out, _ = self.q2_network( + inputs, + actions=actions, + memories=memories, + sequence_length=sequence_length, + ) + return q1_out, q2_out + + class TargetEntropy(NamedTuple): + + discrete: List[float] = [] # One per branch + continuous: float = 0.0 + + class LogEntCoef(nn.Module): + def __init__(self, discrete, continuous): + super().__init__() + self.discrete = discrete + self.continuous = continuous + + def __init__(self, policy: TorchPolicy, trainer_settings: TrainerSettings): + super().__init__(policy, trainer_settings) + reward_signal_configs = trainer_settings.reward_signals + reward_signal_names = [key.value for key, _ in reward_signal_configs.items()] + if isinstance(policy.actor, SharedActorCritic): + raise UnityTrainerException("SAC does not support SharedActorCritic") + self._critic = ValueNetwork( + reward_signal_names, + policy.behavior_spec.observation_specs, + policy.network_settings, + ) + hyperparameters: SACSettings = cast( + SACSettings, trainer_settings.hyperparameters + ) + + self.tau = hyperparameters.tau + self.init_entcoef = hyperparameters.init_entcoef + + self.policy = policy + policy_network_settings = policy.network_settings + + self.tau = hyperparameters.tau + self.burn_in_ratio = 0.0 + + # Non-exposed SAC parameters + self.discrete_target_entropy_scale = 0.2 # Roughly equal to e-greedy 0.05 + self.continuous_target_entropy_scale = 1.0 + + self.stream_names = list(self.reward_signals.keys()) + # Use to reduce "survivor bonus" when using Curiosity or GAIL. + self.gammas = [_val.gamma for _val in trainer_settings.reward_signals.values()] + self.use_dones_in_backup = { + name: int(not self.reward_signals[name].ignore_done) + for name in self.stream_names + } + self._action_spec = self.policy.behavior_spec.action_spec + + self.q_network = TorchSACOptimizer.PolicyValueNetwork( + self.stream_names, + self.policy.behavior_spec.observation_specs, + policy_network_settings, + self._action_spec, + ) + + self.target_network = ValueNetwork( + self.stream_names, + self.policy.behavior_spec.observation_specs, + policy_network_settings, + ) + ModelUtils.soft_update(self._critic, self.target_network, 1.0) + + # We create one entropy coefficient per action, whether discrete or continuous. 
+ _disc_log_ent_coef = torch.nn.Parameter( + torch.log( + torch.as_tensor( + [self.init_entcoef] * len(self._action_spec.discrete_branches) + ) + ), + requires_grad=True, + ) + _cont_log_ent_coef = torch.nn.Parameter( + torch.log(torch.as_tensor([self.init_entcoef])), requires_grad=True + ) + self._log_ent_coef = TorchSACOptimizer.LogEntCoef( + discrete=_disc_log_ent_coef, continuous=_cont_log_ent_coef + ) + _cont_target = ( + -1 + * self.continuous_target_entropy_scale + * np.prod(self._action_spec.continuous_size).astype(np.float32) + ) + _disc_target = [ + self.discrete_target_entropy_scale * np.log(i).astype(np.float32) + for i in self._action_spec.discrete_branches + ] + self.target_entropy = TorchSACOptimizer.TargetEntropy( + continuous=_cont_target, discrete=_disc_target + ) + policy_params = list(self.policy.actor.parameters()) + value_params = list(self.q_network.parameters()) + list( + self._critic.parameters() + ) + + logger.debug("value_vars") + for param in value_params: + logger.debug(param.shape) + logger.debug("policy_vars") + for param in policy_params: + logger.debug(param.shape) + + self.decay_learning_rate = ModelUtils.DecayedValue( + hyperparameters.learning_rate_schedule, + hyperparameters.learning_rate, + 1e-10, + self.trainer_settings.max_steps, + ) + self.policy_optimizer = torch.optim.Adam( + policy_params, lr=hyperparameters.learning_rate + ) + self.value_optimizer = torch.optim.Adam( + value_params, lr=hyperparameters.learning_rate + ) + self.entropy_optimizer = torch.optim.Adam( + self._log_ent_coef.parameters(), lr=hyperparameters.learning_rate + ) + self._move_to_device(default_device()) + + @property + def critic(self): + return self._critic + + def _move_to_device(self, device: torch.device) -> None: + self._log_ent_coef.to(device) + self.target_network.to(device) + self._critic.to(device) + self.q_network.to(device) + + def sac_q_loss( + self, + q1_out: Dict[str, torch.Tensor], + q2_out: Dict[str, torch.Tensor], + target_values: Dict[str, torch.Tensor], + dones: torch.Tensor, + rewards: Dict[str, torch.Tensor], + loss_masks: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor]: + q1_losses = [] + q2_losses = [] + # Multiple q losses per stream + for i, name in enumerate(q1_out.keys()): + q1_stream = q1_out[name].squeeze() + q2_stream = q2_out[name].squeeze() + with torch.no_grad(): + q_backup = rewards[name] + ( + (1.0 - self.use_dones_in_backup[name] * dones) + * self.gammas[i] + * target_values[name] + ) + _q1_loss = 0.5 * ModelUtils.masked_mean( + torch.nn.functional.mse_loss(q_backup, q1_stream), loss_masks + ) + _q2_loss = 0.5 * ModelUtils.masked_mean( + torch.nn.functional.mse_loss(q_backup, q2_stream), loss_masks + ) + + q1_losses.append(_q1_loss) + q2_losses.append(_q2_loss) + q1_loss = torch.mean(torch.stack(q1_losses)) + q2_loss = torch.mean(torch.stack(q2_losses)) + return q1_loss, q2_loss + + def sac_value_loss( + self, + log_probs: ActionLogProbs, + values: Dict[str, torch.Tensor], + q1p_out: Dict[str, torch.Tensor], + q2p_out: Dict[str, torch.Tensor], + loss_masks: torch.Tensor, + ) -> torch.Tensor: + min_policy_qs = {} + with torch.no_grad(): + _cont_ent_coef = self._log_ent_coef.continuous.exp() + _disc_ent_coef = self._log_ent_coef.discrete.exp() + for name in values.keys(): + if self._action_spec.discrete_size <= 0: + min_policy_qs[name] = torch.min(q1p_out[name], q2p_out[name]) + else: + disc_action_probs = log_probs.all_discrete_tensor.exp() + _branched_q1p = ModelUtils.break_into_branches( + q1p_out[name] * disc_action_probs, + 
self._action_spec.discrete_branches, + ) + _branched_q2p = ModelUtils.break_into_branches( + q2p_out[name] * disc_action_probs, + self._action_spec.discrete_branches, + ) + _q1p_mean = torch.mean( + torch.stack( + [ + torch.sum(_br, dim=1, keepdim=True) + for _br in _branched_q1p + ] + ), + dim=0, + ) + _q2p_mean = torch.mean( + torch.stack( + [ + torch.sum(_br, dim=1, keepdim=True) + for _br in _branched_q2p + ] + ), + dim=0, + ) + + min_policy_qs[name] = torch.min(_q1p_mean, _q2p_mean) + + value_losses = [] + if self._action_spec.discrete_size <= 0: + for name in values.keys(): + with torch.no_grad(): + v_backup = min_policy_qs[name] - torch.sum( + _cont_ent_coef * log_probs.continuous_tensor, dim=1 + ) + value_loss = 0.5 * ModelUtils.masked_mean( + torch.nn.functional.mse_loss(values[name], v_backup), loss_masks + ) + value_losses.append(value_loss) + else: + disc_log_probs = log_probs.all_discrete_tensor + branched_per_action_ent = ModelUtils.break_into_branches( + disc_log_probs * disc_log_probs.exp(), + self._action_spec.discrete_branches, + ) + # We have to do entropy bonus per action branch + branched_ent_bonus = torch.stack( + [ + torch.sum(_disc_ent_coef[i] * _lp, dim=1, keepdim=True) + for i, _lp in enumerate(branched_per_action_ent) + ] + ) + for name in values.keys(): + with torch.no_grad(): + v_backup = min_policy_qs[name] - torch.mean( + branched_ent_bonus, axis=0 + ) + # Add continuous entropy bonus to minimum Q + if self._action_spec.continuous_size > 0: + v_backup += torch.sum( + _cont_ent_coef * log_probs.continuous_tensor, + dim=1, + keepdim=True, + ) + value_loss = 0.5 * ModelUtils.masked_mean( + torch.nn.functional.mse_loss(values[name], v_backup.squeeze()), + loss_masks, + ) + value_losses.append(value_loss) + value_loss = torch.mean(torch.stack(value_losses)) + if torch.isinf(value_loss).any() or torch.isnan(value_loss).any(): + raise UnityTrainerException("Inf found") + return value_loss + + def sac_policy_loss( + self, + log_probs: ActionLogProbs, + q1p_outs: Dict[str, torch.Tensor], + loss_masks: torch.Tensor, + ) -> torch.Tensor: + _cont_ent_coef, _disc_ent_coef = ( + self._log_ent_coef.continuous, + self._log_ent_coef.discrete, + ) + _cont_ent_coef = _cont_ent_coef.exp() + _disc_ent_coef = _disc_ent_coef.exp() + + mean_q1 = torch.mean(torch.stack(list(q1p_outs.values())), axis=0) + batch_policy_loss = 0 + if self._action_spec.discrete_size > 0: + disc_log_probs = log_probs.all_discrete_tensor + disc_action_probs = disc_log_probs.exp() + branched_per_action_ent = ModelUtils.break_into_branches( + disc_log_probs * disc_action_probs, self._action_spec.discrete_branches + ) + branched_q_term = ModelUtils.break_into_branches( + mean_q1 * disc_action_probs, self._action_spec.discrete_branches + ) + branched_policy_loss = torch.stack( + [ + torch.sum(_disc_ent_coef[i] * _lp - _qt, dim=1, keepdim=False) + for i, (_lp, _qt) in enumerate( + zip(branched_per_action_ent, branched_q_term) + ) + ], + dim=1, + ) + batch_policy_loss += torch.sum(branched_policy_loss, dim=1) + all_mean_q1 = torch.sum(disc_action_probs * mean_q1, dim=1) + else: + all_mean_q1 = mean_q1 + if self._action_spec.continuous_size > 0: + cont_log_probs = log_probs.continuous_tensor + batch_policy_loss += ( + _cont_ent_coef * torch.sum(cont_log_probs, dim=1) - all_mean_q1 + ) + policy_loss = ModelUtils.masked_mean(batch_policy_loss, loss_masks) + + return policy_loss + + def sac_entropy_loss( + self, log_probs: ActionLogProbs, loss_masks: torch.Tensor + ) -> torch.Tensor: + _cont_ent_coef, _disc_ent_coef 
= ( + self._log_ent_coef.continuous, + self._log_ent_coef.discrete, + ) + entropy_loss = 0 + if self._action_spec.discrete_size > 0: + with torch.no_grad(): + # Break discrete log probs into separate branches + disc_log_probs = log_probs.all_discrete_tensor + branched_per_action_ent = ModelUtils.break_into_branches( + disc_log_probs * disc_log_probs.exp(), + self._action_spec.discrete_branches, + ) + target_current_diff_branched = torch.stack( + [ + torch.sum(_lp, axis=1, keepdim=True) + _te + for _lp, _te in zip( + branched_per_action_ent, self.target_entropy.discrete + ) + ], + axis=1, + ) + target_current_diff = torch.squeeze( + target_current_diff_branched, axis=2 + ) + entropy_loss += -1 * ModelUtils.masked_mean( + torch.mean(_disc_ent_coef * target_current_diff, axis=1), loss_masks + ) + if self._action_spec.continuous_size > 0: + with torch.no_grad(): + cont_log_probs = log_probs.continuous_tensor + target_current_diff = ( + torch.sum(cont_log_probs, dim=1) + self.target_entropy.continuous + ) + # We update all the _cont_ent_coef as one block + entropy_loss += -1 * ModelUtils.masked_mean( + _cont_ent_coef * target_current_diff, loss_masks + ) + + return entropy_loss + + def _condense_q_streams( + self, q_output: Dict[str, torch.Tensor], discrete_actions: torch.Tensor + ) -> Dict[str, torch.Tensor]: + condensed_q_output = {} + onehot_actions = ModelUtils.actions_to_onehot( + discrete_actions, self._action_spec.discrete_branches + ) + for key, item in q_output.items(): + branched_q = ModelUtils.break_into_branches( + item, self._action_spec.discrete_branches + ) + only_action_qs = torch.stack( + [ + torch.sum(_act * _q, dim=1, keepdim=True) + for _act, _q in zip(onehot_actions, branched_q) + ] + ) + + condensed_q_output[key] = torch.mean(only_action_qs, dim=0) + return condensed_q_output + + @timed + def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]: + """ + Updates model using buffer. + :param batch: Experience mini-batch. + :param num_sequences: Number of trajectories in batch. + :return: Output from update process. + """ + rewards = {} + for name in self.reward_signals: + rewards[name] = ModelUtils.list_to_tensor( + batch[RewardSignalUtil.rewards_key(name)] + ) + + n_obs = len(self.policy.behavior_spec.observation_specs) + current_obs = ObsUtil.from_buffer(batch, n_obs) + # Convert to tensors + current_obs = [ModelUtils.list_to_tensor(obs) for obs in current_obs] + + next_obs = ObsUtil.from_buffer_next(batch, n_obs) + # Convert to tensors + next_obs = [ModelUtils.list_to_tensor(obs) for obs in next_obs] + + act_masks = ModelUtils.list_to_tensor(batch[BufferKey.ACTION_MASK]) + actions = AgentAction.from_buffer(batch) + + memories_list = [ + ModelUtils.list_to_tensor(batch[BufferKey.MEMORY][i]) + for i in range(0, len(batch[BufferKey.MEMORY]), self.policy.sequence_length) + ] + # LSTM shouldn't have sequence length <1, but stop it from going out of the index if true.
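# A minimal, self-contained sketch of what _condense_q_streams above computes, written with
# plain torch instead of the ModelUtils helpers; the branch sizes and tensor values are made
# up for illustration. For each discrete branch, keep only the Q-value of the action that was
# actually taken, then average across branches.
import torch

q_values = torch.tensor([[1.0, 2.0, 3.0, 10.0, 20.0]])  # one sample, two branches of size 3 and 2
taken = torch.tensor([[2, 0]])                           # chosen action index per branch
branched_q = torch.split(q_values, [3, 2], dim=1)        # ([1., 2., 3.],), ([10., 20.],)
per_branch_q = [
    torch.gather(q, 1, taken[:, i : i + 1]) for i, q in enumerate(branched_q)
]                                                        # [[3.0]], [[10.0]]
condensed = torch.mean(torch.stack(per_branch_q), dim=0)  # tensor([[6.5]]), cf. condensed_q_output[key]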
+ value_memories_list = [ + ModelUtils.list_to_tensor(batch[BufferKey.CRITIC_MEMORY][i]) + for i in range( + 0, len(batch[BufferKey.CRITIC_MEMORY]), self.policy.sequence_length + ) + ] + + if len(memories_list) > 0: + memories = torch.stack(memories_list).unsqueeze(0) + value_memories = torch.stack(value_memories_list).unsqueeze(0) + else: + memories = None + value_memories = None + + # Q and V network memories are 0'ed out, since we don't have them during inference. + q_memories = ( + torch.zeros_like(value_memories) if value_memories is not None else None + ) + + # Copy normalizers from policy + self.q_network.q1_network.network_body.copy_normalization( + self.policy.actor.network_body + ) + self.q_network.q2_network.network_body.copy_normalization( + self.policy.actor.network_body + ) + self.target_network.network_body.copy_normalization( + self.policy.actor.network_body + ) + self._critic.network_body.copy_normalization(self.policy.actor.network_body) + sampled_actions, run_out, _, = self.policy.actor.get_action_and_stats( + current_obs, + masks=act_masks, + memories=memories, + sequence_length=self.policy.sequence_length, + ) + log_probs = run_out["log_probs"] + value_estimates, _ = self._critic.critic_pass( + current_obs, value_memories, sequence_length=self.policy.sequence_length + ) + + cont_sampled_actions = sampled_actions.continuous_tensor + cont_actions = actions.continuous_tensor + q1p_out, q2p_out = self.q_network( + current_obs, + cont_sampled_actions, + memories=q_memories, + sequence_length=self.policy.sequence_length, + q2_grad=False, + ) + q1_out, q2_out = self.q_network( + current_obs, + cont_actions, + memories=q_memories, + sequence_length=self.policy.sequence_length, + ) + + if self._action_spec.discrete_size > 0: + disc_actions = actions.discrete_tensor + q1_stream = self._condense_q_streams(q1_out, disc_actions) + q2_stream = self._condense_q_streams(q2_out, disc_actions) + else: + q1_stream, q2_stream = q1_out, q2_out + + with torch.no_grad(): + # Since we didn't record the next value memories, evaluate one step in the critic to + # get them. 
+ if value_memories is not None: + # Get the first observation in each sequence + just_first_obs = [ + _obs[:: self.policy.sequence_length] for _obs in current_obs + ] + _, next_value_memories = self._critic.critic_pass( + just_first_obs, value_memories, sequence_length=1 + ) + else: + next_value_memories = None + target_values, _ = self.target_network( + next_obs, + memories=next_value_memories, + sequence_length=self.policy.sequence_length, + ) + masks = ModelUtils.list_to_tensor(batch[BufferKey.MASKS], dtype=torch.bool) + dones = ModelUtils.list_to_tensor(batch[BufferKey.DONE]) + + q1_loss, q2_loss = self.sac_q_loss( + q1_stream, q2_stream, target_values, dones, rewards, masks + ) + value_loss = self.sac_value_loss( + log_probs, value_estimates, q1p_out, q2p_out, masks + ) + policy_loss = self.sac_policy_loss(log_probs, q1p_out, masks) + entropy_loss = self.sac_entropy_loss(log_probs, masks) + + total_value_loss = q1_loss + q2_loss + value_loss + + decay_lr = self.decay_learning_rate.get_value(self.policy.get_current_step()) + ModelUtils.update_learning_rate(self.policy_optimizer, decay_lr) + self.policy_optimizer.zero_grad() + policy_loss.backward() + self.policy_optimizer.step() + + ModelUtils.update_learning_rate(self.value_optimizer, decay_lr) + self.value_optimizer.zero_grad() + total_value_loss.backward() + self.value_optimizer.step() + + ModelUtils.update_learning_rate(self.entropy_optimizer, decay_lr) + self.entropy_optimizer.zero_grad() + entropy_loss.backward() + self.entropy_optimizer.step() + + # Update target network + ModelUtils.soft_update(self._critic, self.target_network, self.tau) + update_stats = { + "Losses/Policy Loss": policy_loss.item(), + "Losses/Value Loss": value_loss.item(), + "Losses/Q1 Loss": q1_loss.item(), + "Losses/Q2 Loss": q2_loss.item(), + "Policy/Discrete Entropy Coeff": torch.mean( + torch.exp(self._log_ent_coef.discrete) + ).item(), + "Policy/Continuous Entropy Coeff": torch.mean( + torch.exp(self._log_ent_coef.continuous) + ).item(), + "Policy/Learning Rate": decay_lr, + } + + return update_stats + + def get_modules(self): + modules = { + "Optimizer:q_network": self.q_network, + "Optimizer:value_network": self._critic, + "Optimizer:target_network": self.target_network, + "Optimizer:policy_optimizer": self.policy_optimizer, + "Optimizer:value_optimizer": self.value_optimizer, + "Optimizer:entropy_optimizer": self.entropy_optimizer, + } + for reward_provider in self.reward_signals.values(): + modules.update(reward_provider.get_modules()) + return modules diff --git a/MLPY/Lib/site-packages/mlagents/trainers/sac/trainer.py b/MLPY/Lib/site-packages/mlagents/trainers/sac/trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..56860c7381849dfb4ec4edab2813972c76f568ed --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/sac/trainer.py @@ -0,0 +1,181 @@ +# ## ML-Agent Learning (SAC) +# Contains an implementation of SAC as described in https://arxiv.org/abs/1801.01290 +# and implemented in https://github.com/hill-a/stable-baselines + +from typing import cast + +import numpy as np + +from mlagents_envs.logging_util import get_logger +from mlagents_envs.base_env import BehaviorSpec +from mlagents.trainers.buffer import BufferKey +from mlagents.trainers.optimizer.torch_optimizer import TorchOptimizer +from mlagents.trainers.trainer.off_policy_trainer import OffPolicyTrainer +from mlagents.trainers.policy.torch_policy import TorchPolicy +from mlagents.trainers.policy.policy import Policy +from 
mlagents.trainers.sac.optimizer_torch import TorchSACOptimizer, SACSettings +from mlagents.trainers.trajectory import Trajectory, ObsUtil +from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers +from mlagents.trainers.settings import TrainerSettings + +from mlagents.trainers.torch_entities.networks import SimpleActor + +logger = get_logger(__name__) + +BUFFER_TRUNCATE_PERCENT = 0.8 + +TRAINER_NAME = "sac" + + +class SACTrainer(OffPolicyTrainer): + """ + The SACTrainer is an implementation of the SAC algorithm, with support + for discrete actions and recurrent networks. + """ + + def __init__( + self, + behavior_name: str, + reward_buff_cap: int, + trainer_settings: TrainerSettings, + training: bool, + load: bool, + seed: int, + artifact_path: str, + ): + """ + Responsible for collecting experiences and training SAC model. + :param behavior_name: The name of the behavior associated with trainer config + :param reward_buff_cap: Max reward history to track in the reward buffer + :param trainer_settings: The parameters for the trainer. + :param training: Whether the trainer is set for training. + :param load: Whether the model should be loaded. + :param seed: The seed the model will be initialized with + :param artifact_path: The directory within which to store artifacts from this trainer. + """ + super().__init__( + behavior_name, + reward_buff_cap, + trainer_settings, + training, + load, + seed, + artifact_path, + ) + + self.seed = seed + self.policy: TorchPolicy = None # type: ignore + self.optimizer: TorchSACOptimizer = None # type: ignore + self.hyperparameters: SACSettings = cast( + SACSettings, trainer_settings.hyperparameters + ) + self._step = 0 + + # Don't divide by zero + self.update_steps = 1 + self.reward_signal_update_steps = 1 + + self.steps_per_update = self.hyperparameters.steps_per_update + self.reward_signal_steps_per_update = ( + self.hyperparameters.reward_signal_steps_per_update + ) + + self.checkpoint_replay_buffer = self.hyperparameters.save_replay_buffer + + def _process_trajectory(self, trajectory: Trajectory) -> None: + """ + Takes a trajectory and processes it, putting it into the replay buffer. + """ + super()._process_trajectory(trajectory) + last_step = trajectory.steps[-1] + agent_id = trajectory.agent_id # All the agents should have the same ID + + agent_buffer_trajectory = trajectory.to_agentbuffer() + # Check if we used group rewards, warn if so. 
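# The hyperparameters cast to SACSettings in __init__ above (steps_per_update,
# reward_signal_steps_per_update, save_replay_buffer, ...) come from the trainer
# configuration. A minimal sketch, assuming mlagents is importable and that SACSettings
# accepts these OffPolicyHyperparamSettings fields as keyword arguments; the values are made up:
from mlagents.trainers.sac.optimizer_torch import SACSettings

hp = SACSettings(
    batch_size=128,                       # minibatch size sampled from the replay buffer
    buffer_size=50000,                    # replay buffer capacity
    steps_per_update=10.0,                # roughly, agent steps collected per model update
    reward_signal_steps_per_update=10.0,  # same ratio, for the reward-signal updates
    save_replay_buffer=False,             # surfaces as checkpoint_replay_buffer in the trainer
)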
+ self._warn_if_group_reward(agent_buffer_trajectory) + + # Update the normalization + if self.is_training: + self.policy.actor.update_normalization(agent_buffer_trajectory) + self.optimizer.critic.update_normalization(agent_buffer_trajectory) + + # Evaluate all reward functions for reporting purposes + self.collected_rewards["environment"][agent_id] += np.sum( + agent_buffer_trajectory[BufferKey.ENVIRONMENT_REWARDS] + ) + for name, reward_signal in self.optimizer.reward_signals.items(): + evaluate_result = ( + reward_signal.evaluate(agent_buffer_trajectory) * reward_signal.strength + ) + + # Report the reward signals + self.collected_rewards[name][agent_id] += np.sum(evaluate_result) + + # Get all value estimates for reporting purposes + ( + value_estimates, + _, + value_memories, + ) = self.optimizer.get_trajectory_value_estimates( + agent_buffer_trajectory, trajectory.next_obs, trajectory.done_reached + ) + if value_memories is not None: + agent_buffer_trajectory[BufferKey.CRITIC_MEMORY].set(value_memories) + + for name, v in value_estimates.items(): + self._stats_reporter.add_stat( + f"Policy/{self.optimizer.reward_signals[name].name.capitalize()} Value", + np.mean(v), + ) + + # Bootstrap using the last step rather than the bootstrap step if max step is reached. + # Set last element to duplicate obs and remove dones. + if last_step.interrupted: + last_step_obs = last_step.obs + for i, obs in enumerate(last_step_obs): + agent_buffer_trajectory[ObsUtil.get_name_at_next(i)][-1] = obs + agent_buffer_trajectory[BufferKey.DONE][-1] = False + + self._append_to_update_buffer(agent_buffer_trajectory) + + if trajectory.done_reached: + self._update_end_episode_stats(agent_id, self.optimizer) + + def create_optimizer(self) -> TorchOptimizer: + return TorchSACOptimizer( # type: ignore + cast(TorchPolicy, self.policy), self.trainer_settings # type: ignore + ) # type: ignore + + def create_policy( + self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec + ) -> TorchPolicy: + """ + Creates a policy with a PyTorch backend and SAC hyperparameters + :param parsed_behavior_id: + :param behavior_spec: specifications for policy construction + :return policy + """ + actor_cls = SimpleActor + actor_kwargs = {"conditional_sigma": True, "tanh_squash": True} + + policy = TorchPolicy( + self.seed, + behavior_spec, + self.trainer_settings.network_settings, + actor_cls, + actor_kwargs, + ) + self.maybe_load_replay_buffer() + return policy + + def get_policy(self, name_behavior_id: str) -> Policy: + """ + Gets policy from trainer associated with name_behavior_id + :param name_behavior_id: full identifier of policy + """ + + return self.policy + + @staticmethod + def get_trainer_name() -> str: + return TRAINER_NAME diff --git a/MLPY/Lib/site-packages/mlagents/trainers/settings.py b/MLPY/Lib/site-packages/mlagents/trainers/settings.py new file mode 100644 index 0000000000000000000000000000000000000000..7cff991ba22ec8c55e8fc785e46b346a58c7ceaf --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/settings.py @@ -0,0 +1,961 @@ +import os.path +import warnings + +import attr +import cattr +from typing import ( + Dict, + Optional, + List, + Any, + DefaultDict, + Mapping, + Tuple, + Union, + ClassVar, +) +from enum import Enum +import collections +import argparse +import abc +import numpy as np +import math +import copy + +from mlagents.trainers.cli_utils import StoreConfigFile, DetectDefault, parser +from mlagents.trainers.cli_utils import load_config +from mlagents.trainers.exception import 
TrainerConfigError, TrainerConfigWarning + +from mlagents_envs import logging_util +from mlagents_envs.side_channel.environment_parameters_channel import ( + EnvironmentParametersChannel, +) +from mlagents.plugins import all_trainer_settings, all_trainer_types + +logger = logging_util.get_logger(__name__) + + +def check_and_structure(key: str, value: Any, class_type: type) -> Any: + attr_fields_dict = attr.fields_dict(class_type) + if key not in attr_fields_dict: + raise TrainerConfigError( + f"The option {key} was specified in your YAML file for {class_type.__name__}, but is invalid." + ) + # Apply cattr structure to the values + return cattr.structure(value, attr_fields_dict[key].type) + + +def check_hyperparam_schedules(val: Dict, trainer_type: str) -> Dict: + # Check if beta and epsilon are set. If not, set to match learning rate schedule. + if trainer_type == "ppo" or trainer_type == "poca": + if "beta_schedule" not in val.keys() and "learning_rate_schedule" in val.keys(): + val["beta_schedule"] = val["learning_rate_schedule"] + if ( + "epsilon_schedule" not in val.keys() + and "learning_rate_schedule" in val.keys() + ): + val["epsilon_schedule"] = val["learning_rate_schedule"] + return val + + +def strict_to_cls(d: Mapping, t: type) -> Any: + if not isinstance(d, Mapping): + raise TrainerConfigError(f"Unsupported config {d} for {t.__name__}.") + d_copy: Dict[str, Any] = {} + d_copy.update(d) + for key, val in d_copy.items(): + d_copy[key] = check_and_structure(key, val, t) + return t(**d_copy) + + +def defaultdict_to_dict(d: DefaultDict) -> Dict: + return {key: cattr.unstructure(val) for key, val in d.items()} + + +def deep_update_dict(d: Dict, update_d: Mapping) -> None: + """ + Similar to dict.update(), but works for nested dicts of dicts as well. + """ + for key, val in update_d.items(): + if key in d and isinstance(d[key], Mapping) and isinstance(val, Mapping): + deep_update_dict(d[key], val) + else: + d[key] = val + + +class SerializationSettings: + convert_to_onnx = True + onnx_opset = 9 + + +@attr.s(auto_attribs=True) +class ExportableSettings: + def as_dict(self): + return cattr.unstructure(self) + + +class EncoderType(Enum): + FULLY_CONNECTED = "fully_connected" + MATCH3 = "match3" + SIMPLE = "simple" + NATURE_CNN = "nature_cnn" + RESNET = "resnet" + + +class ScheduleType(Enum): + CONSTANT = "constant" + LINEAR = "linear" + # TODO add support for lesson based scheduling + # LESSON = "lesson" + + +class ConditioningType(Enum): + HYPER = "hyper" + NONE = "none" + + +@attr.s(auto_attribs=True) +class NetworkSettings: + @attr.s + class MemorySettings: + sequence_length: int = attr.ib(default=64) + memory_size: int = attr.ib(default=128) + + @memory_size.validator + def _check_valid_memory_size(self, attribute, value): + if value <= 0: + raise TrainerConfigError( + "When using a recurrent network, memory size must be greater than 0." + ) + elif value % 2 != 0: + raise TrainerConfigError( + "When using a recurrent network, memory size must be divisible by 2." 
+ ) + + normalize: bool = False + hidden_units: int = 128 + num_layers: int = 2 + vis_encode_type: EncoderType = EncoderType.SIMPLE + memory: Optional[MemorySettings] = None + goal_conditioning_type: ConditioningType = ConditioningType.HYPER + deterministic: bool = parser.get_default("deterministic") + + +@attr.s(auto_attribs=True) +class BehavioralCloningSettings: + demo_path: str + steps: int = 0 + strength: float = 1.0 + samples_per_update: int = 0 + # Setting either of these to None will allow the Optimizer + # to decide these parameters, based on Trainer hyperparams + num_epoch: Optional[int] = None + batch_size: Optional[int] = None + + +@attr.s(auto_attribs=True) +class HyperparamSettings: + batch_size: int = 1024 + buffer_size: int = 10240 + learning_rate: float = 3.0e-4 + learning_rate_schedule: ScheduleType = ScheduleType.CONSTANT + + +@attr.s(auto_attribs=True) +class OnPolicyHyperparamSettings(HyperparamSettings): + num_epoch: int = 3 + + +@attr.s(auto_attribs=True) +class OffPolicyHyperparamSettings(HyperparamSettings): + batch_size: int = 128 + buffer_size: int = 50000 + buffer_init_steps: int = 0 + steps_per_update: float = 1 + save_replay_buffer: bool = False + reward_signal_steps_per_update: float = 4 + + +# INTRINSIC REWARD SIGNALS ############################################################# +class RewardSignalType(Enum): + EXTRINSIC: str = "extrinsic" + GAIL: str = "gail" + CURIOSITY: str = "curiosity" + RND: str = "rnd" + + def to_settings(self) -> type: + _mapping = { + RewardSignalType.EXTRINSIC: RewardSignalSettings, + RewardSignalType.GAIL: GAILSettings, + RewardSignalType.CURIOSITY: CuriositySettings, + RewardSignalType.RND: RNDSettings, + } + return _mapping[self] + + +@attr.s(auto_attribs=True) +class RewardSignalSettings: + gamma: float = 0.99 + strength: float = 1.0 + network_settings: NetworkSettings = attr.ib(factory=NetworkSettings) + + @staticmethod + def structure(d: Mapping, t: type) -> Any: + """ + Helper method to structure a Dict of RewardSignalSettings class. Meant to be registered with + cattr.register_structure_hook() and called with cattr.structure(). This is needed to handle + the special Enum selection of RewardSignalSettings classes. + """ + if not isinstance(d, Mapping): + raise TrainerConfigError(f"Unsupported reward signal configuration {d}.") + d_final: Dict[RewardSignalType, RewardSignalSettings] = {} + for key, val in d.items(): + enum_key = RewardSignalType(key) + t = enum_key.to_settings() + d_final[enum_key] = strict_to_cls(val, t) + # Checks to see if user specifying deprecated encoding_size for RewardSignals. + # If network_settings is not specified, this updates the default hidden_units + # to the value of encoding size. If specified, this ignores encoding size and + # uses network_settings values. + if "encoding_size" in val: + logger.warning( + "'encoding_size' was deprecated for RewardSignals. Please use network_settings." + ) + # If network settings was not specified, use the encoding size. 
Otherwise, use hidden_units + if "network_settings" not in val: + d_final[enum_key].network_settings.hidden_units = val[ + "encoding_size" + ] + return d_final + + +@attr.s(auto_attribs=True) +class GAILSettings(RewardSignalSettings): + learning_rate: float = 3e-4 + encoding_size: Optional[int] = None + use_actions: bool = False + use_vail: bool = False + demo_path: str = attr.ib(kw_only=True) + + +@attr.s(auto_attribs=True) +class CuriositySettings(RewardSignalSettings): + learning_rate: float = 3e-4 + encoding_size: Optional[int] = None + + +@attr.s(auto_attribs=True) +class RNDSettings(RewardSignalSettings): + learning_rate: float = 1e-4 + encoding_size: Optional[int] = None + + +# SAMPLERS ############################################################################# +class ParameterRandomizationType(Enum): + UNIFORM: str = "uniform" + GAUSSIAN: str = "gaussian" + MULTIRANGEUNIFORM: str = "multirangeuniform" + CONSTANT: str = "constant" + + def to_settings(self) -> type: + _mapping = { + ParameterRandomizationType.UNIFORM: UniformSettings, + ParameterRandomizationType.GAUSSIAN: GaussianSettings, + ParameterRandomizationType.MULTIRANGEUNIFORM: MultiRangeUniformSettings, + ParameterRandomizationType.CONSTANT: ConstantSettings + # Constant type is handled if a float is provided instead of a config + } + return _mapping[self] + + +@attr.s(auto_attribs=True) +class ParameterRandomizationSettings(abc.ABC): + seed: int = parser.get_default("seed") + + def __str__(self) -> str: + """ + Helper method to output sampler stats to console. + """ + raise TrainerConfigError(f"__str__ not implemented for type {self.__class__}.") + + @staticmethod + def structure( + d: Union[Mapping, float], t: type + ) -> "ParameterRandomizationSettings": + """ + Helper method to a ParameterRandomizationSettings class. Meant to be registered with + cattr.register_structure_hook() and called with cattr.structure(). This is needed to handle + the special Enum selection of ParameterRandomizationSettings classes. + """ + if isinstance(d, (float, int)): + return ConstantSettings(value=d) + if not isinstance(d, Mapping): + raise TrainerConfigError( + f"Unsupported parameter randomization configuration {d}." + ) + if "sampler_type" not in d: + raise TrainerConfigError( + f"Sampler configuration does not contain sampler_type : {d}." + ) + if "sampler_parameters" not in d: + raise TrainerConfigError( + f"Sampler configuration does not contain sampler_parameters : {d}." + ) + enum_key = ParameterRandomizationType(d["sampler_type"]) + t = enum_key.to_settings() + return strict_to_cls(d["sampler_parameters"], t) + + @staticmethod + def unstructure(d: "ParameterRandomizationSettings") -> Mapping: + """ + Helper method to a ParameterRandomizationSettings class. Meant to be registered with + cattr.register_unstructure_hook() and called with cattr.unstructure(). 
+ """ + _reversed_mapping = { + UniformSettings: ParameterRandomizationType.UNIFORM, + GaussianSettings: ParameterRandomizationType.GAUSSIAN, + MultiRangeUniformSettings: ParameterRandomizationType.MULTIRANGEUNIFORM, + ConstantSettings: ParameterRandomizationType.CONSTANT, + } + sampler_type: Optional[str] = None + for t, name in _reversed_mapping.items(): + if isinstance(d, t): + sampler_type = name.value + sampler_parameters = attr.asdict(d) + return {"sampler_type": sampler_type, "sampler_parameters": sampler_parameters} + + @abc.abstractmethod + def apply(self, key: str, env_channel: EnvironmentParametersChannel) -> None: + """ + Helper method to send sampler settings over EnvironmentParametersChannel + Calls the appropriate sampler type set method. + :param key: environment parameter to be sampled + :param env_channel: The EnvironmentParametersChannel to communicate sampler settings to environment + """ + pass + + +@attr.s(auto_attribs=True) +class ConstantSettings(ParameterRandomizationSettings): + value: float = 0.0 + + def __str__(self) -> str: + """ + Helper method to output sampler stats to console. + """ + return f"Float: value={self.value}" + + def apply(self, key: str, env_channel: EnvironmentParametersChannel) -> None: + """ + Helper method to send sampler settings over EnvironmentParametersChannel + Calls the constant sampler type set method. + :param key: environment parameter to be sampled + :param env_channel: The EnvironmentParametersChannel to communicate sampler settings to environment + """ + env_channel.set_float_parameter(key, self.value) + + +@attr.s(auto_attribs=True) +class UniformSettings(ParameterRandomizationSettings): + min_value: float = attr.ib() + max_value: float = 1.0 + + def __str__(self) -> str: + """ + Helper method to output sampler stats to console. + """ + return f"Uniform sampler: min={self.min_value}, max={self.max_value}" + + @min_value.default + def _min_value_default(self): + return 0.0 + + @min_value.validator + def _check_min_value(self, attribute, value): + if self.min_value > self.max_value: + raise TrainerConfigError( + "Minimum value is greater than maximum value in uniform sampler." + ) + + def apply(self, key: str, env_channel: EnvironmentParametersChannel) -> None: + """ + Helper method to send sampler settings over EnvironmentParametersChannel + Calls the uniform sampler type set method. + :param key: environment parameter to be sampled + :param env_channel: The EnvironmentParametersChannel to communicate sampler settings to environment + """ + env_channel.set_uniform_sampler_parameters( + key, self.min_value, self.max_value, self.seed + ) + + +@attr.s(auto_attribs=True) +class GaussianSettings(ParameterRandomizationSettings): + mean: float = 1.0 + st_dev: float = 1.0 + + def __str__(self) -> str: + """ + Helper method to output sampler stats to console. + """ + return f"Gaussian sampler: mean={self.mean}, stddev={self.st_dev}" + + def apply(self, key: str, env_channel: EnvironmentParametersChannel) -> None: + """ + Helper method to send sampler settings over EnvironmentParametersChannel + Calls the gaussian sampler type set method. 
+ :param key: environment parameter to be sampled + :param env_channel: The EnvironmentParametersChannel to communicate sampler settings to environment + """ + env_channel.set_gaussian_sampler_parameters( + key, self.mean, self.st_dev, self.seed + ) + + +@attr.s(auto_attribs=True) +class MultiRangeUniformSettings(ParameterRandomizationSettings): + intervals: List[Tuple[float, float]] = attr.ib() + + def __str__(self) -> str: + """ + Helper method to output sampler stats to console. + """ + return f"MultiRangeUniform sampler: intervals={self.intervals}" + + @intervals.default + def _intervals_default(self): + return [[0.0, 1.0]] + + @intervals.validator + def _check_intervals(self, attribute, value): + for interval in self.intervals: + if len(interval) != 2: + raise TrainerConfigError( + f"The sampling interval {interval} must contain exactly two values." + ) + min_value, max_value = interval + if min_value > max_value: + raise TrainerConfigError( + f"Minimum value is greater than maximum value in interval {interval}." + ) + + def apply(self, key: str, env_channel: EnvironmentParametersChannel) -> None: + """ + Helper method to send sampler settings over EnvironmentParametersChannel + Calls the multirangeuniform sampler type set method. + :param key: environment parameter to be sampled + :param env_channel: The EnvironmentParametersChannel to communicate sampler settings to environment + """ + env_channel.set_multirangeuniform_sampler_parameters( + key, self.intervals, self.seed + ) + + +# ENVIRONMENT PARAMETERS ############################################################### +@attr.s(auto_attribs=True) +class CompletionCriteriaSettings: + """ + CompletionCriteriaSettings contains the information needed to figure out if the next + lesson must start. + """ + + class MeasureType(Enum): + PROGRESS: str = "progress" + REWARD: str = "reward" + + behavior: str + measure: MeasureType = attr.ib(default=MeasureType.REWARD) + min_lesson_length: int = 0 + signal_smoothing: bool = True + threshold: float = attr.ib(default=0.0) + require_reset: bool = False + + @threshold.validator + def _check_threshold_value(self, attribute, value): + """ + Verify that the threshold has a value between 0 and 1 when the measure is + PROGRESS + """ + if self.measure == self.MeasureType.PROGRESS: + if self.threshold > 1.0: + raise TrainerConfigError( + "Threshold for next lesson cannot be greater than 1 when the measure is progress." + ) + if self.threshold < 0.0: + raise TrainerConfigError( + "Threshold for next lesson cannot be negative when the measure is progress." + ) + + def need_increment( + self, progress: float, reward_buffer: List[float], smoothing: float + ) -> Tuple[bool, float]: + """ + Given measures, this method returns a boolean indicating if the lesson + needs to change now, and a float corresponding to the new smoothed value. 
+ """ + # Is the min number of episodes reached + if len(reward_buffer) < self.min_lesson_length: + return False, smoothing + if self.measure == CompletionCriteriaSettings.MeasureType.PROGRESS: + if progress > self.threshold: + return True, smoothing + if self.measure == CompletionCriteriaSettings.MeasureType.REWARD: + if len(reward_buffer) < 1: + return False, smoothing + measure = np.mean(reward_buffer) + if math.isnan(measure): + return False, smoothing + if self.signal_smoothing: + measure = 0.25 * smoothing + 0.75 * measure + smoothing = measure + if measure > self.threshold: + return True, smoothing + return False, smoothing + + +@attr.s(auto_attribs=True) +class Lesson: + """ + Gathers the data of one lesson for one environment parameter including its name, + the condition that must be fullfiled for the lesson to be completed and a sampler + for the environment parameter. If the completion_criteria is None, then this is + the last lesson in the curriculum. + """ + + value: ParameterRandomizationSettings + name: str + completion_criteria: Optional[CompletionCriteriaSettings] = attr.ib(default=None) + + +@attr.s(auto_attribs=True) +class EnvironmentParameterSettings: + """ + EnvironmentParameterSettings is an ordered list of lessons for one environment + parameter. + """ + + curriculum: List[Lesson] + + @staticmethod + def _check_lesson_chain(lessons, parameter_name): + """ + Ensures that when using curriculum, all non-terminal lessons have a valid + CompletionCriteria, and that the terminal lesson does not contain a CompletionCriteria. + """ + num_lessons = len(lessons) + for index, lesson in enumerate(lessons): + if index < num_lessons - 1 and lesson.completion_criteria is None: + raise TrainerConfigError( + f"A non-terminal lesson does not have a completion_criteria for {parameter_name}." + ) + if index == num_lessons - 1 and lesson.completion_criteria is not None: + warnings.warn( + f"Your final lesson definition contains completion_criteria for {parameter_name}." + f"It will be ignored.", + TrainerConfigWarning, + ) + + @staticmethod + def structure(d: Mapping, t: type) -> Dict[str, "EnvironmentParameterSettings"]: + """ + Helper method to structure a Dict of EnvironmentParameterSettings class. Meant + to be registered with cattr.register_structure_hook() and called with + cattr.structure(). + """ + if not isinstance(d, Mapping): + raise TrainerConfigError( + f"Unsupported parameter environment parameter settings {d}." 
+ ) + d_final: Dict[str, EnvironmentParameterSettings] = {} + for environment_parameter, environment_parameter_config in d.items(): + if ( + isinstance(environment_parameter_config, Mapping) + and "curriculum" in environment_parameter_config + ): + d_final[environment_parameter] = strict_to_cls( + environment_parameter_config, EnvironmentParameterSettings + ) + EnvironmentParameterSettings._check_lesson_chain( + d_final[environment_parameter].curriculum, environment_parameter + ) + else: + sampler = ParameterRandomizationSettings.structure( + environment_parameter_config, ParameterRandomizationSettings + ) + d_final[environment_parameter] = EnvironmentParameterSettings( + curriculum=[ + Lesson( + completion_criteria=None, + value=sampler, + name=environment_parameter, + ) + ] + ) + return d_final + + +# TRAINERS ############################################################################# +@attr.s(auto_attribs=True) +class SelfPlaySettings: + save_steps: int = 20000 + team_change: int = attr.ib() + + @team_change.default + def _team_change_default(self): + # Assign team_change to about 4x save_steps + return self.save_steps * 5 + + swap_steps: int = 2000 + window: int = 10 + play_against_latest_model_ratio: float = 0.5 + initial_elo: float = 1200.0 + + +@attr.s(auto_attribs=True) +class TrainerSettings(ExportableSettings): + default_override: ClassVar[Optional["TrainerSettings"]] = None + trainer_type: str = "ppo" + hyperparameters: HyperparamSettings = attr.ib() + + @hyperparameters.default + def _set_default_hyperparameters(self): + return all_trainer_settings[self.trainer_type]() + + network_settings: NetworkSettings = attr.ib(factory=NetworkSettings) + reward_signals: Dict[RewardSignalType, RewardSignalSettings] = attr.ib( + factory=lambda: {RewardSignalType.EXTRINSIC: RewardSignalSettings()} + ) + init_path: Optional[str] = None + keep_checkpoints: int = 5 + checkpoint_interval: int = 500000 + max_steps: int = 500000 + time_horizon: int = 64 + summary_freq: int = 50000 + threaded: bool = False + self_play: Optional[SelfPlaySettings] = None + behavioral_cloning: Optional[BehavioralCloningSettings] = None + + cattr.register_structure_hook_func( + lambda t: t == Dict[RewardSignalType, RewardSignalSettings], + RewardSignalSettings.structure, + ) + + @network_settings.validator + def _check_batch_size_seq_length(self, attribute, value): + if self.network_settings.memory is not None: + if ( + self.network_settings.memory.sequence_length + > self.hyperparameters.batch_size + ): + raise TrainerConfigError( + "When using memory, sequence length must be less than or equal to batch size. " + ) + + @staticmethod + def dict_to_trainerdict(d: Dict, t: type) -> "TrainerSettings.DefaultTrainerDict": + return TrainerSettings.DefaultTrainerDict( + cattr.structure(d, Dict[str, TrainerSettings]) + ) + + @staticmethod + def structure(d: Mapping, t: type) -> Any: + """ + Helper method to structure a TrainerSettings class. Meant to be registered with + cattr.register_structure_hook() and called with cattr.structure(). + """ + + if not isinstance(d, Mapping): + raise TrainerConfigError(f"Unsupported config {d} for {t.__name__}.") + + d_copy: Dict[str, Any] = {} + + # Check if a default_settings was specified. If so, used those as the default + # rather than an empty dict. 
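# TrainerSettings.structure above, like the reward-signal and sampler hooks earlier in this
# file, funnels YAML-derived dicts through strict_to_cls / check_and_structure. A minimal
# sketch on NetworkSettings, assuming mlagents is importable; the field values are made up:
from mlagents.trainers.settings import NetworkSettings, strict_to_cls

ns = strict_to_cls({"hidden_units": 256, "num_layers": 3, "normalize": True}, NetworkSettings)
print(ns.hidden_units)  # 256; an unrecognized key would raise TrainerConfigError instead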
+ if TrainerSettings.default_override is not None: + d_copy.update(cattr.unstructure(TrainerSettings.default_override)) + + deep_update_dict(d_copy, d) + + if "framework" in d_copy: + logger.warning("Framework option was deprecated but was specified") + d_copy.pop("framework", None) + + for key, val in d_copy.items(): + if attr.has(type(val)): + # Don't convert already-converted attrs classes. + continue + if key == "hyperparameters": + if "trainer_type" not in d_copy: + raise TrainerConfigError( + "Hyperparameters were specified but no trainer_type was given." + ) + else: + d_copy[key] = check_hyperparam_schedules( + val, d_copy["trainer_type"] + ) + try: + d_copy[key] = strict_to_cls( + d_copy[key], all_trainer_settings[d_copy["trainer_type"]] + ) + except KeyError: + raise TrainerConfigError( + f"Settings for trainer type {d_copy['trainer_type']} were not found" + ) + elif key == "max_steps": + d_copy[key] = int(float(val)) + # In some legacy configs, max steps was specified as a float + elif key == "trainer_type": + if val not in all_trainer_types.keys(): + raise TrainerConfigError(f"Invalid trainer type {val} was found") + else: + d_copy[key] = check_and_structure(key, val, t) + return t(**d_copy) + + class DefaultTrainerDict(collections.defaultdict): + def __init__(self, *args): + # Depending on how this is called, args may have the defaultdict + # callable at the start of the list or not. In particular, unpickling + # will pass [TrainerSettings]. + if args and args[0] == TrainerSettings: + super().__init__(*args) + else: + super().__init__(TrainerSettings, *args) + self._config_specified = True + + def set_config_specified(self, require_config_specified: bool) -> None: + self._config_specified = require_config_specified + + def __missing__(self, key: Any) -> "TrainerSettings": + if TrainerSettings.default_override is not None: + self[key] = copy.deepcopy(TrainerSettings.default_override) + elif self._config_specified: + raise TrainerConfigError( + f"The behavior name {key} has not been specified in the trainer configuration. " + f"Please add an entry in the configuration file for {key}, or set default_settings." + ) + else: + logger.warning( + f"Behavior name {key} does not match any behaviors specified " + f"in the trainer configuration file. A default configuration will be used." + ) + self[key] = TrainerSettings() + return self[key] + + +# COMMAND LINE ######################################################################### +@attr.s(auto_attribs=True) +class CheckpointSettings: + run_id: str = parser.get_default("run_id") + initialize_from: Optional[str] = parser.get_default("initialize_from") + load_model: bool = parser.get_default("load_model") + resume: bool = parser.get_default("resume") + force: bool = parser.get_default("force") + train_model: bool = parser.get_default("train_model") + inference: bool = parser.get_default("inference") + results_dir: str = parser.get_default("results_dir") + + @property + def write_path(self) -> str: + return os.path.join(self.results_dir, self.run_id) + + @property + def maybe_init_path(self) -> Optional[str]: + return ( + os.path.join(self.results_dir, self.initialize_from) + if self.initialize_from is not None + else None + ) + + @property + def run_logs_dir(self) -> str: + return os.path.join(self.write_path, "run_logs") + + def prioritize_resume_init(self) -> None: + """Prioritize explicit command line resume/init over conflicting yaml options. 
+ if both resume/init are set at one place use resume""" + _non_default_args = DetectDefault.non_default_args + if "resume" in _non_default_args: + if self.initialize_from is not None: + logger.warning( + f"Both 'resume' and 'initialize_from={self.initialize_from}' are set!" + f" Current run will be resumed ignoring initialization." + ) + self.initialize_from = parser.get_default("initialize_from") + elif "initialize_from" in _non_default_args: + if self.resume: + logger.warning( + f"Both 'resume' and 'initialize_from={self.initialize_from}' are set!" + f" {self.run_id} is initialized_from {self.initialize_from} and resume will be ignored." + ) + self.resume = parser.get_default("resume") + elif self.resume and self.initialize_from is not None: + # no cli args but both are set in yaml file + logger.warning( + f"Both 'resume' and 'initialize_from={self.initialize_from}' are set in yaml file!" + f" Current run will be resumed ignoring initialization." + ) + self.initialize_from = parser.get_default("initialize_from") + + +@attr.s(auto_attribs=True) +class EnvironmentSettings: + env_path: Optional[str] = parser.get_default("env_path") + env_args: Optional[List[str]] = parser.get_default("env_args") + base_port: int = parser.get_default("base_port") + num_envs: int = attr.ib(default=parser.get_default("num_envs")) + num_areas: int = attr.ib(default=parser.get_default("num_areas")) + seed: int = parser.get_default("seed") + max_lifetime_restarts: int = parser.get_default("max_lifetime_restarts") + restarts_rate_limit_n: int = parser.get_default("restarts_rate_limit_n") + restarts_rate_limit_period_s: int = parser.get_default( + "restarts_rate_limit_period_s" + ) + + @num_envs.validator + def validate_num_envs(self, attribute, value): + if value > 1 and self.env_path is None: + raise ValueError("num_envs must be 1 if env_path is not set.") + + @num_areas.validator + def validate_num_area(self, attribute, value): + if value <= 0: + raise ValueError("num_areas must be set to a positive number >= 1.") + + +@attr.s(auto_attribs=True) +class EngineSettings: + width: int = parser.get_default("width") + height: int = parser.get_default("height") + quality_level: int = parser.get_default("quality_level") + time_scale: float = parser.get_default("time_scale") + target_frame_rate: int = parser.get_default("target_frame_rate") + capture_frame_rate: int = parser.get_default("capture_frame_rate") + no_graphics: bool = parser.get_default("no_graphics") + + +@attr.s(auto_attribs=True) +class TorchSettings: + device: Optional[str] = parser.get_default("device") + + +@attr.s(auto_attribs=True) +class RunOptions(ExportableSettings): + default_settings: Optional[TrainerSettings] = None + behaviors: TrainerSettings.DefaultTrainerDict = attr.ib( + factory=TrainerSettings.DefaultTrainerDict + ) + env_settings: EnvironmentSettings = attr.ib(factory=EnvironmentSettings) + engine_settings: EngineSettings = attr.ib(factory=EngineSettings) + environment_parameters: Optional[Dict[str, EnvironmentParameterSettings]] = None + checkpoint_settings: CheckpointSettings = attr.ib(factory=CheckpointSettings) + torch_settings: TorchSettings = attr.ib(factory=TorchSettings) + + # These are options that are relevant to the run itself, and not the engine or environment. + # They will be left here. 
+ debug: bool = parser.get_default("debug") + + # Convert to settings while making sure all fields are valid + cattr.register_structure_hook(EnvironmentSettings, strict_to_cls) + cattr.register_structure_hook(EngineSettings, strict_to_cls) + cattr.register_structure_hook(CheckpointSettings, strict_to_cls) + cattr.register_structure_hook_func( + lambda t: t == Dict[str, EnvironmentParameterSettings], + EnvironmentParameterSettings.structure, + ) + cattr.register_structure_hook(Lesson, strict_to_cls) + cattr.register_structure_hook( + ParameterRandomizationSettings, ParameterRandomizationSettings.structure + ) + cattr.register_unstructure_hook( + ParameterRandomizationSettings, ParameterRandomizationSettings.unstructure + ) + cattr.register_structure_hook(TrainerSettings, TrainerSettings.structure) + cattr.register_structure_hook( + TrainerSettings.DefaultTrainerDict, TrainerSettings.dict_to_trainerdict + ) + cattr.register_unstructure_hook(collections.defaultdict, defaultdict_to_dict) + + @staticmethod + def from_argparse(args: argparse.Namespace) -> "RunOptions": + """ + Takes an argparse.Namespace as specified in `parse_command_line`, loads input configuration files + from file paths, and converts to a RunOptions instance. + :param args: collection of command-line parameters passed to mlagents-learn + :return: RunOptions representing the passed in arguments, with trainer config, curriculum and sampler + configs loaded from files. + """ + argparse_args = vars(args) + config_path = StoreConfigFile.trainer_config_path + + # Load YAML + configured_dict: Dict[str, Any] = { + "checkpoint_settings": {}, + "env_settings": {}, + "engine_settings": {}, + "torch_settings": {}, + } + _require_all_behaviors = True + if config_path is not None: + configured_dict.update(load_config(config_path)) + else: + # If we're not loading from a file, we don't require all behavior names to be specified. + _require_all_behaviors = False + + # Use the YAML file values for all values not specified in the CLI. + for key in configured_dict.keys(): + # Detect bad config options + if key not in attr.fields_dict(RunOptions): + raise TrainerConfigError( + "The option {} was specified in your YAML file, but is invalid.".format( + key + ) + ) + + # Override with CLI args + # Keep deprecated --load working, TODO: remove + argparse_args["resume"] = argparse_args["resume"] or argparse_args["load_model"] + + for key, val in argparse_args.items(): + if key in DetectDefault.non_default_args: + if key in attr.fields_dict(CheckpointSettings): + configured_dict["checkpoint_settings"][key] = val + elif key in attr.fields_dict(EnvironmentSettings): + configured_dict["env_settings"][key] = val + elif key in attr.fields_dict(EngineSettings): + configured_dict["engine_settings"][key] = val + elif key in attr.fields_dict(TorchSettings): + configured_dict["torch_settings"][key] = val + else: # Base options + configured_dict[key] = val + + final_runoptions = RunOptions.from_dict(configured_dict) + final_runoptions.checkpoint_settings.prioritize_resume_init() + # Need check to bypass type checking but keep structure on dict working + if isinstance(final_runoptions.behaviors, TrainerSettings.DefaultTrainerDict): + # configure whether or not we should require all behavior names to be found in the config YAML + final_runoptions.behaviors.set_config_specified(_require_all_behaviors) + + _non_default_args = DetectDefault.non_default_args + + # Prioritize the deterministic mode from the cli for deterministic actions. 
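# from_argparse above hands the merged CLI/YAML dict to RunOptions.from_dict, which
# cattr-structures each section into the settings classes defined in this file. A minimal
# sketch, assuming mlagents is importable; the run id and flag are made up:
from mlagents.trainers.settings import RunOptions

opts = RunOptions.from_dict({"checkpoint_settings": {"run_id": "my_run", "force": True}})
print(opts.checkpoint_settings.write_path)  # <results_dir>/my_run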
+ if "deterministic" in _non_default_args: + for behaviour in final_runoptions.behaviors.keys(): + final_runoptions.behaviors[ + behaviour + ].network_settings.deterministic = argparse_args["deterministic"] + + return final_runoptions + + @staticmethod + def from_dict( + options_dict: Dict[str, Any], + ) -> "RunOptions": + # If a default settings was specified, set the TrainerSettings class override + if ( + "default_settings" in options_dict.keys() + and options_dict["default_settings"] is not None + ): + TrainerSettings.default_override = cattr.structure( + options_dict["default_settings"], TrainerSettings + ) + return cattr.structure(options_dict, RunOptions) diff --git a/MLPY/Lib/site-packages/mlagents/trainers/simple_env_manager.py b/MLPY/Lib/site-packages/mlagents/trainers/simple_env_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..b29fd1ad3b969e33512171090f45f57f0037e305 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/simple_env_manager.py @@ -0,0 +1,84 @@ +from typing import Dict, List + +from mlagents_envs.base_env import BaseEnv, BehaviorName, BehaviorSpec +from mlagents.trainers.env_manager import EnvManager, EnvironmentStep, AllStepResult +from mlagents_envs.timers import timed +from mlagents.trainers.action_info import ActionInfo +from mlagents.trainers.settings import ParameterRandomizationSettings +from mlagents_envs.side_channel.environment_parameters_channel import ( + EnvironmentParametersChannel, +) + + +class SimpleEnvManager(EnvManager): + """ + Simple implementation of the EnvManager interface that only handles one BaseEnv at a time. + This is generally only useful for testing; see SubprocessEnvManager for a production-quality implementation. + """ + + def __init__(self, env: BaseEnv, env_params: EnvironmentParametersChannel): + super().__init__() + self.env_params = env_params + self.env = env + self.previous_step: EnvironmentStep = EnvironmentStep.empty(0) + self.previous_all_action_info: Dict[str, ActionInfo] = {} + + def _step(self) -> List[EnvironmentStep]: + all_action_info = self._take_step(self.previous_step) + self.previous_all_action_info = all_action_info + + for brain_name, action_info in all_action_info.items(): + self.env.set_actions(brain_name, action_info.env_action) + self.env.step() + all_step_result = self._generate_all_results() + + step_info = EnvironmentStep( + all_step_result, 0, self.previous_all_action_info, {} + ) + self.previous_step = step_info + return [step_info] + + def _reset_env( + self, config: Dict[BehaviorName, float] = None + ) -> List[EnvironmentStep]: # type: ignore + self.set_env_parameters(config) + self.env.reset() + all_step_result = self._generate_all_results() + self.previous_step = EnvironmentStep(all_step_result, 0, {}, {}) + return [self.previous_step] + + def set_env_parameters(self, config: Dict = None) -> None: + """ + Sends environment parameter settings to C# via the + EnvironmentParametersSidehannel. 
+ :param config: Dict of environment parameter keys and values + """ + if config is not None: + for k, v in config.items(): + if isinstance(v, float): + self.env_params.set_float_parameter(k, v) + elif isinstance(v, ParameterRandomizationSettings): + v.apply(k, self.env_params) + + @property + def training_behaviors(self) -> Dict[BehaviorName, BehaviorSpec]: + return self.env.behavior_specs + + def close(self): + self.env.close() + + @timed + def _take_step(self, last_step: EnvironmentStep) -> Dict[BehaviorName, ActionInfo]: + all_action_info: Dict[str, ActionInfo] = {} + for brain_name, step_tuple in last_step.current_all_step_result.items(): + all_action_info[brain_name] = self.policies[brain_name].get_action( + step_tuple[0], + 0, # As there is only one worker, we assign the worker_id to 0. + ) + return all_action_info + + def _generate_all_results(self) -> AllStepResult: + all_step_result: AllStepResult = {} + for brain_name in self.env.behavior_specs: + all_step_result[brain_name] = self.env.get_steps(brain_name) + return all_step_result diff --git a/MLPY/Lib/site-packages/mlagents/trainers/stats.py b/MLPY/Lib/site-packages/mlagents/trainers/stats.py new file mode 100644 index 0000000000000000000000000000000000000000..78c86116e2abe2724a0d1ce563b828dec5c5c527 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/stats.py @@ -0,0 +1,393 @@ +from collections import defaultdict +from enum import Enum +from typing import List, Dict, NamedTuple, Any, Optional +import numpy as np +import abc +import os +import time +from threading import RLock + +from mlagents_envs.side_channel.stats_side_channel import StatsAggregationMethod + +from mlagents_envs.logging_util import get_logger +from mlagents_envs.timers import set_gauge +from torch.utils.tensorboard import SummaryWriter +from mlagents.torch_utils.globals import get_rank + +logger = get_logger(__name__) + + +def _dict_to_str(param_dict: Dict[str, Any], num_tabs: int) -> str: + """ + Takes a parameter dictionary and converts it to a human-readable string. + Recurses if there are multiple levels of dict. Used to print out hyperparameters. + + :param param_dict: A Dictionary of key, value parameters. + :return: A string version of this dictionary. + """ + if not isinstance(param_dict, dict): + return str(param_dict) + else: + append_newline = "\n" if num_tabs > 0 else "" + return append_newline + "\n".join( + [ + "\t" + + " " * num_tabs + + f"{x}:\t{_dict_to_str(param_dict[x], num_tabs + 1)}" + for x in param_dict + ] + ) + + +class StatsSummary(NamedTuple): + full_dist: List[float] + aggregation_method: StatsAggregationMethod + + @staticmethod + def empty() -> "StatsSummary": + return StatsSummary([], StatsAggregationMethod.AVERAGE) + + @property + def aggregated_value(self): + if self.aggregation_method == StatsAggregationMethod.SUM: + return self.sum + else: + return self.mean + + @property + def mean(self): + return np.mean(self.full_dist) + + @property + def std(self): + return np.std(self.full_dist) + + @property + def num(self): + return len(self.full_dist) + + @property + def sum(self): + return np.sum(self.full_dist) + + +class StatsPropertyType(Enum): + HYPERPARAMETERS = "hyperparameters" + SELF_PLAY = "selfplay" + + +class StatsWriter(abc.ABC): + """ + A StatsWriter abstract class. A StatsWriter takes in a category, key, scalar value, and step + and writes it out by some method. 
+ """ + + def on_add_stat( + self, + category: str, + key: str, + value: float, + aggregation: StatsAggregationMethod = StatsAggregationMethod.AVERAGE, + ) -> None: + """ + Callback method for handling an individual stat value as reported to the StatsReporter add_stat + or set_stat methods. + + :param category: Category of the statistics. Usually this is the behavior name. + :param key: The type of statistic, e.g. Environment/Reward. + :param value: The value of the statistic. + :param aggregation: The aggregation method for the statistic, default StatsAggregationMethod.AVERAGE. + """ + pass + + @abc.abstractmethod + def write_stats( + self, category: str, values: Dict[str, StatsSummary], step: int + ) -> None: + """ + Callback to record training information + :param category: Category of the statistics. Usually this is the behavior name. + :param values: Dictionary of statistics. + :param step: The current training step. + :return: + """ + pass + + def add_property( + self, category: str, property_type: StatsPropertyType, value: Any + ) -> None: + """ + Add a generic property to the StatsWriter. This could be e.g. a Dict of hyperparameters, + a max step count, a trainer type, etc. Note that not all StatsWriters need to be compatible + with all types of properties. For instance, a TB writer doesn't need a max step. + + :param category: The category that the property belongs to. + :param property_type: The type of property. + :param value: The property itself. + """ + pass + + +class GaugeWriter(StatsWriter): + """ + Write all stats that we receive to the timer gauges, so we can track them offline easily + """ + + @staticmethod + def sanitize_string(s: str) -> str: + """ + Clean up special characters in the category and value names. + """ + return s.replace("/", ".").replace(" ", "") + + def write_stats( + self, category: str, values: Dict[str, StatsSummary], step: int + ) -> None: + for val, stats_summary in values.items(): + set_gauge( + GaugeWriter.sanitize_string(f"{category}.{val}.mean"), + float(stats_summary.mean), + ) + set_gauge( + GaugeWriter.sanitize_string(f"{category}.{val}.sum"), + float(stats_summary.sum), + ) + + +class ConsoleWriter(StatsWriter): + def __init__(self): + self.training_start_time = time.time() + # If self-play, we want to print ELO as well as reward + self.self_play = False + self.self_play_team = -1 + self.rank = get_rank() + + def write_stats( + self, category: str, values: Dict[str, StatsSummary], step: int + ) -> None: + is_training = "Not Training" + if "Is Training" in values: + stats_summary = values["Is Training"] + if stats_summary.aggregated_value > 0.0: + is_training = "Training" + + elapsed_time = time.time() - self.training_start_time + log_info: List[str] = [category] + log_info.append(f"Step: {step}") + log_info.append(f"Time Elapsed: {elapsed_time:0.3f} s") + if "Environment/Cumulative Reward" in values: + stats_summary = values["Environment/Cumulative Reward"] + if self.rank is not None: + log_info.append(f"Rank: {self.rank}") + + log_info.append(f"Mean Reward: {stats_summary.mean:0.3f}") + if "Environment/Group Cumulative Reward" in values: + group_stats_summary = values["Environment/Group Cumulative Reward"] + log_info.append(f"Mean Group Reward: {group_stats_summary.mean:0.3f}") + else: + log_info.append(f"Std of Reward: {stats_summary.std:0.3f}") + log_info.append(is_training) + + if self.self_play and "Self-play/ELO" in values: + elo_stats = values["Self-play/ELO"] + log_info.append(f"ELO: {elo_stats.mean:0.3f}") + else: + 
log_info.append("No episode was completed since last summary") + log_info.append(is_training) + logger.info(". ".join(log_info) + ".") + + def add_property( + self, category: str, property_type: StatsPropertyType, value: Any + ) -> None: + if property_type == StatsPropertyType.HYPERPARAMETERS: + logger.info( + """Hyperparameters for behavior name {}: \n{}""".format( + category, _dict_to_str(value, 0) + ) + ) + elif property_type == StatsPropertyType.SELF_PLAY: + assert isinstance(value, bool) + self.self_play = value + + +class TensorboardWriter(StatsWriter): + def __init__( + self, + base_dir: str, + clear_past_data: bool = False, + hidden_keys: Optional[List[str]] = None, + ): + """ + A StatsWriter that writes to a Tensorboard summary. + + :param base_dir: The directory within which to place all the summaries. Tensorboard files will be written to a + {base_dir}/{category} directory. + :param clear_past_data: Whether or not to clean up existing Tensorboard files associated with the base_dir and + category. + :param hidden_keys: If provided, Tensorboard Writer won't write statistics identified with these Keys in + Tensorboard summary. + """ + self.summary_writers: Dict[str, SummaryWriter] = {} + self.base_dir: str = base_dir + self._clear_past_data = clear_past_data + self.hidden_keys: List[str] = hidden_keys if hidden_keys is not None else [] + + def write_stats( + self, category: str, values: Dict[str, StatsSummary], step: int + ) -> None: + self._maybe_create_summary_writer(category) + for key, value in values.items(): + if key in self.hidden_keys: + continue + self.summary_writers[category].add_scalar( + f"{key}", value.aggregated_value, step + ) + if value.aggregation_method == StatsAggregationMethod.HISTOGRAM: + self.summary_writers[category].add_histogram( + f"{key}_hist", np.array(value.full_dist), step + ) + self.summary_writers[category].flush() + + def _maybe_create_summary_writer(self, category: str) -> None: + if category not in self.summary_writers: + filewriter_dir = "{basedir}/{category}".format( + basedir=self.base_dir, category=category + ) + os.makedirs(filewriter_dir, exist_ok=True) + if self._clear_past_data: + self._delete_all_events_files(filewriter_dir) + self.summary_writers[category] = SummaryWriter(filewriter_dir) + + def _delete_all_events_files(self, directory_name: str) -> None: + for file_name in os.listdir(directory_name): + if file_name.startswith("events.out"): + logger.warning( + f"Deleting TensorBoard data {file_name} that was left over from a " + "previous run." + ) + full_fname = os.path.join(directory_name, file_name) + try: + os.remove(full_fname) + except OSError: + logger.error( + "{} was left over from a previous run and " + "not deleted.".format(full_fname) + ) + + def add_property( + self, category: str, property_type: StatsPropertyType, value: Any + ) -> None: + if property_type == StatsPropertyType.HYPERPARAMETERS: + assert isinstance(value, dict) + summary = _dict_to_str(value, 0) + self._maybe_create_summary_writer(category) + if summary is not None: + self.summary_writers[category].add_text("Hyperparameters", summary) + self.summary_writers[category].flush() + + +class StatsReporter: + writers: List[StatsWriter] = [] + stats_dict: Dict[str, Dict[str, List]] = defaultdict(lambda: defaultdict(list)) + lock = RLock() + stats_aggregation: Dict[str, Dict[str, StatsAggregationMethod]] = defaultdict( + lambda: defaultdict(lambda: StatsAggregationMethod.AVERAGE) + ) + + def __init__(self, category: str): + """ + Generic StatsReporter. 
A category is the broadest type of storage (would + correspond the run name and trainer name, e.g. 3DBalltest_3DBall. A key is the + type of stat it is (e.g. Environment/Reward). Finally the Value is the float value + attached to this stat. + """ + self.category: str = category + + @staticmethod + def add_writer(writer: StatsWriter) -> None: + with StatsReporter.lock: + StatsReporter.writers.append(writer) + + def add_property(self, property_type: StatsPropertyType, value: Any) -> None: + """ + Add a generic property to the StatsReporter. This could be e.g. a Dict of hyperparameters, + a max step count, a trainer type, etc. Note that not all StatsWriters need to be compatible + with all types of properties. For instance, a TB writer doesn't need a max step. + + :param property_type: The type of property. + :param value: The property itself. + """ + with StatsReporter.lock: + for writer in StatsReporter.writers: + writer.add_property(self.category, property_type, value) + + def add_stat( + self, + key: str, + value: float, + aggregation: StatsAggregationMethod = StatsAggregationMethod.AVERAGE, + ) -> None: + """ + Add a float value stat to the StatsReporter. + + :param key: The type of statistic, e.g. Environment/Reward. + :param value: the value of the statistic. + :param aggregation: the aggregation method for the statistic, default StatsAggregationMethod.AVERAGE. + """ + with StatsReporter.lock: + StatsReporter.stats_dict[self.category][key].append(value) + StatsReporter.stats_aggregation[self.category][key] = aggregation + for writer in StatsReporter.writers: + writer.on_add_stat(self.category, key, value, aggregation) + + def set_stat(self, key: str, value: float) -> None: + """ + Sets a stat value to a float. This is for values that we don't want to average, and just + want the latest. + + :param key: The type of statistic, e.g. Environment/Reward. + :param value: the value of the statistic. + """ + with StatsReporter.lock: + StatsReporter.stats_dict[self.category][key] = [value] + StatsReporter.stats_aggregation[self.category][ + key + ] = StatsAggregationMethod.MOST_RECENT + for writer in StatsReporter.writers: + writer.on_add_stat( + self.category, key, value, StatsAggregationMethod.MOST_RECENT + ) + + def write_stats(self, step: int) -> None: + """ + Write out all stored statistics that fall under the category specified. + The currently stored values will be averaged, written out as a single value, + and the buffer cleared. + + :param step: Training step which to write these stats as. + """ + with StatsReporter.lock: + values: Dict[str, StatsSummary] = {} + for key in StatsReporter.stats_dict[self.category]: + if len(StatsReporter.stats_dict[self.category][key]) > 0: + stat_summary = self.get_stats_summaries(key) + values[key] = stat_summary + for writer in StatsReporter.writers: + writer.write_stats(self.category, values, step) + del StatsReporter.stats_dict[self.category] + + def get_stats_summaries(self, key: str) -> StatsSummary: + """ + Get the mean, std, count, sum and aggregation method of a particular statistic, since last write. + + :param key: The type of statistic, e.g. Environment/Reward. + :returns: A StatsSummary containing summary statistics. 
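+ For example, after several add_stat("Environment/Reward", value) calls, calling this with key "Environment/Reward" returns a StatsSummary built from every value recorded since the last write_stats.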
+ """ + stat_values = StatsReporter.stats_dict[self.category][key] + if len(stat_values) == 0: + return StatsSummary.empty() + + return StatsSummary( + full_dist=stat_values, + aggregation_method=StatsReporter.stats_aggregation[self.category][key], + ) diff --git a/MLPY/Lib/site-packages/mlagents/trainers/subprocess_env_manager.py b/MLPY/Lib/site-packages/mlagents/trainers/subprocess_env_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..43d468f2bc338549d5fe95659804d31ae7602000 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/subprocess_env_manager.py @@ -0,0 +1,546 @@ +import datetime +from typing import Dict, NamedTuple, List, Any, Optional, Callable, Set +import cloudpickle +import enum +import time + +from mlagents_envs.environment import UnityEnvironment +from mlagents_envs.exception import ( + UnityCommunicationException, + UnityTimeOutException, + UnityEnvironmentException, + UnityCommunicatorStoppedException, +) +from multiprocessing import Process, Pipe, Queue +from multiprocessing.connection import Connection +from queue import Empty as EmptyQueueException +from mlagents_envs.base_env import BaseEnv, BehaviorName, BehaviorSpec +from mlagents_envs import logging_util +from mlagents.trainers.env_manager import EnvManager, EnvironmentStep, AllStepResult +from mlagents.trainers.settings import TrainerSettings +from mlagents_envs.timers import ( + TimerNode, + timed, + hierarchical_timer, + reset_timers, + get_timer_root, +) +from mlagents.trainers.settings import ParameterRandomizationSettings, RunOptions +from mlagents.trainers.action_info import ActionInfo +from mlagents_envs.side_channel.environment_parameters_channel import ( + EnvironmentParametersChannel, +) +from mlagents_envs.side_channel.engine_configuration_channel import ( + EngineConfigurationChannel, + EngineConfig, +) +from mlagents_envs.side_channel.stats_side_channel import ( + EnvironmentStats, + StatsSideChannel, +) +from mlagents.trainers.training_analytics_side_channel import ( + TrainingAnalyticsSideChannel, +) +from mlagents_envs.side_channel.side_channel import SideChannel + + +logger = logging_util.get_logger(__name__) +WORKER_SHUTDOWN_TIMEOUT_S = 10 + + +class EnvironmentCommand(enum.Enum): + STEP = 1 + BEHAVIOR_SPECS = 2 + ENVIRONMENT_PARAMETERS = 3 + RESET = 4 + CLOSE = 5 + ENV_EXITED = 6 + CLOSED = 7 + TRAINING_STARTED = 8 + + +class EnvironmentRequest(NamedTuple): + cmd: EnvironmentCommand + payload: Any = None + + +class EnvironmentResponse(NamedTuple): + cmd: EnvironmentCommand + worker_id: int + payload: Any + + +class StepResponse(NamedTuple): + all_step_result: AllStepResult + timer_root: Optional[TimerNode] + environment_stats: EnvironmentStats + + +class UnityEnvWorker: + def __init__(self, process: Process, worker_id: int, conn: Connection): + self.process = process + self.worker_id = worker_id + self.conn = conn + self.previous_step: EnvironmentStep = EnvironmentStep.empty(worker_id) + self.previous_all_action_info: Dict[str, ActionInfo] = {} + self.waiting = False + self.closed = False + + def send(self, cmd: EnvironmentCommand, payload: Any = None) -> None: + try: + req = EnvironmentRequest(cmd, payload) + self.conn.send(req) + except (BrokenPipeError, EOFError): + raise UnityCommunicationException("UnityEnvironment worker: send failed.") + + def recv(self) -> EnvironmentResponse: + try: + response: EnvironmentResponse = self.conn.recv() + if response.cmd == EnvironmentCommand.ENV_EXITED: + env_exception: Exception = response.payload + raise 
env_exception + return response + except (BrokenPipeError, EOFError): + raise UnityCommunicationException("UnityEnvironment worker: recv failed.") + + def request_close(self): + try: + self.conn.send(EnvironmentRequest(EnvironmentCommand.CLOSE)) + except (BrokenPipeError, EOFError): + logger.debug( + f"UnityEnvWorker {self.worker_id} got exception trying to close." + ) + pass + + +def worker( + parent_conn: Connection, + step_queue: Queue, + pickled_env_factory: str, + worker_id: int, + run_options: RunOptions, + log_level: int = logging_util.INFO, +) -> None: + env_factory: Callable[ + [int, List[SideChannel]], UnityEnvironment + ] = cloudpickle.loads(pickled_env_factory) + env_parameters = EnvironmentParametersChannel() + + engine_config = EngineConfig( + width=run_options.engine_settings.width, + height=run_options.engine_settings.height, + quality_level=run_options.engine_settings.quality_level, + time_scale=run_options.engine_settings.time_scale, + target_frame_rate=run_options.engine_settings.target_frame_rate, + capture_frame_rate=run_options.engine_settings.capture_frame_rate, + ) + engine_configuration_channel = EngineConfigurationChannel() + engine_configuration_channel.set_configuration(engine_config) + + stats_channel = StatsSideChannel() + training_analytics_channel: Optional[TrainingAnalyticsSideChannel] = None + if worker_id == 0: + training_analytics_channel = TrainingAnalyticsSideChannel() + env: UnityEnvironment = None + # Set log level. On some platforms, the logger isn't common with the + # main process, so we need to set it again. + logging_util.set_log_level(log_level) + + def _send_response(cmd_name: EnvironmentCommand, payload: Any) -> None: + parent_conn.send(EnvironmentResponse(cmd_name, worker_id, payload)) + + def _generate_all_results() -> AllStepResult: + all_step_result: AllStepResult = {} + for brain_name in env.behavior_specs: + all_step_result[brain_name] = env.get_steps(brain_name) + return all_step_result + + try: + side_channels = [env_parameters, engine_configuration_channel, stats_channel] + if training_analytics_channel is not None: + side_channels.append(training_analytics_channel) + + env = env_factory(worker_id, side_channels) + if ( + not env.academy_capabilities + or not env.academy_capabilities.trainingAnalytics + ): + # Make sure we don't try to send training analytics if the environment doesn't know how to process + # them. This wouldn't be catastrophic, but would result in unknown SideChannel UUIDs being used. + training_analytics_channel = None + if training_analytics_channel: + training_analytics_channel.environment_initialized(run_options) + + while True: + req: EnvironmentRequest = parent_conn.recv() + if req.cmd == EnvironmentCommand.STEP: + all_action_info = req.payload + for brain_name, action_info in all_action_info.items(): + if len(action_info.agent_ids) > 0: + env.set_actions(brain_name, action_info.env_action) + env.step() + all_step_result = _generate_all_results() + # The timers in this process are independent from all the processes and the "main" process + # So after we send back the root timer, we can safely clear them. + # Note that we could randomly return timers a fraction of the time if we wanted to reduce + # the data transferred. + # TODO get gauges from the workers and merge them in the main process too. 
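+ # Send the step results, this worker's timer tree, and the side-channel env stats back to the
+ # main process via the step queue, then clear the local timers for the next step.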
+ env_stats = stats_channel.get_and_reset_stats() + step_response = StepResponse( + all_step_result, get_timer_root(), env_stats + ) + step_queue.put( + EnvironmentResponse( + EnvironmentCommand.STEP, worker_id, step_response + ) + ) + reset_timers() + elif req.cmd == EnvironmentCommand.BEHAVIOR_SPECS: + _send_response(EnvironmentCommand.BEHAVIOR_SPECS, env.behavior_specs) + elif req.cmd == EnvironmentCommand.ENVIRONMENT_PARAMETERS: + for k, v in req.payload.items(): + if isinstance(v, ParameterRandomizationSettings): + v.apply(k, env_parameters) + elif req.cmd == EnvironmentCommand.TRAINING_STARTED: + behavior_name, trainer_config = req.payload + if training_analytics_channel: + training_analytics_channel.training_started( + behavior_name, trainer_config + ) + elif req.cmd == EnvironmentCommand.RESET: + env.reset() + all_step_result = _generate_all_results() + _send_response(EnvironmentCommand.RESET, all_step_result) + elif req.cmd == EnvironmentCommand.CLOSE: + break + except ( + KeyboardInterrupt, + UnityCommunicationException, + UnityTimeOutException, + UnityEnvironmentException, + UnityCommunicatorStoppedException, + ) as ex: + logger.debug(f"UnityEnvironment worker {worker_id}: environment stopping.") + step_queue.put( + EnvironmentResponse(EnvironmentCommand.ENV_EXITED, worker_id, ex) + ) + _send_response(EnvironmentCommand.ENV_EXITED, ex) + except Exception as ex: + logger.exception( + f"UnityEnvironment worker {worker_id}: environment raised an unexpected exception." + ) + step_queue.put( + EnvironmentResponse(EnvironmentCommand.ENV_EXITED, worker_id, ex) + ) + _send_response(EnvironmentCommand.ENV_EXITED, ex) + finally: + logger.debug(f"UnityEnvironment worker {worker_id} closing.") + if env is not None: + env.close() + logger.debug(f"UnityEnvironment worker {worker_id} done.") + parent_conn.close() + step_queue.put(EnvironmentResponse(EnvironmentCommand.CLOSED, worker_id, None)) + step_queue.close() + + +class SubprocessEnvManager(EnvManager): + def __init__( + self, + env_factory: Callable[[int, List[SideChannel]], BaseEnv], + run_options: RunOptions, + n_env: int = 1, + ): + super().__init__() + self.env_workers: List[UnityEnvWorker] = [] + self.step_queue: Queue = Queue() + self.workers_alive = 0 + self.env_factory = env_factory + self.run_options = run_options + self.env_parameters: Optional[Dict] = None + # Each worker is correlated with a list of times they restarted within the last time period. + self.recent_restart_timestamps: List[List[datetime.datetime]] = [ + [] for _ in range(n_env) + ] + self.restart_counts: List[int] = [0] * n_env + for worker_idx in range(n_env): + self.env_workers.append( + self.create_worker( + worker_idx, self.step_queue, env_factory, run_options + ) + ) + self.workers_alive += 1 + + @staticmethod + def create_worker( + worker_id: int, + step_queue: Queue, + env_factory: Callable[[int, List[SideChannel]], BaseEnv], + run_options: RunOptions, + ) -> UnityEnvWorker: + parent_conn, child_conn = Pipe() + + # Need to use cloudpickle for the env factory function since function objects aren't picklable + # on Windows as of Python 3.6. 
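+ # The worker process unpickles this factory (cloudpickle.loads) before constructing its UnityEnvironment.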
+ pickled_env_factory = cloudpickle.dumps(env_factory) + child_process = Process( + target=worker, + args=( + child_conn, + step_queue, + pickled_env_factory, + worker_id, + run_options, + logger.level, + ), + ) + child_process.start() + return UnityEnvWorker(child_process, worker_id, parent_conn) + + def _queue_steps(self) -> None: + for env_worker in self.env_workers: + if not env_worker.waiting: + env_action_info = self._take_step(env_worker.previous_step) + env_worker.previous_all_action_info = env_action_info + env_worker.send(EnvironmentCommand.STEP, env_action_info) + env_worker.waiting = True + + def _restart_failed_workers(self, first_failure: EnvironmentResponse) -> None: + if first_failure.cmd != EnvironmentCommand.ENV_EXITED: + return + # Drain the step queue to make sure all workers are paused and we have found all concurrent errors. + # Pausing all training is needed since we need to reset all pending training steps as they could be corrupted. + other_failures: Dict[int, Exception] = self._drain_step_queue() + # TODO: Once we use python 3.9 switch to using the | operator to combine dicts. + failures: Dict[int, Exception] = { + **{first_failure.worker_id: first_failure.payload}, + **other_failures, + } + for worker_id, ex in failures.items(): + self._assert_worker_can_restart(worker_id, ex) + logger.warning(f"Restarting worker[{worker_id}] after '{ex}'") + self.recent_restart_timestamps[worker_id].append(datetime.datetime.now()) + self.restart_counts[worker_id] += 1 + self.env_workers[worker_id] = self.create_worker( + worker_id, self.step_queue, self.env_factory, self.run_options + ) + # The restarts were successful, clear all the existing training trajectories so we don't use corrupted or + # outdated data. + self.reset(self.env_parameters) + + def _drain_step_queue(self) -> Dict[int, Exception]: + """ + Drains all steps out of the step queue and returns all exceptions from crashed workers. + This will effectively pause all workers so that they won't do anything until _queue_steps is called. + """ + all_failures = {} + workers_still_pending = {w.worker_id for w in self.env_workers if w.waiting} + deadline = datetime.datetime.now() + datetime.timedelta(minutes=1) + while workers_still_pending and deadline > datetime.datetime.now(): + try: + while True: + step: EnvironmentResponse = self.step_queue.get_nowait() + if step.cmd == EnvironmentCommand.ENV_EXITED: + workers_still_pending.add(step.worker_id) + all_failures[step.worker_id] = step.payload + else: + workers_still_pending.remove(step.worker_id) + self.env_workers[step.worker_id].waiting = False + except EmptyQueueException: + pass + if deadline < datetime.datetime.now(): + still_waiting = {w.worker_id for w in self.env_workers if w.waiting} + raise TimeoutError(f"Workers {still_waiting} stuck in waiting state") + return all_failures + + def _assert_worker_can_restart(self, worker_id: int, exception: Exception) -> None: + """ + Checks if we can recover from an exception from a worker. + If the restart limit is exceeded it will raise a UnityCommunicationException. + If the exception is not recoverable it re-raises the exception. + """ + if ( + isinstance(exception, UnityCommunicationException) + or isinstance(exception, UnityTimeOutException) + or isinstance(exception, UnityEnvironmentException) + or isinstance(exception, UnityCommunicatorStoppedException) + ): + if self._worker_has_restart_quota(worker_id): + return + else: + logger.error( + f"Worker {worker_id} exceeded the allowed number of restarts." 
+ ) + raise exception + raise exception + + def _worker_has_restart_quota(self, worker_id: int) -> bool: + self._drop_old_restart_timestamps(worker_id) + max_lifetime_restarts = self.run_options.env_settings.max_lifetime_restarts + max_limit_check = ( + max_lifetime_restarts == -1 + or self.restart_counts[worker_id] < max_lifetime_restarts + ) + + rate_limit_n = self.run_options.env_settings.restarts_rate_limit_n + rate_limit_check = ( + rate_limit_n == -1 + or len(self.recent_restart_timestamps[worker_id]) < rate_limit_n + ) + + return rate_limit_check and max_limit_check + + def _drop_old_restart_timestamps(self, worker_id: int) -> None: + """ + Drops environment restart timestamps that are outside of the current window. + """ + + def _filter(t: datetime.datetime) -> bool: + return t > datetime.datetime.now() - datetime.timedelta( + seconds=self.run_options.env_settings.restarts_rate_limit_period_s + ) + + self.recent_restart_timestamps[worker_id] = list( + filter(_filter, self.recent_restart_timestamps[worker_id]) + ) + + def _step(self) -> List[EnvironmentStep]: + # Queue steps for any workers which aren't in the "waiting" state. + self._queue_steps() + + worker_steps: List[EnvironmentResponse] = [] + step_workers: Set[int] = set() + # Poll the step queue for completed steps from environment workers until we retrieve + # 1 or more, which we will then return as StepInfos + while len(worker_steps) < 1: + try: + while True: + step: EnvironmentResponse = self.step_queue.get_nowait() + if step.cmd == EnvironmentCommand.ENV_EXITED: + # If even one env exits try to restart all envs that failed. + self._restart_failed_workers(step) + # Clear state and restart this function. + worker_steps.clear() + step_workers.clear() + self._queue_steps() + elif step.worker_id not in step_workers: + self.env_workers[step.worker_id].waiting = False + worker_steps.append(step) + step_workers.add(step.worker_id) + except EmptyQueueException: + pass + step_infos = self._postprocess_steps(worker_steps) + return step_infos + + def _reset_env(self, config: Optional[Dict] = None) -> List[EnvironmentStep]: + while any(ew.waiting for ew in self.env_workers): + if not self.step_queue.empty(): + step = self.step_queue.get_nowait() + self.env_workers[step.worker_id].waiting = False + # Send config to environment + self.set_env_parameters(config) + # First enqueue reset commands for all workers so that they reset in parallel + for ew in self.env_workers: + ew.send(EnvironmentCommand.RESET, config) + # Next (synchronously) collect the reset observations from each worker in sequence + for ew in self.env_workers: + ew.previous_step = EnvironmentStep(ew.recv().payload, ew.worker_id, {}, {}) + return list(map(lambda ew: ew.previous_step, self.env_workers)) + + def set_env_parameters(self, config: Dict = None) -> None: + """ + Sends environment parameter settings to C# via the + EnvironmentParametersSidehannel for each worker. + :param config: Dict of environment parameter keys and values + """ + self.env_parameters = config + for ew in self.env_workers: + ew.send(EnvironmentCommand.ENVIRONMENT_PARAMETERS, config) + + def on_training_started( + self, behavior_name: str, trainer_settings: TrainerSettings + ) -> None: + """ + Handle traing starting for a new behavior type. Generally nothing is necessary here. 
+ :param behavior_name: + :param trainer_settings: + :return: + """ + for ew in self.env_workers: + ew.send( + EnvironmentCommand.TRAINING_STARTED, (behavior_name, trainer_settings) + ) + + @property + def training_behaviors(self) -> Dict[BehaviorName, BehaviorSpec]: + result: Dict[BehaviorName, BehaviorSpec] = {} + for worker in self.env_workers: + worker.send(EnvironmentCommand.BEHAVIOR_SPECS) + result.update(worker.recv().payload) + return result + + def close(self) -> None: + logger.debug("SubprocessEnvManager closing.") + for env_worker in self.env_workers: + env_worker.request_close() + # Pull messages out of the queue until every worker has CLOSED or we time out. + deadline = time.time() + WORKER_SHUTDOWN_TIMEOUT_S + while self.workers_alive > 0 and time.time() < deadline: + try: + step: EnvironmentResponse = self.step_queue.get_nowait() + env_worker = self.env_workers[step.worker_id] + if step.cmd == EnvironmentCommand.CLOSED and not env_worker.closed: + env_worker.closed = True + self.workers_alive -= 1 + # Discard all other messages. + except EmptyQueueException: + pass + self.step_queue.close() + # Sanity check to kill zombie workers and report an issue if they occur. + if self.workers_alive > 0: + logger.error("SubprocessEnvManager had workers that didn't signal shutdown") + for env_worker in self.env_workers: + if not env_worker.closed and env_worker.process.is_alive(): + env_worker.process.terminate() + logger.error( + "A SubprocessEnvManager worker did not shut down correctly so it was forcefully terminated." + ) + self.step_queue.join_thread() + + def _postprocess_steps( + self, env_steps: List[EnvironmentResponse] + ) -> List[EnvironmentStep]: + step_infos = [] + timer_nodes = [] + for step in env_steps: + payload: StepResponse = step.payload + env_worker = self.env_workers[step.worker_id] + new_step = EnvironmentStep( + payload.all_step_result, + step.worker_id, + env_worker.previous_all_action_info, + payload.environment_stats, + ) + step_infos.append(new_step) + env_worker.previous_step = new_step + + if payload.timer_root: + timer_nodes.append(payload.timer_root) + + if timer_nodes: + with hierarchical_timer("workers") as main_timer_node: + for worker_timer_node in timer_nodes: + main_timer_node.merge( + worker_timer_node, root_name="worker_root", is_parallel=True + ) + + return step_infos + + @timed + def _take_step(self, last_step: EnvironmentStep) -> Dict[BehaviorName, ActionInfo]: + all_action_info: Dict[str, ActionInfo] = {} + for brain_name, step_tuple in last_step.current_all_step_result.items(): + if brain_name in self.policies: + all_action_info[brain_name] = self.policies[brain_name].get_action( + step_tuple[0], last_step.worker_id + ) + return all_action_info diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__init__.py b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0520a9f26c8efcbd85d38c83e8b360756c780db3 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/action_flattener.cpython-39.pyc 
b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/action_flattener.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..29b97d7cada3f6d989a0b9c4ff926828cdaedfab Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/action_flattener.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/action_log_probs.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/action_log_probs.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f93a96ce54a950ac103fd105a0ddf0861e9d010f Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/action_log_probs.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/action_model.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/action_model.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..520d19b853c791353a986042a30d7f4c177838ac Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/action_model.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/agent_action.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/agent_action.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9ac7a1e078bb65278c2dbcf60c08a67987804b44 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/agent_action.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/attention.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/attention.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2bebe18d90430fbd28b8613cea34b13e0b721bd1 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/attention.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/conditioning.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/conditioning.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..543a86397ca5dc7fdea7e7f72d572bfa5a719660 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/conditioning.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/decoders.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/decoders.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..de3de327b600f8c06184a6b276b224323e6f1590 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/decoders.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/distributions.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/distributions.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..525905043497d42e75a2b408cdc49fb6f6b6a271 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/distributions.cpython-39.pyc differ diff --git 
a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/encoders.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/encoders.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7e22dc2278e02cc0bce8b53b0a695ceefed95105 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/encoders.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/layers.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/layers.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5fafb38e71b2fc97601c8d15c5d26aa0aebbb00c Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/layers.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/model_serialization.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/model_serialization.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8071dd1776625198c18a5095e8ee888318743d8f Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/model_serialization.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/networks.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/networks.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..24fd5d6115a543bc496e58aa0f3b5aa600018840 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/networks.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/utils.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/utils.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2ed48ded864f83bc0fce6ca308eca335c001ce92 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/__pycache__/utils.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/action_flattener.py b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/action_flattener.py new file mode 100644 index 0000000000000000000000000000000000000000..beb529c96361ffc8cc0d679aad9771a3bd1cfed2 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/action_flattener.py @@ -0,0 +1,44 @@ +from typing import List +from mlagents.torch_utils import torch + +from mlagents_envs.base_env import ActionSpec +from mlagents.trainers.torch_entities.agent_action import AgentAction +from mlagents.trainers.torch_entities.utils import ModelUtils + + +class ActionFlattener: + def __init__(self, action_spec: ActionSpec): + """ + A torch module that creates the flattened form of an AgentAction object. + The flattened form is the continuous action concatenated with the + concatenated one hot encodings of the discrete actions. + :param action_spec: An ActionSpec that describes the action space dimensions + """ + self._specs = action_spec + + @property + def flattened_size(self) -> int: + """ + The flattened size is the continuous size plus the sum of the branch sizes + since discrete actions are encoded as one hots. 
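+ For example, with 2 continuous actions and discrete branches of sizes (3, 2), the flattened size is 2 + 3 + 2 = 7.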
+ """ + return self._specs.continuous_size + sum(self._specs.discrete_branches) + + def forward(self, action: AgentAction) -> torch.Tensor: + """ + Returns a tensor corresponding the flattened action + :param action: An AgentAction object + """ + action_list: List[torch.Tensor] = [] + if self._specs.continuous_size > 0: + action_list.append(action.continuous_tensor) + if self._specs.discrete_size > 0: + flat_discrete = torch.cat( + ModelUtils.actions_to_onehot( + torch.as_tensor(action.discrete_tensor, dtype=torch.long), + self._specs.discrete_branches, + ), + dim=1, + ) + action_list.append(flat_discrete) + return torch.cat(action_list, dim=1) diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/action_log_probs.py b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/action_log_probs.py new file mode 100644 index 0000000000000000000000000000000000000000..b72e7bb22358404f18a5a6c41ac0d93adacfc303 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/action_log_probs.py @@ -0,0 +1,118 @@ +from typing import List, Optional, NamedTuple +from mlagents.torch_utils import torch +import numpy as np + +from mlagents.trainers.torch_entities.utils import ModelUtils +from mlagents.trainers.buffer import AgentBuffer, BufferKey +from mlagents_envs.base_env import _ActionTupleBase + + +class LogProbsTuple(_ActionTupleBase): + """ + An object whose fields correspond to the log probs of actions of different types. + Continuous and discrete are numpy arrays + Dimensions are of (n_agents, continuous_size) and (n_agents, discrete_size), + respectively. Note, this also holds when continuous or discrete size is + zero. + """ + + @property + def discrete_dtype(self) -> np.dtype: + """ + The dtype of a discrete log probability. + """ + return np.float32 + + @staticmethod + def empty_log_probs() -> "LogProbsTuple": + """ + Generates a dummy LogProbsTuple + """ + return LogProbsTuple() + + +class ActionLogProbs(NamedTuple): + """ + A NamedTuple containing the tensor for continuous log probs and list of tensors for + discrete log probs of individual actions as well as all the log probs for an entire branch. + Utility functions provide numpy <=> tensor conversions to be used by the optimizers. + :param continuous_tensor: Torch tensor corresponding to log probs of continuous actions + :param discrete_list: List of Torch tensors each corresponding to log probs of the discrete actions that were + sampled. + :param all_discrete_list: List of Torch tensors each corresponding to all log probs of + a discrete action branch, even the discrete actions that were not sampled. all_discrete_list is a list of Tensors, + each Tensor corresponds to one discrete branch log probabilities. + """ + + continuous_tensor: torch.Tensor + discrete_list: Optional[List[torch.Tensor]] + all_discrete_list: Optional[List[torch.Tensor]] + + @property + def discrete_tensor(self): + """ + Returns the discrete log probs list as a stacked tensor + """ + return torch.stack(self.discrete_list, dim=-1) + + @property + def all_discrete_tensor(self): + """ + Returns the discrete log probs of each branch as a tensor + """ + return torch.cat(self.all_discrete_list, dim=1) + + def to_log_probs_tuple(self) -> LogProbsTuple: + """ + Returns a LogProbsTuple. Only adds if tensor is not None. Otherwise, + LogProbsTuple uses a default. 
+ """ + log_probs_tuple = LogProbsTuple() + if self.continuous_tensor is not None: + continuous = ModelUtils.to_numpy(self.continuous_tensor) + log_probs_tuple.add_continuous(continuous) + if self.discrete_list is not None: + discrete = ModelUtils.to_numpy(self.discrete_tensor) + log_probs_tuple.add_discrete(discrete) + return log_probs_tuple + + def _to_tensor_list(self) -> List[torch.Tensor]: + """ + Returns the tensors in the ActionLogProbs as a flat List of torch Tensors. This + is private and serves as a utility for self.flatten() + """ + tensor_list: List[torch.Tensor] = [] + if self.continuous_tensor is not None: + tensor_list.append(self.continuous_tensor) + if self.discrete_list is not None: + tensor_list.append(self.discrete_tensor) + return tensor_list + + def flatten(self) -> torch.Tensor: + """ + A utility method that returns all log probs in ActionLogProbs as a flattened tensor. + This is useful for algorithms like PPO which can treat all log probs in the same way. + """ + return torch.cat(self._to_tensor_list(), dim=1) + + @staticmethod + def from_buffer(buff: AgentBuffer) -> "ActionLogProbs": + """ + A static method that accesses continuous and discrete log probs fields in an AgentBuffer + and constructs the corresponding ActionLogProbs from the retrieved np arrays. + """ + continuous: torch.Tensor = None + discrete: List[torch.Tensor] = None # type: ignore + + if BufferKey.CONTINUOUS_LOG_PROBS in buff: + continuous = ModelUtils.list_to_tensor(buff[BufferKey.CONTINUOUS_LOG_PROBS]) + if BufferKey.DISCRETE_LOG_PROBS in buff: + discrete_tensor = ModelUtils.list_to_tensor( + buff[BufferKey.DISCRETE_LOG_PROBS] + ) + # This will keep discrete_list = None which enables flatten() + if discrete_tensor.shape[1] > 0: + discrete = [ + discrete_tensor[..., i] for i in range(discrete_tensor.shape[-1]) + ] + return ActionLogProbs(continuous, discrete, None) diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/action_model.py b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/action_model.py new file mode 100644 index 0000000000000000000000000000000000000000..7b88c0262d35c02286b3bae894569ce5330640f9 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/action_model.py @@ -0,0 +1,231 @@ +from typing import List, Tuple, NamedTuple, Optional +from mlagents.torch_utils import torch, nn +from mlagents.trainers.torch_entities.distributions import ( + DistInstance, + DiscreteDistInstance, + GaussianDistribution, + MultiCategoricalDistribution, +) +from mlagents.trainers.torch_entities.agent_action import AgentAction +from mlagents.trainers.torch_entities.action_log_probs import ActionLogProbs +from mlagents_envs.base_env import ActionSpec + + +EPSILON = 1e-7 # Small value to avoid divide by zero + + +class DistInstances(NamedTuple): + """ + A NamedTuple with fields corresponding the the DistInstance objects + output by continuous and discrete distributions, respectively. Discrete distributions + output a list of DistInstance objects whereas continuous distributions output a single + DistInstance object. + """ + + continuous: Optional[DistInstance] + discrete: Optional[List[DiscreteDistInstance]] + + +class ActionModel(nn.Module): + def __init__( + self, + hidden_size: int, + action_spec: ActionSpec, + conditional_sigma: bool = False, + tanh_squash: bool = False, + deterministic: bool = False, + ): + """ + A torch module that represents the action space of a policy. 
The ActionModel may contain + a continuous distribution, a discrete distribution or both where construction depends on + the action_spec. The ActionModel uses the encoded input of the network body to parameterize + these distributions. The forward method of this module outputs the action, log probs, + and entropies given the encoding from the network body. + :params hidden_size: Size of the input to the ActionModel. + :params action_spec: The ActionSpec defining the action space dimensions and distributions. + :params conditional_sigma: Whether or not the std of a Gaussian is conditioned on state. + :params tanh_squash: Whether to squash the output of a Gaussian with the tanh function. + :params deterministic: Whether to select actions deterministically in policy. + """ + super().__init__() + self.encoding_size = hidden_size + self.action_spec = action_spec + self._continuous_distribution = None + self._discrete_distribution = None + + if self.action_spec.continuous_size > 0: + self._continuous_distribution = GaussianDistribution( + self.encoding_size, + self.action_spec.continuous_size, + conditional_sigma=conditional_sigma, + tanh_squash=tanh_squash, + ) + + if self.action_spec.discrete_size > 0: + self._discrete_distribution = MultiCategoricalDistribution( + self.encoding_size, self.action_spec.discrete_branches + ) + + # During training, clipping is done in TorchPolicy, but we need to clip before ONNX + # export as well. + self.clip_action = not tanh_squash + self._deterministic = deterministic + + def _sample_action(self, dists: DistInstances) -> AgentAction: + """ + Samples actions from a DistInstances tuple + :params dists: The DistInstances tuple + :return: An AgentAction corresponding to the actions sampled from the DistInstances + """ + + continuous_action: Optional[torch.Tensor] = None + discrete_action: Optional[List[torch.Tensor]] = None + # This checks None because mypy complains otherwise + if dists.continuous is not None: + if self._deterministic: + continuous_action = dists.continuous.deterministic_sample() + else: + continuous_action = dists.continuous.sample() + if dists.discrete is not None: + discrete_action = [] + if self._deterministic: + for discrete_dist in dists.discrete: + discrete_action.append(discrete_dist.deterministic_sample()) + else: + for discrete_dist in dists.discrete: + discrete_action.append(discrete_dist.sample()) + return AgentAction(continuous_action, discrete_action) + + def _get_dists(self, inputs: torch.Tensor, masks: torch.Tensor) -> DistInstances: + """ + Creates a DistInstances tuple using the continuous and discrete distributions + :params inputs: The encoding from the network body + :params masks: Action masks for discrete actions + :return: A DistInstances tuple + """ + continuous_dist: Optional[DistInstance] = None + discrete_dist: Optional[List[DiscreteDistInstance]] = None + # This checks None because mypy complains otherwise + if self._continuous_distribution is not None: + continuous_dist = self._continuous_distribution(inputs) + if self._discrete_distribution is not None: + discrete_dist = self._discrete_distribution(inputs, masks) + return DistInstances(continuous_dist, discrete_dist) + + def _get_probs_and_entropy( + self, actions: AgentAction, dists: DistInstances + ) -> Tuple[ActionLogProbs, torch.Tensor]: + """ + Computes the log probabilites of the actions given distributions and entropies of + the given distributions. 
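+ The returned entropy tensor is the per-distribution entropies concatenated along dim 1 (continuous first, then one entry per discrete branch).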
+ :params actions: The AgentAction + :params dists: The DistInstances tuple + :return: An ActionLogProbs tuple and a torch tensor of the distribution entropies. + """ + entropies_list: List[torch.Tensor] = [] + continuous_log_prob: Optional[torch.Tensor] = None + discrete_log_probs: Optional[List[torch.Tensor]] = None + all_discrete_log_probs: Optional[List[torch.Tensor]] = None + # This checks None because mypy complains otherwise + if dists.continuous is not None: + continuous_log_prob = dists.continuous.log_prob(actions.continuous_tensor) + entropies_list.append(dists.continuous.entropy()) + if dists.discrete is not None: + discrete_log_probs = [] + all_discrete_log_probs = [] + for discrete_action, discrete_dist in zip( + actions.discrete_list, dists.discrete # type: ignore + ): + discrete_log_prob = discrete_dist.log_prob(discrete_action) + entropies_list.append(discrete_dist.entropy()) + discrete_log_probs.append(discrete_log_prob) + all_discrete_log_probs.append(discrete_dist.all_log_prob()) + action_log_probs = ActionLogProbs( + continuous_log_prob, discrete_log_probs, all_discrete_log_probs + ) + entropies = torch.cat(entropies_list, dim=1) + return action_log_probs, entropies + + def evaluate( + self, inputs: torch.Tensor, masks: torch.Tensor, actions: AgentAction + ) -> Tuple[ActionLogProbs, torch.Tensor]: + """ + Given actions and encoding from the network body, gets the distributions and + computes the log probabilites and entropies. + :params inputs: The encoding from the network body + :params masks: Action masks for discrete actions + :params actions: The AgentAction + :return: An ActionLogProbs tuple and a torch tensor of the distribution entropies. + """ + dists = self._get_dists(inputs, masks) + log_probs, entropies = self._get_probs_and_entropy(actions, dists) + # Use the sum of entropy across actions, not the mean + entropy_sum = torch.sum(entropies, dim=1) + return log_probs, entropy_sum + + def get_action_out(self, inputs: torch.Tensor, masks: torch.Tensor) -> torch.Tensor: + """ + Gets the tensors corresponding to the output of the policy network to be used for + inference. Called by the Actor's forward call. 
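+ When clip_action is set (i.e. tanh_squash is False), continuous outputs are clamped to [-3, 3] and rescaled into [-1, 1] before being returned.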
+ :params inputs: The encoding from the network body + :params masks: Action masks for discrete actions + :return: A tuple of torch tensors corresponding to the inference output + """ + dists = self._get_dists(inputs, masks) + continuous_out, discrete_out, action_out_deprecated = None, None, None + deterministic_continuous_out, deterministic_discrete_out = ( + None, + None, + ) # deterministic actions + if self.action_spec.continuous_size > 0 and dists.continuous is not None: + continuous_out = dists.continuous.exported_model_output() + action_out_deprecated = continuous_out + deterministic_continuous_out = dists.continuous.deterministic_sample() + if self.clip_action: + continuous_out = torch.clamp(continuous_out, -3, 3) / 3 + action_out_deprecated = continuous_out + deterministic_continuous_out = ( + torch.clamp(deterministic_continuous_out, -3, 3) / 3 + ) + if self.action_spec.discrete_size > 0 and dists.discrete is not None: + discrete_out_list = [ + discrete_dist.exported_model_output() + for discrete_dist in dists.discrete + ] + discrete_out = torch.cat(discrete_out_list, dim=1) + action_out_deprecated = torch.cat(discrete_out_list, dim=1) + deterministic_discrete_out_list = [ + discrete_dist.deterministic_sample() for discrete_dist in dists.discrete + ] + deterministic_discrete_out = torch.cat( + deterministic_discrete_out_list, dim=1 + ) + + # deprecated action field does not support hybrid action + if self.action_spec.continuous_size > 0 and self.action_spec.discrete_size > 0: + action_out_deprecated = None + return ( + continuous_out, + discrete_out, + action_out_deprecated, + deterministic_continuous_out, + deterministic_discrete_out, + ) + + def forward( + self, inputs: torch.Tensor, masks: torch.Tensor + ) -> Tuple[AgentAction, ActionLogProbs, torch.Tensor]: + """ + The forward method of this module. Outputs the action, log probs, + and entropies given the encoding from the network body. + :params inputs: The encoding from the network body + :params masks: Action masks for discrete actions + :return: Given the input, an AgentAction of the actions generated by the policy and the corresponding + ActionLogProbs and entropies. + """ + dists = self._get_dists(inputs, masks) + actions = self._sample_action(dists) + log_probs, entropies = self._get_probs_and_entropy(actions, dists) + # Use the sum of entropy across actions, not the mean + entropy_sum = torch.sum(entropies, dim=1) + return (actions, log_probs, entropy_sum) diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/agent_action.py b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/agent_action.py new file mode 100644 index 0000000000000000000000000000000000000000..1ecc995a55eec6d5237ee358ba264773605fbf55 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/agent_action.py @@ -0,0 +1,157 @@ +from typing import List, Optional, NamedTuple +import itertools +import numpy as np +from mlagents.torch_utils import torch + +from mlagents.trainers.buffer import AgentBuffer, BufferKey +from mlagents.trainers.torch_entities.utils import ModelUtils +from mlagents_envs.base_env import ActionTuple + + +class AgentAction(NamedTuple): + """ + A NamedTuple containing the tensor for continuous actions and list of tensors for + discrete actions. Utility functions provide numpy <=> tensor conversions to be + sent as actions to the environment manager as well as used by the optimizers. 
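+ Either field may be unused depending on the action space; discrete_tensor falls back to an empty tensor when no discrete actions are present.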
+ :param continuous_tensor: Torch tensor corresponding to continuous actions + :param discrete_list: List of Torch tensors each corresponding to discrete actions + """ + + continuous_tensor: torch.Tensor + discrete_list: Optional[List[torch.Tensor]] + + @property + def discrete_tensor(self) -> torch.Tensor: + """ + Returns the discrete action list as a stacked tensor + """ + if self.discrete_list is not None and len(self.discrete_list) > 0: + return torch.stack(self.discrete_list, dim=-1) + else: + return torch.empty(0) + + def slice(self, start: int, end: int) -> "AgentAction": + """ + Returns an AgentAction with the continuous and discrete tensors slices + from index start to index end. + """ + _cont = None + _disc_list = [] + if self.continuous_tensor is not None: + _cont = self.continuous_tensor[start:end] + if self.discrete_list is not None and len(self.discrete_list) > 0: + for _disc in self.discrete_list: + _disc_list.append(_disc[start:end]) + return AgentAction(_cont, _disc_list) + + def to_action_tuple(self, clip: bool = False) -> ActionTuple: + """ + Returns an ActionTuple + """ + action_tuple = ActionTuple() + if self.continuous_tensor is not None: + _continuous_tensor = self.continuous_tensor + if clip: + _continuous_tensor = torch.clamp(_continuous_tensor, -3, 3) / 3 + continuous = ModelUtils.to_numpy(_continuous_tensor) + action_tuple.add_continuous(continuous) + if self.discrete_list is not None: + discrete = ModelUtils.to_numpy(self.discrete_tensor[:, 0, :]) + action_tuple.add_discrete(discrete) + return action_tuple + + @staticmethod + def from_buffer(buff: AgentBuffer) -> "AgentAction": + """ + A static method that accesses continuous and discrete action fields in an AgentBuffer + and constructs the corresponding AgentAction from the retrieved np arrays. + """ + continuous: torch.Tensor = None + discrete: List[torch.Tensor] = None # type: ignore + if BufferKey.CONTINUOUS_ACTION in buff: + continuous = ModelUtils.list_to_tensor(buff[BufferKey.CONTINUOUS_ACTION]) + if BufferKey.DISCRETE_ACTION in buff: + discrete_tensor = ModelUtils.list_to_tensor( + buff[BufferKey.DISCRETE_ACTION], dtype=torch.long + ) + discrete = [ + discrete_tensor[..., i] for i in range(discrete_tensor.shape[-1]) + ] + return AgentAction(continuous, discrete) + + @staticmethod + def _group_agent_action_from_buffer( + buff: AgentBuffer, cont_action_key: BufferKey, disc_action_key: BufferKey + ) -> List["AgentAction"]: + """ + Extracts continuous and discrete groupmate actions, as specified by BufferKey, and + returns a List of AgentActions that correspond to the groupmate's actions. List will + be of length equal to the maximum number of groupmates in the buffer. Any spots where + there are less agents than maximum, the actions will be padded with 0's. 
+ """ + continuous_tensors: List[torch.Tensor] = [] + discrete_tensors: List[torch.Tensor] = [] + if cont_action_key in buff: + padded_batch = buff[cont_action_key].padded_to_batch() + continuous_tensors = [ + ModelUtils.list_to_tensor(arr) for arr in padded_batch + ] + if disc_action_key in buff: + padded_batch = buff[disc_action_key].padded_to_batch(dtype=np.long) + discrete_tensors = [ + ModelUtils.list_to_tensor(arr, dtype=torch.long) for arr in padded_batch + ] + + actions_list = [] + for _cont, _disc in itertools.zip_longest( + continuous_tensors, discrete_tensors, fillvalue=None + ): + if _disc is not None: + _disc = [_disc[..., i] for i in range(_disc.shape[-1])] + actions_list.append(AgentAction(_cont, _disc)) + return actions_list + + @staticmethod + def group_from_buffer(buff: AgentBuffer) -> List["AgentAction"]: + """ + A static method that accesses next group continuous and discrete action fields in an AgentBuffer + and constructs a padded List of AgentActions that represent the group agent actions. + The List is of length equal to max number of groupmate agents in the buffer, and the AgentBuffer iss + of the same length as the buffer. Empty spots (e.g. when agents die) are padded with 0. + :param buff: AgentBuffer of a batch or trajectory + :return: List of groupmate's AgentActions + """ + return AgentAction._group_agent_action_from_buffer( + buff, BufferKey.GROUP_CONTINUOUS_ACTION, BufferKey.GROUP_DISCRETE_ACTION + ) + + @staticmethod + def group_from_buffer_next(buff: AgentBuffer) -> List["AgentAction"]: + """ + A static method that accesses next group continuous and discrete action fields in an AgentBuffer + and constructs a padded List of AgentActions that represent the next group agent actions. + The List is of length equal to max number of groupmate agents in the buffer, and the AgentBuffer iss + of the same length as the buffer. Empty spots (e.g. when agents die) are padded with 0. + :param buff: AgentBuffer of a batch or trajectory + :return: List of groupmate's AgentActions + """ + return AgentAction._group_agent_action_from_buffer( + buff, BufferKey.GROUP_NEXT_CONT_ACTION, BufferKey.GROUP_NEXT_DISC_ACTION + ) + + def to_flat(self, discrete_branches: List[int]) -> torch.Tensor: + """ + Flatten this AgentAction into a single torch Tensor of dimension (batch, num_continuous + num_one_hot_discrete). + Discrete actions are converted into one-hot and concatenated with continuous actions. + :param discrete_branches: List of sizes for discrete actions. + :return: Tensor of flattened actions. 
+ """ + # if there are any discrete actions, create one-hot + if self.discrete_list is not None and len(self.discrete_list) > 0: + discrete_oh = ModelUtils.actions_to_onehot( + self.discrete_tensor, discrete_branches + ) + discrete_oh = torch.cat(discrete_oh, dim=1) + else: + discrete_oh = torch.empty(0) + return torch.cat([self.continuous_tensor, discrete_oh], dim=-1) diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/attention.py b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/attention.py new file mode 100644 index 0000000000000000000000000000000000000000..ba34e01995d5586da8942620b5d7ec4fd8ce5770 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/attention.py @@ -0,0 +1,293 @@ +from mlagents.torch_utils import torch +import warnings +from typing import Tuple, Optional, List +from mlagents.trainers.torch_entities.layers import ( + LinearEncoder, + Initialization, + linear_layer, + LayerNorm, +) +from mlagents.trainers.torch_entities.model_serialization import exporting_to_onnx +from mlagents.trainers.exception import UnityTrainerException + + +def get_zero_entities_mask(entities: List[torch.Tensor]) -> List[torch.Tensor]: + """ + Takes a List of Tensors and returns a List of mask Tensor with 1 if the input was + all zeros (on dimension 2) and 0 otherwise. This is used in the Attention + layer to mask the padding observations. + """ + with torch.no_grad(): + + if exporting_to_onnx.is_exporting(): + with warnings.catch_warnings(): + # We ignore a TracerWarning from PyTorch that warns that doing + # shape[n].item() will cause the trace to be incorrect (the trace might + # not generalize to other inputs) + # We ignore this warning because we know the model will always be + # run with inputs of the same shape + warnings.simplefilter("ignore") + # When exporting to ONNX, we want to transpose the entities. This is + # because ONNX only support input in NCHW (channel first) format. + # Barracuda also expect to get data in NCHW. + entities = [ + torch.transpose(obs, 2, 1).reshape( + -1, obs.shape[1].item(), obs.shape[2].item() + ) + for obs in entities + ] + + # Generate the masking tensors for each entities tensor (mask only if all zeros) + key_masks: List[torch.Tensor] = [ + (torch.sum(ent**2, axis=2) < 0.01).float() for ent in entities + ] + return key_masks + + +class MultiHeadAttention(torch.nn.Module): + + NEG_INF = -1e6 + + def __init__(self, embedding_size: int, num_heads: int): + """ + Multi Head Attention module. We do not use the regular Torch implementation since + Barracuda does not support some operators it uses. 
+ Takes as input to the forward method 3 tensors: + - query: of dimensions (batch_size, number_of_queries, embedding_size) + - key: of dimensions (batch_size, number_of_keys, embedding_size) + - value: of dimensions (batch_size, number_of_keys, embedding_size) + The forward method will return 2 tensors: + - The output: (batch_size, number_of_queries, embedding_size) + - The attention matrix: (batch_size, num_heads, number_of_queries, number_of_keys) + :param embedding_size: The size of the embeddings that will be generated (should be + dividable by the num_heads) + :param total_max_elements: The maximum total number of entities that can be passed to + the module + :param num_heads: The number of heads of the attention module + """ + super().__init__() + self.n_heads = num_heads + self.head_size: int = embedding_size // self.n_heads + self.embedding_size: int = self.head_size * self.n_heads + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + n_q: int, + n_k: int, + key_mask: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + b = -1 # the batch size + + query = query.reshape( + b, n_q, self.n_heads, self.head_size + ) # (b, n_q, h, emb / h) + key = key.reshape(b, n_k, self.n_heads, self.head_size) # (b, n_k, h, emb / h) + value = value.reshape( + b, n_k, self.n_heads, self.head_size + ) # (b, n_k, h, emb / h) + + query = query.permute([0, 2, 1, 3]) # (b, h, n_q, emb / h) + # The next few lines are equivalent to : key.permute([0, 2, 3, 1]) + # This is a hack, ONNX will compress two permute operations and + # Barracuda will not like seeing `permute([0,2,3,1])` + key = key.permute([0, 2, 1, 3]) # (b, h, emb / h, n_k) + key -= 1 + key += 1 + key = key.permute([0, 1, 3, 2]) # (b, h, emb / h, n_k) + + qk = torch.matmul(query, key) # (b, h, n_q, n_k) + + if key_mask is None: + qk = qk / (self.embedding_size**0.5) + else: + key_mask = key_mask.reshape(b, 1, 1, n_k) + qk = (1 - key_mask) * qk / ( + self.embedding_size**0.5 + ) + key_mask * self.NEG_INF + + att = torch.softmax(qk, dim=3) # (b, h, n_q, n_k) + + value = value.permute([0, 2, 1, 3]) # (b, h, n_k, emb / h) + value_attention = torch.matmul(att, value) # (b, h, n_q, emb / h) + + value_attention = value_attention.permute([0, 2, 1, 3]) # (b, n_q, h, emb / h) + value_attention = value_attention.reshape( + b, n_q, self.embedding_size + ) # (b, n_q, emb) + + return value_attention, att + + +class EntityEmbedding(torch.nn.Module): + """ + A module used to embed entities before passing them to a self-attention block. + Used in conjunction with ResidualSelfAttention to encode information about a self + and additional entities. Can also concatenate self to entities for ego-centric self- + attention. Inspired by architecture used in https://arxiv.org/pdf/1909.07528.pdf. + """ + + def __init__( + self, + entity_size: int, + entity_num_max_elements: Optional[int], + embedding_size: int, + ): + """ + Constructs an EntityEmbedding module. + :param x_self_size: Size of "self" entity. + :param entity_size: Size of other entities. + :param entity_num_max_elements: Maximum elements for a given entity, None for unrestricted. + Needs to be assigned in order for model to be exportable to ONNX and Barracuda. + :param embedding_size: Embedding size for the entity encoder. + :param concat_self: Whether to concatenate x_self to entities. Set True for ego-centric + self-attention. 
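+ Note: if add_self_embedding is called, the "self" encoding is concatenated to every entity before the entities are embedded (ego-centric attention).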
+ """ + super().__init__() + self.self_size: int = 0 + self.entity_size: int = entity_size + self.entity_num_max_elements: int = -1 + if entity_num_max_elements is not None: + self.entity_num_max_elements = entity_num_max_elements + self.embedding_size = embedding_size + # Initialization scheme from http://www.cs.toronto.edu/~mvolkovs/ICML2020_tfixup.pdf + self.self_ent_encoder = LinearEncoder( + self.entity_size, + 1, + self.embedding_size, + kernel_init=Initialization.Normal, + kernel_gain=(0.125 / self.embedding_size) ** 0.5, + ) + + def add_self_embedding(self, size: int) -> None: + self.self_size = size + self.self_ent_encoder = LinearEncoder( + self.self_size + self.entity_size, + 1, + self.embedding_size, + kernel_init=Initialization.Normal, + kernel_gain=(0.125 / self.embedding_size) ** 0.5, + ) + + def forward(self, x_self: torch.Tensor, entities: torch.Tensor) -> torch.Tensor: + num_entities = self.entity_num_max_elements + if num_entities < 0: + if exporting_to_onnx.is_exporting(): + raise UnityTrainerException( + "Trying to export an attention mechanism that doesn't have a set max \ + number of elements." + ) + num_entities = entities.shape[1] + + if exporting_to_onnx.is_exporting(): + # When exporting to ONNX, we want to transpose the entities. This is + # because ONNX only support input in NCHW (channel first) format. + # Barracuda also expect to get data in NCHW. + entities = torch.transpose(entities, 2, 1).reshape( + -1, num_entities, self.entity_size + ) + + if self.self_size > 0: + expanded_self = x_self.reshape(-1, 1, self.self_size) + expanded_self = torch.cat([expanded_self] * num_entities, dim=1) + # Concatenate all observations with self + entities = torch.cat([expanded_self, entities], dim=2) + # Encode entities + encoded_entities = self.self_ent_encoder(entities) + return encoded_entities + + +class ResidualSelfAttention(torch.nn.Module): + """ + Residual self attentioninspired from https://arxiv.org/pdf/1909.07528.pdf. Can be used + with an EntityEmbedding module, to apply multi head self attention to encode information + about a "Self" and a list of relevant "Entities". + """ + + EPSILON = 1e-7 + + def __init__( + self, + embedding_size: int, + entity_num_max_elements: Optional[int] = None, + num_heads: int = 4, + ): + """ + Constructs a ResidualSelfAttention module. + :param embedding_size: Embedding sizee for attention mechanism and + Q, K, V encoders. + :param entity_num_max_elements: A List of ints representing the maximum number + of elements in an entity sequence. Should be of length num_entities. Pass None to + not restrict the number of elements; however, this will make the module + unexportable to ONNX/Barracuda. 
+ :param num_heads: Number of heads for Multi Head Self-Attention + """ + super().__init__() + self.max_num_ent: Optional[int] = None + if entity_num_max_elements is not None: + self.max_num_ent = entity_num_max_elements + + self.attention = MultiHeadAttention( + num_heads=num_heads, embedding_size=embedding_size + ) + + # Initialization scheme from http://www.cs.toronto.edu/~mvolkovs/ICML2020_tfixup.pdf + self.fc_q = linear_layer( + embedding_size, + embedding_size, + kernel_init=Initialization.Normal, + kernel_gain=(0.125 / embedding_size) ** 0.5, + ) + self.fc_k = linear_layer( + embedding_size, + embedding_size, + kernel_init=Initialization.Normal, + kernel_gain=(0.125 / embedding_size) ** 0.5, + ) + self.fc_v = linear_layer( + embedding_size, + embedding_size, + kernel_init=Initialization.Normal, + kernel_gain=(0.125 / embedding_size) ** 0.5, + ) + self.fc_out = linear_layer( + embedding_size, + embedding_size, + kernel_init=Initialization.Normal, + kernel_gain=(0.125 / embedding_size) ** 0.5, + ) + self.embedding_norm = LayerNorm() + self.residual_norm = LayerNorm() + + def forward(self, inp: torch.Tensor, key_masks: List[torch.Tensor]) -> torch.Tensor: + # Gather the maximum number of entities information + mask = torch.cat(key_masks, dim=1) + + inp = self.embedding_norm(inp) + # Feed to self attention + query = self.fc_q(inp) # (b, n_q, emb) + key = self.fc_k(inp) # (b, n_k, emb) + value = self.fc_v(inp) # (b, n_k, emb) + + # Only use max num if provided + if self.max_num_ent is not None: + num_ent = self.max_num_ent + else: + num_ent = inp.shape[1] + if exporting_to_onnx.is_exporting(): + raise UnityTrainerException( + "Trying to export an attention mechanism that doesn't have a set max \ + number of elements." + ) + + output, _ = self.attention(query, key, value, num_ent, num_ent, mask) + # Residual + output = self.fc_out(output) + inp + output = self.residual_norm(output) + # Average Pooling + numerator = torch.sum(output * (1 - mask).reshape(-1, num_ent, 1), dim=1) + denominator = torch.sum(1 - mask, dim=1, keepdim=True) + self.EPSILON + output = numerator / denominator + return output diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/__init__.py b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b79947b9d024befc99baf8b355ecab9ad6aa937c Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/bc/__init__.py b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/bc/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/bc/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/bc/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..14730ab5cbb1b49a041a47a27c01d8a103a1bc4b Binary files /dev/null and 
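A minimal usage sketch (not part of the diff) for the attention classes defined in attention.py above: embed a zero-padded set of entity observations with EntityEmbedding, mark the padded slots with get_zero_entities_mask, and pool them with ResidualSelfAttention. The batch size, entity count, and feature sizes below are made-up illustration values, and the snippet assumes the vendored mlagents package added by this diff is importable.

from mlagents.torch_utils import torch
from mlagents.trainers.torch_entities.attention import (
    EntityEmbedding,
    ResidualSelfAttention,
    get_zero_entities_mask,
)

batch, max_entities, entity_size, emb = 4, 5, 6, 64
entities = torch.randn(batch, max_entities, entity_size)
entities[:, 3:, :] = 0.0  # zero-pad the last two entity slots

embedder = EntityEmbedding(entity_size, max_entities, emb)
attention = ResidualSelfAttention(emb, max_entities, num_heads=4)

masks = get_zero_entities_mask([entities])  # 1.0 where an entity slot is all zeros
encoded = embedder(None, entities)          # (batch, max_entities, emb)
pooled = attention(encoded, masks)          # (batch, emb), masked average over entities

Because add_self_embedding is never called here, self_size stays 0 and the x_self argument is ignored, which is why None can be passed for it in this sketch.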
b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/bc/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/bc/__pycache__/module.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/bc/__pycache__/module.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..caf76d070b46eb2218167cf72421091b8a06ee18 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/bc/__pycache__/module.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/bc/module.py b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/bc/module.py new file mode 100644 index 0000000000000000000000000000000000000000..ab454409c6111a59f659a1ab7c335524628c3d31 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/bc/module.py @@ -0,0 +1,186 @@ +from typing import Dict +import numpy as np +from mlagents.torch_utils import torch + +from mlagents.trainers.policy.torch_policy import TorchPolicy +from mlagents.trainers.demo_loader import demo_to_buffer +from mlagents.trainers.settings import BehavioralCloningSettings, ScheduleType +from mlagents.trainers.torch_entities.agent_action import AgentAction +from mlagents.trainers.torch_entities.action_log_probs import ActionLogProbs +from mlagents.trainers.torch_entities.utils import ModelUtils +from mlagents.trainers.trajectory import ObsUtil +from mlagents.trainers.buffer import AgentBuffer + + +class BCModule: + def __init__( + self, + policy: TorchPolicy, + settings: BehavioralCloningSettings, + policy_learning_rate: float, + default_batch_size: int, + default_num_epoch: int, + ): + """ + A BC trainer that can be used inline with RL. + :param policy: The policy of the learning model + :param settings: The settings for BehavioralCloning including LR strength, batch_size, + num_epochs, samples_per_update and LR annealing steps. + :param policy_learning_rate: The initial Learning Rate of the policy. Used to set an appropriate learning rate + for the pretrainer. + """ + self.policy = policy + self._anneal_steps = settings.steps + self.current_lr = policy_learning_rate * settings.strength + + learning_rate_schedule: ScheduleType = ( + ScheduleType.LINEAR if self._anneal_steps > 0 else ScheduleType.CONSTANT + ) + self.decay_learning_rate = ModelUtils.DecayedValue( + learning_rate_schedule, self.current_lr, 1e-10, self._anneal_steps + ) + params = self.policy.actor.parameters() + self.optimizer = torch.optim.Adam(params, lr=self.current_lr) + _, self.demonstration_buffer = demo_to_buffer( + settings.demo_path, policy.sequence_length, policy.behavior_spec + ) + self.batch_size = ( + settings.batch_size if settings.batch_size else default_batch_size + ) + self.num_epoch = settings.num_epoch if settings.num_epoch else default_num_epoch + self.n_sequences = max( + min(self.batch_size, self.demonstration_buffer.num_experiences) + // policy.sequence_length, + 1, + ) + + self.has_updated = False + self.use_recurrent = self.policy.use_recurrent + self.samples_per_update = settings.samples_per_update + + def update(self) -> Dict[str, np.ndarray]: + """ + Updates model using buffer. + :param max_batches: The maximum number of batches to use per update. + :return: The loss of the update. + """ + # Don't continue training if the learning rate has reached 0, to reduce training time. 
+ + decay_lr = self.decay_learning_rate.get_value(self.policy.get_current_step()) + if self.current_lr <= 1e-10: # Unlike in TF, this never actually reaches 0. + return {"Losses/Pretraining Loss": 0} + + batch_losses = [] + possible_demo_batches = ( + self.demonstration_buffer.num_experiences // self.n_sequences + ) + possible_batches = possible_demo_batches + + max_batches = self.samples_per_update // self.n_sequences + + n_epoch = self.num_epoch + for _ in range(n_epoch): + self.demonstration_buffer.shuffle( + sequence_length=self.policy.sequence_length + ) + if max_batches == 0: + num_batches = possible_batches + else: + num_batches = min(possible_batches, max_batches) + for i in range(num_batches // self.policy.sequence_length): + demo_update_buffer = self.demonstration_buffer + start = i * self.n_sequences * self.policy.sequence_length + end = (i + 1) * self.n_sequences * self.policy.sequence_length + mini_batch_demo = demo_update_buffer.make_mini_batch(start, end) + run_out = self._update_batch(mini_batch_demo, self.n_sequences) + loss = run_out["loss"] + batch_losses.append(loss) + + ModelUtils.update_learning_rate(self.optimizer, decay_lr) + self.current_lr = decay_lr + + self.has_updated = True + update_stats = {"Losses/Pretraining Loss": np.mean(batch_losses)} + return update_stats + + def _behavioral_cloning_loss( + self, + selected_actions: AgentAction, + log_probs: ActionLogProbs, + expert_actions: torch.Tensor, + ) -> torch.Tensor: + bc_loss = 0 + if self.policy.behavior_spec.action_spec.continuous_size > 0: + bc_loss += torch.nn.functional.mse_loss( + selected_actions.continuous_tensor, expert_actions.continuous_tensor + ) + if self.policy.behavior_spec.action_spec.discrete_size > 0: + one_hot_expert_actions = ModelUtils.actions_to_onehot( + expert_actions.discrete_tensor, + self.policy.behavior_spec.action_spec.discrete_branches, + ) + log_prob_branches = ModelUtils.break_into_branches( + log_probs.all_discrete_tensor, + self.policy.behavior_spec.action_spec.discrete_branches, + ) + bc_loss += torch.mean( + torch.stack( + [ + torch.sum( + -torch.nn.functional.log_softmax(log_prob_branch, dim=1) + * expert_actions_branch, + dim=1, + ) + for log_prob_branch, expert_actions_branch in zip( + log_prob_branches, one_hot_expert_actions + ) + ] + ) + ) + return bc_loss + + def _update_batch( + self, mini_batch_demo: AgentBuffer, n_sequences: int + ) -> Dict[str, float]: + """ + Helper function for update_batch. 
+ """ + np_obs = ObsUtil.from_buffer( + mini_batch_demo, len(self.policy.behavior_spec.observation_specs) + ) + # Convert to tensors + tensor_obs = [ModelUtils.list_to_tensor(obs) for obs in np_obs] + act_masks = None + expert_actions = AgentAction.from_buffer(mini_batch_demo) + if self.policy.behavior_spec.action_spec.discrete_size > 0: + + act_masks = ModelUtils.list_to_tensor( + np.ones( + ( + self.n_sequences * self.policy.sequence_length, + sum(self.policy.behavior_spec.action_spec.discrete_branches), + ), + dtype=np.float32, + ) + ) + + memories = [] + if self.policy.use_recurrent: + memories = torch.zeros(1, self.n_sequences, self.policy.m_size) + + selected_actions, run_out, _ = self.policy.actor.get_action_and_stats( + tensor_obs, + masks=act_masks, + memories=memories, + sequence_length=self.policy.sequence_length, + ) + log_probs = run_out["log_probs"] + bc_loss = self._behavioral_cloning_loss( + selected_actions, log_probs, expert_actions + ) + self.optimizer.zero_grad() + bc_loss.backward() + + self.optimizer.step() + run_out = {"loss": bc_loss.item()} + return run_out diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/__init__.py b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..696d978b8ab3e1f34c26e6fecc57b8670ab7659a --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/__init__.py @@ -0,0 +1,18 @@ +from mlagents.trainers.torch_entities.components.reward_providers.base_reward_provider import ( # noqa F401 + BaseRewardProvider, +) +from mlagents.trainers.torch_entities.components.reward_providers.extrinsic_reward_provider import ( # noqa F401 + ExtrinsicRewardProvider, +) +from mlagents.trainers.torch_entities.components.reward_providers.curiosity_reward_provider import ( # noqa F401 + CuriosityRewardProvider, +) +from mlagents.trainers.torch_entities.components.reward_providers.gail_reward_provider import ( # noqa F401 + GAILRewardProvider, +) +from mlagents.trainers.torch_entities.components.reward_providers.rnd_reward_provider import ( # noqa F401 + RNDRewardProvider, +) +from mlagents.trainers.torch_entities.components.reward_providers.reward_provider_factory import ( # noqa F401 + create_reward_provider, +) diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d77416629f9de718768c04f75825b3b35af1f8b6 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/__pycache__/base_reward_provider.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/__pycache__/base_reward_provider.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..144f8da8f8759c7248dc4488c3a504ea7ef7d8ad Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/__pycache__/base_reward_provider.cpython-39.pyc differ diff --git 
a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/__pycache__/curiosity_reward_provider.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/__pycache__/curiosity_reward_provider.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..109f485c0adaac980c9df2328127c945a7b78218 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/__pycache__/curiosity_reward_provider.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/__pycache__/extrinsic_reward_provider.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/__pycache__/extrinsic_reward_provider.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cddb7572f12675b1df004a4cfc9d189c81e1a9f2 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/__pycache__/extrinsic_reward_provider.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/__pycache__/gail_reward_provider.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/__pycache__/gail_reward_provider.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..91e7e830174a1f548e28d175a02fcfb8addb2349 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/__pycache__/gail_reward_provider.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/__pycache__/reward_provider_factory.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/__pycache__/reward_provider_factory.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..21abac81911a0dc82038b3357b6568e8e31f4cdb Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/__pycache__/reward_provider_factory.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/__pycache__/rnd_reward_provider.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/__pycache__/rnd_reward_provider.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cd0736695a4e127fd9b5bdf86a452fadac542b54 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/__pycache__/rnd_reward_provider.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/base_reward_provider.py b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/base_reward_provider.py new file mode 100644 index 0000000000000000000000000000000000000000..4e258657b37a9e2c0b9ad0340b91c98bdb7c7f59 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/base_reward_provider.py @@ -0,0 +1,81 @@ +import numpy as np +from mlagents.torch_utils import torch +from abc import ABC, abstractmethod +from typing import Dict + +from mlagents.trainers.buffer import AgentBuffer +from mlagents.trainers.settings import RewardSignalSettings +from 
mlagents_envs.base_env import BehaviorSpec + + +class BaseRewardProvider(ABC): + def __init__(self, specs: BehaviorSpec, settings: RewardSignalSettings) -> None: + self._policy_specs = specs + self._gamma = settings.gamma + self._strength = settings.strength + self._ignore_done = False + + @property + def gamma(self) -> float: + """ + The discount factor for the reward signal + """ + return self._gamma + + @property + def strength(self) -> float: + """ + The strength multiplier of the reward provider + """ + return self._strength + + @property + def name(self) -> str: + """ + The name of the reward provider. Is used for reporting and identification + """ + class_name = self.__class__.__name__ + return class_name.replace("RewardProvider", "") + + @property + def ignore_done(self) -> bool: + """ + If true, when the agent is done, the rewards of the next episode must be + used to calculate the return of the current episode. + Is used to mitigate the positive bias in rewards with no natural end. + """ + return self._ignore_done + + @abstractmethod + def evaluate(self, mini_batch: AgentBuffer) -> np.ndarray: + """ + Evaluates the reward for the data present in the Dict mini_batch. Use this when evaluating a reward + function drawn straight from a Buffer. + :param mini_batch: A Dict of numpy arrays (the format used by our Buffer) + when drawing from the update buffer. + :return: a np.ndarray of rewards generated by the reward provider + """ + raise NotImplementedError( + "The reward provider's evaluate method has not been implemented " + ) + + @abstractmethod + def update(self, mini_batch: AgentBuffer) -> Dict[str, np.ndarray]: + """ + Update the reward for the data present in the Dict mini_batch. Use this when updating a reward + function drawn straight from a Buffer. + :param mini_batch: A Dict of numpy arrays (the format used by our Buffer) + when drawing from the update buffer. + :return: A dictionary from string to stats values + """ + raise NotImplementedError( + "The reward provider's update method has not been implemented " + ) + + def get_modules(self) -> Dict[str, torch.nn.Module]: + """ + Returns a dictionary of string identifiers to the torch.nn.Modules used by + the reward providers. This method is used for loading and saving the weights + of the reward providers. 
+ """ + return {} diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/curiosity_reward_provider.py b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/curiosity_reward_provider.py new file mode 100644 index 0000000000000000000000000000000000000000..b4cbf34dd92a847e1051f8e190fab81044a4e660 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/curiosity_reward_provider.py @@ -0,0 +1,239 @@ +import numpy as np +from typing import Dict, NamedTuple +from mlagents.torch_utils import torch, default_device + +from mlagents.trainers.buffer import AgentBuffer, BufferKey +from mlagents.trainers.torch_entities.components.reward_providers.base_reward_provider import ( + BaseRewardProvider, +) +from mlagents.trainers.settings import CuriositySettings + +from mlagents_envs.base_env import BehaviorSpec +from mlagents_envs import logging_util +from mlagents.trainers.torch_entities.agent_action import AgentAction +from mlagents.trainers.torch_entities.action_flattener import ActionFlattener +from mlagents.trainers.torch_entities.utils import ModelUtils +from mlagents.trainers.torch_entities.networks import NetworkBody +from mlagents.trainers.torch_entities.layers import LinearEncoder, linear_layer +from mlagents.trainers.trajectory import ObsUtil + +logger = logging_util.get_logger(__name__) + + +class ActionPredictionTuple(NamedTuple): + continuous: torch.Tensor + discrete: torch.Tensor + + +class CuriosityRewardProvider(BaseRewardProvider): + beta = 0.2 # Forward vs Inverse loss weight + loss_multiplier = 10.0 # Loss multiplier + + def __init__(self, specs: BehaviorSpec, settings: CuriositySettings) -> None: + super().__init__(specs, settings) + self._ignore_done = True + self._network = CuriosityNetwork(specs, settings) + self._network.to(default_device()) + + self.optimizer = torch.optim.Adam( + self._network.parameters(), lr=settings.learning_rate + ) + self._has_updated_once = False + + def evaluate(self, mini_batch: AgentBuffer) -> np.ndarray: + with torch.no_grad(): + rewards = ModelUtils.to_numpy(self._network.compute_reward(mini_batch)) + rewards = np.minimum(rewards, 1.0 / self.strength) + return rewards * self._has_updated_once + + def update(self, mini_batch: AgentBuffer) -> Dict[str, np.ndarray]: + self._has_updated_once = True + forward_loss = self._network.compute_forward_loss(mini_batch) + inverse_loss = self._network.compute_inverse_loss(mini_batch) + + loss = self.loss_multiplier * ( + self.beta * forward_loss + (1.0 - self.beta) * inverse_loss + ) + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + return { + "Losses/Curiosity Forward Loss": forward_loss.item(), + "Losses/Curiosity Inverse Loss": inverse_loss.item(), + } + + def get_modules(self): + return {f"Module:{self.name}": self._network} + + +class CuriosityNetwork(torch.nn.Module): + EPSILON = 1e-10 + + def __init__(self, specs: BehaviorSpec, settings: CuriositySettings) -> None: + super().__init__() + self._action_spec = specs.action_spec + + state_encoder_settings = settings.network_settings + if state_encoder_settings.memory is not None: + state_encoder_settings.memory = None + logger.warning( + "memory was specified in network_settings but is not supported by Curiosity. It is being ignored." 
+ ) + + self._state_encoder = NetworkBody( + specs.observation_specs, state_encoder_settings + ) + + self._action_flattener = ActionFlattener(self._action_spec) + + self.inverse_model_action_encoding = torch.nn.Sequential( + LinearEncoder(2 * state_encoder_settings.hidden_units, 1, 256) + ) + + if self._action_spec.continuous_size > 0: + self.continuous_action_prediction = linear_layer( + 256, self._action_spec.continuous_size + ) + if self._action_spec.discrete_size > 0: + self.discrete_action_prediction = linear_layer( + 256, sum(self._action_spec.discrete_branches) + ) + + self.forward_model_next_state_prediction = torch.nn.Sequential( + LinearEncoder( + state_encoder_settings.hidden_units + + self._action_flattener.flattened_size, + 1, + 256, + ), + linear_layer(256, state_encoder_settings.hidden_units), + ) + + def get_current_state(self, mini_batch: AgentBuffer) -> torch.Tensor: + """ + Extracts the current state embedding from a mini_batch. + """ + n_obs = len(self._state_encoder.processors) + np_obs = ObsUtil.from_buffer(mini_batch, n_obs) + # Convert to tensors + tensor_obs = [ModelUtils.list_to_tensor(obs) for obs in np_obs] + + hidden, _ = self._state_encoder.forward(tensor_obs) + return hidden + + def get_next_state(self, mini_batch: AgentBuffer) -> torch.Tensor: + """ + Extracts the next state embedding from a mini_batch. + """ + n_obs = len(self._state_encoder.processors) + np_obs = ObsUtil.from_buffer_next(mini_batch, n_obs) + # Convert to tensors + tensor_obs = [ModelUtils.list_to_tensor(obs) for obs in np_obs] + + hidden, _ = self._state_encoder.forward(tensor_obs) + return hidden + + def predict_action(self, mini_batch: AgentBuffer) -> ActionPredictionTuple: + """ + In the continuous case, returns the predicted action. + In the discrete case, returns the logits. + """ + inverse_model_input = torch.cat( + (self.get_current_state(mini_batch), self.get_next_state(mini_batch)), dim=1 + ) + + continuous_pred = None + discrete_pred = None + hidden = self.inverse_model_action_encoding(inverse_model_input) + if self._action_spec.continuous_size > 0: + continuous_pred = self.continuous_action_prediction(hidden) + if self._action_spec.discrete_size > 0: + raw_discrete_pred = self.discrete_action_prediction(hidden) + branches = ModelUtils.break_into_branches( + raw_discrete_pred, self._action_spec.discrete_branches + ) + branches = [torch.softmax(b, dim=1) for b in branches] + discrete_pred = torch.cat(branches, dim=1) + return ActionPredictionTuple(continuous_pred, discrete_pred) + + def predict_next_state(self, mini_batch: AgentBuffer) -> torch.Tensor: + """ + Uses the current state embedding and the action of the mini_batch to predict + the next state embedding. + """ + actions = AgentAction.from_buffer(mini_batch) + flattened_action = self._action_flattener.forward(actions) + forward_model_input = torch.cat( + (self.get_current_state(mini_batch), flattened_action), dim=1 + ) + + return self.forward_model_next_state_prediction(forward_model_input) + + def compute_inverse_loss(self, mini_batch: AgentBuffer) -> torch.Tensor: + """ + Computes the inverse loss for a mini_batch. Corresponds to the error on the + action prediction (given the current and next state). 
+ """ + predicted_action = self.predict_action(mini_batch) + actions = AgentAction.from_buffer(mini_batch) + _inverse_loss = 0 + if self._action_spec.continuous_size > 0: + sq_difference = ( + actions.continuous_tensor - predicted_action.continuous + ) ** 2 + sq_difference = torch.sum(sq_difference, dim=1) + _inverse_loss += torch.mean( + ModelUtils.dynamic_partition( + sq_difference, + ModelUtils.list_to_tensor( + mini_batch[BufferKey.MASKS], dtype=torch.float + ), + 2, + )[1] + ) + if self._action_spec.discrete_size > 0: + true_action = torch.cat( + ModelUtils.actions_to_onehot( + actions.discrete_tensor, self._action_spec.discrete_branches + ), + dim=1, + ) + cross_entropy = torch.sum( + -torch.log(predicted_action.discrete + self.EPSILON) * true_action, + dim=1, + ) + _inverse_loss += torch.mean( + ModelUtils.dynamic_partition( + cross_entropy, + ModelUtils.list_to_tensor( + mini_batch[BufferKey.MASKS], dtype=torch.float + ), # use masks not action_masks + 2, + )[1] + ) + return _inverse_loss + + def compute_reward(self, mini_batch: AgentBuffer) -> torch.Tensor: + """ + Calculates the curiosity reward for the mini_batch. Corresponds to the error + between the predicted and actual next state. + """ + predicted_next_state = self.predict_next_state(mini_batch) + target = self.get_next_state(mini_batch) + sq_difference = 0.5 * (target - predicted_next_state) ** 2 + sq_difference = torch.sum(sq_difference, dim=1) + return sq_difference + + def compute_forward_loss(self, mini_batch: AgentBuffer) -> torch.Tensor: + """ + Computes the loss for the next state prediction + """ + return torch.mean( + ModelUtils.dynamic_partition( + self.compute_reward(mini_batch), + ModelUtils.list_to_tensor( + mini_batch[BufferKey.MASKS], dtype=torch.float + ), + 2, + )[1] + ) diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/extrinsic_reward_provider.py b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/extrinsic_reward_provider.py new file mode 100644 index 0000000000000000000000000000000000000000..b0b847463c78ef052e6dbb6a93d9af2321a5ae02 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/extrinsic_reward_provider.py @@ -0,0 +1,43 @@ +import numpy as np +from typing import Dict + +from mlagents.trainers.buffer import AgentBuffer, BufferKey +from mlagents.trainers.torch_entities.components.reward_providers.base_reward_provider import ( + BaseRewardProvider, +) +from mlagents_envs.base_env import BehaviorSpec +from mlagents.trainers.settings import RewardSignalSettings + + +class ExtrinsicRewardProvider(BaseRewardProvider): + """ + Evaluates extrinsic reward. For single-agent, this equals the individual reward + given to the agent. For the POCA algorithm, we want not only the individual reward + but also the team and the individual rewards of the other agents. 
+ """ + + def __init__(self, specs: BehaviorSpec, settings: RewardSignalSettings) -> None: + super().__init__(specs, settings) + self.add_groupmate_rewards = False + + def evaluate(self, mini_batch: AgentBuffer) -> np.ndarray: + indiv_rewards = np.array( + mini_batch[BufferKey.ENVIRONMENT_REWARDS], dtype=np.float32 + ) + total_rewards = indiv_rewards + if BufferKey.GROUPMATE_REWARDS in mini_batch and self.add_groupmate_rewards: + groupmate_rewards_list = mini_batch[BufferKey.GROUPMATE_REWARDS] + groupmate_rewards_sum = np.array( + [sum(_rew) for _rew in groupmate_rewards_list], dtype=np.float32 + ) + total_rewards += groupmate_rewards_sum + if BufferKey.GROUP_REWARD in mini_batch: + group_rewards = np.array( + mini_batch[BufferKey.GROUP_REWARD], dtype=np.float32 + ) + # Add all the group rewards to the individual rewards + total_rewards += group_rewards + return total_rewards + + def update(self, mini_batch: AgentBuffer) -> Dict[str, np.ndarray]: + return {} diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/gail_reward_provider.py b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/gail_reward_provider.py new file mode 100644 index 0000000000000000000000000000000000000000..0ae77ba1434d0666420cde871775297fd62d72e4 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/gail_reward_provider.py @@ -0,0 +1,260 @@ +from typing import Optional, Dict, List +import numpy as np +from mlagents.torch_utils import torch, default_device + +from mlagents.trainers.buffer import AgentBuffer, BufferKey +from mlagents.trainers.torch_entities.components.reward_providers.base_reward_provider import ( + BaseRewardProvider, +) +from mlagents.trainers.settings import GAILSettings +from mlagents_envs.base_env import BehaviorSpec +from mlagents_envs import logging_util +from mlagents.trainers.torch_entities.utils import ModelUtils +from mlagents.trainers.torch_entities.agent_action import AgentAction +from mlagents.trainers.torch_entities.action_flattener import ActionFlattener +from mlagents.trainers.torch_entities.networks import NetworkBody +from mlagents.trainers.torch_entities.layers import linear_layer, Initialization +from mlagents.trainers.demo_loader import demo_to_buffer +from mlagents.trainers.trajectory import ObsUtil + +logger = logging_util.get_logger(__name__) + + +class GAILRewardProvider(BaseRewardProvider): + def __init__(self, specs: BehaviorSpec, settings: GAILSettings) -> None: + super().__init__(specs, settings) + self._ignore_done = False + self._discriminator_network = DiscriminatorNetwork(specs, settings) + self._discriminator_network.to(default_device()) + _, self._demo_buffer = demo_to_buffer( + settings.demo_path, 1, specs + ) # This is supposed to be the sequence length but we do not have access here + params = list(self._discriminator_network.parameters()) + self.optimizer = torch.optim.Adam(params, lr=settings.learning_rate) + + def evaluate(self, mini_batch: AgentBuffer) -> np.ndarray: + with torch.no_grad(): + estimates, _ = self._discriminator_network.compute_estimate( + mini_batch, use_vail_noise=False + ) + return ModelUtils.to_numpy( + -torch.log( + 1.0 + - estimates.squeeze(dim=1) + * (1.0 - self._discriminator_network.EPSILON) + ) + ) + + def update(self, mini_batch: AgentBuffer) -> Dict[str, np.ndarray]: + + expert_batch = self._demo_buffer.sample_mini_batch( + mini_batch.num_experiences, 1 + ) + 
self._discriminator_network.encoder.update_normalization(expert_batch) + + loss, stats_dict = self._discriminator_network.compute_loss( + mini_batch, expert_batch + ) + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + return stats_dict + + def get_modules(self): + return {f"Module:{self.name}": self._discriminator_network} + + +class DiscriminatorNetwork(torch.nn.Module): + gradient_penalty_weight = 10.0 + z_size = 128 + alpha = 0.0005 + mutual_information = 0.5 + EPSILON = 1e-7 + initial_beta = 0.0 + + def __init__(self, specs: BehaviorSpec, settings: GAILSettings) -> None: + super().__init__() + self._use_vail = settings.use_vail + self._settings = settings + + encoder_settings = settings.network_settings + if encoder_settings.memory is not None: + encoder_settings.memory = None + logger.warning( + "memory was specified in network_settings but is not supported by GAIL. It is being ignored." + ) + + self._action_flattener = ActionFlattener(specs.action_spec) + unencoded_size = ( + self._action_flattener.flattened_size + 1 if settings.use_actions else 0 + ) # +1 is for dones + self.encoder = NetworkBody( + specs.observation_specs, encoder_settings, unencoded_size + ) + + estimator_input_size = encoder_settings.hidden_units + if settings.use_vail: + estimator_input_size = self.z_size + self._z_sigma = torch.nn.Parameter( + torch.ones((self.z_size), dtype=torch.float), requires_grad=True + ) + self._z_mu_layer = linear_layer( + encoder_settings.hidden_units, + self.z_size, + kernel_init=Initialization.KaimingHeNormal, + kernel_gain=0.1, + ) + self._beta = torch.nn.Parameter( + torch.tensor(self.initial_beta, dtype=torch.float), requires_grad=False + ) + + self._estimator = torch.nn.Sequential( + linear_layer(estimator_input_size, 1, kernel_gain=0.2), torch.nn.Sigmoid() + ) + + def get_action_input(self, mini_batch: AgentBuffer) -> torch.Tensor: + """ + Creates the action Tensor. In continuous case, corresponds to the action. In + the discrete case, corresponds to the concatenation of one hot action Tensors. + """ + return self._action_flattener.forward(AgentAction.from_buffer(mini_batch)) + + def get_state_inputs(self, mini_batch: AgentBuffer) -> List[torch.Tensor]: + """ + Creates the observation input. + """ + n_obs = len(self.encoder.processors) + np_obs = ObsUtil.from_buffer(mini_batch, n_obs) + # Convert to tensors + tensor_obs = [ModelUtils.list_to_tensor(obs) for obs in np_obs] + return tensor_obs + + def compute_estimate( + self, mini_batch: AgentBuffer, use_vail_noise: bool = False + ) -> torch.Tensor: + """ + Given a mini_batch, computes the estimate (How much the discriminator believes + the data was sampled from the demonstration data). + :param mini_batch: The AgentBuffer of data + :param use_vail_noise: Only when using VAIL : If true, will sample the code, if + false, will return the mean of the code. 
+ """ + inputs = self.get_state_inputs(mini_batch) + if self._settings.use_actions: + actions = self.get_action_input(mini_batch) + dones = torch.as_tensor( + mini_batch[BufferKey.DONE], dtype=torch.float + ).unsqueeze(1) + action_inputs = torch.cat([actions, dones], dim=1) + hidden, _ = self.encoder(inputs, action_inputs) + else: + hidden, _ = self.encoder(inputs) + z_mu: Optional[torch.Tensor] = None + if self._settings.use_vail: + z_mu = self._z_mu_layer(hidden) + hidden = z_mu + torch.randn_like(z_mu) * self._z_sigma * use_vail_noise + estimate = self._estimator(hidden) + return estimate, z_mu + + def compute_loss( + self, policy_batch: AgentBuffer, expert_batch: AgentBuffer + ) -> torch.Tensor: + """ + Given a policy mini_batch and an expert mini_batch, computes the loss of the discriminator. + """ + total_loss = torch.zeros(1) + stats_dict: Dict[str, np.ndarray] = {} + policy_estimate, policy_mu = self.compute_estimate( + policy_batch, use_vail_noise=True + ) + expert_estimate, expert_mu = self.compute_estimate( + expert_batch, use_vail_noise=True + ) + stats_dict["Policy/GAIL Policy Estimate"] = policy_estimate.mean().item() + stats_dict["Policy/GAIL Expert Estimate"] = expert_estimate.mean().item() + discriminator_loss = -( + torch.log(expert_estimate + self.EPSILON) + + torch.log(1.0 - policy_estimate + self.EPSILON) + ).mean() + stats_dict["Losses/GAIL Loss"] = discriminator_loss.item() + total_loss += discriminator_loss + if self._settings.use_vail: + # KL divergence loss (encourage latent representation to be normal) + kl_loss = torch.mean( + -torch.sum( + 1 + + (self._z_sigma**2).log() + - 0.5 * expert_mu**2 + - 0.5 * policy_mu**2 + - (self._z_sigma**2), + dim=1, + ) + ) + vail_loss = self._beta * (kl_loss - self.mutual_information) + with torch.no_grad(): + self._beta.data = torch.max( + self._beta + self.alpha * (kl_loss - self.mutual_information), + torch.tensor(0.0), + ) + total_loss += vail_loss + stats_dict["Policy/GAIL Beta"] = self._beta.item() + stats_dict["Losses/GAIL KL Loss"] = kl_loss.item() + if self.gradient_penalty_weight > 0.0: + gradient_magnitude_loss = ( + self.gradient_penalty_weight + * self.compute_gradient_magnitude(policy_batch, expert_batch) + ) + stats_dict["Policy/GAIL Grad Mag Loss"] = gradient_magnitude_loss.item() + total_loss += gradient_magnitude_loss + return total_loss, stats_dict + + def compute_gradient_magnitude( + self, policy_batch: AgentBuffer, expert_batch: AgentBuffer + ) -> torch.Tensor: + """ + Gradient penalty from https://arxiv.org/pdf/1704.00028. Adds stability esp. + for off-policy. Compute gradients w.r.t randomly interpolated input. 
+ """ + policy_inputs = self.get_state_inputs(policy_batch) + expert_inputs = self.get_state_inputs(expert_batch) + interp_inputs = [] + for policy_input, expert_input in zip(policy_inputs, expert_inputs): + obs_epsilon = torch.rand(policy_input.shape) + interp_input = obs_epsilon * policy_input + (1 - obs_epsilon) * expert_input + interp_input.requires_grad = True # For gradient calculation + interp_inputs.append(interp_input) + if self._settings.use_actions: + policy_action = self.get_action_input(policy_batch) + expert_action = self.get_action_input(expert_batch) + action_epsilon = torch.rand(policy_action.shape) + policy_dones = torch.as_tensor( + policy_batch[BufferKey.DONE], dtype=torch.float + ).unsqueeze(1) + expert_dones = torch.as_tensor( + expert_batch[BufferKey.DONE], dtype=torch.float + ).unsqueeze(1) + dones_epsilon = torch.rand(policy_dones.shape) + action_inputs = torch.cat( + [ + action_epsilon * policy_action + + (1 - action_epsilon) * expert_action, + dones_epsilon * policy_dones + (1 - dones_epsilon) * expert_dones, + ], + dim=1, + ) + action_inputs.requires_grad = True + hidden, _ = self.encoder(interp_inputs, action_inputs) + encoder_input = tuple(interp_inputs + [action_inputs]) + else: + hidden, _ = self.encoder(interp_inputs) + encoder_input = tuple(interp_inputs) + if self._settings.use_vail: + use_vail_noise = True + z_mu = self._z_mu_layer(hidden) + hidden = z_mu + torch.randn_like(z_mu) * self._z_sigma * use_vail_noise + estimate = self._estimator(hidden).squeeze(1).sum() + gradient = torch.autograd.grad(estimate, encoder_input, create_graph=True)[0] + # Norm's gradient could be NaN at 0. Use our own safe_norm + safe_norm = (torch.sum(gradient**2, dim=1) + self.EPSILON).sqrt() + gradient_mag = torch.mean((safe_norm - 1) ** 2) + return gradient_mag diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/reward_provider_factory.py b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/reward_provider_factory.py new file mode 100644 index 0000000000000000000000000000000000000000..825fc49006e4577cfab9ecaf4886b6b1b8c89cef --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/reward_provider_factory.py @@ -0,0 +1,47 @@ +from typing import Dict, Type +from mlagents.trainers.exception import UnityTrainerException + +from mlagents.trainers.settings import RewardSignalSettings, RewardSignalType + +from mlagents.trainers.torch_entities.components.reward_providers.base_reward_provider import ( + BaseRewardProvider, +) +from mlagents.trainers.torch_entities.components.reward_providers.extrinsic_reward_provider import ( + ExtrinsicRewardProvider, +) +from mlagents.trainers.torch_entities.components.reward_providers.curiosity_reward_provider import ( + CuriosityRewardProvider, +) +from mlagents.trainers.torch_entities.components.reward_providers.gail_reward_provider import ( + GAILRewardProvider, +) +from mlagents.trainers.torch_entities.components.reward_providers.rnd_reward_provider import ( + RNDRewardProvider, +) + +from mlagents_envs.base_env import BehaviorSpec + +NAME_TO_CLASS: Dict[RewardSignalType, Type[BaseRewardProvider]] = { + RewardSignalType.EXTRINSIC: ExtrinsicRewardProvider, + RewardSignalType.CURIOSITY: CuriosityRewardProvider, + RewardSignalType.GAIL: GAILRewardProvider, + RewardSignalType.RND: RNDRewardProvider, +} + + +def create_reward_provider( + name: RewardSignalType, specs: BehaviorSpec, settings: RewardSignalSettings +) -> 
BaseRewardProvider: + """ + Creates a reward provider class based on the name and config entry provided as a dict. + :param name: The name of the reward signal + :param specs: The BehaviorSpecs of the policy + :param settings: The RewardSignalSettings for that reward signal + :return: The reward signal class instantiated + """ + rcls = NAME_TO_CLASS.get(name) + if not rcls: + raise UnityTrainerException(f"Unknown reward signal type {name}") + + class_inst = rcls(specs, settings) + return class_inst diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/rnd_reward_provider.py b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/rnd_reward_provider.py new file mode 100644 index 0000000000000000000000000000000000000000..bda1424ab5c07776d10aafd5a6fb33a6c3043553 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/components/reward_providers/rnd_reward_provider.py @@ -0,0 +1,80 @@ +import numpy as np +from typing import Dict +from mlagents.torch_utils import torch + +from mlagents.trainers.buffer import AgentBuffer +from mlagents.trainers.torch_entities.components.reward_providers.base_reward_provider import ( + BaseRewardProvider, +) +from mlagents.trainers.settings import RNDSettings + +from mlagents_envs.base_env import BehaviorSpec +from mlagents_envs import logging_util +from mlagents.trainers.torch_entities.utils import ModelUtils +from mlagents.trainers.torch_entities.networks import NetworkBody +from mlagents.trainers.trajectory import ObsUtil + +logger = logging_util.get_logger(__name__) + + +class RNDRewardProvider(BaseRewardProvider): + """ + Implementation of Random Network Distillation : https://arxiv.org/pdf/1810.12894.pdf + """ + + def __init__(self, specs: BehaviorSpec, settings: RNDSettings) -> None: + super().__init__(specs, settings) + self._ignore_done = True + self._random_network = RNDNetwork(specs, settings) + self._training_network = RNDNetwork(specs, settings) + self.optimizer = torch.optim.Adam( + self._training_network.parameters(), lr=settings.learning_rate + ) + + def evaluate(self, mini_batch: AgentBuffer) -> np.ndarray: + with torch.no_grad(): + target = self._random_network(mini_batch) + prediction = self._training_network(mini_batch) + rewards = torch.sum((prediction - target) ** 2, dim=1) + return rewards.detach().cpu().numpy() + + def update(self, mini_batch: AgentBuffer) -> Dict[str, np.ndarray]: + with torch.no_grad(): + target = self._random_network(mini_batch) + prediction = self._training_network(mini_batch) + loss = torch.mean(torch.sum((prediction - target) ** 2, dim=1)) + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + return {"Losses/RND Loss": loss.detach().cpu().numpy()} + + def get_modules(self): + return { + f"Module:{self.name}-pred": self._training_network, + f"Module:{self.name}-target": self._random_network, + } + + +class RNDNetwork(torch.nn.Module): + EPSILON = 1e-10 + + def __init__(self, specs: BehaviorSpec, settings: RNDSettings) -> None: + super().__init__() + state_encoder_settings = settings.network_settings + if state_encoder_settings.memory is not None: + state_encoder_settings.memory = None + logger.warning( + "memory was specified in network_settings but is not supported by RND. It is being ignored." 
+ ) + + self._encoder = NetworkBody(specs.observation_specs, state_encoder_settings) + + def forward(self, mini_batch: AgentBuffer) -> torch.Tensor: + n_obs = len(self._encoder.processors) + np_obs = ObsUtil.from_buffer(mini_batch, n_obs) + # Convert to tensors + tensor_obs = [ModelUtils.list_to_tensor(obs) for obs in np_obs] + + hidden, _ = self._encoder.forward(tensor_obs) + self._encoder.update_normalization(mini_batch) + return hidden diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/conditioning.py b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/conditioning.py new file mode 100644 index 0000000000000000000000000000000000000000..65f622eba38846658f4bc44423883a28463777ff --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/conditioning.py @@ -0,0 +1,133 @@ +from mlagents.torch_utils import torch +from typing import List +import math + +from mlagents.trainers.torch_entities.layers import ( + linear_layer, + Swish, + Initialization, + LayerNorm, +) + + +class ConditionalEncoder(torch.nn.Module): + def __init__( + self, + input_size: int, + goal_size: int, + hidden_size: int, + num_layers: int, + num_conditional_layers: int, + kernel_init: Initialization = Initialization.KaimingHeNormal, + kernel_gain: float = 1.0, + ): + """ + ConditionalEncoder module. A fully connected network of which some of the + weights are generated by a goal conditioning. Uses the HyperNetwork module to + generate the weights of the network. Only the weights of the last + "num_conditional_layers" layers will be generated by HyperNetworks, the others + will use regular parameters. + :param input_size: The size of the input of the encoder + :param goal_size: The size of the goal tensor that will condition the encoder + :param hidden_size: The number of hidden units in the encoder + :param num_layers: The total number of layers of the encoder (both regular and + generated by HyperNetwork) + :param num_conditional_layers: The number of layers generated with hypernetworks + :param kernel_init: The Initialization to use for the weights of the layer + :param kernel_gain: The multiplier for the weights of the kernel. + """ + super().__init__() + layers: List[torch.nn.Module] = [] + prev_size = input_size + for i in range(num_layers): + if num_layers - i <= num_conditional_layers: + # This means layer i is a conditional layer since the conditional + # leyers are the last num_conditional_layers + layers.append( + HyperNetwork(prev_size, hidden_size, goal_size, hidden_size, 2) + ) + else: + layers.append( + linear_layer( + prev_size, + hidden_size, + kernel_init=kernel_init, + kernel_gain=kernel_gain, + ) + ) + layers.append(Swish()) + prev_size = hidden_size + self.layers = torch.nn.ModuleList(layers) + + def forward( + self, input_tensor: torch.Tensor, goal_tensor: torch.Tensor + ) -> torch.Tensor: # type: ignore + activation = input_tensor + for layer in self.layers: + if isinstance(layer, HyperNetwork): + activation = layer(activation, goal_tensor) + else: + activation = layer(activation) + return activation + + +class HyperNetwork(torch.nn.Module): + def __init__( + self, input_size, output_size, hyper_input_size, layer_size, num_layers + ): + """ + Hyper Network module. This module will use the hyper_input tensor to generate + the weights of the main network. The main network is a single fully connected + layer. 
+ :param input_size: The size of the input of the main network + :param output_size: The size of the output of the main network + :param hyper_input_size: The size of the input of the hypernetwork that will + generate the main network. + :param layer_size: The number of hidden units in the layers of the hypernetwork + :param num_layers: The number of layers of the hypernetwork + """ + super().__init__() + self.input_size = input_size + self.output_size = output_size + + layer_in_size = hyper_input_size + layers = [] + for _ in range(num_layers): + layers.append( + linear_layer( + layer_in_size, + layer_size, + kernel_init=Initialization.KaimingHeNormal, + kernel_gain=1.0, + bias_init=Initialization.Zero, + ) + ) + layers.append(Swish()) + layer_in_size = layer_size + flat_output = linear_layer( + layer_size, + input_size * output_size, + kernel_init=Initialization.KaimingHeNormal, + kernel_gain=0.1, + bias_init=Initialization.Zero, + ) + + # Re-initializing the weights of the last layer of the hypernetwork + bound = math.sqrt(1 / (layer_size * self.input_size)) + flat_output.weight.data.uniform_(-bound, bound) + + self.hypernet = torch.nn.Sequential(*layers, LayerNorm(), flat_output) + + # The hypernetwork will not generate the bias of the main network layer + self.bias = torch.nn.Parameter(torch.zeros(output_size)) + + def forward(self, input_activation, hyper_input): + output_weights = self.hypernet(hyper_input) + + output_weights = output_weights.view(-1, self.input_size, self.output_size) + + result = ( + torch.bmm(input_activation.unsqueeze(1), output_weights).squeeze(1) + + self.bias + ) + return result diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/decoders.py b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/decoders.py new file mode 100644 index 0000000000000000000000000000000000000000..30f196a455614754b627888ba7e9f380d53aff0d --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/decoders.py @@ -0,0 +1,22 @@ +from typing import List, Dict + +from mlagents.torch_utils import torch, nn +from mlagents.trainers.torch_entities.layers import linear_layer + + +class ValueHeads(nn.Module): + def __init__(self, stream_names: List[str], input_size: int, output_size: int = 1): + super().__init__() + self.stream_names = stream_names + _value_heads = {} + + for name in stream_names: + value = linear_layer(input_size, output_size) + _value_heads[name] = value + self.value_heads = nn.ModuleDict(_value_heads) + + def forward(self, hidden: torch.Tensor) -> Dict[str, torch.Tensor]: + value_outputs = {} + for stream_name, head in self.value_heads.items(): + value_outputs[stream_name] = head(hidden).squeeze(-1) + return value_outputs diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/distributions.py b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/distributions.py new file mode 100644 index 0000000000000000000000000000000000000000..47fd0d0847e2ca0b2a1b45030e2c07dab5861dc2 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/distributions.py @@ -0,0 +1,248 @@ +import abc +from typing import List +from mlagents.torch_utils import torch, nn +import numpy as np +import math +from mlagents.trainers.torch_entities.layers import linear_layer, Initialization + +EPSILON = 1e-7 # Small value to avoid divide by zero + + +class DistInstance(nn.Module, abc.ABC): + @abc.abstractmethod + def sample(self) -> torch.Tensor: + """ + Return a sample from this distribution. 
+ """ + pass + + @abc.abstractmethod + def deterministic_sample(self) -> torch.Tensor: + """ + Return the most probable sample from this distribution. + """ + pass + + @abc.abstractmethod + def log_prob(self, value: torch.Tensor) -> torch.Tensor: + """ + Returns the log probabilities of a particular value. + :param value: A value sampled from the distribution. + :returns: Log probabilities of the given value. + """ + pass + + @abc.abstractmethod + def entropy(self) -> torch.Tensor: + """ + Returns the entropy of this distribution. + """ + pass + + @abc.abstractmethod + def exported_model_output(self) -> torch.Tensor: + """ + Returns the tensor to be exported to ONNX for the distribution + """ + pass + + +class DiscreteDistInstance(DistInstance): + @abc.abstractmethod + def all_log_prob(self) -> torch.Tensor: + """ + Returns the log probabilities of all actions represented by this distribution. + """ + pass + + +class GaussianDistInstance(DistInstance): + def __init__(self, mean, std): + super().__init__() + self.mean = mean + self.std = std + + def sample(self): + sample = self.mean + torch.randn_like(self.mean) * self.std + return sample + + def deterministic_sample(self): + return self.mean + + def log_prob(self, value): + var = self.std**2 + log_scale = torch.log(self.std + EPSILON) + return ( + -((value - self.mean) ** 2) / (2 * var + EPSILON) + - log_scale + - math.log(math.sqrt(2 * math.pi)) + ) + + def pdf(self, value): + log_prob = self.log_prob(value) + return torch.exp(log_prob) + + def entropy(self): + return torch.mean( + 0.5 * torch.log(2 * math.pi * math.e * self.std**2 + EPSILON), + dim=1, + keepdim=True, + ) # Use equivalent behavior to TF + + def exported_model_output(self): + return self.sample() + + +class TanhGaussianDistInstance(GaussianDistInstance): + def __init__(self, mean, std): + super().__init__(mean, std) + self.transform = torch.distributions.transforms.TanhTransform(cache_size=1) + + def sample(self): + unsquashed_sample = super().sample() + squashed = self.transform(unsquashed_sample) + return squashed + + def _inverse_tanh(self, value): + capped_value = torch.clamp(value, -1 + EPSILON, 1 - EPSILON) + return 0.5 * torch.log((1 + capped_value) / (1 - capped_value) + EPSILON) + + def log_prob(self, value): + unsquashed = self.transform.inv(value) + return super().log_prob(unsquashed) - self.transform.log_abs_det_jacobian( + unsquashed, value + ) + + +class CategoricalDistInstance(DiscreteDistInstance): + def __init__(self, logits): + super().__init__() + self.logits = logits + self.probs = torch.softmax(self.logits, dim=-1) + + def sample(self): + return torch.multinomial(self.probs, 1) + + def deterministic_sample(self): + return torch.argmax(self.probs, dim=1, keepdim=True) + + def pdf(self, value): + # This function is equivalent to torch.diag(self.probs.T[value.flatten().long()]), + # but torch.diag is not supported by ONNX export. 
+ idx = torch.arange(start=0, end=len(value)).unsqueeze(-1) + return torch.gather( + self.probs.permute(1, 0)[value.flatten().long()], -1, idx + ).squeeze(-1) + + def log_prob(self, value): + return torch.log(self.pdf(value) + EPSILON) + + def all_log_prob(self): + return torch.log(self.probs + EPSILON) + + def entropy(self): + return -torch.sum( + self.probs * torch.log(self.probs + EPSILON), dim=-1 + ).unsqueeze(-1) + + def exported_model_output(self): + return self.sample() + + +class GaussianDistribution(nn.Module): + def __init__( + self, + hidden_size: int, + num_outputs: int, + conditional_sigma: bool = False, + tanh_squash: bool = False, + ): + super().__init__() + self.conditional_sigma = conditional_sigma + self.mu = linear_layer( + hidden_size, + num_outputs, + kernel_init=Initialization.KaimingHeNormal, + kernel_gain=0.2, + bias_init=Initialization.Zero, + ) + self.tanh_squash = tanh_squash + if conditional_sigma: + self.log_sigma = linear_layer( + hidden_size, + num_outputs, + kernel_init=Initialization.KaimingHeNormal, + kernel_gain=0.2, + bias_init=Initialization.Zero, + ) + else: + self.log_sigma = nn.Parameter( + torch.zeros(1, num_outputs, requires_grad=True) + ) + + def forward(self, inputs: torch.Tensor) -> List[DistInstance]: + mu = self.mu(inputs) + if self.conditional_sigma: + log_sigma = torch.clamp(self.log_sigma(inputs), min=-20, max=2) + else: + # Expand so that entropy matches batch size. Note that we're using + # mu*0 here to get the batch size implicitly since Barracuda 1.2.1 + # throws error on runtime broadcasting due to unknown reason. We + # use this to replace torch.expand() becuase it is not supported in + # the verified version of Barracuda (1.0.X). + log_sigma = mu * 0 + self.log_sigma + if self.tanh_squash: + return TanhGaussianDistInstance(mu, torch.exp(log_sigma)) + else: + return GaussianDistInstance(mu, torch.exp(log_sigma)) + + +class MultiCategoricalDistribution(nn.Module): + def __init__(self, hidden_size: int, act_sizes: List[int]): + super().__init__() + self.act_sizes = act_sizes + self.branches = self._create_policy_branches(hidden_size) + + def _create_policy_branches(self, hidden_size: int) -> nn.ModuleList: + branches = [] + for size in self.act_sizes: + branch_output_layer = linear_layer( + hidden_size, + size, + kernel_init=Initialization.KaimingHeNormal, + kernel_gain=0.1, + bias_init=Initialization.Zero, + ) + branches.append(branch_output_layer) + return nn.ModuleList(branches) + + def _mask_branch( + self, logits: torch.Tensor, allow_mask: torch.Tensor + ) -> torch.Tensor: + # Zero out masked logits, then subtract a large value. Technique mentionend here: + # https://arxiv.org/abs/2006.14171. Our implementation is ONNX and Barracuda-friendly. 
+ block_mask = -1.0 * allow_mask + 1.0 + # We do -1 * tensor + constant instead of constant - tensor because it seems + # Barracuda might swap the inputs of a "Sub" operation + logits = logits * allow_mask - 1e8 * block_mask + + return logits + + def _split_masks(self, masks: torch.Tensor) -> List[torch.Tensor]: + split_masks = [] + for idx, _ in enumerate(self.act_sizes): + start = int(np.sum(self.act_sizes[:idx])) + end = int(np.sum(self.act_sizes[: idx + 1])) + split_masks.append(masks[:, start:end]) + return split_masks + + def forward(self, inputs: torch.Tensor, masks: torch.Tensor) -> List[DistInstance]: + # Todo - Support multiple branches in mask code + branch_distributions = [] + masks = self._split_masks(masks) + for idx, branch in enumerate(self.branches): + logits = branch(inputs) + norm_logits = self._mask_branch(logits, masks[idx]) + distribution = CategoricalDistInstance(norm_logits) + branch_distributions.append(distribution) + return branch_distributions diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/encoders.py b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/encoders.py new file mode 100644 index 0000000000000000000000000000000000000000..32b944ddfc7e47c9203eaa70f3fd607855f1ec00 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/encoders.py @@ -0,0 +1,298 @@ +from typing import Tuple, Optional, Union + +from mlagents.trainers.torch_entities.layers import linear_layer, Initialization, Swish + +from mlagents.torch_utils import torch, nn +from mlagents.trainers.torch_entities.model_serialization import exporting_to_onnx + + +class Normalizer(nn.Module): + def __init__(self, vec_obs_size: int): + super().__init__() + self.register_buffer("normalization_steps", torch.tensor(1)) + self.register_buffer("running_mean", torch.zeros(vec_obs_size)) + self.register_buffer("running_variance", torch.ones(vec_obs_size)) + + def forward(self, inputs: torch.Tensor) -> torch.Tensor: + normalized_state = torch.clamp( + (inputs - self.running_mean) + / torch.sqrt(self.running_variance / self.normalization_steps), + -5, + 5, + ) + return normalized_state + + def update(self, vector_input: torch.Tensor) -> None: + with torch.no_grad(): + steps_increment = vector_input.size()[0] + total_new_steps = self.normalization_steps + steps_increment + + input_to_old_mean = vector_input - self.running_mean + new_mean: torch.Tensor = self.running_mean + ( + input_to_old_mean / total_new_steps + ).sum(0) + + input_to_new_mean = vector_input - new_mean + new_variance = self.running_variance + ( + input_to_new_mean * input_to_old_mean + ).sum(0) + # Update references. This is much faster than in-place data update. + self.running_mean: torch.Tensor = new_mean + self.running_variance: torch.Tensor = new_variance + self.normalization_steps: torch.Tensor = total_new_steps + + def copy_from(self, other_normalizer: "Normalizer") -> None: + self.normalization_steps.data.copy_(other_normalizer.normalization_steps.data) + self.running_mean.data.copy_(other_normalizer.running_mean.data) + self.running_variance.copy_(other_normalizer.running_variance.data) + + +def conv_output_shape( + h_w: Tuple[int, int], + kernel_size: Union[int, Tuple[int, int]] = 1, + stride: int = 1, + padding: int = 0, + dilation: int = 1, +) -> Tuple[int, int]: + """ + Calculates the output shape (height and width) of the output of a convolution layer. 
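# ---- Example sketch (standalone, arbitrary numbers): the branch-masking scheme
# above zeroes the logits of disallowed actions and shifts them by -1e8, so a
# softmax assigns them effectively zero probability while staying export-friendly:
import torch

logits = torch.tensor([[2.0, 0.5, -1.0]])
allow_mask = torch.tensor([[1.0, 0.0, 1.0]])  # middle action is masked out

block_mask = -1.0 * allow_mask + 1.0
masked_logits = logits * allow_mask - 1e8 * block_mask
probs = torch.softmax(masked_logits, dim=-1)
assert probs[0, 1].item() < 1e-6           # masked action gets ~0 probability
assert abs(probs.sum().item() - 1.0) < 1e-6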
+ kernel_size, stride, padding and dilation correspond to the inputs of the + torch.nn.Conv2d layer (https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html) + :param h_w: The height and width of the input. + :param kernel_size: The size of the kernel of the convolution (can be an int or a + tuple [width, height]) + :param stride: The stride of the convolution + :param padding: The padding of the convolution + :param dilation: The dilation of the convolution + """ + from math import floor + + if not isinstance(kernel_size, tuple): + kernel_size = (int(kernel_size), int(kernel_size)) + h = floor( + ((h_w[0] + (2 * padding) - (dilation * (kernel_size[0] - 1)) - 1) / stride) + 1 + ) + w = floor( + ((h_w[1] + (2 * padding) - (dilation * (kernel_size[1] - 1)) - 1) / stride) + 1 + ) + return h, w + + +def pool_out_shape(h_w: Tuple[int, int], kernel_size: int) -> Tuple[int, int]: + """ + Calculates the output shape (height and width) of the output of a max pooling layer. + kernel_size corresponds to the inputs of the + torch.nn.MaxPool2d layer (https://pytorch.org/docs/stable/generated/torch.nn.MaxPool2d.html) + :param kernel_size: The size of the kernel of the convolution + """ + height = (h_w[0] - kernel_size) // 2 + 1 + width = (h_w[1] - kernel_size) // 2 + 1 + return height, width + + +class VectorInput(nn.Module): + def __init__(self, input_size: int, normalize: bool = False): + super().__init__() + self.normalizer: Optional[Normalizer] = None + if normalize: + self.normalizer = Normalizer(input_size) + + def forward(self, inputs: torch.Tensor) -> torch.Tensor: + if self.normalizer is not None: + inputs = self.normalizer(inputs) + return inputs + + def copy_normalization(self, other_input: "VectorInput") -> None: + if self.normalizer is not None and other_input.normalizer is not None: + self.normalizer.copy_from(other_input.normalizer) + + def update_normalization(self, inputs: torch.Tensor) -> None: + if self.normalizer is not None: + self.normalizer.update(inputs) + + +class FullyConnectedVisualEncoder(nn.Module): + def __init__( + self, height: int, width: int, initial_channels: int, output_size: int + ): + super().__init__() + self.output_size = output_size + self.input_size = height * width * initial_channels + self.dense = nn.Sequential( + linear_layer( + self.input_size, + self.output_size, + kernel_init=Initialization.KaimingHeNormal, + kernel_gain=1.41, # Use ReLU gain + ), + nn.LeakyReLU(), + ) + + def forward(self, visual_obs: torch.Tensor) -> torch.Tensor: + if not exporting_to_onnx.is_exporting(): + visual_obs = visual_obs.permute([0, 3, 1, 2]) + hidden = visual_obs.reshape(-1, self.input_size) + return self.dense(hidden) + + +class SmallVisualEncoder(nn.Module): + """ + CNN architecture used by King in their Candy Crush predictor + https://www.researchgate.net/publication/328307928_Human-Like_Playtesting_with_Deep_Learning + """ + + def __init__( + self, height: int, width: int, initial_channels: int, output_size: int + ): + super().__init__() + self.h_size = output_size + conv_1_hw = conv_output_shape((height, width), 3, 1) + conv_2_hw = conv_output_shape(conv_1_hw, 3, 1) + self.final_flat = conv_2_hw[0] * conv_2_hw[1] * 144 + + self.conv_layers = nn.Sequential( + nn.Conv2d(initial_channels, 35, [3, 3], [1, 1]), + nn.LeakyReLU(), + nn.Conv2d(35, 144, [3, 3], [1, 1]), + nn.LeakyReLU(), + ) + self.dense = nn.Sequential( + linear_layer( + self.final_flat, + self.h_size, + kernel_init=Initialization.KaimingHeNormal, + kernel_gain=1.41, # Use ReLU gain + ), + nn.LeakyReLU(), 
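# ---- Example sketch (standalone, plain torch): conv_output_shape above mirrors the
# standard Conv2d size formula. Checking it against an actual nn.Conv2d with
# kernel 8, stride 4, no padding on an 84x84 input (which should give 20x20):
import math
import torch
from torch import nn

h_w, kernel, stride, padding, dilation = (84, 84), 8, 4, 0, 1
h = math.floor((h_w[0] + 2 * padding - dilation * (kernel - 1) - 1) / stride + 1)
w = math.floor((h_w[1] + 2 * padding - dilation * (kernel - 1) - 1) / stride + 1)

out = nn.Conv2d(3, 16, kernel_size=8, stride=4)(torch.zeros(1, 3, 84, 84))
assert (h, w) == (20, 20) == tuple(out.shape[-2:])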
+ ) + + def forward(self, visual_obs: torch.Tensor) -> torch.Tensor: + if not exporting_to_onnx.is_exporting(): + visual_obs = visual_obs.permute([0, 3, 1, 2]) + hidden = self.conv_layers(visual_obs) + hidden = hidden.reshape(-1, self.final_flat) + return self.dense(hidden) + + +class SimpleVisualEncoder(nn.Module): + def __init__( + self, height: int, width: int, initial_channels: int, output_size: int + ): + super().__init__() + self.h_size = output_size + conv_1_hw = conv_output_shape((height, width), 8, 4) + conv_2_hw = conv_output_shape(conv_1_hw, 4, 2) + self.final_flat = conv_2_hw[0] * conv_2_hw[1] * 32 + + self.conv_layers = nn.Sequential( + nn.Conv2d(initial_channels, 16, [8, 8], [4, 4]), + nn.LeakyReLU(), + nn.Conv2d(16, 32, [4, 4], [2, 2]), + nn.LeakyReLU(), + ) + self.dense = nn.Sequential( + linear_layer( + self.final_flat, + self.h_size, + kernel_init=Initialization.KaimingHeNormal, + kernel_gain=1.41, # Use ReLU gain + ), + nn.LeakyReLU(), + ) + + def forward(self, visual_obs: torch.Tensor) -> torch.Tensor: + if not exporting_to_onnx.is_exporting(): + visual_obs = visual_obs.permute([0, 3, 1, 2]) + hidden = self.conv_layers(visual_obs) + hidden = hidden.reshape(-1, self.final_flat) + return self.dense(hidden) + + +class NatureVisualEncoder(nn.Module): + def __init__( + self, height: int, width: int, initial_channels: int, output_size: int + ): + super().__init__() + self.h_size = output_size + conv_1_hw = conv_output_shape((height, width), 8, 4) + conv_2_hw = conv_output_shape(conv_1_hw, 4, 2) + conv_3_hw = conv_output_shape(conv_2_hw, 3, 1) + self.final_flat = conv_3_hw[0] * conv_3_hw[1] * 64 + + self.conv_layers = nn.Sequential( + nn.Conv2d(initial_channels, 32, [8, 8], [4, 4]), + nn.LeakyReLU(), + nn.Conv2d(32, 64, [4, 4], [2, 2]), + nn.LeakyReLU(), + nn.Conv2d(64, 64, [3, 3], [1, 1]), + nn.LeakyReLU(), + ) + self.dense = nn.Sequential( + linear_layer( + self.final_flat, + self.h_size, + kernel_init=Initialization.KaimingHeNormal, + kernel_gain=1.41, # Use ReLU gain + ), + nn.LeakyReLU(), + ) + + def forward(self, visual_obs: torch.Tensor) -> torch.Tensor: + if not exporting_to_onnx.is_exporting(): + visual_obs = visual_obs.permute([0, 3, 1, 2]) + hidden = self.conv_layers(visual_obs) + hidden = hidden.reshape([-1, self.final_flat]) + return self.dense(hidden) + + +class ResNetBlock(nn.Module): + def __init__(self, channel: int): + """ + Creates a ResNet Block. 
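# ---- Example sketch (standalone, plain torch, assuming a classic 84x84x3 input):
# for that input size the Nature CNN stack above flattens to 7 * 7 * 64 = 3136
# features before the dense layer. Reproducing just the shape arithmetic:
import torch
from torch import nn

convs = nn.Sequential(
    nn.Conv2d(3, 32, 8, 4), nn.LeakyReLU(),
    nn.Conv2d(32, 64, 4, 2), nn.LeakyReLU(),
    nn.Conv2d(64, 64, 3, 1), nn.LeakyReLU(),
)
hidden = convs(torch.zeros(1, 3, 84, 84))
assert hidden.shape == (1, 64, 7, 7)
assert hidden.reshape(1, -1).shape[1] == 3136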
+ :param channel: The number of channels in the input (and output) tensors of the + convolutions + """ + super().__init__() + self.layers = nn.Sequential( + Swish(), + nn.Conv2d(channel, channel, [3, 3], [1, 1], padding=1), + Swish(), + nn.Conv2d(channel, channel, [3, 3], [1, 1], padding=1), + ) + + def forward(self, input_tensor: torch.Tensor) -> torch.Tensor: + return input_tensor + self.layers(input_tensor) + + +class ResNetVisualEncoder(nn.Module): + def __init__( + self, height: int, width: int, initial_channels: int, output_size: int + ): + super().__init__() + n_channels = [16, 32, 32] # channel for each stack + n_blocks = 2 # number of residual blocks + layers = [] + last_channel = initial_channels + for _, channel in enumerate(n_channels): + layers.append(nn.Conv2d(last_channel, channel, [3, 3], [1, 1], padding=1)) + layers.append(nn.MaxPool2d([3, 3], [2, 2])) + height, width = pool_out_shape((height, width), 3) + for _ in range(n_blocks): + layers.append(ResNetBlock(channel)) + last_channel = channel + layers.append(Swish()) + self.final_flat_size = n_channels[-1] * height * width + self.dense = linear_layer( + self.final_flat_size, + output_size, + kernel_init=Initialization.KaimingHeNormal, + kernel_gain=1.41, # Use ReLU gain + ) + self.sequential = nn.Sequential(*layers) + + def forward(self, visual_obs: torch.Tensor) -> torch.Tensor: + if not exporting_to_onnx.is_exporting(): + visual_obs = visual_obs.permute([0, 3, 1, 2]) + hidden = self.sequential(visual_obs) + before_out = hidden.reshape(-1, self.final_flat_size) + return torch.relu(self.dense(before_out)) diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/layers.py b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..e5a598edf64baa072941acb6a917d9dccef179a5 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/layers.py @@ -0,0 +1,225 @@ +from mlagents.torch_utils import torch +import abc +from typing import Tuple +from enum import Enum +from mlagents.trainers.torch_entities.model_serialization import exporting_to_onnx + + +class Swish(torch.nn.Module): + def forward(self, data: torch.Tensor) -> torch.Tensor: + return torch.mul(data, torch.sigmoid(data)) + + +class Initialization(Enum): + Zero = 0 + XavierGlorotNormal = 1 + XavierGlorotUniform = 2 + KaimingHeNormal = 3 # also known as Variance scaling + KaimingHeUniform = 4 + Normal = 5 + + +_init_methods = { + Initialization.Zero: torch.zero_, + Initialization.XavierGlorotNormal: torch.nn.init.xavier_normal_, + Initialization.XavierGlorotUniform: torch.nn.init.xavier_uniform_, + Initialization.KaimingHeNormal: torch.nn.init.kaiming_normal_, + Initialization.KaimingHeUniform: torch.nn.init.kaiming_uniform_, + Initialization.Normal: torch.nn.init.normal_, +} + + +def linear_layer( + input_size: int, + output_size: int, + kernel_init: Initialization = Initialization.XavierGlorotUniform, + kernel_gain: float = 1.0, + bias_init: Initialization = Initialization.Zero, +) -> torch.nn.Module: + """ + Creates a torch.nn.Linear module and initializes its weights. + :param input_size: The size of the input tensor + :param output_size: The size of the output tensor + :param kernel_init: The Initialization to use for the weights of the layer + :param kernel_gain: The multiplier for the weights of the kernel. Note that in + TensorFlow, the gain is square-rooted. 
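# ---- Example sketch (standalone): the Swish activation above is x * sigmoid(x),
# which recent PyTorch releases also expose as F.silu; a quick equivalence check:
import torch
import torch.nn.functional as F

x = torch.randn(4, 8)
swish = torch.mul(x, torch.sigmoid(x))
assert torch.allclose(swish, F.silu(x), atol=1e-6)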
Therefore calling with scale 0.01 is equivalent to calling + KaimingHeNormal with kernel_gain of 0.1 + :param bias_init: The Initialization to use for the weights of the bias layer + """ + layer = torch.nn.Linear(input_size, output_size) + if ( + kernel_init == Initialization.KaimingHeNormal + or kernel_init == Initialization.KaimingHeUniform + ): + _init_methods[kernel_init](layer.weight.data, nonlinearity="linear") + else: + _init_methods[kernel_init](layer.weight.data) + layer.weight.data *= kernel_gain + _init_methods[bias_init](layer.bias.data) + return layer + + +def lstm_layer( + input_size: int, + hidden_size: int, + num_layers: int = 1, + batch_first: bool = True, + forget_bias: float = 1.0, + kernel_init: Initialization = Initialization.XavierGlorotUniform, + bias_init: Initialization = Initialization.Zero, +) -> torch.nn.Module: + """ + Creates a torch.nn.LSTM and initializes its weights and biases. Provides a + forget_bias offset like is done in TensorFlow. + """ + lstm = torch.nn.LSTM(input_size, hidden_size, num_layers, batch_first=batch_first) + # Add forget_bias to forget gate bias + for name, param in lstm.named_parameters(): + # Each weight and bias is a concatenation of 4 matrices + if "weight" in name: + for idx in range(4): + block_size = param.shape[0] // 4 + _init_methods[kernel_init]( + param.data[idx * block_size : (idx + 1) * block_size] + ) + if "bias" in name: + for idx in range(4): + block_size = param.shape[0] // 4 + _init_methods[bias_init]( + param.data[idx * block_size : (idx + 1) * block_size] + ) + if idx == 1: + param.data[idx * block_size : (idx + 1) * block_size].add_( + forget_bias + ) + return lstm + + +class MemoryModule(torch.nn.Module): + @abc.abstractproperty + def memory_size(self) -> int: + """ + Size of memory that is required at the start of a sequence. + """ + pass + + @abc.abstractmethod + def forward( + self, input_tensor: torch.Tensor, memories: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Pass a sequence to the memory module. + :input_tensor: Tensor of shape (batch_size, seq_length, size) that represents the input. + :memories: Tensor of initial memories. + :return: Tuple of output, final memories. + """ + pass + + +class LayerNorm(torch.nn.Module): + """ + A vanilla implementation of layer normalization https://arxiv.org/pdf/1607.06450.pdf + norm_x = (x - mean) / sqrt((x - mean) ^ 2) + This does not include the trainable parameters gamma and beta for performance speed. + Typically, this is norm_x * gamma + beta + """ + + def forward(self, layer_activations: torch.Tensor) -> torch.Tensor: + mean = torch.mean(layer_activations, dim=-1, keepdim=True) + var = torch.mean((layer_activations - mean) ** 2, dim=-1, keepdim=True) + return (layer_activations - mean) / (torch.sqrt(var + 1e-5)) + + +class LinearEncoder(torch.nn.Module): + """ + Linear layers. 
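# ---- Example sketch (standalone, arbitrary input): the hand-rolled LayerNorm above
# (biased variance, eps 1e-5, no learnable gamma/beta) matches F.layer_norm when
# weight and bias are omitted:
import torch
import torch.nn.functional as F

x = torch.randn(2, 5)
mean = torch.mean(x, dim=-1, keepdim=True)
var = torch.mean((x - mean) ** 2, dim=-1, keepdim=True)
manual = (x - mean) / torch.sqrt(var + 1e-5)
assert torch.allclose(manual, F.layer_norm(x, normalized_shape=(5,)), atol=1e-5)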
+ """ + + def __init__( + self, + input_size: int, + num_layers: int, + hidden_size: int, + kernel_init: Initialization = Initialization.KaimingHeNormal, + kernel_gain: float = 1.0, + ): + super().__init__() + self.layers = [ + linear_layer( + input_size, + hidden_size, + kernel_init=kernel_init, + kernel_gain=kernel_gain, + ) + ] + self.layers.append(Swish()) + for _ in range(num_layers - 1): + self.layers.append( + linear_layer( + hidden_size, + hidden_size, + kernel_init=kernel_init, + kernel_gain=kernel_gain, + ) + ) + self.layers.append(Swish()) + self.seq_layers = torch.nn.Sequential(*self.layers) + + def forward(self, input_tensor: torch.Tensor) -> torch.Tensor: + return self.seq_layers(input_tensor) + + +class LSTM(MemoryModule): + """ + Memory module that implements LSTM. + """ + + def __init__( + self, + input_size: int, + memory_size: int, + num_layers: int = 1, + forget_bias: float = 1.0, + kernel_init: Initialization = Initialization.XavierGlorotUniform, + bias_init: Initialization = Initialization.Zero, + ): + super().__init__() + # We set hidden size to half of memory_size since the initial memory + # will be divided between the hidden state and initial cell state. + self.hidden_size = memory_size // 2 + self.lstm = lstm_layer( + input_size, + self.hidden_size, + num_layers, + True, + forget_bias, + kernel_init, + bias_init, + ) + + @property + def memory_size(self) -> int: + return 2 * self.hidden_size + + def forward( + self, input_tensor: torch.Tensor, memories: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + + if exporting_to_onnx.is_exporting(): + # This transpose is needed both at input and output of the LSTM when + # exporting because ONNX will expect (sequence_len, batch, memory_size) + # instead of (batch, sequence_len, memory_size) + memories = torch.transpose(memories, 0, 1) + + # We don't use torch.split here since it is not supported by Barracuda + h0 = memories[:, :, : self.hidden_size].contiguous() + c0 = memories[:, :, self.hidden_size :].contiguous() + + hidden = (h0, c0) + lstm_out, hidden_out = self.lstm(input_tensor, hidden) + output_mem = torch.cat(hidden_out, dim=-1) + + if exporting_to_onnx.is_exporting(): + output_mem = torch.transpose(output_mem, 0, 1) + + return lstm_out, output_mem diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/model_serialization.py b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/model_serialization.py new file mode 100644 index 0000000000000000000000000000000000000000..f204b52445e5247bc8b921a4eb4207218a9d2747 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/model_serialization.py @@ -0,0 +1,173 @@ +from typing import Tuple +import threading +from mlagents.torch_utils import torch + +from mlagents_envs.logging_util import get_logger +from mlagents.trainers.settings import SerializationSettings + + +logger = get_logger(__name__) + + +class exporting_to_onnx: + """ + Set this context by calling + ``` + with exporting_to_onnx(): + ``` + Within this context, the variable exporting_to_onnx.is_exporting() will be true. + This implementation is thread safe. 
+ """ + + # local is_exporting flag for each thread + _local_data = threading.local() + _local_data._is_exporting = False + + # global lock shared among all threads, to make sure only one thread is exporting at a time + _lock = threading.Lock() + + def __enter__(self): + self._lock.acquire() + self._local_data._is_exporting = True + + def __exit__(self, *args): + self._local_data._is_exporting = False + self._lock.release() + + @staticmethod + def is_exporting(): + if not hasattr(exporting_to_onnx._local_data, "_is_exporting"): + return False + return exporting_to_onnx._local_data._is_exporting + + +class TensorNames: + batch_size_placeholder = "batch_size" + sequence_length_placeholder = "sequence_length" + vector_observation_placeholder = "vector_observation" + recurrent_in_placeholder = "recurrent_in" + visual_observation_placeholder_prefix = "visual_observation_" + observation_placeholder_prefix = "obs_" + previous_action_placeholder = "prev_action" + action_mask_placeholder = "action_masks" + random_normal_epsilon_placeholder = "epsilon" + + value_estimate_output = "value_estimate" + recurrent_output = "recurrent_out" + memory_size = "memory_size" + version_number = "version_number" + + continuous_action_output_shape = "continuous_action_output_shape" + discrete_action_output_shape = "discrete_action_output_shape" + continuous_action_output = "continuous_actions" + discrete_action_output = "discrete_actions" + deterministic_continuous_action_output = "deterministic_continuous_actions" + deterministic_discrete_action_output = "deterministic_discrete_actions" + + # Deprecated TensorNames entries for backward compatibility + is_continuous_control_deprecated = "is_continuous_control" + action_output_deprecated = "action" + action_output_shape_deprecated = "action_output_shape" + + @staticmethod + def get_visual_observation_name(index: int) -> str: + """ + Returns the name of the visual observation with a given index + """ + return TensorNames.visual_observation_placeholder_prefix + str(index) + + @staticmethod + def get_observation_name(index: int) -> str: + """ + Returns the name of the observation with a given index + """ + return TensorNames.observation_placeholder_prefix + str(index) + + +class ModelSerializer: + def __init__(self, policy): + # ONNX only support input in NCHW (channel first) format. + # Barracuda also expect to get data in NCHW. + # Any multi-dimentional input should follow that otherwise will + # cause problem to barracuda import. 
+ self.policy = policy + observation_specs = self.policy.behavior_spec.observation_specs + batch_dim = [1] + seq_len_dim = [1] + num_obs = len(observation_specs) + + dummy_obs = [ + torch.zeros( + batch_dim + list(ModelSerializer._get_onnx_shape(obs_spec.shape)) + ) + for obs_spec in observation_specs + ] + + dummy_masks = torch.ones( + batch_dim + [sum(self.policy.behavior_spec.action_spec.discrete_branches)] + ) + dummy_memories = torch.zeros( + batch_dim + seq_len_dim + [self.policy.export_memory_size] + ) + + self.dummy_input = (dummy_obs, dummy_masks, dummy_memories) + + self.input_names = [TensorNames.get_observation_name(i) for i in range(num_obs)] + self.input_names += [ + TensorNames.action_mask_placeholder, + TensorNames.recurrent_in_placeholder, + ] + + self.dynamic_axes = {name: {0: "batch"} for name in self.input_names} + + self.output_names = [TensorNames.version_number, TensorNames.memory_size] + if self.policy.behavior_spec.action_spec.continuous_size > 0: + self.output_names += [ + TensorNames.continuous_action_output, + TensorNames.continuous_action_output_shape, + TensorNames.deterministic_continuous_action_output, + ] + self.dynamic_axes.update( + {TensorNames.continuous_action_output: {0: "batch"}} + ) + if self.policy.behavior_spec.action_spec.discrete_size > 0: + self.output_names += [ + TensorNames.discrete_action_output, + TensorNames.discrete_action_output_shape, + TensorNames.deterministic_discrete_action_output, + ] + self.dynamic_axes.update({TensorNames.discrete_action_output: {0: "batch"}}) + + if self.policy.export_memory_size > 0: + self.output_names += [TensorNames.recurrent_output] + + @staticmethod + def _get_onnx_shape(shape: Tuple[int, ...]) -> Tuple[int, ...]: + """ + Converts the shape of an observation to be compatible with the NCHW format + of ONNX + """ + if len(shape) == 3: + return shape[2], shape[0], shape[1] + return shape + + def export_policy_model(self, output_filepath: str) -> None: + """ + Exports a Torch model for a Policy to .onnx format for Unity embedding. 
+ + :param output_filepath: file path to output the model (without file suffix) + """ + onnx_output_path = f"{output_filepath}.onnx" + logger.debug(f"Converting to {onnx_output_path}") + + with exporting_to_onnx(): + torch.onnx.export( + self.policy.actor, + self.dummy_input, + onnx_output_path, + opset_version=SerializationSettings.onnx_opset, + input_names=self.input_names, + output_names=self.output_names, + dynamic_axes=self.dynamic_axes, + ) + logger.info(f"Exported {onnx_output_path}") diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/networks.py b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/networks.py new file mode 100644 index 0000000000000000000000000000000000000000..555268075c90d6e111f5511f16290fd632be88b3 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/networks.py @@ -0,0 +1,767 @@ +from typing import Callable, List, Dict, Tuple, Optional, Union, Any +import abc + +from mlagents.torch_utils import torch, nn + +from mlagents_envs.base_env import ActionSpec, ObservationSpec, ObservationType +from mlagents.trainers.torch_entities.action_model import ActionModel +from mlagents.trainers.torch_entities.agent_action import AgentAction +from mlagents.trainers.settings import NetworkSettings, EncoderType, ConditioningType +from mlagents.trainers.torch_entities.utils import ModelUtils +from mlagents.trainers.torch_entities.decoders import ValueHeads +from mlagents.trainers.torch_entities.layers import LSTM, LinearEncoder +from mlagents.trainers.torch_entities.encoders import VectorInput +from mlagents.trainers.buffer import AgentBuffer +from mlagents.trainers.trajectory import ObsUtil +from mlagents.trainers.torch_entities.conditioning import ConditionalEncoder +from mlagents.trainers.torch_entities.attention import ( + EntityEmbedding, + ResidualSelfAttention, + get_zero_entities_mask, +) +from mlagents.trainers.exception import UnityTrainerException + + +ActivationFunction = Callable[[torch.Tensor], torch.Tensor] +EncoderFunction = Callable[ + [torch.Tensor, int, ActivationFunction, int, str, bool], torch.Tensor +] + +EPSILON = 1e-7 + + +class ObservationEncoder(nn.Module): + ATTENTION_EMBEDDING_SIZE = 128 # The embedding size of attention is fixed + + def __init__( + self, + observation_specs: List[ObservationSpec], + h_size: int, + vis_encode_type: EncoderType, + normalize: bool = False, + ): + """ + Returns an ObservationEncoder that can process and encode a set of observations. + Will use an RSA if needed for variable length observations. + """ + super().__init__() + self.processors, self.embedding_sizes = ModelUtils.create_input_processors( + observation_specs, + h_size, + vis_encode_type, + self.ATTENTION_EMBEDDING_SIZE, + normalize=normalize, + ) + self.rsa, self.x_self_encoder = ModelUtils.create_residual_self_attention( + self.processors, self.embedding_sizes, self.ATTENTION_EMBEDDING_SIZE + ) + if self.rsa is not None: + total_enc_size = sum(self.embedding_sizes) + self.ATTENTION_EMBEDDING_SIZE + else: + total_enc_size = sum(self.embedding_sizes) + self.normalize = normalize + self._total_enc_size = total_enc_size + + self._total_goal_enc_size = 0 + self._goal_processor_indices: List[int] = [] + for i in range(len(observation_specs)): + if observation_specs[i].observation_type == ObservationType.GOAL_SIGNAL: + self._total_goal_enc_size += self.embedding_sizes[i] + self._goal_processor_indices.append(i) + + @property + def total_enc_size(self) -> int: + """ + Returns the total encoding size for this ObservationEncoder. 
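# ---- Example sketch (standalone): the export call above follows the standard
# torch.onnx.export pattern -- dummy inputs, explicit input/output names, and
# dynamic batch axes. A miniature with a toy model; the file path, tensor names,
# and opset below are arbitrary choices for illustration, not the trainer's settings:
import os
import tempfile
import torch
from torch import nn

toy_model = nn.Linear(4, 2)
dummy_input = torch.zeros(1, 4)
out_path = os.path.join(tempfile.mkdtemp(), "toy_policy.onnx")

torch.onnx.export(
    toy_model,
    dummy_input,
    out_path,
    opset_version=11,
    input_names=["obs_0"],
    output_names=["continuous_actions"],
    dynamic_axes={"obs_0": {0: "batch"}, "continuous_actions": {0: "batch"}},
)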
+ """ + return self._total_enc_size + + @property + def total_goal_enc_size(self) -> int: + """ + Returns the total goal encoding size for this ObservationEncoder. + """ + return self._total_goal_enc_size + + def update_normalization(self, buffer: AgentBuffer) -> None: + obs = ObsUtil.from_buffer(buffer, len(self.processors)) + for vec_input, enc in zip(obs, self.processors): + if isinstance(enc, VectorInput): + enc.update_normalization(torch.as_tensor(vec_input.to_ndarray())) + + def copy_normalization(self, other_encoder: "ObservationEncoder") -> None: + if self.normalize: + for n1, n2 in zip(self.processors, other_encoder.processors): + if isinstance(n1, VectorInput) and isinstance(n2, VectorInput): + n1.copy_normalization(n2) + + def forward(self, inputs: List[torch.Tensor]) -> torch.Tensor: + """ + Encode observations using a list of processors and an RSA. + :param inputs: List of Tensors corresponding to a set of obs. + """ + encodes = [] + var_len_processor_inputs: List[Tuple[nn.Module, torch.Tensor]] = [] + + for idx, processor in enumerate(self.processors): + if not isinstance(processor, EntityEmbedding): + # The input can be encoded without having to process other inputs + obs_input = inputs[idx] + processed_obs = processor(obs_input) + encodes.append(processed_obs) + else: + var_len_processor_inputs.append((processor, inputs[idx])) + if len(encodes) != 0: + encoded_self = torch.cat(encodes, dim=1) + input_exist = True + else: + input_exist = False + if len(var_len_processor_inputs) > 0 and self.rsa is not None: + # Some inputs need to be processed with a variable length encoder + masks = get_zero_entities_mask([p_i[1] for p_i in var_len_processor_inputs]) + embeddings: List[torch.Tensor] = [] + processed_self = ( + self.x_self_encoder(encoded_self) + if input_exist and self.x_self_encoder is not None + else None + ) + for processor, var_len_input in var_len_processor_inputs: + embeddings.append(processor(processed_self, var_len_input)) + qkv = torch.cat(embeddings, dim=1) + attention_embedding = self.rsa(qkv, masks) + if not input_exist: + encoded_self = torch.cat([attention_embedding], dim=1) + input_exist = True + else: + encoded_self = torch.cat([encoded_self, attention_embedding], dim=1) + + if not input_exist: + raise UnityTrainerException( + "The trainer was unable to process any of the provided inputs. " + "Make sure the trained agents has at least one sensor attached to them." + ) + + return encoded_self + + def get_goal_encoding(self, inputs: List[torch.Tensor]) -> torch.Tensor: + """ + Encode observations corresponding to goals using a list of processors. + :param inputs: List of Tensors corresponding to a set of obs. + """ + encodes = [] + for idx in self._goal_processor_indices: + processor = self.processors[idx] + if not isinstance(processor, EntityEmbedding): + # The input can be encoded without having to process other inputs + obs_input = inputs[idx] + processed_obs = processor(obs_input) + encodes.append(processed_obs) + else: + raise UnityTrainerException( + "The one of the goals uses variable length observations. This use " + "case is not supported." + ) + if len(encodes) != 0: + encoded = torch.cat(encodes, dim=1) + else: + raise UnityTrainerException( + "Trainer was unable to process any of the goals provided as input." 
+ ) + return encoded + + +class NetworkBody(nn.Module): + def __init__( + self, + observation_specs: List[ObservationSpec], + network_settings: NetworkSettings, + encoded_act_size: int = 0, + ): + super().__init__() + self.normalize = network_settings.normalize + self.use_lstm = network_settings.memory is not None + self.h_size = network_settings.hidden_units + self.m_size = ( + network_settings.memory.memory_size + if network_settings.memory is not None + else 0 + ) + self.observation_encoder = ObservationEncoder( + observation_specs, + self.h_size, + network_settings.vis_encode_type, + self.normalize, + ) + self.processors = self.observation_encoder.processors + total_enc_size = self.observation_encoder.total_enc_size + total_enc_size += encoded_act_size + + if ( + self.observation_encoder.total_goal_enc_size > 0 + and network_settings.goal_conditioning_type == ConditioningType.HYPER + ): + self._body_endoder = ConditionalEncoder( + total_enc_size, + self.observation_encoder.total_goal_enc_size, + self.h_size, + network_settings.num_layers, + 1, + ) + else: + self._body_endoder = LinearEncoder( + total_enc_size, network_settings.num_layers, self.h_size + ) + + if self.use_lstm: + self.lstm = LSTM(self.h_size, self.m_size) + else: + self.lstm = None # type: ignore + + def update_normalization(self, buffer: AgentBuffer) -> None: + self.observation_encoder.update_normalization(buffer) + + def copy_normalization(self, other_network: "NetworkBody") -> None: + self.observation_encoder.copy_normalization(other_network.observation_encoder) + + @property + def memory_size(self) -> int: + return self.lstm.memory_size if self.use_lstm else 0 + + def forward( + self, + inputs: List[torch.Tensor], + actions: Optional[torch.Tensor] = None, + memories: Optional[torch.Tensor] = None, + sequence_length: int = 1, + ) -> Tuple[torch.Tensor, torch.Tensor]: + encoded_self = self.observation_encoder(inputs) + if actions is not None: + encoded_self = torch.cat([encoded_self, actions], dim=1) + if isinstance(self._body_endoder, ConditionalEncoder): + goal = self.observation_encoder.get_goal_encoding(inputs) + encoding = self._body_endoder(encoded_self, goal) + else: + encoding = self._body_endoder(encoded_self) + + if self.use_lstm: + # Resize to (batch, sequence length, encoding size) + encoding = encoding.reshape([-1, sequence_length, self.h_size]) + encoding, memories = self.lstm(encoding, memories) + encoding = encoding.reshape([-1, self.m_size // 2]) + return encoding, memories + + +class MultiAgentNetworkBody(torch.nn.Module): + """ + A network body that uses a self attention layer to handle state + and action input from a potentially variable number of agents that + share the same observation and action space. 
+ """ + + def __init__( + self, + observation_specs: List[ObservationSpec], + network_settings: NetworkSettings, + action_spec: ActionSpec, + ): + super().__init__() + self.normalize = network_settings.normalize + self.use_lstm = network_settings.memory is not None + self.h_size = network_settings.hidden_units + self.m_size = ( + network_settings.memory.memory_size + if network_settings.memory is not None + else 0 + ) + self.action_spec = action_spec + self.observation_encoder = ObservationEncoder( + observation_specs, + self.h_size, + network_settings.vis_encode_type, + self.normalize, + ) + self.processors = self.observation_encoder.processors + + # Modules for multi-agent self-attention + obs_only_ent_size = self.observation_encoder.total_enc_size + q_ent_size = ( + obs_only_ent_size + + sum(self.action_spec.discrete_branches) + + self.action_spec.continuous_size + ) + + attention_embeding_size = self.h_size + self.obs_encoder = EntityEmbedding( + obs_only_ent_size, None, attention_embeding_size + ) + self.obs_action_encoder = EntityEmbedding( + q_ent_size, None, attention_embeding_size + ) + + self.self_attn = ResidualSelfAttention(attention_embeding_size) + + self.linear_encoder = LinearEncoder( + attention_embeding_size, + network_settings.num_layers, + self.h_size, + kernel_gain=(0.125 / self.h_size) ** 0.5, + ) + + if self.use_lstm: + self.lstm = LSTM(self.h_size, self.m_size) + else: + self.lstm = None # type: ignore + self._current_max_agents = torch.nn.Parameter( + torch.as_tensor(1), requires_grad=False + ) + + @property + def memory_size(self) -> int: + return self.lstm.memory_size if self.use_lstm else 0 + + def update_normalization(self, buffer: AgentBuffer) -> None: + self.observation_encoder.update_normalization(buffer) + + def copy_normalization(self, other_network: "MultiAgentNetworkBody") -> None: + self.observation_encoder.copy_normalization(other_network.observation_encoder) + + def _get_masks_from_nans(self, obs_tensors: List[torch.Tensor]) -> torch.Tensor: + """ + Get attention masks by grabbing an arbitrary obs across all the agents + Since these are raw obs, the padded values are still NaN + """ + only_first_obs = [_all_obs[0] for _all_obs in obs_tensors] + # Just get the first element in each obs regardless of its dimension. This will speed up + # searching for NaNs. + only_first_obs_flat = torch.stack( + [_obs.flatten(start_dim=1)[:, 0] for _obs in only_first_obs], dim=1 + ) + # Get the mask from NaNs + attn_mask = only_first_obs_flat.isnan().float() + return attn_mask + + def _copy_and_remove_nans_from_obs( + self, all_obs: List[List[torch.Tensor]], attention_mask: torch.Tensor + ) -> List[List[torch.Tensor]]: + """ + Helper function to remove NaNs from observations using an attention mask. + """ + obs_with_no_nans = [] + for i_agent, single_agent_obs in enumerate(all_obs): + no_nan_obs = [] + for obs in single_agent_obs: + new_obs = obs.clone() + new_obs[attention_mask.bool()[:, i_agent], ::] = 0.0 # Remove NaNs fast + no_nan_obs.append(new_obs) + obs_with_no_nans.append(no_nan_obs) + return obs_with_no_nans + + def forward( + self, + obs_only: List[List[torch.Tensor]], + obs: List[List[torch.Tensor]], + actions: List[AgentAction], + memories: Optional[torch.Tensor] = None, + sequence_length: int = 1, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Returns sampled actions. + If memory is enabled, return the memories as well. + :param obs_only: Observations to be processed that do not have corresponding actions. + These are encoded with the obs_encoder. 
+ :param obs: Observations to be processed that do have corresponding actions. + After concatenation with actions, these are processed with obs_action_encoder. + :param actions: After concatenation with obs, these are processed with obs_action_encoder. + :param memories: If using memory, a Tensor of initial memories. + :param sequence_length: If using memory, the sequence length. + """ + self_attn_masks = [] + self_attn_inputs = [] + concat_f_inp = [] + if obs: + obs_attn_mask = self._get_masks_from_nans(obs) + obs = self._copy_and_remove_nans_from_obs(obs, obs_attn_mask) + for inputs, action in zip(obs, actions): + encoded = self.observation_encoder(inputs) + cat_encodes = [ + encoded, + action.to_flat(self.action_spec.discrete_branches), + ] + concat_f_inp.append(torch.cat(cat_encodes, dim=1)) + f_inp = torch.stack(concat_f_inp, dim=1) + self_attn_masks.append(obs_attn_mask) + self_attn_inputs.append(self.obs_action_encoder(None, f_inp)) + + concat_encoded_obs = [] + if obs_only: + obs_only_attn_mask = self._get_masks_from_nans(obs_only) + obs_only = self._copy_and_remove_nans_from_obs(obs_only, obs_only_attn_mask) + for inputs in obs_only: + encoded = self.observation_encoder(inputs) + concat_encoded_obs.append(encoded) + g_inp = torch.stack(concat_encoded_obs, dim=1) + self_attn_masks.append(obs_only_attn_mask) + self_attn_inputs.append(self.obs_encoder(None, g_inp)) + + encoded_entity = torch.cat(self_attn_inputs, dim=1) + encoded_state = self.self_attn(encoded_entity, self_attn_masks) + + flipped_masks = 1 - torch.cat(self_attn_masks, dim=1) + num_agents = torch.sum(flipped_masks, dim=1, keepdim=True) + if torch.max(num_agents).item() > self._current_max_agents: + self._current_max_agents = torch.nn.Parameter( + torch.as_tensor(torch.max(num_agents).item()), requires_grad=False + ) + + # num_agents will be -1 for a single agent and +1 when the current maximum is reached + num_agents = num_agents * 2.0 / self._current_max_agents - 1 + + encoding = self.linear_encoder(encoded_state) + if self.use_lstm: + # Resize to (batch, sequence length, encoding size) + encoding = encoding.reshape([-1, sequence_length, self.h_size]) + encoding, memories = self.lstm(encoding, memories) + encoding = encoding.reshape([-1, self.m_size // 2]) + encoding = torch.cat([encoding, num_agents], dim=1) + return encoding, memories + + +class Critic(abc.ABC): + @abc.abstractmethod + def update_normalization(self, buffer: AgentBuffer) -> None: + """ + Updates normalization of Actor based on the provided List of vector obs. + :param vector_obs: A List of vector obs as tensors. + """ + pass + + def critic_pass( + self, + inputs: List[torch.Tensor], + memories: Optional[torch.Tensor] = None, + sequence_length: int = 1, + ) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]: + """ + Get value outputs for the given obs. + :param inputs: List of inputs as tensors. + :param memories: Tensor of memories, if using memory. Otherwise, None. + :returns: Dict of reward stream to output tensor for values. 
+ """ + pass + + +class ValueNetwork(nn.Module, Critic): + def __init__( + self, + stream_names: List[str], + observation_specs: List[ObservationSpec], + network_settings: NetworkSettings, + encoded_act_size: int = 0, + outputs_per_stream: int = 1, + ): + + # This is not a typo, we want to call __init__ of nn.Module + nn.Module.__init__(self) + self.network_body = NetworkBody( + observation_specs, network_settings, encoded_act_size=encoded_act_size + ) + if network_settings.memory is not None: + encoding_size = network_settings.memory.memory_size // 2 + else: + encoding_size = network_settings.hidden_units + self.value_heads = ValueHeads(stream_names, encoding_size, outputs_per_stream) + + def update_normalization(self, buffer: AgentBuffer) -> None: + self.network_body.update_normalization(buffer) + + @property + def memory_size(self) -> int: + return self.network_body.memory_size + + def critic_pass( + self, + inputs: List[torch.Tensor], + memories: Optional[torch.Tensor] = None, + sequence_length: int = 1, + ) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]: + value_outputs, critic_mem_out = self.forward( + inputs, memories=memories, sequence_length=sequence_length + ) + return value_outputs, critic_mem_out + + def forward( + self, + inputs: List[torch.Tensor], + actions: Optional[torch.Tensor] = None, + memories: Optional[torch.Tensor] = None, + sequence_length: int = 1, + ) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]: + encoding, memories = self.network_body( + inputs, actions, memories, sequence_length + ) + output = self.value_heads(encoding) + return output, memories + + +class Actor(abc.ABC): + @abc.abstractmethod + def update_normalization(self, buffer: AgentBuffer) -> None: + """ + Updates normalization of Actor based on the provided List of vector obs. + :param vector_obs: A List of vector obs as tensors. + """ + pass + + def get_action_and_stats( + self, + inputs: List[torch.Tensor], + masks: Optional[torch.Tensor] = None, + memories: Optional[torch.Tensor] = None, + sequence_length: int = 1, + ) -> Tuple[AgentAction, Dict[str, Any], torch.Tensor]: + """ + Returns sampled actions. + If memory is enabled, return the memories as well. + :param inputs: A List of inputs as tensors. + :param masks: If using discrete actions, a Tensor of action masks. + :param memories: If using memory, a Tensor of initial memories. + :param sequence_length: If using memory, the sequence length. + :return: A Tuple of AgentAction, ActionLogProbs, entropies, and memories. + Memories will be None if not using memory. + """ + pass + + def get_stats( + self, + inputs: List[torch.Tensor], + actions: AgentAction, + masks: Optional[torch.Tensor] = None, + memories: Optional[torch.Tensor] = None, + sequence_length: int = 1, + ) -> Dict[str, Any]: + """ + Returns log_probs for actions and entropies. + If memory is enabled, return the memories as well. + :param inputs: A List of inputs as tensors. + :param actions: AgentAction of actions. + :param masks: If using discrete actions, a Tensor of action masks. + :param memories: If using memory, a Tensor of initial memories. + :param sequence_length: If using memory, the sequence length. + :return: A Tuple of AgentAction, ActionLogProbs, entropies, and memories. + Memories will be None if not using memory. 
+ """ + + pass + + @abc.abstractmethod + def forward( + self, + inputs: List[torch.Tensor], + masks: Optional[torch.Tensor] = None, + memories: Optional[torch.Tensor] = None, + ) -> Tuple[Union[int, torch.Tensor], ...]: + """ + Forward pass of the Actor for inference. This is required for export to ONNX, and + the inputs and outputs of this method should not be changed without a respective change + in the ONNX export code. + """ + pass + + +class SimpleActor(nn.Module, Actor): + MODEL_EXPORT_VERSION = 3 # Corresponds to ModelApiVersion.MLAgents2_0 + + def __init__( + self, + observation_specs: List[ObservationSpec], + network_settings: NetworkSettings, + action_spec: ActionSpec, + conditional_sigma: bool = False, + tanh_squash: bool = False, + ): + super().__init__() + self.action_spec = action_spec + self.version_number = torch.nn.Parameter( + torch.Tensor([self.MODEL_EXPORT_VERSION]), requires_grad=False + ) + self.is_continuous_int_deprecated = torch.nn.Parameter( + torch.Tensor([int(self.action_spec.is_continuous())]), requires_grad=False + ) + self.continuous_act_size_vector = torch.nn.Parameter( + torch.Tensor([int(self.action_spec.continuous_size)]), requires_grad=False + ) + self.discrete_act_size_vector = torch.nn.Parameter( + torch.Tensor([self.action_spec.discrete_branches]), requires_grad=False + ) + self.act_size_vector_deprecated = torch.nn.Parameter( + torch.Tensor( + [ + self.action_spec.continuous_size + + sum(self.action_spec.discrete_branches) + ] + ), + requires_grad=False, + ) + self.network_body = NetworkBody(observation_specs, network_settings) + if network_settings.memory is not None: + self.encoding_size = network_settings.memory.memory_size // 2 + else: + self.encoding_size = network_settings.hidden_units + self.memory_size_vector = torch.nn.Parameter( + torch.Tensor([int(self.network_body.memory_size)]), requires_grad=False + ) + + self.action_model = ActionModel( + self.encoding_size, + action_spec, + conditional_sigma=conditional_sigma, + tanh_squash=tanh_squash, + deterministic=network_settings.deterministic, + ) + + @property + def memory_size(self) -> int: + return self.network_body.memory_size + + def update_normalization(self, buffer: AgentBuffer) -> None: + self.network_body.update_normalization(buffer) + + def get_action_and_stats( + self, + inputs: List[torch.Tensor], + masks: Optional[torch.Tensor] = None, + memories: Optional[torch.Tensor] = None, + sequence_length: int = 1, + ) -> Tuple[AgentAction, Dict[str, Any], torch.Tensor]: + + encoding, memories = self.network_body( + inputs, memories=memories, sequence_length=sequence_length + ) + action, log_probs, entropies = self.action_model(encoding, masks) + run_out = {} + # This is the clipped action which is not saved to the buffer + # but is exclusively sent to the environment. 
+ run_out["env_action"] = action.to_action_tuple( + clip=self.action_model.clip_action + ) + run_out["log_probs"] = log_probs + run_out["entropy"] = entropies + + return action, run_out, memories + + def get_stats( + self, + inputs: List[torch.Tensor], + actions: AgentAction, + masks: Optional[torch.Tensor] = None, + memories: Optional[torch.Tensor] = None, + sequence_length: int = 1, + ) -> Dict[str, Any]: + encoding, actor_mem_outs = self.network_body( + inputs, memories=memories, sequence_length=sequence_length + ) + + log_probs, entropies = self.action_model.evaluate(encoding, masks, actions) + run_out = {} + run_out["log_probs"] = log_probs + run_out["entropy"] = entropies + return run_out + + def forward( + self, + inputs: List[torch.Tensor], + masks: Optional[torch.Tensor] = None, + memories: Optional[torch.Tensor] = None, + ) -> Tuple[Union[int, torch.Tensor], ...]: + """ + Note: This forward() method is required for exporting to ONNX. Don't modify the inputs and outputs. + + At this moment, torch.onnx.export() doesn't accept None as tensor to be exported, + so the size of return tuple varies with action spec. + """ + encoding, memories_out = self.network_body( + inputs, memories=memories, sequence_length=1 + ) + + ( + cont_action_out, + disc_action_out, + action_out_deprecated, + deterministic_cont_action_out, + deterministic_disc_action_out, + ) = self.action_model.get_action_out(encoding, masks) + export_out = [self.version_number, self.memory_size_vector] + if self.action_spec.continuous_size > 0: + export_out += [ + cont_action_out, + self.continuous_act_size_vector, + deterministic_cont_action_out, + ] + if self.action_spec.discrete_size > 0: + export_out += [ + disc_action_out, + self.discrete_act_size_vector, + deterministic_disc_action_out, + ] + if self.network_body.memory_size > 0: + export_out += [memories_out] + return tuple(export_out) + + +class SharedActorCritic(SimpleActor, Critic): + def __init__( + self, + observation_specs: List[ObservationSpec], + network_settings: NetworkSettings, + action_spec: ActionSpec, + stream_names: List[str], + conditional_sigma: bool = False, + tanh_squash: bool = False, + ): + self.use_lstm = network_settings.memory is not None + super().__init__( + observation_specs, + network_settings, + action_spec, + conditional_sigma, + tanh_squash, + ) + self.stream_names = stream_names + self.value_heads = ValueHeads(stream_names, self.encoding_size) + + def critic_pass( + self, + inputs: List[torch.Tensor], + memories: Optional[torch.Tensor] = None, + sequence_length: int = 1, + ) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]: + encoding, memories_out = self.network_body( + inputs, memories=memories, sequence_length=sequence_length + ) + return self.value_heads(encoding), memories_out + + +class GlobalSteps(nn.Module): + def __init__(self): + super().__init__() + self.__global_step = nn.Parameter( + torch.Tensor([0]).to(torch.int64), requires_grad=False + ) + + @property + def current_step(self): + return int(self.__global_step.item()) + + @current_step.setter + def current_step(self, value): + self.__global_step[:] = value + + def increment(self, value): + self.__global_step += value + + +class LearningRate(nn.Module): + def __init__(self, lr): + # Todo: add learning rate decay + super().__init__() + self.learning_rate = torch.Tensor([lr]) diff --git a/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/utils.py b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/utils.py new file mode 100644 index 
0000000000000000000000000000000000000000..048ce8b59174c1b9ac5a2520b4b36f858754f9ac --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/torch_entities/utils.py @@ -0,0 +1,452 @@ +from typing import List, Optional, Tuple, Dict +from mlagents.torch_utils import torch, nn +from mlagents.trainers.torch_entities.layers import LinearEncoder, Initialization +import numpy as np + +from mlagents.trainers.torch_entities.encoders import ( + SimpleVisualEncoder, + ResNetVisualEncoder, + NatureVisualEncoder, + SmallVisualEncoder, + FullyConnectedVisualEncoder, + VectorInput, +) +from mlagents.trainers.settings import EncoderType, ScheduleType +from mlagents.trainers.torch_entities.attention import ( + EntityEmbedding, + ResidualSelfAttention, +) +from mlagents.trainers.exception import UnityTrainerException +from mlagents_envs.base_env import ObservationSpec, DimensionProperty + + +class ModelUtils: + # Minimum supported side for each encoder type. If refactoring an encoder, please + # adjust these also. + MIN_RESOLUTION_FOR_ENCODER = { + EncoderType.FULLY_CONNECTED: 1, + EncoderType.MATCH3: 5, + EncoderType.SIMPLE: 20, + EncoderType.NATURE_CNN: 36, + EncoderType.RESNET: 15, + } + + VALID_VISUAL_PROP = frozenset( + [ + ( + DimensionProperty.TRANSLATIONAL_EQUIVARIANCE, + DimensionProperty.TRANSLATIONAL_EQUIVARIANCE, + DimensionProperty.NONE, + ), + (DimensionProperty.UNSPECIFIED,) * 3, + ] + ) + + VALID_VECTOR_PROP = frozenset( + [(DimensionProperty.NONE,), (DimensionProperty.UNSPECIFIED,)] + ) + + VALID_VAR_LEN_PROP = frozenset( + [(DimensionProperty.VARIABLE_SIZE, DimensionProperty.NONE)] + ) + + @staticmethod + def update_learning_rate(optim: torch.optim.Optimizer, lr: float) -> None: + """ + Apply a learning rate to a torch optimizer. + :param optim: Optimizer + :param lr: Learning rate + """ + for param_group in optim.param_groups: + param_group["lr"] = lr + + class DecayedValue: + def __init__( + self, + schedule: ScheduleType, + initial_value: float, + min_value: float, + max_step: int, + ): + """ + Object that represnets value of a parameter that should be decayed, assuming it is a function of + global_step. + :param schedule: Type of learning rate schedule. + :param initial_value: Initial value before decay. + :param min_value: Decay value to this value by max_step. + :param max_step: The final step count where the return value should equal min_value. + :param global_step: The current step count. + :return: The value. + """ + self.schedule = schedule + self.initial_value = initial_value + self.min_value = min_value + self.max_step = max_step + + def get_value(self, global_step: int) -> float: + """ + Get the value at a given global step. + :param global_step: Step count. + :returns: Decayed value at this global step. + """ + if self.schedule == ScheduleType.CONSTANT: + return self.initial_value + elif self.schedule == ScheduleType.LINEAR: + return ModelUtils.polynomial_decay( + self.initial_value, self.min_value, self.max_step, global_step + ) + else: + raise UnityTrainerException(f"The schedule {self.schedule} is invalid.") + + @staticmethod + def polynomial_decay( + initial_value: float, + min_value: float, + max_step: int, + global_step: int, + power: float = 1.0, + ) -> float: + """ + Get a decayed value based on a polynomial schedule, with respect to the current global step. + :param initial_value: Initial value before decay. + :param min_value: Decay value to this value by max_step. + :param max_step: The final step count where the return value should equal min_value. 
+ :param global_step: The current step count. + :param power: Power of polynomial decay. 1.0 (default) is a linear decay. + :return: The current decayed value. + """ + global_step = min(global_step, max_step) + decayed_value = (initial_value - min_value) * ( + 1 - float(global_step) / max_step + ) ** (power) + min_value + return decayed_value + + @staticmethod + def get_encoder_for_type(encoder_type: EncoderType) -> nn.Module: + ENCODER_FUNCTION_BY_TYPE = { + EncoderType.SIMPLE: SimpleVisualEncoder, + EncoderType.NATURE_CNN: NatureVisualEncoder, + EncoderType.RESNET: ResNetVisualEncoder, + EncoderType.MATCH3: SmallVisualEncoder, + EncoderType.FULLY_CONNECTED: FullyConnectedVisualEncoder, + } + return ENCODER_FUNCTION_BY_TYPE.get(encoder_type) + + @staticmethod + def _check_resolution_for_encoder( + height: int, width: int, vis_encoder_type: EncoderType + ) -> None: + min_res = ModelUtils.MIN_RESOLUTION_FOR_ENCODER[vis_encoder_type] + if height < min_res or width < min_res: + raise UnityTrainerException( + f"Visual observation resolution ({width}x{height}) is too small for" + f"the provided EncoderType ({vis_encoder_type.value}). The min dimension is {min_res}" + ) + + @staticmethod + def get_encoder_for_obs( + obs_spec: ObservationSpec, + normalize: bool, + h_size: int, + attention_embedding_size: int, + vis_encode_type: EncoderType, + ) -> Tuple[nn.Module, int]: + """ + Returns the encoder and the size of the appropriate encoder. + :param shape: Tuples that represent the observation dimension. + :param normalize: Normalize all vector inputs. + :param h_size: Number of hidden units per layer excluding attention layers. + :param attention_embedding_size: Number of hidden units per attention layer. + :param vis_encode_type: Type of visual encoder to use. + """ + shape = obs_spec.shape + dim_prop = obs_spec.dimension_property + + # VISUAL + if dim_prop in ModelUtils.VALID_VISUAL_PROP: + visual_encoder_class = ModelUtils.get_encoder_for_type(vis_encode_type) + ModelUtils._check_resolution_for_encoder( + shape[0], shape[1], vis_encode_type + ) + return (visual_encoder_class(shape[0], shape[1], shape[2], h_size), h_size) + # VECTOR + if dim_prop in ModelUtils.VALID_VECTOR_PROP: + return (VectorInput(shape[0], normalize), shape[0]) + # VARIABLE LENGTH + if dim_prop in ModelUtils.VALID_VAR_LEN_PROP: + return ( + EntityEmbedding( + entity_size=shape[1], + entity_num_max_elements=shape[0], + embedding_size=attention_embedding_size, + ), + 0, + ) + # OTHER + raise UnityTrainerException(f"Unsupported Sensor with specs {obs_spec}") + + @staticmethod + def create_input_processors( + observation_specs: List[ObservationSpec], + h_size: int, + vis_encode_type: EncoderType, + attention_embedding_size: int, + normalize: bool = False, + ) -> Tuple[nn.ModuleList, List[int]]: + """ + Creates visual and vector encoders, along with their normalizers. + :param observation_specs: List of ObservationSpec that represent the observation dimensions. + :param action_size: Number of additional un-normalized inputs to each vector encoder. Used for + conditioning network on other values (e.g. actions for a Q function) + :param h_size: Number of hidden units per layer excluding attention layers. + :param attention_embedding_size: Number of hidden units per attention layer. + :param vis_encode_type: Type of visual encoder to use. + :param unnormalized_inputs: Vector inputs that should not be normalized, and added to the vector + obs. + :param normalize: Normalize all vector inputs. 
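# ---- Example sketch (standalone re-statement of the formula above, sample numbers
# only): with power=1.0 the decay is a linear interpolation from initial_value to
# min_value over max_step, clamped once max_step is passed:
def linear_decay(initial_value, min_value, max_step, global_step, power=1.0):
    global_step = min(global_step, max_step)
    return (initial_value - min_value) * (1 - global_step / max_step) ** power + min_value

assert abs(linear_decay(3e-4, 1e-5, 1000, 0) - 3e-4) < 1e-12
assert abs(linear_decay(3e-4, 1e-5, 1000, 500) - 1.55e-4) < 1e-12
assert abs(linear_decay(3e-4, 1e-5, 1000, 5000) - 1e-5) < 1e-12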
+ :return: Tuple of : + - ModuleList of the encoders + - A list of embedding sizes (0 if the input requires to be processed with a variable length + observation encoder) + """ + encoders: List[nn.Module] = [] + embedding_sizes: List[int] = [] + for obs_spec in observation_specs: + encoder, embedding_size = ModelUtils.get_encoder_for_obs( + obs_spec, normalize, h_size, attention_embedding_size, vis_encode_type + ) + encoders.append(encoder) + embedding_sizes.append(embedding_size) + + x_self_size = sum(embedding_sizes) # The size of the "self" embedding + if x_self_size > 0: + for enc in encoders: + if isinstance(enc, EntityEmbedding): + enc.add_self_embedding(attention_embedding_size) + return (nn.ModuleList(encoders), embedding_sizes) + + @staticmethod + def list_to_tensor( + ndarray_list: List[np.ndarray], dtype: Optional[torch.dtype] = torch.float32 + ) -> torch.Tensor: + """ + Converts a list of numpy arrays into a tensor. MUCH faster than + calling as_tensor on the list directly. + """ + return torch.as_tensor(np.asanyarray(ndarray_list), dtype=dtype) + + @staticmethod + def list_to_tensor_list( + ndarray_list: List[np.ndarray], dtype: Optional[torch.dtype] = torch.float32 + ) -> torch.Tensor: + """ + Converts a list of numpy arrays into a list of tensors. MUCH faster than + calling as_tensor on the list directly. + """ + return [ + torch.as_tensor(np.asanyarray(_arr), dtype=dtype) for _arr in ndarray_list + ] + + @staticmethod + def to_numpy(tensor: torch.Tensor) -> np.ndarray: + """ + Converts a Torch Tensor to a numpy array. If the Tensor is on the GPU, it will + be brought to the CPU. + """ + return tensor.detach().cpu().numpy() + + @staticmethod + def break_into_branches( + concatenated_logits: torch.Tensor, action_size: List[int] + ) -> List[torch.Tensor]: + """ + Takes a concatenated set of logits that represent multiple discrete action branches + and breaks it up into one Tensor per branch. + :param concatenated_logits: Tensor that represents the concatenated action branches + :param action_size: List of ints containing the number of possible actions for each branch. + :return: A List of Tensors containing one tensor per branch. + """ + action_idx = [0] + list(np.cumsum(action_size)) + branched_logits = [ + concatenated_logits[:, action_idx[i] : action_idx[i + 1]] + for i in range(len(action_size)) + ] + return branched_logits + + @staticmethod + def actions_to_onehot( + discrete_actions: torch.Tensor, action_size: List[int] + ) -> List[torch.Tensor]: + """ + Takes a tensor of discrete actions and turns it into a List of onehot encoding for each + action. + :param discrete_actions: Actions in integer form. + :param action_size: List of branch sizes. Should be of same size as discrete_actions' + last dimension. + :return: List of one-hot tensors, one representing each branch. + """ + onehot_branches = [ + torch.nn.functional.one_hot(_act.T, action_size[i]).float() + for i, _act in enumerate(discrete_actions.long().T) + ] + return onehot_branches + + @staticmethod + def dynamic_partition( + data: torch.Tensor, partitions: torch.Tensor, num_partitions: int + ) -> List[torch.Tensor]: + """ + Torch implementation of dynamic_partition : + https://www.tensorflow.org/api_docs/python/tf/dynamic_partition + Splits the data Tensor input into num_partitions Tensors according to the indices in + partitions. + :param data: The Tensor data that will be split into partitions. + :param partitions: An indices tensor that determines in which partition each element + of data will be in. 
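# ---- Example sketch (standalone, branch sizes [2, 3] chosen for illustration):
# break_into_branches above slices a concatenated logit tensor back into per-branch
# tensors using cumulative branch sizes, and actions_to_onehot does the analogous
# conversion for integer actions:
import numpy as np
import torch

action_size = [2, 3]
concatenated_logits = torch.arange(10.0).reshape(2, 5)   # batch of 2, 2 + 3 logits

action_idx = [0] + list(np.cumsum(action_size))
branches = [
    concatenated_logits[:, action_idx[i]: action_idx[i + 1]]
    for i in range(len(action_size))
]
assert [tuple(b.shape) for b in branches] == [(2, 2), (2, 3)]

discrete_actions = torch.tensor([[1, 2], [0, 1]])        # one column per branch
onehots = [
    torch.nn.functional.one_hot(col, action_size[i]).float()
    for i, col in enumerate(discrete_actions.long().T)
]
assert [tuple(o.shape) for o in onehots] == [(2, 2), (2, 3)]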
+ :param num_partitions: The number of partitions to output. Corresponds to the + maximum possible index in the partitions argument. + :return: A list of Tensor partitions (Their indices correspond to their partition index). + """ + res: List[torch.Tensor] = [] + for i in range(num_partitions): + res += [data[(partitions == i).nonzero().squeeze(1)]] + return res + + @staticmethod + def masked_mean(tensor: torch.Tensor, masks: torch.Tensor) -> torch.Tensor: + """ + Returns the mean of the tensor but ignoring the values specified by masks. + Used for masking out loss functions. + :param tensor: Tensor which needs mean computation. + :param masks: Boolean tensor of masks with same dimension as tensor. + """ + if tensor.ndim == 0: + return (tensor * masks).sum() / torch.clamp( + (torch.ones_like(tensor) * masks).float().sum(), min=1.0 + ) + else: + return ( + tensor.permute(*torch.arange(tensor.ndim - 1, -1, -1)) * masks + ).sum() / torch.clamp( + ( + torch.ones_like( + tensor.permute(*torch.arange(tensor.ndim - 1, -1, -1)) + ) + * masks + ) + .float() + .sum(), + min=1.0, + ) + + @staticmethod + def soft_update(source: nn.Module, target: nn.Module, tau: float) -> None: + """ + Performs an in-place polyak update of the target module based on the source, + by a ratio of tau. Note that source and target modules must have the same + parameters, where: + target = tau * source + (1-tau) * target + :param source: Source module whose parameters will be used. + :param target: Target module whose parameters will be updated. + :param tau: Percentage of source parameters to use in average. Setting tau to + 1 will copy the source parameters to the target. + """ + with torch.no_grad(): + for source_param, target_param in zip( + source.parameters(), target.parameters() + ): + target_param.data.mul_(1.0 - tau) + torch.add( + target_param.data, + source_param.data, + alpha=tau, + out=target_param.data, + ) + + @staticmethod + def create_residual_self_attention( + input_processors: nn.ModuleList, embedding_sizes: List[int], hidden_size: int + ) -> Tuple[Optional[ResidualSelfAttention], Optional[LinearEncoder]]: + """ + Creates an RSA if there are variable length observations found in the input processors. + :param input_processors: A ModuleList of input processors as returned by the function + create_input_processors(). + :param embedding sizes: A List of embedding sizes as returned by create_input_processors(). + :param hidden_size: The hidden size to use for the RSA. + :returns: A Tuple of the RSA itself, a self encoder, and the embedding size after the RSA. + Returns None for the RSA and encoder if no var len inputs are detected. 
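The in-place update in soft_update is standard Polyak averaging; an equivalent minimal sketch using mul_/add_ in place of torch.add(..., out=...) (tensor values illustrative):

import torch

tau = 0.005
source_param = torch.tensor([1.0, 2.0])
target_param = torch.tensor([0.0, 0.0])

# target <- tau * source + (1 - tau) * target, applied parameter by parameter.
with torch.no_grad():
    target_param.mul_(1.0 - tau).add_(source_param, alpha=tau)

assert torch.allclose(target_param, tau * source_param)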
+ """ + rsa, x_self_encoder = None, None + entity_num_max: int = 0 + var_processors = [p for p in input_processors if isinstance(p, EntityEmbedding)] + for processor in var_processors: + entity_max: int = processor.entity_num_max_elements + # Only adds entity max if it was known at construction + if entity_max > 0: + entity_num_max += entity_max + if len(var_processors) > 0: + if sum(embedding_sizes): + x_self_encoder = LinearEncoder( + sum(embedding_sizes), + 1, + hidden_size, + kernel_init=Initialization.Normal, + kernel_gain=(0.125 / hidden_size) ** 0.5, + ) + rsa = ResidualSelfAttention(hidden_size, entity_num_max) + return rsa, x_self_encoder + + @staticmethod + def trust_region_value_loss( + values: Dict[str, torch.Tensor], + old_values: Dict[str, torch.Tensor], + returns: Dict[str, torch.Tensor], + epsilon: float, + loss_masks: torch.Tensor, + ) -> torch.Tensor: + """ + Evaluates value loss, clipping to stay within a trust region of old value estimates. + Used for PPO and POCA. + :param values: Value output of the current network. + :param old_values: Value stored with experiences in buffer. + :param returns: Computed returns. + :param epsilon: Clipping value for value estimate. + :param loss_mask: Mask for losses. Used with LSTM to ignore 0'ed out experiences. + """ + value_losses = [] + for name, head in values.items(): + old_val_tensor = old_values[name] + returns_tensor = returns[name] + clipped_value_estimate = old_val_tensor + torch.clamp( + head - old_val_tensor, -1 * epsilon, epsilon + ) + v_opt_a = (returns_tensor - head) ** 2 + v_opt_b = (returns_tensor - clipped_value_estimate) ** 2 + value_loss = ModelUtils.masked_mean(torch.max(v_opt_a, v_opt_b), loss_masks) + value_losses.append(value_loss) + value_loss = torch.mean(torch.stack(value_losses)) + return value_loss + + @staticmethod + def trust_region_policy_loss( + advantages: torch.Tensor, + log_probs: torch.Tensor, + old_log_probs: torch.Tensor, + loss_masks: torch.Tensor, + epsilon: float, + ) -> torch.Tensor: + """ + Evaluate policy loss clipped to stay within a trust region. Used for PPO and POCA. + :param advantages: Computed advantages. + :param log_probs: Current policy probabilities + :param old_log_probs: Past policy probabilities + :param loss_masks: Mask for losses. Used with LSTM to ignore 0'ed out experiences. 
+ """ + advantage = advantages.unsqueeze(-1) + r_theta = torch.exp(log_probs - old_log_probs) + p_opt_a = r_theta * advantage + p_opt_b = torch.clamp(r_theta, 1.0 - epsilon, 1.0 + epsilon) * advantage + policy_loss = -1 * ModelUtils.masked_mean( + torch.min(p_opt_a, p_opt_b), loss_masks + ) + return policy_loss diff --git a/MLPY/Lib/site-packages/mlagents/trainers/trainer/__init__.py b/MLPY/Lib/site-packages/mlagents/trainers/trainer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..620008bdb17ab1dc338f57ec88cd1432c8ef09bd --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/trainer/__init__.py @@ -0,0 +1,2 @@ +from mlagents.trainers.trainer.trainer import Trainer # noqa +from mlagents.trainers.trainer.trainer_factory import TrainerFactory # noqa diff --git a/MLPY/Lib/site-packages/mlagents/trainers/trainer/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/trainer/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1f79ce0260d5d4eabf3e6b08c5510884f74fe6a4 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/trainer/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/trainer/__pycache__/off_policy_trainer.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/trainer/__pycache__/off_policy_trainer.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c801ab4ce9e1d7f549c81257c8bcc7aec39606fe Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/trainer/__pycache__/off_policy_trainer.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/trainer/__pycache__/on_policy_trainer.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/trainer/__pycache__/on_policy_trainer.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7064bbff51c55fdfee95b61780aefe2af55ffeb9 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/trainer/__pycache__/on_policy_trainer.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/trainer/__pycache__/rl_trainer.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/trainer/__pycache__/rl_trainer.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1ca252fd18aad88c4c7dee1ced18f8d696826432 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/trainer/__pycache__/rl_trainer.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/trainer/__pycache__/trainer.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/trainer/__pycache__/trainer.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5239e5cfad8df43f8a28775669086284998a8da3 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/trainer/__pycache__/trainer.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/trainer/__pycache__/trainer_factory.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents/trainers/trainer/__pycache__/trainer_factory.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..04f0743965819ded3d1695491f3118e541e67d65 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/trainer/__pycache__/trainer_factory.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/trainer/__pycache__/trainer_utils.cpython-39.pyc 
b/MLPY/Lib/site-packages/mlagents/trainers/trainer/__pycache__/trainer_utils.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..53c37c5080176463f2c939aaddc4fedf7fe67470 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents/trainers/trainer/__pycache__/trainer_utils.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents/trainers/trainer/off_policy_trainer.py b/MLPY/Lib/site-packages/mlagents/trainers/trainer/off_policy_trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..92ae4966928ab45b780815e94e64668fca7e1eb9 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/trainer/off_policy_trainer.py @@ -0,0 +1,263 @@ +# ## ML-Agent Learning (SAC) +# Contains an implementation of SAC as described in https://arxiv.org/abs/1801.01290 +# and implemented in https://github.com/hill-a/stable-baselines + +from collections import defaultdict +from typing import Dict, cast +import os + +import numpy as np +from mlagents.trainers.policy.checkpoint_manager import ModelCheckpoint + +from mlagents_envs.logging_util import get_logger +from mlagents_envs.timers import timed +from mlagents.trainers.buffer import RewardSignalUtil +from mlagents.trainers.policy import Policy +from mlagents.trainers.optimizer.torch_optimizer import TorchOptimizer +from mlagents.trainers.trainer.rl_trainer import RLTrainer +from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers +from mlagents.trainers.settings import TrainerSettings, OffPolicyHyperparamSettings + +logger = get_logger(__name__) + +BUFFER_TRUNCATE_PERCENT = 0.8 + + +class OffPolicyTrainer(RLTrainer): + """ + The SACTrainer is an implementation of the SAC algorithm, with support + for discrete actions and recurrent networks. + """ + + def __init__( + self, + behavior_name: str, + reward_buff_cap: int, + trainer_settings: TrainerSettings, + training: bool, + load: bool, + seed: int, + artifact_path: str, + ): + """ + Responsible for collecting experiences and training an off-policy model. + :param behavior_name: The name of the behavior associated with trainer config + :param reward_buff_cap: Max reward history to track in the reward buffer + :param trainer_settings: The parameters for the trainer. + :param training: Whether the trainer is set for training. + :param load: Whether the model should be loaded. + :param seed: The seed the model will be initialized with + :param artifact_path: The directory within which to store artifacts from this trainer. + """ + super().__init__( + behavior_name, + trainer_settings, + training, + load, + artifact_path, + reward_buff_cap, + ) + + self.seed = seed + self.policy: Policy = None # type: ignore + self.optimizer: TorchOptimizer = None # type: ignore + self.hyperparameters: OffPolicyHyperparamSettings = cast( + OffPolicyHyperparamSettings, trainer_settings.hyperparameters + ) + + self._step = 0 + + # Don't divide by zero + self.update_steps = 1 + self.reward_signal_update_steps = 1 + + self.steps_per_update = self.hyperparameters.steps_per_update + self.reward_signal_steps_per_update = ( + self.hyperparameters.reward_signal_steps_per_update + ) + + self.checkpoint_replay_buffer = self.hyperparameters.save_replay_buffer + + def _checkpoint(self) -> ModelCheckpoint: + """ + Writes a checkpoint model to memory + Overrides the default to save the replay buffer. 
+ """ + ckpt = super()._checkpoint() + if self.checkpoint_replay_buffer: + self.save_replay_buffer() + return ckpt + + def save_model(self) -> None: + """ + Saves the final training model to memory + Overrides the default to save the replay buffer. + """ + super().save_model() + if self.checkpoint_replay_buffer: + self.save_replay_buffer() + + def save_replay_buffer(self) -> None: + """ + Save the training buffer's update buffer to a pickle file. + """ + filename = os.path.join(self.artifact_path, "last_replay_buffer.hdf5") + logger.info(f"Saving Experience Replay Buffer to {filename}...") + with open(filename, "wb") as file_object: + self.update_buffer.save_to_file(file_object) + logger.info( + f"Saved Experience Replay Buffer ({os.path.getsize(filename)} bytes)." + ) + + def load_replay_buffer(self) -> None: + """ + Loads the last saved replay buffer from a file. + """ + filename = os.path.join(self.artifact_path, "last_replay_buffer.hdf5") + logger.info(f"Loading Experience Replay Buffer from {filename}...") + with open(filename, "rb+") as file_object: + self.update_buffer.load_from_file(file_object) + logger.debug( + "Experience replay buffer has {} experiences.".format( + self.update_buffer.num_experiences + ) + ) + + def _is_ready_update(self) -> bool: + """ + Returns whether or not the trainer has enough elements to run update model + :return: A boolean corresponding to whether or not _update_policy() can be run + """ + return ( + self.update_buffer.num_experiences >= self.hyperparameters.batch_size + and self._step >= self.hyperparameters.buffer_init_steps + ) + + def maybe_load_replay_buffer(self): + # Load the replay buffer if load + if self.load and self.checkpoint_replay_buffer: + try: + self.load_replay_buffer() + except (AttributeError, FileNotFoundError): + logger.warning( + "Replay buffer was unable to load, starting from scratch." + ) + logger.debug( + "Loaded update buffer with {} sequences".format( + self.update_buffer.num_experiences + ) + ) + + def add_policy( + self, parsed_behavior_id: BehaviorIdentifiers, policy: Policy + ) -> None: + """ + Adds policy to trainer. + """ + if self.policy: + logger.warning( + "Your environment contains multiple teams, but {} doesn't support adversarial games. Enable self-play to \ + train adversarial games.".format( + self.__class__.__name__ + ) + ) + self.policy = policy + self.policies[parsed_behavior_id.behavior_id] = policy + self.optimizer = self.create_optimizer() + for _reward_signal in self.optimizer.reward_signals.keys(): + self.collected_rewards[_reward_signal] = defaultdict(lambda: 0) + + self.model_saver.register(self.policy) + self.model_saver.register(self.optimizer) + self.model_saver.initialize_or_load() + + # Needed to resume loads properly + self._step = policy.get_current_step() + # Assume steps were updated at the correct ratio before + self.update_steps = int(max(1, self._step / self.steps_per_update)) + self.reward_signal_update_steps = int( + max(1, self._step / self.reward_signal_steps_per_update) + ) + + @timed + def _update_policy(self) -> bool: + """ + Uses update_buffer to update the policy. We sample the update_buffer and update + until the steps_per_update ratio is met. 
+ """ + has_updated = False + self.cumulative_returns_since_policy_update.clear() + n_sequences = max( + int(self.hyperparameters.batch_size / self.policy.sequence_length), 1 + ) + + batch_update_stats: Dict[str, list] = defaultdict(list) + while ( + self._step - self.hyperparameters.buffer_init_steps + ) / self.update_steps > self.steps_per_update: + logger.debug(f"Updating SAC policy at step {self._step}") + buffer = self.update_buffer + if self.update_buffer.num_experiences >= self.hyperparameters.batch_size: + sampled_minibatch = buffer.sample_mini_batch( + self.hyperparameters.batch_size, + sequence_length=self.policy.sequence_length, + ) + # Get rewards for each reward + for name, signal in self.optimizer.reward_signals.items(): + sampled_minibatch[RewardSignalUtil.rewards_key(name)] = ( + signal.evaluate(sampled_minibatch) * signal.strength + ) + + update_stats = self.optimizer.update(sampled_minibatch, n_sequences) + for stat_name, value in update_stats.items(): + batch_update_stats[stat_name].append(value) + + self.update_steps += 1 + + for stat, stat_list in batch_update_stats.items(): + self._stats_reporter.add_stat(stat, np.mean(stat_list)) + has_updated = True + + if self.optimizer.bc_module: + update_stats = self.optimizer.bc_module.update() + for stat, val in update_stats.items(): + self._stats_reporter.add_stat(stat, val) + + # Truncate update buffer if neccessary. Truncate more than we need to to avoid truncating + # a large buffer at each update. + if self.update_buffer.num_experiences > self.hyperparameters.buffer_size: + self.update_buffer.truncate( + int(self.hyperparameters.buffer_size * BUFFER_TRUNCATE_PERCENT) + ) + # TODO: revisit this update + self._update_reward_signals() + return has_updated + + def _update_reward_signals(self) -> None: + """ + Iterate through the reward signals and update them. Unlike in PPO, + do it separate from the policy so that it can be done at a different + interval. + This function should only be used to simulate + http://arxiv.org/abs/1809.02925 and similar papers, where the policy is updated + N times, then the reward signals are updated N times. Normally, the reward signal + and policy are updated in parallel. 
+ """ + buffer = self.update_buffer + batch_update_stats: Dict[str, list] = defaultdict(list) + while ( + self._step - self.hyperparameters.buffer_init_steps + ) / self.reward_signal_update_steps > self.reward_signal_steps_per_update: + # Get minibatches for reward signal update if needed + minibatch = buffer.sample_mini_batch( + self.hyperparameters.batch_size, + sequence_length=self.policy.sequence_length, + ) + update_stats = self.optimizer.update_reward_signals(minibatch) + + for stat_name, value in update_stats.items(): + batch_update_stats[stat_name].append(value) + self.reward_signal_update_steps += 1 + + for stat, stat_list in batch_update_stats.items(): + self._stats_reporter.add_stat(stat, np.mean(stat_list)) diff --git a/MLPY/Lib/site-packages/mlagents/trainers/trainer/on_policy_trainer.py b/MLPY/Lib/site-packages/mlagents/trainers/trainer/on_policy_trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..879640a0e5f95f2594d9f30ab8aed2cec9eb722d --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/trainer/on_policy_trainer.py @@ -0,0 +1,144 @@ +# # Unity ML-Agents Toolkit +# ## ML-Agent Learning (PPO) +# Contains an implementation of PPO as described in: https://arxiv.org/abs/1707.06347 + +from collections import defaultdict +from typing import cast + +import numpy as np + +from mlagents_envs.logging_util import get_logger +from mlagents.trainers.buffer import BufferKey +from mlagents.trainers.trainer.rl_trainer import RLTrainer +from mlagents.trainers.policy import Policy +from mlagents.trainers.optimizer.torch_optimizer import TorchOptimizer +from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers +from mlagents.trainers.settings import TrainerSettings, OnPolicyHyperparamSettings + +logger = get_logger(__name__) + + +class OnPolicyTrainer(RLTrainer): + """The PPOTrainer is an implementation of the PPO algorithm.""" + + def __init__( + self, + behavior_name: str, + reward_buff_cap: int, + trainer_settings: TrainerSettings, + training: bool, + load: bool, + seed: int, + artifact_path: str, + ): + """ + Responsible for collecting experiences and training an on-policy model. + :param behavior_name: The name of the behavior associated with trainer config + :param reward_buff_cap: Max reward history to track in the reward buffer + :param trainer_settings: The parameters for the trainer. + :param training: Whether the trainer is set for training. + :param load: Whether the model should be loaded. + :param seed: The seed the model will be initialized with + :param artifact_path: The directory within which to store artifacts from this trainer. + """ + super().__init__( + behavior_name, + trainer_settings, + training, + load, + artifact_path, + reward_buff_cap, + ) + self.hyperparameters = cast( + OnPolicyHyperparamSettings, self.trainer_settings.hyperparameters + ) + self.seed = seed + self.policy: Policy = None # type: ignore + self.optimizer: TorchOptimizer = None # type: ignore + + def _is_ready_update(self): + """ + Returns whether or not the trainer has enough elements to run update model + :return: A boolean corresponding to whether or not update_model() can be run + """ + size_of_buffer = self.update_buffer.num_experiences + return size_of_buffer > self.hyperparameters.buffer_size + + def _update_policy(self): + """ + Uses demonstration_buffer to update the policy. + The reward signal generators must be updated in this method at their own pace. 
+ """ + buffer_length = self.update_buffer.num_experiences + self.cumulative_returns_since_policy_update.clear() + + # Make sure batch_size is a multiple of sequence length. During training, we + # will need to reshape the data into a batch_size x sequence_length tensor. + batch_size = ( + self.hyperparameters.batch_size + - self.hyperparameters.batch_size % self.policy.sequence_length + ) + # Make sure there is at least one sequence + batch_size = max(batch_size, self.policy.sequence_length) + + n_sequences = max( + int(self.hyperparameters.batch_size / self.policy.sequence_length), 1 + ) + + advantages = np.array( + self.update_buffer[BufferKey.ADVANTAGES].get_batch(), dtype=np.float32 + ) + self.update_buffer[BufferKey.ADVANTAGES].set( + (advantages - advantages.mean()) / (advantages.std() + 1e-10) + ) + num_epoch = self.hyperparameters.num_epoch + batch_update_stats = defaultdict(list) + for _ in range(num_epoch): + self.update_buffer.shuffle(sequence_length=self.policy.sequence_length) + buffer = self.update_buffer + max_num_batch = buffer_length // batch_size + for i in range(0, max_num_batch * batch_size, batch_size): + minibatch = buffer.make_mini_batch(i, i + batch_size) + update_stats = self.optimizer.update(minibatch, n_sequences) + update_stats.update(self.optimizer.update_reward_signals(minibatch)) + for stat_name, value in update_stats.items(): + batch_update_stats[stat_name].append(value) + + for stat, stat_list in batch_update_stats.items(): + self._stats_reporter.add_stat(stat, np.mean(stat_list)) + + if self.optimizer.bc_module: + update_stats = self.optimizer.bc_module.update() + for stat, val in update_stats.items(): + self._stats_reporter.add_stat(stat, val) + self._clear_update_buffer() + return True + + def add_policy( + self, parsed_behavior_id: BehaviorIdentifiers, policy: Policy + ) -> None: + """ + Adds policy to trainer. + :param parsed_behavior_id: Behavior identifiers that the policy should belong to. + :param policy: Policy to associate with name_behavior_id. + """ + if self.policy: + logger.warning( + "Your environment contains multiple teams, but {} doesn't support adversarial games. 
Enable self-play to \ + train adversarial games.".format( + self.__class__.__name__ + ) + ) + self.policy = policy + self.policies[parsed_behavior_id.behavior_id] = policy + + self.optimizer = self.create_optimizer() + for _reward_signal in self.optimizer.reward_signals.keys(): + self.collected_rewards[_reward_signal] = defaultdict(lambda: 0) + + self.model_saver.register(self.policy) + self.model_saver.register(self.optimizer) + self.model_saver.initialize_or_load() + + # Needed to resume loads properly + self._step = policy.get_current_step() diff --git a/MLPY/Lib/site-packages/mlagents/trainers/trainer/rl_trainer.py b/MLPY/Lib/site-packages/mlagents/trainers/trainer/rl_trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..57454900a0b61a4af4c728d01373eb63147f485b --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/trainer/rl_trainer.py @@ -0,0 +1,305 @@ +# # Unity ML-Agents Toolkit +from typing import Dict, List, Optional +from collections import defaultdict +import abc +import time +import attr +import numpy as np +from mlagents_envs.side_channel.stats_side_channel import StatsAggregationMethod + +from mlagents.trainers.policy.checkpoint_manager import ( + ModelCheckpoint, + ModelCheckpointManager, +) +from mlagents_envs.logging_util import get_logger +from mlagents_envs.timers import timed +from mlagents.trainers.optimizer import Optimizer +from mlagents.trainers.optimizer.torch_optimizer import TorchOptimizer +from mlagents.trainers.buffer import AgentBuffer, BufferKey +from mlagents.trainers.trainer import Trainer +from mlagents.trainers.torch_entities.components.reward_providers.base_reward_provider import ( + BaseRewardProvider, +) +from mlagents_envs.timers import hierarchical_timer +from mlagents.trainers.model_saver.torch_model_saver import TorchModelSaver +from mlagents.trainers.agent_processor import AgentManagerQueue +from mlagents.trainers.trajectory import Trajectory +from mlagents.trainers.settings import TrainerSettings +from mlagents.trainers.stats import StatsPropertyType +from mlagents.trainers.model_saver.model_saver import BaseModelSaver + + +logger = get_logger(__name__) + + +class RLTrainer(Trainer): + """ + This class is the base class for trainers that use Reward Signals. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # collected_rewards is a dictionary from name of reward signal to a dictionary of agent_id to cumulative reward + # used for reporting only. We always want to report the environment reward to Tensorboard, regardless + # of what reward signals are actually present. + self.cumulative_returns_since_policy_update: List[float] = [] + self.collected_rewards: Dict[str, Dict[str, int]] = { + "environment": defaultdict(lambda: 0) + } + self.update_buffer: AgentBuffer = AgentBuffer() + self._stats_reporter.add_property( + StatsPropertyType.HYPERPARAMETERS, self.trainer_settings.as_dict() + ) + + self._next_save_step = 0 + self._next_summary_step = 0 + self.model_saver = self.create_model_saver( + self.trainer_settings, self.artifact_path, self.load + ) + self._has_warned_group_rewards = False + + def end_episode(self) -> None: + """ + A signal that the Episode has ended. The buffer must be reset. + Get only called when the academy resets. 
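The on-policy update above first rounds batch_size down to a whole number of sequences, then standardizes advantages before the epoch loop; a small sketch of that arithmetic (numbers illustrative):

import numpy as np

sequence_length = 64
requested_batch = 1_000

batch_size = requested_batch - requested_batch % sequence_length  # 960
batch_size = max(batch_size, sequence_length)                     # keep at least one sequence

advantages = np.array([1.0, 2.0, 3.0], dtype=np.float32)
normalized = (advantages - advantages.mean()) / (advantages.std() + 1e-10)
assert abs(normalized.mean()) < 1e-6 and abs(normalized.std() - 1.0) < 1e-4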
+ """ + for rewards in self.collected_rewards.values(): + for agent_id in rewards: + rewards[agent_id] = 0 + + def _update_end_episode_stats(self, agent_id: str, optimizer: Optimizer) -> None: + for name, rewards in self.collected_rewards.items(): + if name == "environment": + self.stats_reporter.add_stat( + "Environment/Cumulative Reward", + rewards.get(agent_id, 0), + aggregation=StatsAggregationMethod.HISTOGRAM, + ) + self.cumulative_returns_since_policy_update.append( + rewards.get(agent_id, 0) + ) + self.reward_buffer.appendleft(rewards.get(agent_id, 0)) + rewards[agent_id] = 0 + else: + if isinstance(optimizer.reward_signals[name], BaseRewardProvider): + self.stats_reporter.add_stat( + f"Policy/{optimizer.reward_signals[name].name.capitalize()} Reward", + rewards.get(agent_id, 0), + ) + else: + self.stats_reporter.add_stat( + optimizer.reward_signals[name].stat_name, + rewards.get(agent_id, 0), + ) + rewards[agent_id] = 0 + + def _clear_update_buffer(self) -> None: + """ + Clear the buffers that have been built up during inference. + """ + self.update_buffer.reset_agent() + + @abc.abstractmethod + def _is_ready_update(self): + """ + Returns whether or not the trainer has enough elements to run update model + :return: A boolean corresponding to wether or not update_model() can be run + """ + return False + + @abc.abstractmethod + def create_optimizer(self) -> TorchOptimizer: + """ + Creates an Optimizer object + """ + pass + + @staticmethod + def create_model_saver( + trainer_settings: TrainerSettings, model_path: str, load: bool + ) -> BaseModelSaver: + model_saver = TorchModelSaver( # type: ignore + trainer_settings, model_path, load + ) + return model_saver + + def _policy_mean_reward(self) -> Optional[float]: + """Returns the mean episode reward for the current policy.""" + rewards = self.cumulative_returns_since_policy_update + if len(rewards) == 0: + return None + else: + return sum(rewards) / len(rewards) + + @timed + def _checkpoint(self) -> ModelCheckpoint: + """ + Checkpoints the policy associated with this trainer. + """ + n_policies = len(self.policies.keys()) + if n_policies > 1: + logger.warning( + "Trainer has multiple policies, but default behavior only saves the first." + ) + export_path, auxillary_paths = self.model_saver.save_checkpoint( + self.brain_name, self._step + ) + new_checkpoint = ModelCheckpoint( + int(self._step), + export_path, + self._policy_mean_reward(), + time.time(), + auxillary_file_paths=auxillary_paths, + ) + ModelCheckpointManager.add_checkpoint( + self.brain_name, new_checkpoint, self.trainer_settings.keep_checkpoints + ) + return new_checkpoint + + def save_model(self) -> None: + """ + Saves the policy associated with this trainer. + """ + n_policies = len(self.policies.keys()) + if n_policies > 1: + logger.warning( + "Trainer has multiple policies, but default behavior only saves the first." + ) + elif n_policies == 0: + logger.warning("Trainer has no policies, not saving anything.") + return + + model_checkpoint = self._checkpoint() + self.model_saver.copy_final_model(model_checkpoint.file_path) + export_ext = "onnx" + final_checkpoint = attr.evolve( + model_checkpoint, file_path=f"{self.model_saver.model_path}.{export_ext}" + ) + ModelCheckpointManager.track_final_checkpoint(self.brain_name, final_checkpoint) + + @abc.abstractmethod + def _update_policy(self) -> bool: + """ + Uses demonstration_buffer to update model. + :return: Whether or not the policy was updated. 
+ """ + pass + + def _increment_step(self, n_steps: int, name_behavior_id: str) -> None: + """ + Increment the step count of the trainer + :param n_steps: number of steps to increment the step count by + """ + self._step += n_steps + self._next_summary_step = self._get_next_interval_step(self.summary_freq) + self._next_save_step = self._get_next_interval_step( + self.trainer_settings.checkpoint_interval + ) + p = self.get_policy(name_behavior_id) + if p: + p.increment_step(n_steps) + self.stats_reporter.set_stat("Step", float(self.get_step)) + + def _get_next_interval_step(self, interval: int) -> int: + """ + Get the next step count that should result in an action. + :param interval: The interval between actions. + """ + return self._step + (interval - self._step % interval) + + def _write_summary(self, step: int) -> None: + """ + Saves training statistics to Tensorboard. + """ + self.stats_reporter.add_stat("Is Training", float(self.should_still_train)) + self.stats_reporter.write_stats(int(step)) + + @abc.abstractmethod + def _process_trajectory(self, trajectory: Trajectory) -> None: + """ + Takes a trajectory and processes it, putting it into the update buffer. + :param trajectory: The Trajectory tuple containing the steps to be processed. + """ + self._maybe_write_summary(self.get_step + len(trajectory.steps)) + self._maybe_save_model(self.get_step + len(trajectory.steps)) + self._increment_step(len(trajectory.steps), trajectory.behavior_id) + + def _maybe_write_summary(self, step_after_process: int) -> None: + """ + If processing the trajectory will make the step exceed the next summary write, + write the summary. This logic ensures summaries are written on the update step and not in between. + :param step_after_process: the step count after processing the next trajectory. + """ + if self._next_summary_step == 0: # Don't write out the first one + self._next_summary_step = self._get_next_interval_step(self.summary_freq) + if step_after_process >= self._next_summary_step and self.get_step != 0: + self._write_summary(self._next_summary_step) + + def _append_to_update_buffer(self, agentbuffer_trajectory: AgentBuffer) -> None: + """ + Append an AgentBuffer to the update buffer. If the trainer isn't training, + don't update to avoid a memory leak. + """ + if self.should_still_train: + seq_len = ( + self.trainer_settings.network_settings.memory.sequence_length + if self.trainer_settings.network_settings.memory is not None + else 1 + ) + agentbuffer_trajectory.resequence_and_append( + self.update_buffer, training_length=seq_len + ) + + def _maybe_save_model(self, step_after_process: int) -> None: + """ + If processing the trajectory will make the step exceed the next model write, + save the model. This logic ensures models are written on the update step and not in between. + :param step_after_process: the step count after processing the next trajectory. + """ + if self._next_save_step == 0: # Don't save the first one + self._next_save_step = self._get_next_interval_step( + self.trainer_settings.checkpoint_interval + ) + if step_after_process >= self._next_save_step and self.get_step != 0: + self._checkpoint() + + def _warn_if_group_reward(self, buffer: AgentBuffer) -> None: + """ + Warn if the trainer receives a Group Reward but isn't a multiagent trainer (e.g. POCA). + """ + if not self._has_warned_group_rewards: + if np.any(buffer[BufferKey.GROUP_REWARD]): + logger.warning( + "An agent recieved a Group Reward, but you are not using a multi-agent trainer. 
" + "Please use the POCA trainer for best results." + ) + self._has_warned_group_rewards = True + + def advance(self) -> None: + """ + Steps the trainer, taking in trajectories and updates if ready. + Will block and wait briefly if there are no trajectories. + """ + with hierarchical_timer("process_trajectory"): + for traj_queue in self.trajectory_queues: + # We grab at most the maximum length of the queue. + # This ensures that even if the queue is being filled faster than it is + # being emptied, the trajectories in the queue are on-policy. + _queried = False + for _ in range(traj_queue.qsize()): + _queried = True + try: + t = traj_queue.get_nowait() + self._process_trajectory(t) + except AgentManagerQueue.Empty: + break + if self.threaded and not _queried: + # Yield thread to avoid busy-waiting + time.sleep(0.0001) + if self.should_still_train: + if self._is_ready_update(): + with hierarchical_timer("_update_policy"): + if self._update_policy(): + for q in self.policy_queues: + # Get policies that correspond to the policy queue in question + q.put(self.get_policy(q.behavior_id)) diff --git a/MLPY/Lib/site-packages/mlagents/trainers/trainer/trainer.py b/MLPY/Lib/site-packages/mlagents/trainers/trainer/trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..58a339efd2b4e5089d8564bbe51d55d005b2b27b --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/trainer/trainer.py @@ -0,0 +1,183 @@ +# # Unity ML-Agents Toolkit +from typing import List, Deque, Dict +import abc +from collections import deque + +from mlagents_envs.logging_util import get_logger +from mlagents_envs.base_env import BehaviorSpec +from mlagents.trainers.stats import StatsReporter +from mlagents.trainers.trajectory import Trajectory +from mlagents.trainers.agent_processor import AgentManagerQueue +from mlagents.trainers.policy import Policy +from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers +from mlagents.trainers.settings import TrainerSettings + + +logger = get_logger(__name__) + + +class Trainer(abc.ABC): + """This class is the base class for the mlagents_envs.trainers""" + + def __init__( + self, + brain_name: str, + trainer_settings: TrainerSettings, + training: bool, + load: bool, + artifact_path: str, + reward_buff_cap: int = 1, + ): + """ + Responsible for collecting experiences and training a neural network model. + :param brain_name: Brain name of brain to be trained. + :param trainer_settings: The parameters for the trainer (dictionary). + :param training: Whether the trainer is set for training. + :param artifact_path: The directory within which to store artifacts from this trainer + :param reward_buff_cap: + """ + self.brain_name = brain_name + self.trainer_settings = trainer_settings + self._threaded = trainer_settings.threaded + self._stats_reporter = StatsReporter(brain_name) + self.is_training = training + self.load = load + self._reward_buffer: Deque[float] = deque(maxlen=reward_buff_cap) + self.policy_queues: List[AgentManagerQueue[Policy]] = [] + self.trajectory_queues: List[AgentManagerQueue[Trajectory]] = [] + self._step: int = 0 + self.artifact_path = artifact_path + self.summary_freq = self.trainer_settings.summary_freq + self.policies: Dict[str, Policy] = {} + + @property + def stats_reporter(self): + """ + Returns the stats reporter associated with this Trainer. + """ + return self._stats_reporter + + @property + def parameters(self) -> TrainerSettings: + """ + Returns the trainer parameters of the trainer. 
+ """ + return self.trainer_settings + + @property + def get_max_steps(self) -> int: + """ + Returns the maximum number of steps. Is used to know when the trainer should be stopped. + :return: The maximum number of steps of the trainer + """ + return self.trainer_settings.max_steps + + @property + def get_step(self) -> int: + """ + Returns the number of steps the trainer has performed + :return: the step count of the trainer + """ + return self._step + + @property + def threaded(self) -> bool: + """ + Whether or not to run the trainer in a thread. True allows the trainer to + update the policy while the environment is taking steps. Set to False to + enforce strict on-policy updates (i.e. don't update the policy when taking steps.) + """ + return self._threaded + + @property + def should_still_train(self) -> bool: + """ + Returns whether or not the trainer should train. A Trainer could + stop training if it wasn't training to begin with, or if max_steps + is reached. + """ + return self.is_training and self.get_step <= self.get_max_steps + + @property + def reward_buffer(self) -> Deque[float]: + """ + Returns the reward buffer. The reward buffer contains the cumulative + rewards of the most recent episodes completed by agents using this + trainer. + :return: the reward buffer. + """ + return self._reward_buffer + + @abc.abstractmethod + def save_model(self) -> None: + """ + Saves model file(s) for the policy or policies associated with this trainer. + """ + pass + + @abc.abstractmethod + def end_episode(self): + """ + A signal that the Episode has ended. The buffer must be reset. + Get only called when the academy resets. + """ + pass + + @abc.abstractmethod + def create_policy( + self, + parsed_behavior_id: BehaviorIdentifiers, + behavior_spec: BehaviorSpec, + ) -> Policy: + """ + Creates a Policy object + """ + pass + + @abc.abstractmethod + def add_policy( + self, parsed_behavior_id: BehaviorIdentifiers, policy: Policy + ) -> None: + """ + Adds policy to trainer. + """ + pass + + def get_policy(self, name_behavior_id: str) -> Policy: + """ + Gets policy associated with name_behavior_id + :param name_behavior_id: Fully qualified behavior name + :return: Policy associated with name_behavior_id + """ + return self.policies[name_behavior_id] + + @abc.abstractmethod + def advance(self) -> None: + """ + Advances the trainer. Typically, this means grabbing trajectories + from all subscribed trajectory queues (self.trajectory_queues), and updating + a policy using the steps in them, and if needed pushing a new policy onto the right + policy queues (self.policy_queues). + """ + pass + + def publish_policy_queue(self, policy_queue: AgentManagerQueue[Policy]) -> None: + """ + Adds a policy queue to the list of queues to publish to when this Trainer + makes a policy update + :param policy_queue: Policy queue to publish to. + """ + self.policy_queues.append(policy_queue) + + def subscribe_trajectory_queue( + self, trajectory_queue: AgentManagerQueue[Trajectory] + ) -> None: + """ + Adds a trajectory queue to the list of queues for the trainer to ingest Trajectories from. + :param trajectory_queue: Trajectory queue to read from. 
+ """ + self.trajectory_queues.append(trajectory_queue) + + @staticmethod + def get_trainer_name() -> str: + raise NotImplementedError diff --git a/MLPY/Lib/site-packages/mlagents/trainers/trainer/trainer_factory.py b/MLPY/Lib/site-packages/mlagents/trainers/trainer/trainer_factory.py new file mode 100644 index 0000000000000000000000000000000000000000..ffb774151336a5a3e790e25843c62895562ad57d --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/trainer/trainer_factory.py @@ -0,0 +1,131 @@ +import os +from typing import Dict + +from mlagents_envs.logging_util import get_logger +from mlagents.trainers.environment_parameter_manager import EnvironmentParameterManager +from mlagents.trainers.exception import TrainerConfigError +from mlagents.trainers.trainer import Trainer +from mlagents.trainers.ghost.trainer import GhostTrainer +from mlagents.trainers.ghost.controller import GhostController +from mlagents.trainers.settings import TrainerSettings +from mlagents.plugins import all_trainer_types + + +logger = get_logger(__name__) + + +class TrainerFactory: + def __init__( + self, + trainer_config: Dict[str, TrainerSettings], + output_path: str, + train_model: bool, + load_model: bool, + seed: int, + param_manager: EnvironmentParameterManager, + init_path: str = None, + multi_gpu: bool = False, + ): + """ + The TrainerFactory generates the Trainers based on the configuration passed as + input. + :param trainer_config: A dictionary from behavior name to TrainerSettings + :param output_path: The path to the directory where the artifacts generated by + the trainer will be saved. + :param train_model: If True, the Trainers will train the model and if False, + only perform inference. + :param load_model: If True, the Trainer will load neural networks weights from + the previous run. + :param seed: The seed of the Trainers. Dictates how the neural networks will be + initialized. + :param param_manager: The EnvironmentParameterManager that will dictate when/if + the EnvironmentParameters must change. + :param init_path: Path from which to load model. + :param multi_gpu: If True, multi-gpu will be used. (currently not available) + """ + self.trainer_config = trainer_config + self.output_path = output_path + self.init_path = init_path + self.train_model = train_model + self.load_model = load_model + self.seed = seed + self.param_manager = param_manager + self.multi_gpu = multi_gpu + self.ghost_controller = GhostController() + + def generate(self, behavior_name: str) -> Trainer: + trainer_settings = self.trainer_config[behavior_name] + return TrainerFactory._initialize_trainer( + trainer_settings, + behavior_name, + self.output_path, + self.train_model, + self.load_model, + self.ghost_controller, + self.seed, + self.param_manager, + self.multi_gpu, + ) + + @staticmethod + def _initialize_trainer( + trainer_settings: TrainerSettings, + brain_name: str, + output_path: str, + train_model: bool, + load_model: bool, + ghost_controller: GhostController, + seed: int, + param_manager: EnvironmentParameterManager, + multi_gpu: bool = False, + ) -> Trainer: + """ + Initializes a trainer given a provided trainer configuration and brain parameters, as well as + some general training session options. 
+ + :param trainer_settings: Original trainer configuration loaded from YAML + :param brain_name: Name of the brain to be associated with trainer + :param output_path: Path to save the model and summary statistics + :param keep_checkpoints: How many model checkpoints to keep + :param train_model: Whether to train the model (vs. run inference) + :param load_model: Whether to load the model or randomly initialize + :param ghost_controller: The object that coordinates ghost trainers + :param seed: The random seed to use + :param param_manager: EnvironmentParameterManager, used to determine a reward buffer length for PPOTrainer + :return: + """ + trainer_artifact_path = os.path.join(output_path, brain_name) + + min_lesson_length = param_manager.get_minimum_reward_buffer_size(brain_name) + + trainer: Trainer = None # type: ignore # will be set to one of these, or raise + + try: + trainer_type = all_trainer_types[trainer_settings.trainer_type] + trainer = trainer_type( + brain_name, + min_lesson_length, + trainer_settings, + train_model, + load_model, + seed, + trainer_artifact_path, + ) + + except KeyError: + raise TrainerConfigError( + f"The trainer config contains an unknown trainer type " + f"{trainer_settings.trainer_type} for brain {brain_name}" + ) + + if trainer_settings.self_play is not None: + trainer = GhostTrainer( + trainer, + brain_name, + ghost_controller, + min_lesson_length, + trainer_settings, + train_model, + trainer_artifact_path, + ) + return trainer diff --git a/MLPY/Lib/site-packages/mlagents/trainers/trainer/trainer_utils.py b/MLPY/Lib/site-packages/mlagents/trainers/trainer/trainer_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..ad94bd35f066fe37780555d2b3402662f62d4ca5 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/trainer/trainer_utils.py @@ -0,0 +1,45 @@ +import numpy as np + + +def discount_rewards(r, gamma=0.99, value_next=0.0): + """ + Computes discounted sum of future rewards for use in updating value estimate. + :param r: List of rewards. + :param gamma: Discount factor. + :param value_next: T+1 value estimate for returns calculation. + :return: discounted sum of future rewards as list. + """ + discounted_r = np.zeros_like(r) + running_add = value_next + for t in reversed(range(0, r.size)): + running_add = running_add * gamma + r[t] + discounted_r[t] = running_add + return discounted_r + + +def get_gae(rewards, value_estimates, value_next=0.0, gamma=0.99, lambd=0.95): + """ + Computes generalized advantage estimate for use in updating policy. + :param rewards: list of rewards for time-steps t to T. + :param value_next: Value estimate for time-step T+1. + :param value_estimates: list of value estimates for time-steps t to T. + :param gamma: Discount factor. + :param lambd: GAE weighing factor. + :return: list of advantage estimates for time-steps t to T. 
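trainer_utils.discount_rewards above is small enough to check by hand; a worked example, assuming the vendored package is importable under the path shown in the diff:

import numpy as np
from mlagents.trainers.trainer.trainer_utils import discount_rewards

# gamma=0.5, no bootstrap value, walking backwards through the rewards:
#   t=2: 0.0*0.5 + 1 = 1.0
#   t=1: 1.0*0.5 + 1 = 1.5
#   t=0: 1.5*0.5 + 1 = 1.75
returns = discount_rewards(np.array([1.0, 1.0, 1.0]), gamma=0.5, value_next=0.0)
assert np.allclose(returns, [1.75, 1.5, 1.0])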
+ """ + value_estimates = np.append(value_estimates, value_next) + delta_t = rewards + gamma * value_estimates[1:] - value_estimates[:-1] + advantage = discount_rewards(r=delta_t, gamma=gamma * lambd) + return advantage + + +def lambda_return(r, value_estimates, gamma=0.99, lambd=0.8, value_next=0.0): + returns = np.zeros_like(r) + returns[-1] = r[-1] + gamma * value_next + for t in reversed(range(0, r.size - 1)): + returns[t] = ( + gamma * lambd * returns[t + 1] + + r[t] + + (1 - lambd) * gamma * value_estimates[t + 1] + ) + return returns diff --git a/MLPY/Lib/site-packages/mlagents/trainers/trainer_controller.py b/MLPY/Lib/site-packages/mlagents/trainers/trainer_controller.py new file mode 100644 index 0000000000000000000000000000000000000000..69da1e569496d043e1d4c167a2cce963e9fd69d9 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/trainer_controller.py @@ -0,0 +1,297 @@ +# # Unity ML-Agents Toolkit +# ## ML-Agent Learning +"""Launches trainers for each External Brains in a Unity Environment.""" + +import os +import threading +from typing import Dict, Set, List +from collections import defaultdict + +import numpy as np + +from mlagents_envs.logging_util import get_logger +from mlagents.trainers.env_manager import EnvManager, EnvironmentStep +from mlagents_envs.exception import ( + UnityEnvironmentException, + UnityCommunicationException, + UnityCommunicatorStoppedException, +) +from mlagents_envs.timers import ( + hierarchical_timer, + timed, + get_timer_stack_for_thread, + merge_gauges, +) +from mlagents.trainers.trainer import Trainer +from mlagents.trainers.environment_parameter_manager import EnvironmentParameterManager +from mlagents.trainers.trainer import TrainerFactory +from mlagents.trainers.behavior_id_utils import BehaviorIdentifiers +from mlagents.trainers.agent_processor import AgentManager +from mlagents import torch_utils +from mlagents.torch_utils.globals import get_rank + + +class TrainerController: + def __init__( + self, + trainer_factory: TrainerFactory, + output_path: str, + run_id: str, + param_manager: EnvironmentParameterManager, + train: bool, + training_seed: int, + ): + """ + :param output_path: Path to save the model. + :param summaries_dir: Folder to save training summaries. + :param run_id: The sub-directory name for model and summary statistics + :param param_manager: EnvironmentParameterManager object which stores information about all + environment parameters. + :param train: Whether to train model, or only run inference. + :param training_seed: Seed to use for Numpy and Torch random number generation. + :param threaded: Whether or not to run trainers in a separate thread. Disable for testing/debugging. + """ + self.trainers: Dict[str, Trainer] = {} + self.brain_name_to_identifier: Dict[str, Set] = defaultdict(set) + self.trainer_factory = trainer_factory + self.output_path = output_path + self.logger = get_logger(__name__) + self.run_id = run_id + self.train_model = train + self.param_manager = param_manager + self.ghost_controller = self.trainer_factory.ghost_controller + self.registered_behavior_ids: Set[str] = set() + + self.trainer_threads: List[threading.Thread] = [] + self.kill_trainers = False + np.random.seed(training_seed) + torch_utils.torch.manual_seed(training_seed) + self.rank = get_rank() + + @timed + def _save_models(self): + """ + Saves current model to checkpoint folder. 
+ """ + if self.rank is not None and self.rank != 0: + return + + for brain_name in self.trainers.keys(): + self.trainers[brain_name].save_model() + self.logger.debug("Saved Model") + + @staticmethod + def _create_output_path(output_path): + try: + if not os.path.exists(output_path): + os.makedirs(output_path) + except Exception: + raise UnityEnvironmentException( + f"The folder {output_path} containing the " + "generated model could not be " + "accessed. Please make sure the " + "permissions are set correctly." + ) + + @timed + def _reset_env(self, env_manager: EnvManager) -> None: + """Resets the environment. + + Returns: + A Data structure corresponding to the initial reset state of the + environment. + """ + new_config = self.param_manager.get_current_samplers() + env_manager.reset(config=new_config) + # Register any new behavior ids that were generated on the reset. + self._register_new_behaviors(env_manager, env_manager.first_step_infos) + + def _not_done_training(self) -> bool: + return ( + any(t.should_still_train for t in self.trainers.values()) + or not self.train_model + ) or len(self.trainers) == 0 + + def _create_trainer_and_manager( + self, env_manager: EnvManager, name_behavior_id: str + ) -> None: + + parsed_behavior_id = BehaviorIdentifiers.from_name_behavior_id(name_behavior_id) + brain_name = parsed_behavior_id.brain_name + trainerthread = None + if brain_name in self.trainers: + trainer = self.trainers[brain_name] + else: + trainer = self.trainer_factory.generate(brain_name) + self.trainers[brain_name] = trainer + if trainer.threaded: + # Only create trainer thread for new trainers + trainerthread = threading.Thread( + target=self.trainer_update_func, args=(trainer,), daemon=True + ) + self.trainer_threads.append(trainerthread) + env_manager.on_training_started( + brain_name, self.trainer_factory.trainer_config[brain_name] + ) + + policy = trainer.create_policy( + parsed_behavior_id, + env_manager.training_behaviors[name_behavior_id], + ) + trainer.add_policy(parsed_behavior_id, policy) + + agent_manager = AgentManager( + policy, + name_behavior_id, + trainer.stats_reporter, + trainer.parameters.time_horizon, + threaded=trainer.threaded, + ) + env_manager.set_agent_manager(name_behavior_id, agent_manager) + env_manager.set_policy(name_behavior_id, policy) + self.brain_name_to_identifier[brain_name].add(name_behavior_id) + + trainer.publish_policy_queue(agent_manager.policy_queue) + trainer.subscribe_trajectory_queue(agent_manager.trajectory_queue) + + # Only start new trainers + if trainerthread is not None: + trainerthread.start() + + def _create_trainers_and_managers( + self, env_manager: EnvManager, behavior_ids: Set[str] + ) -> None: + for behavior_id in behavior_ids: + self._create_trainer_and_manager(env_manager, behavior_id) + + @timed + def start_learning(self, env_manager: EnvManager) -> None: + self._create_output_path(self.output_path) + try: + # Initial reset + self._reset_env(env_manager) + self.param_manager.log_current_lesson() + while self._not_done_training(): + n_steps = self.advance(env_manager) + for _ in range(n_steps): + self.reset_env_if_ready(env_manager) + # Stop advancing trainers + self.join_threads() + except ( + KeyboardInterrupt, + UnityCommunicationException, + UnityEnvironmentException, + UnityCommunicatorStoppedException, + ) as ex: + self.join_threads() + self.logger.info( + "Learning was interrupted. Please wait while the graph is generated." 
+ ) + if isinstance(ex, KeyboardInterrupt) or isinstance( + ex, UnityCommunicatorStoppedException + ): + pass + else: + # If the environment failed, we want to make sure to raise + # the exception so we exit the process with an return code of 1. + raise ex + finally: + if self.train_model: + self._save_models() + + def end_trainer_episodes(self) -> None: + # Reward buffers reset takes place only for curriculum learning + # else no reset. + for trainer in self.trainers.values(): + trainer.end_episode() + + def reset_env_if_ready(self, env: EnvManager) -> None: + # Get the sizes of the reward buffers. + reward_buff = {k: list(t.reward_buffer) for (k, t) in self.trainers.items()} + curr_step = {k: int(t.get_step) for (k, t) in self.trainers.items()} + max_step = {k: int(t.get_max_steps) for (k, t) in self.trainers.items()} + # Attempt to increment the lessons of the brains who + # were ready. + updated, param_must_reset = self.param_manager.update_lessons( + curr_step, max_step, reward_buff + ) + if updated: + for trainer in self.trainers.values(): + trainer.reward_buffer.clear() + # If ghost trainer swapped teams + ghost_controller_reset = self.ghost_controller.should_reset() + if param_must_reset or ghost_controller_reset: + self._reset_env(env) # This reset also sends the new config to env + self.end_trainer_episodes() + elif updated: + env.set_env_parameters(self.param_manager.get_current_samplers()) + + @timed + def advance(self, env_manager: EnvManager) -> int: + # Get steps + with hierarchical_timer("env_step"): + new_step_infos = env_manager.get_steps() + self._register_new_behaviors(env_manager, new_step_infos) + num_steps = env_manager.process_steps(new_step_infos) + + # Report current lesson for each environment parameter + for ( + param_name, + lesson_number, + ) in self.param_manager.get_current_lesson_number().items(): + for trainer in self.trainers.values(): + trainer.stats_reporter.set_stat( + f"Environment/Lesson Number/{param_name}", lesson_number + ) + + for trainer in self.trainers.values(): + if not trainer.threaded: + with hierarchical_timer("trainer_advance"): + trainer.advance() + + return num_steps + + def _register_new_behaviors( + self, env_manager: EnvManager, step_infos: List[EnvironmentStep] + ) -> None: + """ + Handle registration (adding trainers and managers) of new behaviors ids. + :param env_manager: + :param step_infos: + :return: + """ + step_behavior_ids: Set[str] = set() + for s in step_infos: + step_behavior_ids |= set(s.name_behavior_ids) + new_behavior_ids = step_behavior_ids - self.registered_behavior_ids + self._create_trainers_and_managers(env_manager, new_behavior_ids) + self.registered_behavior_ids |= step_behavior_ids + + def join_threads(self, timeout_seconds: float = 1.0) -> None: + """ + Wait for threads to finish, and merge their timer information into the main thread. 
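_register_new_behaviors reduces to a set difference over behavior ids; a small sketch with illustrative ids:

# Behavior ids seen this step minus those already registered gives the new ones.
registered_behavior_ids = {"Walker?team=0"}
step_behavior_ids = {"Walker?team=0", "Walker?team=1"}

new_behavior_ids = step_behavior_ids - registered_behavior_ids   # {"Walker?team=1"}
registered_behavior_ids |= step_behavior_ids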
+ :param timeout_seconds: + :return: + """ + self.kill_trainers = True + for t in self.trainer_threads: + try: + t.join(timeout_seconds) + except Exception: + pass + + with hierarchical_timer("trainer_threads") as main_timer_node: + for trainer_thread in self.trainer_threads: + thread_timer_stack = get_timer_stack_for_thread(trainer_thread) + if thread_timer_stack: + main_timer_node.merge( + thread_timer_stack.root, + root_name="thread_root", + is_parallel=True, + ) + merge_gauges(thread_timer_stack.gauges) + + def trainer_update_func(self, trainer: Trainer) -> None: + while not self.kill_trainers: + with hierarchical_timer("trainer_advance"): + trainer.advance() diff --git a/MLPY/Lib/site-packages/mlagents/trainers/training_analytics_side_channel.py b/MLPY/Lib/site-packages/mlagents/trainers/training_analytics_side_channel.py new file mode 100644 index 0000000000000000000000000000000000000000..84edad717534342d943feffe6a3c44de270a1f61 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/training_analytics_side_channel.py @@ -0,0 +1,188 @@ +import copy +import json +import hmac +import hashlib +import sys +from typing import Optional, Dict +import mlagents_envs +import mlagents.trainers +from mlagents import torch_utils +from mlagents.trainers.settings import RewardSignalType +from mlagents_envs.exception import UnityCommunicationException +from mlagents_envs.side_channel import ( + IncomingMessage, + OutgoingMessage, + DefaultTrainingAnalyticsSideChannel, +) +from mlagents_envs.communicator_objects.training_analytics_pb2 import ( + TrainingEnvironmentInitialized, + TrainingBehaviorInitialized, +) +from google.protobuf.any_pb2 import Any + +from mlagents.trainers.settings import TrainerSettings, RunOptions + + +class TrainingAnalyticsSideChannel(DefaultTrainingAnalyticsSideChannel): + """ + Side channel that sends information about the training to the Unity environment so it can be logged. + """ + + __vendorKey: str = "unity.ml-agents" + + def __init__(self) -> None: + # >>> uuid.uuid5(uuid.NAMESPACE_URL, "com.unity.ml-agents/TrainingAnalyticsSideChannel") + # UUID('b664a4a9-d86f-5a5f-95cb-e8353a7e8356') + # Use the same uuid as the parent side channel + super().__init__() + self.run_options: Optional[RunOptions] = None + + @classmethod + def _hash(cls, data: str) -> str: + res = hmac.new( + cls.__vendorKey.encode("utf-8"), data.encode("utf-8"), hashlib.sha256 + ).hexdigest() + return res + + def on_message_received(self, msg: IncomingMessage) -> None: + raise UnityCommunicationException( + "The TrainingAnalyticsSideChannel received a message from Unity, " + "this should not have happened." 
+ ) + + @classmethod + def _sanitize_run_options(cls, config: RunOptions) -> Dict[str, Any]: + res = copy.deepcopy(config.as_dict()) + + # Filter potentially PII behavior names + if "behaviors" in res and res["behaviors"]: + res["behaviors"] = {cls._hash(k): v for (k, v) in res["behaviors"].items()} + for (k, v) in res["behaviors"].items(): + if "init_path" in v and v["init_path"] is not None: + hashed_path = cls._hash(v["init_path"]) + res["behaviors"][k]["init_path"] = hashed_path + if "demo_path" in v and v["demo_path"] is not None: + hashed_path = cls._hash(v["demo_path"]) + res["behaviors"][k]["demo_path"] = hashed_path + + # Filter potentially PII curriculum and behavior names from Checkpoint Settings + if "environment_parameters" in res and res["environment_parameters"]: + res["environment_parameters"] = { + cls._hash(k): v for (k, v) in res["environment_parameters"].items() + } + for (curriculumName, curriculum) in res["environment_parameters"].items(): + updated_lessons = [] + for lesson in curriculum["curriculum"]: + new_lesson = copy.deepcopy(lesson) + if "name" in lesson: + new_lesson["name"] = cls._hash(lesson["name"]) + if ( + "completion_criteria" in lesson + and lesson["completion_criteria"] is not None + ): + new_lesson["completion_criteria"]["behavior"] = cls._hash( + new_lesson["completion_criteria"]["behavior"] + ) + updated_lessons.append(new_lesson) + res["environment_parameters"][curriculumName][ + "curriculum" + ] = updated_lessons + + # Filter potentially PII filenames from Checkpoint Settings + if "checkpoint_settings" in res and res["checkpoint_settings"] is not None: + if ( + "initialize_from" in res["checkpoint_settings"] + and res["checkpoint_settings"]["initialize_from"] is not None + ): + res["checkpoint_settings"]["initialize_from"] = cls._hash( + res["checkpoint_settings"]["initialize_from"] + ) + if ( + "results_dir" in res["checkpoint_settings"] + and res["checkpoint_settings"]["results_dir"] is not None + ): + res["checkpoint_settings"]["results_dir"] = hash( + res["checkpoint_settings"]["results_dir"] + ) + + return res + + def environment_initialized(self, run_options: RunOptions) -> None: + self.run_options = run_options + # Tuple of (major, minor, patch) + vi = sys.version_info + env_params = run_options.environment_parameters + sanitized_run_options = self._sanitize_run_options(run_options) + + msg = TrainingEnvironmentInitialized( + python_version=f"{vi[0]}.{vi[1]}.{vi[2]}", + mlagents_version=mlagents.trainers.__version__, + mlagents_envs_version=mlagents_envs.__version__, + torch_version=torch_utils.torch.__version__, + torch_device_type=torch_utils.default_device().type, + num_envs=run_options.env_settings.num_envs, + num_environment_parameters=len(env_params) if env_params else 0, + run_options=json.dumps(sanitized_run_options), + ) + + any_message = Any() + any_message.Pack(msg) + + env_init_msg = OutgoingMessage() + env_init_msg.set_raw_bytes(any_message.SerializeToString()) + super().queue_message_to_send(env_init_msg) + + @classmethod + def _sanitize_trainer_settings(cls, config: TrainerSettings) -> Dict[str, Any]: + config_dict = copy.deepcopy(config.as_dict()) + if "init_path" in config_dict and config_dict["init_path"] is not None: + hashed_path = cls._hash(config_dict["init_path"]) + config_dict["init_path"] = hashed_path + if "demo_path" in config_dict and config_dict["demo_path"] is not None: + hashed_path = cls._hash(config_dict["demo_path"]) + config_dict["demo_path"] = hashed_path + return config_dict + + def 
training_started(self, behavior_name: str, config: TrainerSettings) -> None: + raw_config = self._sanitize_trainer_settings(config) + msg = TrainingBehaviorInitialized( + behavior_name=self._hash(behavior_name), + trainer_type=config.trainer_type, + extrinsic_reward_enabled=( + RewardSignalType.EXTRINSIC in config.reward_signals + ), + gail_reward_enabled=(RewardSignalType.GAIL in config.reward_signals), + curiosity_reward_enabled=( + RewardSignalType.CURIOSITY in config.reward_signals + ), + rnd_reward_enabled=(RewardSignalType.RND in config.reward_signals), + behavioral_cloning_enabled=config.behavioral_cloning is not None, + recurrent_enabled=config.network_settings.memory is not None, + visual_encoder=config.network_settings.vis_encode_type.value, + num_network_layers=config.network_settings.num_layers, + num_network_hidden_units=config.network_settings.hidden_units, + trainer_threaded=config.threaded, + self_play_enabled=config.self_play is not None, + curriculum_enabled=self._behavior_uses_curriculum(behavior_name), + config=json.dumps(raw_config), + ) + + any_message = Any() + any_message.Pack(msg) + + training_start_msg = OutgoingMessage() + training_start_msg.set_raw_bytes(any_message.SerializeToString()) + + super().queue_message_to_send(training_start_msg) + + def _behavior_uses_curriculum(self, behavior_name: str) -> bool: + if not self.run_options or not self.run_options.environment_parameters: + return False + + for param_settings in self.run_options.environment_parameters.values(): + for lesson in param_settings.curriculum: + cc = lesson.completion_criteria + if cc and cc.behavior == behavior_name: + return True + + return False diff --git a/MLPY/Lib/site-packages/mlagents/trainers/training_status.py b/MLPY/Lib/site-packages/mlagents/trainers/training_status.py new file mode 100644 index 0000000000000000000000000000000000000000..06bd73cd23af6e838675f815d859cc6e184a8f2b --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/training_status.py @@ -0,0 +1,118 @@ +from typing import Dict, Any +from enum import Enum +from collections import defaultdict +import json +import attr +import cattr + +from mlagents.torch_utils import torch +from mlagents_envs.logging_util import get_logger +from mlagents.trainers import __version__ +from mlagents.trainers.exception import TrainerError + +logger = get_logger(__name__) + +STATUS_FORMAT_VERSION = "0.3.0" + + +class StatusType(Enum): + LESSON_NUM = "lesson_num" + STATS_METADATA = "metadata" + CHECKPOINTS = "checkpoints" + FINAL_CHECKPOINT = "final_checkpoint" + ELO = "elo" + + +@attr.s(auto_attribs=True) +class StatusMetaData: + stats_format_version: str = STATUS_FORMAT_VERSION + mlagents_version: str = __version__ + torch_version: str = torch.__version__ + + def to_dict(self) -> Dict[str, str]: + return cattr.unstructure(self) + + @staticmethod + def from_dict(import_dict: Dict[str, str]) -> "StatusMetaData": + return cattr.structure(import_dict, StatusMetaData) + + def check_compatibility(self, other: "StatusMetaData") -> None: + """ + Check compatibility with a loaded StatsMetaData and warn the user + if versions mismatch. This is used for resuming from old checkpoints. + """ + # This should cover all stats version mismatches as well. + if self.mlagents_version != other.mlagents_version: + logger.warning( + "Checkpoint was loaded from a different version of ML-Agents. Some things may not resume properly." 
+ ) + if self.torch_version != other.torch_version: + logger.warning( + "PyTorch checkpoint was saved with a different version of PyTorch. Model may not resume properly." + ) + + +class GlobalTrainingStatus: + """ + GlobalTrainingStatus class that contains static methods to save global training status and + load it on a resume. These are values that might be needed for the training resume that + cannot/should not be captured in a model checkpoint, such as curriclum lesson. + """ + + saved_state: Dict[str, Dict[str, Any]] = defaultdict(lambda: {}) + + @staticmethod + def load_state(path: str) -> None: + """ + Load a JSON file that contains saved state. + :param path: Path to the JSON file containing the state. + """ + try: + with open(path) as f: + loaded_dict = json.load(f) + # Compare the metadata + _metadata = loaded_dict[StatusType.STATS_METADATA.value] + StatusMetaData.from_dict(_metadata).check_compatibility(StatusMetaData()) + # Update saved state. + GlobalTrainingStatus.saved_state.update(loaded_dict) + except FileNotFoundError: + logger.warning( + "Training status file not found. Not all functions will resume properly." + ) + except KeyError: + raise TrainerError( + "Metadata not found, resuming from an incompatible version of ML-Agents." + ) + + @staticmethod + def save_state(path: str) -> None: + """ + Save a JSON file that contains saved state. + :param path: Path to the JSON file containing the state. + """ + GlobalTrainingStatus.saved_state[ + StatusType.STATS_METADATA.value + ] = StatusMetaData().to_dict() + with open(path, "w") as f: + json.dump(GlobalTrainingStatus.saved_state, f, indent=4) + + @staticmethod + def set_parameter_state(category: str, key: StatusType, value: Any) -> None: + """ + Stores an arbitrary-named parameter in the global saved state. + :param category: The category (usually behavior name) of the parameter. + :param key: The parameter, e.g. lesson number. + :param value: The value. + """ + GlobalTrainingStatus.saved_state[category][key.value] = value + + @staticmethod + def get_parameter_state(category: str, key: StatusType) -> Any: + """ + Loads an arbitrary-named parameter from training_status.json. + If not found, returns None. + :param category: The category (usually behavior name) of the parameter. + :param key: The statistic, e.g. lesson number. + :param value: The value. + """ + return GlobalTrainingStatus.saved_state[category].get(key.value, None) diff --git a/MLPY/Lib/site-packages/mlagents/trainers/trajectory.py b/MLPY/Lib/site-packages/mlagents/trainers/trajectory.py new file mode 100644 index 0000000000000000000000000000000000000000..0a08bc24b497585357b4273b7d048a4218161ac0 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/trajectory.py @@ -0,0 +1,313 @@ +from typing import List, NamedTuple +import numpy as np + +from mlagents.trainers.buffer import ( + AgentBuffer, + ObservationKeyPrefix, + AgentBufferKey, + BufferKey, +) +from mlagents_envs.base_env import ActionTuple +from mlagents.trainers.torch_entities.action_log_probs import LogProbsTuple + + +class AgentStatus(NamedTuple): + """ + Stores observation, action, and reward for an agent. Does not have additional + fields that are present in AgentExperience. + """ + + obs: List[np.ndarray] + reward: float + action: ActionTuple + done: bool + + +class AgentExperience(NamedTuple): + """ + Stores the full amount of data for an agent in one timestep. Includes + the status' of group mates and the group reward, as well as the probabilities + outputted by the policy. 
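+
+    Illustrative construction sketch; shapes are placeholders and
+    ``LogProbsTuple()`` is assumed to accept empty construction in the same way
+    as ``ActionTuple``::
+
+        import numpy as np
+        exp = AgentExperience(
+            obs=[np.zeros(8, dtype=np.float32)], reward=0.0, done=False,
+            action=ActionTuple(continuous=np.zeros((1, 2), dtype=np.float32)),
+            action_probs=LogProbsTuple(), action_mask=None,
+            prev_action=np.zeros(2, dtype=np.float32), interrupted=False,
+            memory=np.zeros(0, dtype=np.float32), group_status=[], group_reward=0.0,
+        )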
+ """ + + obs: List[np.ndarray] + reward: float + done: bool + action: ActionTuple + action_probs: LogProbsTuple + action_mask: np.ndarray + prev_action: np.ndarray + interrupted: bool + memory: np.ndarray + group_status: List[AgentStatus] + group_reward: float + + +class ObsUtil: + @staticmethod + def get_name_at(index: int) -> AgentBufferKey: + """ + returns the name of the observation given the index of the observation + """ + return ObservationKeyPrefix.OBSERVATION, index + + @staticmethod + def get_name_at_next(index: int) -> AgentBufferKey: + """ + returns the name of the next observation given the index of the observation + """ + return ObservationKeyPrefix.NEXT_OBSERVATION, index + + @staticmethod + def from_buffer(batch: AgentBuffer, num_obs: int) -> List[np.array]: + """ + Creates the list of observations from an AgentBuffer + """ + result: List[np.array] = [] + for i in range(num_obs): + result.append(batch[ObsUtil.get_name_at(i)]) + return result + + @staticmethod + def from_buffer_next(batch: AgentBuffer, num_obs: int) -> List[np.array]: + """ + Creates the list of next observations from an AgentBuffer + """ + result = [] + for i in range(num_obs): + result.append(batch[ObsUtil.get_name_at_next(i)]) + return result + + +class GroupObsUtil: + @staticmethod + def get_name_at(index: int) -> AgentBufferKey: + """ + returns the name of the observation given the index of the observation + """ + return ObservationKeyPrefix.GROUP_OBSERVATION, index + + @staticmethod + def get_name_at_next(index: int) -> AgentBufferKey: + """ + returns the name of the next team observation given the index of the observation + """ + return ObservationKeyPrefix.NEXT_GROUP_OBSERVATION, index + + @staticmethod + def _transpose_list_of_lists( + list_list: List[List[np.ndarray]], + ) -> List[List[np.ndarray]]: + return list(map(list, zip(*list_list))) + + @staticmethod + def from_buffer(batch: AgentBuffer, num_obs: int) -> List[np.array]: + """ + Creates the list of observations from an AgentBuffer + """ + separated_obs: List[np.array] = [] + for i in range(num_obs): + separated_obs.append( + batch[GroupObsUtil.get_name_at(i)].padded_to_batch(pad_value=np.nan) + ) + # separated_obs contains a List(num_obs) of Lists(num_agents), we want to flip + # that and get a List(num_agents) of Lists(num_obs) + result = GroupObsUtil._transpose_list_of_lists(separated_obs) + return result + + @staticmethod + def from_buffer_next(batch: AgentBuffer, num_obs: int) -> List[np.array]: + """ + Creates the list of observations from an AgentBuffer + """ + separated_obs: List[np.array] = [] + for i in range(num_obs): + separated_obs.append( + batch[GroupObsUtil.get_name_at_next(i)].padded_to_batch( + pad_value=np.nan + ) + ) + # separated_obs contains a List(num_obs) of Lists(num_agents), we want to flip + # that and get a List(num_agents) of Lists(num_obs) + result = GroupObsUtil._transpose_list_of_lists(separated_obs) + return result + + +class Trajectory(NamedTuple): + steps: List[AgentExperience] + next_obs: List[ + np.ndarray + ] # Observation following the trajectory, for bootstrapping + next_group_obs: List[List[np.ndarray]] + agent_id: str + behavior_id: str + + def to_agentbuffer(self) -> AgentBuffer: + """ + Converts a Trajectory to an AgentBuffer + :param trajectory: A Trajectory + :returns: AgentBuffer. Note that the length of the AgentBuffer will be one + less than the trajectory, as the next observation need to be populated from the last + step of the trajectory. 
+ """ + agent_buffer_trajectory = AgentBuffer() + obs = self.steps[0].obs + for step, exp in enumerate(self.steps): + is_last_step = step == len(self.steps) - 1 + if not is_last_step: + next_obs = self.steps[step + 1].obs + else: + next_obs = self.next_obs + + num_obs = len(obs) + for i in range(num_obs): + agent_buffer_trajectory[ObsUtil.get_name_at(i)].append(obs[i]) + agent_buffer_trajectory[ObsUtil.get_name_at_next(i)].append(next_obs[i]) + + # Take care of teammate obs and actions + teammate_continuous_actions, teammate_discrete_actions, teammate_rewards = ( + [], + [], + [], + ) + for group_status in exp.group_status: + teammate_rewards.append(group_status.reward) + teammate_continuous_actions.append(group_status.action.continuous) + teammate_discrete_actions.append(group_status.action.discrete) + + # Team actions + agent_buffer_trajectory[BufferKey.GROUP_CONTINUOUS_ACTION].append( + teammate_continuous_actions + ) + agent_buffer_trajectory[BufferKey.GROUP_DISCRETE_ACTION].append( + teammate_discrete_actions + ) + agent_buffer_trajectory[BufferKey.GROUPMATE_REWARDS].append( + teammate_rewards + ) + agent_buffer_trajectory[BufferKey.GROUP_REWARD].append(exp.group_reward) + + # Next actions + teammate_cont_next_actions = [] + teammate_disc_next_actions = [] + if not is_last_step: + next_exp = self.steps[step + 1] + for group_status in next_exp.group_status: + teammate_cont_next_actions.append(group_status.action.continuous) + teammate_disc_next_actions.append(group_status.action.discrete) + else: + for group_status in exp.group_status: + teammate_cont_next_actions.append(group_status.action.continuous) + teammate_disc_next_actions.append(group_status.action.discrete) + + agent_buffer_trajectory[BufferKey.GROUP_NEXT_CONT_ACTION].append( + teammate_cont_next_actions + ) + agent_buffer_trajectory[BufferKey.GROUP_NEXT_DISC_ACTION].append( + teammate_disc_next_actions + ) + + for i in range(num_obs): + ith_group_obs = [] + for _group_status in exp.group_status: + # Assume teammates have same obs space + ith_group_obs.append(_group_status.obs[i]) + agent_buffer_trajectory[GroupObsUtil.get_name_at(i)].append( + ith_group_obs + ) + + ith_group_obs_next = [] + if is_last_step: + for _obs in self.next_group_obs: + ith_group_obs_next.append(_obs[i]) + else: + next_group_status = self.steps[step + 1].group_status + for _group_status in next_group_status: + # Assume teammates have same obs space + ith_group_obs_next.append(_group_status.obs[i]) + agent_buffer_trajectory[GroupObsUtil.get_name_at_next(i)].append( + ith_group_obs_next + ) + + if exp.memory is not None: + agent_buffer_trajectory[BufferKey.MEMORY].append(exp.memory) + + agent_buffer_trajectory[BufferKey.MASKS].append(1.0) + agent_buffer_trajectory[BufferKey.DONE].append(exp.done) + agent_buffer_trajectory[BufferKey.GROUP_DONES].append( + [_status.done for _status in exp.group_status] + ) + + # Adds the log prob and action of continuous/discrete separately + agent_buffer_trajectory[BufferKey.CONTINUOUS_ACTION].append( + exp.action.continuous + ) + agent_buffer_trajectory[BufferKey.DISCRETE_ACTION].append( + exp.action.discrete + ) + + if not is_last_step: + next_action = self.steps[step + 1].action + cont_next_actions = next_action.continuous + disc_next_actions = next_action.discrete + else: + cont_next_actions = np.zeros_like(exp.action.continuous) + disc_next_actions = np.zeros_like(exp.action.discrete) + + agent_buffer_trajectory[BufferKey.NEXT_CONT_ACTION].append( + cont_next_actions + ) + 
agent_buffer_trajectory[BufferKey.NEXT_DISC_ACTION].append( + disc_next_actions + ) + + agent_buffer_trajectory[BufferKey.CONTINUOUS_LOG_PROBS].append( + exp.action_probs.continuous + ) + agent_buffer_trajectory[BufferKey.DISCRETE_LOG_PROBS].append( + exp.action_probs.discrete + ) + + # Store action masks if necessary. Note that 1 means active, while + # in AgentExperience False means active. + if exp.action_mask is not None: + mask = 1 - np.concatenate(exp.action_mask) + agent_buffer_trajectory[BufferKey.ACTION_MASK].append( + mask, padding_value=1 + ) + else: + # This should never be needed unless the environment somehow doesn't supply the + # action mask in a discrete space. + + action_shape = exp.action.discrete.shape + agent_buffer_trajectory[BufferKey.ACTION_MASK].append( + np.ones(action_shape, dtype=np.float32), padding_value=1 + ) + agent_buffer_trajectory[BufferKey.PREV_ACTION].append(exp.prev_action) + agent_buffer_trajectory[BufferKey.ENVIRONMENT_REWARDS].append(exp.reward) + + # Store the next visual obs as the current + obs = next_obs + return agent_buffer_trajectory + + @property + def done_reached(self) -> bool: + """ + Returns true if trajectory is terminated with a Done. + """ + return self.steps[-1].done + + @property + def all_group_dones_reached(self) -> bool: + """ + Returns true if all other agents in this trajectory are done at the end of the trajectory. + Combine with done_reached to check if the whole team is done. + """ + return all(_status.done for _status in self.steps[-1].group_status) + + @property + def interrupted(self) -> bool: + """ + Returns true if trajectory was terminated because max steps was reached. + """ + return self.steps[-1].interrupted diff --git a/MLPY/Lib/site-packages/mlagents/trainers/upgrade_config.py b/MLPY/Lib/site-packages/mlagents/trainers/upgrade_config.py new file mode 100644 index 0000000000000000000000000000000000000000..d07ce0016d11076a8dad6d8ea206248ff1bacc4b --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents/trainers/upgrade_config.py @@ -0,0 +1,251 @@ +# NOTE: This upgrade script is a temporary measure for the transition between the old-format +# configuration file and the new format. It will be marked for deprecation once the +# Python CLI and configuration files are finalized, and removed the following release. + +import attr +import cattr +import yaml +from typing import Dict, Any, Optional +import argparse +from mlagents.trainers.settings import TrainerSettings, NetworkSettings +from mlagents.trainers.cli_utils import load_config +from mlagents.trainers.exception import TrainerConfigError +from mlagents.plugins import all_trainer_settings + + +# Take an existing trainer config (e.g. trainer_config.yaml) and turn it into the new format. +def convert_behaviors(old_trainer_config: Dict[str, Any]) -> Dict[str, Any]: + all_behavior_config_dict = {} + default_config = old_trainer_config.get("default", {}) + for behavior_name, config in old_trainer_config.items(): + if behavior_name != "default": + config = default_config.copy() + config.update(old_trainer_config[behavior_name]) + + # Convert to split TrainerSettings, Hyperparameters, NetworkSettings + # Set trainer_type and get appropriate hyperparameter settings + try: + trainer_type = config["trainer"] + except KeyError: + raise TrainerConfigError( + "Config doesn't specify a trainer type. " + "Please specify trainer: in your config." 
+ ) + new_config = {} + new_config["trainer_type"] = trainer_type + hyperparam_cls = all_trainer_settings[trainer_type] + # Try to absorb as much as possible into the hyperparam_cls + new_config["hyperparameters"] = cattr.structure(config, hyperparam_cls) + + # Try to absorb as much as possible into the network settings + new_config["network_settings"] = cattr.structure(config, NetworkSettings) + # Deal with recurrent + try: + if config["use_recurrent"]: + new_config[ + "network_settings" + ].memory = NetworkSettings.MemorySettings( + sequence_length=config["sequence_length"], + memory_size=config["memory_size"], + ) + except KeyError: + raise TrainerConfigError( + "Config doesn't specify use_recurrent. " + "Please specify true or false for use_recurrent in your config." + ) + # Absorb the rest into the base TrainerSettings + for key, val in config.items(): + if key in attr.fields_dict(TrainerSettings): + new_config[key] = val + + # Structure the whole thing + all_behavior_config_dict[behavior_name] = cattr.structure( + new_config, TrainerSettings + ) + return all_behavior_config_dict + + +def write_to_yaml_file(unstructed_config: Dict[str, Any], output_config: str) -> None: + with open(output_config, "w") as f: + try: + yaml.dump(unstructed_config, f, sort_keys=False) + except TypeError: # Older versions of pyyaml don't support sort_keys + yaml.dump(unstructed_config, f) + + +def remove_nones(config: Dict[Any, Any]) -> Dict[str, Any]: + new_config = {} + for key, val in config.items(): + if isinstance(val, dict): + new_config[key] = remove_nones(val) + elif val is not None: + new_config[key] = val + return new_config + + +# Take a sampler from the old format and convert to new sampler structure +def convert_samplers(old_sampler_config: Dict[str, Any]) -> Dict[str, Any]: + new_sampler_config: Dict[str, Any] = {} + for parameter, parameter_config in old_sampler_config.items(): + if parameter == "resampling-interval": + print( + "resampling-interval is no longer necessary for parameter randomization and is being ignored." + ) + continue + new_sampler_config[parameter] = {} + new_sampler_config[parameter]["sampler_type"] = parameter_config["sampler-type"] + new_samp_parameters = dict(parameter_config) # Copy dict + new_samp_parameters.pop("sampler-type") + new_sampler_config[parameter]["sampler_parameters"] = new_samp_parameters + return new_sampler_config + + +def convert_samplers_and_curriculum( + parameter_dict: Dict[str, Any], curriculum: Dict[str, Any] +) -> Dict[str, Any]: + for key, sampler in parameter_dict.items(): + if "sampler_parameters" not in sampler: + parameter_dict[key]["sampler_parameters"] = {} + for argument in [ + "seed", + "min_value", + "max_value", + "mean", + "st_dev", + "intervals", + ]: + if argument in sampler: + parameter_dict[key]["sampler_parameters"][argument] = sampler[argument] + parameter_dict[key].pop(argument) + param_set = set(parameter_dict.keys()) + for behavior_name, behavior_dict in curriculum.items(): + measure = behavior_dict["measure"] + min_lesson_length = behavior_dict.get("min_lesson_length", 1) + signal_smoothing = behavior_dict.get("signal_smoothing", False) + thresholds = behavior_dict["thresholds"] + num_lessons = len(thresholds) + 1 + parameters = behavior_dict["parameters"] + for param_name in parameters.keys(): + if param_name in param_set: + print( + f"The parameter {param_name} has both a sampler and a curriculum. 
Will ignore curriculum" + ) + else: + param_set.add(param_name) + parameter_dict[param_name] = {"curriculum": []} + for lesson_index in range(num_lessons - 1): + parameter_dict[param_name]["curriculum"].append( + { + f"Lesson{lesson_index}": { + "completion_criteria": { + "measure": measure, + "behavior": behavior_name, + "signal_smoothing": signal_smoothing, + "min_lesson_length": min_lesson_length, + "threshold": thresholds[lesson_index], + }, + "value": parameters[param_name][lesson_index], + } + } + ) + lesson_index += 1 # This is the last lesson + parameter_dict[param_name]["curriculum"].append( + { + f"Lesson{lesson_index}": { + "value": parameters[param_name][lesson_index] + } + } + ) + return parameter_dict + + +def parse_args(): + argparser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + argparser.add_argument( + "trainer_config_path", + help="Path to old format (<=0.18.X) trainer configuration YAML.", + ) + argparser.add_argument( + "--curriculum", + help="Path to old format (<=0.16.X) curriculum configuration YAML.", + default=None, + ) + argparser.add_argument( + "--sampler", + help="Path to old format (<=0.16.X) parameter randomization configuration YAML.", + default=None, + ) + argparser.add_argument( + "output_config_path", help="Path to write converted YAML file." + ) + args = argparser.parse_args() + return args + + +def convert( + config: Dict[str, Any], + old_curriculum: Optional[Dict[str, Any]], + old_param_random: Optional[Dict[str, Any]], +) -> Dict[str, Any]: + if "behaviors" not in config: + print("Config file format version : version <= 0.16.X") + behavior_config_dict = convert_behaviors(config) + full_config = {"behaviors": behavior_config_dict} + + # Convert curriculum and sampler. note that we don't validate these; if it was correct + # before it should be correct now. + if old_curriculum is not None: + full_config["curriculum"] = old_curriculum + + if old_param_random is not None: + sampler_config_dict = convert_samplers(old_param_random) + full_config["parameter_randomization"] = sampler_config_dict + + # Convert config to dict + config = cattr.unstructure(full_config) + if "curriculum" in config or "parameter_randomization" in config: + print("Config file format version : 0.16.X < version <= 0.18.X") + full_config = {"behaviors": config["behaviors"]} + + param_randomization = config.get("parameter_randomization", {}) + if "resampling-interval" in param_randomization: + param_randomization.pop("resampling-interval") + if len(param_randomization) > 0: + # check if we use the old format sampler-type vs sampler_type + if ( + "sampler-type" + in param_randomization[list(param_randomization.keys())[0]] + ): + param_randomization = convert_samplers(param_randomization) + + full_config["environment_parameters"] = convert_samplers_and_curriculum( + param_randomization, config.get("curriculum", {}) + ) + + # Convert config to dict + config = cattr.unstructure(full_config) + return config + + +def main() -> None: + args = parse_args() + print( + f"Converting {args.trainer_config_path} and saving to {args.output_config_path}." 
+ ) + + old_config = load_config(args.trainer_config_path) + curriculum_config_dict = None + old_sampler_config_dict = None + if args.curriculum is not None: + curriculum_config_dict = load_config(args.curriculum) + if args.sampler is not None: + old_sampler_config_dict = load_config(args.sampler) + new_config = convert(old_config, curriculum_config_dict, old_sampler_config_dict) + unstructed_config = remove_nones(new_config) + write_to_yaml_file(unstructed_config, args.output_config_path) + + +if __name__ == "__main__": + main() diff --git a/MLPY/Lib/site-packages/mlagents_envs/__init__.py b/MLPY/Lib/site-packages/mlagents_envs/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8476df42fa19998794ea2bcbe8c61682f45f5df5 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/__init__.py @@ -0,0 +1,5 @@ +# Version of the library that will be used to upload to pypi +__version__ = "0.30.0" + +# Git tag that will be checked to determine whether to trigger upload to pypi +__release_tag__ = "release_20" diff --git a/MLPY/Lib/site-packages/mlagents_envs/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d414c60900f0bf5969f19f90bfafa2d42d6a25fd Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/__pycache__/base_env.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/__pycache__/base_env.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a661c51e6eafcad8c6a9f627d6dabf73e4d65010 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/__pycache__/base_env.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/__pycache__/communicator.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/__pycache__/communicator.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..58704ce2657ef36a367919582913a9c3497cd756 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/__pycache__/communicator.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/__pycache__/env_utils.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/__pycache__/env_utils.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e6fe8671cc9f90efa7b736f754e17e17cd8e5b34 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/__pycache__/env_utils.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/__pycache__/environment.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/__pycache__/environment.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..47a4ef70a910b67fdb2896104d3fe6b8ed10de45 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/__pycache__/environment.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/__pycache__/exception.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/__pycache__/exception.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d2635a0f569a3be6c49005d9a936279b674163ba Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/__pycache__/exception.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/__pycache__/logging_util.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/__pycache__/logging_util.cpython-39.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..8d5c448e5ff9a6aee53675e0da116ee27df94ec5 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/__pycache__/logging_util.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/__pycache__/mock_communicator.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/__pycache__/mock_communicator.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c735ea63d0e503ea575e30b347a019060f23d471 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/__pycache__/mock_communicator.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/__pycache__/rpc_communicator.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/__pycache__/rpc_communicator.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..837666d9f77e9434408e60c9137aeb344cef8fbf Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/__pycache__/rpc_communicator.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/__pycache__/rpc_utils.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/__pycache__/rpc_utils.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2822e620b37ea65d8ff251f34ec866facdece9c2 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/__pycache__/rpc_utils.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/__pycache__/timers.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/__pycache__/timers.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..92d5cebde9fae7c7add491d217b42ccc57b47b16 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/__pycache__/timers.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/base_env.py b/MLPY/Lib/site-packages/mlagents_envs/base_env.py new file mode 100644 index 0000000000000000000000000000000000000000..a993d8a7c509c34d1a2621195c840e4c9e14a099 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/base_env.py @@ -0,0 +1,616 @@ +""" +Python Environment API for the ML-Agents Toolkit +The aim of this API is to expose Agents evolving in a simulation +to perform reinforcement learning on. +This API supports multi-agent scenarios and groups similar Agents (same +observations, actions spaces and behavior) together. These groups of Agents are +identified by their BehaviorName. +For performance reasons, the data of each group of agents is processed in a +batched manner. Agents are identified by a unique AgentId identifier that +allows tracking of Agents across simulation steps. Note that there is no +guarantee that the number or order of the Agents in the state will be +consistent across simulation steps. +A simulation steps corresponds to moving the simulation forward until at least +one agent in the simulation sends its observations to Python again. Since +Agents can request decisions at different frequencies, a simulation step does +not necessarily correspond to a fixed simulation time increment. +""" + +from abc import ABC, abstractmethod +from collections.abc import Mapping +from typing import ( + List, + NamedTuple, + Tuple, + Optional, + Dict, + Iterator, + Any, + Mapping as MappingType, +) +from enum import IntFlag, Enum +import numpy as np + +from mlagents_envs.exception import UnityActionException + +AgentId = int +GroupId = int +BehaviorName = str + + +class DecisionStep(NamedTuple): + """ + Contains the data a single Agent collected since the last + simulation step. 
+ - obs is a list of numpy arrays observations collected by the agent. + - reward is a float. Corresponds to the rewards collected by the agent + since the last simulation step. + - agent_id is an int and an unique identifier for the corresponding Agent. + - action_mask is an optional list of one dimensional array of booleans. + Only available when using multi-discrete actions. + Each array corresponds to an action branch. Each array contains a mask + for each action of the branch. If true, the action is not available for + the agent during this simulation step. + """ + + obs: List[np.ndarray] + reward: float + agent_id: AgentId + action_mask: Optional[List[np.ndarray]] + group_id: int + group_reward: float + + +class DecisionSteps(Mapping): + """ + Contains the data a batch of similar Agents collected since the last + simulation step. Note that all Agents do not necessarily have new + information to send at each simulation step. Therefore, the ordering of + agents and the batch size of the DecisionSteps are not fixed across + simulation steps. + - obs is a list of numpy arrays observations collected by the batch of + agent. Each obs has one extra dimension compared to DecisionStep: the + first dimension of the array corresponds to the batch size of the batch. + - reward is a float vector of length batch size. Corresponds to the + rewards collected by each agent since the last simulation step. + - agent_id is an int vector of length batch size containing unique + identifier for the corresponding Agent. This is used to track Agents + across simulation steps. + - action_mask is an optional list of two dimensional array of booleans. + Only available when using multi-discrete actions. + Each array corresponds to an action branch. The first dimension of each + array is the batch size and the second contains a mask for each action of + the branch. If true, the action is not available for the agent during + this simulation step. + """ + + def __init__(self, obs, reward, agent_id, action_mask, group_id, group_reward): + self.obs: List[np.ndarray] = obs + self.reward: np.ndarray = reward + self.agent_id: np.ndarray = agent_id + self.action_mask: Optional[List[np.ndarray]] = action_mask + self.group_id: np.ndarray = group_id + self.group_reward: np.ndarray = group_reward + self._agent_id_to_index: Optional[Dict[AgentId, int]] = None + + @property + def agent_id_to_index(self) -> Dict[AgentId, int]: + """ + :returns: A Dict that maps agent_id to the index of those agents in + this DecisionSteps. + """ + if self._agent_id_to_index is None: + self._agent_id_to_index = {} + for a_idx, a_id in enumerate(self.agent_id): + self._agent_id_to_index[a_id] = a_idx + return self._agent_id_to_index + + def __len__(self) -> int: + return len(self.agent_id) + + def __getitem__(self, agent_id: AgentId) -> DecisionStep: + """ + returns the DecisionStep for a specific agent. 
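+
+        Illustrative sketch; ``decision_steps`` is assumed to be a non-empty
+        DecisionSteps returned by ``BaseEnv.get_steps``. Lookup is keyed by
+        AgentId, not by positional index::
+
+            first_id = decision_steps.agent_id[0]
+            single_step = decision_steps[first_id]
+            print(single_step.reward, single_step.group_id)
+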
+ :param agent_id: The id of the agent + :returns: The DecisionStep + """ + if agent_id not in self.agent_id_to_index: + raise KeyError(f"agent_id {agent_id} is not present in the DecisionSteps") + agent_index = self._agent_id_to_index[agent_id] # type: ignore + agent_obs = [] + for batched_obs in self.obs: + agent_obs.append(batched_obs[agent_index]) + agent_mask = None + if self.action_mask is not None: + agent_mask = [] + for mask in self.action_mask: + agent_mask.append(mask[agent_index]) + group_id = self.group_id[agent_index] + return DecisionStep( + obs=agent_obs, + reward=self.reward[agent_index], + agent_id=agent_id, + action_mask=agent_mask, + group_id=group_id, + group_reward=self.group_reward[agent_index], + ) + + def __iter__(self) -> Iterator[Any]: + yield from self.agent_id + + @staticmethod + def empty(spec: "BehaviorSpec") -> "DecisionSteps": + """ + Returns an empty DecisionSteps. + :param spec: The BehaviorSpec for the DecisionSteps + """ + obs: List[np.ndarray] = [] + for sen_spec in spec.observation_specs: + obs += [np.zeros((0,) + sen_spec.shape, dtype=np.float32)] + return DecisionSteps( + obs=obs, + reward=np.zeros(0, dtype=np.float32), + agent_id=np.zeros(0, dtype=np.int32), + action_mask=None, + group_id=np.zeros(0, dtype=np.int32), + group_reward=np.zeros(0, dtype=np.float32), + ) + + +class TerminalStep(NamedTuple): + """ + Contains the data a single Agent collected when its episode ended. + - obs is a list of numpy arrays observations collected by the agent. + - reward is a float. Corresponds to the rewards collected by the agent + since the last simulation step. + - interrupted is a bool. Is true if the Agent was interrupted since the last + decision step. For example, if the Agent reached the maximum number of steps for + the episode. + - agent_id is an int and an unique identifier for the corresponding Agent. + """ + + obs: List[np.ndarray] + reward: float + interrupted: bool + agent_id: AgentId + group_id: GroupId + group_reward: float + + +class TerminalSteps(Mapping): + """ + Contains the data a batch of Agents collected when their episode + terminated. All Agents present in the TerminalSteps have ended their + episode. + - obs is a list of numpy arrays observations collected by the batch of + agent. Each obs has one extra dimension compared to DecisionStep: the + first dimension of the array corresponds to the batch size of the batch. + - reward is a float vector of length batch size. Corresponds to the + rewards collected by each agent since the last simulation step. + - interrupted is an array of booleans of length batch size. Is true if the + associated Agent was interrupted since the last decision step. For example, if the + Agent reached the maximum number of steps for the episode. + - agent_id is an int vector of length batch size containing unique + identifier for the corresponding Agent. This is used to track Agents + across simulation steps. + """ + + def __init__(self, obs, reward, interrupted, agent_id, group_id, group_reward): + self.obs: List[np.ndarray] = obs + self.reward: np.ndarray = reward + self.interrupted: np.ndarray = interrupted + self.agent_id: np.ndarray = agent_id + self.group_id: np.ndarray = group_id + self.group_reward: np.ndarray = group_reward + self._agent_id_to_index: Optional[Dict[AgentId, int]] = None + + @property + def agent_id_to_index(self) -> Dict[AgentId, int]: + """ + :returns: A Dict that maps agent_id to the index of those agents in + this TerminalSteps. 
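+
+        Illustrative sketch; ``terminal_steps`` and ``some_agent_id`` are assumed
+        placeholders for a populated TerminalSteps and an agent id it contains::
+
+            row = terminal_steps.agent_id_to_index[some_agent_id]
+            final_reward = terminal_steps.reward[row]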
+ """ + if self._agent_id_to_index is None: + self._agent_id_to_index = {} + for a_idx, a_id in enumerate(self.agent_id): + self._agent_id_to_index[a_id] = a_idx + return self._agent_id_to_index + + def __len__(self) -> int: + return len(self.agent_id) + + def __getitem__(self, agent_id: AgentId) -> TerminalStep: + """ + returns the TerminalStep for a specific agent. + :param agent_id: The id of the agent + :returns: obs, reward, done, agent_id and optional action mask for a + specific agent + """ + if agent_id not in self.agent_id_to_index: + raise KeyError(f"agent_id {agent_id} is not present in the TerminalSteps") + agent_index = self._agent_id_to_index[agent_id] # type: ignore + agent_obs = [] + for batched_obs in self.obs: + agent_obs.append(batched_obs[agent_index]) + group_id = self.group_id[agent_index] + return TerminalStep( + obs=agent_obs, + reward=self.reward[agent_index], + interrupted=self.interrupted[agent_index], + agent_id=agent_id, + group_id=group_id, + group_reward=self.group_reward[agent_index], + ) + + def __iter__(self) -> Iterator[Any]: + yield from self.agent_id + + @staticmethod + def empty(spec: "BehaviorSpec") -> "TerminalSteps": + """ + Returns an empty TerminalSteps. + :param spec: The BehaviorSpec for the TerminalSteps + """ + obs: List[np.ndarray] = [] + for sen_spec in spec.observation_specs: + obs += [np.zeros((0,) + sen_spec.shape, dtype=np.float32)] + return TerminalSteps( + obs=obs, + reward=np.zeros(0, dtype=np.float32), + interrupted=np.zeros(0, dtype=bool), + agent_id=np.zeros(0, dtype=np.int32), + group_id=np.zeros(0, dtype=np.int32), + group_reward=np.zeros(0, dtype=np.float32), + ) + + +class _ActionTupleBase(ABC): + """ + An object whose fields correspond to action data of continuous and discrete + spaces. Dimensions are of (n_agents, continuous_size) and (n_agents, discrete_size), + respectively. Note, this also holds when continuous or discrete size is + zero. + """ + + def __init__( + self, + continuous: Optional[np.ndarray] = None, + discrete: Optional[np.ndarray] = None, + ): + self._continuous: Optional[np.ndarray] = None + self._discrete: Optional[np.ndarray] = None + if continuous is not None: + self.add_continuous(continuous) + if discrete is not None: + self.add_discrete(discrete) + + @property + def continuous(self) -> np.ndarray: + return self._continuous + + @property + def discrete(self) -> np.ndarray: + return self._discrete + + def add_continuous(self, continuous: np.ndarray) -> None: + if continuous.dtype != np.float32: + continuous = continuous.astype(np.float32, copy=False) + if self._discrete is None: + self._discrete = np.zeros( + (continuous.shape[0], 0), dtype=self.discrete_dtype + ) + self._continuous = continuous + + def add_discrete(self, discrete: np.ndarray) -> None: + if discrete.dtype != self.discrete_dtype: + discrete = discrete.astype(self.discrete_dtype, copy=False) + if self._continuous is None: + self._continuous = np.zeros((discrete.shape[0], 0), dtype=np.float32) + self._discrete = discrete + + @property + @abstractmethod + def discrete_dtype(self) -> np.dtype: + pass + + +class ActionTuple(_ActionTupleBase): + """ + An object whose fields correspond to actions of different types. + Continuous and discrete actions are numpy arrays of type float32 and + int32, respectively and are type checked on construction. + Dimensions are of (n_agents, continuous_size) and (n_agents, discrete_size), + respectively. Note, this also holds when continuous or discrete size is + zero. 
+ """ + + @property + def discrete_dtype(self) -> np.dtype: + """ + The dtype of a discrete action. + """ + return np.int32 + + +class ActionSpec(NamedTuple): + """ + A NamedTuple containing utility functions and information about the action spaces + for a group of Agents under the same behavior. + - num_continuous_actions is an int corresponding to the number of floats which + constitute the action. + - discrete_branch_sizes is a Tuple of int where each int corresponds to + the number of discrete actions available to the agent on an independent action branch. + """ + + continuous_size: int + discrete_branches: Tuple[int, ...] + + def __eq__(self, other): + return ( + self.continuous_size == other.continuous_size + and self.discrete_branches == other.discrete_branches + ) + + def __str__(self): + return f"Continuous: {self.continuous_size}, Discrete: {self.discrete_branches}" + + # For backwards compatibility + def is_discrete(self) -> bool: + """ + Returns true if this Behavior uses discrete actions + """ + return self.discrete_size > 0 and self.continuous_size == 0 + + # For backwards compatibility + def is_continuous(self) -> bool: + """ + Returns true if this Behavior uses continuous actions + """ + return self.discrete_size == 0 and self.continuous_size > 0 + + @property + def discrete_size(self) -> int: + """ + Returns a an int corresponding to the number of discrete branches. + """ + return len(self.discrete_branches) + + def empty_action(self, n_agents: int) -> ActionTuple: + """ + Generates ActionTuple corresponding to an empty action (all zeros) + for a number of agents. + :param n_agents: The number of agents that will have actions generated + """ + _continuous = np.zeros((n_agents, self.continuous_size), dtype=np.float32) + _discrete = np.zeros((n_agents, self.discrete_size), dtype=np.int32) + return ActionTuple(continuous=_continuous, discrete=_discrete) + + def random_action(self, n_agents: int) -> ActionTuple: + """ + Generates ActionTuple corresponding to a random action (either discrete + or continuous) for a number of agents. + :param n_agents: The number of agents that will have actions generated + """ + _continuous = np.random.uniform( + low=-1.0, high=1.0, size=(n_agents, self.continuous_size) + ) + _discrete = np.zeros((n_agents, self.discrete_size), dtype=np.int32) + if self.discrete_size > 0: + _discrete = np.column_stack( + [ + np.random.randint( + 0, + self.discrete_branches[i], # type: ignore + size=(n_agents), + dtype=np.int32, + ) + for i in range(self.discrete_size) + ] + ) + return ActionTuple(continuous=_continuous, discrete=_discrete) + + def _validate_action( + self, actions: ActionTuple, n_agents: int, name: str + ) -> ActionTuple: + """ + Validates that action has the correct action dim + for the correct number of agents and ensures the type. 
+ """ + _expected_shape = (n_agents, self.continuous_size) + if actions.continuous.shape != _expected_shape: + raise UnityActionException( + f"The behavior {name} needs a continuous input of dimension " + f"{_expected_shape} for (, ) but " + f"received input of dimension {actions.continuous.shape}" + ) + _expected_shape = (n_agents, self.discrete_size) + if actions.discrete.shape != _expected_shape: + raise UnityActionException( + f"The behavior {name} needs a discrete input of dimension " + f"{_expected_shape} for (, ) but " + f"received input of dimension {actions.discrete.shape}" + ) + return actions + + @staticmethod + def create_continuous(continuous_size: int) -> "ActionSpec": + """ + Creates an ActionSpec that is homogenously continuous + """ + return ActionSpec(continuous_size, ()) + + @staticmethod + def create_discrete(discrete_branches: Tuple[int]) -> "ActionSpec": + """ + Creates an ActionSpec that is homogenously discrete + """ + return ActionSpec(0, discrete_branches) + + +class DimensionProperty(IntFlag): + """ + The dimension property of a dimension of an observation. + """ + + UNSPECIFIED = 0 + """ + No properties specified. + """ + + NONE = 1 + """ + No Property of the observation in that dimension. Observation can be processed with + Fully connected networks. + """ + + TRANSLATIONAL_EQUIVARIANCE = 2 + """ + Means it is suitable to do a convolution in this dimension. + """ + + VARIABLE_SIZE = 4 + """ + Means that there can be a variable number of observations in this dimension. + The observations are unordered. + """ + + +class ObservationType(Enum): + """ + An Enum which defines the type of information carried in the observation + of the agent. + """ + + DEFAULT = 0 + """ + Observation information is generic. + """ + + GOAL_SIGNAL = 1 + """ + Observation contains goal information for current task. + """ + + +class ObservationSpec(NamedTuple): + """ + A NamedTuple containing information about the observation of Agents. + - shape is a Tuple of int : It corresponds to the shape of + an observation's dimensions. + - dimension_property is a Tuple of DimensionProperties flag, one flag for each + dimension. + - observation_type is an enum of ObservationType. + """ + + shape: Tuple[int, ...] + dimension_property: Tuple[DimensionProperty, ...] + observation_type: ObservationType + + # Optional name. For observations coming from com.unity.ml-agents, this + # will be the ISensor name. + name: str + + +class BehaviorSpec(NamedTuple): + """ + A NamedTuple containing information about the observation and action + spaces for a group of Agents under the same behavior. + - observation_specs is a List of ObservationSpec NamedTuple containing + information about the information of the Agent's observations such as their shapes. + The order of the ObservationSpec is the same as the order of the observations of an + agent. + - action_spec is an ActionSpec NamedTuple. + """ + + observation_specs: List[ObservationSpec] + action_spec: ActionSpec + + +class BehaviorMapping(Mapping): + def __init__(self, specs: Dict[BehaviorName, BehaviorSpec]): + self._dict = specs + + def __len__(self) -> int: + return len(self._dict) + + def __getitem__(self, behavior: BehaviorName) -> BehaviorSpec: + return self._dict[behavior] + + def __iter__(self) -> Iterator[Any]: + yield from self._dict + + +class BaseEnv(ABC): + @abstractmethod + def step(self) -> None: + """ + Signals the environment that it must move the simulation forward + by one step. 
+ """ + + @abstractmethod + def reset(self) -> None: + """ + Signals the environment that it must reset the simulation. + """ + + @abstractmethod + def close(self) -> None: + """ + Signals the environment that it must close. + """ + + @property + @abstractmethod + def behavior_specs(self) -> MappingType[str, BehaviorSpec]: + """ + Returns a Mapping from behavior names to behavior specs. + Agents grouped under the same behavior name have the same action and + observation specs, and are expected to behave similarly in the + environment. + Note that new keys can be added to this mapping as new policies are instantiated. + """ + + @abstractmethod + def set_actions(self, behavior_name: BehaviorName, action: ActionTuple) -> None: + """ + Sets the action for all of the agents in the simulation for the next + step. The Actions must be in the same order as the order received in + the DecisionSteps. + :param behavior_name: The name of the behavior the agents are part of + :param action: ActionTuple tuple of continuous and/or discrete action. + Actions are np.arrays with dimensions (n_agents, continuous_size) and + (n_agents, discrete_size), respectively. + """ + + @abstractmethod + def set_action_for_agent( + self, behavior_name: BehaviorName, agent_id: AgentId, action: ActionTuple + ) -> None: + """ + Sets the action for one of the agents in the simulation for the next + step. + :param behavior_name: The name of the behavior the agent is part of + :param agent_id: The id of the agent the action is set for + :param action: ActionTuple tuple of continuous and/or discrete action + Actions are np.arrays with dimensions (1, continuous_size) and + (1, discrete_size), respectively. Note, this initial dimensions of 1 is because + this action is meant for a single agent. + """ + + @abstractmethod + def get_steps( + self, behavior_name: BehaviorName + ) -> Tuple[DecisionSteps, TerminalSteps]: + """ + Retrieves the steps of the agents that requested a step in the + simulation. + :param behavior_name: The name of the behavior the agents are part of + :return: A tuple containing : + - A DecisionSteps NamedTuple containing the observations, + the rewards, the agent ids and the action masks for the Agents + of the specified behavior. These Agents need an action this step. + - A TerminalSteps NamedTuple containing the observations, + rewards, agent ids and interrupted flags of the agents that had their + episode terminated last step. + """ diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator.py b/MLPY/Lib/site-packages/mlagents_envs/communicator.py new file mode 100644 index 0000000000000000000000000000000000000000..2223f34d3a4b4177b69867125b3e449be76f920d --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/communicator.py @@ -0,0 +1,43 @@ +from typing import Callable, Optional +from mlagents_envs.communicator_objects.unity_output_pb2 import UnityOutputProto +from mlagents_envs.communicator_objects.unity_input_pb2 import UnityInputProto + + +# Function to call while waiting for a connection timeout. +# This should raise an exception if it needs to break from waiting for the timeout. +PollCallback = Callable[[], None] + + +class Communicator: + def __init__(self, worker_id=0, base_port=5005): + """ + Python side of the communication. Must be used in pair with the right Unity Communicator equivalent. + + :int worker_id: Offset from base_port. Used for training multiple environments simultaneously. + :int base_port: Baseline port number to connect to Unity environment over. 
worker_id increments over this. + """ + + def initialize( + self, inputs: UnityInputProto, poll_callback: Optional[PollCallback] = None + ) -> UnityOutputProto: + """ + Used to exchange initialization parameters between Python and the Environment + :param inputs: The initialization input that will be sent to the environment. + :param poll_callback: Optional callback to be used while polling the connection. + :return: UnityOutput: The initialization output sent by Unity + """ + + def exchange( + self, inputs: UnityInputProto, poll_callback: Optional[PollCallback] = None + ) -> Optional[UnityOutputProto]: + """ + Used to send an input and receive an output from the Environment + :param inputs: The UnityInput that needs to be sent the Environment + :param poll_callback: Optional callback to be used while polling the connection. + :return: The UnityOutputs generated by the Environment + """ + + def close(self): + """ + Sends a shutdown signal to the unity environment, and closes the connection. + """ diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__init__.py b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e7038764d34f752991ce2d03ee964f55c9e90680 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/agent_action_pb2.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/agent_action_pb2.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..24add860faa1a29adf6e63c67a5264983f74bec2 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/agent_action_pb2.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/agent_info_action_pair_pb2.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/agent_info_action_pair_pb2.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..17af007ee4fcb18e37c22dd50e7d467c9d8d74ff Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/agent_info_action_pair_pb2.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/agent_info_pb2.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/agent_info_pb2.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4040c697d8c86f47a3fdeb35577de18d297bf591 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/agent_info_pb2.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/brain_parameters_pb2.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/brain_parameters_pb2.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..311ca4cd58ecbf93064cc036bb379f3443a24a16 Binary files /dev/null and 
b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/brain_parameters_pb2.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/capabilities_pb2.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/capabilities_pb2.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ed1b8882830eeae229036ec3cffc9637f7ea9ec7 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/capabilities_pb2.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/command_pb2.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/command_pb2.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9e95b23c2bf79e6c9bd8120348d2d28617832f94 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/command_pb2.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/custom_reset_parameters_pb2.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/custom_reset_parameters_pb2.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8f22821ca4a4014f62b0c3cca3baa4146f46bf61 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/custom_reset_parameters_pb2.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/demonstration_meta_pb2.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/demonstration_meta_pb2.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..83fd188b2d713866d5e8249bea43050b1065d3b0 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/demonstration_meta_pb2.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/engine_configuration_pb2.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/engine_configuration_pb2.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b512a56d7d63e3e6fb8332354c021fce6e4c7b7f Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/engine_configuration_pb2.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/header_pb2.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/header_pb2.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cb8038479bca969620525ed4da3e1ca6bd786af8 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/header_pb2.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/observation_pb2.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/observation_pb2.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fb68da15d1d4c4c4b90c3acbf4017e3624dc0ffc Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/observation_pb2.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/space_type_pb2.cpython-39.pyc 
b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/space_type_pb2.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7aa02e6acb319065909c57da144e84e0c0b02479 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/space_type_pb2.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/training_analytics_pb2.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/training_analytics_pb2.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a9a4879449b759e061e6545b8441939a6159fb85 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/training_analytics_pb2.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/unity_input_pb2.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/unity_input_pb2.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5409679a8464be7f18eb514c900e5cca489df03a Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/unity_input_pb2.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/unity_message_pb2.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/unity_message_pb2.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3b381db243df8c117b6cd4f8d7600aa66f0e7ec9 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/unity_message_pb2.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/unity_output_pb2.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/unity_output_pb2.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..88a2cb04996da7eeaa0c1288ab63fb1c70b2dfb2 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/unity_output_pb2.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/unity_rl_initialization_input_pb2.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/unity_rl_initialization_input_pb2.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..41ce1337547d8db9c636e5b5516059a3510d9953 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/unity_rl_initialization_input_pb2.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/unity_rl_initialization_output_pb2.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/unity_rl_initialization_output_pb2.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fb69cd615142586a4c979a38c9ecf7a1c494cad4 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/unity_rl_initialization_output_pb2.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/unity_rl_input_pb2.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/unity_rl_input_pb2.cpython-39.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..97a4657773a88ffeef3db1a13024eef35640d3cf Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/unity_rl_input_pb2.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/unity_rl_output_pb2.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/unity_rl_output_pb2.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3271602d1ee8044734d32768adf268a545d506af Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/unity_rl_output_pb2.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/unity_to_external_pb2.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/unity_to_external_pb2.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5a4b6ad090eecc3f169a795d36b7b9cacd113120 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/unity_to_external_pb2.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/unity_to_external_pb2_grpc.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/unity_to_external_pb2_grpc.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..17843953c51c87445026ca700680e44408aa584b Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/__pycache__/unity_to_external_pb2_grpc.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/agent_action_pb2.py b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/agent_action_pb2.py new file mode 100644 index 0000000000000000000000000000000000000000..7c57d8b4cb179db52449f32c638f841f4a2146a2 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/agent_action_pb2.py @@ -0,0 +1,92 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! 
+# source: mlagents_envs/communicator_objects/agent_action.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='mlagents_envs/communicator_objects/agent_action.proto', + package='communicator_objects', + syntax='proto3', + serialized_pb=_b('\n5mlagents_envs/communicator_objects/agent_action.proto\x12\x14\x63ommunicator_objects\"\x8c\x01\n\x10\x41gentActionProto\x12!\n\x19vector_actions_deprecated\x18\x01 \x03(\x02\x12\r\n\x05value\x18\x04 \x01(\x02\x12\x1a\n\x12\x63ontinuous_actions\x18\x06 \x03(\x02\x12\x18\n\x10\x64iscrete_actions\x18\x07 \x03(\x05J\x04\x08\x02\x10\x03J\x04\x08\x03\x10\x04J\x04\x08\x05\x10\x06\x42%\xaa\x02\"Unity.MLAgents.CommunicatorObjectsb\x06proto3') +) + + + + +_AGENTACTIONPROTO = _descriptor.Descriptor( + name='AgentActionProto', + full_name='communicator_objects.AgentActionProto', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='vector_actions_deprecated', full_name='communicator_objects.AgentActionProto.vector_actions_deprecated', index=0, + number=1, type=2, cpp_type=6, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='value', full_name='communicator_objects.AgentActionProto.value', index=1, + number=4, type=2, cpp_type=6, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='continuous_actions', full_name='communicator_objects.AgentActionProto.continuous_actions', index=2, + number=6, type=2, cpp_type=6, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='discrete_actions', full_name='communicator_objects.AgentActionProto.discrete_actions', index=3, + number=7, type=5, cpp_type=1, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=80, + serialized_end=220, +) + +DESCRIPTOR.message_types_by_name['AgentActionProto'] = _AGENTACTIONPROTO +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + +AgentActionProto = _reflection.GeneratedProtocolMessageType('AgentActionProto', (_message.Message,), dict( + DESCRIPTOR = _AGENTACTIONPROTO, + __module__ = 'mlagents_envs.communicator_objects.agent_action_pb2' + # @@protoc_insertion_point(class_scope:communicator_objects.AgentActionProto) + )) +_sym_db.RegisterMessage(AgentActionProto) + + +DESCRIPTOR.has_options = True +DESCRIPTOR._options = 
_descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\252\002\"Unity.MLAgents.CommunicatorObjects')) +# @@protoc_insertion_point(module_scope) diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/agent_info_action_pair_pb2.py b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/agent_info_action_pair_pb2.py new file mode 100644 index 0000000000000000000000000000000000000000..667b91a13dcc95d76f754920fa79c2e69a65a791 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/agent_info_action_pair_pb2.py @@ -0,0 +1,83 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! +# source: mlagents_envs/communicator_objects/agent_info_action_pair.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +from mlagents_envs.communicator_objects import agent_info_pb2 as mlagents__envs_dot_communicator__objects_dot_agent__info__pb2 +from mlagents_envs.communicator_objects import agent_action_pb2 as mlagents__envs_dot_communicator__objects_dot_agent__action__pb2 + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='mlagents_envs/communicator_objects/agent_info_action_pair.proto', + package='communicator_objects', + syntax='proto3', + serialized_pb=_b('\n?mlagents_envs/communicator_objects/agent_info_action_pair.proto\x12\x14\x63ommunicator_objects\x1a\x33mlagents_envs/communicator_objects/agent_info.proto\x1a\x35mlagents_envs/communicator_objects/agent_action.proto\"\x91\x01\n\x18\x41gentInfoActionPairProto\x12\x38\n\nagent_info\x18\x01 \x01(\x0b\x32$.communicator_objects.AgentInfoProto\x12;\n\x0b\x61\x63tion_info\x18\x02 \x01(\x0b\x32&.communicator_objects.AgentActionProtoB%\xaa\x02\"Unity.MLAgents.CommunicatorObjectsb\x06proto3') + , + dependencies=[mlagents__envs_dot_communicator__objects_dot_agent__info__pb2.DESCRIPTOR,mlagents__envs_dot_communicator__objects_dot_agent__action__pb2.DESCRIPTOR,]) + + + + +_AGENTINFOACTIONPAIRPROTO = _descriptor.Descriptor( + name='AgentInfoActionPairProto', + full_name='communicator_objects.AgentInfoActionPairProto', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='agent_info', full_name='communicator_objects.AgentInfoActionPairProto.agent_info', index=0, + number=1, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='action_info', full_name='communicator_objects.AgentInfoActionPairProto.action_info', index=1, + number=2, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=198, + serialized_end=343, +) + +_AGENTINFOACTIONPAIRPROTO.fields_by_name['agent_info'].message_type = 
mlagents__envs_dot_communicator__objects_dot_agent__info__pb2._AGENTINFOPROTO +_AGENTINFOACTIONPAIRPROTO.fields_by_name['action_info'].message_type = mlagents__envs_dot_communicator__objects_dot_agent__action__pb2._AGENTACTIONPROTO +DESCRIPTOR.message_types_by_name['AgentInfoActionPairProto'] = _AGENTINFOACTIONPAIRPROTO +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + +AgentInfoActionPairProto = _reflection.GeneratedProtocolMessageType('AgentInfoActionPairProto', (_message.Message,), dict( + DESCRIPTOR = _AGENTINFOACTIONPAIRPROTO, + __module__ = 'mlagents_envs.communicator_objects.agent_info_action_pair_pb2' + # @@protoc_insertion_point(class_scope:communicator_objects.AgentInfoActionPairProto) + )) +_sym_db.RegisterMessage(AgentInfoActionPairProto) + + +DESCRIPTOR.has_options = True +DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\252\002\"Unity.MLAgents.CommunicatorObjects')) +# @@protoc_insertion_point(module_scope) diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/agent_info_pb2.py b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/agent_info_pb2.py new file mode 100644 index 0000000000000000000000000000000000000000..57bb77aa575e0bfc5187b14cc3aec946046678e3 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/agent_info_pb2.py @@ -0,0 +1,123 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! +# source: mlagents_envs/communicator_objects/agent_info.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +from mlagents_envs.communicator_objects import observation_pb2 as mlagents__envs_dot_communicator__objects_dot_observation__pb2 + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='mlagents_envs/communicator_objects/agent_info.proto', + package='communicator_objects', + syntax='proto3', + serialized_pb=_b('\n3mlagents_envs/communicator_objects/agent_info.proto\x12\x14\x63ommunicator_objects\x1a\x34mlagents_envs/communicator_objects/observation.proto\"\xf9\x01\n\x0e\x41gentInfoProto\x12\x0e\n\x06reward\x18\x07 \x01(\x02\x12\x0c\n\x04\x64one\x18\x08 \x01(\x08\x12\x18\n\x10max_step_reached\x18\t \x01(\x08\x12\n\n\x02id\x18\n \x01(\x05\x12\x13\n\x0b\x61\x63tion_mask\x18\x0b \x03(\x08\x12<\n\x0cobservations\x18\r \x03(\x0b\x32&.communicator_objects.ObservationProto\x12\x10\n\x08group_id\x18\x0e \x01(\x05\x12\x14\n\x0cgroup_reward\x18\x0f \x01(\x02J\x04\x08\x01\x10\x02J\x04\x08\x02\x10\x03J\x04\x08\x03\x10\x04J\x04\x08\x04\x10\x05J\x04\x08\x05\x10\x06J\x04\x08\x06\x10\x07J\x04\x08\x0c\x10\rB%\xaa\x02\"Unity.MLAgents.CommunicatorObjectsb\x06proto3') + , + dependencies=[mlagents__envs_dot_communicator__objects_dot_observation__pb2.DESCRIPTOR,]) + + + + +_AGENTINFOPROTO = _descriptor.Descriptor( + name='AgentInfoProto', + full_name='communicator_objects.AgentInfoProto', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='reward', full_name='communicator_objects.AgentInfoProto.reward', index=0, + number=7, type=2, cpp_type=6, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + 
is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='done', full_name='communicator_objects.AgentInfoProto.done', index=1, + number=8, type=8, cpp_type=7, label=1, + has_default_value=False, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='max_step_reached', full_name='communicator_objects.AgentInfoProto.max_step_reached', index=2, + number=9, type=8, cpp_type=7, label=1, + has_default_value=False, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='id', full_name='communicator_objects.AgentInfoProto.id', index=3, + number=10, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='action_mask', full_name='communicator_objects.AgentInfoProto.action_mask', index=4, + number=11, type=8, cpp_type=7, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='observations', full_name='communicator_objects.AgentInfoProto.observations', index=5, + number=13, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='group_id', full_name='communicator_objects.AgentInfoProto.group_id', index=6, + number=14, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='group_reward', full_name='communicator_objects.AgentInfoProto.group_reward', index=7, + number=15, type=2, cpp_type=6, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=132, + serialized_end=381, +) + +_AGENTINFOPROTO.fields_by_name['observations'].message_type = mlagents__envs_dot_communicator__objects_dot_observation__pb2._OBSERVATIONPROTO +DESCRIPTOR.message_types_by_name['AgentInfoProto'] = _AGENTINFOPROTO +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + +AgentInfoProto = _reflection.GeneratedProtocolMessageType('AgentInfoProto', (_message.Message,), dict( + DESCRIPTOR = _AGENTINFOPROTO, + __module__ = 'mlagents_envs.communicator_objects.agent_info_pb2' + # @@protoc_insertion_point(class_scope:communicator_objects.AgentInfoProto) + )) +_sym_db.RegisterMessage(AgentInfoProto) + + +DESCRIPTOR.has_options = True +DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\252\002\"Unity.MLAgents.CommunicatorObjects')) +# @@protoc_insertion_point(module_scope) diff --git 
a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/brain_parameters_pb2.py b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/brain_parameters_pb2.py new file mode 100644 index 0000000000000000000000000000000000000000..ccd9140d3cc7b7f8ef9a4092480572fbd04d911e --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/brain_parameters_pb2.py @@ -0,0 +1,170 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! +# source: mlagents_envs/communicator_objects/brain_parameters.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +from mlagents_envs.communicator_objects import space_type_pb2 as mlagents__envs_dot_communicator__objects_dot_space__type__pb2 + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='mlagents_envs/communicator_objects/brain_parameters.proto', + package='communicator_objects', + syntax='proto3', + serialized_pb=_b('\n9mlagents_envs/communicator_objects/brain_parameters.proto\x12\x14\x63ommunicator_objects\x1a\x33mlagents_envs/communicator_objects/space_type.proto\"\x8b\x01\n\x0f\x41\x63tionSpecProto\x12\x1e\n\x16num_continuous_actions\x18\x01 \x01(\x05\x12\x1c\n\x14num_discrete_actions\x18\x02 \x01(\x05\x12\x1d\n\x15\x64iscrete_branch_sizes\x18\x03 \x03(\x05\x12\x1b\n\x13\x61\x63tion_descriptions\x18\x04 \x03(\t\"\xb6\x02\n\x14\x42rainParametersProto\x12%\n\x1dvector_action_size_deprecated\x18\x03 \x03(\x05\x12-\n%vector_action_descriptions_deprecated\x18\x05 \x03(\t\x12Q\n#vector_action_space_type_deprecated\x18\x06 \x01(\x0e\x32$.communicator_objects.SpaceTypeProto\x12\x12\n\nbrain_name\x18\x07 \x01(\t\x12\x13\n\x0bis_training\x18\x08 \x01(\x08\x12:\n\x0b\x61\x63tion_spec\x18\t \x01(\x0b\x32%.communicator_objects.ActionSpecProtoJ\x04\x08\x01\x10\x02J\x04\x08\x02\x10\x03J\x04\x08\x04\x10\x05\x42%\xaa\x02\"Unity.MLAgents.CommunicatorObjectsb\x06proto3') + , + dependencies=[mlagents__envs_dot_communicator__objects_dot_space__type__pb2.DESCRIPTOR,]) + + + + +_ACTIONSPECPROTO = _descriptor.Descriptor( + name='ActionSpecProto', + full_name='communicator_objects.ActionSpecProto', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='num_continuous_actions', full_name='communicator_objects.ActionSpecProto.num_continuous_actions', index=0, + number=1, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='num_discrete_actions', full_name='communicator_objects.ActionSpecProto.num_discrete_actions', index=1, + number=2, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='discrete_branch_sizes', full_name='communicator_objects.ActionSpecProto.discrete_branch_sizes', index=2, + number=3, type=5, cpp_type=1, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, 
+ is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='action_descriptions', full_name='communicator_objects.ActionSpecProto.action_descriptions', index=3, + number=4, type=9, cpp_type=9, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=137, + serialized_end=276, +) + + +_BRAINPARAMETERSPROTO = _descriptor.Descriptor( + name='BrainParametersProto', + full_name='communicator_objects.BrainParametersProto', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='vector_action_size_deprecated', full_name='communicator_objects.BrainParametersProto.vector_action_size_deprecated', index=0, + number=3, type=5, cpp_type=1, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='vector_action_descriptions_deprecated', full_name='communicator_objects.BrainParametersProto.vector_action_descriptions_deprecated', index=1, + number=5, type=9, cpp_type=9, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='vector_action_space_type_deprecated', full_name='communicator_objects.BrainParametersProto.vector_action_space_type_deprecated', index=2, + number=6, type=14, cpp_type=8, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='brain_name', full_name='communicator_objects.BrainParametersProto.brain_name', index=3, + number=7, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='is_training', full_name='communicator_objects.BrainParametersProto.is_training', index=4, + number=8, type=8, cpp_type=7, label=1, + has_default_value=False, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='action_spec', full_name='communicator_objects.BrainParametersProto.action_spec', index=5, + number=9, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=279, + serialized_end=589, +) + +_BRAINPARAMETERSPROTO.fields_by_name['vector_action_space_type_deprecated'].enum_type = mlagents__envs_dot_communicator__objects_dot_space__type__pb2._SPACETYPEPROTO 
+_BRAINPARAMETERSPROTO.fields_by_name['action_spec'].message_type = _ACTIONSPECPROTO +DESCRIPTOR.message_types_by_name['ActionSpecProto'] = _ACTIONSPECPROTO +DESCRIPTOR.message_types_by_name['BrainParametersProto'] = _BRAINPARAMETERSPROTO +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + +ActionSpecProto = _reflection.GeneratedProtocolMessageType('ActionSpecProto', (_message.Message,), dict( + DESCRIPTOR = _ACTIONSPECPROTO, + __module__ = 'mlagents_envs.communicator_objects.brain_parameters_pb2' + # @@protoc_insertion_point(class_scope:communicator_objects.ActionSpecProto) + )) +_sym_db.RegisterMessage(ActionSpecProto) + +BrainParametersProto = _reflection.GeneratedProtocolMessageType('BrainParametersProto', (_message.Message,), dict( + DESCRIPTOR = _BRAINPARAMETERSPROTO, + __module__ = 'mlagents_envs.communicator_objects.brain_parameters_pb2' + # @@protoc_insertion_point(class_scope:communicator_objects.BrainParametersProto) + )) +_sym_db.RegisterMessage(BrainParametersProto) + + +DESCRIPTOR.has_options = True +DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\252\002\"Unity.MLAgents.CommunicatorObjects')) +# @@protoc_insertion_point(module_scope) diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/capabilities_pb2.py b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/capabilities_pb2.py new file mode 100644 index 0000000000000000000000000000000000000000..35b8fbdef15752e0c75bdcd329afbc391d68e6f4 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/capabilities_pb2.py @@ -0,0 +1,113 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! +# source: mlagents_envs/communicator_objects/capabilities.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='mlagents_envs/communicator_objects/capabilities.proto', + package='communicator_objects', + syntax='proto3', + serialized_pb=_b('\n5mlagents_envs/communicator_objects/capabilities.proto\x12\x14\x63ommunicator_objects\"\xec\x01\n\x18UnityRLCapabilitiesProto\x12\x1a\n\x12\x62\x61seRLCapabilities\x18\x01 \x01(\x08\x12#\n\x1b\x63oncatenatedPngObservations\x18\x02 \x01(\x08\x12 \n\x18\x63ompressedChannelMapping\x18\x03 \x01(\x08\x12\x15\n\rhybridActions\x18\x04 \x01(\x08\x12\x19\n\x11trainingAnalytics\x18\x05 \x01(\x08\x12!\n\x19variableLengthObservation\x18\x06 \x01(\x08\x12\x18\n\x10multiAgentGroups\x18\x07 \x01(\x08\x42%\xaa\x02\"Unity.MLAgents.CommunicatorObjectsb\x06proto3') +) + + + + +_UNITYRLCAPABILITIESPROTO = _descriptor.Descriptor( + name='UnityRLCapabilitiesProto', + full_name='communicator_objects.UnityRLCapabilitiesProto', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='baseRLCapabilities', full_name='communicator_objects.UnityRLCapabilitiesProto.baseRLCapabilities', index=0, + number=1, type=8, cpp_type=7, label=1, + has_default_value=False, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + 
name='concatenatedPngObservations', full_name='communicator_objects.UnityRLCapabilitiesProto.concatenatedPngObservations', index=1, + number=2, type=8, cpp_type=7, label=1, + has_default_value=False, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='compressedChannelMapping', full_name='communicator_objects.UnityRLCapabilitiesProto.compressedChannelMapping', index=2, + number=3, type=8, cpp_type=7, label=1, + has_default_value=False, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='hybridActions', full_name='communicator_objects.UnityRLCapabilitiesProto.hybridActions', index=3, + number=4, type=8, cpp_type=7, label=1, + has_default_value=False, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='trainingAnalytics', full_name='communicator_objects.UnityRLCapabilitiesProto.trainingAnalytics', index=4, + number=5, type=8, cpp_type=7, label=1, + has_default_value=False, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='variableLengthObservation', full_name='communicator_objects.UnityRLCapabilitiesProto.variableLengthObservation', index=5, + number=6, type=8, cpp_type=7, label=1, + has_default_value=False, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='multiAgentGroups', full_name='communicator_objects.UnityRLCapabilitiesProto.multiAgentGroups', index=6, + number=7, type=8, cpp_type=7, label=1, + has_default_value=False, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=80, + serialized_end=316, +) + +DESCRIPTOR.message_types_by_name['UnityRLCapabilitiesProto'] = _UNITYRLCAPABILITIESPROTO +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + +UnityRLCapabilitiesProto = _reflection.GeneratedProtocolMessageType('UnityRLCapabilitiesProto', (_message.Message,), dict( + DESCRIPTOR = _UNITYRLCAPABILITIESPROTO, + __module__ = 'mlagents_envs.communicator_objects.capabilities_pb2' + # @@protoc_insertion_point(class_scope:communicator_objects.UnityRLCapabilitiesProto) + )) +_sym_db.RegisterMessage(UnityRLCapabilitiesProto) + + +DESCRIPTOR.has_options = True +DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\252\002\"Unity.MLAgents.CommunicatorObjects')) +# @@protoc_insertion_point(module_scope) diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/command_pb2.py b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/command_pb2.py new file mode 100644 index 0000000000000000000000000000000000000000..397f31098025a193a2365d150057ad1c3fc90c36 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/command_pb2.py @@ -0,0 +1,64 @@ +# 
Generated by the protocol buffer compiler. DO NOT EDIT! +# source: mlagents_envs/communicator_objects/command.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf.internal import enum_type_wrapper +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='mlagents_envs/communicator_objects/command.proto', + package='communicator_objects', + syntax='proto3', + serialized_pb=_b('\n0mlagents_envs/communicator_objects/command.proto\x12\x14\x63ommunicator_objects*-\n\x0c\x43ommandProto\x12\x08\n\x04STEP\x10\x00\x12\t\n\x05RESET\x10\x01\x12\x08\n\x04QUIT\x10\x02\x42%\xaa\x02\"Unity.MLAgents.CommunicatorObjectsb\x06proto3') +) + +_COMMANDPROTO = _descriptor.EnumDescriptor( + name='CommandProto', + full_name='communicator_objects.CommandProto', + filename=None, + file=DESCRIPTOR, + values=[ + _descriptor.EnumValueDescriptor( + name='STEP', index=0, number=0, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='RESET', index=1, number=1, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='QUIT', index=2, number=2, + options=None, + type=None), + ], + containing_type=None, + options=None, + serialized_start=74, + serialized_end=119, +) +_sym_db.RegisterEnumDescriptor(_COMMANDPROTO) + +CommandProto = enum_type_wrapper.EnumTypeWrapper(_COMMANDPROTO) +STEP = 0 +RESET = 1 +QUIT = 2 + + +DESCRIPTOR.enum_types_by_name['CommandProto'] = _COMMANDPROTO +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + + +DESCRIPTOR.has_options = True +DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\252\002\"Unity.MLAgents.CommunicatorObjects')) +# @@protoc_insertion_point(module_scope) diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/custom_reset_parameters_pb2.py b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/custom_reset_parameters_pb2.py new file mode 100644 index 0000000000000000000000000000000000000000..f4300d1016aa7e7dc6dcdea0f27fb21d641abb39 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/custom_reset_parameters_pb2.py @@ -0,0 +1,64 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! 
+# source: mlagents_envs/communicator_objects/custom_reset_parameters.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='mlagents_envs/communicator_objects/custom_reset_parameters.proto', + package='communicator_objects', + syntax='proto3', + serialized_pb=_b('\n@mlagents_envs/communicator_objects/custom_reset_parameters.proto\x12\x14\x63ommunicator_objects\"\x1c\n\x1a\x43ustomResetParametersProtoB%\xaa\x02\"Unity.MLAgents.CommunicatorObjectsb\x06proto3') +) + + + + +_CUSTOMRESETPARAMETERSPROTO = _descriptor.Descriptor( + name='CustomResetParametersProto', + full_name='communicator_objects.CustomResetParametersProto', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=90, + serialized_end=118, +) + +DESCRIPTOR.message_types_by_name['CustomResetParametersProto'] = _CUSTOMRESETPARAMETERSPROTO +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + +CustomResetParametersProto = _reflection.GeneratedProtocolMessageType('CustomResetParametersProto', (_message.Message,), dict( + DESCRIPTOR = _CUSTOMRESETPARAMETERSPROTO, + __module__ = 'mlagents_envs.communicator_objects.custom_reset_parameters_pb2' + # @@protoc_insertion_point(class_scope:communicator_objects.CustomResetParametersProto) + )) +_sym_db.RegisterMessage(CustomResetParametersProto) + + +DESCRIPTOR.has_options = True +DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\252\002\"Unity.MLAgents.CommunicatorObjects')) +# @@protoc_insertion_point(module_scope) diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/demonstration_meta_pb2.py b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/demonstration_meta_pb2.py new file mode 100644 index 0000000000000000000000000000000000000000..3b86fee455033d420e151518070a1e5174c1cd0a --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/demonstration_meta_pb2.py @@ -0,0 +1,99 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! 
+# source: mlagents_envs/communicator_objects/demonstration_meta.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='mlagents_envs/communicator_objects/demonstration_meta.proto', + package='communicator_objects', + syntax='proto3', + serialized_pb=_b('\n;mlagents_envs/communicator_objects/demonstration_meta.proto\x12\x14\x63ommunicator_objects\"\x8d\x01\n\x16\x44\x65monstrationMetaProto\x12\x13\n\x0b\x61pi_version\x18\x01 \x01(\x05\x12\x1a\n\x12\x64\x65monstration_name\x18\x02 \x01(\t\x12\x14\n\x0cnumber_steps\x18\x03 \x01(\x05\x12\x17\n\x0fnumber_episodes\x18\x04 \x01(\x05\x12\x13\n\x0bmean_reward\x18\x05 \x01(\x02\x42%\xaa\x02\"Unity.MLAgents.CommunicatorObjectsb\x06proto3') +) + + + + +_DEMONSTRATIONMETAPROTO = _descriptor.Descriptor( + name='DemonstrationMetaProto', + full_name='communicator_objects.DemonstrationMetaProto', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='api_version', full_name='communicator_objects.DemonstrationMetaProto.api_version', index=0, + number=1, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='demonstration_name', full_name='communicator_objects.DemonstrationMetaProto.demonstration_name', index=1, + number=2, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='number_steps', full_name='communicator_objects.DemonstrationMetaProto.number_steps', index=2, + number=3, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='number_episodes', full_name='communicator_objects.DemonstrationMetaProto.number_episodes', index=3, + number=4, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='mean_reward', full_name='communicator_objects.DemonstrationMetaProto.mean_reward', index=4, + number=5, type=2, cpp_type=6, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=86, + serialized_end=227, +) + +DESCRIPTOR.message_types_by_name['DemonstrationMetaProto'] = _DEMONSTRATIONMETAPROTO +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + +DemonstrationMetaProto = 
_reflection.GeneratedProtocolMessageType('DemonstrationMetaProto', (_message.Message,), dict( + DESCRIPTOR = _DEMONSTRATIONMETAPROTO, + __module__ = 'mlagents_envs.communicator_objects.demonstration_meta_pb2' + # @@protoc_insertion_point(class_scope:communicator_objects.DemonstrationMetaProto) + )) +_sym_db.RegisterMessage(DemonstrationMetaProto) + + +DESCRIPTOR.has_options = True +DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\252\002\"Unity.MLAgents.CommunicatorObjects')) +# @@protoc_insertion_point(module_scope) diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/engine_configuration_pb2.py b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/engine_configuration_pb2.py new file mode 100644 index 0000000000000000000000000000000000000000..9f4656ca8cb6048231b50892373ec94968b12622 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/engine_configuration_pb2.py @@ -0,0 +1,106 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! +# source: mlagents_envs/communicator_objects/engine_configuration.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='mlagents_envs/communicator_objects/engine_configuration.proto', + package='communicator_objects', + syntax='proto3', + serialized_pb=_b('\n=mlagents_envs/communicator_objects/engine_configuration.proto\x12\x14\x63ommunicator_objects\"\x95\x01\n\x18\x45ngineConfigurationProto\x12\r\n\x05width\x18\x01 \x01(\x05\x12\x0e\n\x06height\x18\x02 \x01(\x05\x12\x15\n\rquality_level\x18\x03 \x01(\x05\x12\x12\n\ntime_scale\x18\x04 \x01(\x02\x12\x19\n\x11target_frame_rate\x18\x05 \x01(\x05\x12\x14\n\x0cshow_monitor\x18\x06 \x01(\x08\x42%\xaa\x02\"Unity.MLAgents.CommunicatorObjectsb\x06proto3') +) + + + + +_ENGINECONFIGURATIONPROTO = _descriptor.Descriptor( + name='EngineConfigurationProto', + full_name='communicator_objects.EngineConfigurationProto', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='width', full_name='communicator_objects.EngineConfigurationProto.width', index=0, + number=1, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='height', full_name='communicator_objects.EngineConfigurationProto.height', index=1, + number=2, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='quality_level', full_name='communicator_objects.EngineConfigurationProto.quality_level', index=2, + number=3, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='time_scale', 
full_name='communicator_objects.EngineConfigurationProto.time_scale', index=3, + number=4, type=2, cpp_type=6, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='target_frame_rate', full_name='communicator_objects.EngineConfigurationProto.target_frame_rate', index=4, + number=5, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='show_monitor', full_name='communicator_objects.EngineConfigurationProto.show_monitor', index=5, + number=6, type=8, cpp_type=7, label=1, + has_default_value=False, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=88, + serialized_end=237, +) + +DESCRIPTOR.message_types_by_name['EngineConfigurationProto'] = _ENGINECONFIGURATIONPROTO +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + +EngineConfigurationProto = _reflection.GeneratedProtocolMessageType('EngineConfigurationProto', (_message.Message,), dict( + DESCRIPTOR = _ENGINECONFIGURATIONPROTO, + __module__ = 'mlagents_envs.communicator_objects.engine_configuration_pb2' + # @@protoc_insertion_point(class_scope:communicator_objects.EngineConfigurationProto) + )) +_sym_db.RegisterMessage(EngineConfigurationProto) + + +DESCRIPTOR.has_options = True +DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\252\002\"Unity.MLAgents.CommunicatorObjects')) +# @@protoc_insertion_point(module_scope) diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/header_pb2.py b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/header_pb2.py new file mode 100644 index 0000000000000000000000000000000000000000..5aa8eb89545b4c97e4eea1f3fd4c92e5cad2b463 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/header_pb2.py @@ -0,0 +1,78 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! 
+# source: mlagents_envs/communicator_objects/header.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='mlagents_envs/communicator_objects/header.proto', + package='communicator_objects', + syntax='proto3', + serialized_pb=_b('\n/mlagents_envs/communicator_objects/header.proto\x12\x14\x63ommunicator_objects\".\n\x0bHeaderProto\x12\x0e\n\x06status\x18\x01 \x01(\x05\x12\x0f\n\x07message\x18\x02 \x01(\tB%\xaa\x02\"Unity.MLAgents.CommunicatorObjectsb\x06proto3') +) + + + + +_HEADERPROTO = _descriptor.Descriptor( + name='HeaderProto', + full_name='communicator_objects.HeaderProto', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='status', full_name='communicator_objects.HeaderProto.status', index=0, + number=1, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='message', full_name='communicator_objects.HeaderProto.message', index=1, + number=2, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=73, + serialized_end=119, +) + +DESCRIPTOR.message_types_by_name['HeaderProto'] = _HEADERPROTO +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + +HeaderProto = _reflection.GeneratedProtocolMessageType('HeaderProto', (_message.Message,), dict( + DESCRIPTOR = _HEADERPROTO, + __module__ = 'mlagents_envs.communicator_objects.header_pb2' + # @@protoc_insertion_point(class_scope:communicator_objects.HeaderProto) + )) +_sym_db.RegisterMessage(HeaderProto) + + +DESCRIPTOR.has_options = True +DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\252\002\"Unity.MLAgents.CommunicatorObjects')) +# @@protoc_insertion_point(module_scope) diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/observation_pb2.py b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/observation_pb2.py new file mode 100644 index 0000000000000000000000000000000000000000..838ca1d87dabe02d7355fc627fe5b131196ab8bf --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/observation_pb2.py @@ -0,0 +1,224 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! 
+# source: mlagents_envs/communicator_objects/observation.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf.internal import enum_type_wrapper +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='mlagents_envs/communicator_objects/observation.proto', + package='communicator_objects', + syntax='proto3', + serialized_pb=_b('\n4mlagents_envs/communicator_objects/observation.proto\x12\x14\x63ommunicator_objects\"\x8f\x03\n\x10ObservationProto\x12\r\n\x05shape\x18\x01 \x03(\x05\x12\x44\n\x10\x63ompression_type\x18\x02 \x01(\x0e\x32*.communicator_objects.CompressionTypeProto\x12\x19\n\x0f\x63ompressed_data\x18\x03 \x01(\x0cH\x00\x12\x46\n\nfloat_data\x18\x04 \x01(\x0b\x32\x30.communicator_objects.ObservationProto.FloatDataH\x00\x12\"\n\x1a\x63ompressed_channel_mapping\x18\x05 \x03(\x05\x12\x1c\n\x14\x64imension_properties\x18\x06 \x03(\x05\x12\x44\n\x10observation_type\x18\x07 \x01(\x0e\x32*.communicator_objects.ObservationTypeProto\x12\x0c\n\x04name\x18\x08 \x01(\t\x1a\x19\n\tFloatData\x12\x0c\n\x04\x64\x61ta\x18\x01 \x03(\x02\x42\x12\n\x10observation_data*)\n\x14\x43ompressionTypeProto\x12\x08\n\x04NONE\x10\x00\x12\x07\n\x03PNG\x10\x01*@\n\x14ObservationTypeProto\x12\x0b\n\x07\x44\x45\x46\x41ULT\x10\x00\x12\x0f\n\x0bGOAL_SIGNAL\x10\x01\"\x04\x08\x02\x10\x02\"\x04\x08\x03\x10\x03\x42%\xaa\x02\"Unity.MLAgents.CommunicatorObjectsb\x06proto3') +) + +_COMPRESSIONTYPEPROTO = _descriptor.EnumDescriptor( + name='CompressionTypeProto', + full_name='communicator_objects.CompressionTypeProto', + filename=None, + file=DESCRIPTOR, + values=[ + _descriptor.EnumValueDescriptor( + name='NONE', index=0, number=0, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PNG', index=1, number=1, + options=None, + type=None), + ], + containing_type=None, + options=None, + serialized_start=480, + serialized_end=521, +) +_sym_db.RegisterEnumDescriptor(_COMPRESSIONTYPEPROTO) + +CompressionTypeProto = enum_type_wrapper.EnumTypeWrapper(_COMPRESSIONTYPEPROTO) +_OBSERVATIONTYPEPROTO = _descriptor.EnumDescriptor( + name='ObservationTypeProto', + full_name='communicator_objects.ObservationTypeProto', + filename=None, + file=DESCRIPTOR, + values=[ + _descriptor.EnumValueDescriptor( + name='DEFAULT', index=0, number=0, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='GOAL_SIGNAL', index=1, number=1, + options=None, + type=None), + ], + containing_type=None, + options=None, + serialized_start=523, + serialized_end=587, +) +_sym_db.RegisterEnumDescriptor(_OBSERVATIONTYPEPROTO) + +ObservationTypeProto = enum_type_wrapper.EnumTypeWrapper(_OBSERVATIONTYPEPROTO) +NONE = 0 +PNG = 1 +DEFAULT = 0 +GOAL_SIGNAL = 1 + + + +_OBSERVATIONPROTO_FLOATDATA = _descriptor.Descriptor( + name='FloatData', + full_name='communicator_objects.ObservationProto.FloatData', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='data', full_name='communicator_objects.ObservationProto.FloatData.data', index=0, + number=1, type=2, cpp_type=6, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, 
+ is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=433, + serialized_end=458, +) + +_OBSERVATIONPROTO = _descriptor.Descriptor( + name='ObservationProto', + full_name='communicator_objects.ObservationProto', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='shape', full_name='communicator_objects.ObservationProto.shape', index=0, + number=1, type=5, cpp_type=1, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='compression_type', full_name='communicator_objects.ObservationProto.compression_type', index=1, + number=2, type=14, cpp_type=8, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='compressed_data', full_name='communicator_objects.ObservationProto.compressed_data', index=2, + number=3, type=12, cpp_type=9, label=1, + has_default_value=False, default_value=_b(""), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='float_data', full_name='communicator_objects.ObservationProto.float_data', index=3, + number=4, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='compressed_channel_mapping', full_name='communicator_objects.ObservationProto.compressed_channel_mapping', index=4, + number=5, type=5, cpp_type=1, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='dimension_properties', full_name='communicator_objects.ObservationProto.dimension_properties', index=5, + number=6, type=5, cpp_type=1, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='observation_type', full_name='communicator_objects.ObservationProto.observation_type', index=6, + number=7, type=14, cpp_type=8, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='name', full_name='communicator_objects.ObservationProto.name', index=7, + number=8, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[_OBSERVATIONPROTO_FLOATDATA, ], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + 
_descriptor.OneofDescriptor( + name='observation_data', full_name='communicator_objects.ObservationProto.observation_data', + index=0, containing_type=None, fields=[]), + ], + serialized_start=79, + serialized_end=478, +) + +_OBSERVATIONPROTO_FLOATDATA.containing_type = _OBSERVATIONPROTO +_OBSERVATIONPROTO.fields_by_name['compression_type'].enum_type = _COMPRESSIONTYPEPROTO +_OBSERVATIONPROTO.fields_by_name['float_data'].message_type = _OBSERVATIONPROTO_FLOATDATA +_OBSERVATIONPROTO.fields_by_name['observation_type'].enum_type = _OBSERVATIONTYPEPROTO +_OBSERVATIONPROTO.oneofs_by_name['observation_data'].fields.append( + _OBSERVATIONPROTO.fields_by_name['compressed_data']) +_OBSERVATIONPROTO.fields_by_name['compressed_data'].containing_oneof = _OBSERVATIONPROTO.oneofs_by_name['observation_data'] +_OBSERVATIONPROTO.oneofs_by_name['observation_data'].fields.append( + _OBSERVATIONPROTO.fields_by_name['float_data']) +_OBSERVATIONPROTO.fields_by_name['float_data'].containing_oneof = _OBSERVATIONPROTO.oneofs_by_name['observation_data'] +DESCRIPTOR.message_types_by_name['ObservationProto'] = _OBSERVATIONPROTO +DESCRIPTOR.enum_types_by_name['CompressionTypeProto'] = _COMPRESSIONTYPEPROTO +DESCRIPTOR.enum_types_by_name['ObservationTypeProto'] = _OBSERVATIONTYPEPROTO +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + +ObservationProto = _reflection.GeneratedProtocolMessageType('ObservationProto', (_message.Message,), dict( + + FloatData = _reflection.GeneratedProtocolMessageType('FloatData', (_message.Message,), dict( + DESCRIPTOR = _OBSERVATIONPROTO_FLOATDATA, + __module__ = 'mlagents_envs.communicator_objects.observation_pb2' + # @@protoc_insertion_point(class_scope:communicator_objects.ObservationProto.FloatData) + )) + , + DESCRIPTOR = _OBSERVATIONPROTO, + __module__ = 'mlagents_envs.communicator_objects.observation_pb2' + # @@protoc_insertion_point(class_scope:communicator_objects.ObservationProto) + )) +_sym_db.RegisterMessage(ObservationProto) +_sym_db.RegisterMessage(ObservationProto.FloatData) + + +DESCRIPTOR.has_options = True +DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\252\002\"Unity.MLAgents.CommunicatorObjects')) +# @@protoc_insertion_point(module_scope) diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/space_type_pb2.py b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/space_type_pb2.py new file mode 100644 index 0000000000000000000000000000000000000000..df62c23319c2be5890aa5d021aad9f8590820dc3 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/space_type_pb2.py @@ -0,0 +1,59 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! 
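For reference, a minimal sketch of how the ObservationProto message generated above is typically built and round-tripped with the standard protobuf Python API. The import path follows the __module__ recorded in the generated file; the shape, data, and sensor name values are purely illustrative.

# Illustrative only; assumes the generated mlagents_envs package is importable.
from mlagents_envs.communicator_objects import observation_pb2

obs = observation_pb2.ObservationProto()
obs.shape.extend([84, 84, 3])                 # repeated int32 'shape' field
obs.compression_type = observation_pb2.PNG    # module-level enum constant (PNG = 1)
obs.compressed_data = b"\x89PNG..."           # selects the 'observation_data' oneof; 'float_data' is the other branch
obs.name = "CameraSensor"                     # illustrative sensor name

wire_bytes = obs.SerializeToString()
decoded = observation_pb2.ObservationProto()
decoded.ParseFromString(wire_bytes)
assert decoded.WhichOneof("observation_data") == "compressed_data"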
+# source: mlagents_envs/communicator_objects/space_type.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf.internal import enum_type_wrapper +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='mlagents_envs/communicator_objects/space_type.proto', + package='communicator_objects', + syntax='proto3', + serialized_pb=_b('\n3mlagents_envs/communicator_objects/space_type.proto\x12\x14\x63ommunicator_objects*.\n\x0eSpaceTypeProto\x12\x0c\n\x08\x64iscrete\x10\x00\x12\x0e\n\ncontinuous\x10\x01\x42%\xaa\x02\"Unity.MLAgents.CommunicatorObjectsb\x06proto3') +) + +_SPACETYPEPROTO = _descriptor.EnumDescriptor( + name='SpaceTypeProto', + full_name='communicator_objects.SpaceTypeProto', + filename=None, + file=DESCRIPTOR, + values=[ + _descriptor.EnumValueDescriptor( + name='discrete', index=0, number=0, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='continuous', index=1, number=1, + options=None, + type=None), + ], + containing_type=None, + options=None, + serialized_start=77, + serialized_end=123, +) +_sym_db.RegisterEnumDescriptor(_SPACETYPEPROTO) + +SpaceTypeProto = enum_type_wrapper.EnumTypeWrapper(_SPACETYPEPROTO) +discrete = 0 +continuous = 1 + + +DESCRIPTOR.enum_types_by_name['SpaceTypeProto'] = _SPACETYPEPROTO +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + + +DESCRIPTOR.has_options = True +DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\252\002\"Unity.MLAgents.CommunicatorObjects')) +# @@protoc_insertion_point(module_scope) diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/training_analytics_pb2.py b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/training_analytics_pb2.py new file mode 100644 index 0000000000000000000000000000000000000000..2701dac8586bf004fd61430e2c0aa55a9f412691 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/training_analytics_pb2.py @@ -0,0 +1,257 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! 
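The space_type module above declares only the SpaceTypeProto enum. A minimal sketch of how such a generated enum wrapper is typically queried; this is the standard protobuf EnumTypeWrapper API, nothing here is specific to ML-Agents.

# Illustrative only; assumes the generated module above is importable.
from mlagents_envs.communicator_objects import space_type_pb2

assert space_type_pb2.discrete == 0            # module-level constants set by the generated file
assert space_type_pb2.continuous == 1
assert space_type_pb2.SpaceTypeProto.Name(1) == "continuous"
assert space_type_pb2.SpaceTypeProto.Value("discrete") == 0
assert set(space_type_pb2.SpaceTypeProto.keys()) == {"discrete", "continuous"}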
+# source: mlagents_envs/communicator_objects/training_analytics.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='mlagents_envs/communicator_objects/training_analytics.proto', + package='communicator_objects', + syntax='proto3', + serialized_pb=_b('\n;mlagents_envs/communicator_objects/training_analytics.proto\x12\x14\x63ommunicator_objects\"\xee\x01\n\x1eTrainingEnvironmentInitialized\x12\x18\n\x10mlagents_version\x18\x01 \x01(\t\x12\x1d\n\x15mlagents_envs_version\x18\x02 \x01(\t\x12\x16\n\x0epython_version\x18\x03 \x01(\t\x12\x15\n\rtorch_version\x18\x04 \x01(\t\x12\x19\n\x11torch_device_type\x18\x05 \x01(\t\x12\x10\n\x08num_envs\x18\x06 \x01(\x05\x12\"\n\x1anum_environment_parameters\x18\x07 \x01(\x05\x12\x13\n\x0brun_options\x18\x08 \x01(\t\"\xbd\x03\n\x1bTrainingBehaviorInitialized\x12\x15\n\rbehavior_name\x18\x01 \x01(\t\x12\x14\n\x0ctrainer_type\x18\x02 \x01(\t\x12 \n\x18\x65xtrinsic_reward_enabled\x18\x03 \x01(\x08\x12\x1b\n\x13gail_reward_enabled\x18\x04 \x01(\x08\x12 \n\x18\x63uriosity_reward_enabled\x18\x05 \x01(\x08\x12\x1a\n\x12rnd_reward_enabled\x18\x06 \x01(\x08\x12\"\n\x1a\x62\x65havioral_cloning_enabled\x18\x07 \x01(\x08\x12\x19\n\x11recurrent_enabled\x18\x08 \x01(\x08\x12\x16\n\x0evisual_encoder\x18\t \x01(\t\x12\x1a\n\x12num_network_layers\x18\n \x01(\x05\x12 \n\x18num_network_hidden_units\x18\x0b \x01(\x05\x12\x18\n\x10trainer_threaded\x18\x0c \x01(\x08\x12\x19\n\x11self_play_enabled\x18\r \x01(\x08\x12\x1a\n\x12\x63urriculum_enabled\x18\x0e \x01(\x08\x12\x0e\n\x06\x63onfig\x18\x0f \x01(\tB%\xaa\x02\"Unity.MLAgents.CommunicatorObjectsb\x06proto3') +) + + + + +_TRAININGENVIRONMENTINITIALIZED = _descriptor.Descriptor( + name='TrainingEnvironmentInitialized', + full_name='communicator_objects.TrainingEnvironmentInitialized', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='mlagents_version', full_name='communicator_objects.TrainingEnvironmentInitialized.mlagents_version', index=0, + number=1, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='mlagents_envs_version', full_name='communicator_objects.TrainingEnvironmentInitialized.mlagents_envs_version', index=1, + number=2, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='python_version', full_name='communicator_objects.TrainingEnvironmentInitialized.python_version', index=2, + number=3, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='torch_version', 
full_name='communicator_objects.TrainingEnvironmentInitialized.torch_version', index=3, + number=4, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='torch_device_type', full_name='communicator_objects.TrainingEnvironmentInitialized.torch_device_type', index=4, + number=5, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='num_envs', full_name='communicator_objects.TrainingEnvironmentInitialized.num_envs', index=5, + number=6, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='num_environment_parameters', full_name='communicator_objects.TrainingEnvironmentInitialized.num_environment_parameters', index=6, + number=7, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='run_options', full_name='communicator_objects.TrainingEnvironmentInitialized.run_options', index=7, + number=8, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=86, + serialized_end=324, +) + + +_TRAININGBEHAVIORINITIALIZED = _descriptor.Descriptor( + name='TrainingBehaviorInitialized', + full_name='communicator_objects.TrainingBehaviorInitialized', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='behavior_name', full_name='communicator_objects.TrainingBehaviorInitialized.behavior_name', index=0, + number=1, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='trainer_type', full_name='communicator_objects.TrainingBehaviorInitialized.trainer_type', index=1, + number=2, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='extrinsic_reward_enabled', full_name='communicator_objects.TrainingBehaviorInitialized.extrinsic_reward_enabled', index=2, + number=3, type=8, cpp_type=7, label=1, + has_default_value=False, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='gail_reward_enabled', 
full_name='communicator_objects.TrainingBehaviorInitialized.gail_reward_enabled', index=3, + number=4, type=8, cpp_type=7, label=1, + has_default_value=False, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='curiosity_reward_enabled', full_name='communicator_objects.TrainingBehaviorInitialized.curiosity_reward_enabled', index=4, + number=5, type=8, cpp_type=7, label=1, + has_default_value=False, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='rnd_reward_enabled', full_name='communicator_objects.TrainingBehaviorInitialized.rnd_reward_enabled', index=5, + number=6, type=8, cpp_type=7, label=1, + has_default_value=False, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='behavioral_cloning_enabled', full_name='communicator_objects.TrainingBehaviorInitialized.behavioral_cloning_enabled', index=6, + number=7, type=8, cpp_type=7, label=1, + has_default_value=False, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='recurrent_enabled', full_name='communicator_objects.TrainingBehaviorInitialized.recurrent_enabled', index=7, + number=8, type=8, cpp_type=7, label=1, + has_default_value=False, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='visual_encoder', full_name='communicator_objects.TrainingBehaviorInitialized.visual_encoder', index=8, + number=9, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='num_network_layers', full_name='communicator_objects.TrainingBehaviorInitialized.num_network_layers', index=9, + number=10, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='num_network_hidden_units', full_name='communicator_objects.TrainingBehaviorInitialized.num_network_hidden_units', index=10, + number=11, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='trainer_threaded', full_name='communicator_objects.TrainingBehaviorInitialized.trainer_threaded', index=11, + number=12, type=8, cpp_type=7, label=1, + has_default_value=False, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='self_play_enabled', full_name='communicator_objects.TrainingBehaviorInitialized.self_play_enabled', index=12, + number=13, type=8, cpp_type=7, 
label=1, + has_default_value=False, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='curriculum_enabled', full_name='communicator_objects.TrainingBehaviorInitialized.curriculum_enabled', index=13, + number=14, type=8, cpp_type=7, label=1, + has_default_value=False, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='config', full_name='communicator_objects.TrainingBehaviorInitialized.config', index=14, + number=15, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=327, + serialized_end=772, +) + +DESCRIPTOR.message_types_by_name['TrainingEnvironmentInitialized'] = _TRAININGENVIRONMENTINITIALIZED +DESCRIPTOR.message_types_by_name['TrainingBehaviorInitialized'] = _TRAININGBEHAVIORINITIALIZED +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + +TrainingEnvironmentInitialized = _reflection.GeneratedProtocolMessageType('TrainingEnvironmentInitialized', (_message.Message,), dict( + DESCRIPTOR = _TRAININGENVIRONMENTINITIALIZED, + __module__ = 'mlagents_envs.communicator_objects.training_analytics_pb2' + # @@protoc_insertion_point(class_scope:communicator_objects.TrainingEnvironmentInitialized) + )) +_sym_db.RegisterMessage(TrainingEnvironmentInitialized) + +TrainingBehaviorInitialized = _reflection.GeneratedProtocolMessageType('TrainingBehaviorInitialized', (_message.Message,), dict( + DESCRIPTOR = _TRAININGBEHAVIORINITIALIZED, + __module__ = 'mlagents_envs.communicator_objects.training_analytics_pb2' + # @@protoc_insertion_point(class_scope:communicator_objects.TrainingBehaviorInitialized) + )) +_sym_db.RegisterMessage(TrainingBehaviorInitialized) + + +DESCRIPTOR.has_options = True +DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\252\002\"Unity.MLAgents.CommunicatorObjects')) +# @@protoc_insertion_point(module_scope) diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/unity_input_pb2.py b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/unity_input_pb2.py new file mode 100644 index 0000000000000000000000000000000000000000..6c4a24f3922f209ed928119b97c0f67ea38e7b15 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/unity_input_pb2.py @@ -0,0 +1,83 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! 
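For reference, a minimal sketch of how the two analytics messages generated above (TrainingEnvironmentInitialized and TrainingBehaviorInitialized) are typically populated. The version strings and the behavior name are illustrative placeholders, not values taken from this package.

# Illustrative only; field names come from the descriptors above.
from mlagents_envs.communicator_objects import training_analytics_pb2

env_init = training_analytics_pb2.TrainingEnvironmentInitialized()
env_init.mlagents_version = "x.y.z"         # placeholder version string
env_init.python_version = "3.9"             # placeholder version string
env_init.num_envs = 1

behavior_init = training_analytics_pb2.TrainingBehaviorInitialized()
behavior_init.behavior_name = "MyBehavior"  # hypothetical behavior name
behavior_init.trainer_type = "ppo"
behavior_init.extrinsic_reward_enabled = True

payload = env_init.SerializeToString()      # serialized bytes, ready to hand to the communicator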
+# source: mlagents_envs/communicator_objects/unity_input.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +from mlagents_envs.communicator_objects import unity_rl_input_pb2 as mlagents__envs_dot_communicator__objects_dot_unity__rl__input__pb2 +from mlagents_envs.communicator_objects import unity_rl_initialization_input_pb2 as mlagents__envs_dot_communicator__objects_dot_unity__rl__initialization__input__pb2 + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='mlagents_envs/communicator_objects/unity_input.proto', + package='communicator_objects', + syntax='proto3', + serialized_pb=_b('\n4mlagents_envs/communicator_objects/unity_input.proto\x12\x14\x63ommunicator_objects\x1a\x37mlagents_envs/communicator_objects/unity_rl_input.proto\x1a\x46mlagents_envs/communicator_objects/unity_rl_initialization_input.proto\"\xa4\x01\n\x0fUnityInputProto\x12\x39\n\x08rl_input\x18\x01 \x01(\x0b\x32\'.communicator_objects.UnityRLInputProto\x12V\n\x17rl_initialization_input\x18\x02 \x01(\x0b\x32\x35.communicator_objects.UnityRLInitializationInputProtoB%\xaa\x02\"Unity.MLAgents.CommunicatorObjectsb\x06proto3') + , + dependencies=[mlagents__envs_dot_communicator__objects_dot_unity__rl__input__pb2.DESCRIPTOR,mlagents__envs_dot_communicator__objects_dot_unity__rl__initialization__input__pb2.DESCRIPTOR,]) + + + + +_UNITYINPUTPROTO = _descriptor.Descriptor( + name='UnityInputProto', + full_name='communicator_objects.UnityInputProto', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='rl_input', full_name='communicator_objects.UnityInputProto.rl_input', index=0, + number=1, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='rl_initialization_input', full_name='communicator_objects.UnityInputProto.rl_initialization_input', index=1, + number=2, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=208, + serialized_end=372, +) + +_UNITYINPUTPROTO.fields_by_name['rl_input'].message_type = mlagents__envs_dot_communicator__objects_dot_unity__rl__input__pb2._UNITYRLINPUTPROTO +_UNITYINPUTPROTO.fields_by_name['rl_initialization_input'].message_type = mlagents__envs_dot_communicator__objects_dot_unity__rl__initialization__input__pb2._UNITYRLINITIALIZATIONINPUTPROTO +DESCRIPTOR.message_types_by_name['UnityInputProto'] = _UNITYINPUTPROTO +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + +UnityInputProto = _reflection.GeneratedProtocolMessageType('UnityInputProto', (_message.Message,), dict( + DESCRIPTOR = _UNITYINPUTPROTO, + __module__ = 'mlagents_envs.communicator_objects.unity_input_pb2' + # 
@@protoc_insertion_point(class_scope:communicator_objects.UnityInputProto) + )) +_sym_db.RegisterMessage(UnityInputProto) + + +DESCRIPTOR.has_options = True +DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\252\002\"Unity.MLAgents.CommunicatorObjects')) +# @@protoc_insertion_point(module_scope) diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/unity_message_pb2.py b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/unity_message_pb2.py new file mode 100644 index 0000000000000000000000000000000000000000..e01b52298f8b05f477802c53ea5ab5a9082f9b0e --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/unity_message_pb2.py @@ -0,0 +1,92 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! +# source: mlagents_envs/communicator_objects/unity_message.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +from mlagents_envs.communicator_objects import unity_output_pb2 as mlagents__envs_dot_communicator__objects_dot_unity__output__pb2 +from mlagents_envs.communicator_objects import unity_input_pb2 as mlagents__envs_dot_communicator__objects_dot_unity__input__pb2 +from mlagents_envs.communicator_objects import header_pb2 as mlagents__envs_dot_communicator__objects_dot_header__pb2 + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='mlagents_envs/communicator_objects/unity_message.proto', + package='communicator_objects', + syntax='proto3', + serialized_pb=_b('\n6mlagents_envs/communicator_objects/unity_message.proto\x12\x14\x63ommunicator_objects\x1a\x35mlagents_envs/communicator_objects/unity_output.proto\x1a\x34mlagents_envs/communicator_objects/unity_input.proto\x1a/mlagents_envs/communicator_objects/header.proto\"\xc0\x01\n\x11UnityMessageProto\x12\x31\n\x06header\x18\x01 \x01(\x0b\x32!.communicator_objects.HeaderProto\x12<\n\x0cunity_output\x18\x02 \x01(\x0b\x32&.communicator_objects.UnityOutputProto\x12:\n\x0bunity_input\x18\x03 \x01(\x0b\x32%.communicator_objects.UnityInputProtoB%\xaa\x02\"Unity.MLAgents.CommunicatorObjectsb\x06proto3') + , + dependencies=[mlagents__envs_dot_communicator__objects_dot_unity__output__pb2.DESCRIPTOR,mlagents__envs_dot_communicator__objects_dot_unity__input__pb2.DESCRIPTOR,mlagents__envs_dot_communicator__objects_dot_header__pb2.DESCRIPTOR,]) + + + + +_UNITYMESSAGEPROTO = _descriptor.Descriptor( + name='UnityMessageProto', + full_name='communicator_objects.UnityMessageProto', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='header', full_name='communicator_objects.UnityMessageProto.header', index=0, + number=1, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='unity_output', full_name='communicator_objects.UnityMessageProto.unity_output', index=1, + number=2, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, 
extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='unity_input', full_name='communicator_objects.UnityMessageProto.unity_input', index=2, + number=3, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=239, + serialized_end=431, +) + +_UNITYMESSAGEPROTO.fields_by_name['header'].message_type = mlagents__envs_dot_communicator__objects_dot_header__pb2._HEADERPROTO +_UNITYMESSAGEPROTO.fields_by_name['unity_output'].message_type = mlagents__envs_dot_communicator__objects_dot_unity__output__pb2._UNITYOUTPUTPROTO +_UNITYMESSAGEPROTO.fields_by_name['unity_input'].message_type = mlagents__envs_dot_communicator__objects_dot_unity__input__pb2._UNITYINPUTPROTO +DESCRIPTOR.message_types_by_name['UnityMessageProto'] = _UNITYMESSAGEPROTO +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + +UnityMessageProto = _reflection.GeneratedProtocolMessageType('UnityMessageProto', (_message.Message,), dict( + DESCRIPTOR = _UNITYMESSAGEPROTO, + __module__ = 'mlagents_envs.communicator_objects.unity_message_pb2' + # @@protoc_insertion_point(class_scope:communicator_objects.UnityMessageProto) + )) +_sym_db.RegisterMessage(UnityMessageProto) + + +DESCRIPTOR.has_options = True +DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\252\002\"Unity.MLAgents.CommunicatorObjects')) +# @@protoc_insertion_point(module_scope) diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/unity_output_pb2.py b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/unity_output_pb2.py new file mode 100644 index 0000000000000000000000000000000000000000..400fdb00f736a6b6a496b270b20bb65bdcc304f6 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/unity_output_pb2.py @@ -0,0 +1,83 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! 
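A minimal sketch of how the UnityMessageProto envelope generated just above composes the input messages. HeaderProto comes from header_pb2, which is not part of this hunk, so the 'status' field used below is an assumption; the seed and num_areas fields are defined in unity_rl_initialization_input_pb2 further below, and their values here are illustrative.

# Illustrative only; 'header.status' is assumed from header_pb2 (not shown in this diff).
from mlagents_envs.communicator_objects import unity_message_pb2

msg = unity_message_pb2.UnityMessageProto()
msg.header.status = 200                             # assumed HeaderProto field
msg.unity_input.rl_initialization_input.seed = 42   # nested messages are created on first write
msg.unity_input.rl_initialization_input.num_areas = 1
wire_bytes = msg.SerializeToString()

decoded = unity_message_pb2.UnityMessageProto()
decoded.ParseFromString(wire_bytes)
assert decoded.HasField("unity_input")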
+# source: mlagents_envs/communicator_objects/unity_output.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +from mlagents_envs.communicator_objects import unity_rl_output_pb2 as mlagents__envs_dot_communicator__objects_dot_unity__rl__output__pb2 +from mlagents_envs.communicator_objects import unity_rl_initialization_output_pb2 as mlagents__envs_dot_communicator__objects_dot_unity__rl__initialization__output__pb2 + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='mlagents_envs/communicator_objects/unity_output.proto', + package='communicator_objects', + syntax='proto3', + serialized_pb=_b('\n5mlagents_envs/communicator_objects/unity_output.proto\x12\x14\x63ommunicator_objects\x1a\x38mlagents_envs/communicator_objects/unity_rl_output.proto\x1aGmlagents_envs/communicator_objects/unity_rl_initialization_output.proto\"\xa9\x01\n\x10UnityOutputProto\x12;\n\trl_output\x18\x01 \x01(\x0b\x32(.communicator_objects.UnityRLOutputProto\x12X\n\x18rl_initialization_output\x18\x02 \x01(\x0b\x32\x36.communicator_objects.UnityRLInitializationOutputProtoB%\xaa\x02\"Unity.MLAgents.CommunicatorObjectsb\x06proto3') + , + dependencies=[mlagents__envs_dot_communicator__objects_dot_unity__rl__output__pb2.DESCRIPTOR,mlagents__envs_dot_communicator__objects_dot_unity__rl__initialization__output__pb2.DESCRIPTOR,]) + + + + +_UNITYOUTPUTPROTO = _descriptor.Descriptor( + name='UnityOutputProto', + full_name='communicator_objects.UnityOutputProto', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='rl_output', full_name='communicator_objects.UnityOutputProto.rl_output', index=0, + number=1, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='rl_initialization_output', full_name='communicator_objects.UnityOutputProto.rl_initialization_output', index=1, + number=2, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=211, + serialized_end=380, +) + +_UNITYOUTPUTPROTO.fields_by_name['rl_output'].message_type = mlagents__envs_dot_communicator__objects_dot_unity__rl__output__pb2._UNITYRLOUTPUTPROTO +_UNITYOUTPUTPROTO.fields_by_name['rl_initialization_output'].message_type = mlagents__envs_dot_communicator__objects_dot_unity__rl__initialization__output__pb2._UNITYRLINITIALIZATIONOUTPUTPROTO +DESCRIPTOR.message_types_by_name['UnityOutputProto'] = _UNITYOUTPUTPROTO +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + +UnityOutputProto = _reflection.GeneratedProtocolMessageType('UnityOutputProto', (_message.Message,), dict( + DESCRIPTOR = _UNITYOUTPUTPROTO, + __module__ = 'mlagents_envs.communicator_objects.unity_output_pb2' + # 
@@protoc_insertion_point(class_scope:communicator_objects.UnityOutputProto) + )) +_sym_db.RegisterMessage(UnityOutputProto) + + +DESCRIPTOR.has_options = True +DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\252\002\"Unity.MLAgents.CommunicatorObjects')) +# @@protoc_insertion_point(module_scope) diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/unity_rl_initialization_input_pb2.py b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/unity_rl_initialization_input_pb2.py new file mode 100644 index 0000000000000000000000000000000000000000..d111397adaa25c1d73a00291092fb98358ef0ff7 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/unity_rl_initialization_input_pb2.py @@ -0,0 +1,102 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! +# source: mlagents_envs/communicator_objects/unity_rl_initialization_input.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +from mlagents_envs.communicator_objects import capabilities_pb2 as mlagents__envs_dot_communicator__objects_dot_capabilities__pb2 + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='mlagents_envs/communicator_objects/unity_rl_initialization_input.proto', + package='communicator_objects', + syntax='proto3', + serialized_pb=_b('\nFmlagents_envs/communicator_objects/unity_rl_initialization_input.proto\x12\x14\x63ommunicator_objects\x1a\x35mlagents_envs/communicator_objects/capabilities.proto\"\xc0\x01\n\x1fUnityRLInitializationInputProto\x12\x0c\n\x04seed\x18\x01 \x01(\x05\x12\x1d\n\x15\x63ommunication_version\x18\x02 \x01(\t\x12\x17\n\x0fpackage_version\x18\x03 \x01(\t\x12\x44\n\x0c\x63\x61pabilities\x18\x04 \x01(\x0b\x32..communicator_objects.UnityRLCapabilitiesProto\x12\x11\n\tnum_areas\x18\x05 \x01(\x05\x42%\xaa\x02\"Unity.MLAgents.CommunicatorObjectsb\x06proto3') + , + dependencies=[mlagents__envs_dot_communicator__objects_dot_capabilities__pb2.DESCRIPTOR,]) + + + + +_UNITYRLINITIALIZATIONINPUTPROTO = _descriptor.Descriptor( + name='UnityRLInitializationInputProto', + full_name='communicator_objects.UnityRLInitializationInputProto', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='seed', full_name='communicator_objects.UnityRLInitializationInputProto.seed', index=0, + number=1, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='communication_version', full_name='communicator_objects.UnityRLInitializationInputProto.communication_version', index=1, + number=2, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='package_version', full_name='communicator_objects.UnityRLInitializationInputProto.package_version', index=2, + number=3, type=9, cpp_type=9, label=1, + has_default_value=False, 
default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='capabilities', full_name='communicator_objects.UnityRLInitializationInputProto.capabilities', index=3, + number=4, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='num_areas', full_name='communicator_objects.UnityRLInitializationInputProto.num_areas', index=4, + number=5, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=152, + serialized_end=344, +) + +_UNITYRLINITIALIZATIONINPUTPROTO.fields_by_name['capabilities'].message_type = mlagents__envs_dot_communicator__objects_dot_capabilities__pb2._UNITYRLCAPABILITIESPROTO +DESCRIPTOR.message_types_by_name['UnityRLInitializationInputProto'] = _UNITYRLINITIALIZATIONINPUTPROTO +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + +UnityRLInitializationInputProto = _reflection.GeneratedProtocolMessageType('UnityRLInitializationInputProto', (_message.Message,), dict( + DESCRIPTOR = _UNITYRLINITIALIZATIONINPUTPROTO, + __module__ = 'mlagents_envs.communicator_objects.unity_rl_initialization_input_pb2' + # @@protoc_insertion_point(class_scope:communicator_objects.UnityRLInitializationInputProto) + )) +_sym_db.RegisterMessage(UnityRLInitializationInputProto) + + +DESCRIPTOR.has_options = True +DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\252\002\"Unity.MLAgents.CommunicatorObjects')) +# @@protoc_insertion_point(module_scope) diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/unity_rl_initialization_output_pb2.py b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/unity_rl_initialization_output_pb2.py new file mode 100644 index 0000000000000000000000000000000000000000..155af96d825b5ee5ae1f1df3adaa0abf35c65db2 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/unity_rl_initialization_output_pb2.py @@ -0,0 +1,111 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! 
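For reference, a minimal sketch against the UnityRLInitializationOutputProto message that the file beginning here defines (its descriptor follows below). The name and version strings are illustrative, and the brain_parameters entries are BrainParametersProto messages from brain_parameters_pb2, which is not part of this hunk, so the entry is left empty.

# Illustrative only; BrainParametersProto field names are not shown in this hunk.
from mlagents_envs.communicator_objects import unity_rl_initialization_output_pb2

init_out = unity_rl_initialization_output_pb2.UnityRLInitializationOutputProto()
init_out.name = "Academy"                 # illustrative
init_out.communication_version = "1.5.0"  # illustrative
init_out.package_version = "0.0.0"        # illustrative
init_out.brain_parameters.add()           # appends an empty BrainParametersProto
assert len(init_out.brain_parameters) == 1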
+# source: mlagents_envs/communicator_objects/unity_rl_initialization_output.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +from mlagents_envs.communicator_objects import capabilities_pb2 as mlagents__envs_dot_communicator__objects_dot_capabilities__pb2 +from mlagents_envs.communicator_objects import brain_parameters_pb2 as mlagents__envs_dot_communicator__objects_dot_brain__parameters__pb2 + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='mlagents_envs/communicator_objects/unity_rl_initialization_output.proto', + package='communicator_objects', + syntax='proto3', + serialized_pb=_b('\nGmlagents_envs/communicator_objects/unity_rl_initialization_output.proto\x12\x14\x63ommunicator_objects\x1a\x35mlagents_envs/communicator_objects/capabilities.proto\x1a\x39mlagents_envs/communicator_objects/brain_parameters.proto\"\x8c\x02\n UnityRLInitializationOutputProto\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1d\n\x15\x63ommunication_version\x18\x02 \x01(\t\x12\x10\n\x08log_path\x18\x03 \x01(\t\x12\x44\n\x10\x62rain_parameters\x18\x05 \x03(\x0b\x32*.communicator_objects.BrainParametersProto\x12\x17\n\x0fpackage_version\x18\x07 \x01(\t\x12\x44\n\x0c\x63\x61pabilities\x18\x08 \x01(\x0b\x32..communicator_objects.UnityRLCapabilitiesProtoJ\x04\x08\x06\x10\x07\x42%\xaa\x02\"Unity.MLAgents.CommunicatorObjectsb\x06proto3') + , + dependencies=[mlagents__envs_dot_communicator__objects_dot_capabilities__pb2.DESCRIPTOR,mlagents__envs_dot_communicator__objects_dot_brain__parameters__pb2.DESCRIPTOR,]) + + + + +_UNITYRLINITIALIZATIONOUTPUTPROTO = _descriptor.Descriptor( + name='UnityRLInitializationOutputProto', + full_name='communicator_objects.UnityRLInitializationOutputProto', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='name', full_name='communicator_objects.UnityRLInitializationOutputProto.name', index=0, + number=1, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='communication_version', full_name='communicator_objects.UnityRLInitializationOutputProto.communication_version', index=1, + number=2, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='log_path', full_name='communicator_objects.UnityRLInitializationOutputProto.log_path', index=2, + number=3, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='brain_parameters', full_name='communicator_objects.UnityRLInitializationOutputProto.brain_parameters', index=3, + number=5, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + 
message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='package_version', full_name='communicator_objects.UnityRLInitializationOutputProto.package_version', index=4, + number=7, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='capabilities', full_name='communicator_objects.UnityRLInitializationOutputProto.capabilities', index=5, + number=8, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=212, + serialized_end=480, +) + +_UNITYRLINITIALIZATIONOUTPUTPROTO.fields_by_name['brain_parameters'].message_type = mlagents__envs_dot_communicator__objects_dot_brain__parameters__pb2._BRAINPARAMETERSPROTO +_UNITYRLINITIALIZATIONOUTPUTPROTO.fields_by_name['capabilities'].message_type = mlagents__envs_dot_communicator__objects_dot_capabilities__pb2._UNITYRLCAPABILITIESPROTO +DESCRIPTOR.message_types_by_name['UnityRLInitializationOutputProto'] = _UNITYRLINITIALIZATIONOUTPUTPROTO +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + +UnityRLInitializationOutputProto = _reflection.GeneratedProtocolMessageType('UnityRLInitializationOutputProto', (_message.Message,), dict( + DESCRIPTOR = _UNITYRLINITIALIZATIONOUTPUTPROTO, + __module__ = 'mlagents_envs.communicator_objects.unity_rl_initialization_output_pb2' + # @@protoc_insertion_point(class_scope:communicator_objects.UnityRLInitializationOutputProto) + )) +_sym_db.RegisterMessage(UnityRLInitializationOutputProto) + + +DESCRIPTOR.has_options = True +DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\252\002\"Unity.MLAgents.CommunicatorObjects')) +# @@protoc_insertion_point(module_scope) diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/unity_rl_input_pb2.py b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/unity_rl_input_pb2.py new file mode 100644 index 0000000000000000000000000000000000000000..69e14f3fd291f3cedd03446926fadfd2f4e589b8 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/unity_rl_input_pb2.py @@ -0,0 +1,179 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! 
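A minimal sketch of the map-of-actions pattern used by the UnityRLInputProto message that the file beginning here defines (descriptor below). The behavior-name key is a hypothetical example, and the AgentActionProto entries are left empty because agent_action_pb2 is not part of this hunk.

# Illustrative only; the map key is a hypothetical behavior name.
from mlagents_envs.communicator_objects import unity_rl_input_pb2

rl_input = unity_rl_input_pb2.UnityRLInputProto()
# agent_actions is a map<string, ListAgentActionProto>; indexing with a new key
# creates the entry, and value.add() appends an (empty) AgentActionProto.
behavior_actions = rl_input.agent_actions["MyBehavior?team=0"]
behavior_actions.value.add()
rl_input.side_channel = b""

assert len(rl_input.agent_actions) == 1
assert len(rl_input.agent_actions["MyBehavior?team=0"].value) == 1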
+# source: mlagents_envs/communicator_objects/unity_rl_input.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +from mlagents_envs.communicator_objects import agent_action_pb2 as mlagents__envs_dot_communicator__objects_dot_agent__action__pb2 +from mlagents_envs.communicator_objects import command_pb2 as mlagents__envs_dot_communicator__objects_dot_command__pb2 + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='mlagents_envs/communicator_objects/unity_rl_input.proto', + package='communicator_objects', + syntax='proto3', + serialized_pb=_b('\n7mlagents_envs/communicator_objects/unity_rl_input.proto\x12\x14\x63ommunicator_objects\x1a\x35mlagents_envs/communicator_objects/agent_action.proto\x1a\x30mlagents_envs/communicator_objects/command.proto\"\xfe\x02\n\x11UnityRLInputProto\x12P\n\ragent_actions\x18\x01 \x03(\x0b\x32\x39.communicator_objects.UnityRLInputProto.AgentActionsEntry\x12\x33\n\x07\x63ommand\x18\x04 \x01(\x0e\x32\".communicator_objects.CommandProto\x12\x14\n\x0cside_channel\x18\x05 \x01(\x0c\x1aM\n\x14ListAgentActionProto\x12\x35\n\x05value\x18\x01 \x03(\x0b\x32&.communicator_objects.AgentActionProto\x1aq\n\x11\x41gentActionsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12K\n\x05value\x18\x02 \x01(\x0b\x32<.communicator_objects.UnityRLInputProto.ListAgentActionProto:\x02\x38\x01J\x04\x08\x02\x10\x03J\x04\x08\x03\x10\x04\x42%\xaa\x02\"Unity.MLAgents.CommunicatorObjectsb\x06proto3') + , + dependencies=[mlagents__envs_dot_communicator__objects_dot_agent__action__pb2.DESCRIPTOR,mlagents__envs_dot_communicator__objects_dot_command__pb2.DESCRIPTOR,]) + + + + +_UNITYRLINPUTPROTO_LISTAGENTACTIONPROTO = _descriptor.Descriptor( + name='ListAgentActionProto', + full_name='communicator_objects.UnityRLInputProto.ListAgentActionProto', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='value', full_name='communicator_objects.UnityRLInputProto.ListAgentActionProto.value', index=0, + number=1, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=365, + serialized_end=442, +) + +_UNITYRLINPUTPROTO_AGENTACTIONSENTRY = _descriptor.Descriptor( + name='AgentActionsEntry', + full_name='communicator_objects.UnityRLInputProto.AgentActionsEntry', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='key', full_name='communicator_objects.UnityRLInputProto.AgentActionsEntry.key', index=0, + number=1, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='value', full_name='communicator_objects.UnityRLInputProto.AgentActionsEntry.value', index=1, + number=2, 
type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=_descriptor._ParseOptions(descriptor_pb2.MessageOptions(), _b('8\001')), + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=444, + serialized_end=557, +) + +_UNITYRLINPUTPROTO = _descriptor.Descriptor( + name='UnityRLInputProto', + full_name='communicator_objects.UnityRLInputProto', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='agent_actions', full_name='communicator_objects.UnityRLInputProto.agent_actions', index=0, + number=1, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='command', full_name='communicator_objects.UnityRLInputProto.command', index=1, + number=4, type=14, cpp_type=8, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='side_channel', full_name='communicator_objects.UnityRLInputProto.side_channel', index=2, + number=5, type=12, cpp_type=9, label=1, + has_default_value=False, default_value=_b(""), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[_UNITYRLINPUTPROTO_LISTAGENTACTIONPROTO, _UNITYRLINPUTPROTO_AGENTACTIONSENTRY, ], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=187, + serialized_end=569, +) + +_UNITYRLINPUTPROTO_LISTAGENTACTIONPROTO.fields_by_name['value'].message_type = mlagents__envs_dot_communicator__objects_dot_agent__action__pb2._AGENTACTIONPROTO +_UNITYRLINPUTPROTO_LISTAGENTACTIONPROTO.containing_type = _UNITYRLINPUTPROTO +_UNITYRLINPUTPROTO_AGENTACTIONSENTRY.fields_by_name['value'].message_type = _UNITYRLINPUTPROTO_LISTAGENTACTIONPROTO +_UNITYRLINPUTPROTO_AGENTACTIONSENTRY.containing_type = _UNITYRLINPUTPROTO +_UNITYRLINPUTPROTO.fields_by_name['agent_actions'].message_type = _UNITYRLINPUTPROTO_AGENTACTIONSENTRY +_UNITYRLINPUTPROTO.fields_by_name['command'].enum_type = mlagents__envs_dot_communicator__objects_dot_command__pb2._COMMANDPROTO +DESCRIPTOR.message_types_by_name['UnityRLInputProto'] = _UNITYRLINPUTPROTO +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + +UnityRLInputProto = _reflection.GeneratedProtocolMessageType('UnityRLInputProto', (_message.Message,), dict( + + ListAgentActionProto = _reflection.GeneratedProtocolMessageType('ListAgentActionProto', (_message.Message,), dict( + DESCRIPTOR = _UNITYRLINPUTPROTO_LISTAGENTACTIONPROTO, + __module__ = 'mlagents_envs.communicator_objects.unity_rl_input_pb2' + # @@protoc_insertion_point(class_scope:communicator_objects.UnityRLInputProto.ListAgentActionProto) + )) + , + + AgentActionsEntry = _reflection.GeneratedProtocolMessageType('AgentActionsEntry', (_message.Message,), dict( + DESCRIPTOR = _UNITYRLINPUTPROTO_AGENTACTIONSENTRY, + __module__ = 'mlagents_envs.communicator_objects.unity_rl_input_pb2' + # 
@@protoc_insertion_point(class_scope:communicator_objects.UnityRLInputProto.AgentActionsEntry) + )) + , + DESCRIPTOR = _UNITYRLINPUTPROTO, + __module__ = 'mlagents_envs.communicator_objects.unity_rl_input_pb2' + # @@protoc_insertion_point(class_scope:communicator_objects.UnityRLInputProto) + )) +_sym_db.RegisterMessage(UnityRLInputProto) +_sym_db.RegisterMessage(UnityRLInputProto.ListAgentActionProto) +_sym_db.RegisterMessage(UnityRLInputProto.AgentActionsEntry) + + +DESCRIPTOR.has_options = True +DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\252\002\"Unity.MLAgents.CommunicatorObjects')) +_UNITYRLINPUTPROTO_AGENTACTIONSENTRY.has_options = True +_UNITYRLINPUTPROTO_AGENTACTIONSENTRY._options = _descriptor._ParseOptions(descriptor_pb2.MessageOptions(), _b('8\001')) +# @@protoc_insertion_point(module_scope) diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/unity_rl_output_pb2.py b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/unity_rl_output_pb2.py new file mode 100644 index 0000000000000000000000000000000000000000..099d0b0040e98418950442adc34410e908a52a08 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/unity_rl_output_pb2.py @@ -0,0 +1,170 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! +# source: mlagents_envs/communicator_objects/unity_rl_output.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +from mlagents_envs.communicator_objects import agent_info_pb2 as mlagents__envs_dot_communicator__objects_dot_agent__info__pb2 + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='mlagents_envs/communicator_objects/unity_rl_output.proto', + package='communicator_objects', + syntax='proto3', + serialized_pb=_b('\n8mlagents_envs/communicator_objects/unity_rl_output.proto\x12\x14\x63ommunicator_objects\x1a\x33mlagents_envs/communicator_objects/agent_info.proto\"\xb9\x02\n\x12UnityRLOutputProto\x12L\n\nagentInfos\x18\x02 \x03(\x0b\x32\x38.communicator_objects.UnityRLOutputProto.AgentInfosEntry\x12\x14\n\x0cside_channel\x18\x03 \x01(\x0c\x1aI\n\x12ListAgentInfoProto\x12\x33\n\x05value\x18\x01 \x03(\x0b\x32$.communicator_objects.AgentInfoProto\x1an\n\x0f\x41gentInfosEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12J\n\x05value\x18\x02 \x01(\x0b\x32;.communicator_objects.UnityRLOutputProto.ListAgentInfoProto:\x02\x38\x01J\x04\x08\x01\x10\x02\x42%\xaa\x02\"Unity.MLAgents.CommunicatorObjectsb\x06proto3') + , + dependencies=[mlagents__envs_dot_communicator__objects_dot_agent__info__pb2.DESCRIPTOR,]) + + + + +_UNITYRLOUTPUTPROTO_LISTAGENTINFOPROTO = _descriptor.Descriptor( + name='ListAgentInfoProto', + full_name='communicator_objects.UnityRLOutputProto.ListAgentInfoProto', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='value', full_name='communicator_objects.UnityRLOutputProto.ListAgentInfoProto.value', index=0, + number=1, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ 
+ ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=258, + serialized_end=331, +) + +_UNITYRLOUTPUTPROTO_AGENTINFOSENTRY = _descriptor.Descriptor( + name='AgentInfosEntry', + full_name='communicator_objects.UnityRLOutputProto.AgentInfosEntry', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='key', full_name='communicator_objects.UnityRLOutputProto.AgentInfosEntry.key', index=0, + number=1, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='value', full_name='communicator_objects.UnityRLOutputProto.AgentInfosEntry.value', index=1, + number=2, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=_descriptor._ParseOptions(descriptor_pb2.MessageOptions(), _b('8\001')), + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=333, + serialized_end=443, +) + +_UNITYRLOUTPUTPROTO = _descriptor.Descriptor( + name='UnityRLOutputProto', + full_name='communicator_objects.UnityRLOutputProto', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='agentInfos', full_name='communicator_objects.UnityRLOutputProto.agentInfos', index=0, + number=2, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='side_channel', full_name='communicator_objects.UnityRLOutputProto.side_channel', index=1, + number=3, type=12, cpp_type=9, label=1, + has_default_value=False, default_value=_b(""), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[_UNITYRLOUTPUTPROTO_LISTAGENTINFOPROTO, _UNITYRLOUTPUTPROTO_AGENTINFOSENTRY, ], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=136, + serialized_end=449, +) + +_UNITYRLOUTPUTPROTO_LISTAGENTINFOPROTO.fields_by_name['value'].message_type = mlagents__envs_dot_communicator__objects_dot_agent__info__pb2._AGENTINFOPROTO +_UNITYRLOUTPUTPROTO_LISTAGENTINFOPROTO.containing_type = _UNITYRLOUTPUTPROTO +_UNITYRLOUTPUTPROTO_AGENTINFOSENTRY.fields_by_name['value'].message_type = _UNITYRLOUTPUTPROTO_LISTAGENTINFOPROTO +_UNITYRLOUTPUTPROTO_AGENTINFOSENTRY.containing_type = _UNITYRLOUTPUTPROTO +_UNITYRLOUTPUTPROTO.fields_by_name['agentInfos'].message_type = _UNITYRLOUTPUTPROTO_AGENTINFOSENTRY +DESCRIPTOR.message_types_by_name['UnityRLOutputProto'] = _UNITYRLOUTPUTPROTO +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + +UnityRLOutputProto = _reflection.GeneratedProtocolMessageType('UnityRLOutputProto', (_message.Message,), dict( + + ListAgentInfoProto = _reflection.GeneratedProtocolMessageType('ListAgentInfoProto', (_message.Message,), dict( + DESCRIPTOR = 
_UNITYRLOUTPUTPROTO_LISTAGENTINFOPROTO, + __module__ = 'mlagents_envs.communicator_objects.unity_rl_output_pb2' + # @@protoc_insertion_point(class_scope:communicator_objects.UnityRLOutputProto.ListAgentInfoProto) + )) + , + + AgentInfosEntry = _reflection.GeneratedProtocolMessageType('AgentInfosEntry', (_message.Message,), dict( + DESCRIPTOR = _UNITYRLOUTPUTPROTO_AGENTINFOSENTRY, + __module__ = 'mlagents_envs.communicator_objects.unity_rl_output_pb2' + # @@protoc_insertion_point(class_scope:communicator_objects.UnityRLOutputProto.AgentInfosEntry) + )) + , + DESCRIPTOR = _UNITYRLOUTPUTPROTO, + __module__ = 'mlagents_envs.communicator_objects.unity_rl_output_pb2' + # @@protoc_insertion_point(class_scope:communicator_objects.UnityRLOutputProto) + )) +_sym_db.RegisterMessage(UnityRLOutputProto) +_sym_db.RegisterMessage(UnityRLOutputProto.ListAgentInfoProto) +_sym_db.RegisterMessage(UnityRLOutputProto.AgentInfosEntry) + + +DESCRIPTOR.has_options = True +DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\252\002\"Unity.MLAgents.CommunicatorObjects')) +_UNITYRLOUTPUTPROTO_AGENTINFOSENTRY.has_options = True +_UNITYRLOUTPUTPROTO_AGENTINFOSENTRY._options = _descriptor._ParseOptions(descriptor_pb2.MessageOptions(), _b('8\001')) +# @@protoc_insertion_point(module_scope) diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/unity_to_external_pb2.py b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/unity_to_external_pb2.py new file mode 100644 index 0000000000000000000000000000000000000000..6053ccc46a4b550649b8f20238b15117f8748e88 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/unity_to_external_pb2.py @@ -0,0 +1,58 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! 
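As a quick orientation to the generated map types above, here is a minimal sketch (the behavior name and field values are illustrative only, not taken from this codebase) of how UnityRLOutputProto's agentInfos map can be populated and read back with the standard protobuf Python API:

from mlagents_envs.communicator_objects.unity_rl_output_pb2 import UnityRLOutputProto
from mlagents_envs.communicator_objects.agent_info_pb2 import AgentInfoProto

out = UnityRLOutputProto()
# Map entries are created on first access; "MyBehavior?team=0" is a hypothetical behavior name.
out.agentInfos["MyBehavior?team=0"].value.append(AgentInfoProto(id=7, reward=1.0))
payload = out.SerializeToString()
decoded = UnityRLOutputProto.FromString(payload)
for behavior, infos in decoded.agentInfos.items():
    print(behavior, [agent.id for agent in infos.value])

In practice these messages are produced by the C# side and only deserialized here.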
+# source: mlagents_envs/communicator_objects/unity_to_external.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +from mlagents_envs.communicator_objects import unity_message_pb2 as mlagents__envs_dot_communicator__objects_dot_unity__message__pb2 + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='mlagents_envs/communicator_objects/unity_to_external.proto', + package='communicator_objects', + syntax='proto3', + serialized_pb=_b('\n:mlagents_envs/communicator_objects/unity_to_external.proto\x12\x14\x63ommunicator_objects\x1a\x36mlagents_envs/communicator_objects/unity_message.proto2v\n\x14UnityToExternalProto\x12^\n\x08\x45xchange\x12\'.communicator_objects.UnityMessageProto\x1a\'.communicator_objects.UnityMessageProto\"\x00\x42%\xaa\x02\"Unity.MLAgents.CommunicatorObjectsb\x06proto3') + , + dependencies=[mlagents__envs_dot_communicator__objects_dot_unity__message__pb2.DESCRIPTOR,]) + + + +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + + +DESCRIPTOR.has_options = True +DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\252\002\"Unity.MLAgents.CommunicatorObjects')) + +_UNITYTOEXTERNALPROTO = _descriptor.ServiceDescriptor( + name='UnityToExternalProto', + full_name='communicator_objects.UnityToExternalProto', + file=DESCRIPTOR, + index=0, + options=None, + serialized_start=140, + serialized_end=258, + methods=[ + _descriptor.MethodDescriptor( + name='Exchange', + full_name='communicator_objects.UnityToExternalProto.Exchange', + index=0, + containing_service=None, + input_type=mlagents__envs_dot_communicator__objects_dot_unity__message__pb2._UNITYMESSAGEPROTO, + output_type=mlagents__envs_dot_communicator__objects_dot_unity__message__pb2._UNITYMESSAGEPROTO, + options=None, + ), +]) +_sym_db.RegisterServiceDescriptor(_UNITYTOEXTERNALPROTO) + +DESCRIPTOR.services_by_name['UnityToExternalProto'] = _UNITYTOEXTERNALPROTO + +# @@protoc_insertion_point(module_scope) diff --git a/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/unity_to_external_pb2_grpc.py b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/unity_to_external_pb2_grpc.py new file mode 100644 index 0000000000000000000000000000000000000000..bedeb100594c7ba9728c2118b526a18de5f2421d --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/communicator_objects/unity_to_external_pb2_grpc.py @@ -0,0 +1,46 @@ +# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! +import grpc + +from mlagents_envs.communicator_objects import unity_message_pb2 as mlagents__envs_dot_communicator__objects_dot_unity__message__pb2 + + +class UnityToExternalProtoStub(object): + # missing associated documentation comment in .proto file + pass + + def __init__(self, channel): + """Constructor. + + Args: + channel: A grpc.Channel. 
+ """ + self.Exchange = channel.unary_unary( + '/communicator_objects.UnityToExternalProto/Exchange', + request_serializer=mlagents__envs_dot_communicator__objects_dot_unity__message__pb2.UnityMessageProto.SerializeToString, + response_deserializer=mlagents__envs_dot_communicator__objects_dot_unity__message__pb2.UnityMessageProto.FromString, + ) + + +class UnityToExternalProtoServicer(object): + # missing associated documentation comment in .proto file + pass + + def Exchange(self, request, context): + """Sends the academy parameters + """ + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + + +def add_UnityToExternalProtoServicer_to_server(servicer, server): + rpc_method_handlers = { + 'Exchange': grpc.unary_unary_rpc_method_handler( + servicer.Exchange, + request_deserializer=mlagents__envs_dot_communicator__objects_dot_unity__message__pb2.UnityMessageProto.FromString, + response_serializer=mlagents__envs_dot_communicator__objects_dot_unity__message__pb2.UnityMessageProto.SerializeToString, + ), + } + generic_handler = grpc.method_handlers_generic_handler( + 'communicator_objects.UnityToExternalProto', rpc_method_handlers) + server.add_generic_rpc_handlers((generic_handler,)) diff --git a/MLPY/Lib/site-packages/mlagents_envs/env_utils.py b/MLPY/Lib/site-packages/mlagents_envs/env_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..73393d945617a2e3ac0c3489939ae9575d5d8d80 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/env_utils.py @@ -0,0 +1,130 @@ +import glob +import os +import subprocess +from sys import platform +from typing import Optional, List +from mlagents_envs.logging_util import get_logger, DEBUG +from mlagents_envs.exception import UnityEnvironmentException + + +logger = get_logger(__name__) + + +def get_platform(): + """ + returns the platform of the operating system : linux, darwin or win32 + """ + return platform + + +def validate_environment_path(env_path: str) -> Optional[str]: + """ + Strip out executable extensions of the env_path + :param env_path: The path to the executable + """ + env_path = ( + env_path.strip() + .replace(".app", "") + .replace(".exe", "") + .replace(".x86_64", "") + .replace(".x86", "") + ) + true_filename = os.path.basename(os.path.normpath(env_path)) + logger.debug(f"The true file name is {true_filename}") + + if not (glob.glob(env_path) or glob.glob(env_path + ".*")): + return None + + cwd = os.getcwd() + launch_string = None + true_filename = os.path.basename(os.path.normpath(env_path)) + if get_platform() == "linux" or get_platform() == "linux2": + candidates = glob.glob(os.path.join(cwd, env_path) + ".x86_64") + if len(candidates) == 0: + candidates = glob.glob(os.path.join(cwd, env_path) + ".x86") + if len(candidates) == 0: + candidates = glob.glob(env_path + ".x86_64") + if len(candidates) == 0: + candidates = glob.glob(env_path + ".x86") + if len(candidates) == 0: + if os.path.isfile(env_path): + candidates = [env_path] + if len(candidates) > 0: + launch_string = candidates[0] + + elif get_platform() == "darwin": + candidates = glob.glob( + os.path.join(cwd, env_path + ".app", "Contents", "MacOS", true_filename) + ) + if len(candidates) == 0: + candidates = glob.glob( + os.path.join(env_path + ".app", "Contents", "MacOS", true_filename) + ) + if len(candidates) == 0: + candidates = glob.glob( + os.path.join(cwd, env_path + ".app", "Contents", "MacOS", "*") + ) + if len(candidates) == 0: + candidates = 
glob.glob( + os.path.join(env_path + ".app", "Contents", "MacOS", "*") + ) + if len(candidates) > 0: + launch_string = candidates[0] + elif get_platform() == "win32": + candidates = glob.glob(os.path.join(cwd, env_path + ".exe")) + if len(candidates) == 0: + candidates = glob.glob(env_path + ".exe") + if len(candidates) == 0: + # Look for e.g. 3DBall\UnityEnvironment.exe + crash_handlers = set( + glob.glob(os.path.join(cwd, env_path, "UnityCrashHandler*.exe")) + ) + candidates = [ + c + for c in glob.glob(os.path.join(cwd, env_path, "*.exe")) + if c not in crash_handlers + ] + if len(candidates) > 0: + launch_string = candidates[0] + return launch_string + + +def launch_executable(file_name: str, args: List[str]) -> subprocess.Popen: + """ + Launches a Unity executable and returns the process handle for it. + :param file_name: the name of the executable + :param args: List of string that will be passed as command line arguments + when launching the executable. + """ + launch_string = validate_environment_path(file_name) + if launch_string is None: + raise UnityEnvironmentException( + f"Couldn't launch the {file_name} environment. Provided filename does not match any environments." + ) + else: + logger.debug(f"The launch string is {launch_string}") + logger.debug(f"Running with args {args}") + # Launch Unity environment + subprocess_args = [launch_string] + args + # std_out_option = DEVNULL means the outputs will not be displayed on terminal. + # std_out_option = None is default behavior: the outputs are displayed on terminal. + std_out_option = subprocess.DEVNULL if logger.level > DEBUG else None + try: + return subprocess.Popen( + subprocess_args, + # start_new_session=True means that signals to the parent python process + # (e.g. SIGINT from keyboard interrupt) will not be sent to the new process on POSIX platforms. + # This is generally good since we want the environment to have a chance to shutdown, + # but may be undesirable in come cases; if so, we'll add a command-line toggle. + # Note that on Windows, the CTRL_C signal will still be sent. + start_new_session=True, + stdout=std_out_option, + stderr=std_out_option, + ) + except PermissionError as perm: + # This is likely due to missing read or execute permissions on file. + raise UnityEnvironmentException( + f"Error when trying to launch environment - make sure " + f"permissions are set correctly. 
For example " + f'"chmod -R 755 {launch_string}"' + ) from perm diff --git a/MLPY/Lib/site-packages/mlagents_envs/environment.py b/MLPY/Lib/site-packages/mlagents_envs/environment.py new file mode 100644 index 0000000000000000000000000000000000000000..18731a20bb198c4be456d5fe8772fb2dae3e4574 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/environment.py @@ -0,0 +1,518 @@ +import atexit +from distutils.version import StrictVersion + +import numpy as np +import os +import subprocess +from typing import Dict, List, Optional, Tuple, Mapping as MappingType + +import mlagents_envs + +from mlagents_envs.logging_util import get_logger +from mlagents_envs.side_channel.side_channel import SideChannel +from mlagents_envs.side_channel import DefaultTrainingAnalyticsSideChannel +from mlagents_envs.side_channel.side_channel_manager import SideChannelManager +from mlagents_envs import env_utils + +from mlagents_envs.base_env import ( + BaseEnv, + DecisionSteps, + TerminalSteps, + BehaviorSpec, + ActionTuple, + BehaviorName, + AgentId, + BehaviorMapping, +) +from mlagents_envs.timers import timed, hierarchical_timer +from mlagents_envs.exception import ( + UnityEnvironmentException, + UnityActionException, + UnityTimeOutException, + UnityCommunicatorStoppedException, +) + +from mlagents_envs.communicator_objects.command_pb2 import STEP, RESET +from mlagents_envs.rpc_utils import behavior_spec_from_proto, steps_from_proto + +from mlagents_envs.communicator_objects.unity_rl_input_pb2 import UnityRLInputProto +from mlagents_envs.communicator_objects.unity_rl_output_pb2 import UnityRLOutputProto +from mlagents_envs.communicator_objects.agent_action_pb2 import AgentActionProto +from mlagents_envs.communicator_objects.unity_output_pb2 import UnityOutputProto +from mlagents_envs.communicator_objects.capabilities_pb2 import UnityRLCapabilitiesProto +from mlagents_envs.communicator_objects.unity_rl_initialization_input_pb2 import ( + UnityRLInitializationInputProto, +) + +from mlagents_envs.communicator_objects.unity_input_pb2 import UnityInputProto + +from .rpc_communicator import RpcCommunicator +import signal + +logger = get_logger(__name__) + + +class UnityEnvironment(BaseEnv): + # Communication protocol version. + # When connecting to C#, this must be compatible with Academy.k_ApiVersion. + # We follow semantic versioning on the communication version, so existing + # functionality will work as long the major versions match. + # This should be changed whenever a change is made to the communication protocol. + # Revision history: + # * 1.0.0 - initial version + # * 1.1.0 - support concatenated PNGs for compressed observations. + # * 1.2.0 - support compression mapping for stacked compressed observations. + # * 1.3.0 - support action spaces with both continuous and discrete actions. + # * 1.4.0 - support training analytics sent from python trainer to the editor. + # * 1.5.0 - support variable length observation training and multi-agent groups. + API_VERSION = "1.5.0" + + # Default port that the editor listens on. If an environment executable + # isn't specified, this port will be used. + DEFAULT_EDITOR_PORT = 5004 + + # Default base port for environments. Each environment will be offset from this + # by it's worker_id. + BASE_ENVIRONMENT_PORT = 5005 + + # Command line argument used to pass the port to the executable environment. 
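A minimal sketch of driving env_utils directly, assuming a hypothetical build named "3DBall" reachable from the working directory; in normal use UnityEnvironment (below) calls launch_executable itself and manages the process lifetime:

from mlagents_envs import env_utils

launch_string = env_utils.validate_environment_path("3DBall")  # None if no matching executable is found
if launch_string is not None:
    proc = env_utils.launch_executable("3DBall", ["-nographics", "-batchmode"])
    # ... connect over gRPC, then shut the process down when finished
    proc.kill()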
+ _PORT_COMMAND_LINE_ARG = "--mlagents-port" + + @staticmethod + def _raise_version_exception(unity_com_ver: str) -> None: + raise UnityEnvironmentException( + f"The communication API version is not compatible between Unity and python. " + f"Python API: {UnityEnvironment.API_VERSION}, Unity API: {unity_com_ver}.\n " + f"Please find the versions that work best together from our release page.\n" + "https://github.com/Unity-Technologies/ml-agents/releases" + ) + + @staticmethod + def _check_communication_compatibility( + unity_com_ver: str, python_api_version: str, unity_package_version: str + ) -> bool: + unity_communicator_version = StrictVersion(unity_com_ver) + api_version = StrictVersion(python_api_version) + if unity_communicator_version.version[0] == 0: + if ( + unity_communicator_version.version[0] != api_version.version[0] + or unity_communicator_version.version[1] != api_version.version[1] + ): + # Minor beta versions differ. + return False + elif unity_communicator_version.version[0] != api_version.version[0]: + # Major versions mismatch. + return False + else: + # Major versions match, so either: + # 1) The versions are identical, in which case there's no compatibility issues + # 2) The Unity version is newer, in which case we'll warn or fail on the Unity side if trying to use + # unsupported features + # 3) The trainer version is newer, in which case new trainer features might be available but unused by C# + # In any of the cases, there's no reason to warn about mismatch here. + logger.info( + f"Connected to Unity environment with package version {unity_package_version} " + f"and communication version {unity_com_ver}" + ) + return True + + @staticmethod + def _get_capabilities_proto() -> UnityRLCapabilitiesProto: + capabilities = UnityRLCapabilitiesProto() + capabilities.baseRLCapabilities = True + capabilities.concatenatedPngObservations = True + capabilities.compressedChannelMapping = True + capabilities.hybridActions = True + capabilities.trainingAnalytics = True + capabilities.variableLengthObservation = True + capabilities.multiAgentGroups = True + return capabilities + + @staticmethod + def _warn_csharp_base_capabilities( + caps: UnityRLCapabilitiesProto, unity_package_ver: str, python_package_ver: str + ) -> None: + if not caps.baseRLCapabilities: + logger.warning( + "WARNING: The Unity process is not running with the expected base Reinforcement Learning" + " capabilities. Please be sure upgrade the Unity Package to a version that is compatible with this " + "python package.\n" + f"Python package version: {python_package_ver}, C# package version: {unity_package_ver}" + f"Please find the versions that work best together from our release page.\n" + "https://github.com/Unity-Technologies/ml-agents/releases" + ) + + def __init__( + self, + file_name: Optional[str] = None, + worker_id: int = 0, + base_port: Optional[int] = None, + seed: int = 0, + no_graphics: bool = False, + timeout_wait: int = 60, + additional_args: Optional[List[str]] = None, + side_channels: Optional[List[SideChannel]] = None, + log_folder: Optional[str] = None, + num_areas: int = 1, + ): + """ + Starts a new unity environment and establishes a connection with the environment. + Notice: Currently communication between Unity and Python takes place over an open socket without authentication. + Ensure that the network where training takes place is secure. + + :string file_name: Name of Unity environment binary. + :int base_port: Baseline port number to connect to Unity environment over. 
worker_id increments over this. + If no environment is specified (i.e. file_name is None), the DEFAULT_EDITOR_PORT will be used. + :int worker_id: Offset from base_port. Used for training multiple environments simultaneously. + :bool no_graphics: Whether to run the Unity simulator in no-graphics mode + :int timeout_wait: Time (in seconds) to wait for connection from environment. + :list args: Addition Unity command line arguments + :list side_channels: Additional side channel for no-rl communication with Unity + :str log_folder: Optional folder to write the Unity Player log file into. Requires absolute path. + """ + atexit.register(self._close) + self._additional_args = additional_args or [] + self._no_graphics = no_graphics + # If base port is not specified, use BASE_ENVIRONMENT_PORT if we have + # an environment, otherwise DEFAULT_EDITOR_PORT + if base_port is None: + base_port = ( + self.BASE_ENVIRONMENT_PORT if file_name else self.DEFAULT_EDITOR_PORT + ) + self._port = base_port + worker_id + self._buffer_size = 12000 + # If true, this means the environment was successfully loaded + self._loaded = False + # The process that is started. If None, no process was started + self._process: Optional[subprocess.Popen] = None + self._timeout_wait: int = timeout_wait + self._communicator = self._get_communicator(worker_id, base_port, timeout_wait) + self._worker_id = worker_id + if side_channels is None: + side_channels = [] + default_training_side_channel: Optional[ + DefaultTrainingAnalyticsSideChannel + ] = None + if DefaultTrainingAnalyticsSideChannel.CHANNEL_ID not in [ + _.channel_id for _ in side_channels + ]: + default_training_side_channel = DefaultTrainingAnalyticsSideChannel() + side_channels.append(default_training_side_channel) + self._side_channel_manager = SideChannelManager(side_channels) + self._log_folder = log_folder + self.academy_capabilities: UnityRLCapabilitiesProto = None # type: ignore + + # If the environment name is None, a new environment will not be launched + # and the communicator will directly try to connect to an existing unity environment. + # If the worker-id is not 0 and the environment name is None, an error is thrown + if file_name is None and worker_id != 0: + raise UnityEnvironmentException( + "If the environment name is None, " + "the worker-id must be 0 in order to connect with the Editor." + ) + if file_name is not None: + try: + self._process = env_utils.launch_executable( + file_name, self._executable_args() + ) + except UnityEnvironmentException: + self._close(0) + raise + else: + logger.info( + f"Listening on port {self._port}. " + f"Start training by pressing the Play button in the Unity Editor." 
+ ) + self._loaded = True + + rl_init_parameters_in = UnityRLInitializationInputProto( + seed=seed, + communication_version=self.API_VERSION, + package_version=mlagents_envs.__version__, + capabilities=UnityEnvironment._get_capabilities_proto(), + num_areas=num_areas, + ) + try: + aca_output = self._send_academy_parameters(rl_init_parameters_in) + aca_params = aca_output.rl_initialization_output + except UnityTimeOutException: + self._close(0) + raise + + if not UnityEnvironment._check_communication_compatibility( + aca_params.communication_version, + UnityEnvironment.API_VERSION, + aca_params.package_version, + ): + self._close(0) + UnityEnvironment._raise_version_exception(aca_params.communication_version) + + UnityEnvironment._warn_csharp_base_capabilities( + aca_params.capabilities, + aca_params.package_version, + UnityEnvironment.API_VERSION, + ) + + self._env_state: Dict[str, Tuple[DecisionSteps, TerminalSteps]] = {} + self._env_specs: Dict[str, BehaviorSpec] = {} + self._env_actions: Dict[str, ActionTuple] = {} + self._is_first_message = True + self._update_behavior_specs(aca_output) + self.academy_capabilities = aca_params.capabilities + if default_training_side_channel is not None: + default_training_side_channel.environment_initialized() + + @staticmethod + def _get_communicator(worker_id, base_port, timeout_wait): + return RpcCommunicator(worker_id, base_port, timeout_wait) + + def _executable_args(self) -> List[str]: + args: List[str] = [] + if self._no_graphics: + args += ["-nographics", "-batchmode"] + args += [UnityEnvironment._PORT_COMMAND_LINE_ARG, str(self._port)] + + # If the logfile arg isn't already set in the env args, + # try to set it to an output directory + logfile_set = "-logfile" in (arg.lower() for arg in self._additional_args) + if self._log_folder and not logfile_set: + log_file_path = os.path.join( + self._log_folder, f"Player-{self._worker_id}.log" + ) + args += ["-logFile", log_file_path] + # Add in arguments passed explicitly by the user. + args += self._additional_args + return args + + def _update_behavior_specs(self, output: UnityOutputProto) -> None: + init_output = output.rl_initialization_output + for brain_param in init_output.brain_parameters: + # Each BrainParameter in the rl_initialization_output should have at least one AgentInfo + # Get that agent, because we need some of its observations. + agent_infos = output.rl_output.agentInfos[brain_param.brain_name] + if agent_infos.value: + agent = agent_infos.value[0] + new_spec = behavior_spec_from_proto(brain_param, agent) + self._env_specs[brain_param.brain_name] = new_spec + logger.info(f"Connected new brain: {brain_param.brain_name}") + + def _update_state(self, output: UnityRLOutputProto) -> None: + """ + Collects experience information from all external brains in environment at current step. 
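A minimal driver loop against the UnityEnvironment API defined here, assuming file_name=None so the communicator attaches to an Editor in Play mode; empty_action simply sends all-zero actions, the same fallback step() uses below for behaviors with no action set:

from mlagents_envs.environment import UnityEnvironment

env = UnityEnvironment(file_name=None)  # press Play in the Unity Editor to connect
env.reset()
behavior_name = list(env.behavior_specs)[0]
spec = env.behavior_specs[behavior_name]
for _ in range(10):
    decision_steps, terminal_steps = env.get_steps(behavior_name)
    env.set_actions(behavior_name, spec.action_spec.empty_action(len(decision_steps)))
    env.step()
env.close()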
+ """ + for brain_name in self._env_specs.keys(): + if brain_name in output.agentInfos: + agent_info_list = output.agentInfos[brain_name].value + self._env_state[brain_name] = steps_from_proto( + agent_info_list, self._env_specs[brain_name] + ) + else: + self._env_state[brain_name] = ( + DecisionSteps.empty(self._env_specs[brain_name]), + TerminalSteps.empty(self._env_specs[brain_name]), + ) + self._side_channel_manager.process_side_channel_message(output.side_channel) + + def reset(self) -> None: + if self._loaded: + outputs = self._communicator.exchange( + self._generate_reset_input(), self._poll_process + ) + if outputs is None: + raise UnityCommunicatorStoppedException("Communicator has exited.") + self._update_behavior_specs(outputs) + rl_output = outputs.rl_output + self._update_state(rl_output) + self._is_first_message = False + self._env_actions.clear() + else: + raise UnityEnvironmentException("No Unity environment is loaded.") + + @timed + def step(self) -> None: + if self._is_first_message: + return self.reset() + if not self._loaded: + raise UnityEnvironmentException("No Unity environment is loaded.") + # fill the blanks for missing actions + for group_name in self._env_specs: + if group_name not in self._env_actions: + n_agents = 0 + if group_name in self._env_state: + n_agents = len(self._env_state[group_name][0]) + self._env_actions[group_name] = self._env_specs[ + group_name + ].action_spec.empty_action(n_agents) + step_input = self._generate_step_input(self._env_actions) + with hierarchical_timer("communicator.exchange"): + outputs = self._communicator.exchange(step_input, self._poll_process) + if outputs is None: + raise UnityCommunicatorStoppedException("Communicator has exited.") + self._update_behavior_specs(outputs) + rl_output = outputs.rl_output + self._update_state(rl_output) + self._env_actions.clear() + + @property + def behavior_specs(self) -> MappingType[str, BehaviorSpec]: + return BehaviorMapping(self._env_specs) + + def _assert_behavior_exists(self, behavior_name: str) -> None: + if behavior_name not in self._env_specs: + raise UnityActionException( + f"The group {behavior_name} does not correspond to an existing " + f"agent group in the environment" + ) + + def set_actions(self, behavior_name: BehaviorName, action: ActionTuple) -> None: + self._assert_behavior_exists(behavior_name) + if behavior_name not in self._env_state: + return + action_spec = self._env_specs[behavior_name].action_spec + num_agents = len(self._env_state[behavior_name][0]) + action = action_spec._validate_action(action, num_agents, behavior_name) + self._env_actions[behavior_name] = action + + def set_action_for_agent( + self, behavior_name: BehaviorName, agent_id: AgentId, action: ActionTuple + ) -> None: + self._assert_behavior_exists(behavior_name) + if behavior_name not in self._env_state: + return + action_spec = self._env_specs[behavior_name].action_spec + action = action_spec._validate_action(action, 1, behavior_name) + if behavior_name not in self._env_actions: + num_agents = len(self._env_state[behavior_name][0]) + self._env_actions[behavior_name] = action_spec.empty_action(num_agents) + try: + index = np.where(self._env_state[behavior_name][0].agent_id == agent_id)[0][ + 0 + ] + except IndexError as ie: + raise IndexError( + "agent_id {} is did not request a decision at the previous step".format( + agent_id + ) + ) from ie + if action_spec.continuous_size > 0: + self._env_actions[behavior_name].continuous[index] = action.continuous[0, :] + if action_spec.discrete_size > 0: + 
self._env_actions[behavior_name].discrete[index] = action.discrete[0, :] + + def get_steps( + self, behavior_name: BehaviorName + ) -> Tuple[DecisionSteps, TerminalSteps]: + self._assert_behavior_exists(behavior_name) + return self._env_state[behavior_name] + + def _poll_process(self) -> None: + """ + Check the status of the subprocess. If it has exited, raise a UnityEnvironmentException + :return: None + """ + if not self._process: + return + poll_res = self._process.poll() + if poll_res is not None: + exc_msg = self._returncode_to_env_message(self._process.returncode) + raise UnityEnvironmentException(exc_msg) + + def close(self): + """ + Sends a shutdown signal to the unity environment, and closes the socket connection. + """ + if self._loaded: + self._close() + else: + raise UnityEnvironmentException("No Unity environment is loaded.") + + def _close(self, timeout: Optional[int] = None) -> None: + """ + Close the communicator and environment subprocess (if necessary). + + :int timeout: [Optional] Number of seconds to wait for the environment to shut down before + force-killing it. Defaults to `self.timeout_wait`. + """ + if timeout is None: + timeout = self._timeout_wait + self._loaded = False + self._communicator.close() + if self._process is not None: + # Wait a bit for the process to shutdown, but kill it if it takes too long + try: + self._process.wait(timeout=timeout) + logger.debug(self._returncode_to_env_message(self._process.returncode)) + except subprocess.TimeoutExpired: + logger.warning("Environment timed out shutting down. Killing...") + self._process.kill() + # Set to None so we don't try to close multiple times. + self._process = None + + @timed + def _generate_step_input( + self, vector_action: Dict[str, ActionTuple] + ) -> UnityInputProto: + rl_in = UnityRLInputProto() + for b in vector_action: + n_agents = len(self._env_state[b][0]) + if n_agents == 0: + continue + for i in range(n_agents): + action = AgentActionProto() + if vector_action[b].continuous is not None: + action.vector_actions_deprecated.extend( + vector_action[b].continuous[i] + ) + action.continuous_actions.extend(vector_action[b].continuous[i]) + if vector_action[b].discrete is not None: + action.vector_actions_deprecated.extend( + vector_action[b].discrete[i] + ) + action.discrete_actions.extend(vector_action[b].discrete[i]) + rl_in.agent_actions[b].value.extend([action]) + rl_in.command = STEP + rl_in.side_channel = bytes( + self._side_channel_manager.generate_side_channel_messages() + ) + return self._wrap_unity_input(rl_in) + + def _generate_reset_input(self) -> UnityInputProto: + rl_in = UnityRLInputProto() + rl_in.command = RESET + rl_in.side_channel = bytes( + self._side_channel_manager.generate_side_channel_messages() + ) + return self._wrap_unity_input(rl_in) + + def _send_academy_parameters( + self, init_parameters: UnityRLInitializationInputProto + ) -> UnityOutputProto: + inputs = UnityInputProto() + inputs.rl_initialization_input.CopyFrom(init_parameters) + return self._communicator.initialize(inputs, self._poll_process) + + @staticmethod + def _wrap_unity_input(rl_input: UnityRLInputProto) -> UnityInputProto: + result = UnityInputProto() + result.rl_input.CopyFrom(rl_input) + return result + + @staticmethod + def _returncode_to_signal_name(returncode: int) -> Optional[str]: + """ + Try to convert return codes into their corresponding signal name. + E.g. 
returncode_to_signal_name(-2) -> "SIGINT" + """ + try: + # A negative value -N indicates that the child was terminated by signal N (POSIX only). + s = signal.Signals(-returncode) + return s.name + except Exception: + # Should generally be a ValueError, but catch everything just in case. + return None + + @staticmethod + def _returncode_to_env_message(returncode: int) -> str: + signal_name = UnityEnvironment._returncode_to_signal_name(returncode) + signal_name = f" ({signal_name})" if signal_name else "" + return f"Environment shut down with return code {returncode}{signal_name}." diff --git a/MLPY/Lib/site-packages/mlagents_envs/envs/__init__.py b/MLPY/Lib/site-packages/mlagents_envs/envs/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9d7a4f89414563d505de11853ace56ff470a83ed --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/envs/__init__.py @@ -0,0 +1,15 @@ +from mlagents_envs.registry import default_registry +from mlagents_envs.envs.pettingzoo_env_factory import logger, PettingZooEnvFactory + +# Register each environment in default_registry as a PettingZooEnv +for key in default_registry: + env_name = key + if key[0].isdigit(): + env_name = key.replace("3", "Three") + if not env_name.isidentifier(): + logger.warning( + f"Environment id {env_name} can not be registered since it is" + f"not a valid identifier name." + ) + continue + locals()[env_name] = PettingZooEnvFactory(key) diff --git a/MLPY/Lib/site-packages/mlagents_envs/envs/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/envs/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..20ac2010ce326f37cb4212607df016800ebe6198 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/envs/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/envs/__pycache__/env_helpers.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/envs/__pycache__/env_helpers.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cb7a0abd9f9da9f26fc4f36c61c53f594e873690 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/envs/__pycache__/env_helpers.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/envs/__pycache__/pettingzoo_env_factory.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/envs/__pycache__/pettingzoo_env_factory.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..da7a1625e561a98f90998cfe6571f704a7ae93f7 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/envs/__pycache__/pettingzoo_env_factory.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/envs/__pycache__/unity_aec_env.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/envs/__pycache__/unity_aec_env.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aecd0f8aaca4ceb9f1306977e6e33cd9779d538a Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/envs/__pycache__/unity_aec_env.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/envs/__pycache__/unity_gym_env.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/envs/__pycache__/unity_gym_env.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9cf77cfe2ded45ad576b640bc727c72304571899 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/envs/__pycache__/unity_gym_env.cpython-39.pyc differ diff --git 
a/MLPY/Lib/site-packages/mlagents_envs/envs/__pycache__/unity_parallel_env.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/envs/__pycache__/unity_parallel_env.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c1b490835f74435c12feda4010d7b54da8292a1d Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/envs/__pycache__/unity_parallel_env.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/envs/__pycache__/unity_pettingzoo_base_env.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/envs/__pycache__/unity_pettingzoo_base_env.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c786d28d769100f6caa71a140aa61f4dea6e8855 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/envs/__pycache__/unity_pettingzoo_base_env.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/envs/env_helpers.py b/MLPY/Lib/site-packages/mlagents_envs/envs/env_helpers.py new file mode 100644 index 0000000000000000000000000000000000000000..768e6706038b5cb1030d26f8f0efc40cfe695435 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/envs/env_helpers.py @@ -0,0 +1,76 @@ +from urllib.parse import urlparse, parse_qs + + +def _behavior_to_agent_id(behavior_name: str, unique_id: int) -> str: + return f"{behavior_name}?agent_id={unique_id}" + + +def _agent_id_to_behavior(agent_id: str) -> str: + return agent_id.split("?agent_id=")[0] + + +def _unwrap_batch_steps(batch_steps, behavior_name): + decision_batch, termination_batch = batch_steps + decision_id = [ + _behavior_to_agent_id(behavior_name, i) for i in decision_batch.agent_id + ] + termination_id = [ + _behavior_to_agent_id(behavior_name, i) for i in termination_batch.agent_id + ] + agents = decision_id + termination_id + obs = { + agent_id: [batch_obs[i] for batch_obs in termination_batch.obs] + for i, agent_id in enumerate(termination_id) + } + if decision_batch.action_mask is not None: + obs.update( + { + agent_id: { + "observation": [batch_obs[i] for batch_obs in decision_batch.obs], + "action_mask": [mask[i] for mask in decision_batch.action_mask], + } + for i, agent_id in enumerate(decision_id) + } + ) + else: + obs.update( + { + agent_id: [batch_obs[i] for batch_obs in decision_batch.obs] + for i, agent_id in enumerate(decision_id) + } + ) + obs = {k: v if len(v) > 1 else v[0] for k, v in obs.items()} + dones = {agent_id: True for agent_id in termination_id} + dones.update({agent_id: False for agent_id in decision_id}) + rewards = { + agent_id: termination_batch.reward[i] + for i, agent_id in enumerate(termination_id) + } + rewards.update( + {agent_id: decision_batch.reward[i] for i, agent_id in enumerate(decision_id)} + ) + cumulative_rewards = {k: v for k, v in rewards.items()} + infos = {} + for i, agent_id in enumerate(decision_id): + infos[agent_id] = {} + infos[agent_id]["behavior_name"] = behavior_name + infos[agent_id]["group_id"] = decision_batch.group_id[i] + infos[agent_id]["group_reward"] = decision_batch.group_reward[i] + for i, agent_id in enumerate(termination_id): + infos[agent_id] = {} + infos[agent_id]["behavior_name"] = behavior_name + infos[agent_id]["group_id"] = termination_batch.group_id[i] + infos[agent_id]["group_reward"] = termination_batch.group_reward[i] + infos[agent_id]["interrupted"] = termination_batch.interrupted[i] + id_map = {agent_id: i for i, agent_id in enumerate(decision_id)} + return agents, obs, dones, rewards, cumulative_rewards, infos, id_map + + +def _parse_behavior(full_behavior): + 
parsed = urlparse(full_behavior) + name = parsed.path + ids = parse_qs(parsed.query) + team_id: int = 0 + if "team" in ids: + team_id = int(ids["team"][0]) + return name, team_id diff --git a/MLPY/Lib/site-packages/mlagents_envs/envs/pettingzoo_env_factory.py b/MLPY/Lib/site-packages/mlagents_envs/envs/pettingzoo_env_factory.py new file mode 100644 index 0000000000000000000000000000000000000000..aae82d36e81ef1d4e67913b673b63edeaec2dff9 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/envs/pettingzoo_env_factory.py @@ -0,0 +1,50 @@ +from typing import Optional, Union, List + +from mlagents_envs import logging_util +from mlagents_envs.exception import UnityWorkerInUseException +from mlagents_envs.registry import default_registry +from mlagents_envs.side_channel.engine_configuration_channel import ( + EngineConfigurationChannel, +) +from mlagents_envs.side_channel.environment_parameters_channel import ( + EnvironmentParametersChannel, +) +from mlagents_envs.side_channel.stats_side_channel import StatsSideChannel +from mlagents_envs.envs.unity_aec_env import UnityAECEnv + +logger = logging_util.get_logger(__name__) + + +class PettingZooEnvFactory: + def __init__(self, env_id: str) -> None: + self.env_id = env_id + + def env( + self, seed: Optional[int] = None, **kwargs: Union[List, int, bool, None] + ) -> UnityAECEnv: + """ + Creates the environment with env_id from unity's default_registry and wraps it in a UnityToPettingZooWrapper + :param seed: The seed for the action spaces of the agents. + :param kwargs: Any argument accepted by `UnityEnvironment`class except file_name + """ + # If not side_channels specified, add the followings + if "side_channels" not in kwargs: + kwargs["side_channels"] = [ + EngineConfigurationChannel(), + EnvironmentParametersChannel(), + StatsSideChannel(), + ] + _env = None + # If no base port argument is provided, try ports starting at 6000 until one is free + if "base_port" not in kwargs: + port = 6000 + while _env is None: + try: + kwargs["base_port"] = port + _env = default_registry[self.env_id].make(**kwargs) + except UnityWorkerInUseException: + port += 1 + pass + else: + _env = default_registry[self.env_id].make(**kwargs) + return UnityAECEnv(_env, seed) diff --git a/MLPY/Lib/site-packages/mlagents_envs/envs/unity_aec_env.py b/MLPY/Lib/site-packages/mlagents_envs/envs/unity_aec_env.py new file mode 100644 index 0000000000000000000000000000000000000000..4bb6fdf390998755b9443c6e0d687175cb41f7cc --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/envs/unity_aec_env.py @@ -0,0 +1,72 @@ +from typing import Any, Optional +from gym import error +from mlagents_envs.base_env import BaseEnv +from pettingzoo import AECEnv + +from mlagents_envs.envs.unity_pettingzoo_base_env import UnityPettingzooBaseEnv + + +class UnityAECEnv(UnityPettingzooBaseEnv, AECEnv): + """ + Unity AEC (PettingZoo) environment wrapper. + """ + + def __init__(self, env: BaseEnv, seed: Optional[int] = None): + """ + Initializes a Unity AEC environment wrapper. + + :param env: The UnityEnvironment that is being wrapped. + :param seed: The seed for the action spaces of the agents. + """ + super().__init__(env, seed) + + def step(self, action: Any) -> None: + """ + Sets the action of the active agent and get the observation, reward, done + and info of the next agent. 
+ :param action: The action for the active agent + """ + self._assert_loaded() + if len(self._live_agents) <= 0: + raise error.Error( + "You must reset the environment before you can perform a step" + ) + + # Process action + current_agent = self._agents[self._agent_index] + self._process_action(current_agent, action) + + self._agent_index += 1 + # Reset reward + for k in self._rewards.keys(): + self._rewards[k] = 0 + + if self._agent_index >= len(self._agents) and self.num_agents > 0: + # The index is too high, time to set the action for the agents we have + self._step() + self._live_agents.sort() # unnecessary, only for passing API test + + def observe(self, agent_id): + """ + Returns the observation an agent currently can make. `last()` calls this function. + """ + return ( + self._observations[agent_id], + self._cumm_rewards[agent_id], + self._dones[agent_id], + self._infos[agent_id], + ) + + def last(self, observe=True): + """ + returns observation, cumulative reward, done, info for the current agent (specified by self.agent_selection) + """ + obs, reward, done, info = self.observe(self._agents[self._agent_index]) + return obs if observe else None, reward, done, info + + @property + def agent_selection(self): + if not self._live_agents: + # If we had an agent finish then return that agent even though it isn't alive. + return self._agents[0] + return self._agents[self._agent_index] diff --git a/MLPY/Lib/site-packages/mlagents_envs/envs/unity_gym_env.py b/MLPY/Lib/site-packages/mlagents_envs/envs/unity_gym_env.py new file mode 100644 index 0000000000000000000000000000000000000000..df29a95c9ab79d01389a2e66f9ebfcabc72c77ec --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/envs/unity_gym_env.py @@ -0,0 +1,360 @@ +import itertools + +import numpy as np +from typing import Any, Dict, List, Optional, Tuple, Union + +import gym +from gym import error, spaces + +from mlagents_envs.base_env import ActionTuple, BaseEnv +from mlagents_envs.base_env import DecisionSteps, TerminalSteps +from mlagents_envs import logging_util + + +class UnityGymException(error.Error): + """ + Any error related to the gym wrapper of ml-agents. + """ + + pass + + +logger = logging_util.get_logger(__name__) +GymStepResult = Tuple[np.ndarray, float, bool, Dict] + + +class UnityToGymWrapper(gym.Env): + """ + Provides Gym wrapper for Unity Learning Environments. + """ + + def __init__( + self, + unity_env: BaseEnv, + uint8_visual: bool = False, + flatten_branched: bool = False, + allow_multiple_obs: bool = False, + action_space_seed: Optional[int] = None, + ): + """ + Environment initialization + :param unity_env: The Unity BaseEnv to be wrapped in the gym. Will be closed when the UnityToGymWrapper closes. + :param uint8_visual: Return visual observations as uint8 (0-255) matrices instead of float (0.0-1.0). + :param flatten_branched: If True, turn branched discrete action spaces into a Discrete space rather than + MultiDiscrete. + :param allow_multiple_obs: If True, return a list of np.ndarrays as observations with the first elements + containing the visual observations and the last element containing the array of vector observations. + If False, returns a single np.ndarray containing either only a single visual observation or the array of + vector observations. + :param action_space_seed: If non-None, will be used to set the random seed on created gym.Space instances. 
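A minimal usage sketch for the gym wrapper, assuming a single-agent environment (either the Editor via file_name=None, or a build whose name you would supply yourself):

from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.envs.unity_gym_env import UnityToGymWrapper

unity_env = UnityEnvironment(file_name=None)  # or a single-agent build of your own
env = UnityToGymWrapper(unity_env, uint8_visual=False, allow_multiple_obs=False)
obs = env.reset()
obs, reward, done, info = env.step(env.action_space.sample())
env.close()  # also closes the wrapped UnityEnvironment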
+ """ + self._env = unity_env + + # Take a single step so that the brain information will be sent over + if not self._env.behavior_specs: + self._env.step() + + self.visual_obs = None + + # Save the step result from the last time all Agents requested decisions. + self._previous_decision_step: Optional[DecisionSteps] = None + self._flattener = None + # Hidden flag used by Atari environments to determine if the game is over + self.game_over = False + self._allow_multiple_obs = allow_multiple_obs + + # Check brain configuration + if len(self._env.behavior_specs) != 1: + raise UnityGymException( + "There can only be one behavior in a UnityEnvironment " + "if it is wrapped in a gym." + ) + + self.name = list(self._env.behavior_specs.keys())[0] + self.group_spec = self._env.behavior_specs[self.name] + + if self._get_n_vis_obs() == 0 and self._get_vec_obs_size() == 0: + raise UnityGymException( + "There are no observations provided by the environment." + ) + + if not self._get_n_vis_obs() >= 1 and uint8_visual: + logger.warning( + "uint8_visual was set to true, but visual observations are not in use. " + "This setting will not have any effect." + ) + else: + self.uint8_visual = uint8_visual + if ( + self._get_n_vis_obs() + self._get_vec_obs_size() >= 2 + and not self._allow_multiple_obs + ): + logger.warning( + "The environment contains multiple observations. " + "You must define allow_multiple_obs=True to receive them all. " + "Otherwise, only the first visual observation (or vector observation if" + "there are no visual observations) will be provided in the observation." + ) + + # Check for number of agents in scene. + self._env.reset() + decision_steps, _ = self._env.get_steps(self.name) + self._check_agents(len(decision_steps)) + self._previous_decision_step = decision_steps + + # Set action spaces + if self.group_spec.action_spec.is_discrete(): + self.action_size = self.group_spec.action_spec.discrete_size + branches = self.group_spec.action_spec.discrete_branches + if self.group_spec.action_spec.discrete_size == 1: + self._action_space = spaces.Discrete(branches[0]) + else: + if flatten_branched: + self._flattener = ActionFlattener(branches) + self._action_space = self._flattener.action_space + else: + self._action_space = spaces.MultiDiscrete(branches) + + elif self.group_spec.action_spec.is_continuous(): + if flatten_branched: + logger.warning( + "The environment has a non-discrete action space. It will " + "not be flattened." + ) + + self.action_size = self.group_spec.action_spec.continuous_size + high = np.array([1] * self.group_spec.action_spec.continuous_size) + self._action_space = spaces.Box(-high, high, dtype=np.float32) + else: + raise UnityGymException( + "The gym wrapper does not provide explicit support for both discrete " + "and continuous actions." 
+ ) + + if action_space_seed is not None: + self._action_space.seed(action_space_seed) + + # Set observations space + list_spaces: List[gym.Space] = [] + shapes = self._get_vis_obs_shape() + for shape in shapes: + if uint8_visual: + list_spaces.append(spaces.Box(0, 255, dtype=np.uint8, shape=shape)) + else: + list_spaces.append(spaces.Box(0, 1, dtype=np.float32, shape=shape)) + if self._get_vec_obs_size() > 0: + # vector observation is last + high = np.array([np.inf] * self._get_vec_obs_size()) + list_spaces.append(spaces.Box(-high, high, dtype=np.float32)) + if self._allow_multiple_obs: + self._observation_space = spaces.Tuple(list_spaces) + else: + self._observation_space = list_spaces[0] # only return the first one + + def reset(self) -> Union[List[np.ndarray], np.ndarray]: + """Resets the state of the environment and returns an initial observation. + Returns: observation (object/list): the initial observation of the + space. + """ + self._env.reset() + decision_step, _ = self._env.get_steps(self.name) + n_agents = len(decision_step) + self._check_agents(n_agents) + self.game_over = False + + res: GymStepResult = self._single_step(decision_step) + return res[0] + + def step(self, action: List[Any]) -> GymStepResult: + """Run one timestep of the environment's dynamics. When end of + episode is reached, you are responsible for calling `reset()` + to reset this environment's state. + Accepts an action and returns a tuple (observation, reward, done, info). + Args: + action (object/list): an action provided by the environment + Returns: + observation (object/list): agent's observation of the current environment + reward (float/list) : amount of reward returned after previous action + done (boolean/list): whether the episode has ended. + info (dict): contains auxiliary diagnostic information. + """ + if self.game_over: + raise UnityGymException( + "You are calling 'step()' even though this environment has already " + "returned done = True. You must always call 'reset()' once you " + "receive 'done = True'." 
+ ) + if self._flattener is not None: + # Translate action into list + action = self._flattener.lookup_action(action) + + action = np.array(action).reshape((1, self.action_size)) + + action_tuple = ActionTuple() + if self.group_spec.action_spec.is_continuous(): + action_tuple.add_continuous(action) + else: + action_tuple.add_discrete(action) + self._env.set_actions(self.name, action_tuple) + + self._env.step() + decision_step, terminal_step = self._env.get_steps(self.name) + self._check_agents(max(len(decision_step), len(terminal_step))) + if len(terminal_step) != 0: + # The agent is done + self.game_over = True + return self._single_step(terminal_step) + else: + return self._single_step(decision_step) + + def _single_step(self, info: Union[DecisionSteps, TerminalSteps]) -> GymStepResult: + if self._allow_multiple_obs: + visual_obs = self._get_vis_obs_list(info) + visual_obs_list = [] + for obs in visual_obs: + visual_obs_list.append(self._preprocess_single(obs[0])) + default_observation = visual_obs_list + if self._get_vec_obs_size() >= 1: + default_observation.append(self._get_vector_obs(info)[0, :]) + else: + if self._get_n_vis_obs() >= 1: + visual_obs = self._get_vis_obs_list(info) + default_observation = self._preprocess_single(visual_obs[0][0]) + else: + default_observation = self._get_vector_obs(info)[0, :] + + if self._get_n_vis_obs() >= 1: + visual_obs = self._get_vis_obs_list(info) + self.visual_obs = self._preprocess_single(visual_obs[0][0]) + + done = isinstance(info, TerminalSteps) + + return (default_observation, info.reward[0], done, {"step": info}) + + def _preprocess_single(self, single_visual_obs: np.ndarray) -> np.ndarray: + if self.uint8_visual: + return (255.0 * single_visual_obs).astype(np.uint8) + else: + return single_visual_obs + + def _get_n_vis_obs(self) -> int: + result = 0 + for obs_spec in self.group_spec.observation_specs: + if len(obs_spec.shape) == 3: + result += 1 + return result + + def _get_vis_obs_shape(self) -> List[Tuple]: + result: List[Tuple] = [] + for obs_spec in self.group_spec.observation_specs: + if len(obs_spec.shape) == 3: + result.append(obs_spec.shape) + return result + + def _get_vis_obs_list( + self, step_result: Union[DecisionSteps, TerminalSteps] + ) -> List[np.ndarray]: + result: List[np.ndarray] = [] + for obs in step_result.obs: + if len(obs.shape) == 4: + result.append(obs) + return result + + def _get_vector_obs( + self, step_result: Union[DecisionSteps, TerminalSteps] + ) -> np.ndarray: + result: List[np.ndarray] = [] + for obs in step_result.obs: + if len(obs.shape) == 2: + result.append(obs) + return np.concatenate(result, axis=1) + + def _get_vec_obs_size(self) -> int: + result = 0 + for obs_spec in self.group_spec.observation_specs: + if len(obs_spec.shape) == 1: + result += obs_spec.shape[0] + return result + + def render(self, mode="rgb_array"): + """ + Return the latest visual observations. + Note that it will not render a new frame of the environment. + """ + return self.visual_obs + + def close(self) -> None: + """Override _close in your subclass to perform any necessary cleanup. + Environments will automatically close() themselves when + garbage collected or when the program exits. + """ + self._env.close() + + def seed(self, seed: Any = None) -> None: + """Sets the seed for this env's random number generator(s). + Currently not implemented. 
+ """ + logger.warning("Could not seed environment %s", self.name) + return + + @staticmethod + def _check_agents(n_agents: int) -> None: + if n_agents > 1: + raise UnityGymException( + f"There can only be one Agent in the environment but {n_agents} were detected." + ) + + @property + def metadata(self): + return {"render.modes": ["rgb_array"]} + + @property + def reward_range(self) -> Tuple[float, float]: + return -float("inf"), float("inf") + + @property + def action_space(self) -> gym.Space: + return self._action_space + + @property + def observation_space(self): + return self._observation_space + + +class ActionFlattener: + """ + Flattens branched discrete action spaces into single-branch discrete action spaces. + """ + + def __init__(self, branched_action_space): + """ + Initialize the flattener. + :param branched_action_space: A List containing the sizes of each branch of the action + space, e.g. [2,3,3] for three branches with size 2, 3, and 3 respectively. + """ + self._action_shape = branched_action_space + self.action_lookup = self._create_lookup(self._action_shape) + self.action_space = spaces.Discrete(len(self.action_lookup)) + + @classmethod + def _create_lookup(self, branched_action_space): + """ + Creates a Dict that maps discrete actions (scalars) to branched actions (lists). + Each key in the Dict maps to one unique set of branched actions, and each value + contains the List of branched actions. + """ + possible_vals = [range(_num) for _num in branched_action_space] + all_actions = [list(_action) for _action in itertools.product(*possible_vals)] + # Dict should be faster than List for large action spaces + action_lookup = { + _scalar: _action for (_scalar, _action) in enumerate(all_actions) + } + return action_lookup + + def lookup_action(self, action): + """ + Convert a scalar discrete action into a unique set of branched actions. + :param action: A scalar value representing one of the discrete actions. + :returns: The List containing the branched actions. + """ + return self.action_lookup[action] diff --git a/MLPY/Lib/site-packages/mlagents_envs/envs/unity_parallel_env.py b/MLPY/Lib/site-packages/mlagents_envs/envs/unity_parallel_env.py new file mode 100644 index 0000000000000000000000000000000000000000..09398d27fa8369d3af63629d18b93fca8dc218a7 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/envs/unity_parallel_env.py @@ -0,0 +1,53 @@ +from typing import Optional, Dict, Any, Tuple +from gym import error +from mlagents_envs.base_env import BaseEnv +from pettingzoo import ParallelEnv + +from mlagents_envs.envs.unity_pettingzoo_base_env import UnityPettingzooBaseEnv + + +class UnityParallelEnv(UnityPettingzooBaseEnv, ParallelEnv): + """ + Unity Parallel (PettingZoo) environment wrapper. + """ + + def __init__(self, env: BaseEnv, seed: Optional[int] = None): + """ + Initializes a Unity Parallel environment wrapper. + + :param env: The UnityEnvironment that is being wrapped. + :param seed: The seed for the action spaces of the agents. + """ + super().__init__(env, seed) + + def reset(self) -> Dict[str, Any]: + """ + Resets the environment. + """ + super().reset() + + return self._observations + + def step(self, actions: Dict[str, Any]) -> Tuple: + self._assert_loaded() + if len(self._live_agents) <= 0 and actions: + raise error.Error( + "You must reset the environment before you can perform a step." 
+ ) + + # Process actions + for current_agent, action in actions.items(): + self._process_action(current_agent, action) + + # Reset reward + for k in self._rewards.keys(): + self._rewards[k] = 0 + + # Step environment + self._step() + + # Agent cleanup and sorting + self._cleanup_agents() + self._live_agents.sort() # unnecessary, only for passing API test + + return self._observations, self._rewards, self._dones, self._infos diff --git a/MLPY/Lib/site-packages/mlagents_envs/envs/unity_pettingzoo_base_env.py b/MLPY/Lib/site-packages/mlagents_envs/envs/unity_pettingzoo_base_env.py new file mode 100644 index 0000000000000000000000000000000000000000..3457f18c882643346c020fc1057a7d0c194e51f9 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/envs/unity_pettingzoo_base_env.py @@ -0,0 +1,323 @@ +import atexit +from typing import Optional, List, Set, Dict, Any, Tuple +import numpy as np +from gym import error, spaces +from mlagents_envs.base_env import BaseEnv, ActionTuple +from mlagents_envs.envs.env_helpers import _agent_id_to_behavior, _unwrap_batch_steps + + +class UnityPettingzooBaseEnv: + """ + Unity Petting Zoo base environment. + """ + + def __init__( + self, env: BaseEnv, seed: Optional[int] = None, metadata: Optional[dict] = None + ): + super().__init__() + atexit.register(self.close) + self._env = env + self.metadata = metadata + self._assert_loaded() + + self._agent_index = 0 + self._seed = seed + self._side_channel_dict = { + type(v).__name__: v + for v in self._env._side_channel_manager._side_channels_dict.values() # type: ignore + } + + self._live_agents: List[str] = [] # agent id for agents alive + self._agents: List[str] = [] # all agent id in current step + self._possible_agents: Set[str] = set() # all agents that have ever appear + self._agent_id_to_index: Dict[str, int] = {} # agent_id: index in decision step + self._observations: Dict[str, np.ndarray] = {} # agent_id: obs + self._dones: Dict[str, bool] = {} # agent_id: done + self._rewards: Dict[str, float] = {} # agent_id: reward + self._cumm_rewards: Dict[str, float] = {} # agent_id: reward + self._infos: Dict[str, Dict] = {} # agent_id: info + self._action_spaces: Dict[str, spaces.Space] = {} # behavior_name: action_space + self._observation_spaces: Dict[ + str, spaces.Space + ] = {} # behavior_name: obs_space + self._current_action: Dict[str, ActionTuple] = {} # behavior_name: ActionTuple + # Take a single step so that the brain information will be sent over + if not self._env.behavior_specs: + self._env.step() + for behavior_name in self._env.behavior_specs.keys(): + _, _, _ = self._batch_update(behavior_name) + self._update_observation_spaces() + self._update_action_spaces() + + def _assert_loaded(self) -> None: + if self._env is None: + raise error.Error("No environment loaded") + + @property + def observation_spaces(self) -> Dict[str, spaces.Space]: + """ + Return the observation spaces of all the agents. + """ + return { + agent_id: self._observation_spaces[_agent_id_to_behavior(agent_id)] + for agent_id in self._possible_agents + } + + def observation_space(self, agent: str) -> Optional[spaces.Space]: + """ + The observation space of the current agent. 
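A hedged sketch of a typical interaction loop with the UnityParallelEnv wrapper above; the executable path is a placeholder and actions are sampled at random from each live agent's action space.

from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.envs.unity_parallel_env import UnityParallelEnv

unity_env = UnityEnvironment(file_name="path/to/MyUnityBuild")  # placeholder path
env = UnityParallelEnv(unity_env, seed=1)

observations = env.reset()
for _ in range(100):
    # One action per live agent, drawn from that agent's space.
    actions = {agent: env.action_space(agent).sample() for agent in env.agents}
    observations, rewards, dones, infos = env.step(actions)
    if not env.agents:                 # all agents finished their episodes
        observations = env.reset()
env.close()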
+ """ + behavior_name = _agent_id_to_behavior(agent) + return self._observation_spaces[behavior_name] + + def _update_observation_spaces(self) -> None: + self._assert_loaded() + for behavior_name in self._env.behavior_specs.keys(): + if behavior_name not in self._observation_spaces: + obs_spec = self._env.behavior_specs[behavior_name].observation_specs + obs_spaces = tuple( + spaces.Box( + low=-np.float32(np.inf), + high=np.float32(np.inf), + shape=spec.shape, + dtype=np.float32, + ) + for spec in obs_spec + ) + if len(obs_spaces) == 1: + self._observation_spaces[behavior_name] = obs_spaces[0] + else: + self._observation_spaces[behavior_name] = spaces.Tuple(obs_spaces) + + @property + def action_spaces(self) -> Dict[str, spaces.Space]: + """ + Return the action spaces of all the agents. + """ + return { + agent_id: self._action_spaces[_agent_id_to_behavior(agent_id)] + for agent_id in self._possible_agents + } + + def action_space(self, agent: str) -> Optional[spaces.Space]: + """ + The action space of the current agent. + """ + behavior_name = _agent_id_to_behavior(agent) + return self._action_spaces[behavior_name] + + def _update_action_spaces(self) -> None: + self._assert_loaded() + for behavior_name in self._env.behavior_specs.keys(): + if behavior_name not in self._action_spaces: + act_spec = self._env.behavior_specs[behavior_name].action_spec + if ( + act_spec.continuous_size == 0 + and len(act_spec.discrete_branches) == 0 + ): + raise error.Error("No actions found") + if act_spec.discrete_size == 1: + d_space = spaces.Discrete(act_spec.discrete_branches[0]) + if self._seed is not None: + d_space.seed(self._seed) + if act_spec.continuous_size == 0: + self._action_spaces[behavior_name] = d_space + continue + if act_spec.discrete_size > 0: + d_space = spaces.MultiDiscrete(act_spec.discrete_branches) + if self._seed is not None: + d_space.seed(self._seed) + if act_spec.continuous_size == 0: + self._action_spaces[behavior_name] = d_space + continue + if act_spec.continuous_size > 0: + c_space = spaces.Box( + -1, 1, (act_spec.continuous_size,), dtype=np.int32 + ) + if self._seed is not None: + c_space.seed(self._seed) + if len(act_spec.discrete_branches) == 0: + self._action_spaces[behavior_name] = c_space + continue + self._action_spaces[behavior_name] = spaces.Tuple((c_space, d_space)) + + def _process_action(self, current_agent, action): + current_action_space = self.action_space(current_agent) + # Convert actions + if action is not None: + if isinstance(action, Tuple): + action = tuple(np.array(a) for a in action) + else: + action = self._action_to_np(current_action_space, action) + if not current_action_space.contains(action): # type: ignore + raise error.Error( + f"Invalid action, got {action} but was expecting action from {self.action_space}" + ) + if isinstance(current_action_space, spaces.Tuple): + action = ActionTuple(action[0], action[1]) + elif isinstance(current_action_space, spaces.MultiDiscrete): + action = ActionTuple(None, action) + elif isinstance(current_action_space, spaces.Discrete): + action = ActionTuple(None, np.array(action).reshape(1, 1)) + else: + action = ActionTuple(action, None) + + if not self._dones[current_agent]: + current_behavior = _agent_id_to_behavior(current_agent) + current_index = self._agent_id_to_index[current_agent] + if action.continuous is not None: + self._current_action[current_behavior].continuous[ + current_index + ] = action.continuous[0] + if action.discrete is not None: + self._current_action[current_behavior].discrete[ + current_index 
+ ] = action.discrete[0] + else: + self._live_agents.remove(current_agent) + del self._observations[current_agent] + del self._dones[current_agent] + del self._rewards[current_agent] + del self._cumm_rewards[current_agent] + del self._infos[current_agent] + + def _step(self): + for behavior_name, actions in self._current_action.items(): + self._env.set_actions(behavior_name, actions) + self._env.step() + self._reset_states() + for behavior_name in self._env.behavior_specs.keys(): + dones, rewards, cumulative_rewards = self._batch_update(behavior_name) + self._dones.update(dones) + self._rewards.update(rewards) + self._cumm_rewards.update(cumulative_rewards) + self._agent_index = 0 + + def _cleanup_agents(self): + for current_agent, done in self.dones.items(): + if done: + self._live_agents.remove(current_agent) + + @property + def side_channel(self) -> Dict[str, Any]: + """ + The side channels of the environment. You can access the side channels + of an environment with `env.side_channel[]`. + """ + self._assert_loaded() + return self._side_channel_dict + + @staticmethod + def _action_to_np(current_action_space, action): + return np.array(action, dtype=current_action_space.dtype) + + def _create_empty_actions(self, behavior_name, num_agents): + a_spec = self._env.behavior_specs[behavior_name].action_spec + return ActionTuple( + np.zeros((num_agents, a_spec.continuous_size), dtype=np.float32), + np.zeros((num_agents, len(a_spec.discrete_branches)), dtype=np.int32), + ) + + @property + def _cumulative_rewards(self): + return self._cumm_rewards + + def _reset_states(self): + self._live_agents = [] + self._agents = [] + self._observations = {} + self._dones = {} + self._rewards = {} + self._cumm_rewards = {} + self._infos = {} + self._agent_id_to_index = {} + + def reset(self): + """ + Resets the environment. + """ + self._assert_loaded() + self._agent_index = 0 + self._reset_states() + self._possible_agents = set() + self._env.reset() + for behavior_name in self._env.behavior_specs.keys(): + _, _, _ = self._batch_update(behavior_name) + self._live_agents.sort() # unnecessary, only for passing API test + self._dones = {agent: False for agent in self._agents} + self._rewards = {agent: 0 for agent in self._agents} + self._cumm_rewards = {agent: 0 for agent in self._agents} + + def _batch_update(self, behavior_name): + current_batch = self._env.get_steps(behavior_name) + self._current_action[behavior_name] = self._create_empty_actions( + behavior_name, len(current_batch[0]) + ) + ( + agents, + obs, + dones, + rewards, + cumulative_rewards, + infos, + id_map, + ) = _unwrap_batch_steps(current_batch, behavior_name) + self._live_agents += agents + self._agents += agents + self._observations.update(obs) + self._infos.update(infos) + self._agent_id_to_index.update(id_map) + self._possible_agents.update(agents) + return dones, rewards, cumulative_rewards + + def seed(self, seed=None): + """ + Reseeds the environment (making the resulting environment deterministic). + `reset()` must be called after `seed()`, and before `step()`. + """ + self._seed = seed + + def render(self, mode="human"): + """ + NOT SUPPORTED. + + Displays a rendered frame from the environment, if supported. + Alternate render modes in the default environments are `'rgb_array'` + which returns a numpy array and is supported by all environments outside of classic, + and `'ansi'` which returns the strings printed (specific to classic environments). 
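The space-type branching in _process_action above can be summarised in isolation as follows; this is a sketch of the same conversion, not part of the wrapper itself.

import numpy as np
from gym import spaces
from mlagents_envs.base_env import ActionTuple

def to_action_tuple(space: spaces.Space, action) -> ActionTuple:
    # Mirrors the branching used by the PettingZoo wrapper above.
    if isinstance(space, spaces.Tuple):          # hybrid: (continuous, discrete)
        return ActionTuple(np.array(action[0]), np.array(action[1]))
    if isinstance(space, spaces.MultiDiscrete):  # one value per discrete branch
        return ActionTuple(None, np.array(action, dtype=np.int32))
    if isinstance(space, spaces.Discrete):       # single branch, reshaped to (1, 1)
        return ActionTuple(None, np.array(action, dtype=np.int32).reshape(1, 1))
    return ActionTuple(np.array(action, dtype=np.float32), None)  # Box: continuous

print(to_action_tuple(spaces.Discrete(3), 2).discrete.shape)  # (1, 1)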
+ """ + pass + + @property + def dones(self): + return dict(self._dones) + + @property + def agents(self): + return sorted(self._live_agents) + + @property + def rewards(self): + return dict(self._rewards) + + @property + def infos(self): + return dict(self._infos) + + @property + def possible_agents(self): + return sorted(self._possible_agents) + + def close(self) -> None: + """ + Close the environment. + """ + if self._env is not None: + self._env.close() + self._env = None # type: ignore + + def __del__(self) -> None: + self.close() + + def state(self): + pass diff --git a/MLPY/Lib/site-packages/mlagents_envs/exception.py b/MLPY/Lib/site-packages/mlagents_envs/exception.py new file mode 100644 index 0000000000000000000000000000000000000000..324cdd0969984da52f6736674edda66ee5a628d8 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/exception.py @@ -0,0 +1,86 @@ +class UnityException(Exception): + """ + Any error related to ml-agents environment. + """ + + pass + + +class UnityEnvironmentException(UnityException): + """ + Related to errors starting and closing environment. + """ + + pass + + +class UnityCommunicationException(UnityException): + """ + Related to errors with the communicator. + """ + + pass + + +class UnityCommunicatorStoppedException(UnityException): + """ + Raised when communicator has stopped gracefully. + """ + + pass + + +class UnityObservationException(UnityException): + """ + Related to errors with receiving observations. + """ + + pass + + +class UnityActionException(UnityException): + """ + Related to errors with sending actions. + """ + + pass + + +class UnityTimeOutException(UnityException): + """ + Related to errors with communication timeouts. + """ + + pass + + +class UnitySideChannelException(UnityException): + """ + Related to errors with side channels. + """ + + pass + + +class UnityWorkerInUseException(UnityException): + """ + This error occurs when the port for a certain worker ID is already reserved. + """ + + MESSAGE_TEMPLATE = ( + "Couldn't start socket communication because worker number {} is still in use. " + "You may need to manually close a previously opened environment " + "or use a different worker number." + ) + + def __init__(self, worker_id): + message = self.MESSAGE_TEMPLATE.format(str(worker_id)) + super().__init__(message) + + +class UnityPolicyException(UnityException): + """ + Related to errors with the Trainer. + """ + + pass diff --git a/MLPY/Lib/site-packages/mlagents_envs/logging_util.py b/MLPY/Lib/site-packages/mlagents_envs/logging_util.py new file mode 100644 index 0000000000000000000000000000000000000000..ddd4e3b3cf56eed10afe4208b68ce89543e2377b --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/logging_util.py @@ -0,0 +1,63 @@ +import logging # noqa I251 +import sys + +CRITICAL = logging.CRITICAL +FATAL = logging.FATAL +ERROR = logging.ERROR +WARNING = logging.WARNING +INFO = logging.INFO +DEBUG = logging.DEBUG +NOTSET = logging.NOTSET + +_loggers = set() +_log_level = NOTSET +DATE_FORMAT = "%Y-%m-%d %H:%M:%S" +DEBUG_LOG_FORMAT = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" +LOG_FORMAT = "[%(levelname)s] %(message)s" + + +def get_logger(name: str) -> logging.Logger: + """ + Create a logger with the specified name. 
The logger will use the log level + specified by set_log_level() + """ + logger = logging.getLogger(name=name) + + if _log_level == DEBUG: + formatter = logging.Formatter(fmt=DEBUG_LOG_FORMAT, datefmt=DATE_FORMAT) + else: + formatter = logging.Formatter(fmt=LOG_FORMAT) + handler = logging.StreamHandler(stream=sys.stdout) + handler.setFormatter(formatter) + logger.addHandler(handler) + + # If we've already set the log level, make sure new loggers use it + if _log_level != NOTSET: + logger.setLevel(_log_level) + + # Keep track of this logger so that we can change the log level later + _loggers.add(logger) + return logger + + +def set_log_level(log_level: int) -> None: + """ + Set the ML-Agents logging level. This will also configure the logging format (if it hasn't already been set). + """ + global _log_level + _log_level = log_level + + for logger in _loggers: + logger.setLevel(log_level) + + if log_level == DEBUG: + formatter = logging.Formatter(fmt=DEBUG_LOG_FORMAT, datefmt=DATE_FORMAT) + else: + formatter = logging.Formatter(LOG_FORMAT) + _set_formatter_for_all_loggers(formatter) + + +def _set_formatter_for_all_loggers(formatter: logging.Formatter) -> None: + for logger in _loggers: + for handler in logger.handlers[:]: + handler.setFormatter(formatter) diff --git a/MLPY/Lib/site-packages/mlagents_envs/mock_communicator.py b/MLPY/Lib/site-packages/mlagents_envs/mock_communicator.py new file mode 100644 index 0000000000000000000000000000000000000000..0e425e2759e354d80b16f19b350743ea61acb7e8 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/mock_communicator.py @@ -0,0 +1,111 @@ +from typing import Optional + +from .communicator import Communicator, PollCallback +from .environment import UnityEnvironment +from mlagents_envs.communicator_objects.unity_rl_output_pb2 import UnityRLOutputProto +from mlagents_envs.communicator_objects.brain_parameters_pb2 import ( + BrainParametersProto, + ActionSpecProto, +) +from mlagents_envs.communicator_objects.unity_rl_initialization_output_pb2 import ( + UnityRLInitializationOutputProto, +) +from mlagents_envs.communicator_objects.unity_input_pb2 import UnityInputProto +from mlagents_envs.communicator_objects.unity_output_pb2 import UnityOutputProto +from mlagents_envs.communicator_objects.agent_info_pb2 import AgentInfoProto +from mlagents_envs.communicator_objects.observation_pb2 import ( + ObservationProto, + NONE as COMPRESSION_TYPE_NONE, + PNG as COMPRESSION_TYPE_PNG, +) + + +class MockCommunicator(Communicator): + def __init__( + self, + discrete_action=False, + visual_inputs=0, + num_agents=3, + brain_name="RealFakeBrain", + vec_obs_size=3, + ): + """ + Python side of the grpc communication. 
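A small sketch of how these helpers fit together; switching to DEBUG also switches every logger created through get_logger() to the verbose format with timestamps, file names and line numbers.

from mlagents_envs import logging_util

logging_util.set_log_level(logging_util.INFO)
logger = logging_util.get_logger(__name__)
logger.info("Starting run")        # emitted as "[INFO] Starting run"

logging_util.set_log_level(logging_util.DEBUG)
logger.debug("Now visible, with the detailed format")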
Python is the client and Unity the server + """ + super().__init__() + self.is_discrete = discrete_action + self.steps = 0 + self.visual_inputs = visual_inputs + self.has_been_closed = False + self.num_agents = num_agents + self.brain_name = brain_name + self.vec_obs_size = vec_obs_size + + def initialize( + self, inputs: UnityInputProto, poll_callback: Optional[PollCallback] = None + ) -> UnityOutputProto: + if self.is_discrete: + action_spec = ActionSpecProto( + num_discrete_actions=2, discrete_branch_sizes=[3, 2] + ) + else: + action_spec = ActionSpecProto(num_continuous_actions=2) + bp = BrainParametersProto( + brain_name=self.brain_name, is_training=True, action_spec=action_spec + ) + rl_init = UnityRLInitializationOutputProto( + name="RealFakeAcademy", + communication_version=UnityEnvironment.API_VERSION, + package_version="mock_package_version", + log_path="", + brain_parameters=[bp], + ) + output = UnityRLOutputProto(agentInfos=self._get_agent_infos()) + return UnityOutputProto(rl_initialization_output=rl_init, rl_output=output) + + def _get_agent_infos(self): + dict_agent_info = {} + list_agent_info = [] + vector_obs = [1, 2, 3] + + observations = [ + ObservationProto( + compressed_data=None, + shape=[30, 40, 3], + compression_type=COMPRESSION_TYPE_PNG, + ) + for _ in range(self.visual_inputs) + ] + vector_obs_proto = ObservationProto( + float_data=ObservationProto.FloatData(data=vector_obs), + shape=[len(vector_obs)], + compression_type=COMPRESSION_TYPE_NONE, + ) + observations.append(vector_obs_proto) + + for i in range(self.num_agents): + list_agent_info.append( + AgentInfoProto( + reward=1, + done=(i == 2), + max_step_reached=False, + id=i, + observations=observations, + ) + ) + dict_agent_info["RealFakeBrain"] = UnityRLOutputProto.ListAgentInfoProto( + value=list_agent_info + ) + return dict_agent_info + + def exchange( + self, inputs: UnityInputProto, poll_callback: Optional[PollCallback] = None + ) -> UnityOutputProto: + result = UnityRLOutputProto(agentInfos=self._get_agent_infos()) + return UnityOutputProto(rl_output=result) + + def close(self): + """ + Sends a shutdown signal to the unity environment, and closes the grpc connection. 
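A sketch of how the mock might be exercised in a test; it only relies on fields constructed by the code above, and it is an illustration rather than an official test helper.

from mlagents_envs.mock_communicator import MockCommunicator
from mlagents_envs.communicator_objects.unity_input_pb2 import UnityInputProto

comm = MockCommunicator(discrete_action=True, visual_inputs=1, num_agents=3)
init_output = comm.initialize(UnityInputProto())
print(init_output.rl_initialization_output.brain_parameters[0].brain_name)  # RealFakeBrain

step_output = comm.exchange(UnityInputProto())
agent_infos = step_output.rl_output.agentInfos["RealFakeBrain"].value
print(len(agent_infos))  # 3 agents, the third flagged as done
comm.close()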
+ """ + self.has_been_closed = True diff --git a/MLPY/Lib/site-packages/mlagents_envs/registry/__init__.py b/MLPY/Lib/site-packages/mlagents_envs/registry/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..74c0e8a7090fbf045dedf89409459e69263794ef --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/registry/__init__.py @@ -0,0 +1,4 @@ +from mlagents_envs.registry.unity_env_registry import ( # noqa F401 + default_registry, + UnityEnvRegistry, +) diff --git a/MLPY/Lib/site-packages/mlagents_envs/registry/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/registry/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..56a19980c23960888efa8c46cdad6cbefd46f41f Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/registry/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/registry/__pycache__/base_registry_entry.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/registry/__pycache__/base_registry_entry.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..63a47e9bb0c2b40769c0eb88b76786abccaf3e42 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/registry/__pycache__/base_registry_entry.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/registry/__pycache__/binary_utils.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/registry/__pycache__/binary_utils.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..77dd5e8a06462e026f0a46e6583e9e5f29136590 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/registry/__pycache__/binary_utils.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/registry/__pycache__/remote_registry_entry.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/registry/__pycache__/remote_registry_entry.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d22903f8d94c0cfe5fb46d27cf17504091f3f7b4 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/registry/__pycache__/remote_registry_entry.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/registry/__pycache__/unity_env_registry.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/registry/__pycache__/unity_env_registry.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1d9d9e30860e115eacadf669feb9da4bc1aaf6d8 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/registry/__pycache__/unity_env_registry.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/registry/base_registry_entry.py b/MLPY/Lib/site-packages/mlagents_envs/registry/base_registry_entry.py new file mode 100644 index 0000000000000000000000000000000000000000..f72009f2a950749e199cacf800fa7cbce9a95e33 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/registry/base_registry_entry.py @@ -0,0 +1,56 @@ +from abc import abstractmethod +from typing import Any, Optional +from mlagents_envs.base_env import BaseEnv + + +class BaseRegistryEntry: + def __init__( + self, + identifier: str, + expected_reward: Optional[float], + description: Optional[str], + ): + """ + BaseRegistryEntry allows launching a Unity Environment with its make method. + :param identifier: The name of the Unity Environment. + :param expected_reward: The cumulative reward that an Agent must receive + for the task to be considered solved. 
+ :param description: A description of the Unity Environment. Contains human + readable information about potential special arguments that the make method can + take as well as information regarding the observation, reward, actions, + behaviors and number of agents in the Environment. + """ + self._identifier = identifier + self._expected_reward = expected_reward + self._description = description + + @property + def identifier(self) -> str: + """ + The unique identifier of the entry + """ + return self._identifier + + @property + def expected_reward(self) -> Optional[float]: + """ + The cumulative reward that an Agent must receive for the task to be considered + solved. + """ + return self._expected_reward + + @property + def description(self) -> Optional[str]: + """ + A description of the Unity Environment the entry can make. + """ + return self._description + + @abstractmethod + def make(self, **kwargs: Any) -> BaseEnv: + """ + This method creates a Unity BaseEnv (usually a UnityEnvironment). + """ + raise NotImplementedError( + f"The make() method not implemented for entry {self.identifier}" + ) diff --git a/MLPY/Lib/site-packages/mlagents_envs/registry/binary_utils.py b/MLPY/Lib/site-packages/mlagents_envs/registry/binary_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..41078a46795261c433e25c0327100469166af1c5 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/registry/binary_utils.py @@ -0,0 +1,259 @@ +import urllib.request +import tempfile +import os +import uuid +import shutil +import glob + +import yaml +import hashlib + +from zipfile import ZipFile +from sys import platform +from typing import Tuple, Optional, Dict, Any + +from filelock import FileLock + +from mlagents_envs.env_utils import validate_environment_path + +from mlagents_envs.logging_util import get_logger + +logger = get_logger(__name__) + +# The default logical block size is 8192 bytes (8 KB) for UFS file systems. +BLOCK_SIZE = 8192 + + +def get_local_binary_path(name: str, url: str, tmp_dir: Optional[str] = None) -> str: + """ + Returns the path to the executable previously downloaded with the name argument. If + None is found, the executable at the url argument will be downloaded and stored + under name for future uses. + :param name: The name that will be given to the folder containing the extracted data + :param url: The URL of the zip file + :param: tmp_dir: Optional override for the temporary directory to save binaries and zips in. + """ + NUMBER_ATTEMPTS = 5 + tmp_dir = tmp_dir or tempfile.gettempdir() + lock = FileLock(os.path.join(tmp_dir, name + ".lock")) + with lock: + path = get_local_binary_path_if_exists(name, url, tmp_dir=tmp_dir) + if path is None: + logger.debug( + f"Local environment {name} not found, downloading environment from {url}" + ) + for attempt in range( + NUMBER_ATTEMPTS + ): # Perform 5 attempts at downloading the file + if path is not None: + break + try: + download_and_extract_zip(url, name, tmp_dir=tmp_dir) + except Exception: + if attempt + 1 < NUMBER_ATTEMPTS: + logger.warning( + f"Attempt {attempt + 1} / {NUMBER_ATTEMPTS}" + ": Failed to download and extract binary." 
+ ) + else: + raise + path = get_local_binary_path_if_exists(name, url, tmp_dir=tmp_dir) + + if path is None: + raise FileNotFoundError( + f"Binary not found, make sure {url} is a valid url to " + "a zip folder containing a valid Unity executable" + ) + return path + + +def get_local_binary_path_if_exists(name: str, url: str, tmp_dir: str) -> Optional[str]: + """ + Recursively searches for a Unity executable in the extracted files folders. This is + platform dependent : It will only return a Unity executable compatible with the + computer's OS. If no executable is found, None will be returned. + :param name: The name/identifier of the executable + :param url: The url the executable was downloaded from (for verification) + :param: tmp_dir: Optional override for the temporary directory to save binaries and zips in. + """ + _, bin_dir = get_tmp_dirs(tmp_dir) + extension = None + + if platform == "linux" or platform == "linux2": + extension = "*.x86_64" + if platform == "darwin": + extension = "*.app" + if platform == "win32": + extension = "*.exe" + if extension is None: + raise NotImplementedError("No extensions found for this platform.") + url_hash = "-" + hashlib.md5(url.encode()).hexdigest() + path = os.path.join(bin_dir, name + url_hash, "**", extension) + candidates = glob.glob(path, recursive=True) + if len(candidates) == 0: + return None + else: + for c in candidates: + # Unity sometimes produces another .exe file that we must filter out + if "UnityCrashHandler64" not in c: + # If the file is not valid, return None and delete faulty directory + if validate_environment_path(c) is None: + shutil.rmtree(c) + return None + return c + return None + + +def _get_tmp_dir_helper(tmp_dir: Optional[str] = None) -> Tuple[str, str]: + tmp_dir = tmp_dir or ("/tmp" if platform == "darwin" else tempfile.gettempdir()) + MLAGENTS = "ml-agents-binaries" + TMP_FOLDER_NAME = "tmp" + BINARY_FOLDER_NAME = "binaries" + mla_directory = os.path.join(tmp_dir, MLAGENTS) + if not os.path.exists(mla_directory): + os.makedirs(mla_directory) + os.chmod(mla_directory, 16877) + zip_directory = os.path.join(tmp_dir, MLAGENTS, TMP_FOLDER_NAME) + if not os.path.exists(zip_directory): + os.makedirs(zip_directory) + os.chmod(zip_directory, 16877) + bin_directory = os.path.join(tmp_dir, MLAGENTS, BINARY_FOLDER_NAME) + if not os.path.exists(bin_directory): + os.makedirs(bin_directory) + os.chmod(bin_directory, 16877) + return zip_directory, bin_directory + + +def get_tmp_dirs(tmp_dir: Optional[str] = None) -> Tuple[str, str]: + """ + Returns the path to the folder containing the downloaded zip files and the extracted + binaries. If these folders do not exist, they will be created. + :retrun: Tuple containing path to : (zip folder, extracted files folder) + """ + # TODO: Once we don't use python 3.7 we should just use exists_ok=True when creating the dirs to avoid this. + # Should only be able to error out 3 times (once for each subdir). + for _attempt in range(3): + try: + return _get_tmp_dir_helper(tmp_dir) + except FileExistsError: + continue + return _get_tmp_dir_helper(tmp_dir) + + +def download_and_extract_zip( + url: str, name: str, tmp_dir: Optional[str] = None +) -> None: + """ + Downloads a zip file under a URL, extracts its contents into a folder with the name + argument and gives chmod 755 to all the files it contains. Files are downloaded and + extracted into special folders in the temp folder of the machine. 
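A sketch of the intended call pattern, with a placeholder name and URL: the helper caches the extracted build under the machine's temp directory and returns the path to the platform-specific executable, which can then be passed to UnityEnvironment.

from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.registry.binary_utils import get_local_binary_path

# Placeholder name and URL for a zipped Unity build.
exe_path = get_local_binary_path("MyEnv", "https://example.com/MyEnv.zip")
env = UnityEnvironment(file_name=exe_path)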
+ :param url: The URL of the zip file + :param name: The name that will be given to the folder containing the extracted data + :param: tmp_dir: Optional override for the temporary directory to save binaries and zips in. + """ + zip_dir, bin_dir = get_tmp_dirs(tmp_dir) + url_hash = "-" + hashlib.md5(url.encode()).hexdigest() + binary_path = os.path.join(bin_dir, name + url_hash) + if os.path.exists(binary_path): + shutil.rmtree(binary_path) + + # Download zip + try: + request = urllib.request.urlopen(url, timeout=30) + except urllib.error.HTTPError as e: # type: ignore + e.reason = f"{e.reason} {url}" + raise + zip_size = int(request.headers["content-length"]) + zip_file_path = os.path.join(zip_dir, str(uuid.uuid4()) + ".zip") + with open(zip_file_path, "wb") as zip_file: + downloaded = 0 + while True: + buffer = request.read(BLOCK_SIZE) + if not buffer: + # There is nothing more to read + break + downloaded += len(buffer) + zip_file.write(buffer) + downloaded_percent = downloaded / zip_size * 100 + print_progress(f" Downloading {name}", downloaded_percent) + print("") + + # Extraction + with ZipFileWithProgress(zip_file_path, "r") as zip_ref: + zip_ref.extract_zip(f" Extracting {name}", binary_path) # type: ignore + print("") + + # Clean up zip + print_progress(f" Cleaning up {name}", 0) + os.remove(zip_file_path) + + # Give permission + for f in glob.glob(binary_path + "/**/*", recursive=True): + # 16877 is octal 40755, which denotes a directory with permissions 755 + os.chmod(f, 16877) + print_progress(f" Cleaning up {name}", 100) + print("") + + +def print_progress(prefix: str, percent: float) -> None: + """ + Displays a single progress bar in the terminal with value percent. + :param prefix: The string that will precede the progress bar. + :param percent: The percent progression of the bar (min is 0, max is 100) + """ + BAR_LEN = 20 + percent = min(100, max(0, percent)) + bar_progress = min(int(percent / 100 * BAR_LEN), BAR_LEN) + bar = "|" + "\u2588" * bar_progress + " " * (BAR_LEN - bar_progress) + "|" + str_percent = "%3.0f%%" % percent + print(f"{prefix} : {bar} {str_percent} \r", end="", flush=True) + + +def load_remote_manifest(url: str) -> Dict[str, Any]: + """ + Converts a remote yaml file into a Python dictionary + """ + tmp_dir, _ = get_tmp_dirs() + try: + request = urllib.request.urlopen(url, timeout=30) + except urllib.error.HTTPError as e: # type: ignore + e.reason = f"{e.reason} {url}" + raise + manifest_path = os.path.join(tmp_dir, str(uuid.uuid4()) + ".yaml") + with open(manifest_path, "wb") as manifest: + while True: + buffer = request.read(BLOCK_SIZE) + if not buffer: + # There is nothing more to read + break + manifest.write(buffer) + try: + result = load_local_manifest(manifest_path) + finally: + os.remove(manifest_path) + return result + + +def load_local_manifest(path: str) -> Dict[str, Any]: + """ + Converts a local yaml file into a Python dictionary + """ + with open(path) as data_file: + return yaml.safe_load(data_file) + + +class ZipFileWithProgress(ZipFile): + """ + This is a helper class inheriting from ZipFile that allows to display a progress + bar while the files are being extracted. 
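print_progress simply redraws a single text bar in place, so it can be reused for any long-running loop; a trivial sketch:

import time
from mlagents_envs.registry.binary_utils import print_progress

total = 50
for done in range(total + 1):
    print_progress(" Copying files", done / total * 100)
    time.sleep(0.01)
print("")  # move past the carriage-return based bar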
+ """ + + def extract_zip(self, prefix: str, path: str) -> None: + members = self.namelist() + path = os.fspath(path) + total = len(members) + n = 0 + for zipinfo in members: + self.extract(zipinfo, path, None) # type: ignore + n += 1 + print_progress(prefix, n / total * 100) diff --git a/MLPY/Lib/site-packages/mlagents_envs/registry/remote_registry_entry.py b/MLPY/Lib/site-packages/mlagents_envs/registry/remote_registry_entry.py new file mode 100644 index 0000000000000000000000000000000000000000..816d7331ba70e0faf26fbd7957ec888670cc123c --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/registry/remote_registry_entry.py @@ -0,0 +1,75 @@ +from sys import platform +from typing import Optional, Any, List +from mlagents_envs.environment import UnityEnvironment +from mlagents_envs.base_env import BaseEnv +from mlagents_envs.registry.binary_utils import get_local_binary_path +from mlagents_envs.registry.base_registry_entry import BaseRegistryEntry + + +class RemoteRegistryEntry(BaseRegistryEntry): + def __init__( + self, + identifier: str, + expected_reward: Optional[float], + description: Optional[str], + linux_url: Optional[str], + darwin_url: Optional[str], + win_url: Optional[str], + additional_args: Optional[List[str]] = None, + tmp_dir: Optional[str] = None, + ): + """ + A RemoteRegistryEntry is an implementation of BaseRegistryEntry that uses a + Unity executable downloaded from the internet to launch a UnityEnvironment. + __Note__: The url provided must be a link to a `.zip` file containing a single + compressed folder with the executable inside. There can only be one executable + in the folder and it must be at the root of the folder. + :param identifier: The name of the Unity Environment. + :param expected_reward: The cumulative reward that an Agent must receive + for the task to be considered solved. + :param description: A description of the Unity Environment. Contains human + readable information about potential special arguments that the make method can + take as well as information regarding the observation, reward, actions, + behaviors and number of agents in the Environment. + :param linux_url: The url of the Unity executable for the Linux platform + :param darwin_url: The url of the Unity executable for the OSX platform + :param win_url: The url of the Unity executable for the Windows platform + """ + super().__init__(identifier, expected_reward, description) + self._linux_url = linux_url + self._darwin_url = darwin_url + self._win_url = win_url + self._add_args = additional_args + self._tmp_dir_override = tmp_dir + + def make(self, **kwargs: Any) -> BaseEnv: + """ + Returns the UnityEnvironment that corresponds to the Unity executable found at + the provided url. 
The arguments passed to this method will be passed to the + constructor of the UnityEnvironment (except for the file_name argument) + """ + url = None + if platform == "linux" or platform == "linux2": + url = self._linux_url + if platform == "darwin": + url = self._darwin_url + if platform == "win32": + url = self._win_url + if url is None: + raise FileNotFoundError( + f"The entry {self.identifier} does not contain a valid url for this " + "platform" + ) + path = get_local_binary_path( + self.identifier, url, tmp_dir=self._tmp_dir_override + ) + if "file_name" in kwargs: + kwargs.pop("file_name") + args: List[str] = [] + if "additional_args" in kwargs: + if kwargs["additional_args"] is not None: + args += kwargs["additional_args"] + if self._add_args is not None: + args += self._add_args + kwargs["additional_args"] = args + return UnityEnvironment(file_name=path, **kwargs) diff --git a/MLPY/Lib/site-packages/mlagents_envs/registry/unity_env_registry.py b/MLPY/Lib/site-packages/mlagents_envs/registry/unity_env_registry.py new file mode 100644 index 0000000000000000000000000000000000000000..639f85794774c9271cda215e2aaddbf3a5552852 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/registry/unity_env_registry.py @@ -0,0 +1,125 @@ +from typing import Dict, Iterator, Any, List +from collections.abc import Mapping +from mlagents_envs.registry.base_registry_entry import BaseRegistryEntry +from mlagents_envs.registry.binary_utils import ( + load_local_manifest, + load_remote_manifest, +) +from mlagents_envs.registry.remote_registry_entry import RemoteRegistryEntry + + +class UnityEnvRegistry(Mapping): + """ + ### UnityEnvRegistry + Provides a library of Unity environments that can be launched without the need + of downloading the Unity Editor. + The UnityEnvRegistry implements a Map, to access an entry of the Registry, use: + ```python + registry = UnityEnvRegistry() + entry = registry[] + ``` + An entry has the following properties : + * `identifier` : Uniquely identifies this environment + * `expected_reward` : Corresponds to the reward an agent must obtained for the task + to be considered completed. + * `description` : A human readable description of the environment. + + To launch a Unity environment from a registry entry, use the `make` method: + ```python + registry = UnityEnvRegistry() + env = registry[].make() + ``` + """ + + def __init__(self): + self._REGISTERED_ENVS: Dict[str, BaseRegistryEntry] = {} + self._manifests: List[str] = [] + self._sync = True + + def register(self, new_entry: BaseRegistryEntry) -> None: + """ + Registers a new BaseRegistryEntry to the registry. The + BaseRegistryEntry.identifier value will be used as indexing key. + If two are more environments are registered under the same key, the most + recentry added will replace the others. + """ + self._REGISTERED_ENVS[new_entry.identifier] = new_entry + + def register_from_yaml(self, path_to_yaml: str) -> None: + """ + Registers the environments listed in a yaml file (either local or remote). Note + that the entries are registered lazily: the registration will only happen when + an environment is accessed. + The yaml file must have the following format : + ```yaml + environments: + - : + expected_reward: + description: | + + linux_url: + darwin_url: + win_url: + + - : + expected_reward: + description: | + + linux_url: + darwin_url: + win_url: + + - ... 
+ ``` + :param path_to_yaml: A local path or url to the yaml file + """ + self._manifests.append(path_to_yaml) + self._sync = False + + def _load_all_manifests(self) -> None: + if not self._sync: + for path_to_yaml in self._manifests: + if path_to_yaml[:4] == "http": + manifest = load_remote_manifest(path_to_yaml) + else: + manifest = load_local_manifest(path_to_yaml) + for env in manifest["environments"]: + remote_entry_args = list(env.values())[0] + remote_entry_args["identifier"] = list(env.keys())[0] + self.register(RemoteRegistryEntry(**remote_entry_args)) + self._manifests = [] + self._sync = True + + def clear(self) -> None: + """ + Deletes all entries in the registry. + """ + self._REGISTERED_ENVS.clear() + self._manifests = [] + self._sync = True + + def __getitem__(self, identifier: str) -> BaseRegistryEntry: + """ + Returns the BaseRegistryEntry with the provided identifier. BaseRegistryEntry + can then be used to make a Unity Environment. + :param identifier: The identifier of the BaseRegistryEntry + :returns: The associated BaseRegistryEntry + """ + self._load_all_manifests() + if identifier not in self._REGISTERED_ENVS: + raise KeyError(f"The entry {identifier} is not present in the registry.") + return self._REGISTERED_ENVS[identifier] + + def __len__(self) -> int: + self._load_all_manifests() + return len(self._REGISTERED_ENVS) + + def __iter__(self) -> Iterator[Any]: + self._load_all_manifests() + yield from self._REGISTERED_ENVS + + +default_registry = UnityEnvRegistry() +default_registry.register_from_yaml( + "https://storage.googleapis.com/mlagents-test-environments/1.0.0/manifest.yaml" +) # noqa E501 diff --git a/MLPY/Lib/site-packages/mlagents_envs/rpc_communicator.py b/MLPY/Lib/site-packages/mlagents_envs/rpc_communicator.py new file mode 100644 index 0000000000000000000000000000000000000000..13c0df08b1c36241dd3858c47dc82366746c87e9 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/rpc_communicator.py @@ -0,0 +1,158 @@ +import grpc +from typing import Optional + +from multiprocessing import Pipe +from sys import platform +import socket +import time +from concurrent.futures import ThreadPoolExecutor + +from .communicator import Communicator, PollCallback +from mlagents_envs.communicator_objects.unity_to_external_pb2_grpc import ( + UnityToExternalProtoServicer, + add_UnityToExternalProtoServicer_to_server, +) +from mlagents_envs.communicator_objects.unity_message_pb2 import UnityMessageProto +from mlagents_envs.communicator_objects.unity_input_pb2 import UnityInputProto +from mlagents_envs.communicator_objects.unity_output_pb2 import UnityOutputProto +from .exception import UnityTimeOutException, UnityWorkerInUseException + + +class UnityToExternalServicerImplementation(UnityToExternalProtoServicer): + def __init__(self): + self.parent_conn, self.child_conn = Pipe() + + def Initialize(self, request, context): + self.child_conn.send(request) + return self.child_conn.recv() + + def Exchange(self, request, context): + self.child_conn.send(request) + return self.child_conn.recv() + + +class RpcCommunicator(Communicator): + def __init__(self, worker_id=0, base_port=5005, timeout_wait=30): + """ + Python side of the grpc communication. Python is the server and Unity the client + + + :int base_port: Baseline port number to connect to Unity environment over. worker_id increments over this. + :int worker_id: Offset from base_port. Used for training multiple environments simultaneously. + :int timeout_wait: Timeout (in seconds) to wait for a response before exiting. 
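A sketch of browsing the registry above as the Mapping it implements; the identifiers come from the remote manifest, so the one used for make() is left as whatever the loop last yielded rather than a hard-coded name.

from mlagents_envs.registry import default_registry

# The registry behaves like a read-only mapping of identifier -> entry.
for identifier in default_registry:
    entry = default_registry[identifier]
    print(identifier, entry.expected_reward, entry.description)

env = default_registry[identifier].make()   # pick any identifier listed above
env.close()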
+ """ + super().__init__(worker_id, base_port) + self.port = base_port + worker_id + self.worker_id = worker_id + self.timeout_wait = timeout_wait + self.server = None + self.unity_to_external = None + self.is_open = False + self.create_server() + + def create_server(self): + """ + Creates the GRPC server. + """ + self.check_port(self.port) + + try: + # Establish communication grpc + self.server = grpc.server( + thread_pool=ThreadPoolExecutor(max_workers=10), + options=(("grpc.so_reuseport", 1),), + ) + self.unity_to_external = UnityToExternalServicerImplementation() + add_UnityToExternalProtoServicer_to_server( + self.unity_to_external, self.server + ) + # Using unspecified address, which means that grpc is communicating on all IPs + # This is so that the docker container can connect. + self.server.add_insecure_port("[::]:" + str(self.port)) + self.server.start() + self.is_open = True + except Exception: + raise UnityWorkerInUseException(self.worker_id) + + def check_port(self, port): + """ + Attempts to bind to the requested communicator port, checking if it is already in use. + """ + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + if platform == "linux" or platform == "linux2": + # On linux, the port remains unusable for TIME_WAIT=60 seconds after closing + # SO_REUSEADDR frees the port right after closing the environment + s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + try: + s.bind(("localhost", port)) + except OSError: + raise UnityWorkerInUseException(self.worker_id) + finally: + s.close() + + def poll_for_timeout(self, poll_callback: Optional[PollCallback] = None) -> None: + """ + Polls the GRPC parent connection for data, to be used before calling recv. This prevents + us from hanging indefinitely in the case where the environment process has died or was not + launched. + + Additionally, a callback can be passed to periodically check the state of the environment. + This is used to detect the case when the environment dies without cleaning up the connection, + so that we can stop sooner and raise a more appropriate error. + """ + deadline = time.monotonic() + self.timeout_wait + callback_timeout_wait = self.timeout_wait // 10 + while time.monotonic() < deadline: + if self.unity_to_external.parent_conn.poll(callback_timeout_wait): + # Got an acknowledgment from the connection + return + if poll_callback: + # Fire the callback - if it detects something wrong, it should raise an exception. + poll_callback() + + # Got this far without reading any data from the connection, so it must be dead. + raise UnityTimeOutException( + "The Unity environment took too long to respond. Make sure that :\n" + "\t The environment does not need user interaction to launch\n" + '\t The Agents\' Behavior Parameters > Behavior Type is set to "Default"\n' + "\t The environment and the Python interface have compatible versions.\n" + "\t If you're running on a headless server without graphics support, turn off display " + "by either passing --no-graphics option or build your Unity executable as server build." 
+ ) + + def initialize( + self, inputs: UnityInputProto, poll_callback: Optional[PollCallback] = None + ) -> UnityOutputProto: + self.poll_for_timeout(poll_callback) + aca_param = self.unity_to_external.parent_conn.recv().unity_output + message = UnityMessageProto() + message.header.status = 200 + message.unity_input.CopyFrom(inputs) + self.unity_to_external.parent_conn.send(message) + self.unity_to_external.parent_conn.recv() + return aca_param + + def exchange( + self, inputs: UnityInputProto, poll_callback: Optional[PollCallback] = None + ) -> Optional[UnityOutputProto]: + message = UnityMessageProto() + message.header.status = 200 + message.unity_input.CopyFrom(inputs) + self.unity_to_external.parent_conn.send(message) + self.poll_for_timeout(poll_callback) + output = self.unity_to_external.parent_conn.recv() + if output.header.status != 200: + return None + return output.unity_output + + def close(self): + """ + Sends a shutdown signal to the unity environment, and closes the grpc connection. + """ + if self.is_open: + message_input = UnityMessageProto() + message_input.header.status = 400 + self.unity_to_external.parent_conn.send(message_input) + self.unity_to_external.parent_conn.close() + self.server.stop(False) + self.is_open = False diff --git a/MLPY/Lib/site-packages/mlagents_envs/rpc_utils.py b/MLPY/Lib/site-packages/mlagents_envs/rpc_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..f2e3d1d4684d998434d714092b0010bdd12653cb --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/rpc_utils.py @@ -0,0 +1,431 @@ +from mlagents_envs.base_env import ( + ActionSpec, + ObservationSpec, + DimensionProperty, + BehaviorSpec, + DecisionSteps, + TerminalSteps, + ObservationType, +) +from mlagents_envs.exception import UnityObservationException +from mlagents_envs.timers import hierarchical_timer, timed +from mlagents_envs.communicator_objects.agent_info_pb2 import AgentInfoProto +from mlagents_envs.communicator_objects.observation_pb2 import ( + ObservationProto, + NONE as COMPRESSION_TYPE_NONE, +) +from mlagents_envs.communicator_objects.brain_parameters_pb2 import BrainParametersProto +import numpy as np +import io +from typing import cast, List, Tuple, Collection, Optional, Iterable +from PIL import Image + + +PNG_HEADER = b"\x89PNG\r\n\x1a\n" + + +def behavior_spec_from_proto( + brain_param_proto: BrainParametersProto, agent_info: AgentInfoProto +) -> BehaviorSpec: + """ + Converts brain parameter and agent info proto to BehaviorSpec object. + :param brain_param_proto: protobuf object. + :param agent_info: protobuf object. + :return: BehaviorSpec object. 
+ """ + observation_specs = [] + for obs in agent_info.observations: + observation_specs.append( + ObservationSpec( + name=obs.name, + shape=tuple(obs.shape), + observation_type=ObservationType(obs.observation_type), + dimension_property=tuple( + DimensionProperty(dim) for dim in obs.dimension_properties + ) + if len(obs.dimension_properties) > 0 + else (DimensionProperty.UNSPECIFIED,) * len(obs.shape), + ) + ) + + # proto from communicator < v1.3 does not set action spec, use deprecated fields instead + if ( + brain_param_proto.action_spec.num_continuous_actions == 0 + and brain_param_proto.action_spec.num_discrete_actions == 0 + ): + if brain_param_proto.vector_action_space_type_deprecated == 1: + action_spec = ActionSpec( + brain_param_proto.vector_action_size_deprecated[0], () + ) + else: + action_spec = ActionSpec( + 0, tuple(brain_param_proto.vector_action_size_deprecated) + ) + else: + action_spec_proto = brain_param_proto.action_spec + action_spec = ActionSpec( + action_spec_proto.num_continuous_actions, + tuple(branch for branch in action_spec_proto.discrete_branch_sizes), + ) + return BehaviorSpec(observation_specs, action_spec) + + +class OffsetBytesIO: + """ + Simple file-like class that wraps a bytes, and allows moving its "start" + position in the bytes. This is only used for reading concatenated PNGs, + because Pillow always calls seek(0) at the start of reading. + """ + + __slots__ = ["fp", "offset"] + + def __init__(self, data: bytes): + self.fp = io.BytesIO(data) + self.offset = 0 + + def seek(self, offset: int, whence: int = io.SEEK_SET) -> int: + if whence == io.SEEK_SET: + res = self.fp.seek(offset + self.offset) + return res - self.offset + raise NotImplementedError() + + def tell(self) -> int: + return self.fp.tell() - self.offset + + def read(self, size: int = -1) -> bytes: + return self.fp.read(size) + + def original_tell(self) -> int: + """ + Returns the offset into the original byte array + """ + return self.fp.tell() + + +@timed +def process_pixels( + image_bytes: bytes, expected_channels: int, mappings: Optional[List[int]] = None +) -> np.ndarray: + """ + Converts byte array observation image into numpy array, re-sizes it, + and optionally converts it to grey scale + :param image_bytes: input byte array corresponding to image + :param expected_channels: Expected output channels + :return: processed numpy array of observation from environment + """ + image_fp = OffsetBytesIO(image_bytes) + + image_arrays = [] + # Read the images back from the bytes (without knowing the sizes). + while True: + with hierarchical_timer("image_decompress"): + image = Image.open(image_fp) + # Normally Image loads lazily, load() forces it to do loading in the timer scope. + image.load() + image_arrays.append(np.array(image, dtype=np.float32) / 255.0) + + # Look for the next header, starting from the current stream location + try: + new_offset = image_bytes.index(PNG_HEADER, image_fp.original_tell()) + image_fp.offset = new_offset + except ValueError: + # Didn't find the header, so must be at the end. + break + + if mappings is not None and len(mappings) > 0: + return _process_images_mapping(image_arrays, mappings) + else: + return _process_images_num_channels(image_arrays, expected_channels) + + +def _process_images_mapping(image_arrays, mappings): + """ + Helper function for processing decompressed images with compressed channel mappings. 
+ """ + image_arrays = np.concatenate(image_arrays, axis=2).transpose((2, 0, 1)) + + if len(mappings) != len(image_arrays): + raise UnityObservationException( + f"Compressed observation and its mapping had different number of channels - " + f"observation had {len(image_arrays)} channels but its mapping had {len(mappings)} channels" + ) + if len({m for m in mappings if m > -1}) != max(mappings) + 1: + raise UnityObservationException( + f"Invalid Compressed Channel Mapping: the mapping {mappings} does not have the correct format." + ) + if max(mappings) >= len(image_arrays): + raise UnityObservationException( + f"Invalid Compressed Channel Mapping: the mapping has index larger than the total " + f"number of channels in observation - mapping index {max(mappings)} is" + f"invalid for input observation with {len(image_arrays)} channels." + ) + + processed_image_arrays: List[np.array] = [[] for _ in range(max(mappings) + 1)] + for mapping_idx, img in zip(mappings, image_arrays): + if mapping_idx > -1: + processed_image_arrays[mapping_idx].append(img) + + for i, img_array in enumerate(processed_image_arrays): + processed_image_arrays[i] = np.mean(img_array, axis=0) + img = np.stack(processed_image_arrays, axis=2) + return img + + +def _process_images_num_channels(image_arrays, expected_channels): + """ + Helper function for processing decompressed images with number of expected channels. + This is for old API without mapping provided. Use the first n channel, n=expected_channels. + """ + if expected_channels == 1: + # Convert to grayscale + img = np.mean(image_arrays[0], axis=2) + img = np.reshape(img, [img.shape[0], img.shape[1], 1]) + else: + img = np.concatenate(image_arrays, axis=2) + # We can drop additional channels since they may need to be added to include + # numbers of observation channels not divisible by 3. + actual_channels = list(img.shape)[2] + if actual_channels > expected_channels: + img = img[..., 0:expected_channels] + return img + + +def _check_observations_match_spec( + obs_index: int, + observation_spec: ObservationSpec, + agent_info_list: Collection[AgentInfoProto], +) -> None: + """ + Check that all the observations match the expected size. + This gives a nicer error than a cryptic numpy error later. + """ + expected_obs_shape = tuple(observation_spec.shape) + for agent_info in agent_info_list: + agent_obs_shape = tuple(agent_info.observations[obs_index].shape) + if expected_obs_shape != agent_obs_shape: + raise UnityObservationException( + f"Observation at index={obs_index} for agent with " + f"id={agent_info.id} didn't match the ObservationSpec. " + f"Expected shape {expected_obs_shape} but got {agent_obs_shape}." + ) + + +@timed +def _observation_to_np_array( + obs: ObservationProto, expected_shape: Optional[Iterable[int]] = None +) -> np.ndarray: + """ + Converts observation proto into numpy array of the appropriate size. + :param obs: observation proto to be converted + :param expected_shape: optional shape information, used for sanity checks. 
+ :return: processed numpy array of observation from environment + """ + if expected_shape is not None: + if list(obs.shape) != list(expected_shape): + raise UnityObservationException( + f"Observation did not have the expected shape - got {obs.shape} but expected {expected_shape}" + ) + expected_channels = obs.shape[2] + if obs.compression_type == COMPRESSION_TYPE_NONE: + img = np.array(obs.float_data.data, dtype=np.float32) + img = np.reshape(img, obs.shape) + return img + else: + img = process_pixels( + obs.compressed_data, expected_channels, list(obs.compressed_channel_mapping) + ) + # Compare decompressed image size to observation shape and make sure they match + if list(obs.shape) != list(img.shape): + raise UnityObservationException( + f"Decompressed observation did not have the expected shape - " + f"decompressed had {img.shape} but expected {obs.shape}" + ) + return img + + +@timed +def _process_maybe_compressed_observation( + obs_index: int, + observation_spec: ObservationSpec, + agent_info_list: Collection[AgentInfoProto], +) -> np.ndarray: + shape = cast(Tuple[int, int, int], observation_spec.shape) + if len(agent_info_list) == 0: + return np.zeros((0, shape[0], shape[1], shape[2]), dtype=np.float32) + + try: + batched_visual = [ + _observation_to_np_array(agent_obs.observations[obs_index], shape) + for agent_obs in agent_info_list + ] + except ValueError: + # Try to get a more useful error message + _check_observations_match_spec(obs_index, observation_spec, agent_info_list) + # If that didn't raise anything, raise the original error + raise + return np.array(batched_visual, dtype=np.float32) + + +def _raise_on_nan_and_inf(data: np.array, source: str) -> np.array: + # Check for NaNs or Infinite values in the observation or reward data. + # If there's a NaN in the observations, the np.mean() result will be NaN + # If there's an Infinite value (either sign) then the result will be Inf + # See https://stackoverflow.com/questions/6736590/fast-check-for-nan-in-numpy for background + # Note that a very large values (larger than sqrt(float_max)) will result in an Inf value here + # Raise a Runtime error in the case that NaNs or Infinite values make it into the data. 
+ if data.size == 0: + return data + + d = np.mean(data) + has_nan = np.isnan(d) + has_inf = not np.isfinite(d) + + if has_nan: + raise RuntimeError(f"The {source} provided had NaN values.") + if has_inf: + raise RuntimeError(f"The {source} provided had Infinite values.") + + +@timed +def _process_rank_one_or_two_observation( + obs_index: int, + observation_spec: ObservationSpec, + agent_info_list: Collection[AgentInfoProto], +) -> np.ndarray: + if len(agent_info_list) == 0: + return np.zeros((0,) + observation_spec.shape, dtype=np.float32) + try: + np_obs = np.array( + [ + agent_obs.observations[obs_index].float_data.data + for agent_obs in agent_info_list + ], + dtype=np.float32, + ).reshape((len(agent_info_list),) + observation_spec.shape) + except ValueError: + # Try to get a more useful error message + _check_observations_match_spec(obs_index, observation_spec, agent_info_list) + # If that didn't raise anything, raise the original error + raise + _raise_on_nan_and_inf(np_obs, "observations") + return np_obs + + +@timed +def steps_from_proto( + agent_info_list: Collection[AgentInfoProto], behavior_spec: BehaviorSpec +) -> Tuple[DecisionSteps, TerminalSteps]: + decision_agent_info_list = [ + agent_info for agent_info in agent_info_list if not agent_info.done + ] + terminal_agent_info_list = [ + agent_info for agent_info in agent_info_list if agent_info.done + ] + decision_obs_list: List[np.ndarray] = [] + terminal_obs_list: List[np.ndarray] = [] + for obs_index, observation_spec in enumerate(behavior_spec.observation_specs): + is_visual = len(observation_spec.shape) == 3 + if is_visual: + decision_obs_list.append( + _process_maybe_compressed_observation( + obs_index, observation_spec, decision_agent_info_list + ) + ) + terminal_obs_list.append( + _process_maybe_compressed_observation( + obs_index, observation_spec, terminal_agent_info_list + ) + ) + else: + decision_obs_list.append( + _process_rank_one_or_two_observation( + obs_index, observation_spec, decision_agent_info_list + ) + ) + terminal_obs_list.append( + _process_rank_one_or_two_observation( + obs_index, observation_spec, terminal_agent_info_list + ) + ) + decision_rewards = np.array( + [agent_info.reward for agent_info in decision_agent_info_list], dtype=np.float32 + ) + terminal_rewards = np.array( + [agent_info.reward for agent_info in terminal_agent_info_list], dtype=np.float32 + ) + + decision_group_rewards = np.array( + [agent_info.group_reward for agent_info in decision_agent_info_list], + dtype=np.float32, + ) + terminal_group_rewards = np.array( + [agent_info.group_reward for agent_info in terminal_agent_info_list], + dtype=np.float32, + ) + + _raise_on_nan_and_inf(decision_rewards, "rewards") + _raise_on_nan_and_inf(terminal_rewards, "rewards") + _raise_on_nan_and_inf(decision_group_rewards, "group_rewards") + _raise_on_nan_and_inf(terminal_group_rewards, "group_rewards") + + decision_group_id = [agent_info.group_id for agent_info in decision_agent_info_list] + terminal_group_id = [agent_info.group_id for agent_info in terminal_agent_info_list] + + max_step = np.array( + [agent_info.max_step_reached for agent_info in terminal_agent_info_list], + dtype=bool, + ) + decision_agent_id = np.array( + [agent_info.id for agent_info in decision_agent_info_list], dtype=np.int32 + ) + terminal_agent_id = np.array( + [agent_info.id for agent_info in terminal_agent_info_list], dtype=np.int32 + ) + action_mask = None + if behavior_spec.action_spec.discrete_size > 0: + if any( + [agent_info.action_mask is not None] + for agent_info 
in decision_agent_info_list + ): + n_agents = len(decision_agent_info_list) + a_size = np.sum(behavior_spec.action_spec.discrete_branches) + mask_matrix = np.ones((n_agents, a_size), dtype=bool) + for agent_index, agent_info in enumerate(decision_agent_info_list): + if agent_info.action_mask is not None: + if len(agent_info.action_mask) == a_size: + mask_matrix[agent_index, :] = [ + False if agent_info.action_mask[k] else True + for k in range(a_size) + ] + action_mask = (1 - mask_matrix).astype(bool) + indices = _generate_split_indices( + behavior_spec.action_spec.discrete_branches + ) + action_mask = np.split(action_mask, indices, axis=1) + return ( + DecisionSteps( + decision_obs_list, + decision_rewards, + decision_agent_id, + action_mask, + decision_group_id, + decision_group_rewards, + ), + TerminalSteps( + terminal_obs_list, + terminal_rewards, + max_step, + terminal_agent_id, + terminal_group_id, + terminal_group_rewards, + ), + ) + + +def _generate_split_indices(dims): + if len(dims) <= 1: + return () + result = (dims[0],) + for i in range(len(dims) - 2): + result += (dims[i + 1] + result[i],) + return result diff --git a/MLPY/Lib/site-packages/mlagents_envs/side_channel/__init__.py b/MLPY/Lib/site-packages/mlagents_envs/side_channel/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c9a1f5f0f71d0b0971214921d0514538cb3c3a89 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/side_channel/__init__.py @@ -0,0 +1,7 @@ +from mlagents_envs.side_channel.incoming_message import IncomingMessage # noqa +from mlagents_envs.side_channel.outgoing_message import OutgoingMessage # noqa + +from mlagents_envs.side_channel.side_channel import SideChannel # noqa +from mlagents_envs.side_channel.default_training_analytics_side_channel import ( # noqa + DefaultTrainingAnalyticsSideChannel, # noqa +) # noqa diff --git a/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4e59f51d70ab642acfa1f2c21a15aedca8a73c23 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/default_training_analytics_side_channel.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/default_training_analytics_side_channel.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..491514935e40567c595a985f553f2f2900be94d4 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/default_training_analytics_side_channel.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/engine_configuration_channel.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/engine_configuration_channel.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f7f1867eb9320b7c8cc163f361ce1e6a7d38d2f9 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/engine_configuration_channel.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/environment_parameters_channel.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/environment_parameters_channel.cpython-39.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..2f399e5528e9be32428b321484e46d89236379e8 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/environment_parameters_channel.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/float_properties_channel.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/float_properties_channel.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..775bd587b43e541b9ebc83c0c5cddee75ca5285d Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/float_properties_channel.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/incoming_message.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/incoming_message.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7b0fc669efb973d01d846586a51d5b353f021370 Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/incoming_message.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/outgoing_message.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/outgoing_message.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3ea39198def92a5fe76c947d90e0e36eb94ab95b Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/outgoing_message.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/raw_bytes_channel.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/raw_bytes_channel.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..723d1a503b29d6929465face5c1bca2f28e3b6ae Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/raw_bytes_channel.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/side_channel.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/side_channel.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ca45af506c507e50b44342f51714d00905858cdd Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/side_channel.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/side_channel_manager.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/side_channel_manager.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a79f31616c47668d7f4d47a99c026d447241455b Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/side_channel_manager.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/stats_side_channel.cpython-39.pyc b/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/stats_side_channel.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e2888cf935d27e9e44062d10407cf881f7bce39d Binary files /dev/null and b/MLPY/Lib/site-packages/mlagents_envs/side_channel/__pycache__/stats_side_channel.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/mlagents_envs/side_channel/default_training_analytics_side_channel.py 
b/MLPY/Lib/site-packages/mlagents_envs/side_channel/default_training_analytics_side_channel.py new file mode 100644 index 0000000000000000000000000000000000000000..a53e686709e3876bf3444bc1027c552d53dd076b --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/side_channel/default_training_analytics_side_channel.py @@ -0,0 +1,49 @@ +import sys +import uuid +import mlagents_envs + +from mlagents_envs.exception import UnityCommunicationException +from mlagents_envs.side_channel import SideChannel, IncomingMessage, OutgoingMessage +from mlagents_envs.communicator_objects.training_analytics_pb2 import ( + TrainingEnvironmentInitialized, +) +from google.protobuf.any_pb2 import Any + + +class DefaultTrainingAnalyticsSideChannel(SideChannel): + """ + Side channel that sends information about the training to the Unity environment so it can be logged. + """ + + CHANNEL_ID = uuid.UUID("b664a4a9-d86f-5a5f-95cb-e8353a7e8356") + + def __init__(self) -> None: + # >>> uuid.uuid5(uuid.NAMESPACE_URL, "com.unity.ml-agents/TrainingAnalyticsSideChannel") + # UUID('b664a4a9-d86f-5a5f-95cb-e8353a7e8356') + # We purposefully use the SAME side channel as the TrainingAnalyticsSideChannel + + super().__init__(DefaultTrainingAnalyticsSideChannel.CHANNEL_ID) + + def on_message_received(self, msg: IncomingMessage) -> None: + raise UnityCommunicationException( + "The DefaultTrainingAnalyticsSideChannel received a message from Unity, " + + "this should not have happened." + ) + + def environment_initialized(self) -> None: + # Tuple of (major, minor, patch) + vi = sys.version_info + + msg = TrainingEnvironmentInitialized( + python_version=f"{vi[0]}.{vi[1]}.{vi[2]}", + mlagents_version="Custom", + mlagents_envs_version=mlagents_envs.__version__, + torch_version="Unknown", + torch_device_type="Unknown", + ) + any_message = Any() + any_message.Pack(msg) + + env_init_msg = OutgoingMessage() + env_init_msg.set_raw_bytes(any_message.SerializeToString()) # type: ignore + super().queue_message_to_send(env_init_msg) diff --git a/MLPY/Lib/site-packages/mlagents_envs/side_channel/engine_configuration_channel.py b/MLPY/Lib/site-packages/mlagents_envs/side_channel/engine_configuration_channel.py new file mode 100644 index 0000000000000000000000000000000000000000..ce1715ba07349fc377fb846dc5e050855b140965 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/side_channel/engine_configuration_channel.py @@ -0,0 +1,127 @@ +from mlagents_envs.side_channel import SideChannel, OutgoingMessage, IncomingMessage +from mlagents_envs.exception import ( + UnityCommunicationException, + UnitySideChannelException, +) +import uuid +from typing import NamedTuple, Optional +from enum import IntEnum + + +class EngineConfig(NamedTuple): + width: Optional[int] + height: Optional[int] + quality_level: Optional[int] + time_scale: Optional[float] + target_frame_rate: Optional[int] + capture_frame_rate: Optional[int] + + @staticmethod + def default_config(): + return EngineConfig(80, 80, 1, 20.0, -1, 60) + + +class EngineConfigurationChannel(SideChannel): + """ + This is the SideChannel for engine configuration exchange. 
The data in the + engine configuration is as follows : + - int width; + - int height; + - int qualityLevel; + - float timeScale; + - int targetFrameRate; + - int captureFrameRate; + """ + + class ConfigurationType(IntEnum): + SCREEN_RESOLUTION = 0 + QUALITY_LEVEL = 1 + TIME_SCALE = 2 + TARGET_FRAME_RATE = 3 + CAPTURE_FRAME_RATE = 4 + + def __init__(self) -> None: + super().__init__(uuid.UUID("e951342c-4f7e-11ea-b238-784f4387d1f7")) + + def on_message_received(self, msg: IncomingMessage) -> None: + """ + Is called by the environment to the side channel. Can be called + multiple times per step if multiple messages are meant for that + SideChannel. + Note that Python should never receive an engine configuration from + Unity + """ + raise UnityCommunicationException( + "The EngineConfigurationChannel received a message from Unity, " + + "this should not have happened." + ) + + def set_configuration_parameters( + self, + width: Optional[int] = None, + height: Optional[int] = None, + quality_level: Optional[int] = None, + time_scale: Optional[float] = None, + target_frame_rate: Optional[int] = None, + capture_frame_rate: Optional[int] = None, + ) -> None: + """ + Sets the engine configuration. Takes as input the configurations of the + engine. + :param width: Defines the width of the display. (Must be set alongside height) + :param height: Defines the height of the display. (Must be set alongside width) + :param quality_level: Defines the quality level of the simulation. + :param time_scale: Defines the multiplier for the deltatime in the + simulation. If set to a higher value, time will pass faster in the + simulation but the physics might break. + :param target_frame_rate: Instructs simulation to try to render at a + specified frame rate. + :param capture_frame_rate: Instructs the simulation to consider time between + updates to always be constant, regardless of the actual frame rate. + """ + + if (width is None and height is not None) or ( + width is not None and height is None + ): + raise UnitySideChannelException( + "You cannot set the width/height of the screen resolution without also setting the height/width" + ) + + if width is not None and height is not None: + screen_msg = OutgoingMessage() + screen_msg.write_int32(self.ConfigurationType.SCREEN_RESOLUTION) + screen_msg.write_int32(width) + screen_msg.write_int32(height) + super().queue_message_to_send(screen_msg) + + if quality_level is not None: + quality_level_msg = OutgoingMessage() + quality_level_msg.write_int32(self.ConfigurationType.QUALITY_LEVEL) + quality_level_msg.write_int32(quality_level) + super().queue_message_to_send(quality_level_msg) + + if time_scale is not None: + time_scale_msg = OutgoingMessage() + time_scale_msg.write_int32(self.ConfigurationType.TIME_SCALE) + time_scale_msg.write_float32(time_scale) + super().queue_message_to_send(time_scale_msg) + + if target_frame_rate is not None: + target_frame_rate_msg = OutgoingMessage() + target_frame_rate_msg.write_int32(self.ConfigurationType.TARGET_FRAME_RATE) + target_frame_rate_msg.write_int32(target_frame_rate) + super().queue_message_to_send(target_frame_rate_msg) + + if capture_frame_rate is not None: + capture_frame_rate_msg = OutgoingMessage() + capture_frame_rate_msg.write_int32( + self.ConfigurationType.CAPTURE_FRAME_RATE + ) + capture_frame_rate_msg.write_int32(capture_frame_rate) + super().queue_message_to_send(capture_frame_rate_msg) + + def set_configuration(self, config: EngineConfig) -> None: + """ + Sets the engine configuration. 
Takes as input an EngineConfig. + """ + self.set_configuration_parameters(**config._asdict()) diff --git a/MLPY/Lib/site-packages/mlagents_envs/side_channel/environment_parameters_channel.py b/MLPY/Lib/site-packages/mlagents_envs/side_channel/environment_parameters_channel.py new file mode 100644 index 0000000000000000000000000000000000000000..ff516a5eb586355ee3699d4065e79c5e57340c1d --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/side_channel/environment_parameters_channel.py @@ -0,0 +1,100 @@ +from mlagents_envs.side_channel import SideChannel, IncomingMessage, OutgoingMessage +from mlagents_envs.exception import UnityCommunicationException +import uuid +from enum import IntEnum +from typing import List, Tuple + + +class EnvironmentParametersChannel(SideChannel): + """ + This is the SideChannel for sending environment parameters to Unity. + You can send parameters to an environment with the command + set_float_parameter. + """ + + class EnvironmentDataTypes(IntEnum): + FLOAT = 0 + SAMPLER = 1 + + class SamplerTypes(IntEnum): + UNIFORM = 0 + GAUSSIAN = 1 + MULTIRANGEUNIFORM = 2 + + def __init__(self) -> None: + channel_id = uuid.UUID("534c891e-810f-11ea-a9d0-822485860400") + super().__init__(channel_id) + + def on_message_received(self, msg: IncomingMessage) -> None: + raise UnityCommunicationException( + "The EnvironmentParametersChannel received a message from Unity, " + + "this should not have happened." + ) + + def set_float_parameter(self, key: str, value: float) -> None: + """ + Sets a float environment parameter in the Unity Environment. + :param key: The string identifier of the parameter. + :param value: The float value of the parameter. + """ + msg = OutgoingMessage() + msg.write_string(key) + msg.write_int32(self.EnvironmentDataTypes.FLOAT) + msg.write_float32(value) + super().queue_message_to_send(msg) + + def set_uniform_sampler_parameters( + self, key: str, min_value: float, max_value: float, seed: int + ) -> None: + """ + Sets a uniform environment parameter sampler. + :param key: The string identifier of the parameter. + :param min_value: The minimum of the sampling distribution. + :param max_value: The maximum of the sampling distribution. + :param seed: The random seed to initialize the sampler. + """ + msg = OutgoingMessage() + msg.write_string(key) + msg.write_int32(self.EnvironmentDataTypes.SAMPLER) + msg.write_int32(seed) + msg.write_int32(self.SamplerTypes.UNIFORM) + msg.write_float32(min_value) + msg.write_float32(max_value) + super().queue_message_to_send(msg) + + def set_gaussian_sampler_parameters( + self, key: str, mean: float, st_dev: float, seed: int + ) -> None: + """ + Sets a gaussian environment parameter sampler. + :param key: The string identifier of the parameter. + :param mean: The mean of the sampling distribution. + :param st_dev: The standard deviation of the sampling distribution. + :param seed: The random seed to initialize the sampler. + """ + msg = OutgoingMessage() + msg.write_string(key) + msg.write_int32(self.EnvironmentDataTypes.SAMPLER) + msg.write_int32(seed) + msg.write_int32(self.SamplerTypes.GAUSSIAN) + msg.write_float32(mean) + msg.write_float32(st_dev) + super().queue_message_to_send(msg) + + def set_multirangeuniform_sampler_parameters( + self, key: str, intervals: List[Tuple[float, float]], seed: int + ) -> None: + """ + Sets a multirangeuniform environment parameter sampler. + :param key: The string identifier of the parameter. + :param intervals: The lists of min and max that define each uniform distribution. 
+ :param seed: The random seed to initialize the sampler. + """ + msg = OutgoingMessage() + msg.write_string(key) + msg.write_int32(self.EnvironmentDataTypes.SAMPLER) + msg.write_int32(seed) + msg.write_int32(self.SamplerTypes.MULTIRANGEUNIFORM) + flattened_intervals = [value for interval in intervals for value in interval] + msg.write_float32_list(flattened_intervals) + super().queue_message_to_send(msg) diff --git a/MLPY/Lib/site-packages/mlagents_envs/side_channel/float_properties_channel.py b/MLPY/Lib/site-packages/mlagents_envs/side_channel/float_properties_channel.py new file mode 100644 index 0000000000000000000000000000000000000000..6c6701129f4f3103db238371552a9d6384ca3ab8 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/side_channel/float_properties_channel.py @@ -0,0 +1,62 @@ +from mlagents_envs.side_channel import SideChannel, IncomingMessage, OutgoingMessage +import uuid +from typing import Dict, Optional, List + + +class FloatPropertiesChannel(SideChannel): + """ + This is the SideChannel for float properties shared with Unity. + You can modify the float properties of an environment with the commands + set_property, get_property and list_properties. + """ + + def __init__(self, channel_id: uuid.UUID = None) -> None: + self._float_properties: Dict[str, float] = {} + if channel_id is None: + channel_id = uuid.UUID("60ccf7d0-4f7e-11ea-b238-784f4387d1f7") + super().__init__(channel_id) + + def on_message_received(self, msg: IncomingMessage) -> None: + """ + Is called by the environment to the side channel. Can be called + multiple times per step if multiple messages are meant for that + SideChannel. + """ + k = msg.read_string() + v = msg.read_float32() + self._float_properties[k] = v + + def set_property(self, key: str, value: float) -> None: + """ + Sets a property in the Unity Environment. + :param key: The string identifier of the property. + :param value: The float value of the property. + """ + self._float_properties[key] = value + msg = OutgoingMessage() + msg.write_string(key) + msg.write_float32(value) + super().queue_message_to_send(msg) + + def get_property(self, key: str) -> Optional[float]: + """ + Gets a property in the Unity Environment. If the property was not + found, will return None. + :param key: The string identifier of the property. + :return: The float value of the property or None. + """ + return self._float_properties.get(key) + + def list_properties(self) -> List[str]: + """ + Returns a list of all the string identifiers of the properties + currently present in the Unity Environment. + """ + return list(self._float_properties.keys()) + + def get_property_dict_copy(self) -> Dict[str, float]: + """ + Returns a copy of the float properties. + :return: + """ + return dict(self._float_properties) diff --git a/MLPY/Lib/site-packages/mlagents_envs/side_channel/incoming_message.py b/MLPY/Lib/site-packages/mlagents_envs/side_channel/incoming_message.py new file mode 100644 index 0000000000000000000000000000000000000000..6c00f252868f072f239462581278225c2336b2c8 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/side_channel/incoming_message.py @@ -0,0 +1,93 @@ +from typing import List +import struct + + +class IncomingMessage: + """ + Utility class for reading the message written to a SideChannel. + Values must be read in the order they were written. + """ + + def __init__(self, buffer: bytes, offset: int = 0): + """ + Create a new IncomingMessage from the bytes. 
+ """ + self.buffer = buffer + self.offset = offset + + def read_bool(self, default_value: bool = False) -> bool: + """ + Read a boolean value from the message buffer. + :param default_value: Default value to use if the end of the message is reached. + :return: The value read from the message, or the default value if the end was reached. + """ + if self._at_end_of_buffer(): + return default_value + + val = struct.unpack_from(" int: + """ + Read an integer value from the message buffer. + :param default_value: Default value to use if the end of the message is reached. + :return: The value read from the message, or the default value if the end was reached. + """ + if self._at_end_of_buffer(): + return default_value + + val = struct.unpack_from(" float: + """ + Read a float value from the message buffer. + :param default_value: Default value to use if the end of the message is reached. + :return: The value read from the message, or the default value if the end was reached. + """ + if self._at_end_of_buffer(): + return default_value + + val = struct.unpack_from(" List[float]: + """ + Read a list of float values from the message buffer. + :param default_value: Default value to use if the end of the message is reached. + :return: The value read from the message, or the default value if the end was reached. + """ + if self._at_end_of_buffer(): + return [] if default_value is None else default_value + + list_len = self.read_int32() + output = [] + for _ in range(list_len): + output.append(self.read_float32()) + return output + + def read_string(self, default_value: str = "") -> str: + """ + Read a string value from the message buffer. + :param default_value: Default value to use if the end of the message is reached. + :return: The value read from the message, or the default value if the end was reached. + """ + if self._at_end_of_buffer(): + return default_value + + encoded_str_len = self.read_int32() + val = self.buffer[self.offset : self.offset + encoded_str_len].decode("ascii") + self.offset += encoded_str_len + return val + + def get_raw_bytes(self) -> bytes: + """ + Get a copy of the internal bytes used by the message. + """ + return bytearray(self.buffer) + + def _at_end_of_buffer(self) -> bool: + return self.offset >= len(self.buffer) diff --git a/MLPY/Lib/site-packages/mlagents_envs/side_channel/outgoing_message.py b/MLPY/Lib/site-packages/mlagents_envs/side_channel/outgoing_message.py new file mode 100644 index 0000000000000000000000000000000000000000..83bbe3446cb207e8c9eeffb8146f243eae9e28a2 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/side_channel/outgoing_message.py @@ -0,0 +1,66 @@ +from typing import List +import struct + +from mlagents_envs.logging_util import get_logger + +logger = get_logger(__name__) + + +class OutgoingMessage: + """ + Utility class for forming the message that is written to a SideChannel. + All data is written in little-endian format using the struct module. + """ + + def __init__(self): + """ + Create an OutgoingMessage with an empty buffer. + """ + self.buffer = bytearray() + + def write_bool(self, b: bool) -> None: + """ + Append a boolean value. + """ + self.buffer += struct.pack(" None: + """ + Append an integer value. + """ + self.buffer += struct.pack(" None: + """ + Append a float value. It will be truncated to 32-bit precision. + """ + self.buffer += struct.pack(" None: + """ + Append a list of float values. They will be truncated to 32-bit precision. 
+ """ + self.write_int32(len(float_list)) + for f in float_list: + self.write_float32(f) + + def write_string(self, s: str) -> None: + """ + Append a string value. Internally, it will be encoded to ascii, and the + encoded length will also be written to the message. + """ + encoded_key = s.encode("ascii") + self.write_int32(len(encoded_key)) + self.buffer += encoded_key + + def set_raw_bytes(self, buffer: bytearray) -> None: + """ + Set the internal buffer to a new bytearray. This will overwrite any existing data. + :param buffer: + :return: + """ + if self.buffer: + logger.warning( + "Called set_raw_bytes but the message already has been written to. This will overwrite data." + ) + self.buffer = bytearray(buffer) diff --git a/MLPY/Lib/site-packages/mlagents_envs/side_channel/raw_bytes_channel.py b/MLPY/Lib/site-packages/mlagents_envs/side_channel/raw_bytes_channel.py new file mode 100644 index 0000000000000000000000000000000000000000..d9f748f4fb3bd3360300cfd8810860fffe2ce411 --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/side_channel/raw_bytes_channel.py @@ -0,0 +1,39 @@ +from mlagents_envs.side_channel import SideChannel, IncomingMessage, OutgoingMessage +from typing import List +import uuid + + +class RawBytesChannel(SideChannel): + """ + This is an example of what the SideChannel for raw bytes exchange would + look like. Is meant to be used for general research purpose. + """ + + def __init__(self, channel_id: uuid.UUID): + self._received_messages: List[bytes] = [] + super().__init__(channel_id) + + def on_message_received(self, msg: IncomingMessage) -> None: + """ + Is called by the environment to the side channel. Can be called + multiple times per step if multiple messages are meant for that + SideChannel. + """ + self._received_messages.append(msg.get_raw_bytes()) + + def get_and_clear_received_messages(self) -> List[bytes]: + """ + returns a list of bytearray received from the environment. + """ + result = list(self._received_messages) + self._received_messages = [] + return result + + def send_raw_data(self, data: bytearray) -> None: + """ + Queues a message to be sent by the environment at the next call to + step. + """ + msg = OutgoingMessage() + msg.set_raw_bytes(data) + super().queue_message_to_send(msg) diff --git a/MLPY/Lib/site-packages/mlagents_envs/side_channel/side_channel.py b/MLPY/Lib/site-packages/mlagents_envs/side_channel/side_channel.py new file mode 100644 index 0000000000000000000000000000000000000000..469cb51eabc684194f26b206e65c6fdafd08a29e --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/side_channel/side_channel.py @@ -0,0 +1,46 @@ +from abc import ABC, abstractmethod +from typing import List +import uuid + +from mlagents_envs.side_channel import IncomingMessage, OutgoingMessage +from mlagents_envs.logging_util import get_logger + +logger = get_logger(__name__) + + +class SideChannel(ABC): + """ + The side channel just get access to a bytes buffer that will be shared + between C# and Python. For example, We will create a specific side channel + for properties that will be a list of string (fixed size) to float number, + that can be modified by both C# and Python. All side channels are passed + to the Env object at construction. + """ + + def __init__(self, channel_id: uuid.UUID): + self._channel_id: uuid.UUID = channel_id + self.message_queue: List[bytearray] = [] + + def queue_message_to_send(self, msg: OutgoingMessage) -> None: + """ + Queues a message to be sent by the environment at the next call to + step. 
+ """ + self.message_queue.append(msg.buffer) + + @abstractmethod + def on_message_received(self, msg: IncomingMessage) -> None: + """ + Is called by the environment to the side channel. Can be called + multiple times per step if multiple messages are meant for that + SideChannel. + """ + pass + + @property + def channel_id(self) -> uuid.UUID: + """ + :return:The type of side channel used. Will influence how the data is + processed in the environment. + """ + return self._channel_id diff --git a/MLPY/Lib/site-packages/mlagents_envs/side_channel/side_channel_manager.py b/MLPY/Lib/site-packages/mlagents_envs/side_channel/side_channel_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..e5356c26f86a3eaad95d2a2d5a07e0bfab29d70c --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/side_channel/side_channel_manager.py @@ -0,0 +1,81 @@ +import uuid +import struct +from typing import Dict, Optional, List +from mlagents_envs.side_channel import SideChannel, IncomingMessage +from mlagents_envs.exception import UnityEnvironmentException +from mlagents_envs.logging_util import get_logger + + +class SideChannelManager: + def __init__(self, side_channels=Optional[List[SideChannel]]): + self._side_channels_dict = self._get_side_channels_dict(side_channels) + + def process_side_channel_message(self, data: bytes) -> None: + """ + Separates the data received from Python into individual messages for each + registered side channel and calls on_message_received on them. + :param data: The packed message sent by Unity + """ + offset = 0 + while offset < len(data): + try: + channel_id = uuid.UUID(bytes_le=bytes(data[offset : offset + 16])) + offset += 16 + (message_len,) = struct.unpack_from(" bytearray: + """ + Gathers the messages that the registered side channels will send to Unity + and combines them into a single message ready to be sent. + """ + result = bytearray() + for channel_id, channel in self._side_channels_dict.items(): + for message in channel.message_queue: + result += channel_id.bytes_le + result += struct.pack(" Dict[uuid.UUID, SideChannel]: + """ + Converts a list of side channels into a dictionary of channel_id to SideChannel + :param side_channels: The list of side channels. + """ + side_channels_dict: Dict[uuid.UUID, SideChannel] = {} + if side_channels is not None: + for _sc in side_channels: + if _sc.channel_id in side_channels_dict: + raise UnityEnvironmentException( + f"There cannot be two side channels with " + f"the same channel id {_sc.channel_id}." + ) + side_channels_dict[_sc.channel_id] = _sc + return side_channels_dict diff --git a/MLPY/Lib/site-packages/mlagents_envs/side_channel/stats_side_channel.py b/MLPY/Lib/site-packages/mlagents_envs/side_channel/stats_side_channel.py new file mode 100644 index 0000000000000000000000000000000000000000..9fbbfb23d9f650b87892b8cd6317417dcb82e5ea --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/side_channel/stats_side_channel.py @@ -0,0 +1,62 @@ +import uuid +from typing import Tuple, List, Mapping +from enum import Enum +from collections import defaultdict + +from mlagents_envs.side_channel import SideChannel, IncomingMessage + + +# Determines the behavior of how multiple stats within the same summary period are combined. +class StatsAggregationMethod(Enum): + # Values within the summary period are averaged before reporting. + AVERAGE = 0 + + # Only the most recent value is reported. + MOST_RECENT = 1 + + # Values within the summary period are summed up before reporting. 
+ SUM = 2 + + # All values within a summary period are reported as a histogram. + HISTOGRAM = 3 + + +StatList = List[Tuple[float, StatsAggregationMethod]] +EnvironmentStats = Mapping[str, StatList] + + +class StatsSideChannel(SideChannel): + """ + Side channel that receives (string, float) pairs from the environment, so that they can eventually + be passed to a StatsReporter. + """ + + def __init__(self) -> None: + # >>> uuid.uuid5(uuid.NAMESPACE_URL, "com.unity.ml-agents/StatsSideChannel") + # UUID('a1d8f7b7-cec8-50f9-b78b-d3e165a78520') + super().__init__(uuid.UUID("a1d8f7b7-cec8-50f9-b78b-d3e165a78520")) + + self.stats: EnvironmentStats = defaultdict(list) + + def on_message_received(self, msg: IncomingMessage) -> None: + """ + Receive the message from the environment, and save it for later retrieval. + + :param msg: + :return: + """ + key = msg.read_string() + val = msg.read_float32() + agg_type = StatsAggregationMethod(msg.read_int32()) + + self.stats[key].append((val, agg_type)) + + def get_and_reset_stats(self) -> EnvironmentStats: + """ + Returns the current stats, and resets the internal storage of the stats. + + :return: + """ + s = self.stats + self.stats = defaultdict(list) + return s diff --git a/MLPY/Lib/site-packages/mlagents_envs/timers.py b/MLPY/Lib/site-packages/mlagents_envs/timers.py new file mode 100644 index 0000000000000000000000000000000000000000..32b8602c9ccc1b17926d5eb45dd04fde50bf07bf --- /dev/null +++ b/MLPY/Lib/site-packages/mlagents_envs/timers.py @@ -0,0 +1,362 @@ +""" +Lightweight, hierarchical timers for profiling sections of code. + +Example: + +@timed +def foo(t): + time.sleep(t) + +def main(): + for i in range(3): + foo(i + 1) + with hierarchical_timer("context"): + foo(1) + + print(get_timer_tree()) + +This would produce a timer tree like + (root) + "foo" + "context" + "foo" + +The total time and counts are tracked for each block of code; in this example "foo" and "context.foo" are considered +distinct blocks, and are tracked separately. + +The decorator and contextmanager are equivalent; the context manager may be more useful if you want more control +over the timer name, or are splitting up multiple sections of a large function. +""" + +import math +import sys +import time +import threading + +from contextlib import contextmanager +from typing import Any, Callable, Dict, Generator, Optional, TypeVar + +TIMER_FORMAT_VERSION = "0.1.0" + + +class TimerNode: + """ + Represents the time spent in a block of code. + """ + + __slots__ = ["children", "total", "count", "is_parallel"] + + def __init__(self): + # Note that since dictionary keys are the node names, we don't explicitly store the name on the TimerNode. + self.children: Dict[str, TimerNode] = {} + self.total: float = 0.0 + self.count: int = 0 + self.is_parallel = False + + def get_child(self, name: str) -> "TimerNode": + """ + Get the child node corresponding to the name (and create if it doesn't already exist). + """ + child = self.children.get(name) + if child is None: + child = TimerNode() + self.children[name] = child + return child + + def add_time(self, elapsed: float) -> None: + """ + Accumulate the time spent in the node (and increment the count). + """ + self.total += elapsed + self.count += 1 + + def merge( + self, other: "TimerNode", root_name: str = None, is_parallel: bool = True + ) -> None: + """ + Add the other node to this node, then do the same recursively on its children. + :param other: The other node to merge + :param root_name: Optional name of the root node being merged. 
+ :param is_parallel: Whether or not the code block was executed in parallel. + :return: + """ + if root_name: + node = self.get_child(root_name) + else: + node = self + + node.total += other.total + node.count += other.count + node.is_parallel |= is_parallel + for other_child_name, other_child_node in other.children.items(): + child = node.get_child(other_child_name) + child.merge(other_child_node, is_parallel=is_parallel) + + +class GaugeNode: + """ + Tracks the most recent value of a metric. This is analogous to gauges in statsd. + """ + + __slots__ = ["value", "min_value", "max_value", "count", "_timestamp"] + + def __init__(self, value: float): + self.value = value + self.min_value = value + self.max_value = value + self.count = 1 + # Internal timestamp so we can determine priority. + self._timestamp = time.time() + + def update(self, new_value: float) -> None: + self.min_value = min(self.min_value, new_value) + self.max_value = max(self.max_value, new_value) + self.value = new_value + self.count += 1 + self._timestamp = time.time() + + def merge(self, other: "GaugeNode") -> None: + if self._timestamp < other._timestamp: + # Keep the "later" value + self.value = other.value + self._timestamp = other._timestamp + self.min_value = min(self.min_value, other.min_value) + self.max_value = max(self.max_value, other.max_value) + self.count += other.count + + def as_dict(self) -> Dict[str, float]: + return { + "value": self.value, + "min": self.min_value, + "max": self.max_value, + "count": self.count, + } + + +class TimerStack: + """ + Tracks all the time spent. Users shouldn't use this directly, they should use the contextmanager below to make + sure that pushes and pops are already matched. + """ + + __slots__ = ["root", "stack", "start_time", "gauges", "metadata"] + + def __init__(self): + self.root = TimerNode() + self.stack = [self.root] + self.start_time = time.perf_counter() + self.gauges: Dict[str, GaugeNode] = {} + self.metadata: Dict[str, str] = {} + self._add_default_metadata() + + def reset(self): + self.root = TimerNode() + self.stack = [self.root] + self.start_time = time.perf_counter() + self.gauges: Dict[str, GaugeNode] = {} + self.metadata: Dict[str, str] = {} + self._add_default_metadata() + + def push(self, name: str) -> TimerNode: + """ + Called when entering a new block of code that is timed (e.g. with a contextmanager). + """ + current_node: TimerNode = self.stack[-1] + next_node = current_node.get_child(name) + self.stack.append(next_node) + return next_node + + def pop(self) -> None: + """ + Called when exiting a new block of code that is timed (e.g. with a contextmanager). + """ + self.stack.pop() + + def get_root(self) -> TimerNode: + """ + Update the total time and count of the root name, and return it. + """ + root = self.root + root.total = time.perf_counter() - self.start_time + root.count = 1 + return root + + def get_timing_tree(self, node: TimerNode = None) -> Dict[str, Any]: + """ + Recursively build a tree of timings, suitable for output/archiving. 
+ """ + res: Dict[str, Any] = {} + if node is None: + # Special case the root - total is time since it was created, and count is 1 + node = self.get_root() + res["name"] = "root" + + # Only output gauges at top level + if self.gauges: + res["gauges"] = self._get_gauges() + + if self.metadata: + self.metadata["end_time_seconds"] = str(int(time.time())) + res["metadata"] = self.metadata + + res["total"] = node.total + res["count"] = node.count + + if node.is_parallel: + # Note when the block ran in parallel, so that it's less confusing that a timer is less that its children. + res["is_parallel"] = True + + child_total = 0.0 + child_dict = {} + for child_name, child_node in node.children.items(): + child_res: Dict[str, Any] = self.get_timing_tree(child_node) + child_dict[child_name] = child_res + child_total += child_res["total"] + + # "self" time is total time minus all time spent on children + res["self"] = max(0.0, node.total - child_total) + if child_dict: + res["children"] = child_dict + + return res + + def set_gauge(self, name: str, value: float) -> None: + if math.isnan(value): + return + gauge_node = self.gauges.get(name) + if gauge_node: + gauge_node.update(value) + else: + self.gauges[name] = GaugeNode(value) + + def add_metadata(self, key: str, value: str) -> None: + self.metadata[key] = value + + def _get_gauges(self) -> Dict[str, Dict[str, float]]: + gauges = {} + for gauge_name, gauge_node in self.gauges.items(): + gauges[gauge_name] = gauge_node.as_dict() + return gauges + + def _add_default_metadata(self): + self.metadata["timer_format_version"] = TIMER_FORMAT_VERSION + self.metadata["start_time_seconds"] = str(int(time.time())) + self.metadata["python_version"] = sys.version + self.metadata["command_line_arguments"] = " ".join(sys.argv) + + +# Maintain a separate "global" timer per thread, so that they don't accidentally conflict with each other. +_thread_timer_stacks: Dict[int, TimerStack] = {} + + +def _get_thread_timer() -> TimerStack: + ident = threading.get_ident() + if ident not in _thread_timer_stacks: + timer_stack = TimerStack() + _thread_timer_stacks[ident] = timer_stack + return _thread_timer_stacks[ident] + + +def get_timer_stack_for_thread(t: threading.Thread) -> Optional[TimerStack]: + if t.ident is None: + # Thread hasn't started, shouldn't ever happen + return None + return _thread_timer_stacks.get(t.ident) + + +@contextmanager +def hierarchical_timer(name: str, timer_stack: TimerStack = None) -> Generator: + """ + Creates a scoped timer around a block of code. This time spent will automatically be incremented when + the context manager exits. + """ + timer_stack = timer_stack or _get_thread_timer() + timer_node = timer_stack.push(name) + start_time = time.perf_counter() + + try: + # The wrapped code block will run here. + yield timer_node + finally: + # This will trigger either when the context manager exits, or an exception is raised. + # We'll accumulate the time, and the exception (if any) gets raised automatically. + elapsed = time.perf_counter() - start_time + timer_node.add_time(elapsed) + timer_stack.pop() + + +# This is used to ensure the signature of the decorated function is preserved +# See also https://github.com/python/mypy/issues/3157 +FuncT = TypeVar("FuncT", bound=Callable[..., Any]) + + +def timed(func: FuncT) -> FuncT: + """ + Decorator for timing a function or method. The name of the timer will be the qualified name of the function. 
+ Usage: + @timed + def my_func(x, y): + return x + y + Note that because this doesn't take arguments, the global timer stack is always used. + """ + + def wrapped(*args, **kwargs): + with hierarchical_timer(func.__qualname__): + return func(*args, **kwargs) + + return wrapped # type: ignore + + +def set_gauge(name: str, value: float, timer_stack: TimerStack = None) -> None: + """ + Updates the value of the gauge (or creates it if it hasn't been set before). + """ + timer_stack = timer_stack or _get_thread_timer() + timer_stack.set_gauge(name, value) + + +def merge_gauges(gauges: Dict[str, GaugeNode], timer_stack: TimerStack = None) -> None: + """ + Merge the gauges from another TimerStack with the provided one (or the + current thread's stack if none is provided). + :param gauges: + :param timer_stack: + :return: + """ + timer_stack = timer_stack or _get_thread_timer() + for n, g in gauges.items(): + if n in timer_stack.gauges: + timer_stack.gauges[n].merge(g) + else: + timer_stack.gauges[n] = g + + +def add_metadata(key: str, value: str, timer_stack: TimerStack = None) -> None: + timer_stack = timer_stack or _get_thread_timer() + timer_stack.add_metadata(key, value) + + +def get_timer_tree(timer_stack: TimerStack = None) -> Dict[str, Any]: + """ + Return the tree of timings from the TimerStack as a dictionary (or the + current thread's stack if none is provided) + """ + timer_stack = timer_stack or _get_thread_timer() + return timer_stack.get_timing_tree() + + +def get_timer_root(timer_stack: TimerStack = None) -> TimerNode: + """ + Get the root TimerNode of the timer_stack (or the current thread's + TimerStack if not specified) + """ + timer_stack = timer_stack or _get_thread_timer() + return timer_stack.get_root() + + +def reset_timers(timer_stack: TimerStack = None) -> None: + """ + Reset the timer_stack (or the current thread's TimerStack if not specified) + """ + timer_stack = timer_stack or _get_thread_timer() + timer_stack.reset()
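
Usage sketch (not part of the diff above): the short example below exercises the profiling API added in mlagents_envs/timers.py — the timed decorator, the hierarchical_timer context manager, set_gauge, and get_timer_tree — following the pattern described in that module's own docstring. The function simulate_step and its sleep durations are made-up stand-ins for real work.

import json
import time

from mlagents_envs.timers import (
    hierarchical_timer,
    timed,
    set_gauge,
    get_timer_tree,
    reset_timers,
)


@timed
def simulate_step(duration: float) -> None:
    # Stand-in for real work; the timer records the wall-clock time spent here.
    time.sleep(duration)


def run() -> None:
    reset_timers()
    for i in range(3):
        simulate_step(0.01 * (i + 1))
    with hierarchical_timer("evaluation"):
        # "simulate_step" is tracked separately here, nested under "evaluation".
        simulate_step(0.02)
        set_gauge("evaluation.reward", 1.5)
    # Dump the accumulated timer tree (totals, counts, gauges) for inspection.
    print(json.dumps(get_timer_tree(), indent=2))


if __name__ == "__main__":
    run()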
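
Usage sketch (also not part of the diff): a minimal illustration of wiring the side channels added earlier in this diff (EngineConfigurationChannel and EnvironmentParametersChannel) into an environment. It assumes the standard mlagents_envs.environment.UnityEnvironment entry point, which is defined outside this section, and the build path "path/to/UnityBuild" is a placeholder.

from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.side_channel.engine_configuration_channel import (
    EngineConfigurationChannel,
)
from mlagents_envs.side_channel.environment_parameters_channel import (
    EnvironmentParametersChannel,
)

engine_channel = EngineConfigurationChannel()
param_channel = EnvironmentParametersChannel()

# Side channels must be passed to the environment at construction time.
# "path/to/UnityBuild" is a placeholder for a real Unity build.
env = UnityEnvironment(
    file_name="path/to/UnityBuild",
    side_channels=[engine_channel, param_channel],
)

# Speed up the simulation and lower rendering quality, e.g. for headless training.
engine_channel.set_configuration_parameters(time_scale=20.0, quality_level=0)

# Send a float parameter and a uniform sampler definition to the environment;
# the parameter names here are illustrative and must match what the Unity scene reads.
param_channel.set_float_parameter("gravity", -9.81)
param_channel.set_uniform_sampler_parameters("obstacle_height", 0.5, 2.0, seed=42)

env.reset()
env.close()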