from typing import Any, Dict, List
import numpy as np
from mlagents.torch_utils import torch, default_device
import copy
from mlagents.trainers.action_info import ActionInfo
from mlagents.trainers.behavior_id_utils import get_global_agent_id
from mlagents.trainers.policy import Policy
from mlagents_envs.base_env import DecisionSteps, BehaviorSpec
from mlagents_envs.timers import timed
from mlagents.trainers.settings import NetworkSettings
from mlagents.trainers.torch_entities.networks import GlobalSteps
from mlagents.trainers.torch_entities.utils import ModelUtils
EPSILON = 1e-7 # Small value to avoid divide by zero
class TorchPolicy(Policy):
def __init__(
self,
seed: int,
behavior_spec: BehaviorSpec,
network_settings: NetworkSettings,
actor_cls: type,
actor_kwargs: Dict[str, Any],
):
"""
Policy that uses a multilayer perceptron to map the observations to actions. Could
also use a CNN to encode visual input prior to the MLP. Supports discrete and
continuous actions, as well as recurrent networks.
:param seed: Random seed.
:param behavior_spec: Assigned BehaviorSpec object.
:param network_settings: Defined network parameters.
        :param actor_cls: The type of Actor to instantiate.
        :param actor_kwargs: Keyword arguments for the Actor class.
"""
super().__init__(seed, behavior_spec, network_settings)
self.global_step = (
GlobalSteps()
        )  # could be much simpler if TorchPolicy were an nn.Module
self.stats_name_to_update_name = {
"Losses/Value Loss": "value_loss",
"Losses/Policy Loss": "policy_loss",
}
self.actor = actor_cls(
observation_specs=self.behavior_spec.observation_specs,
network_settings=network_settings,
action_spec=behavior_spec.action_spec,
**actor_kwargs,
)
# Save the m_size needed for export
self._export_m_size = self.m_size
# m_size needed for training is determined by network, not trainer settings
self.m_size = self.actor.memory_size
self.actor.to(default_device())
@property
def export_memory_size(self) -> int:
"""
Returns the memory size of the exported ONNX policy. This only includes the memory
        of the Actor and not any auxiliary networks.
"""
return self._export_m_size
def _extract_masks(self, decision_requests: DecisionSteps) -> np.ndarray:
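        """
        Builds the action mask tensor for the discrete branches, if any.
        A value of 1 means the action is allowed, 0 means it is masked out;
        e.g. with discrete_branches=(3, 2) the mask has shape (n_agents, 5).
        Returns None when the behavior has no discrete actions.
        """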
mask = None
if self.behavior_spec.action_spec.discrete_size > 0:
num_discrete_flat = np.sum(self.behavior_spec.action_spec.discrete_branches)
mask = torch.ones([len(decision_requests), num_discrete_flat])
if decision_requests.action_mask is not None:
mask = torch.as_tensor(
1 - np.concatenate(decision_requests.action_mask, axis=1)
)
return mask
@timed
def evaluate(
self, decision_requests: DecisionSteps, global_agent_ids: List[str]
) -> Dict[str, Any]:
"""
Evaluates policy for the agent experiences provided.
        :param decision_requests: DecisionSteps object containing inputs.
        :param global_agent_ids: The worker-qualified ids of the agents in decision_requests.
        :return: The run_out dictionary of network outputs (action, log_probs,
        entropy, and memory_out when the policy is recurrent).
"""
obs = decision_requests.obs
masks = self._extract_masks(decision_requests)
tensor_obs = [torch.as_tensor(np_ob) for np_ob in obs]
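        # Stack the stored per-agent memories and add a leading dimension so the
        # shape matches what the recurrent actor expects.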
memories = torch.as_tensor(self.retrieve_memories(global_agent_ids)).unsqueeze(
0
)
with torch.no_grad():
action, run_out, memories = self.actor.get_action_and_stats(
tensor_obs, masks=masks, memories=memories
)
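        # Convert the torch outputs into the numpy containers (ActionTuple,
        # LogProbsTuple, arrays) consumed outside of the torch code.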
run_out["action"] = action.to_action_tuple()
if "log_probs" in run_out:
run_out["log_probs"] = run_out["log_probs"].to_log_probs_tuple()
if "entropy" in run_out:
run_out["entropy"] = ModelUtils.to_numpy(run_out["entropy"])
if self.use_recurrent:
run_out["memory_out"] = ModelUtils.to_numpy(memories).squeeze(0)
return run_out
def get_action(
self, decision_requests: DecisionSteps, worker_id: int = 0
) -> ActionInfo:
"""
        Decides actions for the agents in the given decision requests.
        :param decision_requests: DecisionSteps object from the environment.
        :param worker_id: The id of the environment worker that produced the requests.
        :return: An ActionInfo containing the action, memories, and the outputs
        to be passed to add_experiences.
"""
if len(decision_requests) == 0:
return ActionInfo.empty()
global_agent_ids = [
get_global_agent_id(worker_id, int(agent_id))
for agent_id in decision_requests.agent_id
] # For 1-D array, the iterator order is correct.
run_out = self.evaluate(decision_requests, global_agent_ids)
self.save_memories(global_agent_ids, run_out.get("memory_out"))
self.check_nan_action(run_out.get("action"))
return ActionInfo(
action=run_out.get("action"),
env_action=run_out.get("env_action"),
outputs=run_out,
agent_ids=list(decision_requests.agent_id),
)
def get_current_step(self):
"""
Gets current model step.
:return: current model step.
"""
return self.global_step.current_step
def set_step(self, step: int) -> int:
"""
Sets current model step to step without creating additional ops.
:param step: Step to set the current model step to.
:return: The step the model was set to.
"""
self.global_step.current_step = step
return step
def increment_step(self, n_steps):
"""
Increments model step.
"""
self.global_step.increment(n_steps)
return self.get_current_step()
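
    # Weight get/set helpers: get_weights() returns a deep copy of the actor's
    # state_dict and load_weights() restores it (used e.g. for ghost/self-play swaps).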
def load_weights(self, values: List[np.ndarray]) -> None:
self.actor.load_state_dict(values)
def init_load_weights(self) -> None:
pass
def get_weights(self) -> List[np.ndarray]:
return copy.deepcopy(self.actor.state_dict())
def get_modules(self):
return {"Policy": self.actor, "global_step": self.global_step}