# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Sample actor (policy) and critic (Q) networks to use with DDPG/NAF agents.

The DDPG networks are defined in "Section 7: Experiment Details" of
"Continuous control with deep reinforcement learning" - Lillicrap et al.
https://arxiv.org/abs/1509.02971

The NAF critic network is based on "Section 4" of "Continuous deep Q-learning
with model-based acceleration" - Gu et al. https://arxiv.org/pdf/1603.00748.
"""

import tensorflow as tf
slim = tf.contrib.slim

import gin.tf


@gin.configurable('ddpg_critic_net')
def critic_net(states, actions, for_critic_loss=False,
               num_reward_dims=1,
               states_hidden_layers=(400,),
               actions_hidden_layers=None,
               joint_hidden_layers=(300,),
               weight_decay=0.0001,
               normalizer_fn=None,
               activation_fn=tf.nn.relu,
               zero_obs=False,
               images=False):
  """Creates a critic that returns q values for the given states and actions.

  Args:
    states: (castable to tf.float32) a [batch_size, num_state_dims] tensor
      representing a batch of states.
    actions: (castable to tf.float32) a [batch_size, num_action_dims] tensor
      representing a batch of actions.
    for_critic_loss: Whether the returned values feed the critic loss. If
      False and num_reward_dims > 1, the vector q values are collapsed to a
      scalar per batch element.
    num_reward_dims: Number of reward dimensions.
    states_hidden_layers: tuple of hidden layers units for states.
    actions_hidden_layers: tuple of hidden layers units for actions.
    joint_hidden_layers: tuple of hidden layers units after joining states
      and actions using tf.concat().
    weight_decay: Weight decay for l2 weights regularizer.
    normalizer_fn: Normalizer function, e.g. slim.layer_norm.
    activation_fn: Activation function, e.g. tf.nn.relu, slim.leaky_relu, ...
    zero_obs: Whether to zero out the first two state dimensions (the x, y
      position) before feeding the states to the network.
    images: Treated the same as zero_obs.
  Returns:
    A tf.float32 [batch_size] tensor of q values, or a tf.float32
      [batch_size, num_reward_dims] tensor of vector q values if
      num_reward_dims > 1.
  """
  with slim.arg_scope(
      [slim.fully_connected],
      activation_fn=activation_fn,
      normalizer_fn=normalizer_fn,
      weights_regularizer=slim.l2_regularizer(weight_decay),
      weights_initializer=slim.variance_scaling_initializer(
          factor=1.0/3.0, mode='FAN_IN', uniform=True)):

    orig_states = tf.to_float(states)
    # Concatenate the actions into the state input, as in the TD3 critic.
    states = tf.concat([tf.to_float(states), tf.to_float(actions)], -1)
    if images or zero_obs:
      # Zero out the first two state dimensions (x, y position). Hacky.
      states *= tf.constant([0.0] * 2 + [1.0] * (states.shape[1] - 2))
    actions = tf.to_float(actions)
    if states_hidden_layers:
      states = slim.stack(states, slim.fully_connected, states_hidden_layers,
                          scope='states')
    if actions_hidden_layers:
      actions = slim.stack(actions, slim.fully_connected,
                           actions_hidden_layers, scope='actions')
    # Note that the raw actions appear twice: once concatenated into the
    # states input above, and once more here at the joint layer.
    joint = tf.concat([states, actions], 1)
    if joint_hidden_layers:
      joint = slim.stack(joint, slim.fully_connected, joint_hidden_layers,
                         scope='joint')
    with slim.arg_scope([slim.fully_connected],
                        weights_regularizer=None,
                        weights_initializer=tf.random_uniform_initializer(
                            minval=-0.003, maxval=0.003)):
      value = slim.fully_connected(joint, num_reward_dims,
                                   activation_fn=None,
                                   normalizer_fn=None,
                                   scope='q_value')
    if num_reward_dims == 1:
      value = tf.reshape(value, [-1])
    if not for_critic_loss and num_reward_dims > 1:
      # Collapse vector q values to a scalar, weighting each reward dimension
      # by the absolute value of the corresponding trailing state dimension.
      value = tf.reduce_sum(
          value * tf.abs(orig_states[:, -num_reward_dims:]), -1)
  return value
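

# Example (sketch, not part of the original module): how a DDPG-style agent
# might build the critic graph with the defaults above. The state/action
# sizes and the placeholder-based setup are illustrative assumptions; the
# agents in this codebase pass in their own batched tensors and scopes.
def _example_critic_usage():
  states = tf.placeholder(tf.float32, [None, 8])   # Batch of 8-dim states.
  actions = tf.placeholder(tf.float32, [None, 2])  # Batch of 2-dim actions.
  with tf.variable_scope('critic_net'):
    q_values = critic_net(states, actions)  # tf.float32 [batch_size] tensor.
  return q_values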
""" with slim.arg_scope( [slim.fully_connected], activation_fn=activation_fn, normalizer_fn=normalizer_fn, weights_regularizer=slim.l2_regularizer(weight_decay), weights_initializer=slim.variance_scaling_initializer( factor=1.0/3.0, mode='FAN_IN', uniform=True)): orig_states = tf.to_float(states) #states = tf.to_float(states) states = tf.concat([tf.to_float(states), tf.to_float(actions)], -1) #TD3 if images or zero_obs: states *= tf.constant([0.0] * 2 + [1.0] * (states.shape[1] - 2)) #LALA actions = tf.to_float(actions) if states_hidden_layers: states = slim.stack(states, slim.fully_connected, states_hidden_layers, scope='states') if actions_hidden_layers: actions = slim.stack(actions, slim.fully_connected, actions_hidden_layers, scope='actions') joint = tf.concat([states, actions], 1) if joint_hidden_layers: joint = slim.stack(joint, slim.fully_connected, joint_hidden_layers, scope='joint') with slim.arg_scope([slim.fully_connected], weights_regularizer=None, weights_initializer=tf.random_uniform_initializer( minval=-0.003, maxval=0.003)): value = slim.fully_connected(joint, num_reward_dims, activation_fn=None, normalizer_fn=None, scope='q_value') if num_reward_dims == 1: value = tf.reshape(value, [-1]) if not for_critic_loss and num_reward_dims > 1: value = tf.reduce_sum( value * tf.abs(orig_states[:, -num_reward_dims:]), -1) return value @gin.configurable('ddpg_actor_net') def actor_net(states, action_spec, hidden_layers=(400, 300), normalizer_fn=None, activation_fn=tf.nn.relu, zero_obs=False, images=False): """Creates an actor that returns actions for the given states. Args: states: (castable to tf.float32) a [batch_size, num_state_dims] tensor representing a batch of states. action_spec: (BoundedTensorSpec) A tensor spec indicating the shape and range of actions. hidden_layers: tuple of hidden layers units. normalizer_fn: Normalizer function, i.e. slim.layer_norm, activation_fn: Activation function, i.e. tf.nn.relu, slim.leaky_relu, ... Returns: A tf.float32 [batch_size, num_action_dims] tensor of actions. """ with slim.arg_scope( [slim.fully_connected], activation_fn=activation_fn, normalizer_fn=normalizer_fn, weights_initializer=slim.variance_scaling_initializer( factor=1.0/3.0, mode='FAN_IN', uniform=True)): states = tf.to_float(states) orig_states = states if images or zero_obs: # Zero-out x, y position. Hacky. states *= tf.constant([0.0] * 2 + [1.0] * (states.shape[1] - 2)) if hidden_layers: states = slim.stack(states, slim.fully_connected, hidden_layers, scope='states') with slim.arg_scope([slim.fully_connected], weights_initializer=tf.random_uniform_initializer( minval=-0.003, maxval=0.003)): actions = slim.fully_connected(states, action_spec.shape.num_elements(), scope='actions', normalizer_fn=None, activation_fn=tf.nn.tanh) action_means = (action_spec.maximum + action_spec.minimum) / 2.0 action_magnitudes = (action_spec.maximum - action_spec.minimum) / 2.0 actions = action_means + action_magnitudes * actions return actions