"""Model is responsible for setting up Tensorflow graph. |
|
|
|
Creates policy and value networks. Also sets up all optimization |
|
ops, including gradient ops, trust region ops, and value optimizers. |
|
|
|
""" |
|
|
|
from __future__ import absolute_import |
|
from __future__ import division |
|
from __future__ import print_function |
|
|
|
import tensorflow as tf |
|
|
|
|
|
class Model(object):
  """Holds policy/value networks, their target copies, and training ops."""

  def __init__(self, env_spec, global_step,
               target_network_lag=0.95,
               sample_from='online',
               get_policy=None,
               get_baseline=None,
               get_objective=None,
               get_trust_region_p_opt=None,
               get_value_opt=None):
    self.env_spec = env_spec

    self.global_step = global_step
    self.inc_global_step = self.global_step.assign_add(1)

    self.target_network_lag = target_network_lag
    self.sample_from = sample_from

    # The get_* arguments are factory callables that build the policy,
    # baseline (value function), objective, and the optional trust region
    # and value optimizers.
    self.policy = get_policy()
    self.baseline = get_baseline()
    self.objective = get_objective()
    self.baseline.eps_lambda = self.objective.eps_lambda
    self.trust_region_policy_opt = get_trust_region_p_opt()
    self.value_opt = get_value_opt()

  def setup_placeholders(self):
    """Create the Tensorflow placeholders."""

    self.avg_episode_reward = tf.placeholder(
        tf.float32, [], 'avg_episode_reward')
    self.greedy_episode_reward = tf.placeholder(
        tf.float32, [], 'greedy_episode_reward')

    self.internal_state = tf.placeholder(tf.float32,
                                         [None, self.policy.rnn_state_dim],
                                         'internal_state')

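    # Placeholders for a single time step (used by sample_step); the leading
    # dimension is the batch of environments being stepped.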
    self.single_observation = []
    for i, (obs_dim, obs_type) in enumerate(self.env_spec.obs_dims_and_types):
      if self.env_spec.is_discrete(obs_type):
        self.single_observation.append(
            tf.placeholder(tf.int32, [None], 'obs%d' % i))
      elif self.env_spec.is_box(obs_type):
        self.single_observation.append(
            tf.placeholder(tf.float32, [None, obs_dim], 'obs%d' % i))
      else:
        assert False

    self.single_action = []
    for i, (action_dim, action_type) in \
        enumerate(self.env_spec.act_dims_and_types):
      if self.env_spec.is_discrete(action_type):
        self.single_action.append(
            tf.placeholder(tf.int32, [None], 'act%d' % i))
      elif self.env_spec.is_box(action_type):
        self.single_action.append(
            tf.placeholder(tf.float32, [None, action_dim], 'act%d' % i))
      else:
        assert False

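    # Placeholders for full rollouts; the first dimension is time and the
    # second is batch (train_step checks the time dimension for consistency).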
    self.observations = []
    for i, (obs_dim, obs_type) in enumerate(self.env_spec.obs_dims_and_types):
      if self.env_spec.is_discrete(obs_type):
        self.observations.append(
            tf.placeholder(tf.int32, [None, None], 'all_obs%d' % i))
      else:
        self.observations.append(
            tf.placeholder(tf.float32, [None, None, obs_dim], 'all_obs%d' % i))

    self.actions = []
    self.other_logits = []
    for i, (action_dim, action_type) in \
        enumerate(self.env_spec.act_dims_and_types):
      if self.env_spec.is_discrete(action_type):
        self.actions.append(
            tf.placeholder(tf.int32, [None, None], 'all_act%d' % i))
      if self.env_spec.is_box(action_type):
        self.actions.append(
            tf.placeholder(tf.float32, [None, None, action_dim],
                           'all_act%d' % i))
      self.other_logits.append(
          tf.placeholder(tf.float32, [None, None, None],
                         'other_logits%d' % i))

    self.rewards = tf.placeholder(tf.float32, [None, None], 'rewards')
    self.terminated = tf.placeholder(tf.float32, [None], 'terminated')
    # pads marks padded time steps (1 where padded, 0 where valid); losses
    # and averages below are weighted by (1 - pads).
    self.pads = tf.placeholder(tf.float32, [None, None], 'pads')

    self.prev_log_probs = tf.placeholder(tf.float32, [None, None],
                                         'prev_log_probs')

  def setup(self, train=True):
    """Set up the TensorFlow graph."""

    self.setup_placeholders()

    tf.summary.scalar('avg_episode_reward', self.avg_episode_reward)
    tf.summary.scalar('greedy_episode_reward', self.greedy_episode_reward)

    with tf.variable_scope('model', reuse=None):

      # Online policy network: run over the full rollout to get per-step
      # logits, log-probs, entropies, and self-KLs.
      with tf.variable_scope('policy_net'):
        (self.policy_internal_states, self.logits, self.log_probs,
         self.entropies, self.self_kls) = \
            self.policy.multi_step(self.observations,
                                   self.internal_state,
                                   self.actions)
        self.out_log_probs = sum(self.log_probs)
        self.kl = self.policy.calculate_kl(self.other_logits, self.logits)
        self.avg_kl = (tf.reduce_sum(sum(self.kl)[:-1] * (1 - self.pads)) /
                       tf.reduce_sum(1 - self.pads))

      # Online value network (baseline).
      with tf.variable_scope('value_net'):
        (self.values,
         self.regression_input,
         self.regression_weight) = self.baseline.get_values(
             self.observations, self.actions,
             self.policy_internal_states, self.logits)

      # Target policy and value networks mirror the online networks.
      with tf.variable_scope('target_policy_net'):
        (self.target_policy_internal_states,
         self.target_logits, self.target_log_probs,
         _, _) = \
            self.policy.multi_step(self.observations,
                                   self.internal_state,
                                   self.actions)

      with tf.variable_scope('target_value_net'):
        (self.target_values, _, _) = self.baseline.get_values(
            self.observations, self.actions,
            self.target_policy_internal_states, self.target_logits)

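      # Target networks track the online networks with an exponential moving
      # average of the parameters: target <- lag * target + (1 - lag) * online.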
      all_vars = tf.trainable_variables()
      online_vars = [p for p in all_vars if
                     '/policy_net' in p.name or '/value_net' in p.name]
      target_vars = [p for p in all_vars if
                     'target_policy_net' in p.name or
                     'target_value_net' in p.name]
      online_vars.sort(key=lambda p: p.name)
      target_vars.sort(key=lambda p: p.name)
      lag = self.target_network_lag
      self.copy_op = tf.group(*[
          target_p.assign(lag * target_p + (1 - lag) * online_p)
          for online_p, target_p in zip(online_vars, target_vars)])

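      # Training ops: the objective consumes per-step baseline values
      # (values[:-1]) and a bootstrap value for the final state (values[-1],
      # zeroed when the episode has terminated), along with log-probs under
      # the current, previous, and target policies.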
      if train:
        (self.loss, self.raw_loss, self.regression_target,
         self.gradient_ops, self.summary) = self.objective.get(
             self.rewards, self.pads,
             self.values[:-1, :],
             self.values[-1, :] * (1 - self.terminated),
             self.log_probs, self.prev_log_probs, self.target_log_probs,
             self.entropies, self.logits, self.target_values[:-1, :],
             self.target_values[-1, :] * (1 - self.terminated))

        self.regression_target = tf.reshape(self.regression_target, [-1])

        self.policy_vars = [
            v for v in tf.trainable_variables()
            if '/policy_net' in v.name]
        self.value_vars = [
            v for v in tf.trainable_variables()
            if '/value_net' in v.name]

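        # If a trust-region optimizer was provided, hand it the policy
        # variables, the raw loss, the policy's average self-KL, and the
        # average KL to the reference logits; it constrains each policy
        # update rather than taking a plain gradient step.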
        if self.trust_region_policy_opt is not None:
          with tf.variable_scope('trust_region_policy', reuse=None):
            avg_self_kl = (
                tf.reduce_sum(sum(self.self_kls) * (1 - self.pads)) /
                tf.reduce_sum(1 - self.pads))

            self.trust_region_policy_opt.setup(
                self.policy_vars, self.raw_loss, avg_self_kl,
                self.avg_kl)

        if self.value_opt is not None:
          with tf.variable_scope('trust_region_value', reuse=None):
            self.value_opt.setup(
                self.value_vars,
                tf.reshape(self.values[:-1, :], [-1]),
                self.regression_target,
                tf.reshape(self.pads, [-1]),
                self.regression_input, self.regression_weight)

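    # Reuse the variables above to build single-step sampling ops; actions are
    # drawn from the target policy when sample_from == 'target', otherwise
    # from the online policy.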
    with tf.variable_scope('model', reuse=True):
      scope = ('target_policy_net' if self.sample_from == 'target'
               else 'policy_net')
      with tf.variable_scope(scope):
        self.next_internal_state, self.sampled_actions = \
            self.policy.sample_step(self.single_observation,
                                    self.internal_state,
                                    self.single_action)
        self.greedy_next_internal_state, self.greedy_sampled_actions = \
            self.policy.sample_step(self.single_observation,
                                    self.internal_state,
                                    self.single_action,
                                    greedy=True)

  def sample_step(self, sess,
                  single_observation, internal_state, single_action,
                  greedy=False):
    """Sample batch of steps from policy."""
    if greedy:
      outputs = [self.greedy_next_internal_state, self.greedy_sampled_actions]
    else:
      outputs = [self.next_internal_state, self.sampled_actions]

    feed_dict = {self.internal_state: internal_state}
    for action_place, action in zip(self.single_action, single_action):
      feed_dict[action_place] = action
    for obs_place, obs in zip(self.single_observation, single_observation):
      feed_dict[obs_place] = obs

    return sess.run(outputs, feed_dict=feed_dict)

  def train_step(self, sess,
                 observations, internal_state, actions,
                 rewards, terminated, pads,
                 avg_episode_reward=0, greedy_episode_reward=0):
    """Train network using standard gradient descent."""
    outputs = [self.raw_loss, self.gradient_ops, self.summary]
    feed_dict = {self.internal_state: internal_state,
                 self.rewards: rewards,
                 self.terminated: terminated,
                 self.pads: pads,
                 self.avg_episode_reward: avg_episode_reward,
                 self.greedy_episode_reward: greedy_episode_reward}
    time_len = None
    for action_place, action in zip(self.actions, actions):
      if time_len is None:
        time_len = len(action)
      assert time_len == len(action)
      feed_dict[action_place] = action
    for obs_place, obs in zip(self.observations, observations):
      assert time_len == len(obs)
      feed_dict[obs_place] = obs

    # Observations and actions carry one more time step than rewards; the
    # final step supplies the bootstrap value.
    assert len(rewards) == time_len - 1

    return sess.run(outputs, feed_dict=feed_dict)

  def trust_region_step(self, sess,
                        observations, internal_state, actions,
                        rewards, terminated, pads,
                        avg_episode_reward=0,
                        greedy_episode_reward=0):
    """Train policy using trust region step."""
    feed_dict = {self.internal_state: internal_state,
                 self.rewards: rewards,
                 self.terminated: terminated,
                 self.pads: pads,
                 self.avg_episode_reward: avg_episode_reward,
                 self.greedy_episode_reward: greedy_episode_reward}
    for action_place, action in zip(self.actions, actions):
      feed_dict[action_place] = action
    for obs_place, obs in zip(self.observations, observations):
      feed_dict[obs_place] = obs

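    # First pass: record the current policy's log-probs and logits; they serve
    # as the fixed reference distribution for the KL-constrained update below.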
    (prev_log_probs, prev_logits) = sess.run(
        [self.out_log_probs, self.logits], feed_dict=feed_dict)
    feed_dict[self.prev_log_probs] = prev_log_probs
    for other_logit, prev_logit in zip(self.other_logits, prev_logits):
      feed_dict[other_logit] = prev_logit

    self.trust_region_policy_opt.optimize(sess, feed_dict)

    # Match the (loss, gradient_op_result, summary) layout returned by
    # train_step; there is no separate gradient op here.
    ret = sess.run([self.raw_loss, self.summary], feed_dict=feed_dict)
    ret = [ret[0], None, ret[1]]
    return ret

  def fit_values(self, sess,
                 observations, internal_state, actions,
                 rewards, terminated, pads):
    """Train value network using value-specific optimizer."""
    if self.value_opt is None:
      raise ValueError('No value optimizer was provided.')

    feed_dict = {self.internal_state: internal_state,
                 self.rewards: rewards,
                 self.terminated: terminated,
                 self.pads: pads}
    for action_place, action in zip(self.actions, actions):
      feed_dict[action_place] = action
    for obs_place, obs in zip(self.observations, observations):
      feed_dict[obs_place] = obs

    self.value_opt.optimize(sess, feed_dict)
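

# Example usage (a minimal sketch, not part of this module; env_spec and the
# make_* factory callables below are placeholders for whatever the surrounding
# training code provides):
#
#   global_step = tf.train.get_or_create_global_step()
#   model = Model(env_spec, global_step,
#                 target_network_lag=0.99,
#                 sample_from='online',
#                 get_policy=make_policy,
#                 get_baseline=make_baseline,
#                 get_objective=make_objective,
#                 get_trust_region_p_opt=make_trust_region_opt,
#                 get_value_opt=make_value_opt)
#   model.setup(train=True)
#   with tf.Session() as sess:
#     sess.run(tf.global_variables_initializer())
#     sess.run(model.copy_op)  # softly update target networks toward online
#     # ... collect a rollout, then either:
#     loss, _, summary = model.train_step(
#         sess, observations, internal_state, actions,
#         rewards, terminated, pads)
#     # or use model.trust_region_step(...) / model.fit_values(...).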