r"""Script for evaluating a UVF agent. |
|
|
|
To run locally: See run_eval.py |
|
|
|
To run on borg: See train_eval.borg |
|
""" |
|
|
|
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os

import tensorflow as tf
slim = tf.contrib.slim
import gin.tf

import agent
import train
from utils import utils as uvf_utils
from utils import eval_utils
from environments import create_maze_env

flags = tf.app.flags

flags.DEFINE_string('eval_dir', None,
                    'Directory for writing logs/summaries during eval.')
flags.DEFINE_string('checkpoint_dir', None,
                    'Directory containing checkpoints to eval.')
FLAGS = flags.FLAGS


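# A minimal sketch of how this module is usually driven (run_eval.py, noted in
# the module docstring, wires up the same flags; the main() here mirrors that
# assumption and is not part of this file's API):
#
#   def main(_):
#     evaluate(FLAGS.checkpoint_dir, FLAGS.eval_dir)
#
#   if __name__ == '__main__':
#     tf.app.run()
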
def get_evaluate_checkpoint_fn(master, output_dir, eval_step_fns,
                               model_rollout_fn, gamma, max_steps_per_episode,
                               num_episodes_eval, num_episodes_videos,
                               tuner_hook, generate_videos,
                               generate_summaries, video_settings):
  """Returns a function that evaluates a given checkpoint.

  Args:
    master: BNS name of the TensorFlow master.
    output_dir: The output directory to which the metric summaries are written.
    eval_step_fns: A dictionary, indexed by summary tag name, of
      (step_fn, environment) pairs, where step_fn executes one environment
      step and returns the resulting [state, action, transition_type, reward,
      meta_reward, discount, context, state_repr] values.
    model_rollout_fn: A model rollout function as returned by
      get_model_rollout, or None.
    gamma: Discount factor for the reward.
    max_steps_per_episode: Maximum number of steps to run each episode for.
    num_episodes_eval: Number of episodes to evaluate and average reward over.
    num_episodes_videos: Number of episodes to record for video.
    tuner_hook: A callable(average reward, global step) that updates a Vizier
      tuner trial.
    generate_videos: Whether to generate videos of the agent in action.
    generate_summaries: Whether to generate summaries.
    video_settings: Settings for generating videos of the agent.

  Returns:
    A function that evaluates a checkpoint.
  """
  sess = tf.Session(master, graph=tf.get_default_graph())
  sess.run(tf.global_variables_initializer())
  sess.run(tf.local_variables_initializer())
  summary_writer = tf.summary.FileWriter(output_dir)

  def evaluate_checkpoint(checkpoint_path):
    """Performs a one-time evaluation of the given checkpoint.

    Args:
      checkpoint_path: Checkpoint to evaluate.
    Returns:
      True if the evaluation process should stop.
    """
    restore_fn = tf.contrib.framework.assign_from_checkpoint_fn(
        checkpoint_path,
        uvf_utils.get_all_vars(),
        ignore_missing_vars=True,
        reshape_variables=False)
    assert restore_fn is not None, 'cannot restore %s' % checkpoint_path
    restore_fn(sess)
    global_step = sess.run(slim.get_global_step())
    should_stop = False
    max_reward = -1e10
    max_meta_reward = -1e10

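    # Each entry of eval_step_fns maps a summary tag such as 'eval_hrl' to a
    # (step_fn, wrapped environment) pair built in evaluate() below.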
    for eval_tag, (eval_step, env_base) in sorted(eval_step_fns.items()):
      if hasattr(env_base, 'set_sess'):
        env_base.set_sess(sess)

      if generate_summaries:
        tf.logging.info(
            '[%s] Computing average reward over %d episodes at global step %d.',
            eval_tag, num_episodes_eval, global_step)
        (average_reward, last_reward,
         average_meta_reward, last_meta_reward, average_success,
         states, actions) = eval_utils.compute_average_reward(
             sess, env_base, eval_step, gamma, max_steps_per_episode,
             num_episodes_eval)
        tf.logging.info('[%s] Average reward = %f', eval_tag, average_reward)
        tf.logging.info('[%s] Last reward = %f', eval_tag, last_reward)
        tf.logging.info('[%s] Average meta reward = %f', eval_tag,
                        average_meta_reward)
        tf.logging.info('[%s] Last meta reward = %f', eval_tag,
                        last_meta_reward)
        tf.logging.info('[%s] Average success = %f', eval_tag, average_success)
        if model_rollout_fn is not None:
          preds, model_losses = eval_utils.compute_model_loss(
              sess, model_rollout_fn, states, actions)
          for i, (pred, state, model_loss) in enumerate(
              zip(preds, states, model_losses)):
            tf.logging.info('[%s] Model rollout step %d: loss=%f', eval_tag, i,
                            model_loss)
            tf.logging.info('[%s] Model rollout step %d: pred=%s', eval_tag, i,
                            str(pred.tolist()))
            tf.logging.info('[%s] Model rollout step %d: state=%s', eval_tag,
                            i, str(state.tolist()))

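        # Track the best average rewards seen during this evaluation;
        # max_reward is what gets reported to the tuner hook below.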
        if average_reward > max_reward:
          max_reward = average_reward
        if average_meta_reward > max_meta_reward:
          max_meta_reward = average_meta_reward

        for (tag, value) in [('Reward/average_%s_reward', average_reward),
                             ('Reward/last_%s_reward', last_reward),
                             ('Reward/average_%s_meta_reward',
                              average_meta_reward),
                             ('Reward/last_%s_meta_reward', last_meta_reward),
                             ('Reward/average_%s_success', average_success)]:
          summary_str = tf.Summary(value=[
              tf.Summary.Value(
                  tag=tag % eval_tag,
                  simple_value=value)
          ])
          summary_writer.add_summary(summary_str, global_step)
        summary_writer.flush()

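      # Optionally record a rollout video, resetting the underlying gym model
      # first (when it is exposed) so the recording starts from a fresh state.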
      if generate_videos or should_stop:
        if hasattr(env_base, '_gym_env'):
          tf.logging.info('Resetting before recording video')
          if hasattr(env_base._gym_env, 'reset_model'):
            env_base._gym_env.reset_model()
          else:
            env_base._gym_env.wrapped_env.reset_model()
        video_filename = os.path.join(output_dir, 'videos',
                                      '%s_step_%d.mp4' % (eval_tag,
                                                          global_step))
        eval_utils.capture_video(sess, eval_step, env_base,
                                 max_steps_per_episode * num_episodes_videos,
                                 video_filename, video_settings,
                                 reset_every=max_steps_per_episode)

    should_stop = should_stop or (generate_summaries and tuner_hook and
                                  tuner_hook(max_reward, global_step))
    return bool(should_stop)

  return evaluate_checkpoint


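# A hypothetical one-off use of the factory above (the argument values are
# placeholders; evaluate() below builds the real ones):
#
#   eval_fn = get_evaluate_checkpoint_fn(
#       '', '/tmp/eval', eval_step_fns, None, 0.99, 500, 10, 1, None,
#       False, True, None)
#   eval_fn(tf.train.latest_checkpoint('/tmp/train'))
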
def get_model_rollout(uvf_agent, tf_env):
  """Returns a model rollout function that predicts the next state.

  The agent's critic output is interpreted as a state delta, so the
  prediction is simply state + critic(state, action).
  """
  state_spec = tf_env.observation_spec()[0]
  action_spec = tf_env.action_spec()[0]
  state_ph = tf.placeholder(dtype=state_spec.dtype, shape=state_spec.shape)
  action_ph = tf.placeholder(dtype=action_spec.dtype, shape=action_spec.shape)

  merged_state = uvf_agent.merged_state(state_ph)
  diff_value = uvf_agent.critic_net(tf.expand_dims(merged_state, 0),
                                    tf.expand_dims(action_ph, 0))[0]
  diff_value = tf.cast(diff_value, dtype=state_ph.dtype)
  state_ph.shape.assert_is_compatible_with(diff_value.shape)
  next_state = state_ph + diff_value

  def model_rollout_fn(sess, state, action):
    return sess.run(next_state, feed_dict={state_ph: state, action_ph: action})

  return model_rollout_fn


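# Sketch of using the returned callable (state and action are numpy arrays
# matching the unwrapped environment specs):
#
#   model_rollout_fn = get_model_rollout(uvf_agent, tf_env)
#   predicted_next_state = model_rollout_fn(sess, state, action)
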
def get_eval_step(uvf_agent,
                  state_preprocess,
                  tf_env,
                  action_fn,
                  meta_action_fn,
                  environment_steps,
                  num_episodes,
                  mode='eval'):
  """Get one-step policy/env stepping ops.

  Args:
    uvf_agent: A UVF agent.
    state_preprocess: A state preprocess object for the agent.
    tf_env: A TFEnvironment.
    action_fn: A function to produce actions given current state.
    meta_action_fn: A function to produce meta actions given current state.
    environment_steps: A variable to count the number of steps in the tf_env.
    num_episodes: A variable to count the number of episodes.
    mode: A string representing the mode, one of [train, explore, eval].

  Returns:
    A step_fn that takes a session, executes one environment step, and
    returns the resulting [state, action, transition_type, reward,
    meta_reward, discount, context, state_repr] values.
  """
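  # The ops below build a single in-graph environment step. The action is fed
  # through a placeholder so that the same graph serves both callable and
  # tensor-valued policies (see step_fn at the end of this function).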
  tf_env.start_collect()
  state = tf_env.current_obs()
  action = action_fn(state, context=None)
  state_repr = state_preprocess(state)

  action_spec = tf_env.action_spec()
  action_ph = tf.placeholder(dtype=action_spec.dtype, shape=action_spec.shape)
  with tf.control_dependencies([state]):
    transition_type, reward, discount = tf_env.step(action_ph)

  def increment_step():
    return environment_steps.assign_add(1)

  def increment_episode():
    return num_episodes.assign_add(1)

  def no_op_int():
    return tf.constant(0, dtype=tf.int64)

  step_cond = uvf_agent.step_cond_fn(state, action,
                                     transition_type,
                                     environment_steps, num_episodes)
  reset_episode_cond = uvf_agent.reset_episode_cond_fn(
      state, action,
      transition_type, environment_steps, num_episodes)
  reset_env_cond = uvf_agent.reset_env_cond_fn(state, action,
                                               transition_type,
                                               environment_steps, num_episodes)

  increment_step_op = tf.cond(step_cond, increment_step, no_op_int)
  with tf.control_dependencies([increment_step_op]):
    increment_episode_op = tf.cond(reset_episode_cond, increment_episode,
                                   no_op_int)

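  # Chain the remaining ops with control dependencies so that a single
  # session run performs: environment step -> counter updates -> reward
  # post-processing -> optional environment reset.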
  with tf.control_dependencies([reward, discount]):
    next_state = tf_env.current_obs()
    next_state_repr = state_preprocess(next_state)

  with tf.control_dependencies([increment_episode_op]):
    post_reward, post_meta_reward = uvf_agent.cond_begin_episode_op(
        tf.logical_not(reset_episode_cond),
        [state, action_ph, reward, next_state,
         state_repr, next_state_repr],
        mode=mode, meta_action_fn=meta_action_fn)

  with tf.control_dependencies([post_reward, post_meta_reward]):
    cond_reset_op = tf.cond(reset_env_cond,
                            tf_env.reset,
                            tf_env.current_time_step)

  with tf.control_dependencies(cond_reset_op):
    post_reward, post_meta_reward = map(tf.identity,
                                        [post_reward, post_meta_reward])

  eval_step = [next_state, action_ph, transition_type, post_reward,
               post_meta_reward, discount, uvf_agent.context_vars, state_repr]

  if callable(action):
    def step_fn(sess):
      action_value = action(sess)
      return sess.run(eval_step, feed_dict={action_ph: action_value})
  else:
    action = uvf_utils.clip_to_spec(action, action_spec)
    def step_fn(sess):
      action_value = sess.run(action)
      return sess.run(eval_step, feed_dict={action_ph: action_value})

  return step_fn


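# A hypothetical driver for a step_fn returned by get_eval_step above (the
# eight outputs mirror the eval_step list; session setup is assumed):
#
#   (state, action, transition_type, reward, meta_reward, discount,
#    context, state_repr) = step_fn(sess)
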
@gin.configurable
def evaluate(checkpoint_dir,
             eval_dir,
             environment=None,
             num_bin_actions=3,
             agent_class=None,
             meta_agent_class=None,
             state_preprocess_class=None,
             gamma=1.0,
             num_episodes_eval=10,
             eval_interval_secs=60,
             max_number_of_evaluations=None,
             checkpoint_timeout=None,
             timeout_fn=None,
             tuner_hook=None,
             generate_videos=False,
             generate_summaries=True,
             num_episodes_videos=5,
             video_settings=None,
             eval_modes=('eval',),
             eval_model_rollout=False,
             policy_save_dir='policy',
             checkpoint_range=None,
             checkpoint_path=None,
             max_steps_per_episode=None,
             evaluate_nohrl=False):
"""Loads and repeatedly evaluates a checkpointed model at a set interval. |
|
|
|
Args: |
|
checkpoint_dir: The directory where the checkpoints reside. |
|
eval_dir: Directory to save the evaluation summary results. |
|
environment: A BaseEnvironment to evaluate. |
|
num_bin_actions: Number of bins for discretizing continuous actions. |
|
agent_class: An RL agent class. |
|
meta_agent_class: A Meta agent class. |
|
gamma: Discount factor for the reward. |
|
num_episodes_eval: Number of episodes to evaluate and average reward over. |
|
eval_interval_secs: The number of seconds between each evaluation run. |
|
max_number_of_evaluations: The max number of evaluations. If None the |
|
evaluation continues indefinitely. |
|
checkpoint_timeout: The maximum amount of time to wait between checkpoints. |
|
If left as `None`, then the process will wait indefinitely. |
|
timeout_fn: Optional function to call after a timeout. |
|
tuner_hook: A callable that takes the average reward and global step and |
|
updates a Vizier tuner trial. |
|
generate_videos: Whether to generate videos of the agent in action. |
|
generate_summaries: Whether to generate summaries. |
|
num_episodes_videos: Number of episodes to evaluate for generating videos. |
|
video_settings: Settings for generating videos of the agent. |
|
optimal action based on the critic. |
|
eval_modes: A tuple of eval modes. |
|
eval_model_rollout: Evaluate model rollout. |
|
policy_save_dir: Optional sub-directory where the policies are |
|
saved. |
|
checkpoint_range: Optional. If provided, evaluate all checkpoints in |
|
the range. |
|
checkpoint_path: Optional sub-directory specifying which checkpoint to |
|
evaluate. If None, will evaluate the most recent checkpoint. |
|
""" |
|
  tf_env = create_maze_env.TFPyEnvironment(environment)
  observation_spec = [tf_env.observation_spec()]
  action_spec = [tf_env.action_spec()]

  assert max_steps_per_episode, 'max_steps_per_episode needs to be set'

  if agent_class.ACTION_TYPE == 'discrete':
    assert False, 'discrete action agents are not supported for evaluation'
  else:
    assert agent_class.ACTION_TYPE == 'continuous'

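  # The variable scopes below presumably need to match the ones used during
  # training so that assign_from_checkpoint_fn can locate the variables.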
  if meta_agent_class is not None:
    assert agent_class.ACTION_TYPE == meta_agent_class.ACTION_TYPE
    with tf.variable_scope('meta_agent'):
      meta_agent = meta_agent_class(
          observation_spec,
          action_spec,
          tf_env,
      )
  else:
    meta_agent = None

  with tf.variable_scope('uvf_agent'):
    uvf_agent = agent_class(
        observation_spec,
        action_spec,
        tf_env,
    )
    uvf_agent.set_meta_agent(agent=meta_agent)

  with tf.variable_scope('state_preprocess'):
    state_preprocess = state_preprocess_class()

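  # Run the actor and critic once on dummy inputs so that their variables are
  # created in the graph before the checkpoint restore happens.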
  temp_states = tf.expand_dims(
      tf.zeros(
          dtype=uvf_agent._observation_spec.dtype,
          shape=uvf_agent._observation_spec.shape), 0)
  temp_actions = uvf_agent.actor_net(temp_states)
  uvf_agent.critic_net(temp_states, temp_actions)

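  # Build one eval step per (mode, hrl/nohrl) combination. The list
  # arithmetic below yields [True] when evaluate_nohrl is False and
  # [True, False] when it is True.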
  eval_step_fns = dict()
  meta_agent = uvf_agent.meta_agent
  for meta in [True] + [False] * evaluate_nohrl:
    meta_tag = 'hrl' if meta else 'nohrl'
    uvf_agent.set_meta_agent(meta_agent if meta else None)
    for mode in eval_modes:
      wrapped_environment = uvf_agent.get_env_base_wrapper(
          environment, mode=mode)
      action_wrapper = lambda agent_: agent_.action
      action_fn = action_wrapper(uvf_agent)
      meta_action_fn = action_wrapper(meta_agent)
      eval_step_fns['%s_%s' % (mode, meta_tag)] = (get_eval_step(
          uvf_agent=uvf_agent,
          state_preprocess=state_preprocess,
          tf_env=tf_env,
          action_fn=action_fn,
          meta_action_fn=meta_action_fn,
          environment_steps=tf.Variable(
              0, dtype=tf.int64, name='environment_steps'),
          num_episodes=tf.Variable(0, dtype=tf.int64, name='num_episodes'),
          mode=mode), wrapped_environment)

  model_rollout_fn = None
  if eval_model_rollout:
    model_rollout_fn = get_model_rollout(uvf_agent, tf_env)

  tf.train.get_or_create_global_step()

  if policy_save_dir:
    checkpoint_dir = os.path.join(checkpoint_dir, policy_save_dir)

  tf.logging.info('Evaluating policies at %s', checkpoint_dir)
  tf.logging.info('Running episodes for at most %d steps',
                  max_steps_per_episode)

  evaluate_checkpoint_fn = get_evaluate_checkpoint_fn(
      '', eval_dir, eval_step_fns, model_rollout_fn, gamma,
      max_steps_per_episode, num_episodes_eval, num_episodes_videos,
      tuner_hook, generate_videos, generate_summaries, video_settings)

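  # Dispatch on how the checkpoints were specified: a single explicit
  # checkpoint, a numeric range of checkpoint indices, or (by default)
  # repeated polling for new checkpoints.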
  if checkpoint_path is not None:
    checkpoint_path = os.path.join(checkpoint_dir, checkpoint_path)
    evaluate_checkpoint_fn(checkpoint_path)
  elif checkpoint_range is not None:
    model_files = tf.gfile.Glob(
        os.path.join(checkpoint_dir, 'model.ckpt-*.index'))
    tf.logging.info('Found %d policies at %s', len(model_files),
                    checkpoint_dir)
    model_files = {
        int(f.split('model.ckpt-', 1)[1].split('.', 1)[0]):
            os.path.splitext(f)[0]
        for f in model_files
    }
    model_files = {
        k: v
        for k, v in model_files.items()
        if checkpoint_range[0] <= k <= checkpoint_range[1]
    }
    tf.logging.info('Evaluating %d policies at %s',
                    len(model_files), checkpoint_dir)
    for _, checkpoint_path in sorted(model_files.items()):
      evaluate_checkpoint_fn(checkpoint_path)
  else:
    eval_utils.evaluate_checkpoint_repeatedly(
        checkpoint_dir,
        evaluate_checkpoint_fn,
        eval_interval_secs=eval_interval_secs,
        max_number_of_evaluations=max_number_of_evaluations,
        checkpoint_timeout=checkpoint_timeout,
        timeout_fn=timeout_fn)