"""Utilities related to computing training batches from episode rollouts.

Implementations here are based on code from OpenAI:
https://github.com/openai/universe-starter-agent/blob/master/a3c.py.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from collections import namedtuple
import numpy as np
import scipy.signal

from common import utils


class Rollout(object):
  """Holds a rollout for an episode.

  A rollout is a record of the states observed in some environment and actions
  taken by the agent to arrive at those states. Other information includes
  rewards received after each action, values estimated for each state, whether
  the rollout concluded the episode, and the total reward received. Everything
  should be given in time order.

  At each time t, the agent sees state s_t, takes action a_t, and then receives
  reward r_t. The agent may optionally estimate a state value V(s_t) for each
  state.

  For an episode of length T:
  states = [s_0, ..., s_(T-1)]
  actions = [a_0, ..., a_(T-1)]
  rewards = [r_0, ..., r_(T-1)]
  values = [V(s_0), ..., V(s_(T-1))]

  Note that there is an extra state s_T observed after taking action a_(T-1),
  but this is not included in the rollout.

  Rollouts have a `terminated` attribute which is True when the rollout is
  "finalized", i.e. it holds a full episode. `terminated` is False while
  timesteps are still being added to it.
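
  Example (a minimal sketch using integer states and actions):
    rollout = Rollout()
    rollout.add(state=0, action=1, reward=0.0, value=0.5)
    rollout.add(state=2, action=0, reward=1.0, value=0.2, terminated=True)
    # rollout.terminated is now True and rollout.total_reward == 1.0.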
""" |
|
|
|
def __init__(self): |
|
self.states = [] |
|
self.actions = [] |
|
self.rewards = [] |
|
self.values = [] |
|
self.total_reward = 0.0 |
|
self.terminated = False |
|
|
|
def add(self, state, action, reward, value=0.0, terminated=False): |
|
"""Add the next timestep to this rollout. |
|
|
|
Args: |
|
state: The state observed at the start of this timestep. |
|
action: The action taken after observing the given state. |
|
reward: The reward received for taking the given action. |
|
value: The value estimated for the given state. |
|
terminated: Whether this timestep ends the episode. |
|
|
|
Raises: |
|
ValueError: If this.terminated is already True, meaning that the episode |
|
has already ended. |
|
""" |
|
if self.terminated: |
|
raise ValueError( |
|
'Trying to add timestep to an already terminal rollout.') |
|
self.states += [state] |
|
self.actions += [action] |
|
self.rewards += [reward] |
|
self.values += [value] |
|
self.terminated = terminated |
|
self.total_reward += reward |
|
|
|

  def add_many(self, states, actions, rewards, values=None, terminated=False):
    """Add many timesteps to this rollout.

    Arguments are the same as `add`, but are lists of equal length.

    Args:
      states: The states observed.
      actions: The actions taken.
      rewards: The rewards received.
      values: The values estimated for the given states.
      terminated: Whether this sequence ends the episode.

    Raises:
      ValueError: If the lengths of the input lists are not all equal.
      ValueError: If self.terminated is already True, meaning that the episode
          has already ended.
    """
    if len(states) != len(actions):
      raise ValueError(
          'Number of states and actions must be the same. Got %d states and '
          '%d actions' % (len(states), len(actions)))
    if len(states) != len(rewards):
      raise ValueError(
          'Number of states and rewards must be the same. Got %d states and '
          '%d rewards' % (len(states), len(rewards)))
    if values is not None and len(states) != len(values):
      raise ValueError(
          'Number of states and values must be the same. Got %d states and '
          '%d values' % (len(states), len(values)))
    if self.terminated:
      raise ValueError(
          'Trying to add timesteps to an already terminal rollout.')
    self.states += states
    self.actions += actions
    self.rewards += rewards
    self.values += values if values is not None else [0.0] * len(states)
    self.terminated = terminated
    self.total_reward += sum(rewards)

  def extend(self, other):
    """Append another rollout to this rollout."""
    assert not self.terminated
    self.states.extend(other.states)
    self.actions.extend(other.actions)
    self.rewards.extend(other.rewards)
    self.values.extend(other.values)
    self.terminated = other.terminated
    self.total_reward += other.total_reward


def discount(x, gamma):
  """Returns discounted sums for each value in x, with discount factor gamma.

  This can be used to compute the return (discounted sum of rewards) at each
  timestep given a sequence of rewards. See the definitions for return and
  REINFORCE in section 3 of https://arxiv.org/pdf/1602.01783.pdf.

  Let g^k mean gamma ** k.
  For list [x_0, ..., x_N], the following list of discounted sums is computed:
  [x_0 + g^1 * x_1 + g^2 * x_2 + ... + g^N * x_N,
   x_1 + g^1 * x_2 + g^2 * x_3 + ... + g^(N-1) * x_N,
   x_2 + g^1 * x_3 + g^2 * x_4 + ... + g^(N-2) * x_N,
   ...,
   x_(N-1) + g^1 * x_N,
   x_N]

  Args:
    x: List (or 1D numpy array) of numbers [x_0, ..., x_N].
    gamma: Float between 0 and 1 (inclusive). This is the discount factor.

  Returns:
    1D numpy array of discounted sums, in the same time order as x.
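
  Example:
    discount([1.0, 2.0, 3.0], gamma=0.5) returns [2.75, 3.5, 3.0], since
    1 + 0.5 * 2 + 0.25 * 3 = 2.75 and 2 + 0.5 * 3 = 3.5.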
""" |
|
return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1] |
|
|
|
|
|


def discounted_advantage_and_rewards(rewards, values, gamma, lambda_=1.0):
  """Compute advantages and returns (discounted sums of rewards).

  For an episode of length T, rewards = [r_0, ..., r_(T-1)].
  Each reward r_t is observed after taking action a_t at state s_t. A final
  state s_T is observed but no reward is given at this state since no action
  a_T is taken (otherwise there would be a new state s_(T+1)).

  `rewards` and `values` are for a single episode. Return R_t is the discounted
  sum of future rewards starting at time t, where `gamma` is the discount
  factor.
  R_t = r_t + gamma * r_(t+1) + gamma**2 * r_(t+2) + ...
        + gamma**(T-1-t) * r_(T-1)

  Advantage A(a_t, s_t) is approximated by computing A(a_t, s_t) = R_t - V(s_t)
  where V(s_t) is an approximation of the value at that state, given in the
  `values` list. Returns R_t are needed for all REINFORCE algorithms. Advantage
  is used for the advantage actor-critic variant of REINFORCE.
  See algorithm S3 in https://arxiv.org/pdf/1602.01783.pdf.

  Additionally, the parameter `lambda_` controls the bias-variance tradeoff.
  See "Generalized Advantage Estimation": https://arxiv.org/abs/1506.02438.
  lambda_ = 1 reduces to the regular advantage above.
  0 <= lambda_ < 1 trades off variance for bias, with lambda_ = 0 being the
  most biased.
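
  Concretely (following the GAE paper above, and matching the computation
  below), define the one-step TD error
    delta_t = r_t + gamma * V(s_(t+1)) - V(s_t).
  The generalized advantage is then
    A_t = delta_t + (gamma * lambda_) * delta_(t+1)
          + (gamma * lambda_)**2 * delta_(t+2) + ...
  which for lambda_ = 1 telescopes to the regular advantage R_t - V(s_t).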

  Bootstrapping is also supported. If an episode does not end in a terminal
  state (either because the episode was ended early, or because the environment
  does not have end states), the true return cannot be computed from the
  rewards alone. However, it can be estimated by computing the value (an
  approximation of return) of the last state s_T. Thus the `values` list will
  have an extra item:
  values = [V(s_0), ..., V(s_(T-1)), V(s_T)].

  Args:
    rewards: List of observed rewards [r_0, ..., r_(T-1)].
    values: List of estimated values [V(s_0), ..., V(s_(T-1))] with an optional
        extra V(s_T) item.
    gamma: Discount factor. Number between 0 and 1. 1 means no discount.
        If not 1, gamma is typically near 1, like 0.99.
    lambda_: Bias-variance tradeoff factor. Between 0 and 1.

  Returns:
    empirical_values: Returns (discounted sums of rewards) at each timestep.
    generalized_advantage: Advantages at each timestep.

  Raises:
    ValueError: If shapes of `rewards` and `values` are not rank 1.
    ValueError: If len(values) not in (len(rewards), len(rewards) + 1).
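
  Example (a small hand-checkable case):
    With rewards = [1.0, 1.0], values = [0.0, 0.0], gamma = 0.5, and
    lambda_ = 1.0, the returns are [1.5, 1.0]; since the value estimates are
    all zero, the advantages are also [1.5, 1.0].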
""" |
|
rewards = np.asarray(rewards, dtype=np.float32) |
|
values = np.asarray(values, dtype=np.float32) |
|
if rewards.ndim != 1: |
|
raise ValueError('Single episode only. rewards must be rank 1.') |
|
if values.ndim != 1: |
|
raise ValueError('Single episode only. values must be rank 1.') |
|
if len(values) == len(rewards): |
|
|
|
values = np.append(values, 0) |
|
empirical_values = discount(rewards, gamma) |
|
elif len(values) == len(rewards) + 1: |
|
|
|
|
|
|
|
empirical_values = discount(np.append(rewards, values[-1]), gamma)[:-1] |
|
else: |
|
raise ValueError('values should contain the same number of items or one ' |
|
'more item than rewards') |
|
delta = rewards + gamma * values[1:] - values[:-1] |
|
generalized_advantage = discount(delta, gamma * lambda_) |
|
|
|
|
|
|
|
return empirical_values, generalized_advantage |
|
|
|
|
|
"""Batch holds a minibatch of episodes. |
|
|
|
Let bi = batch_index, i.e. the index of each episode in the minibatch. |
|
Let t = time. |
|
|
|
Attributes: |
|
states: States for each timestep in each episode. Indexed by states[bi, t]. |
|
actions: Actions for each timestep in each episode. Indexed by actions[bi, t]. |
|
discounted_adv: Advantages (computed by discounted_advantage_and_rewards) |
|
for each timestep in each episode. Indexed by discounted_adv[bi, t]. |
|
discounted_r: Returns (discounted sum of rewards computed by |
|
discounted_advantage_and_rewards) for each timestep in each episode. |
|
Indexed by discounted_r[bi, t]. |
|
total_rewards: Total reward for each episode, i.e. sum of rewards across all |
|
timesteps (not discounted). Indexed by total_rewards[bi]. |
|
episode_lengths: Number of timesteps in each episode. If an episode has |
|
N actions, N rewards, and N states, then its length is N. Indexed by |
|
episode_lengths[bi]. |
|
batch_size: Number of episodes in this minibatch. An integer. |
|
max_time: Maximum episode length in the batch. An integer. |
|
""" |
|
Batch = namedtuple( |
|
'Batch', |
|
['states', 'actions', 'discounted_adv', 'discounted_r', 'total_rewards', |
|
'episode_lengths', 'batch_size', 'max_time']) |
|
|
|
|
|


def process_rollouts(rollouts, gamma, lambda_=1.0):
  """Convert a batch of rollouts into arrays ready to be fed into a model.

  Lists from each episode are stacked into 2D arrays and padded with 0s up to
  the maximum timestep in the batch.

  Args:
    rollouts: A list of Rollout instances.
    gamma: The discount factor. A number between 0 and 1 (inclusive). See the
        gamma argument in discounted_advantage_and_rewards.
    lambda_: See the lambda_ argument in discounted_advantage_and_rewards.

  Returns:
    Batch instance. states, actions, discounted_adv, and discounted_r are
    numpy arrays with shape (batch_size, max_episode_length). episode_lengths
    is a list of ints. total_rewards is a list of floats (total reward in each
    episode). batch_size and max_time are ints.

  Raises:
    ValueError: If any of the rollouts are not terminal.
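
  Example (a minimal sketch; assumes scalar integer states and actions so that
  utils.stack_pad can stack them into 2D arrays):
    ro = Rollout()
    ro.add_many(states=[0, 1, 2], actions=[1, 0, 1], rewards=[0., 0., 1.],
                terminated=True)
    batch = process_rollouts([ro], gamma=0.99)
    # batch.batch_size == 1, batch.max_time == 3,
    # batch.total_rewards == [1.0].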
""" |
|
for ro in rollouts: |
|
if not ro.terminated: |
|
raise ValueError('Can only process terminal rollouts.') |
|
|
|
episode_lengths = [len(ro.states) for ro in rollouts] |
|
batch_size = len(rollouts) |
|
max_time = max(episode_lengths) |
|
|
|
states = utils.stack_pad([ro.states for ro in rollouts], 0, max_time) |
|
actions = utils.stack_pad([ro.actions for ro in rollouts], 0, max_time) |
|
|
|
discounted_rewards = [None] * batch_size |
|
discounted_adv = [None] * batch_size |
|
for i, ro in enumerate(rollouts): |
|
disc_r, disc_adv = discounted_advantage_and_rewards( |
|
ro.rewards, ro.values, gamma, lambda_) |
|
discounted_rewards[i] = disc_r |
|
discounted_adv[i] = disc_adv |
|
discounted_rewards = utils.stack_pad(discounted_rewards, 0, max_time) |
|
discounted_adv = utils.stack_pad(discounted_adv, 0, max_time) |
|
|
|
total_rewards = [sum(ro.rewards) for ro in rollouts] |
|
|
|
return Batch(states=states, |
|
actions=actions, |
|
discounted_adv=discounted_adv, |
|
discounted_r=discounted_rewards, |
|
total_rewards=total_rewards, |
|
episode_lengths=episode_lengths, |
|
batch_size=batch_size, |
|
max_time=max_time) |
|
|