from __future__ import absolute_import from __future__ import division from __future__ import print_function """Utilities related to computing training batches from episode rollouts. Implementations here are based on code from Open AI: https://github.com/openai/universe-starter-agent/blob/master/a3c.py. """ from collections import namedtuple import numpy as np import scipy.signal from common import utils # brain coder class Rollout(object): """Holds a rollout for an episode. A rollout is a record of the states observed in some environment and actions taken by the agent to arrive at those states. Other information includes rewards received after each action, values estimated for each state, whether the rollout concluded the episide, and total reward received. Everything should be given in time order. At each time t, the agent sees state s_t, takes action a_t, and then receives reward r_t. The agent may optionally estimate a state value V(s_t) for each state. For an episode of length T: states = [s_0, ..., s_(T-1)] actions = [a_0, ..., a_(T-1)] rewards = [r_0, ..., r_(T-1)] values = [V(s_0), ..., V(s_(T-1))] Note that there is an extra state s_T observed after taking action a_(T-1), but this is not included in the rollout. Rollouts have an `terminated` attribute which is True when the rollout is "finalized", i.e. it holds a full episode. terminated will be False when time steps are still being added to it. """ def __init__(self): self.states = [] self.actions = [] self.rewards = [] self.values = [] self.total_reward = 0.0 self.terminated = False def add(self, state, action, reward, value=0.0, terminated=False): """Add the next timestep to this rollout. Args: state: The state observed at the start of this timestep. action: The action taken after observing the given state. reward: The reward received for taking the given action. value: The value estimated for the given state. terminated: Whether this timestep ends the episode. Raises: ValueError: If this.terminated is already True, meaning that the episode has already ended. """ if self.terminated: raise ValueError( 'Trying to add timestep to an already terminal rollout.') self.states += [state] self.actions += [action] self.rewards += [reward] self.values += [value] self.terminated = terminated self.total_reward += reward def add_many(self, states, actions, rewards, values=None, terminated=False): """Add many timesteps to this rollout. Arguments are the same as `add`, but are lists of equal size. Args: states: The states observed. actions: The actions taken. rewards: The rewards received. values: The values estimated for the given states. terminated: Whether this sequence ends the episode. Raises: ValueError: If the lengths of all the input lists are not equal. ValueError: If this.terminated is already True, meaning that the episode has already ended. """ if len(states) != len(actions): raise ValueError( 'Number of states and actions must be the same. Got %d states and ' '%d actions' % (len(states), len(actions))) if len(states) != len(rewards): raise ValueError( 'Number of states and rewards must be the same. Got %d states and ' '%d rewards' % (len(states), len(rewards))) if values is not None and len(states) != len(values): raise ValueError( 'Number of states and values must be the same. Got %d states and ' '%d values' % (len(states), len(values))) if self.terminated: raise ValueError( 'Trying to add timesteps to an already terminal rollout.') self.states += states self.actions += actions self.rewards += rewards self.values += values if values is not None else [0.0] * len(states) self.terminated = terminated self.total_reward += sum(rewards) def extend(self, other): """Append another rollout to this rollout.""" assert not self.terminated self.states.extend(other.states) self.actions.extend(other.actions) self.rewards.extend(other.rewards) self.values.extend(other.values) self.terminated = other.terminated self.total_reward += other.total_reward def discount(x, gamma): """Returns discounted sums for each value in x, with discount factor gamma. This can be used to compute the return (discounted sum of rewards) at each timestep given a sequence of rewards. See the definitions for return and REINFORCE in section 3 of https://arxiv.org/pdf/1602.01783.pdf. Let g^k mean gamma ** k. For list [x_0, ..., x_N], the following list of discounted sums is computed: [x_0 + g^1 * x_1 + g^2 * x_2 + ... g^N * x_N, x_1 + g^1 * x_2 + g^2 * x_3 + ... g^(N-1) * x_N, x_2 + g^1 * x_3 + g^2 * x_4 + ... g^(N-2) * x_N, ..., x_(N-1) + g^1 * x_N, x_N] Args: x: List of numbers [x_0, ..., x_N]. gamma: Float between 0 and 1 (inclusive). This is the discount factor. Returns: List of discounted sums. """ return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1] def discounted_advantage_and_rewards(rewards, values, gamma, lambda_=1.0): """Compute advantages and returns (discounted sum of rewards). For an episode of length T, rewards = [r_0, ..., r_(T-1)]. Each reward r_t is observed after taking action a_t at state s_t. A final state s_T is observed but no reward is given at this state since no action a_T is taken (otherwise there would be a new state s_(T+1)). `rewards` and `values` are for a single episode. Return R_t is the discounted sum of future rewards starting at time t, where `gamma` is the discount factor. R_t = r_t + gamma * r_(t+1) + gamma**2 * r_(t+2) + ... + gamma**(T-1-t) * r_(T-1) Advantage A(a_t, s_t) is approximated by computing A(a_t, s_t) = R_t - V(s_t) where V(s_t) is an approximation of the value at that state, given in the `values` list. Returns R_t are needed for all REINFORCE algorithms. Advantage is used for the advantage actor critic variant of REINFORCE. See algorithm S3 in https://arxiv.org/pdf/1602.01783.pdf. Additionally another parameter `lambda_` controls the bias-variance tradeoff. See "Generalized Advantage Estimation": https://arxiv.org/abs/1506.02438. lambda_ = 1 reduces to regular advantage. 0 <= lambda_ < 1 trades off variance for bias, with lambda_ = 0 being the most biased. Bootstrapping is also supported. If an episode does not end in a terminal state (either because the episode was ended early, or the environment does not have end states), the true return cannot be computed from the rewards alone. However, it can be estimated by computing the value (an approximation of return) of the last state s_T. Thus the `values` list will have an extra item: values = [V(s_0), ..., V(s_(T-1)), V(s_T)]. Args: rewards: List of observed rewards [r_0, ..., r_(T-1)]. values: List of estimated values [V(s_0), ..., V(s_(T-1))] with an optional extra V(s_T) item. gamma: Discount factor. Number between 0 and 1. 1 means no discount. If not 1, gamma is typically near 1, like 0.99. lambda_: Bias-variance tradeoff factor. Between 0 and 1. Returns: empirical_values: Returns at each timestep. generalized_advantage: Avantages at each timestep. Raises: ValueError: If shapes of `rewards` and `values` are not rank 1. ValueError: If len(values) not in (len(rewards), len(rewards) + 1). """ rewards = np.asarray(rewards, dtype=np.float32) values = np.asarray(values, dtype=np.float32) if rewards.ndim != 1: raise ValueError('Single episode only. rewards must be rank 1.') if values.ndim != 1: raise ValueError('Single episode only. values must be rank 1.') if len(values) == len(rewards): # No bootstrapping. values = np.append(values, 0) empirical_values = discount(rewards, gamma) elif len(values) == len(rewards) + 1: # With bootstrapping. # Last value is for the terminal state (final state after last action was # taken). empirical_values = discount(np.append(rewards, values[-1]), gamma)[:-1] else: raise ValueError('values should contain the same number of items or one ' 'more item than rewards') delta = rewards + gamma * values[1:] - values[:-1] generalized_advantage = discount(delta, gamma * lambda_) # empirical_values is the discounted sum of rewards into the future. # generalized_advantage is the target for each policy update. return empirical_values, generalized_advantage """Batch holds a minibatch of episodes. Let bi = batch_index, i.e. the index of each episode in the minibatch. Let t = time. Attributes: states: States for each timestep in each episode. Indexed by states[bi, t]. actions: Actions for each timestep in each episode. Indexed by actions[bi, t]. discounted_adv: Advantages (computed by discounted_advantage_and_rewards) for each timestep in each episode. Indexed by discounted_adv[bi, t]. discounted_r: Returns (discounted sum of rewards computed by discounted_advantage_and_rewards) for each timestep in each episode. Indexed by discounted_r[bi, t]. total_rewards: Total reward for each episode, i.e. sum of rewards across all timesteps (not discounted). Indexed by total_rewards[bi]. episode_lengths: Number of timesteps in each episode. If an episode has N actions, N rewards, and N states, then its length is N. Indexed by episode_lengths[bi]. batch_size: Number of episodes in this minibatch. An integer. max_time: Maximum episode length in the batch. An integer. """ # pylint: disable=pointless-string-statement Batch = namedtuple( 'Batch', ['states', 'actions', 'discounted_adv', 'discounted_r', 'total_rewards', 'episode_lengths', 'batch_size', 'max_time']) def process_rollouts(rollouts, gamma, lambda_=1.0): """Convert a batch of rollouts into tensors ready to be fed into a model. Lists from each episode are stacked into 2D tensors and padded with 0s up to the maximum timestep in the batch. Args: rollouts: A list of Rollout instances. gamma: The discount factor. A number between 0 and 1 (inclusive). See gamma argument in discounted_advantage_and_rewards. lambda_: See lambda_ argument in discounted_advantage_and_rewards. Returns: Batch instance. states, actions, discounted_adv, and discounted_r are numpy arrays with shape (batch_size, max_episode_length). episode_lengths is a list of ints. total_rewards is a list of floats (total reward in each episode). batch_size and max_time are ints. Raises: ValueError: If any of the rollouts are not terminal. """ for ro in rollouts: if not ro.terminated: raise ValueError('Can only process terminal rollouts.') episode_lengths = [len(ro.states) for ro in rollouts] batch_size = len(rollouts) max_time = max(episode_lengths) states = utils.stack_pad([ro.states for ro in rollouts], 0, max_time) actions = utils.stack_pad([ro.actions for ro in rollouts], 0, max_time) discounted_rewards = [None] * batch_size discounted_adv = [None] * batch_size for i, ro in enumerate(rollouts): disc_r, disc_adv = discounted_advantage_and_rewards( ro.rewards, ro.values, gamma, lambda_) discounted_rewards[i] = disc_r discounted_adv[i] = disc_adv discounted_rewards = utils.stack_pad(discounted_rewards, 0, max_time) discounted_adv = utils.stack_pad(discounted_adv, 0, max_time) total_rewards = [sum(ro.rewards) for ro in rollouts] return Batch(states=states, actions=actions, discounted_adv=discounted_adv, discounted_r=discounted_rewards, total_rewards=total_rewards, episode_lengths=episode_lengths, batch_size=batch_size, max_time=max_time)