|
"""Reward functions, distance functions, and reward managers."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from abc import ABCMeta
from abc import abstractmethod
from math import log
|

# Scalar distance functions. All sequence elements are ints in [0, base).

def abs_diff(a, b, base=0):
  """Absolute value of difference between scalars.

  abs_diff is symmetric, i.e. `a` and `b` are interchangeable.

  Args:
    a: First argument. An int.
    b: Second argument. An int.
    base: Dummy argument so that the argument signature matches other scalar
        diff functions. abs_diff is the same in all bases.

  Returns:
    abs(a - b).
  """
  del base  # Unused. Present only so the signature matches other diff fns.
  return abs(a - b)
|
|
def mod_abs_diff(a, b, base):
  """Shortest distance between `a` and `b` in the modular integers base `base`.

  The smallest distance between a and b is returned.
  Example: mod_abs_diff(1, 99, 100) ==> 2. It is not 98.

  mod_abs_diff is symmetric, i.e. `a` and `b` are interchangeable.

  Args:
    a: First argument. An int.
    b: Second argument. An int.
    base: The modulo base. A positive int.

  Returns:
    Shortest distance.
  """
  diff = abs(a - b)
  if diff >= base:
    diff %= base
  # The result is the smaller of going directly between the values or wrapping
  # around the other way in the modular ring.
  return min(diff, (-diff) + base)
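
# Quick comparison of the two scalar diff functions (illustrative values):
#   abs_diff(1, 99)           ==> 98
#   mod_abs_diff(1, 99, 100)  ==> 2   (wrap-around path 99 -> 0 -> 1)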
|

# List distance functions.

def absolute_distance(pred, target, base, scalar_diff_fn=abs_diff):
  """Asymmetric list distance function.

  List distance is the sum of element-wise distances, like Hamming distance,
  but where `pred` can be longer or shorter than `target`. For each position in
  both `pred` and `target`, distance between those elements is computed with
  `scalar_diff_fn`. For missing or extra elements in `pred`, the maximum
  distance is assigned, which is equal to `base`.

  Distance is 0 when `pred` and `target` are identical, and will be a positive
  integer when they are not.

  Args:
    pred: Prediction list. Distance from this list is computed.
    target: Target list. Distance to this list is computed.
    base: The integer base to use. For example, a list of chars would use base
        256.
    scalar_diff_fn: Element-wise distance function.

  Returns:
    List distance between `pred` and `target`.
  """
  d = 0
  for i, target_t in enumerate(target):
    if i >= len(pred):
      d += base  # A missing element costs the maximum distance.
    else:
      d += scalar_diff_fn(pred[i], target_t, base)
  if len(pred) > len(target):
    # Each extra element in `pred` also costs the maximum distance.
    d += (len(pred) - len(target)) * base
  return d
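
# Illustrative example (hypothetical values, not from the original source):
#   absolute_distance([1, 2], [1, 5, 7], base=10)
#     = abs_diff(1, 1) + abs_diff(2, 5) + 10  (missing third element)
#     = 0 + 3 + 10 = 13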
|
|
def log_absolute_distance(pred, target, base):
  """Asymmetric list distance function that uses log distance.

  A list distance which computes sum of element-wise distances, similar to
  `absolute_distance`. Unlike `absolute_distance`, this scales the resulting
  distance to be a float.

  Element-wise distances are log-scale. Distance between two lists changes
  relatively little for elements that are far apart, but changes a lot (goes to
  0 faster) when values get close together.

  Args:
    pred: List of ints. Computes distance from this list to the target.
    target: List of ints. This is the "correct" list which the prediction list
        is trying to match.
    base: Integer base.

  Returns:
    Float distance normalized so that when `pred` is at most as long as
    `target` the distance is between 0.0 and 1.0. Distance grows unboundedly
    large as `pred` grows past `target` in length.
  """
  if not target:
    length_normalizer = 1.0
    if not pred:
      return 0.0  # Both lists are empty; they match exactly.
  else:
    length_normalizer = float(len(target))

  # `mod_abs_diff` is at most base // 2, so `max_dist` upper-bounds the
  # element-wise distance, and `factor` normalizes each element's log distance
  # into [0, 1].
  max_dist = base // 2 + 1
  factor = log(max_dist + 1)

  d = 0.0
  for i, target_t in enumerate(target):
    if i >= len(pred):
      d += 1.0  # A missing element gets the maximum normalized distance.
    else:
      d += log(mod_abs_diff(pred[i], target_t, base) + 1) / factor
  if len(pred) > len(target):
    # Each extra element in `pred` costs the maximum normalized distance.
    d += (len(pred) - len(target))
  return d / length_normalizer
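
# Illustrative example (hypothetical values, not from the original source):
# with base=100, max_dist=51 and factor=log(52), so a single element that is
# only 9 away from the target already incurs most of the possible distance:
#   log_absolute_distance([1], [10], 100) = log(9 + 1) / log(52) ~= 0.58
# while an exact match gives 0.0 and an empty prediction gives 1.0.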
|

# Reward functions based on the distance functions above.

def absolute_distance_reward(pred, target, base, scalar_diff_fn=abs_diff):
  """Reward function based on absolute_distance function.

  Maximum reward, 1.0, is given when the lists are equal. Reward is scaled
  so that 0.0 reward is given when `pred` is the empty list (assuming `target`
  is not empty). Reward can go negative when `pred` is longer than `target`.

  This is an asymmetric reward function, so which list is the prediction and
  which is the target matters.

  Args:
    pred: Prediction sequence. This should be the sequence output by the
        generated code. List of ints n, where 0 <= n < base.
    target: Target sequence. The correct sequence that the generated code needs
        to output. List of ints n, where 0 <= n < base.
    base: Base of the computation.
    scalar_diff_fn: Element-wise distance function.

  Returns:
    Reward computed based on `pred` and `target`. A float.
  """
  unit_dist = float(base * len(target))
  if unit_dist == 0:
    unit_dist = base  # Target is empty; avoid dividing by zero.
  dist = absolute_distance(pred, target, base, scalar_diff_fn=scalar_diff_fn)
  return (unit_dist - dist) / unit_dist
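
# Illustrative example (hypothetical values, not from the original source):
#   absolute_distance_reward([1, 2, 3], [1, 2, 3], base=10)  ==> 1.0
#   absolute_distance_reward([],        [1, 2, 3], base=10)  ==> 0.0
#   absolute_distance_reward([1, 2],    [1, 2, 3], base=10)  ==> (30 - 10) / 30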
|
|
def absolute_mod_distance_reward(pred, target, base):
  """Same as `absolute_distance_reward` but `mod_abs_diff` scalar diff is used.

  Args:
    pred: Prediction sequence. This should be the sequence output by the
        generated code. List of ints n, where 0 <= n < base.
    target: Target sequence. The correct sequence that the generated code needs
        to output. List of ints n, where 0 <= n < base.
    base: Base of the computation.

  Returns:
    Reward computed based on `pred` and `target`. A float.
  """
  return absolute_distance_reward(pred, target, base, mod_abs_diff)
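
# Illustrative example (hypothetical values, not from the original source):
# values far apart on the number line but adjacent modulo the base are
# rewarded accordingly:
#   absolute_distance_reward([0], [255], base=256)      ==> 1/256 ~= 0.004
#   absolute_mod_distance_reward([0], [255], base=256)  ==> 255/256 ~= 0.996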
|
|
def absolute_log_distance_reward(pred, target, base):
  """Compute reward using `log_absolute_distance`.

  Maximum reward, 1.0, is given when the lists are equal. Reward is scaled
  so that 0.0 reward is given when `pred` is the empty list (assuming `target`
  is not empty). Reward can go negative when `pred` is longer than `target`.

  This is an asymmetric reward function, so which list is the prediction and
  which is the target matters.

  This reward function has the nice property that much more reward is given
  for getting the correct value (at each position) than for there being any
  value at all. For example, in base 100, let's say pred = [1] * 1000
  and target = [10] * 1000. A lot of reward would be given for being roughly
  80% accurate (worst element-wise distance is 50, distances here are 9) using
  `absolute_distance`. `log_absolute_distance` on the other hand will give
  greater and greater reward increments the closer each predicted value gets to
  the target. That makes the reward given for accuracy somewhat independent of
  the base.

  Args:
    pred: Prediction sequence. This should be the sequence output by the
        generated code. List of ints n, where 0 <= n < base.
    target: Target sequence. The correct sequence that the generated code needs
        to output. List of ints n, where 0 <= n < base.
    base: Base of the computation.

  Returns:
    Reward computed based on `pred` and `target`. A float.
  """
  return 1.0 - log_absolute_distance(pred, target, base)
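
# Illustrative comparison (hypothetical values, not from the original source),
# continuing the docstring's base-100 example with single-element lists:
#   absolute_mod_distance_reward([1], [10], base=100)  ==> (100 - 9) / 100 = 0.91
#   absolute_log_distance_reward([1], [10], base=100)  ==> 1 - log(10)/log(52) ~= 0.42
# The log-scaled reward leaves much more headroom for getting exactly the
# right value at each position.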
|

# Reward managers.

class RewardManager(object):
  """Reward managers administer reward across an episode.

  Reward managers are used for "editor" environments. These are environments
  where the agent has some way to edit its code over time, and run its code
  many times in the same episode, so that it can make incremental improvements.

  Reward managers are instantiated with a target sequence, which is the known
  correct program output. The manager is called on the output from a proposed
  program, and returns reward. If many proposal outputs are tried, reward may
  be some stateful function that takes previous tries into account. This is
  done, in part, so that an agent cannot accumulate unbounded reward just by
  trying junk programs as often as possible. So reward managers should not give
  the same reward twice if the next proposal is not better than the last.
  """
  __metaclass__ = ABCMeta  # Python 2-style abstract base class declaration.

  def __init__(self, target, base, distance_fn=absolute_distance):
    self._target = list(target)
    self._base = base
    self._distance_fn = distance_fn

  @abstractmethod
  def __call__(self, sequence):
    """Call this reward manager like a function to get reward.

    Calls to reward manager are stateful, and will take previous sequences
    into account. Repeated calls with the same sequence may produce different
    rewards.

    Args:
      sequence: List of integers (each between 0 and base - 1). This is the
          proposal sequence. Reward will be computed based on the distance
          from this sequence to the target (distance function and target are
          given in the constructor), as well as previous sequences tried during
          the lifetime of this object.

    Returns:
      Float value. The reward received from this call.
    """
    return 0.0
|
|
class DeltaRewardManager(RewardManager):
  """Simple reward manager that assigns reward for the net change in distance.

  Given some (possibly asymmetric) list distance function, gives reward for
  relative changes in prediction distance to the target.

  For example, if the distance on the first call is 3.0, the distance has grown
  by 3 from the initial value of 0, so a negative reward is given for this
  step. If the next call has distance 2.0, the distance dropped by 1, which is
  scaled into a positive reward. If the final call has distance 0 (the target
  is achieved), that gives another positive reward worth +2 before scaling. The
  total reward across all 3 calls is then 0 (-3 + 1 + 2, before scaling), which
  is the highest possible episode total.

  Reward is scaled so that the maximum element-wise distance is worth 1.0.
  Maximum total episode reward attainable is 0.
  """

  def __init__(self, target, base, distance_fn=absolute_distance):
    super(DeltaRewardManager, self).__init__(target, base, distance_fn)
    self._last_diff = 0

  def _diff(self, seq):
    return self._distance_fn(seq, self._target, self._base)

  def _delta_reward(self, seq):
    # Reward is the decrease in distance since the last call, scaled so that
    # one full element-wise distance (`base`) is worth 1.0.
    diff = self._diff(seq)
    reward = (self._last_diff - diff) / float(self._base)
    self._last_diff = diff
    return reward

  def __call__(self, seq):
    return self._delta_reward(seq)
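
# Illustrative usage (hypothetical values, not from the original source),
# matching the docstring's episode: distances go 3 -> 2 -> 0 with base=10.
#   manager = DeltaRewardManager([5], base=10)
#   manager([2])  ==> (0 - 3) / 10 = -0.3
#   manager([3])  ==> (3 - 2) / 10 =  0.1
#   manager([5])  ==> (2 - 0) / 10 =  0.2   (episode total: 0.0)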
|
|
class FloorRewardManager(RewardManager):
  """Assigns positive reward for each step taken closer to the target.

  Given some (possibly asymmetric) list distance function, gives reward
  whenever a new episode minimum distance is reached. No reward is given if
  the distance regresses to a higher value, so that the sum of rewards
  for the episode is positive.

  Reward is scaled so that the maximum element-wise distance is worth 1.0.
  Maximum total episode reward attainable is len(target).

  If the prediction sequence is longer than the target, a reward of -1 is
  given. Subsequent predictions which are also longer get 0 reward. The -1
  penalty will be canceled out with a +1 reward when a prediction is given
  which is at most the length of the target.
  """

  def __init__(self, target, base, distance_fn=absolute_distance):
    super(FloorRewardManager, self).__init__(target, base, distance_fn)
    self._last_diff = 0
    self._min_diff = self._max_diff()
    self._too_long_penalty_given = False

  def _max_diff(self):
    return self._distance_fn([], self._target, self._base)

  def _diff(self, seq):
    return self._distance_fn(seq, self._target, self._base)

  def _delta_reward(self, seq):
    # Positive reward is given only when a new episode-minimum distance is
    # reached, scaled so that one full element-wise distance (`base`) is
    # worth 1.0.
    diff = self._diff(seq)
    if diff < self._min_diff:
      reward = (self._min_diff - diff) / float(self._base)
      self._min_diff = diff
    else:
      reward = 0.0
    return reward

  def __call__(self, seq):
    if len(seq) > len(self._target):  # Overlong predictions are penalized once.
      if not self._too_long_penalty_given:
        self._too_long_penalty_given = True
        reward = -1.0
      else:
        reward = 0.0
      return reward

    reward = self._delta_reward(seq)
    if self._too_long_penalty_given:  # Undo the penalty once length recovers.
      reward += 1.0
      self._too_long_penalty_given = False
    return reward
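
# Illustrative usage (hypothetical values, not from the original source):
#   manager = FloorRewardManager([5], base=10)   # initial min distance is 10
#   manager([2])     ==> (10 - 3) / 10 = 0.7     # new minimum: 3
#   manager([9])     ==> 0.0                     # distance 4 is not a new minimum
#   manager([5, 5])  ==> -1.0                    # longer than the target
#   manager([5])     ==> (3 - 0) / 10 + 1.0 = 1.3  # new minimum plus penalty refund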
|