from __future__ import absolute_import from __future__ import division from __future__ import print_function """Reward functions, distance functions, and reward managers.""" from abc import ABCMeta from abc import abstractmethod from math import log # All sequences here are assumed to be lists of ints bounded # between 0 and `base`-1 (inclusive). ################################# ### Scalar Distance Functions ### ################################# def abs_diff(a, b, base=0): """Absolute value of difference between scalars. abs_diff is symmetric, i.e. `a` and `b` are interchangeable. Args: a: First argument. An int. b: Seconds argument. An int. base: Dummy argument so that the argument signature matches other scalar diff functions. abs_diff is the same in all bases. Returns: abs(a - b). """ del base # Unused. return abs(a - b) def mod_abs_diff(a, b, base): """Shortest distance between `a` and `b` in the modular integers base `base`. The smallest distance between a and b is returned. Example: mod_abs_diff(1, 99, 100) ==> 2. It is not 98. mod_abs_diff is symmetric, i.e. `a` and `b` are interchangeable. Args: a: First argument. An int. b: Seconds argument. An int. base: The modulo base. A positive int. Returns: Shortest distance. """ diff = abs(a - b) if diff >= base: diff %= base return min(diff, (-diff) + base) ############################### ### List Distance Functions ### ############################### def absolute_distance(pred, target, base, scalar_diff_fn=abs_diff): """Asymmetric list distance function. List distance is the sum of element-wise distances, like Hamming distance, but where `pred` can be longer or shorter than `target`. For each position in both `pred` and `target`, distance between those elements is computed with `scalar_diff_fn`. For missing or extra elements in `pred`, the maximum distance is assigned, which is equal to `base`. Distance is 0 when `pred` and `target` are identical, and will be a positive integer when they are not. Args: pred: Prediction list. Distance from this list is computed. target: Target list. Distance to this list is computed. base: The integer base to use. For example, a list of chars would use base 256. scalar_diff_fn: Element-wise distance function. Returns: List distance between `pred` and `target`. """ d = 0 for i, target_t in enumerate(target): if i >= len(pred): d += base # A missing slot is worth the max distance. else: # Add element-wise distance for this slot. d += scalar_diff_fn(pred[i], target_t, base) if len(pred) > len(target): # Each extra slot is worth the max distance. d += (len(pred) - len(target)) * base return d def log_absolute_distance(pred, target, base): """Asymmetric list distance function that uses log distance. A list distance which computes sum of element-wise distances, similar to `absolute_distance`. Unlike `absolute_distance`, this scales the resulting distance to be a float. Element-wise distance are log-scale. Distance between two list changes relatively less for elements that are far apart, but changes a lot (goes to 0 faster) when values get close together. Args: pred: List of ints. Computes distance from this list to the target. target: List of ints. This is the "correct" list which the prediction list is trying to match. base: Integer base. Returns: Float distance normalized so that when `pred` is at most as long as `target` the distance is between 0.0 and 1.0. Distance grows unboundedly large as `pred` grows past `target` in length. """ if not target: length_normalizer = 1.0 if not pred: # Distance between [] and [] is 0.0 since they are equal. return 0.0 else: length_normalizer = float(len(target)) # max_dist is the maximum element-wise distance, before taking log and # scaling. Since we use `mod_abs_diff`, it would be (base // 2), but we add # 1 to it so that missing or extra positions get the maximum penalty. max_dist = base // 2 + 1 # The log-distance will be scaled by a factor. # Note: +1 is added to the numerator and denominator to avoid log(0). This # only has a translational effect, i.e. log(dist + 1) / log(max_dist + 1). factor = log(max_dist + 1) d = 0.0 # Total distance to be computed. for i, target_t in enumerate(target): if i >= len(pred): # Assign the max element-wise distance for missing positions. This is 1.0 # after scaling. d += 1.0 else: # Add the log-dist divided by a scaling factor. d += log(mod_abs_diff(pred[i], target_t, base) + 1) / factor if len(pred) > len(target): # Add the max element-wise distance for each extra position. # Since max dist after scaling is 1, this is just the difference in list # lengths. d += (len(pred) - len(target)) return d / length_normalizer # Normalize again by the target length. ######################## ### Reward Functions ### ######################## # Reward functions assign reward based on program output. # Warning: only use these functions as the terminal rewards in episodes, i.e. # for the "final" programs. def absolute_distance_reward(pred, target, base, scalar_diff_fn=abs_diff): """Reward function based on absolute_distance function. Maximum reward, 1.0, is given when the lists are equal. Reward is scaled so that 0.0 reward is given when `pred` is the empty list (assuming `target` is not empty). Reward can go negative when `pred` is longer than `target`. This is an asymmetric reward function, so which list is the prediction and which is the target matters. Args: pred: Prediction sequence. This should be the sequence outputted by the generated code. List of ints n, where 0 <= n < base. target: Target sequence. The correct sequence that the generated code needs to output. List of ints n, where 0 <= n < base. base: Base of the computation. scalar_diff_fn: Element-wise distance function. Returns: Reward computed based on `pred` and `target`. A float. """ unit_dist = float(base * len(target)) if unit_dist == 0: unit_dist = base dist = absolute_distance(pred, target, base, scalar_diff_fn=scalar_diff_fn) return (unit_dist - dist) / unit_dist def absolute_mod_distance_reward(pred, target, base): """Same as `absolute_distance_reward` but `mod_abs_diff` scalar diff is used. Args: pred: Prediction sequence. This should be the sequence outputted by the generated code. List of ints n, where 0 <= n < base. target: Target sequence. The correct sequence that the generated code needs to output. List of ints n, where 0 <= n < base. base: Base of the computation. Returns: Reward computed based on `pred` and `target`. A float. """ return absolute_distance_reward(pred, target, base, mod_abs_diff) def absolute_log_distance_reward(pred, target, base): """Compute reward using `log_absolute_distance`. Maximum reward, 1.0, is given when the lists are equal. Reward is scaled so that 0.0 reward is given when `pred` is the empty list (assuming `target` is not empty). Reward can go negative when `pred` is longer than `target`. This is an asymmetric reward function, so which list is the prediction and which is the target matters. This reward function has the nice property that much more reward is given for getting the correct value (at each position) than for there being any value at all. For example, in base 100, lets say pred = [1] * 1000 and target = [10] * 1000. A lot of reward would be given for being 80% accurate (worst element-wise distance is 50, distances here are 9) using `absolute_distance`. `log_absolute_distance` on the other hand will give greater and greater reward increments the closer each predicted value gets to the target. That makes the reward given for accuracy somewhat independant of the base. Args: pred: Prediction sequence. This should be the sequence outputted by the generated code. List of ints n, where 0 <= n < base. target: Target sequence. The correct sequence that the generated code needs to output. List of ints n, where 0 <= n < base. base: Base of the computation. Returns: Reward computed based on `pred` and `target`. A float. """ return 1.0 - log_absolute_distance(pred, target, base) ####################### ### Reward Managers ### ####################### # Reward managers assign reward to many code attempts throughout an episode. class RewardManager(object): """Reward managers administer reward across an episode. Reward managers are used for "editor" environments. These are environments where the agent has some way to edit its code over time, and run its code many time in the same episode, so that it can make incremental improvements. Reward managers are instantiated with a target sequence, which is the known correct program output. The manager is called on the output from a proposed code, and returns reward. If many proposal outputs are tried, reward may be some stateful function that takes previous tries into account. This is done, in part, so that an agent cannot accumulate unbounded reward just by trying junk programs as often as possible. So reward managers should not give the same reward twice if the next proposal is not better than the last. """ __metaclass__ = ABCMeta def __init__(self, target, base, distance_fn=absolute_distance): self._target = list(target) self._base = base self._distance_fn = distance_fn @abstractmethod def __call__(self, sequence): """Call this reward manager like a function to get reward. Calls to reward manager are stateful, and will take previous sequences into account. Repeated calls with the same sequence may produce different rewards. Args: sequence: List of integers (each between 0 and base - 1). This is the proposal sequence. Reward will be computed based on the distance from this sequence to the target (distance function and target are given in the constructor), as well as previous sequences tried during the lifetime of this object. Returns: Float value. The reward received from this call. """ return 0.0 class DeltaRewardManager(RewardManager): """Simple reward manager that assigns reward for the net change in distance. Given some (possibly asymmetric) list distance function, gives reward for relative changes in prediction distance to the target. For example, if on the first call the distance is 3.0, the change in distance is -3 (from starting distance of 0). That relative change will be scaled to produce a negative reward for this step. On the next call, the distance is 2.0 which is a +1 change, and that will be scaled to give a positive reward. If the final call has distance 0 (the target is achieved), that is another positive change of +2. The total reward across all 3 calls is then 0, which is the highest posible episode total. Reward is scaled so that the maximum element-wise distance is worth 1.0. Maximum total episode reward attainable is 0. """ def __init__(self, target, base, distance_fn=absolute_distance): super(DeltaRewardManager, self).__init__(target, base, distance_fn) self._last_diff = 0 def _diff(self, seq): return self._distance_fn(seq, self._target, self._base) def _delta_reward(self, seq): # Reward is relative to previous sequence diff. # Reward is scaled so that maximum token difference is worth 1.0. # Reward = (last_diff - this_diff) / self.base. # Reward is positive if this sequence is closer to the target than the # previous sequence, and negative if this sequence is further away. diff = self._diff(seq) reward = (self._last_diff - diff) / float(self._base) self._last_diff = diff return reward def __call__(self, seq): return self._delta_reward(seq) class FloorRewardManager(RewardManager): """Assigns positive reward for each step taken closer to the target. Given some (possibly asymmetric) list distance function, gives reward for whenever a new episode minimum distance is reached. No reward is given if the distance regresses to a higher value, so that the sum of rewards for the episode is positive. Reward is scaled so that the maximum element-wise distance is worth 1.0. Maximum total episode reward attainable is len(target). If the prediction sequence is longer than the target, a reward of -1 is given. Subsequence predictions which are also longer get 0 reward. The -1 penalty will be canceled out with a +1 reward when a prediction is given which is at most the length of the target. """ def __init__(self, target, base, distance_fn=absolute_distance): super(FloorRewardManager, self).__init__(target, base, distance_fn) self._last_diff = 0 self._min_diff = self._max_diff() self._too_long_penality_given = False def _max_diff(self): return self._distance_fn([], self._target, self._base) def _diff(self, seq): return self._distance_fn(seq, self._target, self._base) def _delta_reward(self, seq): # Reward is only given if this sequence is closer to the target than any # previous sequence. # Reward is scaled so that maximum token difference is worth 1.0 # Reward = (min_diff - this_diff) / self.base # Reward is always positive. diff = self._diff(seq) if diff < self._min_diff: reward = (self._min_diff - diff) / float(self._base) self._min_diff = diff else: reward = 0.0 return reward def __call__(self, seq): if len(seq) > len(self._target): # Output is too long. if not self._too_long_penality_given: self._too_long_penality_given = True reward = -1.0 else: reward = 0.0 # Don't give this penalty more than once. return reward reward = self._delta_reward(seq) if self._too_long_penality_given: reward += 1.0 # Return the subtracted reward. self._too_long_penality_given = False return reward