"""Generates toy optimization problems. |
|
|
|
This module contains a base class, Problem, that defines a minimal interface |
|
for optimization problems, and a few specific problem types that subclass it. |
|
|
|
Test functions for optimization: http://www.sfu.ca/~ssurjano/optimization.html |
|
""" |
|
|
|
from __future__ import absolute_import |
|
from __future__ import division |
|
from __future__ import print_function |
|
|
|
import numpy as np |
|
import tensorflow as tf |
|
|
|
from learned_optimizer.problems import problem_spec as prob_spec |
|
|
|
tf.app.flags.DEFINE_float("l2_reg_scale", 1e-3, |
|
"""Scaling factor for parameter value regularization |
|
in softmax classifier problems.""") |
|
FLAGS = tf.app.flags.FLAGS |
|
|
|
EPSILON = 1e-6 |
|
MAX_SEED = 4294967295 |
|
PARAMETER_SCOPE = "parameters" |
|
|
|
_Spec = prob_spec.Spec |
|
|
|
|
|
class Problem(object): |
|
"""Base class for optimization problems. |
|
|
|
  This defines an interface for optimization problems, including objective and
  gradients functions; data-dependent problems receive their batches through
  the objective method's data and labels arguments.
|
|
|
Subclasses of Problem must (at the minimum) override the objective method, |
|
which computes the objective/loss/cost to minimize, and specify the desired |
|
shape of the parameters in a list in the param_shapes attribute. |
|
""" |
|
|
|
def __init__(self, param_shapes, random_seed, noise_stdev, init_fn=None): |
|
"""Initializes a global random seed for the problem. |
|
|
|
Args: |
|
param_shapes: A list of tuples defining the expected shapes of the |
|
parameters for this problem |
|
      random_seed: Either an integer or None, in which case the seed is
        drawn at random
|
noise_stdev: Strength (standard deviation) of added gradient noise |
|
init_fn: A function taking a tf.Session object that is used to |
|
initialize the problem's variables. |
|
|
|
Raises: |
|
ValueError: If the random_seed is not an integer and not None |
|
""" |
|
if random_seed is not None and not isinstance(random_seed, int): |
|
raise ValueError("random_seed must be an integer or None") |
|
|
|
|
|
    # Pick a random seed if none was given.
    self.random_seed = (np.random.randint(MAX_SEED) if random_seed is None
                        else random_seed)

    # Strength of the gradient noise added in gradients().
    self.noise_stdev = noise_stdev

    # Seed numpy so that the problem's random data is reproducible.
    np.random.seed(self.random_seed)

    self.param_shapes = param_shapes
|
|
|
if init_fn is not None: |
|
self.init_fn = init_fn |
|
else: |
|
self.init_fn = lambda _: None |
|
|
|
def init_tensors(self, seed=None): |
|
"""Returns a list of tensors with the given shape.""" |
|
return [tf.random_normal(shape, seed=seed) for shape in self.param_shapes] |
|
|
|
def init_variables(self, seed=None): |
|
"""Returns a list of variables with the given shape.""" |
|
with tf.variable_scope(PARAMETER_SCOPE): |
|
params = [tf.Variable(param) for param in self.init_tensors(seed)] |
|
return params |
|
|
|
def objective(self, parameters, data=None, labels=None): |
|
"""Computes the objective given a list of parameters. |
|
|
|
Args: |
|
parameters: The parameters to optimize (as a list of tensors) |
|
data: An optional batch of data for calculating objectives |
|
labels: An optional batch of corresponding labels |
|
|
|
Returns: |
|
A scalar tensor representing the objective value |
|
""" |
|
raise NotImplementedError |
|
|
|
def gradients(self, objective, parameters): |
|
"""Compute gradients of the objective with respect to the parameters. |
|
|
|
Args: |
|
objective: The objective op (e.g. output of self.objective()) |
|
parameters: A list of tensors (the parameters to optimize) |
|
|
|
Returns: |
|
A list of tensors representing the gradient for each parameter, |
|
returned in the same order as the given list |
|
""" |
|
grads = tf.gradients(objective, list(parameters)) |
|
noisy_grads = [] |
|
|
|
for grad in grads: |
|
if isinstance(grad, tf.IndexedSlices): |
|
noise = self.noise_stdev * tf.random_normal(tf.shape(grad.values)) |
|
new_grad = tf.IndexedSlices(grad.values + noise, grad.indices) |
|
else: |
|
new_grad = grad + self.noise_stdev * tf.random_normal(grad.get_shape()) |
|
noisy_grads.append(new_grad) |
|
|
|
return noisy_grads |
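

# A minimal usage sketch (illustrative; not part of the original interface):
# plain gradient descent on a problem whose objective needs no data or
# labels, e.g. the Quadratic problem defined below. The step size and step
# count are arbitrary example choices, and dense gradients are assumed
# (no tf.IndexedSlices).
def _example_gradient_descent(problem, num_steps=100, step_size=0.1):
  """Runs vanilla gradient descent on `problem`; returns the final loss."""
  with tf.Graph().as_default():
    params = problem.init_variables()
    obj = problem.objective(params)
    grads = problem.gradients(obj, params)
    updates = [param.assign_sub(step_size * grad)
               for param, grad in zip(params, grads)]
    with tf.Session() as sess:
      sess.run(tf.global_variables_initializer())
      problem.init_fn(sess)
      for _ in range(num_steps):
        sess.run(updates)
      return sess.run(obj)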
|
|
|
|
|
class Quadratic(Problem): |
|
"""Optimizes a random quadratic function. |
|
|
|
The objective is: f(x) = (1/2) ||Wx - y||_2^2 |
|
where W is a random Gaussian matrix and y is a random Gaussian vector. |
|
""" |
|
|
|
def __init__(self, ndim, random_seed=None, noise_stdev=0.0): |
|
"""Initializes a random quadratic problem.""" |
|
param_shapes = [(ndim, 1)] |
|
super(Quadratic, self).__init__(param_shapes, random_seed, noise_stdev) |
|
|
|
|
|
    # Fixed problem data: a random Gaussian matrix and target vector.
    self.w = np.random.randn(ndim, ndim).astype("float32")
    self.y = np.random.randn(ndim, 1).astype("float32")
|
|
|
def objective(self, params, data=None, labels=None): |
|
"""Quadratic objective (see base class for details).""" |
|
return tf.nn.l2_loss(tf.matmul(self.w, params[0]) - self.y) |
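

# Sanity check (illustrative sketch): since f(x) = (1/2) ||Wx - y||_2^2 with a
# square and (almost surely) invertible W, the unique minimizer is
# x* = W^{-1} y, where the objective is exactly zero.
def _example_quadratic_minimizer(problem):
  """Returns the closed-form minimizer of a Quadratic problem."""
  return np.linalg.solve(problem.w, problem.y)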
|
|
|
|
|
class SoftmaxClassifier(Problem): |
|
"""Helper functions for supervised softmax classification problems.""" |
|
|
|
def init_tensors(self, seed=None): |
|
"""Returns a list of tensors with the given shape.""" |
|
return [tf.random_normal(shape, seed=seed) * 1.2 / np.sqrt(shape[0]) |
|
for shape in self.param_shapes] |
|
|
|
def inference(self, params, data): |
|
"""Computes logits given parameters and data. |
|
|
|
Args: |
|
params: List of parameter tensors or variables |
|
data: Batch of features with samples along the first dimension |
|
|
|
Returns: |
|
logits: Un-normalized logits with shape (num_samples, num_classes) |
|
""" |
|
raise NotImplementedError |
|
|
|
def objective(self, params, data, labels): |
|
"""Computes the softmax cross entropy. |
|
|
|
Args: |
|
params: List of parameter tensors or variables |
|
data: Batch of features with samples along the first dimension |
|
labels: Vector of labels with the same number of samples as the data |
|
|
|
Returns: |
|
      loss: Cross-entropy loss averaged over the samples in the batch, plus
        the scaled L2 regularization term
|
|
|
Raises: |
|
      ValueError: If the objective is to be computed over more than two
        classes; only the binary (sigmoid) case is implemented here.
|
""" |
|
|
|
logits = self.inference(params, data) |
|
|
|
|
|
    # L2 regularization on all parameters, scaled by FLAGS.l2_reg_scale.
    l2reg = [tf.reduce_sum(param ** 2) for param in params]
|
if int(logits.get_shape()[1]) == 2: |
|
labels = tf.cast(labels, tf.float32) |
|
losses = tf.nn.sigmoid_cross_entropy_with_logits( |
|
labels=labels, logits=logits[:, 0]) |
|
else: |
|
raise ValueError("Unable to compute softmax cross entropy for more than" |
|
" 2 classes.") |
|
|
|
return tf.reduce_mean(losses) + tf.reduce_mean(l2reg) * FLAGS.l2_reg_scale |
|
|
|
def argmax(self, logits): |
|
"""Samples the most likely class label given the logits. |
|
|
|
Args: |
|
logits: Un-normalized logits with shape (num_samples, num_classes) |
|
|
|
Returns: |
|
predictions: Predicted class labels, has shape (num_samples,) |
|
""" |
|
return tf.cast(tf.argmax(tf.nn.softmax(logits), 1), tf.int32) |
|
|
|
def accuracy(self, params, data, labels): |
|
"""Computes the accuracy (fraction of correct classifications). |
|
|
|
Args: |
|
params: List of parameter tensors or variables |
|
data: Batch of features with samples along the first dimension |
|
labels: Vector of labels with the same number of samples as the data |
|
|
|
Returns: |
|
accuracy: Fraction of correct classifications across the batch |
|
""" |
|
predictions = self.argmax(self.inference(params, data)) |
|
return tf.contrib.metrics.accuracy(predictions, tf.cast(labels, tf.int32)) |
|
|
|
|
|
class SoftmaxRegression(SoftmaxClassifier): |
|
"""Builds a softmax regression problem.""" |
|
|
|
def __init__(self, n_features, n_classes, activation=tf.identity, |
|
random_seed=None, noise_stdev=0.0): |
|
self.activation = activation |
|
self.n_features = n_features |
|
param_shapes = [(n_features, n_classes), (n_classes,)] |
|
super(SoftmaxRegression, self).__init__(param_shapes, |
|
random_seed, |
|
noise_stdev) |
|
|
|
def inference(self, params, data): |
|
features = tf.reshape(data, (-1, self.n_features)) |
|
return tf.matmul(features, params[0]) + params[1] |
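

# Illustrative sketch: evaluating the regularized loss of a two-class
# SoftmaxRegression on a random batch. The feature, class, and batch sizes are
# arbitrary example choices; objective() only supports the two-class case, and
# reading FLAGS.l2_reg_scale assumes the module's flags have been parsed
# (e.g. via tf.app.run).
def _example_softmax_regression_loss():
  """Builds a tiny two-class problem and returns its loss on random data."""
  problem = SoftmaxRegression(n_features=4, n_classes=2)
  data = np.random.randn(8, 4).astype("float32")
  labels = np.random.randint(0, 2, size=(8,))
  with tf.Graph().as_default(), tf.Session() as sess:
    params = problem.init_variables()
    loss = problem.objective(params, tf.constant(data), tf.constant(labels))
    sess.run(tf.global_variables_initializer())
    return sess.run(loss)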
|
|
|
|
|
class SparseSoftmaxRegression(SoftmaxClassifier): |
|
"""Builds a sparse input softmax regression problem.""" |
|
|
|
def __init__(self, |
|
n_features, |
|
n_classes, |
|
activation=tf.identity, |
|
random_seed=None, |
|
noise_stdev=0.0): |
|
self.activation = activation |
|
self.n_features = n_features |
|
    param_shapes = [(n_classes, n_features), (n_features, n_classes),
                    (n_classes,)]
|
super(SparseSoftmaxRegression, self).__init__(param_shapes, random_seed, |
|
noise_stdev) |
|
|
|
def inference(self, params, data): |
|
    all_embeddings, softmax_weights, softmax_bias = params
    # `data` holds integer ids with shape (batch, num_ids); summing the
    # looked-up embeddings over the id axis gives a bag-of-ids representation.
    embeddings = tf.nn.embedding_lookup(all_embeddings, tf.cast(data, tf.int32))
    embeddings = tf.reduce_sum(embeddings, 1)
|
return tf.matmul(embeddings, softmax_weights) + softmax_bias |
|
|
|
|
|
class OneHotSparseSoftmaxRegression(SoftmaxClassifier): |
|
"""Builds a sparse input softmax regression problem. |
|
|
|
This is identical to SparseSoftmaxRegression, but without using embedding |
|
ops. |
|
""" |
|
|
|
def __init__(self, |
|
n_features, |
|
n_classes, |
|
activation=tf.identity, |
|
random_seed=None, |
|
noise_stdev=0.0): |
|
self.activation = activation |
|
self.n_features = n_features |
|
self.n_classes = n_classes |
|
    param_shapes = [(n_classes, n_features), (n_features, n_classes),
                    (n_classes,)]
|
super(OneHotSparseSoftmaxRegression, self).__init__(param_shapes, |
|
random_seed, |
|
noise_stdev) |
|
|
|
def inference(self, params, data): |
|
all_embeddings, softmax_weights, softmax_bias = params |
|
    # Build the same bag-of-ids representation as SparseSoftmaxRegression,
    # but with a dense one-hot matmul instead of tf.nn.embedding_lookup.
    num_ids = tf.shape(data)[1]
|
one_hot_embeddings = tf.one_hot(tf.cast(data, tf.int32), self.n_classes) |
|
one_hot_embeddings = tf.reshape(one_hot_embeddings, [-1, self.n_classes]) |
|
embeddings = tf.matmul(one_hot_embeddings, all_embeddings) |
|
embeddings = tf.reshape(embeddings, [-1, num_ids, self.n_features]) |
|
embeddings = tf.reduce_sum(embeddings, 1) |
|
return tf.matmul(embeddings, softmax_weights) + softmax_bias |
|
|
|
|
|
class FullyConnected(SoftmaxClassifier): |
|
"""Builds a multi-layer perceptron classifier.""" |
|
|
|
def __init__(self, n_features, n_classes, hidden_sizes=(32, 64), |
|
activation=tf.nn.sigmoid, random_seed=None, noise_stdev=0.0): |
|
"""Initializes an multi-layer perceptron classification problem.""" |
|
|
|
self.n_features = n_features |
|
self.activation = activation |
|
|
|
|
|
param_shapes = [] |
|
for ix, sz in enumerate(hidden_sizes + (n_classes,)): |
|
|
|
|
|
prev_size = n_features if ix == 0 else hidden_sizes[ix - 1] |
|
|
|
|
|
param_shapes.append((prev_size, sz)) |
|
|
|
|
|
param_shapes.append((sz,)) |
|
|
|
super(FullyConnected, self).__init__(param_shapes, random_seed, noise_stdev) |
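
  # For example (illustrative): n_features=10, hidden_sizes=(32, 64) and
  # n_classes=3 yield param_shapes
  # [(10, 32), (32,), (32, 64), (64,), (64, 3), (3,)],
  # i.e. one (weights, biases) pair per layer, in layer order.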
|
|
|
def inference(self, params, data): |
|
|
|
features = tf.reshape(data, (-1, self.n_features)) |
|
|
|
|
|
preactivations = tf.matmul(features, params[0]) + params[1] |
|
|
|
    # Alternate activation and affine layers over the remaining (W, b) pairs.
    for layer in range(2, len(self.param_shapes), 2):
|
net = self.activation(preactivations) |
|
preactivations = tf.matmul(net, params[layer]) + params[layer + 1] |
|
|
|
return preactivations |
|
|
|
def accuracy(self, params, data, labels): |
|
"""Computes the accuracy (fraction of correct classifications). |
|
|
|
Args: |
|
params: List of parameter tensors or variables |
|
data: Batch of features with samples along the first dimension |
|
labels: Vector of labels with the same number of samples as the data |
|
|
|
Returns: |
|
accuracy: Fraction of correct classifications across the batch |
|
""" |
|
predictions = self.argmax(self.activation(self.inference(params, data))) |
|
return tf.contrib.metrics.accuracy(predictions, tf.cast(labels, tf.int32)) |
|
|
|
|
|
class ConvNet(SoftmaxClassifier): |
|
"""Builds an N-layer convnet for image classification.""" |
|
|
|
def __init__(self, |
|
image_shape, |
|
n_classes, |
|
filter_list, |
|
activation=tf.nn.relu, |
|
random_seed=None, |
|
noise_stdev=0.0): |
|
|
|
    # Number of channels, and number of pixels in the x and y dimensions.
    n_channels, px, py = image_shape
|
|
|
|
|
self.activation = activation |
|
|
|
param_shapes = [] |
|
input_size = n_channels |
|
for fltr in filter_list: |
|
|
|
param_shapes.append((fltr[0], fltr[1], input_size, fltr[2])) |
|
input_size = fltr[2] |
|
|
|
|
|
self.affine_size = input_size * px * py |
|
|
|
param_shapes.append((self.affine_size, n_classes)) |
|
param_shapes.append((n_classes,)) |
|
|
|
super(ConvNet, self).__init__(param_shapes, random_seed, noise_stdev) |
|
|
|
def init_tensors(self, seed=None): |
|
"""Returns a list of tensors with the given shape.""" |
|
return [tf.random_normal(shape, mean=0., stddev=0.01, seed=seed) |
|
for shape in self.param_shapes] |
|
|
|
def inference(self, params, data): |
|
|
|
|
|
    # All but the last two parameters are conv filters; the final pair is the
    # output affine layer.
    w_conv_list = params[:-2]
    output_w, output_b = params[-2:]
|
|
|
conv_input = data |
|
for w_conv in w_conv_list: |
|
layer = tf.nn.conv2d(conv_input, w_conv, strides=[1] * 4, padding="SAME") |
|
output = self.activation(layer) |
|
conv_input = output |
|
|
|
|
|
flattened = tf.reshape(conv_input, (-1, self.affine_size)) |
|
|
|
|
|
return tf.matmul(flattened, output_w) + output_b |
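
  # Illustrative shape walk-through (example values): with image_shape=(3, 8, 8)
  # and filter_list=[(3, 3, 16)], the single conv layer maps 3 input channels to
  # 16 output channels; stride 1 with "SAME" padding preserves the 8x8 spatial
  # size (NHWC input assumed), so affine_size = 16 * 8 * 8.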
|
|
|
|
|
class Bowl(Problem): |
|
"""A 2D quadratic bowl.""" |
|
|
|
def __init__(self, condition_number, angle=0.0, |
|
random_seed=None, noise_stdev=0.0): |
|
assert condition_number > 0, "Condition number must be positive." |
|
|
|
|
|
param_shapes = [(2, 1)] |
|
super(Bowl, self).__init__(param_shapes, random_seed, noise_stdev) |
|
|
|
self.condition_number = condition_number |
|
self.angle = angle |
|
self._build_matrix(condition_number, angle) |
|
|
|
def _build_matrix(self, condition_number, angle): |
|
"""Builds the Hessian matrix.""" |
|
hessian = np.array([[condition_number, 0.], [0., 1.]], dtype="float32") |
|
|
|
|
|
rotation_matrix = np.array([ |
|
[np.cos(angle), -np.sin(angle)], |
|
[np.sin(angle), np.cos(angle)] |
|
]) |
|
|
|
|
|
|
|
    # The Hessian above is diagonal, so its elementwise square root is a true
    # matrix square root; the objective 0.5 * ||matrix.dot(x)||^2 then has
    # Hessian rotation_matrix.T.dot(hessian).dot(rotation_matrix).
    self.matrix = np.sqrt(hessian).dot(rotation_matrix)
|
|
|
def objective(self, params, data=None, labels=None): |
|
mtx = tf.constant(self.matrix, dtype=tf.float32) |
|
return tf.nn.l2_loss(tf.matmul(mtx, params[0])) |
|
|
|
def surface(self, xlim=5, ylim=5, n=50): |
|
xm, ym = _mesh(xlim, ylim, n) |
|
pts = np.vstack([xm.ravel(), ym.ravel()]) |
|
zm = 0.5 * np.linalg.norm(self.matrix.dot(pts), axis=0) ** 2 |
|
return xm, ym, zm.reshape(n, n) |
|
|
|
|
|
class Problem2D(Problem):
  """Base class for 2D toy problems whose loss surface can be visualized."""
|
|
|
def __init__(self, random_seed=None, noise_stdev=0.0): |
|
param_shapes = [(2,)] |
|
super(Problem2D, self).__init__(param_shapes, random_seed, noise_stdev) |
|
|
|
def surface(self, n=50, xlim=5, ylim=5): |
|
"""Computes the objective surface over a 2d mesh.""" |
|
|
|
|
|
xm, ym = _mesh(xlim, ylim, n) |
|
|
|
with tf.Graph().as_default(), tf.Session() as sess: |
|
|
|
|
|
x = tf.placeholder(tf.float32, shape=xm.shape) |
|
y = tf.placeholder(tf.float32, shape=ym.shape) |
|
      # Stack the coordinates into the single (2, n, n) parameter tensor that
      # the 2D objectives expect.
      obj = self.objective([tf.stack([x, y])])
|
|
|
|
|
zm = sess.run(obj, feed_dict={x: xm, y: ym}) |
|
|
|
return xm, ym, zm |
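

# Illustrative sketch (assumes matplotlib is installed, which this module does
# not otherwise require): visualizing the loss surface of any Problem2D
# subclass as a filled contour plot.
def _example_plot_surface(problem, n=50):
  """Plots the objective surface of a Problem2D instance."""
  import matplotlib.pyplot as plt
  xm, ym, zm = problem.surface(n=n)
  plt.contourf(xm, ym, zm, 50)
  plt.colorbar()
  plt.show()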
|
|
|
|
|
class Rosenbrock(Problem2D): |
|
"""See https://en.wikipedia.org/wiki/Rosenbrock_function. |
|
|
|
  This function has a single global minimum at (1, 1), where the objective
  value is zero.
|
""" |
|
|
|
def init_tensors(self, seed=None): |
|
"""Returns a list of tensors with the given shape.""" |
|
return [tf.random_uniform(shape, minval=-5., maxval=10., seed=seed) |
|
for shape in self.param_shapes] |
|
|
|
def objective(self, params, data=None, labels=None): |
|
x, y = tf.split(params[0], 2, axis=0) |
|
obj = (1 - x)**2 + 100 * (y - x**2)**2 |
|
return tf.squeeze(obj) |
|
|
|
|
|
def make_rosenbrock_loss_and_init(device=None): |
|
"""A variable-backed version of Rosenbrock problem. |
|
|
|
See the Rosenbrock class for details. |
|
|
|
Args: |
|
device: Where to place the ops of this problem. |
|
|
|
Returns: |
|
A tuple of two callables, first of which creates the loss and the second |
|
creates the parameter initializer function. |
|
""" |
|
def make_rosenbrock_loss(): |
|
with tf.name_scope("optimizee"): |
|
with tf.device(device): |
|
x = tf.get_variable("x", [1]) |
|
y = tf.get_variable("y", [1]) |
|
c = tf.get_variable( |
|
"c", [1], |
|
initializer=tf.constant_initializer(100.0), |
|
trainable=False) |
|
obj = (1 - x)**2 + c * (y - x**2)**2 |
|
return tf.squeeze(obj) |
|
|
|
def make_init_fn(parameters): |
|
with tf.device(device): |
|
init_op = tf.variables_initializer(parameters) |
|
def init_fn(sess): |
|
tf.logging.info("Initializing model parameters.") |
|
sess.run(init_op) |
|
return init_fn |
|
|
|
return make_rosenbrock_loss, make_init_fn |
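

# Illustrative usage of the two callables returned above (a sketch; assumes a
# TF 1.x version in which tf.global_variables accepts a scope argument):
#   make_loss, make_init_fn = make_rosenbrock_loss_and_init()
#   loss = make_loss()
#   init_fn = make_init_fn(tf.global_variables("optimizee"))
#   with tf.Session() as sess:
#     init_fn(sess)
#     print(sess.run(loss))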
|
|
|
|
|
class Saddle(Problem2D): |
|
"""Loss surface around a saddle point.""" |
|
|
|
def objective(self, params, data=None, labels=None): |
|
x, y = tf.split(params[0], 2, axis=0) |
|
obj = x ** 2 - y ** 2 |
|
return tf.squeeze(obj) |
|
|
|
|
|
class LogSumExp(Problem2D): |
|
"""2D function defined by the log of the sum of exponentials.""" |
|
|
|
def objective(self, params, data=None, labels=None): |
|
x, y = tf.split(params[0], 2, axis=0) |
|
obj = tf.log(tf.exp(x + 3. * y - 0.1) + |
|
tf.exp(x - 3. * y - 0.1) + |
|
tf.exp(-x - 0.1) + 1.0) |
|
return tf.squeeze(obj) |
|
|
|
|
|
class Ackley(Problem2D): |
|
"""Ackley's function (contains many local minima).""" |
|
|
|
def init_tensors(self, seed=None): |
|
"""Returns a list of tensors with the given shape.""" |
|
return [tf.random_uniform(shape, minval=-32.768, maxval=32.768, seed=seed) |
|
for shape in self.param_shapes] |
|
|
|
def objective(self, params, data=None, labels=None): |
|
x, y = tf.split(params[0], 2, axis=0) |
|
obj = (-20 * tf.exp(-0.2 * tf.sqrt(0.5 * (x ** 2 + y ** 2))) - |
|
tf.exp(0.5 * (tf.cos(2 * np.pi * x) + tf.cos(2 * np.pi * y))) + |
|
tf.exp(1.0) + 20.) |
|
return tf.squeeze(obj) |
|
|
|
|
|
class Beale(Problem2D): |
|
"""Beale function (a multimodal function with sharp peaks).""" |
|
|
|
def init_tensors(self, seed=None): |
|
"""Returns a list of tensors with the given shape.""" |
|
return [tf.random_uniform(shape, minval=-4.5, maxval=4.5, seed=seed) |
|
for shape in self.param_shapes] |
|
|
|
def objective(self, params, data=None, labels=None): |
|
x, y = tf.split(params[0], 2, axis=0) |
|
obj = ((1.5 - x + x * y) ** 2 + |
|
(2.25 - x + x * y ** 2) ** 2 + |
|
(2.625 - x + x * y ** 3) ** 2) |
|
return tf.squeeze(obj) |
|
|
|
|
|
class Booth(Problem2D): |
|
"""Booth's function (has a long valley along one dimension).""" |
|
|
|
def init_tensors(self, seed=None): |
|
"""Returns a list of tensors with the given shape.""" |
|
return [tf.random_uniform(shape, minval=-10., maxval=10., seed=seed) |
|
for shape in self.param_shapes] |
|
|
|
def objective(self, params, data=None, labels=None): |
|
x, y = tf.split(params[0], 2, axis=0) |
|
obj = (x + 2 * y - 7) ** 2 + (2 * x + y - 5) ** 2 |
|
return tf.squeeze(obj) |
|
|
|
|
|
class StyblinskiTang(Problem2D): |
|
"""Styblinski-Tang function (a bumpy function in two dimensions).""" |
|
|
|
def init_tensors(self, seed=None): |
|
"""Returns a list of tensors with the given shape.""" |
|
return [tf.random_uniform(shape, minval=-5., maxval=5., seed=seed) |
|
for shape in self.param_shapes] |
|
|
|
def objective(self, params, data=None, labels=None): |
|
params = tf.split(params[0], 2, axis=0) |
|
obj = 0.5 * tf.reduce_sum([x ** 4 - 16 * x ** 2 + 5 * x |
|
for x in params], 0) + 80. |
|
return tf.squeeze(obj) |
|
|
|
|
|
class Matyas(Problem2D): |
|
"""Matyas function (a function with a single global minimum in a valley).""" |
|
|
|
def init_tensors(self, seed=None): |
|
"""Returns a list of tensors with the given shape.""" |
|
return [tf.random_uniform(shape, minval=-10, maxval=10, seed=seed) |
|
for shape in self.param_shapes] |
|
|
|
def objective(self, params, data=None, labels=None): |
|
x, y = tf.split(params[0], 2, axis=0) |
|
obj = 0.26 * (x ** 2 + y ** 2) - 0.48 * x * y |
|
return tf.squeeze(obj) |
|
|
|
|
|
class Branin(Problem2D): |
|
"""Branin function (a function with three global minima).""" |
|
|
|
def init_tensors(self, seed=None): |
|
"""Returns a list of tensors with the given shape.""" |
|
x1 = tf.random_uniform((1,), minval=-5., maxval=10., |
|
seed=seed) |
|
x2 = tf.random_uniform((1,), minval=0., maxval=15., |
|
seed=seed) |
|
return [tf.concat([x1, x2], 0)] |
|
|
|
def objective(self, params, data=None, labels=None): |
|
x, y = tf.split(params[0], 2, axis=0) |
|
|
|
|
|
a = 1. |
|
b = 5.1 / (4. * np.pi ** 2) |
|
c = 5 / np.pi |
|
r = 6. |
|
s = 10. |
|
t = 1 / (8. * np.pi) |
|
|
|
|
|
obj = a * (y - b * x ** 2 + c * x - r) ** 2 + s * (1 - t) * tf.cos(x) + s |
|
return tf.squeeze(obj) |
|
|
|
|
|
class Michalewicz(Problem2D): |
|
"""Michalewicz function (has steep ridges and valleys).""" |
|
|
|
def init_tensors(self, seed=None): |
|
"""Returns a list of tensors with the given shape.""" |
|
return [tf.random_uniform(shape, minval=0., maxval=np.pi, seed=seed) |
|
for shape in self.param_shapes] |
|
|
|
def objective(self, params, data=None, labels=None): |
|
x, y = tf.split(params[0], 2, axis=0) |
|
m = 5 |
|
obj = 2. - (tf.sin(x) * tf.sin(x ** 2 / np.pi) ** (2 * m) + |
|
tf.sin(y) * tf.sin(2 * y ** 2 / np.pi) ** (2 * m)) |
|
return tf.squeeze(obj) |
|
|
|
|
|
class Rescale(Problem): |
|
"""Takes an existing problem, and rescales all the parameters.""" |
|
|
|
def __init__(self, problem_spec, scale=10., noise_stdev=0.0): |
|
self.problem = problem_spec.build() |
|
self.param_shapes = self.problem.param_shapes |
|
self.scale = scale |
|
|
|
super(Rescale, self).__init__(self.param_shapes, random_seed=None, |
|
noise_stdev=noise_stdev) |
|
|
|
def init_tensors(self, seed=None): |
|
params_raw = self.problem.init_tensors(seed=seed) |
|
params = [t * self.scale for t in params_raw] |
|
return params |
|
|
|
def objective(self, params, data=None, labels=None): |
|
    params_raw = [t / self.scale for t in params]
|
|
|
problem_obj = self.problem.objective(params_raw, data, labels) |
|
return problem_obj |
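
  # For example (illustrative): Rescale(prob_spec.Spec(Quadratic, (3,), {}),
  # scale=10.) presents the optimizer with parameters ten times larger than the
  # underlying Quadratic's, while the loss is always computed on the unscaled
  # values.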
|
|
|
|
|
class SumTask(Problem): |
|
"""Takes a list of problems and modifies the objective to be their sum.""" |
|
|
|
def __init__(self, problem_specs, noise_stdev=0.0): |
|
self.problems = [ps.build() for ps in problem_specs] |
|
self.param_shapes = [] |
|
for prob in self.problems: |
|
self.param_shapes += prob.param_shapes |
|
|
|
super(SumTask, self).__init__(self.param_shapes, random_seed=None, |
|
noise_stdev=noise_stdev) |
|
|
|
def init_tensors(self, seed=None): |
|
tensors = [] |
|
for prob in self.problems: |
|
tensors += prob.init_tensors(seed=seed) |
|
return tensors |
|
|
|
def objective(self, params, data=None, labels=None): |
|
obj = 0. |
|
index = 0 |
|
for prob in self.problems: |
|
num_params = len(prob.param_shapes) |
|
obj += prob.objective(params[index:index + num_params]) |
|
index += num_params |
|
return obj |
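
  # For example (illustrative): SumTask([prob_spec.Spec(Quadratic, (3,), {})] * 2)
  # has param_shapes [(3, 1), (3, 1)]; objective() slices the flat parameter
  # list back into one group per sub-problem and sums the resulting losses.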
|
|
|
|
|
class IsotropicQuadratic(Problem): |
|
"""An isotropic quadratic problem.""" |
|
|
|
def objective(self, params, data=None, labels=None): |
|
return sum([tf.reduce_sum(param ** 2) for param in params]) |
|
|
|
|
|
class Norm(Problem): |
|
"""Takes an existing problem and modifies the objective to be its N-norm.""" |
|
|
|
def __init__(self, ndim, random_seed=None, noise_stdev=0.0, norm_power=2.): |
|
param_shapes = [(ndim, 1)] |
|
super(Norm, self).__init__(param_shapes, random_seed, noise_stdev) |
|
|
|
|
|
self.w = np.random.randn(ndim, ndim).astype("float32") |
|
self.y = np.random.randn(ndim, 1).astype("float32") |
|
self.norm_power = norm_power |
|
|
|
def objective(self, params, data=None, labels=None): |
|
diff = tf.matmul(self.w, params[0]) - self.y |
|
exp = 1. / self.norm_power |
|
loss = tf.reduce_sum((tf.abs(diff) + EPSILON) ** self.norm_power) ** exp |
|
return loss |
|
|
|
|
|
class LogObjective(Problem): |
|
"""Takes an existing problem and modifies the objective to be its log.""" |
|
|
|
def __init__(self, problem_spec): |
|
self.problem = problem_spec.build() |
|
self.param_shapes = self.problem.param_shapes |
|
|
|
super(LogObjective, self).__init__(self.param_shapes, |
|
random_seed=None, |
|
noise_stdev=0.0) |
|
|
|
def objective(self, params, data=None, labels=None): |
|
problem_obj = self.problem.objective(params, data, labels) |
|
    # log(obj + EPSILON) - log(EPSILON) is nonnegative for obj >= 0 and is
    # exactly zero when obj == 0.
    return tf.log(problem_obj + EPSILON) - tf.log(EPSILON)
|
|
|
|
|
class SparseProblem(Problem): |
|
"""Takes a problem and sets gradients to 0 with the given probability.""" |
|
|
|
def __init__(self, |
|
problem_spec, |
|
zero_probability=0.99, |
|
random_seed=None, |
|
noise_stdev=0.0): |
|
self.problem = problem_spec.build() |
|
self.param_shapes = self.problem.param_shapes |
|
self.zero_prob = zero_probability |
|
|
|
super(SparseProblem, self).__init__(self.param_shapes, |
|
random_seed=random_seed, |
|
noise_stdev=noise_stdev) |
|
|
|
def objective(self, parameters, data=None, labels=None): |
|
return self.problem.objective(parameters, data, labels) |
|
|
|
def gradients(self, objective, parameters): |
|
grads = tf.gradients(objective, list(parameters)) |
|
|
|
new_grads = [] |
|
for grad in grads: |
|
      # With probability zero_prob, replace the gradient entry with zero.
      mask = tf.greater(self.zero_prob, tf.random_uniform(grad.get_shape()))
|
zero_grad = tf.zeros_like(grad, dtype=tf.float32) |
|
noisy_grad = grad + self.noise_stdev * tf.random_normal(grad.get_shape()) |
|
new_grads.append(tf.where(mask, zero_grad, noisy_grad)) |
|
return new_grads |
|
|
|
|
|
class DependencyChain(Problem): |
|
"""A problem in which parameters must be optimized in order. |
|
|
|
A sequence of parameters which all need to be brought to 0, but where each |
|
parameter in the sequence can't be brought to 0 until the preceding one |
|
has been. This should take a long time to optimize, with steady |
|
(or accelerating) progress throughout the entire process. |
|
""" |
|
|
|
def __init__(self, ndim, random_seed=None, noise_stdev=0.): |
|
param_shapes = [(ndim + 1,)] |
|
self.ndim = ndim |
|
super(DependencyChain, self).__init__( |
|
param_shapes, random_seed, noise_stdev) |
|
|
|
def objective(self, params, data=None, labels=None): |
|
terms = params[0][0]**2 + params[0][1:]**2 / (params[0][:-1]**2 + EPSILON) |
|
return tf.reduce_sum(terms) |
|
|
|
|
|
class MinMaxWell(Problem): |
|
"""Problem with global min when both the min and max (absolute) params are 1. |
|
|
|
The gradient for all but two parameters (the min and max) is zero. This |
|
  should therefore encourage the optimizer to behave sensibly even when
  parameters have zero gradients, as is common e.g. for some deep neural nets.
|
""" |
|
|
|
def __init__(self, ndim, random_seed=None, noise_stdev=0.): |
|
param_shapes = [(ndim,)] |
|
self.ndim = ndim |
|
super(MinMaxWell, self).__init__(param_shapes, random_seed, noise_stdev) |
|
|
|
def objective(self, params, data=None, labels=None): |
|
params_sqr = params[0]**2 |
|
min_sqr = tf.reduce_min(params_sqr) |
|
max_sqr = tf.reduce_max(params_sqr) |
|
epsilon = 1e-12 |
|
|
|
    # At |param_i| = 1 for every i, max_sqr = min_sqr = 1 and the objective
    # attains its minimum value, epsilon.
    return max_sqr + 1./min_sqr - 2. + epsilon
|
|
|
|
|
class OutwardSnake(Problem): |
|
"""A winding path out to infinity. |
|
|
|
Ideal step length stays constant along the entire path. |
|
""" |
|
|
|
def __init__(self, ndim, random_seed=None, noise_stdev=0.): |
|
param_shapes = [(ndim,)] |
|
self.ndim = ndim |
|
super(OutwardSnake, self).__init__(param_shapes, random_seed, noise_stdev) |
|
|
|
def objective(self, params, data, labels=None): |
|
radius = tf.sqrt(tf.reduce_sum(params[0]**2)) |
|
rad_loss = tf.reduce_sum(1. / (radius + 1e-6) * data[:, 0]) |
|
|
|
sin_dist = params[0][1:] - tf.cos(params[0][:-1]) * np.pi |
|
sin_loss = tf.reduce_sum((sin_dist * data[:, 1:])**2) |
|
|
|
return rad_loss + sin_loss |
|
|
|
|
|
class ProjectionQuadratic(Problem): |
|
"""Dataset consists of different directions to probe. Global min is at 0.""" |
|
|
|
def __init__(self, ndim, random_seed=None, noise_stdev=0.): |
|
param_shapes = [(1, ndim)] |
|
super(ProjectionQuadratic, self).__init__( |
|
param_shapes, random_seed, noise_stdev) |
|
|
|
def objective(self, params, data, labels=None): |
|
return tf.reduce_sum((params[0] * data)**2) |
|
|
|
|
|
class SumOfQuadratics(Problem):
  """Sum of squared distances to a batch of data points."""
|
|
|
def __init__(self, ndim, random_seed=None, noise_stdev=0.): |
|
param_shapes = [(1, ndim)] |
|
super(SumOfQuadratics, self).__init__( |
|
param_shapes, random_seed, noise_stdev) |
|
|
|
def objective(self, params, data, labels=None): |
|
epsilon = 1e-12 |
|
|
|
|
|
|
|
    # Subtracting the data-only term shifts the objective so that it equals
    # epsilon when the parameters sit at the origin.
    return (tf.reduce_sum((params[0] - data)**2) - tf.reduce_sum(data**2) +
            epsilon)
|
|
|
|
|
class MatMulAlgorithm(Problem): |
|
"""A 6-th order polynomial optimization problem. |
|
|
|
This problem is parametrized by n and k. A solution to this problem with |
|
objective value exactly zero defines a matrix multiplication algorithm of |
|
  n x n matrices using k multiplications between matrices. When applied
  recursively to N x N matrices, such an algorithm has complexity
  O(N^(log_n(k))).
|
|
|
Given n, it is not known in general which values of k in [n^2, n^3] have a |
|
solution. There is always a solution with k = n^3 (this is the naive |
|
algorithm). |
|
|
|
In the special case n = 2, it is known that there are solutions for k = {7, 8} |
|
but not for k <= 6. For n = 3, it is known that there are exact solutions for |
|
23 <= k <= 27, and there are asymptotic solutions for k = {21, 22}, but the |
|
other cases are unknown. |
|
|
|
For a given n and k, if one solution exists then infinitely many solutions |
|
exist due to permutation and scaling symmetries in the parameters. |
|
|
|
This is a very hard problem for some values of n and k (e.g. n = 3, k = 21), |
|
but very easy for other values (e.g. n = 2, k = 7). |
|
|
|
For a given n and k, the specific formulation of this problem is as follows. |
|
Let theta_a, theta_b, theta_c be parameter matrices with respective dimensions |
|
[n**2, k], [n**2, k], [k, n**2]. Then for any matrices a, b with shape [n, n], |
|
we can form the matrix c with shape [n, n] via the operation: |
|
((vec(a) * theta_a) .* (vec(b) * theta_b)) * theta_c = vec(c), (#) |
|
where vec(x) is the operator that flattens a matrix with shape [n, n] into a |
|
row vector with shape [1, n**2], * denotes matrix multiplication and .* |
|
denotes elementwise multiplication. |
|
|
|
This operation, parameterized by theta_a, theta_b, theta_c, is a matrix |
|
multiplication algorithm iff c = a*b for all [n, n] matrices a and b. But |
|
actually it suffices to verify all combinations of one-hot matrices a and b, |
|
of which there are n**4 such combinations. This gives a batch of n**4 matrix |
|
triplets (a, b, c) such that equation (#) must hold for each triplet. We solve |
|
for theta_a, theta_b, theta_c by minimizing the sum of squares of errors |
|
across this batch. |
|
|
|
Finally, theta_c can be computed from theta_a and theta_b. Therefore it |
|
suffices to learn theta_a and theta_b, from which theta_c and therefore the |
|
objective value can be computed. |
|
""" |
|
|
|
def __init__(self, n, k): |
|
assert isinstance(n, int), "n must be an integer" |
|
assert isinstance(k, int), "k must be an integer" |
|
assert n >= 2, "Must have n >= 2" |
|
assert k >= n**2 and k <= n**3, "Must have n**2 <= k <= n**3" |
|
|
|
param_shapes = [(n**2, k), (n**2, k)] |
|
super(MatMulAlgorithm, self).__init__( |
|
param_shapes, random_seed=None, noise_stdev=0.0) |
|
|
|
self.n = n |
|
self.k = k |
|
|
|
|
|
|
|
|
|
|
|
    # Build a batch of all n**4 combinations of one-hot matrices a and b, and
    # the corresponding correct products c.
    onehots = np.identity(n**2).reshape(n**2, n, n)
|
a_3d = np.repeat(onehots, n**2, axis=0) |
|
b_3d = np.tile(onehots, [n**2, 1, 1]) |
|
c_3d = np.matmul(a_3d, b_3d) |
|
|
|
|
|
self.a = tf.constant(a_3d.reshape(n**4, n**2), tf.float32, name="a") |
|
self.b = tf.constant(b_3d.reshape(n**4, n**2), tf.float32, name="b") |
|
self.c = tf.constant(c_3d.reshape(n**4, n**2), tf.float32, name="c") |
|
|
|
def init_tensors(self, seed=None): |
|
|
|
|
|
    # Initialize each parameter matrix with unit-norm columns.
    def _param_initializer(shape, seed=None):
|
x = tf.random_normal(shape, dtype=tf.float32, seed=seed) |
|
return tf.transpose(tf.nn.l2_normalize(tf.transpose(x), 1)) |
|
|
|
return [_param_initializer(shape, seed) for shape in self.param_shapes] |
|
|
|
def objective(self, parameters, data=None, labels=None): |
|
theta_a = parameters[0] |
|
theta_b = parameters[1] |
|
|
|
|
|
    # Row i of p holds the k elementwise products for the i-th (a, b) pair.
    p = tf.matmul(self.a, theta_a) * tf.matmul(self.b, theta_b)
    # Solve for theta_c in closed form via least squares (normal equations):
    # theta_c = (p^T p)^{-1} p^T c.
    p_trans = tf.transpose(p, name="p_trans")
    p_inv = tf.matmul(
        tf.matrix_inverse(tf.matmul(p_trans, p)), p_trans, name="p_inv")
    theta_c = tf.matmul(p_inv, self.c, name="theta_c")
|
|
|
|
|
    # Reconstruct vec(c) for every (a, b) pair and penalize the squared error.
    c_hat = tf.matmul(p, theta_c, name="c_hat")

    loss = tf.reduce_sum((c_hat - self.c)**2, name="loss")
|
|
|
return loss |
|
|
|
|
|
def matmul_problem_sequence(n, k_min, k_max): |
|
"""Helper to generate a sequence of matrix multiplication problems.""" |
|
return [(_Spec(MatMulAlgorithm, (n, k), {}), None, None) |
|
for k in range(k_min, k_max + 1)] |
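

# For example (illustrative), matmul_problem_sequence(2, 7, 8) yields the two
# problems of finding 2x2 matrix multiplication algorithms with 7 (Strassen's
# algorithm) and 8 (the naive algorithm) multiplications.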
|
|
|
|
|
def init_fixed_variables(arrays): |
|
with tf.variable_scope(PARAMETER_SCOPE): |
|
params = [tf.Variable(arr.astype("float32")) for arr in arrays] |
|
return params |
|
|
|
|
|
def _mesh(xlim, ylim, n): |
|
"""Creates a 2D meshgrid covering the given ranges. |
|
|
|
Args: |
|
xlim: int that defines the desired x-range (-xlim, xlim) |
|
ylim: int that defines the desired y-range (-ylim, ylim) |
|
n: number of points in each dimension of the mesh |
|
|
|
Returns: |
|
xm: 2D array of x-values in the mesh |
|
ym: 2D array of y-values in the mesh |
|
""" |
|
return np.meshgrid(np.linspace(-xlim, xlim, n), |
|
np.linspace(-ylim, ylim, n)) |
|
|