import numpy as np |
|
import tensorflow as tf |
|
from utils import linear, log_sum_exp |
|
|
|
class Poisson(object): |
|
"""Poisson distributon |
|
|
|
Computes the log probability under the model. |
|
|
|
""" |
|
def __init__(self, log_rates): |
|
""" Create Poisson distributions with log_rates parameters. |
|
|
|
Args: |
|
      log_rates: a tensor-like list of log rates underlying the Poisson distribution.
|
""" |
|
self.logr = log_rates |
|
|
|
def logp(self, bin_counts): |
|
"""Compute the log probability for the counts in the bin, under the model. |
|
|
|
Args: |
|
bin_counts: array-like integer counts |
|
|
|
Returns: |
|
The log-probability under the Poisson models for each element of |
|
bin_counts. |
|
""" |
|
k = tf.to_float(bin_counts) |
|
|
|
|
|
    # log Poisson(k | r) = k log(r) - r - log(k!)
    #                    = k * logr - exp(logr) - lgamma(k + 1), with logr = log(r)
    return k * self.logr - tf.exp(self.logr) - tf.lgamma(k + 1)
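
# Usage sketch (hypothetical values; evaluating requires a TF1 session):
#
#   dist = Poisson(log_rates=tf.log(tf.constant([5.0, 10.0])))
#   logp = dist.logp([4, 12])   # elementwise log pmf for each count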
|
|
|
|
|
def diag_gaussian_log_likelihood(z, mu=0.0, logvar=0.0): |
|
"""Log-likelihood under a Gaussian distribution with diagonal covariance. |
|
Returns the log-likelihood for each dimension. One should sum the |
|
results for the log-likelihood under the full multidimensional model. |
|
|
|
Args: |
|
    z: The value to compute the log-likelihood for.

    mu: The mean of the Gaussian.

    logvar: The log variance of the Gaussian.
|
|
|
Returns: |
|
The log-likelihood under the Gaussian model. |
|
""" |
|
|
|
return -0.5 * (logvar + np.log(2*np.pi) + \ |
|
tf.square((z-mu)/tf.exp(0.5*logvar))) |
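
# Note: this returns one value per dimension; to illustrate, the full
# diagonal-Gaussian log-likelihood sums over the last axis (tensor names here
# are hypothetical):
#
#   llh_bxn = diag_gaussian_log_likelihood(z_bxn, mean_bxn, logvar_bxn)
#   llh_b = tf.reduce_sum(llh_bxn, [1])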
|
|
|
|
|
def gaussian_pos_log_likelihood(unused_mean, logvar, noise): |
|
"""Gaussian log-likelihood function for a posterior in VAE |
|
|
|
  Note: This function is specialized for a posterior distribution that has

  the form z = mean + sigma * noise.
|
|
|
Args: |
|
    unused_mean: ignored; with z = mean + sigma * noise, the mean cancels
      out of the density, so it is not needed here.
|
logvar: The log variance of the distribution |
|
noise: The noise used in the sampling of the posterior. |
|
|
|
Returns: |
|
The log-likelihood under the Gaussian model. |
|
""" |
|
|
|
return - 0.5 * (logvar + np.log(2 * np.pi) + tf.square(noise)) |
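
# Why the mean is unused: with z = mean + exp(0.5 * logvar) * noise, the
# standard Gaussian term ((z - mean) / sigma)^2 reduces exactly to noise^2,
# so only logvar and noise appear in the log-likelihood.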
|
|
|
|
|
class Gaussian(object): |
|
"""Base class for Gaussian distribution classes.""" |
|
pass |
|
|
|
|
|
class DiagonalGaussian(Gaussian): |
|
"""Diagonal Gaussian with different constant mean and variances in each |
|
dimension. |
|
""" |
|
|
|
def __init__(self, batch_size, z_size, mean, logvar): |
|
"""Create a diagonal gaussian distribution. |
|
|
|
Args: |
|
batch_size: The size of the batch, i.e. 0th dim in 2D tensor of samples. |
|
z_size: The dimension of the distribution, i.e. 1st dim in 2D tensor. |
|
mean: The N-D mean of the distribution. |
|
logvar: The N-D log variance of the diagonal distribution. |
|
""" |
|
size__xz = [None, z_size] |
|
self.mean = mean |
|
self.logvar = logvar |
|
self.noise = noise = tf.random_normal(tf.shape(logvar)) |
|
self.sample = mean + tf.exp(0.5 * logvar) * noise |
|
mean.set_shape(size__xz) |
|
logvar.set_shape(size__xz) |
|
self.sample.set_shape(size__xz) |
|
|
|
def logp(self, z=None): |
|
"""Compute the log-likelihood under the distribution. |
|
|
|
Args: |
|
      z (optional): value to compute the log-likelihood for; if None, use the
        distribution's own sample.
|
|
|
Returns: |
|
      The log-likelihood of z under the model.
|
""" |
|
if z is None: |
|
z = self.sample |
|
|
|
|
|
|
|
    # `z == self.sample` is an identity comparison on the tensor objects (TF1
    # tensors do not overload ==).  For the distribution's own sample, use the
    # noise-based form of the log-likelihood: same value, simpler gradients.
    if z == self.sample:
|
return gaussian_pos_log_likelihood(self.mean, self.logvar, self.noise) |
|
|
|
return diag_gaussian_log_likelihood(z, self.mean, self.logvar) |
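
# Usage sketch (hypothetical shapes; evaluating requires a TF1 session):
#
#   prior = DiagonalGaussian(batch_size=16, z_size=8,
#                            mean=tf.zeros([16, 8]), logvar=tf.zeros([16, 8]))
#   z = prior.sample    # reparameterized draw, shape [16, 8]
#   llh = prior.logp()  # log-likelihood of that same draw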
|
|
|
|
|
class LearnableDiagonalGaussian(Gaussian): |
|
"""Diagonal Gaussian whose mean and variance are learned parameters.""" |
|
|
|
def __init__(self, batch_size, z_size, name, mean_init=0.0, |
|
var_init=1.0, var_min=0.0, var_max=1000000.0): |
|
"""Create a learnable diagonal gaussian distribution. |
|
|
|
Args: |
|
batch_size: The size of the batch, i.e. 0th dim in 2D tensor of samples. |
|
z_size: The dimension of the distribution, i.e. 1st dim in 2D tensor. |
|
      name: prefix name for the mean and log-variance TF variables.
|
mean_init (optional): The N-D mean initialization of the distribution. |
|
var_init (optional): The N-D variance initialization of the diagonal |
|
distribution. |
|
var_min (optional): The minimum value the learned variance can take in any |
|
dimension. |
|
var_max (optional): The maximum value the learned variance can take in any |
|
dimension. |
|
""" |
|
|
|
size_1xn = [1, z_size] |
|
size__xn = [None, z_size] |
|
size_bx1 = tf.stack([batch_size, 1]) |
|
    assert var_init > 0.0, "var_init must be positive."

    assert var_max >= var_min, "var_max must be >= var_min."

    assert var_init >= var_min, "var_init must be >= var_min."

    assert var_max >= var_init, "var_init must be <= var_max."
|
|
|
|
|
z_mean_1xn = tf.get_variable(name=name+"/mean", shape=size_1xn, |
|
initializer=tf.constant_initializer(mean_init)) |
|
self.mean_bxn = mean_bxn = tf.tile(z_mean_1xn, size_bx1) |
|
mean_bxn.set_shape(size__xn) |
|
|
|
log_var_init = np.log(var_init) |
|
    var_is_trainable = var_max > var_min
|
|
|
z_logvar_1xn = \ |
|
tf.get_variable(name=(name+"/logvar"), shape=size_1xn, |
|
initializer=tf.constant_initializer(log_var_init), |
|
trainable=var_is_trainable) |
|
|
|
    # If trainable, keep the learned variance within (var_min, var_max) by
    # passing exp(logvar) through a sigmoid and rescaling.
    if var_is_trainable:
|
z_logit_var_1xn = tf.exp(z_logvar_1xn) |
|
z_var_1xn = tf.nn.sigmoid(z_logit_var_1xn)*(var_max-var_min) + var_min |
|
z_logvar_1xn = tf.log(z_var_1xn) |
|
|
|
logvar_bxn = tf.tile(z_logvar_1xn, size_bx1) |
|
self.logvar_bxn = logvar_bxn |
|
self.noise_bxn = noise_bxn = tf.random_normal(tf.shape(logvar_bxn)) |
|
self.sample_bxn = mean_bxn + tf.exp(0.5 * logvar_bxn) * noise_bxn |
|
|
|
def logp(self, z=None): |
|
"""Compute the log-likelihood under the distribution. |
|
|
|
Args: |
|
      z (optional): value to compute the log-likelihood for; if None, use the
        distribution's own sample.
|
|
|
Returns: |
|
      The log-likelihood of z under the model.
|
""" |
|
if z is None: |
|
z = self.sample |
|
|
|
|
|
|
|
    # Same identity-based shortcut as in DiagonalGaussian.logp.
    if z == self.sample_bxn:
|
return gaussian_pos_log_likelihood(self.mean_bxn, self.logvar_bxn, |
|
self.noise_bxn) |
|
|
|
return diag_gaussian_log_likelihood(z, self.mean_bxn, self.logvar_bxn) |
|
|
|
@property |
|
def mean(self): |
|
return self.mean_bxn |
|
|
|
@property |
|
def logvar(self): |
|
return self.logvar_bxn |
|
|
|
@property |
|
def sample(self): |
|
return self.sample_bxn |
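
# Usage sketch (hypothetical names; tf.get_variable implies a variable scope
# and variable initialization before evaluation):
#
#   ic_prior = LearnableDiagonalGaussian(batch_size=16, z_size=8,
#                                        name="prior/ic", var_init=0.1)
#   z = ic_prior.sample   # shape [16, 8]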
|
|
|
|
|
class DiagonalGaussianFromInput(Gaussian): |
|
"""Diagonal Gaussian whose mean and variance are conditioned on other |
|
variables. |
|
|
|
Note: the parameters to convert from input to the learned mean and log |
|
variance are held in this class. |
|
""" |
|
|
|
def __init__(self, x_bxu, z_size, name, var_min=0.0): |
|
"""Create an input dependent diagonal Gaussian distribution. |
|
|
|
Args: |
|
      x_bxu: The input tensor from which the mean and log variance are

        computed via linear transformations, i.e.

          mean = W x + b,  log(var) = M x + c.
|
z_size: The size of the distribution. |
|
name: The name to prefix to learned variables. |
|
var_min (optional): Minimal variance allowed. This is an additional |
|
way to control the amount of information getting through the stochastic |
|
layer. |
|
""" |
|
size_bxn = tf.stack([tf.shape(x_bxu)[0], z_size]) |
|
self.mean_bxn = mean_bxn = linear(x_bxu, z_size, name=(name+"/mean")) |
|
logvar_bxn = linear(x_bxu, z_size, name=(name+"/logvar")) |
|
if var_min > 0.0: |
|
logvar_bxn = tf.log(tf.exp(logvar_bxn) + var_min) |
|
self.logvar_bxn = logvar_bxn |
|
|
|
self.noise_bxn = noise_bxn = tf.random_normal(size_bxn) |
|
self.noise_bxn.set_shape([None, z_size]) |
|
self.sample_bxn = mean_bxn + tf.exp(0.5 * logvar_bxn) * noise_bxn |
|
|
|
def logp(self, z=None): |
|
"""Compute the log-likelihood under the distribution. |
|
|
|
Args: |
|
      z (optional): value to compute the log-likelihood for; if None, use the
        distribution's own sample.
|
|
|
Returns: |
|
      The log-likelihood of z under the model.
|
""" |
|
|
|
if z is None: |
|
z = self.sample |
|
|
|
|
|
|
|
    # Same identity-based shortcut as in DiagonalGaussian.logp.
    if z == self.sample_bxn:
|
return gaussian_pos_log_likelihood(self.mean_bxn, |
|
self.logvar_bxn, self.noise_bxn) |
|
|
|
return diag_gaussian_log_likelihood(z, self.mean_bxn, self.logvar_bxn) |
|
|
|
@property |
|
def mean(self): |
|
return self.mean_bxn |
|
|
|
@property |
|
def logvar(self): |
|
return self.logvar_bxn |
|
|
|
@property |
|
def sample(self): |
|
return self.sample_bxn |
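
# Usage sketch (hypothetical names; `linear` from utils is assumed to create
# the W and b parameters under the given name):
#
#   posterior = DiagonalGaussianFromInput(enc_out_bxu, z_size=8,
#                                         name="posterior", var_min=0.001)
#   z = posterior.sample   # shape [batch, 8]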
|
|
|
|
|
class GaussianProcess(object):
|
"""Base class for Gaussian processes.""" |
|
pass |
|
|
|
|
|
class LearnableAutoRegressive1Prior(GaussianProcess): |
|
"""AR(1) model where autocorrelation and process variance are learned |
|
parameters. Assumed zero mean. |
|
|
|
""" |
|
|
|
def __init__(self, batch_size, z_size, |
|
autocorrelation_taus, noise_variances, |
|
do_train_prior_ar_atau, do_train_prior_ar_nvar, |
|
num_steps, name): |
|
"""Create a learnable autoregressive (1) process. |
|
|
|
Args: |
|
batch_size: The size of the batch, i.e. 0th dim in 2D tensor of samples. |
|
z_size: The dimension of the distribution, i.e. 1st dim in 2D tensor. |
|
      autocorrelation_taus: The autocorrelation time constants of the AR(1)

        processes, one per latent dimension.

        A value of 0 yields uncorrelated Gaussian noise.
|
noise_variances: The variance of the additive noise, *not* the process |
|
variance. |
|
      do_train_prior_ar_atau: Whether to train the autocorrelation time

        constant (True) or leave it fixed (False).

      do_train_prior_ar_nvar: Whether to train the noise variance (True) or

        leave it fixed (False).
|
num_steps: Number of steps to run the process. |
|
name: The name to prefix to learned TF variables. |
|
""" |
|
|
|
|
|
|
|
|
|
|
|
|
|
    # Each of the z_size latent dimensions is treated as an independent AR(1)
    # process with its own autocorrelation tau and noise variance.
    size_bx1 = tf.stack([batch_size, 1])
|
size__xu = [None, z_size] |
|
|
|
|
|
    # Additive-noise variance (evar) of each AR(1) process, stored in log space.
    log_evar_inits_1xu = tf.expand_dims(tf.log(noise_variances), 0)
|
self.logevars_1xu = logevars_1xu = \ |
|
tf.Variable(log_evar_inits_1xu, name=name+"/logevars", dtype=tf.float32, |
|
trainable=do_train_prior_ar_nvar) |
|
self.logevars_bxu = logevars_bxu = tf.tile(logevars_1xu, size_bx1) |
|
logevars_bxu.set_shape(size__xu) |
|
|
|
|
|
    # Autocorrelation time constants (tau), stored in log space.
    log_atau_inits_1xu = tf.expand_dims(tf.log(autocorrelation_taus), 0)
|
self.logataus_1xu = logataus_1xu = \ |
|
tf.Variable(log_atau_inits_1xu, name=name+"/logatau", dtype=tf.float32, |
|
trainable=do_train_prior_ar_atau) |
|
|
|
|
|
|
|
|
|
|
|
    # AR(1) update: z_t = pmean + phi * z_{t-1} + eps, with phi = exp(-1/tau)
    # = exp(-exp(-log tau)).
    phis_1xu = tf.exp(-tf.exp(-logataus_1xu))
|
self.phis_bxu = phis_bxu = tf.tile(phis_1xu, size_bx1) |
|
phis_bxu.set_shape(size__xu) |
|
|
|
|
|
|
|
|
|
|
|
|
|
    # Stationary (process) variance: pvar = evar / (1 - phi^2), so
    # log pvar = log evar - log(1 - phi) - log(1 + phi).
    self.logpvars_1xu = \
      logevars_1xu - tf.log(1.0-phis_1xu) - tf.log(1.0+phis_1xu)
|
self.logpvars_bxu = logpvars_bxu = tf.tile(self.logpvars_1xu, size_bx1) |
|
logpvars_bxu.set_shape(size__xu) |
|
|
|
|
|
    # Process mean (zero, but kept explicit for completeness).
    self.pmeans_bxu = pmeans_bxu = tf.zeros_like(phis_bxu)
|
|
|
|
|
    # Unroll the process for num_steps so the prior can also be sampled from
    # during de-novo generation.
    self.means_t = means_t = [None] * num_steps
|
self.logvars_t = logvars_t = [None] * num_steps |
|
self.samples_t = samples_t = [None] * num_steps |
|
self.gaussians_t = gaussians_t = [None] * num_steps |
|
sample_bxu = tf.zeros_like(phis_bxu) |
|
for t in range(num_steps): |
|
|
|
      # Use the stationary process variance at t == 0 so that the unrolled
      # process is stationary from the first step.
      if t == 0:
|
logvar_pt_bxu = self.logpvars_bxu |
|
else: |
|
logvar_pt_bxu = self.logevars_bxu |
|
|
|
z_mean_pt_bxu = pmeans_bxu + phis_bxu * sample_bxu |
|
gaussians_t[t] = DiagonalGaussian(batch_size, z_size, |
|
mean=z_mean_pt_bxu, |
|
logvar=logvar_pt_bxu) |
|
sample_bxu = gaussians_t[t].sample |
|
samples_t[t] = sample_bxu |
|
logvars_t[t] = logvar_pt_bxu |
|
means_t[t] = z_mean_pt_bxu |
|
|
|
def logp_t(self, z_t_bxu, z_tm1_bxu=None): |
|
"""Compute the log-likelihood under the distribution for a given time t, |
|
not the whole sequence. |
|
|
|
Args: |
|
z_t_bxu: sample to compute likelihood for at time t. |
|
      z_tm1_bxu (optional): previous sample to condition the probability of
        z_t on; if None, the stationary marginal is used.
|
|
|
Returns: |
|
      The log-likelihood of z_t under the model at time t, i.e.

        p(z_t | z_{t-1}) = N(pmean + phi * z_{t-1}, evar).
|
|
|
""" |
|
if z_tm1_bxu is None: |
|
return diag_gaussian_log_likelihood(z_t_bxu, self.pmeans_bxu, |
|
self.logpvars_bxu) |
|
else: |
|
means_t_bxu = self.pmeans_bxu + self.phis_bxu * z_tm1_bxu |
|
logp_tgtm1_bxu = diag_gaussian_log_likelihood(z_t_bxu, |
|
means_t_bxu, |
|
self.logevars_bxu) |
|
return logp_tgtm1_bxu |
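
# Usage sketch (hypothetical parameters; evaluating requires a TF1 session):
#
#   prior = LearnableAutoRegressive1Prior(
#       batch_size=16, z_size=8,
#       autocorrelation_taus=10.0 * np.ones([8], dtype=np.float32),
#       noise_variances=0.1 * np.ones([8], dtype=np.float32),
#       do_train_prior_ar_atau=True, do_train_prior_ar_nvar=True,
#       num_steps=100, name="prior/ar1")
#   logp_0 = prior.logp_t(z_0)          # t == 0: stationary marginal
#   logp_t = prior.logp_t(z_t, z_tm1)   # t > 0: conditioned on prior sample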
|
|
|
|
|
class KLCost_GaussianGaussian(object): |
|
"""log p(x|z) + KL(q||p) terms for Gaussian posterior and Gaussian prior. See |
|
eqn 10 and Appendix B in VAE for latter term, |
|
http://arxiv.org/abs/1312.6114 |
|
|
|
The log p(x|z) term is the reconstruction error under the model. |
|
The KL term represents the penalty for passing information from the encoder |
|
to the decoder. |
|
  For two diagonal Gaussians, KL(q||p) has a closed form; it is computed

  analytically here, with no sampling required.
|
""" |
|
|
|
def __init__(self, zs, prior_zs): |
|
"""Create a lower bound in three parts, normalized reconstruction |
|
cost, normalized KL divergence cost, and their sum. |
|
|
|
E_q[ln p(z_i | z_{i+1}) / q(z_i | x) |
|
\int q(z) ln p(z) dz = - 0.5 ln(2pi) - 0.5 \sum (ln(sigma_p^2) + \ |
|
sigma_q^2 / sigma_p^2 + (mean_p - mean_q)^2 / sigma_p^2) |
|
|
|
\int q(z) ln q(z) dz = - 0.5 ln(2pi) - 0.5 \sum (ln(sigma_q^2) + 1) |
|
|
|
Args: |
|
      zs: list of posterior Gaussians, z ~ q(z|x).

      prior_zs: list of prior Gaussians.
|
""" |
|
|
|
|
|
|
|
    # Closed-form KL between two diagonal Gaussians, per batch element:
    #   KL(q||p) = 0.5 * sum(ln(sigma_p^2 / sigma_q^2) + sigma_q^2 / sigma_p^2
    #                        + (mean_q - mean_p)^2 / sigma_p^2 - 1)
    kl_b = 0.0
|
for z, prior_z in zip(zs, prior_zs): |
|
assert isinstance(z, Gaussian) |
|
assert isinstance(prior_z, Gaussian) |
|
|
|
kl_b += 0.5 * tf.reduce_sum( |
|
prior_z.logvar - z.logvar |
|
+ tf.exp(z.logvar - prior_z.logvar) |
|
+ tf.square((z.mean - prior_z.mean) / tf.exp(0.5 * prior_z.logvar)) |
|
- 1.0, [1]) |
|
|
|
self.kl_cost_b = kl_b |
|
self.kl_cost = tf.reduce_mean(kl_b) |
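
# Usage sketch (hypothetical; zs and prior_zs are lists of the Gaussian
# classes defined above, e.g. DiagonalGaussianFromInput posteriors and
# LearnableDiagonalGaussian priors):
#
#   kl = KLCost_GaussianGaussian(zs=[posterior], prior_zs=[prior])
#   total_cost = recon_cost + kl.kl_cost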
|
|
|
|
|
class KLCost_GaussianGaussianProcessSampled(object): |
|
""" log p(x|z) + KL(q||p) terms for Gaussian posterior and Gaussian process |
|
prior via sampling. |
|
|
|
The log p(x|z) term is the reconstruction error under the model. |
|
The KL term represents the penalty for passing information from the encoder |
|
to the decoder. |
|
To sample KL(q||p), we simply sample |
|
ln q - ln p |
|
by drawing samples from q and averaging. |
|
""" |
|
|
|
def __init__(self, post_zs, prior_z_process): |
|
"""Create a lower bound in three parts, normalized reconstruction |
|
cost, normalized KL divergence cost, and their sum. |
|
|
|
Args: |
|
      post_zs: list of posterior Gaussians, one per time step, z_t ~ q(z_t|x).

      prior_z_process: the prior AR(1) process.
|
""" |
|
assert len(post_zs) > 1, "GP is for time, need more than 1 time step." |
|
assert isinstance(prior_z_process, GaussianProcess), "Must use GP." |
|
|
|
|
|
|
|
|
|
    # Walk the posterior samples through time, accumulating ln q and ln p;
    # their difference is a single-sample estimate of KL(q||p) for the
    # whole sequence.
    z0_bxu = post_zs[0].sample
|
logq_bxu = post_zs[0].logp(z0_bxu) |
|
logp_bxu = prior_z_process.logp_t(z0_bxu) |
|
z_tm1_bxu = z0_bxu |
|
for z_t in post_zs[1:]: |
|
|
|
z_t_bxu = z_t.sample |
|
logq_bxu += z_t.logp(z_t_bxu) |
|
logp_bxu += prior_z_process.logp_t(z_t_bxu, z_tm1_bxu) |
|
z_tm1_bxu = z_t_bxu |
|
|
|
kl_bxu = logq_bxu - logp_bxu |
|
kl_b = tf.reduce_sum(kl_bxu, [1]) |
|
self.kl_cost_b = kl_b |
|
self.kl_cost = tf.reduce_mean(kl_b) |
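

if __name__ == "__main__":
  # Minimal smoke test -- an illustrative sketch, not part of the original
  # module; assumes TF1 graph/session mode.  The KL between two identical
  # diagonal Gaussians should be exactly 0.
  mean_bxn = tf.zeros([4, 3])
  logvar_bxn = tf.zeros([4, 3])
  q = DiagonalGaussian(4, 3, mean=mean_bxn, logvar=logvar_bxn)
  p = DiagonalGaussian(4, 3, mean=mean_bxn, logvar=logvar_bxn)
  kl = KLCost_GaussianGaussian([q], [p]).kl_cost
  with tf.Session() as sess:
    print(sess.run(kl))  # expect 0.0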
|
|