# Provenance (scraped page chrome, kept as a comment so the file parses):
# Hugging Face commit 74e8f2f — "added pali inference" by pranavSIT.
# Copyright 2024 Big Vision Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Prediction functions for PaliGemma."""
import collections
import functools
from big_vision.pp import registry
import big_vision.utils as u
import einops
import jax
import jax.numpy as jnp
import numpy as np
P = jax.sharding.PartitionSpec
# pylint: disable=missing-function-docstring
def get_all(model):
  """Returns `predict_fns` for evaluators, each bound to `model`."""
  fn_table = {
      "logits": _logits,
      "image_avg_repr": _image_avg_repr,
      "decode": _decode,
      "decode_with_logp": _decode_with_logp,
      "beam_decode": _beam_decode,
  }
  bound = {}
  for name, fn in fn_table.items():
    bound[name] = functools.partial(fn, model=model)
  return bound
def _logits(train_state, batch, *, model):
  """Computes logits for `batch` (text shifted by one for teacher forcing)."""
  inputs = batch["text"][:, :-1]
  ar_mask = batch["mask_ar"][:, :-1]
  text_logits, out = model.apply(
      {"params": train_state["params"]},
      batch["image"], inputs, ar_mask,
  )
  return text_logits, out
def _image_avg_repr(train_state, batch, *, model, key="img/pre_logits"):
  """Embeds images and mean-pools token representations into one vector."""
  zimg, out = model.apply(
      {"params": train_state["params"]},
      image=batch["image"],
      method=model.embed_image,
  )
  if key:
    zimg = u.tree_get(out, key)
  # zimg is a (batch of) sequence of image tokens here, because we assume the
  # model is a vit with "none" head. The fewshot evaluator needs a single
  # reasonably-sized vector per example, so average across all token axes,
  # keeping only the batch and feature dimensions.
  token_axes = tuple(range(1, zimg.ndim - 1))
  zimg = jnp.mean(zimg, axis=token_axes)
  return zimg, out
def _decode_with_logp(
    train_state, batch, *, model, devices, max_decode_len, eos_token,
    best_of_n=1, sampler="greedy", replicate_out=False, eos_look_behind=0):
  """Sample token continuations to the input sequences.

  Args:
    train_state: dict holding "params" for the model.
    batch: dict with "image", "text", "mask_input", "mask_ar" and "_mask"
      (the latter is False for examples that only pad the batch).
    model: model providing the apply methods used by the jitted helpers.
    devices: jax devices over which a 1-d ("devices",) mesh is built.
    max_decode_len: maximum number of tokens sampled per sequence.
    eos_token: token id that terminates a sequence.
    best_of_n: number of samples drawn per example; the one with the highest
      log-probability (up to and including EOS) is returned.
    sampler: sampler spec resolved via the registry, e.g. "greedy",
      "nucleus(0.2)", "temperature(t=1.0)".
    replicate_out: if True, outputs are replicated across devices instead of
      sharded along the batch.
    eos_look_behind: number of steps the host-side early-stop check may lag
      behind sampling; values > 0 avoid a blocking device transfer per step.

  Returns:
    (tokens, logp): sampled token ids and their per-token log-probabilities,
    each of shape [batch, max_decode_len].
  """
  mesh = jax.sharding.Mesh(devices, ("devices",))
  replicate_sharding = jax.sharding.NamedSharding(mesh, P())
  out_sharding = jax.sharding.NamedSharding(
      mesh, P() if replicate_out else P("devices")
  )

  # Prefill the model cache and generate logits for first token.
  logits, cache = jax.jit(
      _prefill_cache,
      out_shardings=out_sharding,
      static_argnames=("model", "max_decode_len"),
  )(
      train_state["params"],
      {
          "image": batch["image"],
          "text": batch["text"],
          "mask_input": batch["mask_input"],
          "mask_ar": batch["mask_ar"],
      },
      model=model,
      max_decode_len=max_decode_len,
  )

  # Mask indicating real examples. False if example is used to pad the batch.
  mask = batch["_mask"]

  # Repeat example in case we are picking the best of n.
  logits, cache, mask = jax.jit(
      _bon_repeat,
      static_argnames=("n",)
  )((logits, cache, mask), n=best_of_n)

  decode_sample_output = jax.jit(
      _decode_sample_output,
      static_argnames=("max_decode_len", "sampler"),
  )
  decode_early_stop = jax.jit(
      _decode_early_stop,
      out_shardings=replicate_sharding,
      static_argnames=("eos_token",),
  )
  extend_cache = jax.jit(
      _extend_cache,
      donate_argnums=1,
      static_argnames=("model",),
  )

  # Keep sampling tokens from last logits until EOS or max_decode_len.
  state = None
  # Setting `eos_look_behind>0` removes blocking transfer with small batches.
  stops = collections.deque(maxlen=1 + eos_look_behind)
  for idx in range(max_decode_len):
    tokens, state = decode_sample_output(
        state, logits, max_decode_len=max_decode_len, sampler=sampler
    )
    if idx + 1 >= max_decode_len:
      break

    # Only the OLDEST queued stop flag is transferred to host, so with
    # eos_look_behind>0 the device_get below lags sampling and stays async.
    stops.append(decode_early_stop(state, mask, eos_token=eos_token))
    if len(stops) == stops.maxlen and jax.device_get(stops[0]):
      break

    # Compute logits for next token
    logits, cache = extend_cache(
        train_state["params"], cache, tokens, model=model
    )

  # Select the best of n sample for each example.
  _, tokens, logp = jax.jit(
      _bon_select,
      out_shardings=out_sharding,
      static_argnames=("n", "eos_token"),
  )(state, n=best_of_n, eos_token=eos_token)

  return tokens, logp
def _decode(train_state, batch, **kwargs):
  """Like `_decode_with_logp`, but returns only the sampled tokens."""
  return _decode_with_logp(train_state, batch, **kwargs)[0]
def _bon_repeat(tree, *, n):
return jax.tree.map(lambda x: jnp.repeat(x, n, axis=0), tree)
def _compute_score(tokens, logp, eos_token):
"""Compute log-probability of each sequence up to first eos (including it)."""
seqlen = jnp.sum(jnp.cumsum(tokens == eos_token, axis=-1) == 0, axis=-1) + 1
token_mask = jnp.arange(tokens.shape[-1]) < seqlen[..., None]
scores = jnp.sum(logp * token_mask, axis=-1)
return scores
def _bon_select(state, *, n, eos_token):
  """Pick the sampled sequence with the highest likelihood for each example."""
  (_, tokens, logp) = state
  # Score every candidate (log-prob up to and including its first EOS), then
  # group the flat (b*n) axis into (b, n) and take the argmax per example.
  scores = _compute_score(tokens, logp, eos_token)
  winners = jnp.argmax(scores.reshape(-1, n), -1)  # [b]

  def pick_winner(x):
    grouped = x.reshape(-1, n, x.shape[-1])  # (b*n, l) -> (b, n, l)
    chosen = jnp.take_along_axis(grouped, winners[:, None, None], axis=1)
    return chosen[:, 0]  # Drop the singleton candidate axis.

  return jax.tree.map(pick_winner, state)
def _decode_sample_output(state, logits, *, max_decode_len, sampler):
  """Samples one token per sequence and appends it to the decode state."""
  if state is None:
    # Fresh state: per-example length plus buffers for tokens and their logp.
    batch = logits.shape[0]
    seqlen = jnp.zeros((batch, 1), dtype=jnp.int32)
    tokens = jnp.zeros((batch, max_decode_len), dtype=jnp.int32)
    logp = jnp.zeros((batch, max_decode_len), dtype=logits.dtype)
  else:
    seqlen, tokens, logp = state
  # Draw one token (and its log-probability) per sequence and write both at
  # the current position; the length counter then advances by one.
  sampled_tokens, sampled_logp = _sample_logits(logits, sampler=sampler)
  tokens = _put_along_last_axis(tokens, seqlen, sampled_tokens)
  logp = _put_along_last_axis(logp, seqlen, sampled_logp)
  return sampled_tokens, (seqlen + 1, tokens, logp)
def _decode_early_stop(state, mask, *, eos_token):
(seqlen, tokens, unused_logp) = state
token_mask = jnp.arange(tokens.shape[-1])[None, :] < seqlen
has_eos = jnp.any(jnp.logical_and(tokens == eos_token, token_mask), axis=-1)
done = jnp.logical_or(has_eos, jnp.logical_not(mask))
return jnp.all(done)
def _put_along_last_axis(arr, indices, values):
"""Like np.put_along_axis(..., axis=-1), since jax is missing it."""
assert arr.ndim == indices.ndim == values.ndim, (
arr.ndim, indices.ndim, values.ndim)
onehot = jax.nn.one_hot(indices, arr.shape[-1], dtype=values.dtype)
put_mask = jnp.einsum("...i,...in->...n",
jnp.ones(values.shape, jnp.int32), onehot)
put_values = jnp.einsum("...i,...in->...n", values, onehot)
return jnp.where(put_mask, put_values, arr)
def _prefill_cache(params, batch, *, model, max_decode_len):
  """Initialize the model cache for decoding with the prompts.

  Args:
    params: model parameters.
    batch: dict with "image", "text", "mask_input", "mask_ar".
    model: model exposing `embed_image_and_text` and `prefill_cache`.
    max_decode_len: extra cache capacity reserved beyond the prompt length.

  Returns:
    (last_logits, cache): logits for the first token to sample, and the
    initialized decoding cache.
  """
  variables = {"params": params}
  # Embed the (image, text) prompt into one combined token sequence.
  (x, input_mask, mask_ar), _ = model.apply(
      variables, batch["image"], batch["text"],
      input_mask=batch["mask_input"],
      mask_ar=batch["mask_ar"],
      method=model.embed_image_and_text)
  # Run the prompt through the model, mutating the "cache" collection; the
  # cache is sized to hold the prompt plus every future sampled token.
  last_logits, variables = model.apply(
      variables, x, input_mask, mask_ar,
      cache_size=x.shape[1] + max_decode_len,
      method=model.prefill_cache,
      mutable=("cache",))
  return last_logits, variables["cache"]
def _extend_cache(params, cache, tokens, *, model):
  """Extend the model cache for decoding with one token per sequence.

  Args:
    params: model parameters.
    cache: decoding cache as returned by `_prefill_cache`/`_extend_cache`.
    tokens: the most recently sampled token per sequence.
    model: model exposing `embed_text` and `extend_cache`.

  Returns:
    (last_logits, cache): logits for the next token, and the updated cache.
  """
  variables = {"params": params, "cache": cache}
  # Embed just the new tokens; the cache supplies all previous context.
  x, _ = model.apply(variables, tokens, method=model.embed_text)
  last_logits, variables = model.apply(
      variables, x, method=model.extend_cache, mutable=("cache",))
  return last_logits, variables["cache"]
def _sample_logits(logits, sampler):
  """Returns a sampled token and its logp from logits."""
  # Note: Consider making it possible for evaluators to pass rng seed to
  # decode functions. For now generate it from jax.lax and avoid evaluators
  # having to deal with it.
  seed = jax.lax.rng_uniform(0, np.iinfo(np.int32).max, tuple())
  rng = jax.random.PRNGKey(seed)

  # The Registry resolves specs such as "greedy", "nucleus(0.2)",
  # "temperature(t=1.0)" to a sampling function.
  sample_fn = registry.Registry.lookup("paligemma_sampler." + sampler)
  sampled_tokens = sample_fn(logits=logits, rng=rng)

  # Log-probability (normalized logits) of each selected token.
  normalized = jax.nn.log_softmax(logits, axis=-1)
  sampled_logp = jnp.take_along_axis(
      normalized, sampled_tokens[..., None], -1)[..., 0]
  return sampled_tokens, sampled_logp
@registry.Registry.register("paligemma_sampler.greedy")
def _greedy_sampling(*, logits, rng):
  """Deterministically picks the highest-logit token; `rng` is unused."""
  del rng  # Greedy sampling needs no randomness.
  return jnp.argmax(logits, axis=-1)
@registry.Registry.register("paligemma_sampler.temperature")
def _temperature_sampling(t, *, logits, rng):
  """Samples from softmax(logits / t); larger `t` flattens the distribution."""
  scaled = logits / t
  return jax.random.categorical(rng, scaled)
@registry.Registry.register("paligemma_sampler.nucleus")
def _nucleus_sampling(p: float, t: float = 1.0, *, logits, rng):
  """Top-p (nucleus) sampling after temperature scaling by `t`."""
  scaled = logits / t
  neg_inf = np.array(-1.0e7)  # Effective negative infinity.
  # Sort logits descending and find the smallest logit whose cumulative
  # probability mass is still inside the nucleus `p`.
  ranked = jnp.sort(scaled, axis=-1, descending=True)
  cum_probs = jnp.cumsum(jax.nn.softmax(ranked, axis=-1), axis=-1)
  cutoff_index = jnp.sum(cum_probs < p, axis=-1, keepdims=True)
  cutoff_logit = jnp.take_along_axis(ranked, cutoff_index, axis=-1)
  # Mask everything below the cutoff before sampling.
  masked = jnp.where(scaled < cutoff_logit,
                     jnp.full_like(scaled, neg_inf), scaled)
  return jax.random.categorical(rng, masked)
def _beam_decode(train_state, batch, *,
                 model, devices, max_decode_len,
                 eos_token, beam_size, replicate_out=False):
  """Beam search (greedy/top-k exploration).

  Args:
    train_state: dict holding "params" for the model.
    batch: dict with "image", "text", "mask_input", "mask_ar" and "_mask"
      (the latter is False for examples that only pad the batch).
    model: model providing the apply methods used by the jitted helpers.
    devices: jax devices over which a 1-d ("devices",) mesh is built.
    max_decode_len: maximum number of tokens decoded per sequence.
    eos_token: token id that terminates a sequence.
    beam_size: number of live candidates kept per example.
    replicate_out: if True, outputs are replicated across devices instead of
      sharded along the batch.

  Returns:
    Token ids of the best finished sequence per example,
    shape [batch, max_decode_len].
  """
  mesh = jax.sharding.Mesh(devices, ("devices",))
  replicate_sharding = jax.sharding.NamedSharding(mesh, P())
  out_sharding = jax.sharding.NamedSharding(
      mesh, P() if replicate_out else P("devices")
  )

  # Prefill the model cache and generate logits for first token.
  logits, cache = jax.jit(
      _prefill_cache,
      out_shardings=out_sharding,
      static_argnames=("model", "max_decode_len"),
  )(
      train_state["params"],
      {
          "image": batch["image"],
          "text": batch["text"],
          "mask_input": batch["mask_input"],
          "mask_ar": batch["mask_ar"],
      },
      model=model,
      max_decode_len=max_decode_len,
  )

  # Mask indicating real examples. False if example is used to pad the batch.
  mask = batch["_mask"]

  beam_sample_output = jax.jit(
      _beam_sample_output,
      static_argnames=("max_decode_len", "beam_size", "eos_token"),
  )
  beam_early_stop = jax.jit(
      _beam_early_stop,
      out_shardings=replicate_sharding,
      static_argnames=("eos_token",),
  )
  extend_cache = jax.jit(
      _extend_cache,
      donate_argnums=1,
      static_argnames=("model",),
  )

  # Keep sampling tokens from last logits until EOS or max_decode_len.
  state = None
  for idx in range(max_decode_len):
    # One beam step: expand candidates, keep the best, update the cache
    # layout to match the surviving candidates.
    tokens, state, cache = beam_sample_output(
        state, logits, cache,
        max_decode_len=max_decode_len, beam_size=beam_size, eos_token=eos_token)

    early_stop = beam_early_stop(state, mask, eos_token=eos_token)
    if jax.device_get(early_stop) or (idx + 1 >= max_decode_len):
      break

    # Compute logits for next token
    logits, cache = extend_cache(
        train_state["params"], cache, tokens, model=model)

  return jax.jit(_beam_make_output, out_shardings=out_sharding)(state)
def _beam_early_stop(state, mask, *, eos_token):
  """True when no live beam candidate can still beat the best finished one.

  Fixes: `_compute_score` on `best_tokens` ([b, 1, l]) yields best_scores of
  shape [b, 1] while live_scores is [b], so the original comparison
  broadcast to [b, b] and compared every example's live score against every
  OTHER example's best score. Reducing over the candidate axis makes the
  check per-example as intended. `eos_token` is also keyword-only now, for
  consistency with `_decode_early_stop` (the caller already passes it by
  keyword).

  Args:
    state: beam state `(best_tokens, best_logp, seqlen, tokens, logp)`.
    mask: bool [b]; False marks padded examples, which never block stopping.
    eos_token: token id that finalizes a sequence.

  Returns:
    Scalar bool: all real examples are done.
  """
  (best_tokens, best_logp, seqlen, unused_tokens, logp) = state
  # Score of the best finalized sequence per example ([b, 1] -> [b]).
  best_scores = _compute_score(best_tokens, best_logp, eos_token)
  best_scores = jnp.max(best_scores, axis=1)
  # Upper bound on live candidates' scores: logp is non-positive, so a live
  # sequence's score can only decrease as it grows.
  live_mask = jnp.arange(logp.shape[-1])[None, None] < seqlen
  live_scores = jnp.sum(logp * live_mask, axis=-1)
  live_scores = jnp.max(live_scores, axis=1)
  done = live_scores < best_scores
  return jnp.all(jnp.logical_or(done, jnp.logical_not(mask)))
def _beam_make_output(state):
(best_tokens, *_) = state
return best_tokens[:, 0, ...]
def _beam_sample_output(state, logits, cache, *,
                        beam_size, max_decode_len, eos_token):
  """One beam-search step: expand candidates and keep the top `beam_size`.

  Args:
    state: None on the first call; afterwards the tuple
      (best_tokens, best_logp, seqlen, tokens, logp) produced by this fn.
    logits: [num_candidates, 1, vocab] logits at the last position.
    cache: decoding cache; leading axis is the flat candidate axis.
    beam_size: number of live candidates to keep per example.
    max_decode_len: capacity of the per-sequence token/logp buffers.
    eos_token: token id that finalizes a sequence.

  Returns:
    (sampled_tokens, new_state, cache) with sampled_tokens and cache back in
    the flat [(example, candidate), ...] layout expected by `_extend_cache`.
  """
  assert logits.shape[1] == 1
  logits = jax.nn.log_softmax(logits[:, 0, :])  # Normalize logits

  if state is None:
    bs = logits.shape[0]
    # Beam decode state keeps track of:
    # A) Best sampled output for each example. At initialization these have
    #    shape[1]=0, but end up with shape[1]=1 after first call.
    best_tokens = jnp.zeros((bs, 0, max_decode_len), dtype=jnp.int32)
    best_logp = jnp.zeros((bs, 0, max_decode_len), dtype=logits.dtype)
    # B) N candidate sequences for each example. At initialization these have
    #    beam_size=1, but end up with correct beam_size when expanded.
    seqlen = jnp.zeros((bs, 1, 1), dtype=jnp.int32)
    tokens = jnp.zeros((bs, 1, max_decode_len), dtype=jnp.int32)
    logp = jnp.zeros((bs, 1, max_decode_len), dtype=logits.dtype)
  else:
    (best_tokens, best_logp, seqlen, tokens, logp) = state
    bs = logits.shape[0] // beam_size
    assert best_tokens.shape[0] == bs

  # Reshape cache to [example, candidate, ...].
  # Note: on first call the number of candidates is 1. Later it is beam_size.
  cache, logits = jax.tree.map(
      lambda x: einops.rearrange(x, "(b n) ... -> b n ...", b=bs),
      (cache, logits))

  # Consider a live sequence could end now and update the best finished
  # sequences so far for each example. This strategy is found in some beam
  # implementations such as in praxis.
  # The code below also adjusts the best shape[1]=0 -> 1 during first call.
  eos_tokens = jnp.array(eos_token)[None, None, None]
  new_tokens = _put_along_last_axis(tokens, seqlen, eos_tokens)
  new_logp = _put_along_last_axis(logp, seqlen, logits[:, :, eos_token, None])
  best_tokens = jnp.concatenate([best_tokens, new_tokens], axis=1)
  best_logp = jnp.concatenate([best_logp, new_logp], axis=1)
  # Keep only the single highest-scoring finished candidate per example.
  best_scores = _compute_score(best_tokens, best_logp, eos_token=eos_token)
  _, top_indices = jax.lax.top_k(best_scores, k=1)
  best_tokens = jnp.take_along_axis(best_tokens, top_indices[..., None], axis=1)
  best_logp = jnp.take_along_axis(best_logp, top_indices[..., None], axis=1)

  # To find the next best N live candidates we expand each candidate and keep
  # the best N (ignoring EOS tokens). In this case we expand into (N+1)
  # candidates and set their likelihood to "-inf" (if EOS) after the fact.
  live_mask = jnp.arange(logp.shape[-1])[None, None] < seqlen
  live_scores = jnp.sum(logp * live_mask, axis=-1)
  topk_logits, topk_tokens = jax.lax.top_k(logits, beam_size+1)
  scores = live_scores[..., None] + topk_logits
  scores = jnp.where(
      topk_tokens != eos_token, scores, jnp.finfo(scores.dtype).min)

  # From the N*(N+1) candidates find the top N for each example.
  topk_logits, topk_tokens, scores = jax.tree.map(
      lambda x: einops.rearrange(x, "b n np1 -> b (n np1)"),
      (topk_logits, topk_tokens, scores))
  _, topk_indices = jax.lax.top_k(scores, k=beam_size)
  # Integer-divide back to the parent candidate each winner expanded from.
  sampled_indices = topk_indices // (beam_size+1)
  sampled_tokens = jnp.take_along_axis(
      topk_tokens, topk_indices, axis=-1)[..., None]
  sampled_logits = jnp.take_along_axis(
      topk_logits, topk_indices, axis=-1)[..., None]

  # Adjust cache and state so it matches the selected top N input candidates.
  # This also adjusts the beam_size=1->n during first call.
  def take_candidates(x):
    # Gather via one-hot matmul so it works for arbitrary trailing dims.
    one_hot_matrix = jax.nn.one_hot(sampled_indices, x.shape[1], dtype=x.dtype)
    return jnp.einsum("bi...,boi->bo...", x, one_hot_matrix)
  cache, seqlen, tokens, logp = jax.tree.map(
      take_candidates, (cache, seqlen, tokens, logp))

  # Write the sampled tokens/logits on the reshuffled state.
  tokens = _put_along_last_axis(tokens, seqlen, sampled_tokens)
  logp = _put_along_last_axis(logp, seqlen, sampled_logits)
  seqlen = seqlen + 1

  state = (best_tokens, best_logp, seqlen, tokens, logp)

  # Reshape to [(example, candidate), ...].
  sampled_tokens, cache = jax.tree.map(
      lambda x: einops.rearrange(x, "b n ... -> (b n) ..."),
      (sampled_tokens, cache))

  return sampled_tokens, state, cache