# Copyright 2024 Big Vision Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Autorgregressive sampler for GIVT."""

import functools
from typing import Any, Optional

from big_vision.models.proj.givt import parallel_decode
import flax
from flax import linen as nn
import jax
from jax import lax
from jax import numpy as jnp
import ml_collections


def _sample_gmm(
    gmm_pdf,
    *,
    rng,
    cfg_inference_weight=None,
    gmm_pdf_uncond=None,
):
  """Draw a single sample from a GMM."""
  if cfg_inference_weight is not None:
    assert gmm_pdf_uncond is not None
    gmm_pdf = parallel_decode.CFGDensity(
        gmm_pdf, gmm_pdf_uncond, w=cfg_inference_weight, rng=rng
    )
  samples = gmm_pdf.sample(seed=rng)
  logprobs = gmm_pdf.log_prob(samples)
  if logprobs.ndim == 2:
    logprobs = logprobs[..., None]
  return samples, logprobs
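
# A minimal sketch of how the CFG sampling path is invoked in `generate` below
# (`pdf_cond` and `pdf_uncond` are illustrative names for the conditional and
# unconditional densities returned by `model.get_pdf`):
#
#   tokens, logprobs = _sample_gmm(
#       pdf_cond, rng=rng, cfg_inference_weight=0.4, gmm_pdf_uncond=pdf_uncond)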


# Beam search reshaping utils
def _flatten_samples_dim(x):
  """Flattens samples dimension into batch dimension."""
  if x.ndim == 0:  # ignore scalars (e.g. cache index)
    return x
  return x.reshape((x.shape[0] * x.shape[1],) + x.shape[2:])


def _unflatten_samples_dim(x, batch_size, num_samples):
  """Unflattens first dimension into batch and samples dimensions."""
  if x.ndim == 0:  # ignore scalars (e.g. cache index)
    return x
  assert batch_size * num_samples == x.shape[0]
  return x.reshape((batch_size, num_samples) + x.shape[1:])


def _cache_map(fn, cache, scan=False):
  """Maps function over cache."""
  if scan:
    # Assuming the cache is scanned over the first dimension, we apply the map
    # function over this dimension for scanned models.
    fn_mod = lambda x: jax.lax.map(fn, x) if x.ndim > 0 else fn(x)
  else:
    fn_mod = fn

  frozen = isinstance(cache, flax.core.FrozenDict)
  if frozen:
    cache = flax.core.unfreeze(cache)
  flat_cache = flax.traverse_util.flatten_dict(cache)
  # Exclude cached relative position bias from beam expansion, etc.
  keyvals = {k: v for k, v in flat_cache.items() if k[-1] != "cached_bias"}
  keyvals = jax.tree_map(fn_mod, keyvals)
  flat_cache.update(keyvals)
  new_cache = flax.traverse_util.unflatten_dict(flat_cache)
  if frozen:
    new_cache = flax.core.freeze(new_cache)
  return new_cache


@flax.struct.dataclass
class LoopState:
  """Internal state of the sampling loop."""
  # Terminology
  # b:  batch size
  # nb: number of beams
  # nf: number of fans
  # s:  sequence length
  # d:  feature dimension
  rng: jnp.ndarray        # PRNGKey of the loop state.
  cache: Any              # Cache for fast auto-regressive decoding.
  sequences: jnp.ndarray  # (b * nb, s, d)
  logprobs: jnp.ndarray   # (b * nb, s, d)
  cache_u: Any            # Uncond cache if cfg, otherwise None


def _create_cache(
    labels,
    model,
    init_sequence,
    params,
    encoded,
    uncond=False,
):
  """Creates the cache and returns initial logits."""
  if uncond:
    assert labels is not None  # Need labels for CFG!
    drop_labels = jnp.ones((labels.shape[0],), dtype=jnp.bool_)
  else:
    drop_labels = None

  def init_cache(model):
    return model.decode(
        init_sequence, labels, encoded, decode=True, drop_labels=drop_labels
    )

  # Initialize the autoregressive decoding cache by running decode once with
  # decode=True; the mutable "cache" collection holds the initialized cache.
  cache = nn.apply(init_cache, model, mutable=True)(params)[1]["cache"]

  def prefill_cache(model):
    return model.prefill(
        labels, init_sequence.shape[0], encoded, drop_labels=drop_labels
    )

  # prefill class label or BOS token
  prefill_logits, aux = nn.apply(prefill_cache, model, mutable=True)(
      {"params": params["params"], "cache": cache})
  cache = aux["cache"]
  return cache, prefill_logits


def generate(
    params: Any,
    seed: jax.Array,
    *,
    model: nn.Module,
    seq_len: int,
    feature_dim: int,
    labels: Optional[jnp.ndarray] = None,
    cond_image: Optional[jnp.ndarray] = None,
    batch_size: Optional[int] = None,
    config: Optional[ml_collections.ConfigDict] = None,
) -> tuple[jax.Array, jax.Array]:
  """Sampling loop for GIVT."""
  if model.style != "ar":  # pytype: disable=wrong-arg-types
    raise ValueError(f"Invalid style: {model.style}")
  if model.has_encoder != (cond_image is not None):
    raise ValueError("Need cond_image if and only if the model has an encoder!")

  assert labels is not None or batch_size, (
      "Please provide either labels or batch_size.")

  config = config or {}
  config = dict(config)  # copy

  # For sampling, we support a bool mask keep_gt and ground-truth tokens gt,
  # which are used in place of sampled tokens wherever the mask is set.
  keep_gt = config.pop("keep_gt", None)
  gt = config.pop("gt", None)

  if isinstance(seed, int):
    seed = jax.random.PRNGKey(seed)

  beam_size = config.pop("beam_size", 1)
  fan_size = config.pop("fan_size", 1)

  if labels is not None:
    batch_size = labels.shape[0]
    # fold beams into batch dimension
    labels = labels.repeat(beam_size, axis=0)

  # initialize sequence and logprobs (we track per feature dim logprobs)
  init_sequence = jnp.zeros((batch_size * beam_size, seq_len, feature_dim))
  init_logprobs = jnp.zeros_like(init_sequence)

  if cond_image is not None:
    # embed conditioning image if provided
    def encode_cond_img(model, cond_img):
      return model.encode(cond_img)
    encoded = nn.apply(encode_cond_img, model)(params, cond_image)
    encoded = jnp.repeat(encoded, beam_size, axis=0)
  else:
    encoded = None

  cache, prefill_logits = _create_cache(
      labels, model, init_sequence, params, encoded
  )

  # Classifier-free guidance (CFG); a weight of 0.0 (or None) disables it.
  cfg_inference_weight = config.pop("cfg_inference_weight", None)
  if cfg_inference_weight == 0.0:
    cfg_inference_weight = None
  cfg = cfg_inference_weight is not None

  get_pdf = functools.partial(
      model.get_pdf,
      temperature_scales=config.pop("temp", None),
      temperature_probs=config.pop("temp_probs", None),
  )

  # setup sampling function
  sample = functools.partial(
      _sample_gmm, cfg_inference_weight=cfg_inference_weight
  )

  # draw first output token
  pdf_first = get_pdf(prefill_logits)
  rng_first, rng = jax.random.split(seed)

  if cfg:
    assert beam_size == 1 and fan_size == 1  # CFG + Beam not supported.
    cache_u, prefill_logits_u = _create_cache(
        labels, model, init_sequence, params, encoded, uncond=True
    )
    pdf_first_u = get_pdf(prefill_logits_u)
  else:
    cache_u = None
    pdf_first_u = None

  tokens_first, logprobs_first = sample(
      pdf_first, rng=rng_first, gmm_pdf_uncond=pdf_first_u
  )
  init_sequence = init_sequence.at[:, 0].set(tokens_first.squeeze(axis=1))
  init_logprobs = init_logprobs.at[:, 0].set(logprobs_first.squeeze(axis=1))

  def tokens_to_logits(tokens, cache, uncond=False):
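    """Runs one decode step, mapping current tokens to logits and new cache."""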
    if uncond:
      drop_labels = jnp.ones((labels.shape[0],), dtype=jnp.bool_)
    else:
      drop_labels = None

    def decode_step(model, tokens):
      return model.decode(tokens, labels, encoded,
                          decode=True, drop_labels=drop_labels)

    logits, aux = nn.apply(decode_step, model, mutable=True)(
        {"params": params["params"], "cache": cache}, tokens)
    return logits, aux["cache"]

  init_state = LoopState(
      cache=cache,
      sequences=init_sequence,  # (b * nb, s, d)
      logprobs=init_logprobs,   # (b * nb, s, d)
      rng=rng,
      cache_u=cache_u,
  )

  rand_top_k = config.pop("rand_top_k", False)
  rand_top_k_temp = config.pop("rand_top_k_temp", 1.0)

  assert not config, f"Sampling config is expected to be empty: {config}"

  def sampling_iteration(i, state):
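    """Samples the token at position i + 1 for all beams and updates state."""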
    rng_sampling, rng_local = jax.random.split(state.rng)
    cur_tokens = state.sequences[:, i][:, None]
    # (b * nb, d)
    cur_logits, cache = tokens_to_logits(cur_tokens, state.cache)

    # (b, nb, d)
    cur_logits = _unflatten_samples_dim(
        cur_logits, batch_size, beam_size).squeeze(axis=2)

    # (b, nb * nf, d)
    cur_pdf = get_pdf(cur_logits.repeat(fan_size, axis=1))

    if cfg:
      cur_logits_u, cache_u = tokens_to_logits(
          cur_tokens, state.cache_u, uncond=True
      )
      cur_logits_u = _unflatten_samples_dim(
          cur_logits_u, batch_size, beam_size).squeeze(axis=2)
      cur_pdf_u = get_pdf(cur_logits_u.repeat(fan_size, axis=1))
      new_tokens, new_logprobs = sample(
          cur_pdf, rng=rng_sampling, gmm_pdf_uncond=cur_pdf_u
      )
    else:
      new_tokens, new_logprobs = sample(cur_pdf, rng=rng_sampling)
      cache_u = None

    if gt is not None:
      assert keep_gt is not None
      new_tokens = jnp.where(keep_gt[i], gt[:, i, :][:, None], new_tokens)

    # Skip beam search if not needed
    if beam_size == fan_size == 1:
      sampled_tokens = new_tokens.squeeze(axis=1)
      sequences = state.sequences.at[:, i + 1].set(sampled_tokens)
      return LoopState(
          cache=cache,
          rng=rng_local,
          sequences=sequences,
          logprobs=state.logprobs,
          cache_u=cache_u,
      )

    # Beam search: accumulate per-feature log-probabilities along the sequence
    # and score each (beam, fan) candidate by their sum over the feature dim.
    # (b, nb, s, d)
    logprobs = _unflatten_samples_dim(state.logprobs, batch_size, beam_size)
    cur_logprobs = logprobs[:, :, i]  # (b, nb, d)
    # (b, nb * nf, d)
    new_logprobs = new_logprobs + cur_logprobs.repeat(fan_size, axis=1)
    beam_logprobs = new_logprobs.sum(axis=-1)  # (b, nb * nf)

    if rand_top_k:
      # Stochastic top-k: draw beam_size candidates without replacement from a
      # softmax over the beam scores, instead of taking the deterministic top-k.
      def stoc_top_k(r, x, p):
        return jax.random.choice(r, x, shape=(beam_size,), replace=False, p=p)
      # construct index grid
      index_grid = jnp.arange(beam_logprobs.shape[1], dtype=jnp.int32)
      # (b, nb * nf)
      index_grid = index_grid[None].repeat(beam_logprobs.shape[0], axis=0)
      top_k_rng, rng_local = jax.random.split(rng_local)
      top_k_rng = jax.random.split(top_k_rng, beam_logprobs.shape[0])
      # vmap categorical sampling
      top_beam_fan_indices = jax.vmap(stoc_top_k, in_axes=(0, 0, 0))(
          top_k_rng,
          index_grid,
          nn.softmax(beam_logprobs / rand_top_k_temp, axis=-1))
    else:
      _, top_beam_fan_indices = lax.top_k(beam_logprobs, k=beam_size)  # (b, nb)

    # Candidate indices enumerate (beam, fan) pairs; integer division by the
    # fan size recovers the parent beam index for gathering beam state.
    top_beam_indices = top_beam_fan_indices // fan_size

    def _gather_beams(x):
      if x.ndim == 0:
        return x
      # checkify.check(jnp.all(top_beam_indices < x.shape[1]),
      #                f"`take_along_axis` out of bounds in `_gather_beams`: "
      #                f"{top_beam_indices.max()} vs. {x.shape[1]}")
      # (b, nb, 1 ... 1)
      expanded_indices = top_beam_indices.reshape(
          top_beam_indices.shape + (1,) * (x.ndim - 2))
      return jnp.take_along_axis(x, expanded_indices, axis=1)

    def _gather_tokens(x):
      # (b, nb * nf, d) -> (b, nb, d)
      # checkify.check(jnp.all(top_beam_fan_indices < x.shape[1]),
      #                f"`take_along_axis` out of bounds in `_gather_tokens`: "
      #                f"{top_beam_fan_indices.max()} vs. {x.shape[1]}")
      return jnp.take_along_axis(x, top_beam_fan_indices[..., None], axis=1)
    # (b, nb, s, d)
    sequences = _unflatten_samples_dim(state.sequences, batch_size, beam_size)
    sequences = _gather_beams(sequences)  # (b, nb, s, d)
    sequences = sequences.at[:, :, i + 1].set(_gather_tokens(new_tokens))
    # (b, nb, s, d)
    sequences = _flatten_samples_dim(sequences)

    logprobs = _gather_beams(logprobs)
    logprobs = logprobs.at[:, :, i + 1].set(_gather_tokens(new_logprobs))
    logprobs = _flatten_samples_dim(logprobs)

    # The cache stores activations per flattened (batch * beam) sample, so it
    # must be reordered to follow the surviving beams as well.
    scanned_cache = getattr(model, "scan", False)
    cache = _cache_map(
        lambda x: _unflatten_samples_dim(x, batch_size, beam_size),
        cache, scanned_cache)
    cache = _cache_map(_gather_beams, cache, scanned_cache)
    cache = _cache_map(_flatten_samples_dim, cache, scanned_cache)

    if cfg:
      assert cache_u is not None
      cache_u = _cache_map(
          lambda x: _unflatten_samples_dim(x, batch_size, beam_size),
          cache_u, scanned_cache
      )
      cache_u = _cache_map(_gather_beams, cache_u, scanned_cache)
      cache_u = _cache_map(_flatten_samples_dim, cache_u, scanned_cache)
    else:
      assert cache_u is None

    return LoopState(
        cache=cache,
        rng=rng_local,
        sequences=sequences,
        logprobs=logprobs,
        cache_u=cache_u,
    )

  final_state = lax.fori_loop(0, seq_len, sampling_iteration, init_state)
  final_logprobs = final_state.logprobs[::beam_size][:, -1].sum(axis=-1)

  # return top beams and corresponding log probs
  return final_state.sequences[::beam_size], final_logprobs
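

# Example usage (a minimal sketch; `givt_model` and `restored_params` are
# assumed to be an "ar"-style GIVT model without an encoder and its restored
# parameters; the config values are illustrative):
#
#   samples, logprobs = generate(
#       restored_params,
#       jax.random.PRNGKey(0),
#       model=givt_model,
#       seq_len=256,
#       feature_dim=16,
#       labels=jnp.zeros((4,), dtype=jnp.int32),
#       config={"temp": 0.95, "cfg_inference_weight": 0.4},
#   )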