File size: 6,703 Bytes

74e8f2f

# Copyright 2022 Big Vision Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Preprocessing functions for CLIP with Pixels Only (CLIPPO)."""
from absl import logging
from  big_vision.pp import utils
from big_vision.pp.registry import Registry
import numpy as np
import tensorflow as tf


@Registry.register("preprocess_ops.render_unifont")
@utils.InKeyOutKey(indefault="texts", outdefault="image")
def get_pp_render_text(image_size: int, font_size: int = 16, max_chars=768,
                       background_brightness=127, text_brightness=0,
                       lower=True, monospace=False, spacing=1, min_width=4,
                       resize_method="area"):
  """Renders text as image, using binary Unifont.

  Largely based on Jeffrey Sorensen's text rendering implementation.

  Args:
    image_size: Width/height of output image.
    font_size: Font size to use. Recommended to leave at 16, as this requires
      no resizing, and is safe.
    max_chars: Maximum number of input characters to render, to make
      rendering faster. A falsy value (0 or None) disables truncation.
    background_brightness: Scalar grayscale brightness of background pixels;
      the result is cast to uint8, so it should lie in [0, 255].
    text_brightness: Scalar grayscale brightness of text pixels; the result is
      cast to uint8, so it should lie in [0, 255].
    lower: whether to lowercase.
    monospace: if False, text characters are horizontally trimmed according to
      `spacing` and `min_width` args.
    spacing: # pixels between each letter.
    min_width: Minimum width of each letter. Useful to make sure e.g. spaces and
      full stops aren't collapsed to nothing.
    resize_method: resize method to use if font_size != 16.

  Returns:
    Function which renders text as an image: it takes a string tensor (only
    its first element is rendered) and returns a uint8 tensor of shape
    (image_size, image_size, 3).
  """
  # One row per code point; each glyph is stored as 32 bytes, i.e. 16 pixel
  # rows of 2 bytes (16 bits) each.
  bit_embedding = np.zeros((0x200000, 32), dtype=np.uint8)
  # Maps the number of hex digits in a glyph line to the byte columns it fills.
  # 64 digits: full-width 16x16 glyph, fills all 32 bytes.
  # 32 digits: half-width 8x16 glyph, fills only the first byte of each pixel
  # row (the even byte columns); the second byte stays zero.
  colpattern = {64: range(32),
                32: list(range(0, 32, 2))}

  unifont_path = "big_vision/pp/proj/clippo/unifont-9.0.06.hex"
  unifont_upper_path = "big_vision/pp/proj/clippo/unifont_upper-9.0.06.hex"

  # Both files consist of "<hex code point>:<hex bitmap>" lines; the base file
  # uses 4-digit code points and the "upper" file 6-digit ones. Splitting on
  # the colon handles both, and stripping makes the parser independent of the
  # line terminator (missing final newline, CRLF) and tolerant of blank lines.
  for hex_path in (unifont_path, unifont_upper_path):
    with tf.io.gfile.GFile(hex_path) as f:
      for line in f:
        codepoint, sep, hexbits = line.strip().partition(":")
        if not sep:
          continue  # Blank line or no "code:bitmap" separator.
        row = int(codepoint, 16)
        bit_embedding[row, colpattern[len(hexbits)]] = (
            bytearray.fromhex(hexbits))

  params = tf.constant(bit_embedding, dtype=tf.uint8)

  def trim_letter(letter):
    """Crops a single letter horizontally to its inked columns.

    Args:
      letter: 2D tensor (rows, columns) of one rendered letter, where non-zero
        entries are text pixels.

    Returns:
      A RaggedTensor holding the transposed crop (columns first), so that the
      number of rows of the result is the width of the trimmed letter.
    """
    v = tf.reduce_max(letter, axis=0)  # Per-column maximum.
    has_pixels = tf.reshape(tf.where(v), (-1,), name="RS5")
    # Letters without any ink (e.g. a space) have no first/last column; use 0
    # and let `min_width` below give them a non-zero width.
    no_pixels = tf.equal(tf.reduce_max(v), 0)
    first = tf.cond(no_pixels, lambda: tf.constant(0, tf.int64),
                    lambda: has_pixels[0])
    last = tf.cond(no_pixels, lambda: tf.constant(0, tf.int64),
                   lambda: has_pixels[-1])

    first = tf.maximum(first - spacing, 0)
    last = tf.maximum(last + spacing, first + min_width)
    return tf.RaggedTensor.from_tensor(tf.transpose(letter[:, first:last]))

  def to_image(rendered, width, height=None):
    """Makes a nice image from a long string of rendered characters.

    Args:
      rendered: RaggedTensor of letters, each stored columns-first, so that
        `rendered.row_lengths(1)` gives the width of every letter.
      width: Width of the output image in pixels.
      height: Height of the output image in pixels; defaults to `width`.

    Returns:
      2D tensor of static shape (height, width); text that does not fit is
      cropped, and unused area is zero-padded.
    """
    height = height or width
    max_letter_width = tf.reduce_max(rendered.row_lengths(1))
    # Running horizontal position of the end of every letter.
    row_lengths = tf.cast(tf.cumsum(rendered.row_lengths(1)), tf.float32)
    # Usable line width; one letter of slack so no text line exceeds `width`.
    div = tf.cast(width - max_letter_width, tf.float32)  # For rounding errors.
    # Index of the text line every letter is assigned to.
    row_idx = tf.cast(tf.floor(row_lengths / div), tf.int64)
    # Group letter indices by text line, then gather the letters per line.
    row_idx = tf.RaggedTensor.from_value_rowids(tf.range(tf.shape(rendered)[0]),
                                                row_idx)
    trimmed = tf.gather(rendered, row_idx, axis=0)
    # Concatenate the columns of all letters belonging to one text line.
    trimmed = trimmed.merge_dims(1, 2)
    trimmed = trimmed.to_tensor(default_value=0)
    # (lines, columns, rows) -> (lines, rows, columns), then stack the lines
    # vertically into one 2D image.
    trimmed = tf.transpose(trimmed, (0, 2, 1))
    trimmed = tf.reshape(trimmed, (-1, tf.shape(trimmed)[-1]), name="RS4")
    trimmed = trimmed[:height]

    wpad = width - tf.shape(trimmed)[1]
    hpad = height - tf.shape(trimmed)[0]
    padded = tf.pad(trimmed, [[0, hpad], [0, wpad]])
    tf.assert_equal(tf.shape(padded), tf.constant((height, width)))
    # Rows come first, matching the assert above (this matters as soon as
    # height != width).
    return tf.ensure_shape(padded, (height, width))

  def render(text):
    """Renders the first string in `text` as an RGB uint8 image."""
    if lower:
      text = tf.strings.lower(text)
    text = tf.reshape(text, (-1,))[0]
    ids = tf.strings.unicode_decode(text, "UTF-8")
    if max_chars:
      ids = ids[:max_chars]
    embed = tf.nn.embedding_lookup(params, ids)  # Get the letters
    # Each letter is 32 uint8s, but we want binary 16x16 grid.
    # The following does that in a rather hard to parse way: every byte is
    # repeated 8 times, and the k-th copy is shifted right by (7 - k) and
    # masked with 1, which extracts the bits most-significant first.
    vertical = tf.reshape(embed, [1, -1])
    repl = tf.reshape(tf.transpose(tf.tile(vertical, multiples=[8, 1])), [-1])
    ones = tf.ones_like(repl)
    # Running position; only its lowest 3 bits (position within the byte) are
    # used below.
    index = tf.cumsum(ones, exclusive=True)
    sevens = tf.cast(tf.fill(tf.shape(repl), 7), tf.uint8)
    moded = tf.bitwise.bitwise_and(index, sevens)
    # For values in 0..7, `x ^ 7` equals `7 - x`.
    shifted = tf.bitwise.right_shift(repl,
                                     tf.bitwise.bitwise_xor(moded, sevens))
    anded = tf.bitwise.bitwise_and(shifted, ones)
    # And finally, letters; binary, 0 = background, 1 = letter.
    letters = tf.reshape(anded, [tf.shape(ids)[0], 16, 16])

    if font_size != 16:
      logging.warning("The unifont text rendering function is highly optimized "
                      "for font size 16; using font size %i might lead to "
                      "suboptimal rendering and might degrade performance.",
                      font_size)
      letters = tf.image.resize(letters[..., None], (font_size, font_size),
                                method=resize_method, antialias=True)
      letters = tf.squeeze(letters, axis=-1)

    if monospace:
      # Keep full-width letters; only switch to the columns-first layout that
      # `to_image` expects.
      letters = tf.RaggedTensor.from_tensor(tf.transpose(letters, (0, 2, 1)))
    else:
      letters = tf.RaggedTensor.from_tensor(letters)
      signature = tf.RaggedTensorSpec(shape=(None, font_size), ragged_rank=1,
                                      dtype=letters.dtype)
      letters = tf.map_fn(trim_letter, letters, fn_output_signature=signature)

    img = to_image(letters, image_size)[..., None]    # A nice square image.
    # Rescale value range. The factor is negative for dark text on a lighter
    # background, so do the arithmetic in float32 instead of relying on uint8
    # wraparound; values are exact for brightness values in [0, 255].
    img = tf.cast(img, tf.float32)
    img = img * (text_brightness - background_brightness)
    img = img + background_brightness

    return tf.image.grayscale_to_rgb(tf.cast(img, tf.uint8))

  return render