# Copyright 2022 Big Vision Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Preprocessing functions for CLIP with Pixels Only (CLIPPO).""" from absl import logging from big_vision.pp import utils from big_vision.pp.registry import Registry import numpy as np import tensorflow as tf @Registry.register("preprocess_ops.render_unifont") @utils.InKeyOutKey(indefault="texts", outdefault="image") def get_pp_render_text(image_size: int, font_size: int = 16, max_chars=768, background_brightness=127, text_brightness=0, lower=True, monospace=False, spacing=1, min_width=4, resize_method="area"): """Renders text as image, using binary Unifont. Largely based on Jeffrey Sorensen's text rendering implementation. Args: image_size: Width/height of output image. font_size: Font size to use. Recommended to leave at 16, as this requires no resizing, and is safe. max_chars: Maximum inpute characters to render, to make faster. background_brightness: (r, g, b) of background pixels. text_brightness: (r, g, b) of text pixels. lower: whether to lowercase. monospace: if False, text characters are horizontally trimmed according to `spacing` and `minwidth` args. spacing: # pixels between each letter. min_width: Minimum width of each letter. Useful to make sure e.g. spaces and full stops aren't collapsed to nothing. resize_method: resize method to use if fontsize != 16. Returns: Function which renders text as an image. """ bit_embedding = np.zeros((0x200000, 32), dtype=np.uint8) colpattern = {64: range(32), 32: sorted(tuple(range(0, 32, 4)) + tuple(range(2, 32, 4)))} unifont_path = "big_vision/pp/proj/clippo/unifont-9.0.06.hex" unifont_upper_path = "big_vision/pp/proj/clippo/unifont_upper-9.0.06.hex" with tf.io.gfile.GFile(unifont_path) as f: for line in f: row = int(line[0:4], 16) hexbits = line[5:-1] bit_embedding[row, colpattern[len(hexbits)]] = bytearray.fromhex(hexbits) with tf.io.gfile.GFile(unifont_upper_path) as f: for line in f: row = int(line[0:6], 16) hexbits = line[7:-1] bit_embedding[row, colpattern[len(hexbits)]] = bytearray.fromhex(hexbits) params = tf.constant(bit_embedding, dtype=tf.uint8) def trim_letter(letter): """Remove white space based on the letter size.""" v = tf.reduce_max(letter, axis=0) has_pixels = tf.reshape(tf.where(v), (-1,), name="RS5") no_pixels = tf.equal(tf.reduce_max(v), 0) first = tf.cond(no_pixels, lambda: tf.constant(0, tf.int64), lambda: has_pixels[0]) last = tf.cond(no_pixels, lambda: tf.constant(0, tf.int64), lambda: has_pixels[-1]) first = tf.maximum(first - spacing, 0) last = tf.maximum(last + spacing, first + min_width) return tf.RaggedTensor.from_tensor(tf.transpose(letter[:, first:last])) def to_image(rendered, width, height=None): """Makes a nice square image from a long string of rendered charcaters.""" height = height or width max_letter_width = tf.reduce_max(rendered.row_lengths(1)) row_lengths = tf.cast(tf.cumsum(rendered.row_lengths(1)), tf.float32) div = tf.cast(width - max_letter_width, tf.float32) # For rounding errors. row_idx = tf.cast(tf.floor(row_lengths / div), tf.int64) row_idx = tf.RaggedTensor.from_value_rowids(tf.range(tf.shape(rendered)[0]), row_idx) trimmed = tf.gather(rendered, row_idx, axis=0) trimmed = trimmed.merge_dims(1, 2) trimmed = trimmed.to_tensor(default_value=0) trimmed = tf.transpose(trimmed, (0, 2, 1)) trimmed = tf.reshape(trimmed, (-1, tf.shape(trimmed)[-1]), name="RS4") trimmed = trimmed[:height] wpad = width - tf.shape(trimmed)[1] hpad = height - tf.shape(trimmed)[0] padded = tf.pad(trimmed, [[0, hpad], [0, wpad]]) tf.assert_equal(tf.shape(padded), tf.constant((height, width))) return tf.ensure_shape(padded, (width, height)) def render(text): if lower: text = tf.strings.lower(text) text = tf.reshape(text, (-1,))[0] ids = tf.strings.unicode_decode(text, "UTF-8") if max_chars: ids = ids[:max_chars] embed = tf.nn.embedding_lookup(params, ids) # Get the letters # Each letter is 32 uint8s, but we want binary 16x16 grid. # The following does that in a rather hard to parse way. vertical = tf.reshape(embed, [1, -1]) repl = tf.reshape(tf.transpose(tf.tile(vertical, multiples=[8, 1])), [-1]) ones = tf.ones_like(repl) index = tf.cumsum(ones, exclusive=True) sevens = tf.cast(tf.fill(tf.shape(repl), 7), tf.uint8) moded = tf.bitwise.bitwise_and(index, sevens) shifted = tf.bitwise.right_shift(repl, tf.bitwise.bitwise_xor(moded, sevens)) anded = tf.bitwise.bitwise_and(shifted, ones) # And finally, letters; binary, 0 = background, 1 = letter. letters = tf.reshape(anded, [tf.shape(ids)[0], 16, 16]) if font_size != 16: logging.warning("The unifont text rendering function is highly optimized " "for font size 16; using font size %i might lead to " "suboptimal rendering and might degrade performance.", font_size) letters = tf.image.resize(letters[..., None], (font_size, font_size), method=resize_method, antialias=True) letters = tf.squeeze(letters, axis=-1) if monospace: letters = tf.RaggedTensor.from_tensor(tf.transpose(letters, (0, 2, 1))) else: letters = tf.RaggedTensor.from_tensor(letters) signature = tf.RaggedTensorSpec(shape=(None, font_size), ragged_rank=1, dtype=letters.dtype) letters = tf.map_fn(trim_letter, letters, fn_output_signature=signature) img = to_image(letters, image_size)[..., None] # A nice square image. img *= (text_brightness - background_brightness) # Rescale value range. img += background_brightness return tf.image.grayscale_to_rgb(tf.cast(img, tf.uint8)) return render