|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Preprocessing functions for CLIP with Pixels Only (CLIPPO).""" |
|
from absl import logging |
|
from big_vision.pp import utils |
|
from big_vision.pp.registry import Registry |
|
import numpy as np |
|
import tensorflow as tf |
|
|
|
|
|
def _load_unifont_hex(path, code_width, table):
  """Fills `table` with the glyph bitmaps found in one Unifont .hex file.

  Every non-empty line is read as `code_width` hex digits of code point, one
  separator character, and then the glyph bitmap as hex digits.

  Args:
    path: Path of the .hex file; opened with `tf.io.gfile.GFile`.
    code_width: Number of hex digits in the code point field.
    table: uint8 array with 32 columns, indexed by code point; updated in place.
  """
  # A 64-digit bitmap is 32 bytes and fills a whole table row. A 32-digit
  # bitmap is 16 bytes; they are written to the even byte positions and the
  # odd positions stay zero. The row is later unpacked into 16 x 16 bits, so
  # the short form ends up occupying the left half of each pixel row.
  columns = {64: range(32), 32: range(0, 32, 2)}
  with tf.io.gfile.GFile(path) as f:
    for line in f:
      # Strip the line terminator explicitly rather than dropping "the last
      # character": that also copes with CRLF files and with a final line
      # that has no trailing newline.
      line = line.rstrip()
      if not line:
        continue
      row = int(line[:code_width], 16)
      hexbits = line[code_width + 1:]
      table[row, columns[len(hexbits)]] = bytearray.fromhex(hexbits)


@Registry.register("preprocess_ops.render_unifont")
@utils.InKeyOutKey(indefault="texts", outdefault="image")
def get_pp_render_text(image_size: int, font_size: int = 16, max_chars=768,
                       background_brightness=127, text_brightness=0,
                       lower=True, monospace=False, spacing=1, min_width=4,
                       resize_method="area"):
  """Renders text as image, using binary Unifont.

  Largely based on Jeffrey Sorensen's text rendering implementation.

  Both font files are read once, when this function is called; the returned
  function only performs TensorFlow ops on the resulting lookup table.

  Args:
    image_size: Width/height of output image.
    font_size: Font size to use. Recommended to leave at 16, as this requires
      no resizing, and is safe.
    max_chars: Maximum input characters to render, to make faster. A falsy
      value disables the limit.
    background_brightness: Brightness of background pixels; the same value is
      used for all three output channels.
    text_brightness: Brightness of text pixels; the same value is used for all
      three output channels.
    lower: whether to lowercase.
    monospace: if False, text characters are horizontally trimmed according to
      `spacing` and `min_width` args.
    spacing: # pixels between each letter.
    min_width: Minimum width of each letter. Useful to make sure e.g. spaces and
      full stops aren't collapsed to nothing.
    resize_method: resize method to use if font_size != 16.

  Returns:
    Function which renders text as an image. It takes a string tensor (only
    its first element is rendered) and returns a uint8 tensor of shape
    (image_size, image_size, 3).
  """
  # One row of 32 bytes (16 x 16 bits) per code point.
  bit_embedding = np.zeros((0x200000, 32), dtype=np.uint8)
  # The first file uses 4 hex digits per code point, the second one 6.
  _load_unifont_hex(
      "big_vision/pp/proj/clippo/unifont-9.0.06.hex", 4, bit_embedding)
  _load_unifont_hex(
      "big_vision/pp/proj/clippo/unifont_upper-9.0.06.hex", 6, bit_embedding)

  params = tf.constant(bit_embedding, dtype=tf.uint8)

  def trim_letter(letter):
    """Remove white space based on the letter size."""
    # Per-column maximum: non-zero wherever the column contains a pixel.
    v = tf.reduce_max(letter, axis=0)
    has_pixels = tf.reshape(tf.where(v), (-1,), name="RS5")
    no_pixels = tf.equal(tf.reduce_max(v), 0)
    # A glyph without any pixels has no first/last column; fall back to 0.
    first = tf.cond(no_pixels, lambda: tf.constant(0, tf.int64),
                    lambda: has_pixels[0])
    last = tf.cond(no_pixels, lambda: tf.constant(0, tf.int64),
                   lambda: has_pixels[-1])

    first = tf.maximum(first - spacing, 0)
    # `last` is used as an exclusive slice end, so the glyph keeps `spacing`
    # blank columns on the left and `spacing - 1` on the right.
    last = tf.maximum(last + spacing, first + min_width)
    # Transposed so that the ragged (variable-length) axis is the width.
    return tf.RaggedTensor.from_tensor(tf.transpose(letter[:, first:last]))

  def to_image(rendered, width, height=None):
    """Makes a nice square image from a long string of rendered characters."""
    height = height or width
    max_letter_width = tf.reduce_max(rendered.row_lengths(1))
    # Running right edge of every letter if all were laid out on one line.
    row_lengths = tf.cast(tf.cumsum(rendered.row_lengths(1)), tf.float32)
    # Lines are broken every `width - max_letter_width` columns, so a letter
    # straddling a break still fits within `width`.
    div = tf.cast(width - max_letter_width, tf.float32)
    row_idx = tf.cast(tf.floor(row_lengths / div), tf.int64)
    # Group the letter indices by the text line they fall on.
    row_idx = tf.RaggedTensor.from_value_rowids(tf.range(tf.shape(rendered)[0]),
                                                row_idx)
    trimmed = tf.gather(rendered, row_idx, axis=0)
    # Concatenate the letters of each line along the width axis.
    trimmed = trimmed.merge_dims(1, 2)
    trimmed = trimmed.to_tensor(default_value=0)
    trimmed = tf.transpose(trimmed, (0, 2, 1))
    # Stack the text lines on top of each other.
    trimmed = tf.reshape(trimmed, (-1, tf.shape(trimmed)[-1]), name="RS4")
    trimmed = trimmed[:height]

    wpad = width - tf.shape(trimmed)[1]
    hpad = height - tf.shape(trimmed)[0]
    padded = tf.pad(trimmed, [[0, hpad], [0, wpad]])
    tf.assert_equal(tf.shape(padded), tf.constant((height, width)))
    # Rows come first, matching the assertion above.
    return tf.ensure_shape(padded, (height, width))

  def render(text):
    if lower:
      text = tf.strings.lower(text)
    text = tf.reshape(text, (-1,))[0]
    ids = tf.strings.unicode_decode(text, "UTF-8")
    if max_chars:
      ids = ids[:max_chars]
    embed = tf.nn.embedding_lookup(params, ids)

    # Unpack every byte into its 8 bits, most significant bit first: repeat
    # each byte 8 times, then shift the k-th copy right by (7 - k) and mask.
    vertical = tf.reshape(embed, [1, -1])
    repl = tf.reshape(tf.transpose(tf.tile(vertical, multiples=[8, 1])), [-1])
    ones = tf.ones_like(repl)
    index = tf.cumsum(ones, exclusive=True)
    sevens = tf.cast(tf.fill(tf.shape(repl), 7), tf.uint8)
    moded = tf.bitwise.bitwise_and(index, sevens)  # Position modulo 8.
    shifted = tf.bitwise.right_shift(repl,
                                     tf.bitwise.bitwise_xor(moded, sevens))
    anded = tf.bitwise.bitwise_and(shifted, ones)

    # 32 bytes x 8 bits = one 16 x 16 bitmap per character.
    letters = tf.reshape(anded, [tf.shape(ids)[0], 16, 16])

    if font_size != 16:
      logging.warning("The unifont text rendering function is highly optimized "
                      "for font size 16; using font size %i might lead to "
                      "suboptimal rendering and might degrade performance.",
                      font_size)
      letters = tf.image.resize(letters[..., None], (font_size, font_size),
                                method=resize_method, antialias=True)
      letters = tf.squeeze(letters, axis=-1)

    if monospace:
      letters = tf.RaggedTensor.from_tensor(tf.transpose(letters, (0, 2, 1)))
    else:
      letters = tf.RaggedTensor.from_tensor(letters)
      signature = tf.RaggedTensorSpec(shape=(None, font_size), ragged_rank=1,
                                      dtype=letters.dtype)
      letters = tf.map_fn(trim_letter, letters, fn_output_signature=signature)

    img = to_image(letters, image_size)[..., None]
    # Without resizing the bitmap is uint8 and the scale below is negative
    # whenever text is darker than background. Rescale in float32 so the
    # result does not depend on unsigned wraparound.
    img = tf.cast(img, tf.float32)
    img = img * (text_brightness - background_brightness)
    img = img + background_brightness

    return tf.image.grayscale_to_rgb(tf.cast(img, tf.uint8))

  return render
|
|