pranavSIT
/

PaliOpenVocabSegmentation

Model card Files Files and versions Community

PaliOpenVocabSegmentation / big_vision /pp /proj /clippo /pp_ops.py

pranavSIT

added pali inference

74e8f2f 12 months ago

raw

history blame contribute delete

6.7 kB

	# Copyright 2022 Big Vision Authors.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	"""Preprocessing functions for CLIP with Pixels Only (CLIPPO)."""
	from absl import logging
	from big_vision.pp import utils
	from big_vision.pp.registry import Registry
	import numpy as np
	import tensorflow as tf


	@Registry.register("preprocess_ops.render_unifont")
	@utils.InKeyOutKey(indefault="texts", outdefault="image")
	def get_pp_render_text(image_size: int, font_size: int = 16, max_chars=768,
	                       background_brightness=127, text_brightness=0,
	                       lower=True, monospace=False, spacing=1, min_width=4,
	                       resize_method="area"):
	  """Renders text as image, using binary Unifont.

	  Largely based on Jeffrey Sorensen's text rendering implementation.

	  Args:
	    image_size: Width/height of output image.
	    font_size: Font size to use. Recommended to leave at 16, as this requires
	      no resizing, and is safe.
	    max_chars: Maximum input characters to render, to make faster. A falsy
	      value (0 / None) disables the truncation.
	    background_brightness: Scalar gray level of background pixels. It is
	      applied to a single-channel image which is then replicated to RGB.
	    text_brightness: Scalar gray level of text pixels.
	    lower: whether to lowercase.
	    monospace: if False, text characters are horizontally trimmed according to
	      `spacing` and `min_width` args.
	    spacing: # pixels between each letter.
	    min_width: Minimum width of each letter. Useful to make sure e.g. spaces and
	      full stops aren't collapsed to nothing.
	    resize_method: resize method to use if font_size != 16.

	  Returns:
	    Function which renders text as an image.
	  """
	  # One row of 32 bytes (16 rows x 2 bytes, i.e. a 16x16 bitmap) per code point.
	  bit_embedding = np.zeros((0x200000, 32), dtype=np.uint8)
	  # Maps the number of hex digits of a glyph to the byte columns it fills:
	  # 64 digits (32 bytes) fill every column, 32 digits (16 bytes) fill only the
	  # even columns, leaving the odd ones at zero.
	  colpattern = {64: range(32),
	                32: sorted(tuple(range(0, 32, 4)) + tuple(range(2, 32, 4)))}

	  unifont_path = "big_vision/pp/proj/clippo/unifont-9.0.06.hex"
	  unifont_upper_path = "big_vision/pp/proj/clippo/unifont_upper-9.0.06.hex"

	  def load_hex_font(path):
	    """Fills `bit_embedding` from a file of `<hex codepoint>:<hex bitmap>` lines."""
	    with tf.io.gfile.GFile(path) as f:
	      for line in f:
	        # Strip instead of slicing off the last character, so that a final
	        # line without a newline or a "\r\n" line ending is parsed correctly.
	        line = line.strip()
	        if not line:
	          continue
	        # Splitting on ":" handles both 4- and 6-digit code points.
	        codepoint, _, hexbits = line.partition(":")
	        row = int(codepoint, 16)
	        bit_embedding[row, colpattern[len(hexbits)]] = bytearray.fromhex(hexbits)

	  load_hex_font(unifont_path)
	  load_hex_font(unifont_upper_path)

	  params = tf.constant(bit_embedding, dtype=tf.uint8)

	  def trim_letter(letter):
	    """Remove white space based on the letter size."""
	    # Per-column maximum: non-zero wherever the column contains text pixels.
	    v = tf.reduce_max(letter, axis=0)
	    has_pixels = tf.reshape(tf.where(v), (-1,), name="RS5")
	    no_pixels = tf.equal(tf.reduce_max(v), 0)
	    # For an empty glyph there is no column to index, so fall back to 0.
	    first = tf.cond(no_pixels, lambda: tf.constant(0, tf.int64),
	                    lambda: has_pixels[0])
	    last = tf.cond(no_pixels, lambda: tf.constant(0, tf.int64),
	                   lambda: has_pixels[-1])

	    first = tf.maximum(first - spacing, 0)
	    last = tf.maximum(last + spacing, first + min_width)
	    # Transposed so that the ragged (variable-length) axis is the letter width.
	    return tf.RaggedTensor.from_tensor(tf.transpose(letter[:, first:last]))

	  def to_image(rendered, width, height=None):
	    """Makes a nice square image from a long string of rendered characters."""
	    height = height or width
	    max_letter_width = tf.reduce_max(rendered.row_lengths(1))
	    # Cumulative end position (in pixels) of every letter on one long line.
	    row_lengths = tf.cast(tf.cumsum(rendered.row_lengths(1)), tf.float32)
	    # NOTE: `div` is used as a divisor, so `width` has to be larger than the
	    # widest letter for the line breaking below to be meaningful.
	    div = tf.cast(width - max_letter_width, tf.float32)  # For rounding errors.
	    # Index of the text row every letter is assigned to.
	    row_idx = tf.cast(tf.floor(row_lengths / div), tf.int64)
	    row_idx = tf.RaggedTensor.from_value_rowids(tf.range(tf.shape(rendered)[0]),
	                                                row_idx)
	    trimmed = tf.gather(rendered, row_idx, axis=0)
	    # Concatenate the letters of each text row along the width axis.
	    trimmed = trimmed.merge_dims(1, 2)
	    trimmed = trimmed.to_tensor(default_value=0)
	    trimmed = tf.transpose(trimmed, (0, 2, 1))
	    # Stack the text rows on top of each other into a single 2D image.
	    trimmed = tf.reshape(trimmed, (-1, tf.shape(trimmed)[-1]), name="RS4")
	    trimmed = trimmed[:height]

	    wpad = width - tf.shape(trimmed)[1]
	    hpad = height - tf.shape(trimmed)[0]
	    padded = tf.pad(trimmed, [[0, hpad], [0, wpad]])
	    tf.assert_equal(tf.shape(padded), tf.constant((height, width)))
	    # Rows come first: the static shape must agree with the assert above.
	    return tf.ensure_shape(padded, (height, width))

	  def render(text):
	    """Renders the first string in `text` as a uint8 RGB image."""
	    if lower:
	      text = tf.strings.lower(text)
	    text = tf.reshape(text, (-1,))[0]
	    ids = tf.strings.unicode_decode(text, "UTF-8")
	    if max_chars:
	      ids = ids[:max_chars]
	    embed = tf.nn.embedding_lookup(params, ids)  # Get the letters
	    # Each letter is 32 uint8s, but we want binary 16x16 grid.
	    # The following does that in a rather hard to parse way.
	    vertical = tf.reshape(embed, [1, -1])
	    # Repeat every byte 8 times in a row, once per bit to extract.
	    repl = tf.reshape(tf.transpose(tf.tile(vertical, multiples=[8, 1])), [-1])
	    ones = tf.ones_like(repl)
	    index = tf.cumsum(ones, exclusive=True)
	    sevens = tf.cast(tf.fill(tf.shape(repl), 7), tf.uint8)
	    # Position (0..7) of each repeated byte within its group of 8.
	    moded = tf.bitwise.bitwise_and(index, sevens)
	    # Shift by 7 - position, i.e. the most significant bit comes first.
	    shifted = tf.bitwise.right_shift(repl,
	                                     tf.bitwise.bitwise_xor(moded, sevens))
	    anded = tf.bitwise.bitwise_and(shifted, ones)
	    # And finally, letters; binary, 0 = background, 1 = letter.
	    letters = tf.reshape(anded, [tf.shape(ids)[0], 16, 16])

	    if font_size != 16:
	      logging.warning("The unifont text rendering function is highly optimized "
	                      "for font size 16; using font size %i might lead to "
	                      "suboptimal rendering and might degrade performance.",
	                      font_size)
	      letters = tf.image.resize(letters[..., None], (font_size, font_size),
	                                method=resize_method, antialias=True)
	      letters = tf.squeeze(letters, axis=-1)

	    if monospace:
	      # Same (letter, width, height) layout as produced by `trim_letter`.
	      letters = tf.RaggedTensor.from_tensor(tf.transpose(letters, (0, 2, 1)))
	    else:
	      letters = tf.RaggedTensor.from_tensor(letters)
	      signature = tf.RaggedTensorSpec(shape=(None, font_size), ragged_rank=1,
	                                      dtype=letters.dtype)
	      letters = tf.map_fn(trim_letter, letters, fn_output_signature=signature)

	    img = to_image(letters, image_size)[..., None]  # A nice square image.
	    # NOTE(review): for font_size == 16 `img` is still uint8 here, so a negative
	    # scale depends on how TF converts the Python int - confirm before changing.
	    img *= (text_brightness - background_brightness)  # Rescale value range.
	    img += background_brightness

	    return tf.image.grayscale_to_rgb(tf.cast(img, tf.uint8))

	  return render