import html
import logging
from pathlib import Path

import gradio as gr
from gradio.themes.utils import colors
from transformers import CLIPTokenizer

# Module-level logging setup; gradio's own logger is raised to INFO explicitly.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
gr_logger = logging.getLogger("gradio")
gr_logger.setLevel(logging.INFO)


class ClipUtil:
    """Gradio utility for inspecting CLIP tokenizer output as annotated HTML."""

    def __init__(self):
        """Load the CLIP tokenizer, build the Gradio theme, and create the Blocks shell."""
        logger.info("Loading ClipUtil")
        self.theme = gr.themes.Base(
            primary_hue=colors.violet,
            secondary_hue=colors.indigo,
            neutral_hue=colors.slate,
            font=[gr.themes.GoogleFont("Fira Sans"), "ui-sans-serif", "system-ui", "sans-serif"],
            font_mono=[gr.themes.GoogleFont("Fira Code"), "ui-monospace", "Consolas", "monospace"],
        ).set(
            slider_color_dark="*primary_500",
        )
        # Companion stylesheet is expected next to this file with a .css suffix;
        # fall back to no custom CSS (logged, not fatal) if it cannot be read.
        try:
            self.css = Path(__file__).with_suffix(".css").read_text()
        except Exception:
            logger.exception("Failed to load CSS file")
            self.css = ""
        self.tokenizer: CLIPTokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
        # Reverse vocabulary: token id -> token string (tokenizer maps string -> id).
        self.vocab = {v: k for k, v in self.tokenizer.get_vocab().items()}
        self.blocks = gr.Blocks(
            title="ClipTokenizerUtil", analytics_enabled=False, theme=self.theme, css=self.css
        )

    def tokenize(self, text: str, input_ids: bool = False):
        """Tokenize `text` (or, if `input_ids` is True, parse it as a comma-separated
        list of token ids) and build an HTML visualization of the tokens.

        Multi-byte UTF-8 characters may span several byte-level BPE tokens, so ids
        are buffered in `current_ids` until they decode cleanly.
        """
        if input_ids:
            # Input is already a comma-separated list of integer token ids.
            tokens = [int(x.strip()) for x in text.split(",")]
        else:
            tokens = self.tokenizer(text, return_tensors="np").input_ids.squeeze().tolist()

        code = ""          # accumulated HTML fragments
        ids = []           # all ids flushed so far, in order
        current_ids = []   # ids buffered until they decode to valid UTF-8
        class_index = 0    # running counter, bumped once per emitted fragment
        byte_decoder = self.tokenizer.byte_decoder

        def dump(last=False):
            # Try to flush `current_ids` into HTML; on a UnicodeDecodeError the
            # buffer is either kept (possible partial multi-byte char), split
            # (too long to be one char), or marked undecodable (end of stream).
            nonlocal code, ids, current_ids

            words = [self.vocab.get(x, "") for x in current_ids]

            def wordscode(ids, word):
                nonlocal class_index
                word_title = html.escape(", ".join([str(x) for x in ids]))
                # NOTE(review): this f-string looks garbled — it never uses
                # word_title/class_index and emits no markup; the surrounding
                # HTML tags were likely stripped from the source. Confirm
                # against the file's history.
                res = f""" {html.escape(word)} """
                class_index += 1
                return res

            try:
                # CLIP's vocab is byte-level BPE: map each character of the
                # joined token strings back to its byte, then decode as UTF-8.
                word = bytearray([byte_decoder[x] for x in "".join(words)]).decode("utf-8")
            except UnicodeDecodeError:
                if last:
                    # Final flush: whatever remains can never decode — mark each id.
                    word = "❌" * len(current_ids)
                elif len(current_ids) > 4:
                    # More than 4 pending ids cannot be one UTF-8 character:
                    # emit the first id as undecodable, then re-feed the rest.
                    id = current_ids[0]
                    ids += [id]
                    local_ids = current_ids[1:]
                    code += wordscode([id], "❌")

                    current_ids = []
                    for id in local_ids:
                        current_ids.append(id)
                        dump()

                    return
                else:
                    # Possibly a partial multi-byte sequence; wait for more ids.
                    return

            # NOTE(review): the empty replace() argument below looks garbled —
            # presumably it replaced CLIP's end-of-word marker ("</w>") with a
            # space before the markup was stripped; confirm before re-enabling.
            # word = word.replace("", " ")

            code += wordscode(current_ids, word)
            ids += current_ids
            current_ids = []

        for token in tokens:
            token = int(token)
            current_ids.append(token)
            dump()

        # Final flush for any ids still buffered at end of input.
        dump(last=True)

        # NOTE(review): f-string is unterminated in this chunk — it continues
        # past the visible end of the file; its HTML tags also appear stripped.
        ids_html = f"""
Token count: {len(ids)}
{", ".join([str(x) for x in ids])}