import json
from functools import lru_cache
from typing import Any

import gradio as gr
import pandas as pd
from loguru import logger

from playground_examples import (
    default_tokenizer_name_1,
    default_tokenizer_name_2,
    default_user_input,
    examples,
)
from playground_tokenizers import TokenizerFactory


@lru_cache
def run_tokenization(
    text: str, tokenizer_name: str, color_num: int = 5, add_special_token: bool = False
) -> tuple[list[tuple[str, str]], int, pd.DataFrame]:
    """Tokenize an input text and return the colorized tokens, token count, and table."""
    logger.info(
        "param="
        + json.dumps(
            {"text": text, "tokenizer_type": tokenizer_name}, ensure_ascii=False
        )
    )
    pos_tokens = []
    tokenizer = TokenizerFactory().get_tokenizer(tokenizer_name)
    # Forwarding the flag assumes a Hugging Face-style ``encode`` signature;
    # the default path calls ``encode`` with no extra arguments.
    encoding = (
        tokenizer.encode(text, add_special_tokens=True)
        if add_special_token
        else tokenizer.encode(text)
    )

    table = []
    for idx, token_id in enumerate(encoding):
        decoded_text = tokenizer.decode([token_id])
        # Replace spaces with "⋅" so token boundaries stay visible in the UI.
        decoded_text = decoded_text.replace(" ", "⋅")
        pos_tokens.append((decoded_text, str(idx % color_num)))

        try:
            token = tokenizer.decode([token_id])[0]
        except Exception:
            # Fall back to a reverse vocabulary lookup when decoding a single id fails.
            token = {v: k for k, v in tokenizer.get_vocab().items()}[token_id]

        if isinstance(token, bytes):
            try:
                token_str = token.decode("utf-8")
            except UnicodeDecodeError:
                token_str = token.decode("utf-8", errors="ignore")
                logger.error(
                    f"{idx}: decode_error: {tokenizer_name}, {token} {token_str}"
                )
        elif isinstance(token, str):
            token_str = token
        else:
            logger.error(
                f"{idx}: wrong type for token {token_id} {type(token)} "
                + json.dumps(
                    {"text": text, "tokenizer_type": tokenizer_name},
                    ensure_ascii=False,
                )
            )
            token_str = token

        table.append({"TokenID": token_id, "Text": decoded_text})

    table_df = pd.DataFrame(table)
    logger.info(f"tokenizer_type={tokenizer_name}, Tokens={table[:4]}")
    return pos_tokens, len(encoding), table_df


def tokenize(
    text: str, tokenizer_name: str, color_num: int = 5
) -> tuple[dict[Any, Any], pd.DataFrame]:
    """Tokenize an input text with a single tokenizer."""
    pos_tokens, num_tokens, table_df = run_tokenization(text, tokenizer_name, color_num)
    return gr.update(value=pos_tokens, label=f"Tokens: {num_tokens}"), table_df


def tokenize_pair(
    text: str, tokenizer_name_1: str, tokenizer_name_2: str, color_num: int = 5
):
    """Tokenize the input text with both tokenizers (handler for input_text.change)."""
    pos_tokens_1, table_df_1 = tokenize(
        text=text, tokenizer_name=tokenizer_name_1, color_num=color_num
    )
    pos_tokens_2, table_df_2 = tokenize(
        text=text, tokenizer_name=tokenizer_name_2, color_num=color_num
    )
    return pos_tokens_1, table_df_1, pos_tokens_2, table_df_2


def on_load(url_params: str, request: gr.Request | None = None) -> tuple[str, str, str]:
    """Populate the default text and tokenizer selections on page load."""
    text = default_user_input
    tokenizer_type_1 = default_tokenizer_name_1
    tokenizer_type_2 = default_tokenizer_name_2
    return text, tokenizer_type_1, tokenizer_type_2


get_window_url_params = """
function(url_params) {
    const params = new URLSearchParams(window.location.search);
    url_params = JSON.stringify(Object.fromEntries(params));
    return url_params;
}
"""

all_tokenizer_name = [
    (config.name_display, config.name_or_path)
    for config in TokenizerFactory().all_tokenizer_configs
]

with gr.Blocks() as demo:
    with gr.Row():
        gr.Markdown("## Input Text")
        dropdown_examples = gr.Dropdown(
            sorted(examples.keys()),
            value="Examples",
            type="index",
            allow_custom_value=True,
            show_label=False,
            container=False,
            scale=0,
            elem_classes="example-style",
        )
    user_input = gr.Textbox(
        label="Input Text",
        lines=5,
        show_label=False,
    )
    with gr.Row():
        with gr.Column(scale=6), gr.Group():
            tokenizer_name_1 = gr.Dropdown(all_tokenizer_name, label="Tokenizer 1")
        with gr.Column(scale=6), gr.Group():
            tokenizer_name_2 = gr.Dropdown(all_tokenizer_name, label="Tokenizer 2")
    with gr.Row():
        with gr.Column():
            output_text_1 = gr.HighlightedText(
                show_legend=False, show_inline_category=False
            )
        with gr.Column():
            output_text_2 = gr.HighlightedText(
                show_legend=False, show_inline_category=False
            )
    with gr.Row():
        output_table_1 = gr.Dataframe()
        output_table_2 = gr.Dataframe()

    tokenizer_name_1.change(
        tokenize, [user_input, tokenizer_name_1], [output_text_1, output_table_1]
    )
    tokenizer_name_2.change(
        tokenize, [user_input, tokenizer_name_2], [output_text_2, output_table_2]
    )
    user_input.change(
        tokenize_pair,
        [user_input, tokenizer_name_1, tokenizer_name_2],
        [output_text_1, output_table_1, output_text_2, output_table_2],
        show_api=False,
    )
    dropdown_examples.change(
        lambda example_idx: (
            examples[sorted(examples.keys())[example_idx]]["text"],
            examples[sorted(examples.keys())[example_idx]]["tokenizer_1"],
            examples[sorted(examples.keys())[example_idx]]["tokenizer_2"],
        ),
        dropdown_examples,
        [user_input, tokenizer_name_1, tokenizer_name_2],
        show_api=False,
    )

    demo.load(
        fn=on_load,
        inputs=[user_input],
        outputs=[user_input, tokenizer_name_1, tokenizer_name_2],
        js=get_window_url_params,
        show_api=False,
    )

if __name__ == "__main__":
    demo.launch(share=True)