import json

import gradio as gr
import pandas as pd
from transformers import AutoTokenizer


def process_text(model_name, text, include_special_tokens=False, show_attention_mask=False):
    """
    Tokenize the input text with the selected Hugging Face tokenizer and return
    a token table, summary statistics, and a JSON view of the result.
    """
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
    except Exception as e:
        return (
            pd.DataFrame(
                [
                    {
                        "Error": (
                            f"Could not load tokenizer for '{model_name}': {e}. "
                            "Please ensure the model name is correct and accessible "
                            "(e.g., on the Hugging Face Hub or via a local path)."
                        )
                    }
                ]
            ),
            "",
            None,  # gr.JSON cannot parse an empty string, so return None on error
        )
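
    # Tokenize once with numpy tensors so the attention mask can be indexed,
    # and derive the token strings from the ids so tokens and ids stay aligned
    # whether or not special tokens are included.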
    encoding = tokenizer(
        text,
        return_tensors="np",
        padding=True,
        truncation=True,
        add_special_tokens=include_special_tokens,
    )
    token_ids = tokenizer.encode(text, add_special_tokens=include_special_tokens)
    tokens = tokenizer.convert_ids_to_tokens(token_ids)

    token_info = []
    for i, (token, token_id) in enumerate(zip(tokens, token_ids)):
        info = {
            "Token": token,
            "ID": token_id,
        }
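        # The attention mask can be shorter than the id list if the encoding
        # was truncated, so guard the index before reading it.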
        if show_attention_mask and encoding["attention_mask"].shape[1] > i:
            info["Attention Mask"] = encoding["attention_mask"][0][i]
        token_info.append(info)

    df = pd.DataFrame(token_info)

    # Guard against empty input so the ratio does not divide by zero.
    ratio = len(tokens) / len(text) if text else 0.0
    stats = (
        f"Number of tokens: {len(tokens)}\n"
        f"Input text length: {len(text)}\n"
        f"Tokens/character ratio: {ratio:.2f}\n"
        f"Vocabulary size: {tokenizer.vocab_size}"
    )

    json_output = json.dumps(
        {
            "input_ids": token_ids,
            "tokens": tokens,
        },
        indent=2,
        ensure_ascii=False,
    )

    return df, stats, json_output
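

# Example of calling process_text directly (assumes the model can be
# downloaded from the Hugging Face Hub or is already cached locally):
#   df, stats, json_output = process_text("roberta-base", "Hello world!")
#   print(stats)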

model_choices = [
    "roberta-base",
    "klue/roberta-large",
    "distilbert/distilbert-base-uncased",
    "BAAI/bge-m3-retromae",
    "DTAI-KULeuven/robbert-2023-dutch-base",
    "DTAI-KULeuven/robbert-2023-dutch-large",
]
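# Any tokenizer id on the Hugging Face Hub (or a local directory with tokenizer
# files) that AutoTokenizer.from_pretrained can load may be added here.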

iface = gr.Interface(
    fn=process_text,
    inputs=[
        gr.Dropdown(
            choices=model_choices,
            value="roberta-base",
            label="Select Model",
        ),
        gr.Textbox(
            lines=5, placeholder="Enter text to tokenize...", label="Input Text"
        ),
        gr.Checkbox(label="Include Special Tokens", value=False),
        gr.Checkbox(label="Show Attention Mask", value=False),
    ],
    outputs=[
        gr.Dataframe(
            headers=["Token", "ID", "Attention Mask"], label="Tokenization Results"
        ),
        gr.Textbox(label="Statistics", lines=4),
        gr.JSON(label="JSON Output"),
    ],
    title="Hugging Face Tokenizer Playground",
    description=(
        "An interactive demonstration of various Hugging Face tokenizers. "
        "Select a model from the dropdown to see how it tokenizes your input text."
    ),
    theme="default",
)

if __name__ == "__main__":
    iface.launch(share=True)
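# share=True asks Gradio for a temporary public share link in addition to the
# local server; drop it (or pass share=False) to keep the app local-only.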