Frinkleko committed
Commit 73ff676 · verified · 1 Parent(s): 2060e45

add more base models

Files changed (1): app.py +57 -15
app.py CHANGED
@@ -1,32 +1,59 @@
 import gradio as gr
-from transformers import RobertaTokenizer
+from transformers import AutoTokenizer
 import pandas as pd
 import json
 
-
-tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
-
-
-def process_text(text, include_special_tokens=False, show_attention_mask=False):
+def process_text(model_name, text, include_special_tokens=False, show_attention_mask=False):
+    """
+    Processes text using a specified Hugging Face tokenizer model.
+    """
+    try:
+        # Dynamically load the tokenizer based on the selected model name
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+    except Exception as e:
+        return (
+            pd.DataFrame([{"Error": f"Could not load tokenizer for '{model_name}': {e}. Please ensure the model name is correct and accessible (e.g., through Hugging Face Hub or a local path)."}]),
+            "",
+            "",
+        )
 
     encoding = tokenizer(text, return_tensors="np", padding=True, truncation=True)
 
+    # Use tokenizer.tokenize and tokenizer.encode for consistency and general compatibility
     tokens = tokenizer.tokenize(text)
     token_ids = tokenizer.encode(text)
 
+    # Adjust special token handling based on the flag
     if not include_special_tokens:
-        tokens = tokens
-        token_ids = token_ids[1:-1]
+        # Attempt to remove special tokens by decoding and then encoding without special tokens.
+        # This approach aims for a general solution but might behave differently for
+        # tokenizers with complex special token handling or if tokens are meant to be inseparable.
+        try:
+            decoded_text = tokenizer.decode(token_ids, skip_special_tokens=True)
+            token_ids = tokenizer.encode(decoded_text, add_special_tokens=False)
+            tokens = tokenizer.tokenize(decoded_text, add_special_tokens=False)
+        except Exception as e:
+            # Fallback if specific handling fails. It's better to process without removing
+            # special tokens if an error occurs rather than failing the whole process.
+            print(f"Warning: Could not remove special tokens for {model_name}. Error: {e}")
+            # Keep original tokens and IDs which include special tokens
+            tokens = tokenizer.tokenize(text)
+            token_ids = tokenizer.encode(text)
 
     token_info = []
-    for token, token_id in zip(tokens, token_ids):
+    # Ensure tokens and token_ids have matching lengths for zipping
+    min_len = min(len(tokens), len(token_ids))
+    for i in range(min_len):
+        token = tokens[i]
+        token_id = token_ids[i]
         info = {
             "Token": token,
             "ID": token_id,
         }
 
-        if show_attention_mask:
-            info["Attention Mask"] = encoding["attention_mask"][0][len(token_info)]
+        # Check if attention_mask is available and has the correct dimension before accessing
+        if show_attention_mask and encoding["attention_mask"].shape[1] > i:
+            info["Attention Mask"] = encoding["attention_mask"][0][i]
 
         token_info.append(info)
 
@@ -45,14 +72,29 @@ def process_text(text, include_special_tokens=False, show_attention_mask=False):
             "tokens": tokens,
         },
         indent=2,
+        ensure_ascii=False  # Ensure non-ASCII characters are not escaped in JSON
    )
 
    return df, stats, json_output
 
+# Define available models using your specified paths
+model_choices = [
+    "roberta-base",
+    "klue/roberta-large",
+    "distilbert/distilbert-base-uncased",
+    "BAAI/bge-m3-retromae",
+    "DTAI-KULeuven/robbert-2023-dutch-base",
+    "DTAI-KULeuven/robbert-2023-dutch-large",
+]
 
 iface = gr.Interface(
     fn=process_text,
     inputs=[
+        gr.Dropdown(
+            choices=model_choices,
+            value="roberta-base",
+            label="Select Model",
+        ),
         gr.Textbox(
             lines=5, placeholder="Enter text to tokenize...", label="Input Text"
         ),
@@ -66,13 +108,13 @@ iface = gr.Interface(
         gr.Textbox(label="Statistics", lines=4),
         gr.JSON(label="JSON Output"),
     ],
-    title="RoBERTa Tokenizer Playground",
+    title="Hugging Face Tokenizer Playground",
     description="""
-    An interactive demonstration of the RoBERTa tokenizer.
+    An interactive demonstration of various Hugging Face tokenizers.
+    Select a model from the dropdown to see how it tokenizes your input text.
     """,
     theme="default",
 )
 
-
 if __name__ == "__main__":
-    iface.launch(share=True)
+    iface.launch(share=True)
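
For quick reference, below is a minimal standalone sketch of the special-token handling this commit introduces (decode with skip_special_tokens=True, then re-encode with add_special_tokens=False), run against two of the models from the new dropdown. The sample sentence and the printed comparison are illustrative only and are not part of the Space's code.

# Minimal sketch — assumes the `transformers` package is installed and the Hub models are reachable.
from transformers import AutoTokenizer

text = "Tokenizers split text differently."
for name in ["roberta-base", "distilbert/distilbert-base-uncased"]:
    tok = AutoTokenizer.from_pretrained(name)
    ids = tok.encode(text)  # includes special tokens (e.g. <s>/</s> or [CLS]/[SEP])
    # Same strategy as process_text: strip specials by decoding, then re-encoding
    plain = tok.decode(ids, skip_special_tokens=True)
    ids_no_special = tok.encode(plain, add_special_tokens=False)
    print(f"{name}: {len(ids)} ids with specials -> {len(ids_no_special)} without")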