File size: 1,535 Bytes
98985f3
 
 
bbc0512
24e4af8
 
 
 
 
0f64548
24e4af8
 
 
 
 
 
 
 
 
 
0f64548
 
 
24e4af8
a17b6c0
 
24e4af8
 
 
 
 
 
 
a17b6c0
 
 
98985f3
2789d18
 
24e4af8
e363f01
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
from transformers import AutoTokenizer
import gradio as gr


# Mapping of human-readable model-family labels -> Hugging Face Hub repo ids.
# Each repo id is passed to AutoTokenizer.from_pretrained() in tokenize();
# the label is what gets shown in the output listing.
MODELS = {
    "LLaMa-1/LLaMa-2": "TheBloke/Llama-2-7B-fp16",
    "LLaMa-3": "unsloth/llama-3-8b",
    "Mistral": "mistral-community/Mistral-7B-v0.2",
    "GPT-2/GPT-J": "openai-community/gpt2",
    "GPT-NeoX": "EleutherAI/gpt-neox-20b",
    "Falcon": "tiiuae/falcon-7b",
    "Phi-1/Phi-2": "microsoft/phi-2",
    "Phi-3": "microsoft/Phi-3-mini-4k-instruct",
    "T5": "google/flan-t5-xxl",
    "Gemma": "alpindale/gemma-2b",
    "Command-R": "CohereForAI/c4ai-command-r-plus",
    "Qwen/Qwen1.5": "Qwen/Qwen1.5-7B",
    "CodeQwen": "Qwen/CodeQwen1.5-7B",
    "RWKV-v4": "RWKV/rwkv-4-14b-pile",
    "RWKV-v5/RWKV-v6": "RWKV/v5-EagleX-v2-7B-HF",
    "DeepSeek-LLM": "deepseek-ai/deepseek-llm-7b-base",
    "DeepSeek-V2": "deepseek-ai/DeepSeek-V2"
}


# Cache of already-constructed tokenizers, keyed by HF Hub repo id.
# Without this, every call to tokenize() re-ran from_pretrained() for all
# models in MODELS (network/disk resolution each time), which made each
# Gradio interaction pay the full load cost again.
_TOKENIZER_CACHE = {}


def tokenize(input_text):
    """Count tokens for *input_text* under every tokenizer in MODELS.

    Parameters
    ----------
    input_text : str
        Text to tokenize (special tokens are included in the count).

    Returns
    -------
    str
        One "label: count" line per model, sorted by token count in
        descending order, joined by newlines.
    """
    results = {}
    for model_name, repo_id in MODELS.items():
        tokenizer = _TOKENIZER_CACHE.get(repo_id)
        if tokenizer is None:
            # trust_remote_code=True is needed for repos shipping custom
            # tokenizer code (e.g. DeepSeek-V2); it executes code from the
            # Hub, so only trusted repos should appear in MODELS.
            tokenizer = AutoTokenizer.from_pretrained(
                repo_id, trust_remote_code=True
            )
            _TOKENIZER_CACHE[repo_id] = tokenizer
        token_count = len(
            tokenizer(input_text, add_special_tokens=True)["input_ids"]
        )
        results[model_name] = token_count

    # Sort the results in descending order based on token length
    sorted_results = sorted(results.items(), key=lambda x: x[1], reverse=True)
    return "\n".join(f"{model}: {tokens}" for model, tokens in sorted_results)


if __name__ == "__main__":
    # Wire the tokenize callback into a minimal Gradio UI: one multi-line
    # textbox in (one line per model, roughly), plain text out.
    demo = gr.Interface(
        fn=tokenize,
        inputs=gr.Textbox(label="Input Text", lines=len(MODELS)),
        outputs="text",
    )
    demo.launch()