|
import os |
|
|
|
import gradio as gr |
|
import tiktoken |
|
|
|
|
|
os.environ["TIKTOKEN_CACHE_DIR"] = "" |
|
|
|
encoding = tiktoken.get_encoding("cl100k_base") |
|
|
|
# Map each user-selectable model name (as shown in the UI radio buttons)
# to the tiktoken encoding it uses.
enc_mapping = {
    "gpt-4": "cl100k_base",
    "gpt-3.5-turbo(chatgpt)": "cl100k_base",
    "text-embedding-ada-002": "cl100k_base",
    "Codex": "p50k_base",
    "text-davinci-002": "p50k_base",
    "text-davinci-003": "p50k_base",
    "gpt3": "r50k_base",
    "gpt2": "r50k_base",
}
|
|
|
|
|
def tokenize(text, model):
    """Encode *text* with the tiktoken encoding for *model*.

    Args:
        text: The input string to tokenize.
        model: A key of ``enc_mapping`` (one of the UI radio choices).

    Returns:
        A ``(count, tokens)`` tuple: the number of tokens and the list of
        token ids produced by the encoder.

    Raises:
        KeyError: If *model* is not present in ``enc_mapping``.
    """
    encoder = tiktoken.get_encoding(enc_mapping[model])
    token_ids = encoder.encode(text)
    return len(token_ids), token_ids
|
|
|
|
|
title = "GPT Token" |
|
description = "This demo uses <a href='https://github.com/openai/tiktoken' target='_blank'>tiktoken</a> to calculate the token number needed for GPT models." |
|
|
|
# Wire the tokenize() function into a two-input / two-output Gradio UI:
# a free-text box plus a model selector in; token count and token id
# sequence out. The radio choices must stay in sync with enc_mapping's keys.
# NOTE(review): `allow_flagging` is deprecated in Gradio 4.x (renamed to
# `flagging_mode`) — confirm the pinned gradio version before upgrading.
iface = gr.Interface(fn=tokenize,

                     inputs=[

                         gr.Textbox(label="input sequence"),

                         gr.Radio(choices=["gpt-4", "gpt-3.5-turbo(chatgpt)", "text-embedding-ada-002", "Codex", "text-davinci-002", "text-davinci-003", "gpt3", "gpt2"], value="gpt-3.5-turbo(chatgpt)", label="model")],

                     outputs=[gr.Textbox(label="token number"), gr.Textbox(

                         label="token sequence")],

                     title=title,

                     description=description,

                     allow_flagging='never')

# debug=True blocks and surfaces errors in the console; share=False keeps the
# app local (no public gradio.live tunnel).
iface.launch(share=False, debug=True)
|
|