token_counter / app.py
luminoussg's picture
Update app.py
0878173 verified
raw
history blame
1.42 kB
import gradio as gr
import tiktoken
import json
# Function to count tokens in the dataset based on the "messages" field
def count_tokens(json_file, encoding_name):
    """Count tiktoken tokens per conversation in a JSON or JSONL dataset.

    Args:
        json_file: The uploaded dataset. Either an object exposing its
            on-disk path as ``.name`` or a plain path string.
        encoding_name: Encoding name handed to ``tiktoken.get_encoding``.

    Returns:
        A list with one dict per dataset entry. Each dict has
        ``'conversation'`` (message contents joined by single spaces) and
        ``'token_count'`` (sum of the per-message token counts). Entries
        without a usable ``"messages"`` value produce an empty conversation
        with a count of 0.
    """
    encoding = tiktoken.get_encoding(encoding_name)

    # Accept both file-like upload objects (path in ``.name``) and bare paths.
    path = getattr(json_file, "name", json_file)

    # Read as UTF-8 explicitly so the result does not depend on the host locale.
    with open(path, "r", encoding="utf-8") as f:
        if str(path).lower().endswith(".json"):
            data = json.load(f)
        else:
            # JSONL: one JSON document per line; blank lines (such as a
            # trailing newline at end of file) are skipped, not parsed.
            data = [json.loads(line) for line in f if line.strip()]

    # A .json file holding a single conversation object is treated as a
    # one-entry dataset instead of being iterated key by key.
    if isinstance(data, dict):
        data = [data]

    token_counts = []
    for entry in data:
        messages = entry.get("messages") if isinstance(entry, dict) else None
        texts = []
        total = 0
        for message in messages or []:
            content = message.get("content")
            if content is None:
                # An explicit null content counts as empty text.
                content = ""
            texts.append(content)
            # disallowed_special=() makes special-token literals in the data
            # (e.g. "<|endoftext|>") encode as ordinary text rather than raise.
            total += len(encoding.encode(content, disallowed_special=()))

        token_counts.append({
            'conversation': ' '.join(texts),
            'token_count': total,
        })

    return token_counts
# Gradio interface function
def token_counter(json_file, encoding_name):
    """Gradio callback: return the per-conversation token counts.

    Passes the uploaded file and the selected encoding name straight to
    ``count_tokens`` and returns its result unchanged.
    """
    return count_tokens(json_file, encoding_name)
# Gradio UI setup
# Input widgets: the dataset upload and the tiktoken encoding selector.
upload_input = gr.File(label="Upload JSON/JSONL File")
encoding_input = gr.Dropdown(
    ["r50k_base", "p50k_base", "cl100k_base", "o200k_base"],
    label="Select Encoding",
    value="cl100k_base",
)

# Output widget: the list returned by token_counter, rendered as JSON.
result_output = gr.JSON(label="Token Counts")

demo = gr.Interface(
    fn=token_counter,
    inputs=[upload_input, encoding_input],
    outputs=result_output,
)
demo.launch()