import gradio as gr
import tiktoken
import json
# Function to count tokens in the dataset based on the "messages" field
def count_tokens(json_file, encoding_name):
    encoding = tiktoken.get_encoding(encoding_name)

    # Load the JSON or JSONL data
    with open(json_file.name, 'r') as f:
        data = json.load(f) if json_file.name.endswith('.json') else [json.loads(line) for line in f.readlines()]

    token_counts = []
    for entry in data:
        conversation_token_count = 0
        conversation_texts = []
        if "messages" in entry:
            for message in entry["messages"]:
                content = message.get("content", "")
                conversation_texts.append(content)
                conversation_token_count += len(encoding.encode(content))
        token_counts.append({
            'conversation': ' '.join(conversation_texts),
            'token_count': conversation_token_count
        })
    return token_counts
# Gradio interface function
def token_counter(json_file, encoding_name):
    token_data = count_tokens(json_file, encoding_name)
    return token_data
# Gradio UI setup
gr.Interface(
    fn=token_counter,
    inputs=[
        gr.File(label="Upload JSON/JSONL File"),
        gr.Dropdown(["r50k_base", "p50k_base", "cl100k_base", "o200k_base"], label="Select Encoding", value="cl100k_base")
    ],
    outputs=gr.JSON(label="Token Counts")
).launch()
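
# A sketch of the JSONL input this app expects, one conversation per line; the
# roles and messages shown here are illustrative assumptions, not data from the
# app itself:
#
#   {"messages": [{"role": "user", "content": "Hello!"},
#                 {"role": "assistant", "content": "Hi, how can I help?"}]}
#
# For a line like that, count_tokens would return one entry whose
# "token_count" is the sum of the tokens in both "content" strings under the
# encoding selected in the dropdown.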