# Spaces status: Sleeping
import gradio as gr | |
import tiktoken | |
import json | |
# Function to count tokens in the dataset based on the "messages" field
def count_tokens(json_file, encoding_name):
    """Count tokens per conversation in a JSON or JSONL dataset.

    Args:
        json_file: Uploaded file object exposing its path as ``.name``, or a
            plain path (``str`` / ``os.PathLike``).
        encoding_name: Name of the tiktoken encoding, passed to
            ``tiktoken.get_encoding``.

    Returns:
        A list with one dict per dataset entry. Each dict has
        ``'conversation'`` (all message contents joined by single spaces)
        and ``'token_count'`` (sum of the token counts of each message's
        content). Entries without a ``"messages"`` key produce an empty
        conversation with a count of 0.

    Raises:
        json.JSONDecodeError: If the file (or a non-blank JSONL line) is not
            valid JSON.
    """
    encoding = tiktoken.get_encoding(encoding_name)

    # Accept both upload wrappers (which expose ``.name``) and bare paths.
    path = getattr(json_file, "name", json_file)

    # Load the JSON or JSONL data. JSON is UTF-8 by specification; the "-sig"
    # variant additionally tolerates a leading byte-order mark.
    with open(path, "r", encoding="utf-8-sig") as f:
        if str(path).lower().endswith(".json"):
            data = json.load(f)
        else:
            # Blank lines (e.g. trailing newlines) are not JSON records.
            data = [json.loads(line) for line in f if line.strip()]

    # A top-level object is treated as a single conversation entry rather
    # than being iterated key by key.
    if isinstance(data, dict):
        data = [data]

    token_counts = []
    for entry in data:
        conversation_token_count = 0
        conversation_texts = []
        if "messages" in entry:
            for message in entry["messages"]:
                # ``"content": null`` would otherwise come through as None
                # and break both encoding and the join below.
                content = message.get("content") or ""
                conversation_texts.append(content)
                # Dataset text may legitimately contain strings such as
                # "<|endoftext|>"; count them as ordinary text instead of
                # letting tiktoken raise ValueError.
                conversation_token_count += len(
                    encoding.encode(content, disallowed_special=())
                )
        token_counts.append({
            'conversation': ' '.join(conversation_texts),
            'token_count': conversation_token_count,
        })
    return token_counts
# Gradio interface function
def token_counter(json_file, encoding_name):
    """Gradio callback: return per-conversation token counts for an upload.

    Args:
        json_file: The value supplied by the file-upload component, or
            ``None`` when the user submitted without choosing a file.
        encoding_name: Name of the tiktoken encoding selected in the UI.

    Returns:
        The list of per-conversation dicts produced by ``count_tokens``.

    Raises:
        gr.Error: If no file was uploaded, so the UI shows a readable
            message instead of an opaque attribute error.
    """
    if json_file is None:
        raise gr.Error("Please upload a JSON or JSONL file before submitting.")
    return count_tokens(json_file, encoding_name)
# Gradio UI setup: a file upload plus an encoding selector feed
# token_counter, and the resulting list is rendered as JSON.
file_input = gr.File(label="Upload JSON/JSONL File")
encoding_input = gr.Dropdown(
    ["r50k_base", "p50k_base", "cl100k_base", "o200k_base"],
    label="Select Encoding",
    value="cl100k_base",
)
counts_output = gr.JSON(label="Token Counts")

demo = gr.Interface(
    fn=token_counter,
    inputs=[file_input, encoding_input],
    outputs=counts_output,
)
demo.launch()