import gradio as gr
import tiktoken
import json
# Function to count tokens in the dataset based on the "messages" field
def count_tokens(json_file, encoding_name):
    encoding = tiktoken.get_encoding(encoding_name)

    # Load the JSON or JSONL data
    with open(json_file.name, 'r') as f:
        data = json.load(f) if json_file.name.endswith('.json') else [json.loads(line) for line in f.readlines()]

    total_token_count = 0
    token_counts = []

    for entry in data:
        conversation_token_count = 0
        conversation_texts = []

        if "messages" in entry:
            for message in entry["messages"]:
                content = message.get("content", "")
                conversation_texts.append(content)
                tokens = len(encoding.encode(content))
                conversation_token_count += tokens

        # Add conversation token count to the total
        total_token_count += conversation_token_count
        token_counts.append({
            'conversation': ' '.join(conversation_texts),
            'token_count': conversation_token_count
        })

    return token_counts, total_token_count
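
# For reference, a record shape the loop above expects. The role/content layout
# mirrors the common chat-completions "messages" convention; this line is only
# an illustrative example, not data shipped with this Space:
# {"messages": [{"role": "user", "content": "Hi"},
#               {"role": "assistant", "content": "Hello! How can I help?"}]}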
# Gradio interface function
def token_counter(json_file, encoding_with_model):
    # Split encoding name and model type from the dropdown input
    encoding_name = encoding_with_model.split()[0]

    # Get token counts
    token_data, total_token_count = count_tokens(json_file, encoding_name)

    return token_data, total_token_count
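
# A quick sanity check outside the UI could look like this; "sample.jsonl" is a
# hypothetical local file, and any open file object works here because
# count_tokens only reads its .name attribute before reopening the path:
#   with open("sample.jsonl") as f:
#       per_conversation, total = count_tokens(f, "o200k_base")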
# Define the encoding choices with model information
encoding_options = [
"o200k_base (gpt-4o, gpt-4o-mini)",
"cl100k_base (gpt-4-turbo, gpt-4, gpt-3.5-turbo, text-embedding-ada-002, text-embedding-3-small, text-embedding-3-large)",
"p50k_base (Codex models, text-davinci-002, text-davinci-003)",
"r50k_base (GPT-3 models like davinci)"
]
# Gradio UI setup
with gr.Blocks() as app:
    gr.Markdown("# Token Counter for JSON/JSONL Datasets")

    with gr.Row():
        json_input = gr.File(label="Upload JSON/JSONL File")
        encoding_dropdown = gr.Dropdown(choices=encoding_options, label="Select Encoding", value="o200k_base (gpt-4o, gpt-4o-mini)")

    # Output for individual conversation token counts
    conversation_output = gr.JSON(label="Token Counts per Conversation")

    # Output for total token count
    total_output = gr.Number(label="Total Token Count", interactive=False)

    # Add a submit button to trigger token counting
    submit_button = gr.Button("Submit")

    # Link the button click event to the token counting function
    submit_button.click(token_counter, [json_input, encoding_dropdown], [conversation_output, total_output])
# Launch the app
app.launch()