File size: 2,709 Bytes
e7d39a8
 
 
 
0878173
7894d41
e7d39a8
 
7894d41
 
 
0878173
1476a99
e7d39a8
0878173
 
 
 
 
 
 
1476a99
 
0878173
1476a99
 
 
e7d39a8
0878173
 
e7d39a8
 
f825e7c
e7d39a8
0878173
7894d41
f825e7c
 
 
 
7894d41
f825e7c
 
 
 
 
 
 
 
 
 
e7d39a8
0878173
f825e7c
7894d41
f825e7c
 
7894d41
f825e7c
 
 
 
 
 
 
 
47925da
 
 
 
7894d41
f825e7c
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import gradio as gr
import tiktoken
import json

# Function to count tokens in the dataset based on the "messages" field
def count_tokens(jsonl_file, encoding_name):
    """Count tiktoken tokens for each conversation in a JSONL dataset.

    Args:
        jsonl_file: Uploaded file object (Gradio ``File``) exposing a ``.name``
            path to a JSONL file; each line is a JSON object expected to carry
            a "messages" list of {"role", "content"} dicts.
        encoding_name: tiktoken encoding name, e.g. "o200k_base".

    Returns:
        tuple: ``(token_counts, total_token_count)`` where ``token_counts`` is
        a list of dicts with the joined conversation text and its token count,
        and ``total_token_count`` is the sum over all conversations.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    encoding = tiktoken.get_encoding(encoding_name)

    # Load the JSONL data; skip blank lines so a trailing newline (common in
    # exported datasets) does not raise json.JSONDecodeError.
    with open(jsonl_file.name, 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f if line.strip()]

    total_token_count = 0
    token_counts = []
    for entry in data:
        conversation_token_count = 0
        conversation_texts = []
        # Entries without a "messages" field simply contribute zero tokens.
        for message in entry.get("messages", []):
            content = message.get("content", "")
            conversation_texts.append(content)
            conversation_token_count += len(encoding.encode(content))

        # Add conversation token count to the total
        total_token_count += conversation_token_count

        token_counts.append({
            'conversation': ' '.join(conversation_texts),
            'token_count': conversation_token_count
        })

    return token_counts, total_token_count

# Gradio interface function
def token_counter(jsonl_file, encoding_with_model):
    """Gradio callback: resolve the encoding name and delegate to count_tokens.

    The dropdown value looks like "o200k_base (gpt-4o, gpt-4o-mini)"; the
    first whitespace-separated token is the actual tiktoken encoding name.
    Returns the (per-conversation counts, total count) pair unchanged.
    """
    encoding_name = encoding_with_model.split()[0]
    return count_tokens(jsonl_file, encoding_name)

# Define the encoding choices with model information.
# Format contract: "<tiktoken encoding name> (<models using it>)" — the code
# that consumes the selected value extracts the encoding name with split()[0],
# so the encoding name must come first and contain no spaces.
encoding_options = [
    "o200k_base (gpt-4o, gpt-4o-mini)",
    "cl100k_base (gpt-4-turbo, gpt-4, gpt-3.5-turbo, text-embedding-ada-002, text-embedding-3-small, text-embedding-3-large)",
    "p50k_base (Codex models, text-davinci-002, text-davinci-003)",
    "r50k_base (GPT-3 models like davinci)"
]

# Gradio UI setup: file upload + encoding picker in one row, two outputs below.
with gr.Blocks() as app:
    gr.Markdown("# Token Counter for JSONL Datasets (OpenAI Fine-tuning)")
    
    with gr.Row():
        # Restrict uploads to .jsonl; the default dropdown value matches the
        # first entry of encoding_options (current GPT-4o-family encoding).
        jsonl_input = gr.File(label="Upload JSONL File", file_types=[".jsonl"])
        encoding_dropdown = gr.Dropdown(choices=encoding_options, label="Select Encoding", value="o200k_base (gpt-4o, gpt-4o-mini)")
    
    # Output for individual conversation token counts (list of dicts rendered as JSON)
    conversation_output = gr.JSON(label="Token Counts per Conversation")
    
    # Output for total token count; read-only so users can't edit the result
    total_output = gr.Number(label="Total Token Count", interactive=False)
    
    # Add a submit button to trigger token counting
    submit_button = gr.Button("Submit")
    
    # Link the button click event to the token counting function:
    # token_counter returns a 2-tuple that maps onto the two output components.
    submit_button.click(token_counter, [jsonl_input, encoding_dropdown], [conversation_output, total_output])

# Launch the app
app.launch()