File size: 2,709 Bytes
e7d39a8
 
 
 
0878173
7894d41
e7d39a8
 
7894d41
 
 
0878173
1476a99
e7d39a8
0878173
 
 
 
 
 
 
1476a99
 
0878173
1476a99
 
 
e7d39a8
0878173
 
e7d39a8
 
f825e7c
e7d39a8
0878173
7894d41
f825e7c
 
 
 
7894d41
f825e7c
 
 
 
 
 
 
 
 
 
e7d39a8
0878173
f825e7c
7894d41
f825e7c
 
7894d41
f825e7c
 
 
 
 
 
 
 
47925da
 
 
 
7894d41
f825e7c
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import gradio as gr
import tiktoken
import json

# Function to count tokens in the dataset based on the "messages" field
def count_tokens(jsonl_file, encoding_name):
    """Count tiktoken tokens for each conversation in a JSONL dataset.

    Args:
        jsonl_file: Uploaded file object (Gradio ``File``) exposing a ``.name``
            path to a JSONL file; each line is a JSON object expected to carry
            a "messages" list of {"role", "content"} dicts.
        encoding_name: tiktoken encoding name, e.g. "o200k_base".

    Returns:
        tuple: ``(token_counts, total_token_count)`` where ``token_counts`` is
        a list of dicts with the joined conversation text and its token count,
        and ``total_token_count`` is the sum over all conversations.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    encoding = tiktoken.get_encoding(encoding_name)

    # Load the JSONL data; skip blank lines so a trailing newline (common in
    # exported datasets) does not raise json.JSONDecodeError.
    with open(jsonl_file.name, 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f if line.strip()]

    total_token_count = 0
    token_counts = []
    for entry in data:
        conversation_token_count = 0
        conversation_texts = []
        # Entries without a "messages" field simply contribute zero tokens.
        for message in entry.get("messages", []):
            content = message.get("content", "")
            conversation_texts.append(content)
            conversation_token_count += len(encoding.encode(content))

        # Add conversation token count to the total
        total_token_count += conversation_token_count

        token_counts.append({
            'conversation': ' '.join(conversation_texts),
            'token_count': conversation_token_count
        })

    return token_counts, total_token_count

# Gradio interface function
def token_counter(jsonl_file, encoding_with_model):
    """Gradio callback: resolve the encoding name and delegate to count_tokens.

    The dropdown value looks like "o200k_base (gpt-4o, gpt-4o-mini)"; the
    first whitespace-separated token is the actual tiktoken encoding name.
    Returns the (per-conversation counts, total count) pair unchanged.
    """
    encoding_name = encoding_with_model.split()[0]
    return count_tokens(jsonl_file, encoding_name)

# Define the encoding choices with model information.
# Format contract: "<tiktoken encoding name> (<models using it>)" — the code
# that consumes the selected value extracts the encoding name with split()[0],
# so the encoding name must come first and contain no spaces.
encoding_options = [
    "o200k_base (gpt-4o, gpt-4o-mini)",
    "cl100k_base (gpt-4-turbo, gpt-4, gpt-3.5-turbo, text-embedding-ada-002, text-embedding-3-small, text-embedding-3-large)",
    "p50k_base (Codex models, text-davinci-002, text-davinci-003)",
    "r50k_base (GPT-3 models like davinci)"
]

# Gradio UI setup: file upload + encoding picker in one row, two outputs below.
with gr.Blocks() as app:
    gr.Markdown("# Token Counter for JSONL Datasets (OpenAI Fine-tuning)")
    
    with gr.Row():
        # Restrict uploads to .jsonl; the default dropdown value matches the
        # first entry of encoding_options (current GPT-4o-family encoding).
        jsonl_input = gr.File(label="Upload JSONL File", file_types=[".jsonl"])
        encoding_dropdown = gr.Dropdown(choices=encoding_options, label="Select Encoding", value="o200k_base (gpt-4o, gpt-4o-mini)")
    
    # Output for individual conversation token counts (list of dicts rendered as JSON)
    conversation_output = gr.JSON(label="Token Counts per Conversation")
    
    # Output for total token count; read-only so users can't edit the result
    total_output = gr.Number(label="Total Token Count", interactive=False)
    
    # Add a submit button to trigger token counting
    submit_button = gr.Button("Submit")
    
    # Link the button click event to the token counting function:
    # token_counter returns a 2-tuple that maps onto the two output components.
    submit_button.click(token_counter, [jsonl_input, encoding_dropdown], [conversation_output, total_output])

# Launch the app
app.launch()