Update app.py
app.py CHANGED
@@ -3,11 +3,15 @@ import tiktoken
 import json
 
 # Function to count tokens in the dataset based on the "messages" field
-def count_tokens(jsonl_file, encoding_name):
+def count_tokens(json_file, encoding_name):
     encoding = tiktoken.get_encoding(encoding_name)
 
+    # Validate that the file is a .jsonl file
+    if not json_file.name.endswith('.jsonl'):
+        return {"error": "Please upload a valid .jsonl file."}, 0
+
     # Load the JSONL data
-    with open(
+    with open(json_file.name, 'r') as f:
         data = [json.loads(line) for line in f.readlines()]
 
     total_token_count = 0
@@ -33,12 +37,12 @@ def count_tokens(jsonl_file, encoding_name):
     return token_counts, total_token_count
 
 # Gradio interface function
-def token_counter(
+def token_counter(json_file, encoding_with_model):
     # Split encoding name and model type from the dropdown input
     encoding_name = encoding_with_model.split()[0]
 
     # Get token counts
-    token_data, total_token_count = count_tokens(
+    token_data, total_token_count = count_tokens(json_file, encoding_name)
 
     return token_data, total_token_count
 
@@ -52,10 +56,10 @@ encoding_options = [
 
 # Gradio UI setup
 with gr.Blocks() as app:
-    gr.Markdown("# Token Counter for JSONL Datasets (OpenAI Fine-
+    gr.Markdown("# Token Counter for JSONL Datasets (OpenAI Fine-Tuning)")
 
     with gr.Row():
-
+        json_input = gr.File(label="Upload .jsonl File", type="file")  # Accept only file uploads
         encoding_dropdown = gr.Dropdown(choices=encoding_options, label="Select Encoding", value="o200k_base (gpt-4o, gpt-4o-mini)")
 
     # Output for individual conversation token counts
@@ -68,7 +72,7 @@ with gr.Blocks() as app:
     submit_button = gr.Button("Submit")
 
     # Link the button click event to the token counting function
-    submit_button.click(token_counter, [
+    submit_button.click(token_counter, [json_input, encoding_dropdown], [conversation_output, total_output])
 
 # Launch the app
 app.launch()
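
The hunks above elide the body of count_tokens (new lines 17-36), which builds the token_counts mapping and total_token_count from each record's "messages" field. Below is a minimal sketch of what that loop might look like, consistent with the signature and return values visible in the diff; the count_tokens_sketch name, the per-conversation labels, and counting only each message's "content" string are illustrative assumptions, not taken from the commit.

import json
import tiktoken

# Sketch only: assumes each JSONL line looks like
# {"messages": [{"role": "...", "content": "..."}, ...]}
def count_tokens_sketch(jsonl_path, encoding_name="o200k_base"):
    encoding = tiktoken.get_encoding(encoding_name)

    with open(jsonl_path, "r") as f:
        data = [json.loads(line) for line in f if line.strip()]

    token_counts = {}      # per-conversation counts, keyed by an assumed label
    total_token_count = 0

    for i, record in enumerate(data):
        # Count tokens in the content of every message in this conversation
        conversation_tokens = sum(
            len(encoding.encode(message.get("content", "")))
            for message in record.get("messages", [])
        )
        token_counts[f"Conversation {i + 1}"] = conversation_tokens
        total_token_count += conversation_tokens

    return token_counts, total_token_count

Called as count_tokens_sketch("train.jsonl"), this would return the per-conversation dictionary shown in the app's conversation_output alongside the grand total shown in total_output. A sketch like this counts only message content, so any extra per-message formatting tokens added by a chat format are not included.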