Update app.py
app.py CHANGED
@@ -3,11 +3,15 @@ import tiktoken
 import json
 
 # Function to count tokens in the dataset based on the "messages" field
-def count_tokens(jsonl_file, encoding_name):
+def count_tokens(json_file, encoding_name):
     encoding = tiktoken.get_encoding(encoding_name)
 
+    # Validate that the file is a .jsonl file
+    if not json_file.name.endswith('.jsonl'):
+        return {"error": "Please upload a valid .jsonl file."}, 0
+
     # Load the JSONL data
-    with open(
+    with open(json_file.name, 'r') as f:
         data = [json.loads(line) for line in f.readlines()]
 
     total_token_count = 0
@@ -33,12 +37,12 @@ def count_tokens(jsonl_file, encoding_name):
     return token_counts, total_token_count
 
 # Gradio interface function
-def token_counter(
+def token_counter(json_file, encoding_with_model):
     # Split encoding name and model type from the dropdown input
     encoding_name = encoding_with_model.split()[0]
 
     # Get token counts
-    token_data, total_token_count = count_tokens(
+    token_data, total_token_count = count_tokens(json_file, encoding_name)
 
     return token_data, total_token_count
 
@@ -52,10 +56,10 @@ encoding_options = [
 
 # Gradio UI setup
 with gr.Blocks() as app:
-    gr.Markdown("# Token Counter for JSONL Datasets (OpenAI Fine-
+    gr.Markdown("# Token Counter for JSONL Datasets (OpenAI Fine-Tuning)")
 
     with gr.Row():
-
+        json_input = gr.File(label="Upload .jsonl File", type="file")  # Accept only file uploads
         encoding_dropdown = gr.Dropdown(choices=encoding_options, label="Select Encoding", value="o200k_base (gpt-4o, gpt-4o-mini)")
 
     # Output for individual conversation token counts
@@ -68,7 +72,7 @@ with gr.Blocks() as app:
     submit_button = gr.Button("Submit")
 
     # Link the button click event to the token counting function
-    submit_button.click(token_counter, [
+    submit_button.click(token_counter, [json_input, encoding_dropdown], [conversation_output, total_output])
 
 # Launch the app
 app.launch()
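
The hunks above elide the body of count_tokens (new lines 17-36), which builds the token_counts mapping and total_token_count from each record's "messages" field. Below is a minimal sketch of what that loop might look like, consistent with the signature and return values visible in the diff; the count_tokens_sketch name, the per-conversation labels, and counting only each message's "content" string are illustrative assumptions, not taken from the commit.

import json
import tiktoken

# Sketch only: assumes each JSONL line looks like
# {"messages": [{"role": "...", "content": "..."}, ...]}
def count_tokens_sketch(jsonl_path, encoding_name="o200k_base"):
    encoding = tiktoken.get_encoding(encoding_name)

    with open(jsonl_path, "r") as f:
        data = [json.loads(line) for line in f if line.strip()]

    token_counts = {}      # per-conversation counts, keyed by an assumed label
    total_token_count = 0

    for i, record in enumerate(data):
        # Count tokens in the content of every message in this conversation
        conversation_tokens = sum(
            len(encoding.encode(message.get("content", "")))
            for message in record.get("messages", [])
        )
        token_counts[f"Conversation {i + 1}"] = conversation_tokens
        total_token_count += conversation_tokens

    return token_counts, total_token_count

Called as count_tokens_sketch("train.jsonl"), this would return the per-conversation dictionary shown in the app's conversation_output alongside the grand total shown in total_output. A sketch like this counts only message content, so any extra per-message formatting tokens added by a chat format are not included.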