Commit ee3031b (verified) · luminoussg committed · 1 Parent(s): 7894d41

Update app.py

Files changed (1): app.py (+11, -7)
app.py CHANGED
@@ -3,11 +3,15 @@ import tiktoken
 import json
 
 # Function to count tokens in the dataset based on the "messages" field
-def count_tokens(jsonl_file, encoding_name):
+def count_tokens(json_file, encoding_name):
     encoding = tiktoken.get_encoding(encoding_name)
 
+    # Validate that the file is a .jsonl file
+    if not json_file.name.endswith('.jsonl'):
+        return {"error": "Please upload a valid .jsonl file."}, 0
+
     # Load the JSONL data
-    with open(jsonl_file.name, 'r') as f:
+    with open(json_file.name, 'r') as f:
         data = [json.loads(line) for line in f.readlines()]
 
     total_token_count = 0
@@ -33,12 +37,12 @@ def count_tokens(jsonl_file, encoding_name):
     return token_counts, total_token_count
 
 # Gradio interface function
-def token_counter(jsonl_file, encoding_with_model):
+def token_counter(json_file, encoding_with_model):
     # Split encoding name and model type from the dropdown input
     encoding_name = encoding_with_model.split()[0]
 
     # Get token counts
-    token_data, total_token_count = count_tokens(jsonl_file, encoding_name)
+    token_data, total_token_count = count_tokens(json_file, encoding_name)
 
     return token_data, total_token_count
 
@@ -52,10 +56,10 @@ encoding_options = [
 
 # Gradio UI setup
 with gr.Blocks() as app:
-    gr.Markdown("# Token Counter for JSONL Datasets (OpenAI Fine-tuning)")
+    gr.Markdown("# Token Counter for JSONL Datasets (OpenAI Fine-Tuning)")
 
     with gr.Row():
-        jsonl_input = gr.File(label="Upload JSONL File", file_types=[".jsonl"])
+        json_input = gr.File(label="Upload .jsonl File", type="file")  # Accept only file uploads
         encoding_dropdown = gr.Dropdown(choices=encoding_options, label="Select Encoding", value="o200k_base (gpt-4o, gpt-4o-mini)")
 
     # Output for individual conversation token counts
@@ -68,7 +72,7 @@ with gr.Blocks() as app:
     submit_button = gr.Button("Submit")
 
     # Link the button click event to the token counting function
-    submit_button.click(token_counter, [jsonl_input, encoding_dropdown], [conversation_output, total_output])
+    submit_button.click(token_counter, [json_input, encoding_dropdown], [conversation_output, total_output])
 
 # Launch the app
 app.launch()
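
The hunks above elide the body of count_tokens between the file-loading code and its return statement, so the diff does not show how the "messages" field is actually tokenized. Below is a minimal, self-contained sketch of what that logic plausibly looks like, assuming each JSONL record follows the OpenAI fine-tuning chat format; the function name count_tokens_sketch and the per-conversation dictionary keys are illustrative, not taken from the repository.

```python
# Illustrative sketch only -- this code is NOT part of the commit above.
# It approximates the elided body of count_tokens, assuming each JSONL record
# carries an OpenAI-style "messages" list of {"role": ..., "content": ...} dicts.
import json
import tiktoken

def count_tokens_sketch(jsonl_path, encoding_name="o200k_base"):
    encoding = tiktoken.get_encoding(encoding_name)

    # Load the JSONL data, one JSON object per line
    with open(jsonl_path, "r") as f:
        data = [json.loads(line) for line in f]

    token_counts = {}
    total_token_count = 0
    for i, record in enumerate(data, start=1):
        # Sum the encoded length of every message's content in this conversation
        conversation_tokens = sum(
            len(encoding.encode(message.get("content", "")))
            for message in record.get("messages", [])
        )
        token_counts[f"Conversation {i}"] = conversation_tokens
        total_token_count += conversation_tokens

    return token_counts, total_token_count
```

Called on a fine-tuning dataset, such a function returns a per-conversation mapping plus a grand total, which lines up with the two outputs (conversation_output, total_output) wired to submit_button.click in the Gradio UI.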