Spaces:

luminoussg
/

token_counter

Sleeping

luminoussg committed on Oct 16, 2024

Commit

1476a99

verified ·

1 Parent(s): 0878173

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -10,6 +10,7 @@ def count_tokens(json_file, encoding_name):
     with open(json_file.name, 'r') as f:
         data = json.load(f) if json_file.name.endswith('.json') else [json.loads(line) for line in f.readlines()]
     token_counts = []
     for entry in data:
         conversation_token_count = 0
@@ -18,14 +19,18 @@ def count_tokens(json_file, encoding_name):
             for message in entry["messages"]:
                 content = message.get("content", "")
                 conversation_texts.append(content)
-                conversation_token_count += len(encoding.encode(content))
         token_counts.append({
             'conversation': ' '.join(conversation_texts),
             'token_count': conversation_token_count
         })
-    return token_counts
 # Gradio interface function
 def token_counter(json_file, encoding_name):
@@ -39,5 +44,7 @@ gr.Interface(
         gr.File(label="Upload JSON/JSONL File"),
         gr.Dropdown(["r50k_base", "p50k_base", "cl100k_base", "o200k_base"], label="Select Encoding", value="cl100k_base")
     ],
-    outputs=gr.JSON(label="Token Counts")
 ).launch()

     with open(json_file.name, 'r') as f:
         data = json.load(f) if json_file.name.endswith('.json') else [json.loads(line) for line in f.readlines()]
+    total_token_count = 0
     token_counts = []
     for entry in data:
         conversation_token_count = 0
             for message in entry["messages"]:
                 content = message.get("content", "")
                 conversation_texts.append(content)
+                tokens = len(encoding.encode(content))
+                conversation_token_count += tokens
+        # Add conversation token count to the total
+        total_token_count += conversation_token_count
         token_counts.append({
             'conversation': ' '.join(conversation_texts),
             'token_count': conversation_token_count
         })
+    return {"conversations": token_counts, "total_token_count": total_token_count}
 # Gradio interface function
 def token_counter(json_file, encoding_name):
         gr.File(label="Upload JSON/JSONL File"),
         gr.Dropdown(["r50k_base", "p50k_base", "cl100k_base", "o200k_base"], label="Select Encoding", value="cl100k_base")
     ],
+    outputs=[
+        gr.JSON(label="Token Counts per Conversation and Total"),
+    ]
 ).launch()