Tobias Bergmann committed
Commit 4a44fb0 · 1 Parent(s): 91a07e0
Files changed (1)
  1. app.py +13 -20
app.py CHANGED
@@ -29,7 +29,7 @@ pipe = Llama(
 
 def predict(message: str, history: List[List[str]], max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS, progress=gr.Progress()):
     if not message:
-        return "", history
+        return "", history, ""
 
     prompt = message
     history.append([message, ""])
@@ -40,9 +40,6 @@ def predict(message: str, history: List[List[str]], max_new_tokens: int = DEFAUL
     # Initialize token count and start time
     token_count = 0
     start_time = time.time()
-
-    last_token_count = 0
-    last_time = start_time
 
     # This will produce a generator of output chunks
     stream = pipe(
@@ -58,24 +55,20 @@ def predict(message: str, history: List[List[str]], max_new_tokens: int = DEFAUL
         reply += new_text
         token_count += len(new_text.split())  # Estimate tokens by counting spaces
         history[-1][1] = reply  # Update the current reply in history
-
-        # Calculate elapsed time since last update
-        elapsed_time = time.time() - last_time
-
+
+        # Calculate elapsed time and TPS
+        elapsed_time = time.time() - start_time
         if elapsed_time > 0:
-            # Calculate tokens per second since last update
-            tokens_per_second = (token_count - last_token_count) / elapsed_time
+            tps = token_count / elapsed_time
         else:
-            tokens_per_second = 0
-
+            tps = 0
+
         # Update the status using gradio's progress
-        progress(message=f"Tokens per second: {tokens_per_second:.2f}")
-
-        # Update for next iteration
-        last_token_count = token_count
-        last_time = time.time()
+        status_message = f"Tokens per second: {tps:.2f}"
+
+        yield "", history, status_message
 
-        yield "", history
+
 
 
 with gr.Blocks() as demo:
@@ -89,7 +82,7 @@ with gr.Blocks() as demo:
         value=DEFAULT_MAX_NEW_TOKENS,
         label="Max New Tokens",
     )
-
-    textbox.submit(predict, [textbox, chatbot, max_new_tokens_slider], [textbox, chatbot], )
+    status_field = gr.Text(label="Status", interactive=False, visible=True)  # Add Status field
+    textbox.submit(predict, [textbox, chatbot, max_new_tokens_slider], [textbox, chatbot, status_field])
 
 demo.queue().launch()
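
For reference, a minimal self-contained sketch of what app.py looks like after this commit. It assumes llama-cpp-python's standard streaming chunk format; the model path, the DEFAULT_MAX_NEW_TOKENS value, the slider range, and the prompt handling are placeholders for code outside this diff, and the now-unused progress parameter is dropped. The empty-message branch yields instead of returning, since a plain `return value` inside a generator is swallowed and Gradio would never see it.

import time
from typing import List

import gradio as gr
from llama_cpp import Llama

DEFAULT_MAX_NEW_TOKENS = 256           # assumption: real value is outside this diff
pipe = Llama(model_path="model.gguf")  # assumption: hypothetical model path


def predict(message: str, history: List[List[str]], max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS):
    if not message:
        # Yield the empty update rather than `return "", history, ""`,
        # which a generator would swallow.
        yield "", history, ""
        return

    history.append([message, ""])
    token_count = 0
    start_time = time.time()

    # stream=True turns the llama-cpp-python call into a generator of chunks
    stream = pipe(message, max_tokens=max_new_tokens, stream=True)

    reply = ""
    for chunk in stream:
        new_text = chunk["choices"][0]["text"]
        reply += new_text
        token_count += len(new_text.split())  # rough whitespace-based estimate
        history[-1][1] = reply

        # Cumulative tokens per second since generation started
        elapsed_time = time.time() - start_time
        tps = token_count / elapsed_time if elapsed_time > 0 else 0

        # Each yielded 3-tuple streams into (textbox, chatbot, status_field)
        yield "", history, f"Tokens per second: {tps:.2f}"


with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    textbox = gr.Textbox()
    max_new_tokens_slider = gr.Slider(1, 1024, value=DEFAULT_MAX_NEW_TOKENS, label="Max New Tokens")
    status_field = gr.Text(label="Status", interactive=False)
    textbox.submit(predict, [textbox, chatbot, max_new_tokens_slider], [textbox, chatbot, status_field])

demo.queue().launch()

Because the handler yields 3-tuples, each chunk updates the textbox, chatbot, and status field in order, so the tokens-per-second readout refreshes live while the reply streams. Note the commit also changes what is measured: it now reports a cumulative average over the whole generation, rather than the per-update rate the removed last_token_count/last_time bookkeeping computed.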