Tobias Bergmann committed
Commit 785de3c · 1 Parent(s): 668ee0d

streaming per token

Files changed (1)
  1. app.py +8 -7

app.py CHANGED
@@ -29,10 +29,10 @@ pipe = Llama(
 def predict(message: str, history: List[List[str]], max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS):
     if not message:
         return "", history
-
+
     prompt = message
     history.append([message, ""])
-
+
     # Initialize reply for this round
     reply = ""
 
@@ -44,12 +44,13 @@ def predict(message: str, history: List[List[str]], max_new_tokens: int = DEFAUL
         stream=True
     )
 
+    # Send each token stream output to the user
     for output in stream:
         new_text = output['choices'][0]['text']
-    reply += new_text
-    history[-1][1] = reply  # Update the current reply in history
-    yield "", history
-    return "", history  # Always return at the end to terminate the generator
+        reply += new_text
+        history[-1][1] = reply  # Update the current reply in history
+        yield "", history
+
 
 with gr.Blocks() as demo:
     gr.Markdown(DESCRIPTION)
@@ -64,4 +65,4 @@ with gr.Blocks() as demo:
     )
     textbox.submit(predict, [textbox, chatbot, max_new_tokens_slider], [textbox, chatbot])
 
-demo.queue().launch()
+demo.queue().launch()
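
For context, the pattern this commit arrives at is: predict is a generator that appends each streamed token to the last history entry and yields the updated history, so Gradio repaints the chatbot once per token. Below is a minimal, self-contained sketch of that pattern, not the repo's exact file: the model path, DESCRIPTION, DEFAULT_MAX_NEW_TOKENS, and slider range are placeholder assumptions, and the empty-message branch yields instead of returning so the UI still updates.

    # Sketch of per-token streaming with llama-cpp-python + Gradio.
    # Placeholders (not from the repo): MODEL_PATH, DESCRIPTION, DEFAULT_MAX_NEW_TOKENS.
    from typing import List

    import gradio as gr
    from llama_cpp import Llama

    DESCRIPTION = "Token-streaming chat demo"   # placeholder
    DEFAULT_MAX_NEW_TOKENS = 256                # placeholder
    MODEL_PATH = "model.gguf"                   # placeholder path to a GGUF model

    pipe = Llama(model_path=MODEL_PATH)         # assumed constructor arguments

    def predict(message: str, history: List[List[str]], max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS):
        if not message:
            yield "", history                   # nothing to generate; keep UI unchanged
            return

        history.append([message, ""])           # start a new [user, assistant] pair
        reply = ""

        # __call__ wraps create_completion; stream=True yields one chunk per token
        stream = pipe(message, max_tokens=max_new_tokens, stream=True)

        for output in stream:
            new_text = output['choices'][0]['text']
            reply += new_text
            history[-1][1] = reply              # grow the assistant turn in place
            yield "", history                   # clear the textbox, repaint the chatbot

    with gr.Blocks() as demo:
        gr.Markdown(DESCRIPTION)
        chatbot = gr.Chatbot()                  # pair-style history, matching List[List[str]]
        textbox = gr.Textbox(placeholder="Type a message and press Enter")
        max_new_tokens_slider = gr.Slider(1, 1024, value=DEFAULT_MAX_NEW_TOKENS, step=1,
                                          label="Max new tokens")
        textbox.submit(predict, [textbox, chatbot, max_new_tokens_slider], [textbox, chatbot])

    demo.queue().launch()                       # queue() is required for generator handlers

The key design point is that predict is a generator: with demo.queue() enabled, Gradio re-renders the outputs on every yield, which is what turns the old buffered single reply into per-token streaming.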