Tobias Bergmann committed
Commit 3a8892f · 1 Parent(s): 5ac9a35
Files changed (1)
  1. app.py +28 -12
app.py CHANGED
@@ -18,27 +18,45 @@ model_path = hf_hub_download(
 )
 # Load the GGUF model
 pipe = Llama(
-    n_ctx=MAX_MAX_NEW_TOKENS,
-    # n_threads=4, # Set the desired number of threads to use, defaults to number of cores
+    n_ctx=MAX_MAX_NEW_TOKENS,
+    # n_threads=4, # Set the desired number of threads to use, defaults to number of cores
     # n_gpu_layers = 1, # Enable to use GPU, check supported layers and GPU size.
     # n_batch=1, # Set the batch size.
     # use_mlock =True, # Set to False to disable locking to RAM.
-    model_path=model_path
+    model_path=model_path
 )
 
 def predict(message: str, history: List[List[str]], max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS):
     if not message:
-        return "", history
+        return "", history
     prompt = message
-    output = pipe(
+
+    # Initialize reply
+    reply = ""
+
+    history.append([message, ""])
+
+    # Use stream=True for streaming
+    stream = pipe(
         prompt,
         max_tokens=max_new_tokens,
         stop=["</s>"],
+        stream=True
     )
-    reply = output['choices'][0]['text']
-    history.append([message, reply])
-    return "", history
-
+
+    for output in stream:
+        # This loop will receive partial output (one token at a time)
+        new_text = output['choices'][0]['text']
+
+        # Append to the current reply
+        reply += new_text
+
+        # Update the history
+        history[-1][1] = reply
+
+        # Yield for incremental display on chat
+        yield "", history
+
 with gr.Blocks() as demo:
     gr.Markdown(DESCRIPTION)
     chatbot = gr.Chatbot()
@@ -52,6 +70,4 @@ with gr.Blocks() as demo:
     )
     textbox.submit(predict, [textbox, chatbot, max_new_tokens_slider], [textbox, chatbot])
 
-
-
-demo.queue().launch(share=True)
+demo.queue().launch()
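
For reference, a minimal sketch of the streaming pattern this commit adopts, reusing the `pipe` object defined earlier in app.py (the prompt string and token budget here are placeholders): with `stream=True`, each chunk yielded by llama-cpp-python is a completion dict whose newly generated text sits at `chunk['choices'][0]['text']`, which is why `predict` now accumulates `reply` chunk by chunk and yields the updated history. Keeping `demo.queue()` matters because Gradio streams the partial histories yielded by a generator function through its queue.

    # Sketch only: accumulate streamed chunks from the existing `pipe`.
    text = ""
    for chunk in pipe("Hello", max_tokens=16, stop=["</s>"], stream=True):
        # Each chunk carries one new piece of generated text.
        text += chunk['choices'][0]['text']
    print(text)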