streaming

Tobias Bergmann committed · Commit 3a8892f · Parent(s): 5ac9a35

app.py CHANGED
```diff
@@ -18,27 +18,45 @@ model_path = hf_hub_download(
 )
 # Load the GGUF model
 pipe = Llama(
-
-
+    n_ctx=MAX_MAX_NEW_TOKENS,
+    # n_threads=4, # Set the desired number of threads to use, defaults to number of cores
     # n_gpu_layers = 1, # Enable to use GPU, check supported layers and GPU size.
     # n_batch=1, # Set the batch size.
     # use_mlock =True, # Set to False to disable locking to RAM.
-
+    model_path=model_path
 )
 
 def predict(message: str, history: List[List[str]], max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS):
     if not message:
-
+        return "", history
     prompt = message
-
+
+    # Initialize reply
+    reply = ""
+
+    history.append([message, ""])
+
+    # Use stream=True for streaming
+    stream = pipe(
         prompt,
         max_tokens=max_new_tokens,
         stop=["</s>"],
+        stream=True
     )
-
-
-
-
+
+    for output in stream:
+        # This loop will receive partial output (one token at a time)
+        new_text = output['choices'][0]['text']
+
+        # Append to the current reply
+        reply += new_text
+
+        # Update the history
+        history[-1][1] = reply
+
+        # Yield for incremental display on chat
+        yield "", history
+
 with gr.Blocks() as demo:
     gr.Markdown(DESCRIPTION)
     chatbot = gr.Chatbot()
@@ -52,6 +70,4 @@ with gr.Blocks() as demo:
 )
 textbox.submit(predict, [textbox, chatbot, max_new_tokens_slider], [textbox, chatbot])
 
-
-
-demo.queue().launch(share=True)
+demo.queue().launch()
```
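For reference, here is a minimal, self-contained sketch of the streaming pattern this commit introduces: passing `stream=True` turns the llama-cpp-python completion call into an iterator of chunks, and yielding from `predict` lets Gradio repaint the `Chatbot` after each chunk. The `hf_hub_download` repo and filename, the token limits, and the slider/textbox wiring below are placeholder assumptions, not the Space's actual values.

```python
from typing import List

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

MAX_MAX_NEW_TOKENS = 2048      # assumed value; also reused as the context size
DEFAULT_MAX_NEW_TOKENS = 256   # assumed value

# Placeholder checkpoint -- substitute the GGUF repo/file the Space actually uses.
model_path = hf_hub_download(
    repo_id="TheBloke/Llama-2-7B-Chat-GGUF",
    filename="llama-2-7b-chat.Q4_K_M.gguf",
)

pipe = Llama(
    model_path=model_path,
    n_ctx=MAX_MAX_NEW_TOKENS,
)

def predict(message: str, history: List[List[str]],
            max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS):
    if not message:
        # In a generator, yield (rather than return a value) so Gradio
        # still receives an update for empty input.
        yield "", history
        return

    history.append([message, ""])
    reply = ""

    # stream=True makes the call return an iterator of completion chunks.
    stream = pipe(
        message,
        max_tokens=max_new_tokens,
        stop=["</s>"],
        stream=True,
    )
    for output in stream:
        # Each chunk carries the next piece of generated text.
        reply += output["choices"][0]["text"]
        history[-1][1] = reply
        # Yielding after every chunk streams the partial reply to the UI.
        yield "", history

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    max_new_tokens_slider = gr.Slider(
        1, MAX_MAX_NEW_TOKENS, value=DEFAULT_MAX_NEW_TOKENS, step=1,
        label="Max new tokens",
    )
    textbox = gr.Textbox(placeholder="Type a message...")
    textbox.submit(predict, [textbox, chatbot, max_new_tokens_slider],
                   [textbox, chatbot])

# queue() is what allows generator callbacks to stream updates to the browser.
demo.queue().launch()
```

The commit also drops `share=True` from `launch()`, which is unnecessary on a hosted Space; `queue()` is the part that matters here, since Gradio delivers generator updates to the browser through the queue.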