Tobias Bergmann committed
Commit 3a8892f · 1 Parent(s): 5ac9a35
Files changed (1)
  1. app.py +28 -12
app.py CHANGED
@@ -18,27 +18,45 @@ model_path = hf_hub_download(
 )
 # Load the GGUF model
 pipe = Llama(
-    n_ctx=MAX_MAX_NEW_TOKENS,
-    # n_threads=4, # Set the desired number of threads to use, defaults to number of cores
+    n_ctx=MAX_MAX_NEW_TOKENS,
+    # n_threads=4, # Set the desired number of threads to use, defaults to number of cores
     # n_gpu_layers = 1, # Enable to use GPU, check supported layers and GPU size.
     # n_batch=1, # Set the batch size.
     # use_mlock =True, # Set to False to disable locking to RAM.
-    model_path=model_path
+    model_path=model_path
 )
 
 def predict(message: str, history: List[List[str]], max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS):
     if not message:
-        return "", history
+        return "", history
     prompt = message
-    output = pipe(
+
+    # Initialize reply
+    reply = ""
+
+    history.append([message, ""])
+
+    # Use stream=True for streaming
+    stream = pipe(
         prompt,
         max_tokens=max_new_tokens,
         stop=["</s>"],
+        stream=True
     )
-    reply = output['choices'][0]['text']
-    history.append([message, reply])
-    return "", history
-
+
+    for output in stream:
+        # This loop will receive partial output (one token at a time)
+        new_text = output['choices'][0]['text']
+
+        # Append to the current reply
+        reply += new_text
+
+        # Update the history
+        history[-1][1] = reply
+
+        # Yield for incremental display on chat
+        yield "", history
+
 with gr.Blocks() as demo:
     gr.Markdown(DESCRIPTION)
     chatbot = gr.Chatbot()
@@ -52,6 +70,4 @@ with gr.Blocks() as demo:
     )
     textbox.submit(predict, [textbox, chatbot, max_new_tokens_slider], [textbox, chatbot])
 
-
-
-demo.queue().launch(share=True)
+demo.queue().launch()
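
For reference, a minimal sketch of the streaming pattern this commit adopts, reusing the `pipe` object defined earlier in app.py (the prompt string and token budget here are placeholders): with `stream=True`, each chunk yielded by llama-cpp-python is a completion dict whose newly generated text sits at `chunk['choices'][0]['text']`, which is why `predict` now accumulates `reply` chunk by chunk and yields the updated history. Keeping `demo.queue()` matters because Gradio streams the partial histories yielded by a generator function through its queue.

    # Sketch only: accumulate streamed chunks from the existing `pipe`.
    text = ""
    for chunk in pipe("Hello", max_tokens=16, stop=["</s>"], stream=True):
        # Each chunk carries one new piece of generated text.
        text += chunk['choices'][0]['text']
    print(text)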