Tobias Bergmann committed
Commit 785de3c · 1 Parent(s): 668ee0d

streaming per token

Files changed (1)
  1. app.py +8 -7

app.py CHANGED
@@ -29,10 +29,10 @@ pipe = Llama(
 def predict(message: str, history: List[List[str]], max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS):
     if not message:
         return "", history
-
+
     prompt = message
     history.append([message, ""])
-
+
     # Initialize reply for this round
     reply = ""
 
@@ -44,12 +44,13 @@ def predict(message: str, history: List[List[str]], max_new_tokens: int = DEFAUL
         stream=True
     )
 
+    # Send each token stream output to the user
     for output in stream:
         new_text = output['choices'][0]['text']
-    reply += new_text
-    history[-1][1] = reply  # Update the current reply in history
-    yield "", history
-    return "", history  # Always return at the end to terminate the generator
+        reply += new_text
+        history[-1][1] = reply  # Update the current reply in history
+        yield "", history
+
 
 with gr.Blocks() as demo:
     gr.Markdown(DESCRIPTION)
@@ -64,4 +65,4 @@ with gr.Blocks() as demo:
     )
     textbox.submit(predict, [textbox, chatbot, max_new_tokens_slider], [textbox, chatbot])
 
-demo.queue().launch()
+demo.queue().launch()
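
For context, the pattern this commit arrives at is: predict is a generator that appends each streamed token to the last history entry and yields the updated history, so Gradio repaints the chatbot once per token. Below is a minimal, self-contained sketch of that pattern, not the repo's exact file: the model path, DESCRIPTION, DEFAULT_MAX_NEW_TOKENS, and slider range are placeholder assumptions, and the empty-message branch yields instead of returning so the UI still updates.

    # Sketch of per-token streaming with llama-cpp-python + Gradio.
    # Placeholders (not from the repo): MODEL_PATH, DESCRIPTION, DEFAULT_MAX_NEW_TOKENS.
    from typing import List

    import gradio as gr
    from llama_cpp import Llama

    DESCRIPTION = "Token-streaming chat demo"   # placeholder
    DEFAULT_MAX_NEW_TOKENS = 256                # placeholder
    MODEL_PATH = "model.gguf"                   # placeholder path to a GGUF model

    pipe = Llama(model_path=MODEL_PATH)         # assumed constructor arguments

    def predict(message: str, history: List[List[str]], max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS):
        if not message:
            yield "", history                   # nothing to generate; keep UI unchanged
            return

        history.append([message, ""])           # start a new [user, assistant] pair
        reply = ""

        # __call__ wraps create_completion; stream=True yields one chunk per token
        stream = pipe(message, max_tokens=max_new_tokens, stream=True)

        for output in stream:
            new_text = output['choices'][0]['text']
            reply += new_text
            history[-1][1] = reply              # grow the assistant turn in place
            yield "", history                   # clear the textbox, repaint the chatbot

    with gr.Blocks() as demo:
        gr.Markdown(DESCRIPTION)
        chatbot = gr.Chatbot()                  # pair-style history, matching List[List[str]]
        textbox = gr.Textbox(placeholder="Type a message and press Enter")
        max_new_tokens_slider = gr.Slider(1, 1024, value=DEFAULT_MAX_NEW_TOKENS, step=1,
                                          label="Max new tokens")
        textbox.submit(predict, [textbox, chatbot, max_new_tokens_slider], [textbox, chatbot])

    demo.queue().launch()                       # queue() is required for generator handlers

The key design point is that predict is a generator: with demo.queue() enabled, Gradio re-renders the outputs on every yield, which is what turns the old buffered single reply into per-token streaming.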