Tobias Bergmann committed on
Commit 7d00bdf · 1 Parent(s): 0b2f919
Files changed (1)
  1. app.py +17 -11
app.py CHANGED
@@ -2,7 +2,7 @@ from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 import gradio as gr
 from typing import Tuple, List
-import time
+import time  # Import the time module
 
 DESCRIPTION = f"""
 # Chat with Arco 500M as GGUF on CPU
@@ -27,7 +27,7 @@ pipe = Llama(
     model_path=model_path
 )
 
-def predict(message: str, history: List[List[str]], max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS):
+def predict(message: str, history: List[List[str]], max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS, progress=gr.Progress()):
     if not message:
         return "", history
 
@@ -37,6 +37,10 @@ def predict(message: str, history: List[List[str]], max_new_tokens: int = DEFAUL
     # Initialize reply for this round
     reply = ""
 
+    # Initialize token count and start time
+    token_count = 0
+    start_time = time.time()
+
     # This will produce a generator of output chunks
     stream = pipe(
         prompt,
@@ -45,23 +49,24 @@ def predict(message: str, history: List[List[str]], max_new_tokens: int = DEFAUL
         stream=True
     )
 
-    start_time = time.time()
-    tokens_generated = 0
-
     # Send each token stream output to the user
     for output in stream:
        new_text = output['choices'][0]['text']
        reply += new_text
-        tokens_generated += len(new_text.split())  # Simple token counting by splitting on whitespace
+        token_count += len(new_text.split())  # Estimate tokens by counting spaces
        history[-1][1] = reply  # Update the current reply in history
-
+
+        # Calculate elapsed time and TPS
        elapsed_time = time.time() - start_time
        if elapsed_time > 0:
-            tokens_per_second = tokens_generated / elapsed_time
+            tps = token_count / elapsed_time
        else:
-            tokens_per_second = 0
+            tps = 0
+
+        # Update the status using gradio's progress
+        progress(message=f"Tokens per second: {tps:.2f}")
 
-        yield f"{reply} \n\n *Tokens/second: {tokens_per_second:.2f}*", history
+        yield "", history
 
 
 with gr.Blocks() as demo:
@@ -75,6 +80,7 @@ with gr.Blocks() as demo:
         value=DEFAULT_MAX_NEW_TOKENS,
         label="Max New Tokens",
     )
-    textbox.submit(predict, [textbox, chatbot, max_new_tokens_slider], [textbox, chatbot])
+    status_field = gr.Text(label="Status", interactive=False, visible=True)  # Add Status field
+    textbox.submit(predict, [textbox, chatbot, max_new_tokens_slider], [textbox, chatbot], progress=status_field)
 
 demo.queue().launch()
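
The core of this change is measuring decoding speed while streaming from llama-cpp-python. For reference, here is a minimal standalone sketch of that pattern, separate from the Gradio wiring in app.py; the model path "model.gguf" and the prompt are placeholders, and tokens are estimated by whitespace-splitting each streamed chunk, just as the commit does.

import time

from llama_cpp import Llama

# Hypothetical model path; any local GGUF file would work here.
llm = Llama(model_path="model.gguf")

prompt = "Q: Name the planets in the solar system. A:"
reply = ""
token_count = 0
start_time = time.time()

# stream=True makes the call return a generator of chunks,
# each shaped like {'choices': [{'text': ...}]}.
for output in llm(prompt, max_tokens=128, stream=True):
    new_text = output['choices'][0]['text']
    reply += new_text
    # Rough token estimate by splitting on whitespace, as in the commit.
    token_count += len(new_text.split())

elapsed = time.time() - start_time
tps = token_count / elapsed if elapsed > 0 else 0.0
print(f"{reply}\n\nTokens/second: {tps:.2f}")

Note that splitting on whitespace undercounts subword tokens, so the reported figure is an estimate rather than the tokenizer's exact count.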