from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import gradio as gr
from typing import List
import time
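# Gradio chat demo: downloads a Q4_K_M GGUF quantization of the Arco 500M model
# from the Hugging Face Hub and streams completions on CPU via llama-cpp-python.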
DESCRIPTION = """
# Chat with Arco 500M as GGUF on CPU
"""
MAX_MAX_NEW_TOKENS = 1024
DEFAULT_MAX_NEW_TOKENS = 200
# Download the GGUF file
model_path = hf_hub_download(
    repo_id="TobDeBer/arco-Q4_K_M-GGUF",
    filename="arco-q4_k_m.gguf",
    repo_type="model"
)
# Load the GGUF model
pipe = Llama(
    n_ctx=MAX_MAX_NEW_TOKENS,
    # n_threads=4,     # Number of CPU threads to use; defaults to the number of cores
    # n_gpu_layers=1,  # Offload layers to the GPU; check supported layers and GPU memory
    # n_batch=1,       # Batch size for prompt processing
    # use_mlock=True,  # Lock the model in RAM to prevent swapping
    model_path=model_path
)
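# Chat callback used by the Gradio UI below: a generator that yields an empty
# string (to clear the textbox), the updated chat history, and a running
# tokens-per-second status message for each streamed chunk.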
def predict(message: str, history: List[List[str]], max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS, progress=gr.Progress()):
    if not message:
        # predict is a generator, so yield (rather than return) the empty outputs
        yield "", history, ""
        return
    prompt = message
    history.append([message, ""])
    # Accumulated reply for this round
    reply = ""
    # Token count and start time for the tokens-per-second estimate
    token_count = 0
    start_time = time.time()
    # With stream=True, llama-cpp-python returns a generator of output chunks
    stream = pipe(
        prompt,
        max_tokens=max_new_tokens,
        stop=["</s>"],
        stream=True
    )
    # Send each streamed chunk to the user as it arrives
    for output in stream:
        new_text = output['choices'][0]['text']
        reply += new_text
        token_count += len(new_text.split())  # Rough token estimate: whitespace-separated words
        history[-1][1] = reply  # Update the current reply in history
        # Calculate elapsed time and tokens per second (TPS)
        elapsed_time = time.time() - start_time
        if elapsed_time > 0:
            tps = token_count / elapsed_time
        else:
            tps = 0
        # Update the status text shown alongside the chat
        status_message = f"Tokens per second: {tps:.2f}"
        yield "", history, status_message
with gr.Blocks() as demo:
    gr.Markdown(DESCRIPTION)
    chatbot = gr.Chatbot()
    with gr.Row():
        textbox = gr.Textbox(placeholder="Type here and press enter")
        max_new_tokens_slider = gr.Slider(
            minimum=1,
            maximum=MAX_MAX_NEW_TOKENS,
            value=DEFAULT_MAX_NEW_TOKENS,
            label="Max New Tokens",
        )
    status_field = gr.Text(label="Status", interactive=False, visible=True)  # Shows the tokens-per-second status
    textbox.submit(predict, [textbox, chatbot, max_new_tokens_slider], [textbox, chatbot, status_field])

demo.queue().launch()