from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import gradio as gr
from typing import List
import time
DESCRIPTION = """
# Chat with Arco 500M as GGUF on CPU
"""

MAX_MAX_NEW_TOKENS = 1024
DEFAULT_MAX_NEW_TOKENS = 200
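# Note: MAX_MAX_NEW_TOKENS is reused below as the model's context window
# (n_ctx), so with this configuration the prompt and the generated reply
# together must fit within 1024 tokens.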
# Download the GGUF file
model_path = hf_hub_download(
    repo_id="TobDeBer/arco-Q4_K_M-GGUF",
    filename="arco-q4_k_m.gguf",
    repo_type="model"
)
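# hf_hub_download caches the file in the local Hugging Face cache and returns
# its path, so restarts skip the download. A sketch for pinning an exact file
# version (the revision value is an assumption, not taken from this repo):
#   model_path = hf_hub_download(
#       repo_id="TobDeBer/arco-Q4_K_M-GGUF",
#       filename="arco-q4_k_m.gguf",
#       revision="main",  # replace with a commit hash to pin the download
#   )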
# Load the GGUF model with llama-cpp-python
pipe = Llama(
    n_ctx=MAX_MAX_NEW_TOKENS,  # context window, shared by the prompt and the reply
    # n_threads=4,      # number of CPU threads; defaults to the number of cores
    # n_gpu_layers=1,   # offload layers to GPU; check supported layers and GPU memory
    # n_batch=1,        # prompt-processing batch size
    # use_mlock=True,   # lock the model in RAM; set to False to disable
    model_path=model_path
)
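# Optional sanity check before wiring up the UI: llama-cpp-python mirrors the
# OpenAI completion schema, so a one-shot call returns a dict with 'choices'.
#   out = pipe("Hello", max_tokens=8)
#   print(out['choices'][0]['text'])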
def predict(message: str, history: List[List[str]], max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS, progress=gr.Progress()):
    if not message:
        # predict is a generator, so yield the empty update instead of returning a value
        yield "", history, ""
        return
    prompt = message
    history.append([message, ""])
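    # Gradio's Chatbot (tuples format) keeps history as [[user, bot], ...];
    # the empty string is a placeholder that fills in as tokens stream back.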
    # Initialize the reply, token count, and start time for this round
    reply = ""
    token_count = 0
    start_time = time.time()
    # Calling the model with stream=True produces a generator of output chunks
    stream = pipe(
        prompt,
        max_tokens=max_new_tokens,
        stop=["</s>"],
        stream=True
    )
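    # Each streamed chunk is a dict shaped like
    # {'choices': [{'text': ..., 'finish_reason': None}]}; the final chunk
    # carries a non-None finish_reason ('stop' or 'length').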
    # Stream each chunk of output back to the user as it arrives
    for output in stream:
        new_text = output['choices'][0]['text']
        reply += new_text
        token_count += len(new_text.split())  # rough estimate: count whitespace-separated words
        history[-1][1] = reply  # update the in-progress reply in history
        # Compute elapsed time and tokens per second
        elapsed_time = time.time() - start_time
        tps = token_count / elapsed_time if elapsed_time > 0 else 0
        # Surface the throughput in the status field
        status_message = f"Tokens per second: {tps:.2f}"
        yield "", history, status_message
with gr.Blocks() as demo:
    gr.Markdown(DESCRIPTION)
    chatbot = gr.Chatbot()
    with gr.Row():
        textbox = gr.Textbox(placeholder="Type here and press enter")
        max_new_tokens_slider = gr.Slider(
            minimum=1,
            maximum=MAX_MAX_NEW_TOKENS,
            value=DEFAULT_MAX_NEW_TOKENS,
            label="Max New Tokens",
        )
    status_field = gr.Text(label="Status", interactive=False)  # throughput read-out
    textbox.submit(predict, [textbox, chatbot, max_new_tokens_slider], [textbox, chatbot, status_field])

demo.queue().launch()
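# queue() enables Gradio's request queue, which the streaming generator relies
# on; launch() serves the app (on Spaces, the platform supplies the host/port).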