from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import gradio as gr
from typing import List
import time

DESCRIPTION = """
# Chat with Arco 500M as GGUF on CPU
"""

MAX_MAX_NEW_TOKENS = 1024
DEFAULT_MAX_NEW_TOKENS = 200

# Download the GGUF file
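# (hf_hub_download caches the file locally and returns the cached path)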
model_path = hf_hub_download(
    repo_id="TobDeBer/arco-Q4_K_M-GGUF",
    filename="arco-q4_k_m.gguf",
    repo_type="model"
)
# Load the GGUF model; n_ctx bounds the total context (prompt plus generated tokens)
pipe = Llama(
    n_ctx=MAX_MAX_NEW_TOKENS,
    # n_threads=4,     # Number of CPU threads to use; defaults to the number of cores
    # n_gpu_layers=1,  # Offload layers to the GPU (requires a GPU-enabled llama.cpp build)
    # n_batch=1,       # Batch size for prompt processing
    # use_mlock=True,  # Lock the model weights in RAM to prevent swapping
    model_path=model_path
)

def predict(message: str, history: List[List[str]], max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS, progress=gr.Progress()):
    if not message:
        # predict is a generator (it yields below), so results must be yielded, not returned
        yield "", history, ""
        return

    # Only the latest message is sent to the model; earlier turns stay in history for display only
    prompt = message
    history.append([message, ""])

    # Initialize reply for this round
    reply = ""

    # Initialize token count and start time
    token_count = 0
    start_time = time.time()

    # This will produce a generator of output chunks
    stream = pipe(
        prompt,
        max_tokens=max_new_tokens,
        stop=["</s>"],
        stream=True
    )

    # Send each token stream output to the user
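    # Each chunk is an OpenAI-style completion dict; the newly generated text is in output['choices'][0]['text']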
    for output in stream:
        new_text = output['choices'][0]['text']
        reply += new_text
        token_count += len(new_text.split()) # Rough token estimate: count whitespace-separated words
        history[-1][1] = reply # Update the current reply in history
    
        # Calculate elapsed time and TPS
        elapsed_time = time.time() - start_time
        if elapsed_time > 0:
            tps = token_count / elapsed_time
        else:
            tps = 0
        
        # Update the status using gradio's progress
        status_message = f"Tokens per second: {tps:.2f}"
        
        yield "", history, status_message

    


with gr.Blocks() as demo:
    gr.Markdown(DESCRIPTION)
    chatbot = gr.Chatbot()
    with gr.Row():
        textbox = gr.Textbox(placeholder="Type here and press enter")
    max_new_tokens_slider = gr.Slider(
        minimum=1,
        maximum=MAX_MAX_NEW_TOKENS,
        value=DEFAULT_MAX_NEW_TOKENS,
        label="Max New Tokens",
    )
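    # The slider value is passed to predict as max_new_tokens on each submit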
    status_field = gr.Text(label="Status", interactive=False, visible=True) # Read-only field showing tokens per second
    textbox.submit(predict, [textbox, chatbot, max_new_tokens_slider], [textbox, chatbot, status_field])

# queue() enables streaming so the generator-based predict can push incremental updates
demo.queue().launch()