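"""Gradio test app that streams output from a local llama.cpp build.

Two streaming strategies are compared: line-by-line (one yield per stdout
line) and an experimental word/token-level stream enabled by bufsize=1.
The executable and Stable LM 2 Zephyr (1.6b) GGUF model paths below assume
a container layout rooted at /app/llama.cpp.
"""
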
import subprocess
import time

import gradio as gr


def generate_response(user_message):
    # TODO: figure out the parameters and find a way to report RAM usage
    cmd = [
        "/app/llama.cpp/main",  # Path to the executable
        "-m", "/app/llama.cpp/models/stablelm-2-zephyr-1_6b-Q4_0.gguf",
        "-p", user_message,
        "-n", "400",
        "-e",
    ]

    # Start the subprocess; bufsize=1 gives line-buffered output in text mode
    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, bufsize=1)

    start_time = time.time()
    alllines = ""

    # Yield the accumulated output as each new line becomes available
    for line in process.stdout:
        alllines += " " + line
        elapsed_time = time.time() - start_time  # Calculate elapsed time
        print(line)
        yield f"{alllines} \n\n [Inference time: {elapsed_time:.2f} seconds]"

    # Wait for the subprocess to finish if it hasn't already
    process.wait()

    # Check for any errors
    if process.returncode != 0:
        error_message = process.stderr.read()
        print(f"Error: {error_message}")


def generate_response_token_by_token(user_message):
    cmd = [
        "/app/llama.cpp/main",  # Path to the executable
        "-m", "/app/llama.cpp/models/stablelm-2-zephyr-1_6b-Q4_0.gguf",
        "-p", user_message,
        "-n", "400",
        "-e",
    ]

    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, bufsize=1)

    start_time = time.time()
    token_buffer = ''

    while True:
        # Read one character at a time
        char = process.stdout.read(1)
        if char == '' and process.poll() is not None:
            break
        if char != '':
            token_buffer += char
            if char == ' ' or char == '\n':  # Token delimiters
                elapsed_time = time.time() - start_time  # Calculate elapsed time
                yield f"{token_buffer} [Inference time: {elapsed_time:.2f} seconds]"
                token_buffer = ''  # Reset token buffer

    # Yield the last token if there is any
    if token_buffer:
        elapsed_time = time.time() - start_time  # Calculate elapsed time
        yield f"{token_buffer} [Inference time: {elapsed_time:.2f} seconds]"

    # Wait for the subprocess to finish if it hasn't already
    process.wait()

    # Check for any errors
    if process.returncode != 0:
        error_message = process.stderr.read()
        print(f"Error: {error_message}")


def custom_generate_response(cust_user_message, prompt_index=0):
    # Prepend the selected custom prompt, then stream the reply
    cust_user_message = CustomPrompts[prompt_index] + '\n\n' + cust_user_message
    yield from generate_response(cust_user_message)
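

# The original wiring pointed both buttons at custom_generate_response, so the
# Pydot button also applied the class-diagram prompt. This thin wrapper (a
# suggested fix, not in the original) lets that button use its own prompt.
def custom_generate_response_pydot(cust_user_message):
    yield from custom_generate_response(cust_user_message, prompt_index=1)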


CustomPrompts = [
    "Write a Class Diagram based on the following text:",
    "Write a Pydot code based on the following text:",
]

with gr.Blocks() as iface:
    gr.Interface(
        fn=generate_response,
        inputs=gr.Textbox(lines=2, placeholder="Type your message here..."),
        outputs="text",
        title="Stable LM 2 Zephyr (1.6b) Llama.cpp Interface Test",
        description="No message history for now - enter your message and get a response. (One sentence every 20s)",
        flagging_dir="/usr/src/app/flagged",
    )

    gr.Interface(
        fn=generate_response_token_by_token,
        inputs=gr.Textbox(lines=2, placeholder='Type prompt here...'),
        outputs="text",
        description="More responsive streaming test",
    )

    with gr.Group():
        gr.HTML("Test for wrapping generator (20 seconds a piece of the response)")
        MainOutput = gr.TextArea(placeholder='Output will show here')
        CustomButtonInput = gr.TextArea(lines=1, placeholder='Prompt goes here')
        CustomButtonClassDiagram = gr.Button(CustomPrompts[0])
        CustomButtonPydotcode = gr.Button(CustomPrompts[1])
        CustomButtonClassDiagram.click(custom_generate_response, inputs=[CustomButtonInput], outputs=MainOutput)
        CustomButtonPydotcode.click(custom_generate_response_pydot, inputs=[CustomButtonInput], outputs=MainOutput)

# queue() is required so Gradio can stream the generator functions' output
iface.queue().launch(server_name="0.0.0.0", share=True)