# Gradio front-end that streams completions from a local llama.cpp model.
import random
import subprocess
import threading
import time

import gradio as gr
"""
def generate_response(user_message): #Figure Out the parameters later and find a way to get the ram usage
cmd = [
"/app/llama.cpp/main", # Path to the executable
"-m", "/app/llama.cpp/models/stablelm-2-zephyr-1_6b-Q4_0.gguf",
"-p", user_message,
"-n", "400",
"-e"
]
# Start the subprocess
process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, bufsize=1)
start_time = time.time()
alllines = ""
# Yield each line of output as it becomes available
for line in process.stdout:
alllines += " " + line
elapsed_time = time.time() - start_time # Calculate elapsed time
print(line)
yield f"{alllines} \n\n [Inference time: {elapsed_time:.2f} seconds]"
# Wait for the subprocess to finish if it hasn't already
process.wait()
# Check for any errors
if process.returncode != 0:
error_message = process.stderr.read()
print(f"Error: {error_message}")
"""
def generate_response(user_message):  # generate_response_token_by_token
    """Stream a llama.cpp completion token-by-token.

    Launches the llama.cpp binary with *user_message* as the prompt and
    yields the accumulated output each time a whitespace-delimited token
    completes, together with the elapsed inference time.

    Args:
        user_message: Prompt text passed to the model via ``-p``.

    Yields:
        str: All output so far plus an ``[Inference time: ...]`` trailer.
    """
    cmd = [
        "/app/llama.cpp/main",  # Path to the executable
        "-m", "/app/llama.cpp/models/stablelm-2-zephyr-1_6b-Q4_0.gguf",
        "-p", user_message,
        "-n", "400",  # cap generation at 400 tokens
        "-e",         # process escape sequences in the prompt
    ]
    process = subprocess.Popen(
        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, bufsize=1
    )
    # Drain stderr on a background thread: llama.cpp logs heavily to stderr,
    # and leaving that pipe unread while we block on stdout can deadlock the
    # child once the OS pipe buffer fills.
    stderr_chunks = []
    stderr_thread = threading.Thread(
        target=lambda: stderr_chunks.append(process.stderr.read()), daemon=True
    )
    stderr_thread.start()
    start_time = time.time()
    alltokens = ""
    token_buffer = ''
    try:
        while True:
            # Read one character at a time so partial tokens stream promptly.
            char = process.stdout.read(1)
            if char == '' and process.poll() is not None:
                break  # EOF and the child has exited
            if char != '':
                token_buffer += char
                if char == ' ' or char == '\n':  # Token delimiters
                    elapsed_time = time.time() - start_time  # Calculate elapsed time
                    alltokens += token_buffer
                    yield f"{alltokens} \n\n [Inference time: {elapsed_time:.2f} seconds]"
                    token_buffer = ''  # Reset token buffer
        # Yield the last token if the output did not end in a delimiter.
        if token_buffer:
            elapsed_time = time.time() - start_time  # Calculate elapsed time
            alltokens += token_buffer
            yield f"{alltokens} \n\n [Inference time: {elapsed_time:.2f} seconds]"
        # Wait for the subprocess to finish if it hasn't already.
        process.wait()
        stderr_thread.join()
        # Check for any errors.
        if process.returncode != 0:
            error_message = "".join(stderr_chunks)
            print(f"Error: {error_message}")
    finally:
        # Always release the pipe file descriptors, even if the consumer
        # abandons this generator mid-stream.
        process.stdout.close()
        process.stderr.close()
def custom_generate_response(cust_user_message, prompt_index=0):
    """Prefix the user's message with a canned prompt and stream the reply.

    Args:
        cust_user_message: Raw text typed by the user.
        prompt_index: Index into ``CustomPrompts`` selecting the canned
            prefix. Defaults to 0, preserving the original behavior for
            existing callers.

    Yields:
        str: Streamed output chunks from ``generate_response``.
    """
    cust_user_message = CustomPrompts[prompt_index] + '\n\n' + cust_user_message
    yield from generate_response(cust_user_message)
# Canned prompt prefixes shown on the custom buttons; one of these becomes
# the first line of the prompt sent to the model (see custom_generate_response).
CustomPrompts = [
"Write a Class Diagram based on the following text:",
"Write a Pydot code based on the following text:",
]
# --- Gradio UI --------------------------------------------------------------
with gr.Blocks() as iface:
    # Plain prompt -> response interface wrapped around the streaming generator.
    gr.Interface(
        fn=generate_response,
        inputs=gr.Textbox(lines=2, placeholder="Type your message here..."),
        outputs="text",
        title="Stable LM 2 Zephyr (1.6b) LLama.cpp Interface Test",
        description="No Message History for now - Enter your message and get a response. (One sentence every 20s)",
        flagging_dir="/usr/src/app/flagged",
    )

    # Button-driven variant that wraps the generator with a canned prefix.
    with gr.Group():
        gr.HTML("Test for wrapping generator (20 seconds a piece of the response)")
        MainOutput = gr.TextArea(placeholder='Output will show here')
        CustomButtonInput = gr.TextArea(lines=1, placeholder='Prompt goes here')
        CustomButtonClassDiagram = gr.Button(CustomPrompts[0])
        CustomButtonPydotcode = gr.Button(CustomPrompts[1])
        # NOTE(review): both buttons invoke the same handler, which always
        # prefixes with CustomPrompts[0] -- the Pydot button presumably should
        # use CustomPrompts[1]; confirm the intended behavior.
        CustomButtonClassDiagram.click(custom_generate_response, inputs=[CustomButtonInput], outputs=MainOutput)
        CustomButtonPydotcode.click(custom_generate_response, inputs=[CustomButtonInput], outputs=MainOutput)

# Bind to all interfaces so the app is reachable from outside the container.
iface.queue().launch(server_name="0.0.0.0", share=True)