import gradio as gr
import psutil
import subprocess
import time
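
# Streams completions from a local llama.cpp binary into a Gradio UI,
# reporting token counts, inference time, and subprocess resource usage.
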
def generate_response(user_message):
    """Stream a llama.cpp completion, yielding the accumulated output plus
    timing stats after each whitespace-delimited chunk."""
    cmd = [
        "/app/llama.cpp/main",  # Path to the llama.cpp executable
        "-m", "/app/llama.cpp/models/stablelm-2-zephyr-1_6b-Q4_0.gguf",
        "-p", user_message,
        "-n", "400",  # Maximum number of tokens to generate
        "-e"  # Process escape sequences (e.g. \n) in the prompt
    ]
    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, bufsize=1)
    process_monitor = psutil.Process(process.pid)
    process_monitor.cpu_percent()  # Prime the counter; psutil's first call always returns 0.0
    start_time = time.time()
    monitor_start_time = time.time()
    alltokens = ""
    token_buffer = ''
    tokencount = 0
    try:
        while True:
            # Read one character at a time so output streams as soon as it arrives
            char = process.stdout.read(1)
            if char == '' and process.poll() is not None:
                break
            if char != '':
                token_buffer += char
                if char == ' ' or char == '\n':  # Treat whitespace as a pseudo-token boundary
                    elapsed_time = time.time() - start_time  # Calculate elapsed time
                    alltokens += token_buffer
                    tokencount += 1
                    yield f"{alltokens} \n\n [Inference time: {elapsed_time:.2f} seconds | Tokens: {tokencount}]"
                    token_buffer = ''  # Reset token buffer
            # Log resource usage every minute
            if time.time() - monitor_start_time > 60:
                cpu_usage = process_monitor.cpu_percent()
                memory_usage = process_monitor.memory_info().rss  # in bytes
                print(f"Subprocess CPU Usage: {cpu_usage}%, Memory Usage: {memory_usage / 1024 ** 2:.1f} MB")
                monitor_start_time = time.time()  # Reset the timer
        # Flush any remaining partial token
        if token_buffer:
            elapsed_time = time.time() - start_time  # Calculate elapsed time
            alltokens += token_buffer
            tokencount += 1
            yield f"{alltokens} \n\n [Inference time: {elapsed_time:.2f} seconds | Average tokens per second: {round(tokencount / elapsed_time, 2)}]"
    finally:
        try:
            # Wait for the process to complete, with a timeout
            process.wait(timeout=60)  # Timeout in seconds
        except subprocess.TimeoutExpired:
            print("Process didn't complete within the timeout. Killing it.")
            process.kill()
            process.wait()  # Ensure proper cleanup
        # Read any error output before closing the pipes (reading a closed
        # pipe would raise ValueError)
        if process.returncode != 0:
            error_message = process.stderr.read()
            print(f"Error: {error_message}")
        process.stdout.close()
        process.stderr.close()
def custom_generate_response(cust_user_message, prompt_index):
    """
    Generates a custom response based on the user message and the selected prompt,
    including a custom ending specific to the prompt.

    Parameters:
    - cust_user_message: The message input from the user.
    - prompt_index: The index of the custom prompt to use.
    """
    prompt, ending = CustomPrompts[prompt_index]  # Unpack the prompt and its ending
    cust_user_message = f"{prompt}\n\n{cust_user_message}\n\n{ending}"
    yield from generate_response(cust_user_message)
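
# Example (hypothetical input): custom_generate_response("a cordless drill", 3)
# composes "Explain a teardown of the product mentioned in the following
# text:\n\na cordless drill\n\nTeardown Details:" and streams the result.
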
CustomPrompts = [
    ("Write a Class Diagram based on the following text:", "Class Diagram:"),
    ("Write Pydot code based on the following text:", "Pydot Code:"),
    ("Describe how a standard happy scene in any movie would be planned in great detail, based on the following text:", "Scene Details:"),
    ("Explain a teardown of the product mentioned in the following text:", "Teardown Details:"),
    ("Explain the manufacturing of the product mentioned in the following text:", "Manufacturing Details:"),
    ("Explain the marketing considerations of the product mentioned in the following text:", "Considerations:"),
    ("Explain the target-user considerations of the product mentioned in the following text:", "Target User Considerations:"),
]
with gr.Blocks() as iface:
    gr.Interface(
        fn=generate_response,
        inputs=gr.Textbox(lines=2, placeholder="Type your message here..."),
        outputs="text",
        title="Stable LM 2 Zephyr (1.6B) llama.cpp Interface Test (Inconsistent Performance - 100 tokens in 50 secs or 800+ secs)",
        description="No prompt template used yet (essentially autocomplete). No message history for now - enter your message and get a response.",
        flagging_dir="/usr/src/app/flagged",
    )
    with gr.Group():
        gr.HTML("Test for wrapping the generator (instead of buttons, tabs, and dropdowns?)")
        MainOutput = gr.TextArea(placeholder='Output will show here')
        CustomButtonInput = gr.TextArea(lines=1, placeholder='Prompt goes here')

        # Dynamically create one button per prompt and wire it to the shared
        # handler; gr.State(index) pins each button's prompt index so every
        # click passes the right (prompt, ending) tuple
        for index, (prompt, _) in enumerate(CustomPrompts):
            button = gr.Button(prompt)
            button.click(custom_generate_response, inputs=[CustomButtonInput, gr.State(index)], outputs=MainOutput)

iface.queue().launch(server_name="0.0.0.0", share=True)