# Testing Stable LM 2 1.6B Zephyr - subprocess issues
import gradio as gr
# from llama_cpp import Llama
import random
import subprocess

# Initialize the in-process model (disabled while testing the subprocess approach)
# llm = Llama(model_path="/stablelm-2-zephyr-1_6b-Q4_0.gguf", n_gpu_layers=0, seed=random.randint(1, 2**31))
"""
def generate_response(user_message):
encodeduserm = b"### Human: " + user_message.encode('utf-8') + b"\n### Assistant:"
tokens = llm.tokenize(encodeduserm)
output = b""
count = 0
for token in llm.generate(tokens, top_k=40, top_p=0.95, temp=0.72, repeat_penalty=1.1):
text = llm.detokenize([token])
output += text
count += 1
if count >= 500 or (token == llm.token_eos()):
break
return output.decode()
"""
"""
def generate_response(user_message):
print("Before request")
cmd = [
"/app/llama.cpp/main", # Path to the executable
"-m", "/app/llama.cpp/models/stablelm-2-zephyr-1_6b-Q4_0.gguf",
"-p", user_message,
"-n", "400",
"-e"
]
result = subprocess.run(cmd, capture_output=True, text=True)
print("After response")
return result.stdout
"""
# Current version: run llama.cpp in a subprocess and stream its output to the UI.
def generate_response(user_message):
    print("Before request")
    cmd = [
        "/app/llama.cpp/main",  # Path to the llama.cpp executable
        "-m", "/app/llama.cpp/models/stablelm-2-zephyr-1_6b-Q4_0.gguf",
        "-p", user_message,
        "-n", "400",
        "-e"
    ]
    # Start the subprocess with stdout and stderr piped back to this process
    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    # Yield the accumulated output as each line becomes available; Gradio replaces
    # the displayed value on every yield, so yielding the growing buffer (rather
    # than single lines) keeps the whole response visible while it streams.
    output = ""
    for line in process.stdout:
        output += line
        yield output
    # Wait for the subprocess to finish if it hasn't already
    process.wait()
    print("After response")
    # Check for any errors reported on stderr
    if process.returncode != 0:
        error_message = process.stderr.read()
        print(f"Error: {error_message}")
iface = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(lines=2, placeholder="Type your message here..."),
    outputs="text",
    title="Stable LM 2 Zephyr 1.6B Chat Interface",
    description="Enter your message and get a response from Stable LM 2 Zephyr 1.6B running through llama.cpp.",
    flagging_dir="/usr/src/app/flagged",
)
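# Note (assumption about the installed Gradio version): older Gradio releases only
# stream generator outputs when the request queue is enabled, e.g.
# iface.queue().launch(server_name="0.0.0.0"); recent releases enable the queue by default.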
iface.launch(server_name="0.0.0.0")  # share=True