import gradio as gr
import subprocess
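
# Minimal Gradio front end that shells out to a local llama.cpp build.
# The two triple-quoted blocks below are earlier iterations of generate_response,
# kept here for reference: the first drove the model through an in-process `llm`
# object (a llama-cpp-python-style binding), and the second ran the llama.cpp
# binary with subprocess.run, returning the whole response at once instead of
# streaming it.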

"""
def generate_response(user_message):
    encodeduserm = b"### Human: " + user_message.encode('utf-8') + b"\n### Assistant:"
    tokens = llm.tokenize(encodeduserm)
    output = b""
    count = 0

    for token in llm.generate(tokens, top_k=40, top_p=0.95, temp=0.72, repeat_penalty=1.1):
        text = llm.detokenize([token])
        output += text
        count += 1
        if count >= 500 or (token == llm.token_eos()):
            break
    return output.decode()
"""

"""
def generate_response(user_message):
    print("Before request")
    cmd = [
        "/app/llama.cpp/main",  # Path to the executable
        "-m", "/app/llama.cpp/models/stablelm-2-zephyr-1_6b-Q4_0.gguf",
        "-p", user_message,
        "-n", "400",
        "-e"
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)
    print("After response")
    return result.stdout
"""

def generate_response(user_message):
    print("Before request")
    cmd = [
        "/app/llama.cpp/main",
        "-m", "/app/llama.cpp/models/stablelm-2-zephyr-1_6b-Q4_0.gguf",
        "-p", user_message,  # prompt
        "-n", "400",         # maximum number of tokens to generate
        "-e"                 # process escape sequences in the prompt
    ]

    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

    # Gradio replaces the displayed text with each value yielded by a generator,
    # so accumulate the lines and yield the running transcript to stream the reply.
    output = ""
    for line in process.stdout:
        output += line
        yield output

    process.wait()
    print("After response")

    if process.returncode != 0:
        error_message = process.stderr.read()
        print(f"Error: {error_message}")

iface = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(lines=2, placeholder="Type your message here..."),
    outputs="text",
    title="LLaMA Chat Interface",
    description="Enter your message and get a response from the LLaMA model.",
    flagging_dir="/usr/src/app/flagged",
)

iface.launch(server_name="0.0.0.0")
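
# server_name="0.0.0.0" makes the app reachable from outside the container. If a
# fixed port is needed, it can be pinned with the server_port argument, e.g.
# iface.launch(server_name="0.0.0.0", server_port=7860), 7860 being Gradio's default.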