import gradio as gr
#from llama_cpp import Llama
import random
import subprocess
# Initialize model
#llm = Llama(model_path="/stablelm-2-zephyr-1_6b-Q4_0.gguf", n_gpu_layers=0, seed=random.randint(1, 2**31))
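# The in-process llama-cpp-python path above is left disabled; the app instead shells
# out to the llama.cpp binary at /app/llama.cpp/main via subprocess (see generate_response below).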
"""
def generate_response(user_message):
encodeduserm = b"### Human: " + user_message.encode('utf-8') + b"\n### Assistant:"
tokens = llm.tokenize(encodeduserm)
output = b""
count = 0
for token in llm.generate(tokens, top_k=40, top_p=0.95, temp=0.72, repeat_penalty=1.1):
text = llm.detokenize([token])
output += text
count += 1
if count >= 500 or (token == llm.token_eos()):
break
return output.decode()
"""
"""
def generate_response(user_message):
print("Before request")
cmd = [
"/app/llama.cpp/main", # Path to the executable
"-m", "/app/llama.cpp/models/stablelm-2-zephyr-1_6b-Q4_0.gguf",
"-p", user_message,
"-n", "400",
"-e"
]
result = subprocess.run(cmd, capture_output=True, text=True)
print("After response")
return result.stdout
"""
def generate_response(user_message):
    cmd = [
        "/app/llama.cpp/main",  # Path to the executable
        "-m", "/app/llama.cpp/models/stablelm-2-zephyr-1_6b-Q4_0.gguf",
        "-p", user_message,
        "-n", "400",
        "-e"
    ]
    print("Before request")

    # Start the subprocess
    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

    alllines = ""
    # Yield each line of output as it becomes available
    for line in process.stdout:
        alllines += " " + line
        yield alllines

    # Wait for the subprocess to finish if it hasn't already
    process.wait()
    print("After response")

    # Check for any errors
    if process.returncode != 0:
        error_message = process.stderr.read()
        print(f"Error: {error_message}")
iface = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(lines=2, placeholder="Type your message here..."),
    outputs="text",
    title="LLaMA Chat Interface",
    description="Enter your message and get a response from the LLaMA model.",
    flagging_dir="/usr/src/app/flagged",
)

iface.launch(server_name="0.0.0.0")  # share=True)