import gradio as gr
#from llama_cpp import Llama
import random
import subprocess

# In-process model initialization via llama-cpp-python (disabled; the app shells out to the llama.cpp binary instead)
#llm = Llama(model_path="/stablelm-2-zephyr-1_6b-Q4_0.gguf", n_gpu_layers=0, seed=random.randint(1, 2**31))

"""
def generate_response(user_message):
    encodeduserm = b"### Human: " + user_message.encode('utf-8') + b"\n### Assistant:"
    tokens = llm.tokenize(encodeduserm)
    output = b""
    count = 0

    for token in llm.generate(tokens, top_k=40, top_p=0.95, temp=0.72, repeat_penalty=1.1):
        text = llm.detokenize([token])
        output += text
        count += 1
        if count >= 500 or (token == llm.token_eos()):
            break
    return output.decode()
"""

"""
def generate_response(user_message):
    print("Before request")
    cmd = [
        "/app/llama.cpp/main",  # Path to the executable
        "-m", "/app/llama.cpp/models/stablelm-2-zephyr-1_6b-Q4_0.gguf",
        "-p", user_message,
        "-n", "400",
        "-e"
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)
    print("After response")
    return result.stdout
"""

def generate_response(user_message):
    """Run the llama.cpp binary on the prompt and yield the accumulated output as it streams in."""
    cmd = [
        "/app/llama.cpp/main",                                           # Path to the executable
        "-m", "/app/llama.cpp/models/stablelm-2-zephyr-1_6b-Q4_0.gguf",  # GGUF model file
        "-p", user_message,                                              # Prompt text
        "-n", "400",                                                     # Maximum number of tokens to generate
        "-e"                                                             # Process escape sequences in the prompt
    ]
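    # For reference, the command built above is equivalent to running:
    #   /app/llama.cpp/main -m /app/llama.cpp/models/stablelm-2-zephyr-1_6b-Q4_0.gguf -p "<prompt>" -n 400 -e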

    print("Before request")
    # Start the subprocess
    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

    alllines = ""

    # Yield the accumulated output after each new line so Gradio can update the textbox incrementally
    for line in process.stdout:
        alllines += " " + line
        yield alllines
    
    # Wait for the subprocess to finish if it hasn't already
    process.wait()

    print("After response")
    # Check for any errors
    if process.returncode != 0:
        error_message = process.stderr.read()
        print(f"Error: {error_message}")

iface = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(lines=2, placeholder="Type your message here..."),
    outputs="text",
    title="LLaMA Chat Interface",
    description="Enter your message and get a response from the LLaMA model.",
    flagging_dir="/usr/src/app/flagged",
)
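
# Note: depending on the Gradio version, streaming a generator's output may require
# enabling the request queue before launching, e.g. iface.queue().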

iface.launch(server_name="0.0.0.0")  # share=True