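"""Gradio front end that streams completions from a local llama.cpp binary.

Three demos are wired up below: line-by-line streaming, character-level
streaming, and button-driven wrappers that prepend a canned prompt.
"""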
import subprocess
import time
from functools import partial

import gradio as gr

def generate_response(user_message):
    # TODO: expose the generation parameters and report RAM usage
    # (see the hedged get_process_ram_mb sketch below this function)
    cmd = [
        "/app/llama.cpp/main",  # path to the llama.cpp executable
        "-m", "/app/llama.cpp/models/stablelm-2-zephyr-1_6b-Q4_0.gguf",  # model file
        "-p", user_message,  # prompt
        "-n", "400",         # maximum number of tokens to generate
        "-e",                # process escape sequences in the prompt
    ]

    # Start the subprocess; bufsize=1 selects line buffering in text mode
    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, bufsize=1)

    start_time = time.time()
    all_lines = ""

    # Yield the accumulated output as each new line becomes available
    for line in process.stdout:
        all_lines += " " + line
        elapsed_time = time.time() - start_time
        print(line, end="")  # `line` already ends with a newline
        yield f"{all_lines} \n\n [Inference time: {elapsed_time:.2f} seconds]"
    
    # Wait for the subprocess to finish if it hasn't already
    process.wait()

    # Check for any errors. Note: with stderr=PIPE and nothing reading it
    # during generation, a chatty child process can fill the pipe and block
    # (see the hedged drain_pipe sketch further down for one workaround).
    if process.returncode != 0:
        error_message = process.stderr.read()
        print(f"Error: {error_message}")


def generate_response_token_by_token(user_message):
    # Same llama.cpp invocation as in generate_response above
    cmd = [
        "/app/llama.cpp/main",
        "-m", "/app/llama.cpp/models/stablelm-2-zephyr-1_6b-Q4_0.gguf",
        "-p", user_message,
        "-n", "400",
        "-e",
    ]

    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, bufsize=1)

    start_time = time.time()
    output_so_far = ''
    token_buffer = ''
    while True:
        # Read one character at a time for lower latency than line iteration
        char = process.stdout.read(1)
        if char == '' and process.poll() is not None:
            break
        if char != '':
            token_buffer += char
            if char == ' ' or char == '\n':  # crude token delimiters
                # Accumulate so the text output shows the whole response as it grows
                output_so_far += token_buffer
                token_buffer = ''  # reset token buffer
                elapsed_time = time.time() - start_time
                yield f"{output_so_far} [Inference time: {elapsed_time:.2f} seconds]"

    # Flush whatever is left in the buffer
    if token_buffer:
        output_so_far += token_buffer
        elapsed_time = time.time() - start_time
        yield f"{output_so_far} [Inference time: {elapsed_time:.2f} seconds]"

    # Wait for the subprocess to finish if it hasn't already
    process.wait()

    # Check for any errors
    if process.returncode != 0:
        error_message = process.stderr.read()
        print(f"Error: {error_message}")


def custom_generate_response(cust_user_message, prompt_index=0):
    # Prepend the selected canned prompt, then stream the usual response
    cust_user_message = CustomPrompts[prompt_index] + '\n\n' + cust_user_message
    yield from generate_response(cust_user_message)

CustomPrompts = [
    "Write a Class Diagram based on the following text:",
    "Write a Pydot code based on the following text:",
]

with gr.Blocks() as iface: 
    gr.Interface(
        fn=generate_response,
        inputs=gr.Textbox(lines=2, placeholder="Type your message here..."),
        outputs="text",
        title="Stable LM 2 Zephyr (1.6b) LLama.cpp Interface Test",
        description="No Message History for now - Enter your message and get a response. (One sentence every 20s)",
        flagging_dir="/usr/src/app/flagged",
    )
    gr.Interface(
        fn=generate_response_token_by_token,
        inputs=gr.Textbox(lines=2, placeholder='Type prompt here...'),
        outputs="text",
        description="More responsive streaming test",
    )
    with gr.Group():
        gr.HTML("Test for wrapping generator (20 seconds a piece of the response)")
        MainOutput = gr.TextArea(placeholder='Output will show here')
        CustomButtonInput = gr.TextArea(lines=1, placeholder='Prompt goes here')
        CustomButtonClassDiagram = gr.Button(CustomPrompts[0])
        CustomButtonPydotcode = gr.Button(CustomPrompts[1])
        CustomButtonClassDiagram.click(partial(custom_generate_response, prompt_index=0), inputs=[CustomButtonInput], outputs=MainOutput)
        CustomButtonPydotcode.click(partial(custom_generate_response, prompt_index=1), inputs=[CustomButtonInput], outputs=MainOutput)

iface.queue().launch(server_name="0.0.0.0", share=True)