"""Gradio front-end for a local llama.cpp model (Stable LM 2 Zephyr 1.6b).

Exposes three test interfaces: line-buffered streaming, character-level
"token" streaming, and template buttons that prepend a canned instruction
to the user's text before generating.
"""

import gradio as gr
import random
import subprocess
import time

# Prompt templates prepended to the user's text by the template buttons below.
CustomPrompts = [
    "Write a Class Diagram based on the following text:",
    "Write a Pydot code based on the following text:",
]


def _build_llama_cmd(user_message):
    """Return the llama.cpp command line for *user_message*.

    TODO: figure out the generation parameters later and find a way to get
    the RAM usage (note carried over from the original inline comment).
    """
    return [
        "/app/llama.cpp/main",  # Path to the executable
        "-m", "/app/llama.cpp/models/stablelm-2-zephyr-1_6b-Q4_0.gguf",
        "-p", user_message,
        "-n", "400",
        "-e",
    ]


def generate_response(user_message):
    """Stream the model's reply line by line.

    Yields the accumulated output (with elapsed time appended) after every
    line so a Gradio generator output updates progressively.
    """
    process = subprocess.Popen(
        _build_llama_cmd(user_message),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        bufsize=1,  # line-buffered: lines arrive as soon as they are produced
    )
    start_time = time.time()
    alllines = ""

    # Yield each line of output as it becomes available
    for line in process.stdout:
        alllines += " " + line
        elapsed_time = time.time() - start_time  # Calculate elapsed time
        print(line)
        yield f"{alllines} \n\n [Inference time: {elapsed_time:.2f} seconds]"

    # Wait for the subprocess to finish if it hasn't already
    process.wait()

    # Check for any errors; surface them to the UI, not just the server log.
    if process.returncode != 0:
        error_message = process.stderr.read()
        print(f"Error: {error_message}")
        yield f"{alllines} \n\n [Error: {error_message}]"


def generate_response_token_by_token(user_message):
    """Stream the model's reply one whitespace-delimited chunk at a time.

    Reads stdout one character at a time and yields each completed chunk
    with elapsed time. NOTE(review): each yield replaces the previous Gradio
    output, so the UI shows only the latest chunk — kept as-is since the
    interface is labelled a streaming test.
    """
    process = subprocess.Popen(
        _build_llama_cmd(user_message),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        bufsize=1,
    )
    start_time = time.time()
    token_buffer = ''

    while True:
        # Read one character at a time
        char = process.stdout.read(1)
        if char == '' and process.poll() is not None:
            break  # EOF and process exited: generation is finished
        if char != '':
            token_buffer += char
            if char == ' ' or char == '\n':  # Token delimiters
                elapsed_time = time.time() - start_time  # Calculate elapsed time
                yield f"{token_buffer} [Inference time: {elapsed_time:.2f} seconds]"
                token_buffer = ''  # Reset token buffer

    # Yield the last token if there is any
    if token_buffer:
        elapsed_time = time.time() - start_time  # Calculate elapsed time
        yield f"{token_buffer} [Inference time: {elapsed_time:.2f} seconds]"

    # Wait for the subprocess to finish if it hasn't already
    process.wait()

    # Check for any errors; surface them to the UI, not just the server log.
    if process.returncode != 0:
        error_message = process.stderr.read()
        print(f"Error: {error_message}")
        yield f"[Error: {error_message}]"


def custom_generate_response(cust_user_message, prompt_index=0):
    """Prepend ``CustomPrompts[prompt_index]`` to the text and stream a reply.

    *prompt_index* defaults to 0 for backward compatibility. Previously this
    function always used ``CustomPrompts[0]``, which made the Pydot button
    send the class-diagram prompt by mistake.
    """
    cust_user_message = CustomPrompts[prompt_index] + '\n\n' + cust_user_message
    yield from generate_response(cust_user_message)


def _custom_generate_response_pydot(cust_user_message):
    """Click handler for the Pydot button: routes to the Pydot template."""
    yield from custom_generate_response(cust_user_message, prompt_index=1)


with gr.Blocks() as iface:
    gr.Interface(
        fn=generate_response,
        inputs=gr.Textbox(lines=2, placeholder="Type your message here..."),
        outputs="text",
        title="Stable LM 2 Zephyr (1.6b) LLama.cpp Interface Test",
        description="No Message History for now - Enter your message and get a response. (One sentence every 20s)",
        flagging_dir="/usr/src/app/flagged",
    )

    gr.Interface(
        fn=generate_response_token_by_token,
        inputs=gr.Textbox(lines=2, placeholder='Type prompt here...'),
        outputs="text",
        description="More Responsive streaming test",
    )

    with gr.Group():
        gr.HTML("Test for wrapping generator (20 seconds a piece of the response)")
        MainOutput = gr.TextArea(placeholder='Output will show here')
        CustomButtonInput = gr.TextArea(lines=1, placeholder='Prompt goes here')
        CustomButtonClassDiagram = gr.Button(CustomPrompts[0])
        CustomButtonPydotcode = gr.Button(CustomPrompts[1])
        # Each button now routes to its own prompt template (bug fix: the
        # Pydot button previously reused the class-diagram prompt).
        CustomButtonClassDiagram.click(custom_generate_response, inputs=[CustomButtonInput], outputs=MainOutput)
        CustomButtonPydotcode.click(_custom_generate_response_pydot, inputs=[CustomButtonInput], outputs=MainOutput)

iface.queue().launch(server_name="0.0.0.0", share=True)