import gradio as gr
import spaces
from vllm import LLM, SamplingParams

# Load the model once at startup; sampling parameters are shared across requests.
llm = LLM(model="meta-llama/Llama-2-7B-Chat-hf")
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)


@spaces.GPU
def pipe(text: str):
    # generate() takes a list of prompts and returns one RequestOutput per prompt.
    outputs = llm.generate([text], sampling_params)
    # Return the generated text of the first (and only) completion.
    return outputs[0].outputs[0].text


if __name__ == "__main__":
    interface = gr.Interface(
        pipe,
        gr.Textbox(label="Prompt"),
        gr.Textbox(label="Response"),
        title="Text Completion",
    )
    interface.launch()