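"""Minimal Gradio app that streams chat completions from a local GGUF model via llama-cpp-python."""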
import gradio as gr
from llama_cpp import Llama
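
# Fetch the GGUF weights from the Hugging Face Hub (cached locally after the first run).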
print("Downloading model")
llm = Llama.from_pretrained(
repo_id="bartowski/gemma-2-2b-it-abliterated-GGUF",
filename="gemma-2-2b-it-abliterated-IQ4_XS.gguf",
numa=True,
n_ctx=4096,
)

def respond(prompt: str):
    """Stream the chat completion, yielding the accumulated response after each chunk."""
    stream = llm.create_chat_completion(
        stream=True,
        messages=[{"role": "user", "content": prompt}],
    )
    response = ""
    for chunk in stream:
        delta = chunk["choices"][0]["delta"]
        if "content" in delta:
            response += delta["content"]
            yield response
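
# Because respond() is a generator, Gradio streams each yielded value into the output box.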
demo = gr.Interface(
    fn=respond,
    inputs=[gr.TextArea(value="What is the capital of France?")],
    outputs=[gr.TextArea()],
)
demo.launch(server_name="0.0.0.0", server_port=7860)
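
# Local usage sketch (assumes gradio and llama-cpp-python are installed):
#   pip install gradio llama-cpp-python
#   python app.py
# then open http://localhost:7860 in a browser.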