import gradio as gr
from llama_cpp import Llama
llm = Llama(model_path="./model.bin")  # load the model; change the path to your own model file
with open('system.prompt', 'r', encoding='utf-8') as f:
    prompt = f.read()
title = "OpenBuddy Llama API"
desc = '''
Hello, world!
This is a showcase of how to build your own server with an OpenBuddy model.
I'm using the 3B model here just as an example, and it runs on CPU only.
But you can use GPU power as well!
How to use the GPU?
Change `CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS"`
in the Dockerfile to `CMAKE_ARGS="-DLLAMA_CUBLAS=on"`.
You can also try `DLLAMA_CLBLAST` or `DLLAMA_METAL`.
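For example, the llama-cpp-python install step would then look something like
`CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python` (the exact pip flags depend on your Dockerfile).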
Powered by llama-cpp-python and Gradio.
How to test it on your own machine?
You can install Docker, build the image and run it. I made `run-docker.sh`
for ya. To stop the container, run `docker ps`,
find the container's name and run `docker stop _dockerContainerName_`.
Or you can follow the steps in the Dockerfile by hand and try it on your machine, not in Docker.
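For example (the image name here is just a placeholder, and 7860 is Gradio's default port):
`docker build -t openbuddy-api .` and then `docker run -p 7860:7860 openbuddy-api`.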
It can also run with quart+uvicorn! Check the repo!'''
def greet(request: str, max_tokens: int = 64, override_system_prompt: str = ""):
    try:
        # Use the override prompt if one was provided, otherwise the prompt from system.prompt
        system_prompt = override_system_prompt if override_system_prompt != "" else prompt
        # Clamp max_tokens to a sane range (Gradio's Number component passes a float)
        max_tokens = int(max_tokens) if 0 < max_tokens < 256 else 64
        user_prompt = system_prompt + "\n\nUser: " + request + "\nAssistant: "
    except Exception:
        return "ERROR 400: Not enough data"
    try:
        output = llm(user_prompt, max_tokens=max_tokens, stop=["User:", "\n"], echo=False)
        print(output)
        return output["choices"][0]["text"]
    except Exception as e:
        print(e)
        return "ERROR 500: Server error. Check logs!!"
demo = gr.Interface(
    fn=greet,
    inputs=[
        gr.Text(value="Hello, how are you?", label="Request"),
        gr.Number(value=64, label="Max tokens"),
        gr.Textbox(label="Override system prompt"),
    ],
    outputs=["text"],
    description=desc,
    title=title,
    allow_flagging="never"
).queue()
if __name__ == "__main__":
    demo.launch()
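# Example of calling the running app from Python (a sketch, assuming it is
# reachable at http://127.0.0.1:7860 and that the gradio_client package is
# installed; "/predict" is Gradio's default endpoint name for a single Interface):
#
#   from gradio_client import Client
#   client = Client("http://127.0.0.1:7860/")
#   reply = client.predict("Hello, how are you?", 64, "", api_name="/predict")
#   print(reply)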