demo-llm / main.py
from fastapi import FastAPI
from llama_cpp import Llama
from pydantic import BaseModel
from chat import Chat
model_path = "zephyr-7b-beta.Q4_K_S.gguf"
llm = Llama(model_path=model_path, n_ctx=512)  # Set chat_format according to the model you are using; cap answer length with max_tokens at generation time
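# Hedged example: recent llama-cpp-python releases register a "zephyr" chat
# format, so for this Zephyr GGUF the constructor could plausibly be the line
# below (an assumption to verify against your installed version and model card):
#   llm = Llama(model_path=model_path, n_ctx=512, chat_format="zephyr")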
class Validation(BaseModel):
    prompt: str
app = FastAPI()
chat = Chat(model=llm)
@app.post("/llm_on_cpu")
async def stream(item: Validation):
    # Pass the user's prompt to the chat session and return the generated reply.
    chat.send_message(item.prompt)
    response = chat.generate_reply()
    return {"response": response}
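# Usage (assuming this file is saved as main.py and uvicorn is installed):
#   uvicorn main:app --host 0.0.0.0 --port 8000
#   curl -X POST http://localhost:8000/llm_on_cpu \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "Hello"}'
#
# The local chat module imported above is not part of this file. A minimal
# sketch of a compatible chat.py, assuming it wraps llama-cpp-python's
# create_chat_completion API (the names and parameters below are illustrative
# assumptions, not the actual implementation):
#
#   class Chat:
#       def __init__(self, model):
#           self.model = model      # llama_cpp.Llama instance
#           self.messages = []      # running conversation history
#
#       def send_message(self, prompt: str) -> None:
#           self.messages.append({"role": "user", "content": prompt})
#
#       def generate_reply(self, max_tokens: int = 100) -> str:
#           out = self.model.create_chat_completion(
#               messages=self.messages, max_tokens=max_tokens)
#           reply = out["choices"][0]["message"]["content"]
#           self.messages.append({"role": "assistant", "content": reply})
#           return reply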