File size: 525 Bytes
f41e5fe 036f518 f41e5fe bc985a0 036f518 f41e5fe 036f518 461052c f41e5fe 036f518 f41e5fe 036f518 3fa3baf 036f518 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 |
from fastapi import FastAPI
from llama_cpp import Llama
from pydantic import BaseModel
from chat import Chat
# Path to the local GGUF quantized model file (Zephyr 7B beta, Q4_K_S quant).
model_path = "zephyr-7b-beta.Q4_K_S.gguf"
# Load the model with a 512-token context window.
# NOTE(review): `max_answer_len` is not a documented llama_cpp.Llama parameter
# (generation length is normally capped with `max_tokens` at call time) — it is
# likely absorbed by **kwargs and silently ignored; verify against the installed
# llama_cpp version. The comment mentions chat_format, but none is passed here.
llm = Llama(model_path=model_path, n_ctx=512, max_answer_len=100) # Set chat_format according to the model you are using
class validation(BaseModel):
    """Request body schema for the /llm_on_cpu endpoint.

    NOTE(review): class name should conventionally be PascalCase
    (e.g. `PromptRequest`); kept as-is to avoid breaking references.
    """
    # The user's input text to send to the chat model.
    prompt: str
# FastAPI application instance serving the inference endpoint.
app = FastAPI()
# Project-local Chat wrapper holding conversation state around the loaded model.
# NOTE(review): a single module-level Chat is shared by all requests, so
# concurrent callers share one conversation history — confirm this is intended.
chat = Chat(model=llm)
@app.post("/llm_on_cpu")
async def stream(item: validation):
    """Generate a chat reply for the submitted prompt.

    Args:
        item: Validated request body carrying the user's ``prompt``.

    Returns:
        The chat model's reply text for the prompt.
    """
    # Append the user's prompt to the shared conversation, then have the
    # Chat wrapper run the model to produce the assistant's reply.
    chat.send_message(item.prompt)
    response = chat.generate_reply()
    # BUG FIX: the original `return llm(response)` fed the already-generated
    # reply back into the raw model as a brand-new prompt, producing a
    # completion of the completion (and returning llama.cpp's raw completion
    # dict instead of the reply). Return the generated reply directly.
    return response
|