from fastapi import FastAPI
from llama_cpp import Llama
from pydantic import BaseModel

from chat import Chat

model_path = "zephyr-7b-beta.Q4_K_S.gguf"

# Load the quantized model on CPU. Set chat_format according to the model you are using.
# The response length is capped at generation time (e.g. max_tokens), not in the constructor.
llm = Llama(model_path=model_path, n_ctx=512)


class Validation(BaseModel):
    prompt: str


app = FastAPI()
chat = Chat(model=llm)


@app.post("/llm_on_cpu")
async def stream(item: Validation):
    # Send the user prompt to the chat wrapper and return its reply directly,
    # rather than feeding the generated reply back into the model a second time.
    chat.send_message(item.prompt)
    response = chat.generate_reply()
    return {"response": response}
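# A minimal sketch of launching and exercising the endpoint locally. Running the
# app via uvicorn on port 8000 and the example request below are assumptions for
# illustration, not part of the original snippet.
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)

# Example request once the server is up:
#   curl -X POST http://localhost:8000/llm_on_cpu \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "Explain quantization in one sentence."}'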