from fastapi import FastAPI
from llama_cpp import Llama
from pydantic import BaseModel

from chat import Chat

model_path = "zephyr-7b-beta.Q4_K_S.gguf"

# Load the quantized model on CPU. Set chat_format according to the model you are using.
# The response length is capped at generation time (e.g. max_tokens), not in the constructor.
llm = Llama(model_path=model_path, n_ctx=512)


class Validation(BaseModel):
    prompt: str


app = FastAPI()
chat = Chat(model=llm)


@app.post("/llm_on_cpu")
async def stream(item: Validation):
    # Send the user prompt to the chat wrapper and return its reply directly,
    # rather than feeding the generated reply back into the model a second time.
    chat.send_message(item.prompt)
    response = chat.generate_reply()
    return {"response": response}
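# A minimal sketch of launching and exercising the endpoint locally. Running the
# app via uvicorn on port 8000 and the example request below are assumptions for
# illustration, not part of the original snippet.
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)

# Example request once the server is up:
#   curl -X POST http://localhost:8000/llm_on_cpu \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "Explain quantization in one sentence."}'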