# Spaces: Runtime error
from fastapi import FastAPI, HTTPException, Request
from fastapi.responses import JSONResponse
import gradio as gr
# ASGI application object; it is handed to uvicorn in the __main__ guard.
app = FastAPI()

# Chat model shared by every request: loaded from "model.gguf" with a
# 4000-token context window, 2 threads, and the "chatml" chat format.
# NOTE(review): `gr` is gradio, which is not known to export `Llama`; these
# keyword arguments (model_path, n_ctx, n_threads, chat_format) match
# `llama_cpp.Llama`. This line is the likely source of the Space's runtime
# error -- confirm and switch to `from llama_cpp import Llama` if so.
llm = gr.Llama(model_path="model.gguf", n_ctx=4000, n_threads=2, chat_format="chatml")
def _build_chat_messages(message, history):
    """Return the chat-completion message list: system prompt, prior turns, new message."""
    messages = [
        {"role": "system", "content": "You are OpenChat, a useful AI assistant."}
    ]
    # Each history entry is unpacked as a (user_prompt, bot_response) pair.
    for user_prompt, bot_response in history:
        messages.append({"role": "user", "content": user_prompt})
        messages.append({"role": "assistant", "content": bot_response})
    messages.append({"role": "user", "content": message})
    return messages


# NOTE(review): no route decorator (e.g. ``@app.post(...)``) is visible on this
# handler; confirm it is registered on ``app`` somewhere, otherwise it is never
# reachable over HTTP.
async def chat_post(request: Request):
    """Generate a chat reply for the posted message and return it as JSON.

    The JSON request body is read for these keys:
        message: the new user message (required).
        history: iterable of (user_prompt, bot_response) pairs; defaults to [].
        temperature: sampling temperature passed to the model; defaults to 0.3.
        max_tokens: completion token limit passed to the model; defaults to 512.

    Returns:
        JSONResponse whose body is ``{"response": <full generated text>}``.

    Raises:
        HTTPException: status 400 when the body is not valid JSON, is not a
            JSON object, or has no "message" key.
    """
    try:
        data = await request.json()
    except ValueError as err:  # json.JSONDecodeError is a ValueError subclass
        raise HTTPException(
            status_code=400, detail="Request body must be valid JSON."
        ) from err
    if not isinstance(data, dict):
        raise HTTPException(
            status_code=400, detail="Request body must be a JSON object."
        )

    message = data.get("message")
    if message is None:
        raise HTTPException(status_code=400, detail='Missing "message" field.')

    # ``or []`` also covers an explicit JSON null for "history".
    history = data.get("history") or []
    temperature = data.get("temperature", 0.3)
    max_tokens = data.get("max_tokens", 512)

    stream_response = llm.create_chat_completion(
        messages=_build_chat_messages(message, history),
        temperature=temperature,
        max_tokens=max_tokens,
        stream=True,
    )

    # The previous code awaited an async generator object, which raises
    # TypeError. Collect the streamed deltas synchronously and join them once.
    pieces = []
    for chunk in stream_response:
        # Some deltas carry no "content" key (or are empty); skip those.
        piece = chunk["choices"][0]["delta"].get("content")
        if piece:
            pieces.append(piece)

    return JSONResponse(content={"response": "".join(pieces)})
async def chat_get():
    """Return a usage hint telling the caller to POST to the chat endpoint."""
    return {"message": "Send a POST request to this endpoint to chat."}
# Script entry point: serve ``app`` with uvicorn on all interfaces, port 8000.
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)