from fastapi import FastAPI, HTTPException, Request
from fastapi.concurrency import run_in_threadpool
from fastapi.responses import JSONResponse
from llama_cpp import Llama

app = FastAPI()

# Load the GGUF model with llama-cpp-python; context size, thread count and
# chat template come from the original configuration.
llm = Llama(model_path="model.gguf", n_ctx=4000, n_threads=2, chat_format="chatml")

@app.post("/api/v1/chat")
async def chat_post(request: Request):
    data = await request.json()
    message = data.get("message")
    history = data.get("history", [])
    temperature = data.get("temperature", 0.3)
    max_tokens = data.get("max_tokens", 512)

    if message is None:
        raise HTTPException(status_code=400, detail="'message' is required.")

    def generate() -> str:
        # Rebuild the conversation in the chat format the model expects.
        system_prompt = "You are OpenChat, a helpful AI assistant."
        formatted_prompt = [{"role": "system", "content": system_prompt}]
        for user_prompt, bot_response in history:
            formatted_prompt.append({"role": "user", "content": user_prompt})
            formatted_prompt.append({"role": "assistant", "content": bot_response})
        formatted_prompt.append({"role": "user", "content": message})

        # Stream tokens from the model and accumulate them into a single reply.
        stream_response = llm.create_chat_completion(
            messages=formatted_prompt,
            temperature=temperature,
            max_tokens=max_tokens,
            stream=True,
        )
        response = ""
        for chunk in stream_response:
            delta = chunk["choices"][0]["delta"]
            if "content" in delta:
                response += delta["content"]
        return response

    # llama.cpp inference is blocking, so run it in a worker thread
    # instead of stalling the event loop.
    response = await run_in_threadpool(generate)
    return JSONResponse(content={"response": response})

@app.get("/api/v1/chat")
async def chat_get():
    return {"message": "Send a POST request to this endpoint to chat."}

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
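
# Example request (a minimal sketch, assuming the server is running locally on
# port 8000; the payload keys mirror what chat_post reads above):
#
#   curl -X POST http://localhost:8000/api/v1/chat \
#        -H "Content-Type: application/json" \
#        -d '{"message": "Hello!", "history": [], "temperature": 0.3, "max_tokens": 256}'
#
# The endpoint responds with JSON of the form {"response": "<generated text>"}.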