Update app.py
app.py CHANGED
@@ -1,9 +1,8 @@
-from fastapi import FastAPI, Request
+from fastapi import FastAPI, Request
 from fastapi.responses import HTMLResponse, StreamingResponse
 from fastapi.staticfiles import StaticFiles
 from modules.pmbl import PMBL
 import torch
-import asyncio
 
 print(f"CUDA available: {torch.cuda.is_available()}")
 print(f"CUDA device count: {torch.cuda.device_count()}")
@@ -16,8 +15,6 @@ app.mount("/static", StaticFiles(directory="static"), name="static")
 app.mount("/templates", StaticFiles(directory="templates"), name="templates")
 
 pmbl = PMBL("./PMB-7b.Q6_K.gguf", gpu_layers=50)
-request_queue = asyncio.Queue()
-processing_lock = asyncio.Lock()
 
 @app.head("/")
 @app.get("/")
@@ -25,40 +22,19 @@ def index() -> HTMLResponse:
     with open("templates/index.html") as f:
         return HTMLResponse(content=f.read())
 
-async def process_request(user_input: str, mode: str):
-    async with processing_lock:
-        history = pmbl.get_chat_history(mode, user_input)
-        async for chunk in pmbl.generate_response(user_input, history, mode):
-            yield chunk
-
 @app.post("/chat")
-async def chat(request: Request
+async def chat(request: Request):
     try:
         data = await request.json()
         user_input = data["user_input"]
         mode = data["mode"]
-
-
-
-        async for chunk in await process_request(user_input, mode):
-            yield chunk
-
-        return StreamingResponse(response_generator(), media_type="text/plain")
+        history = pmbl.get_chat_history(mode, user_input)
+        response_generator = pmbl.generate_response(user_input, history, mode)
+        return StreamingResponse(response_generator, media_type="text/plain")
     except Exception as e:
         print(f"[SYSTEM] Error: {str(e)}")
         return {"error": str(e)}
 
-async def queue_worker():
-    while True:
-        user_input, mode = await request_queue.get()
-        async for _ in process_request(user_input, mode):
-            pass
-        request_queue.task_done()
-
-@app.on_event("startup")
-async def startup_event():
-    asyncio.create_task(queue_worker())
-
 @app.post("/sleep")
 async def sleep():
     try: