Damien Benveniste committed
Commit d809ddf · 1 Parent(s): accbc5a

added llm to app.py

Files changed (1):
  1. app.py +41 -4
app.py CHANGED
@@ -1,10 +1,47 @@
- from fastapi import FastAPI
+ from fastapi import FastAPI, Request
+ from fastapi.responses import StreamingResponse
+ from pydantic import BaseModel
+ from vllm import AsyncEngineArgs, AsyncLLMEngine, SamplingParams
+ import uuid
+ import json

  app = FastAPI()

- @app.get("/")
- def greet_json():
-     return {"Hello": "World!"}
+ # Initialize the AsyncLLMEngine
+ # Replace the model name below with the path or name of the model you want to serve
+ engine = AsyncLLMEngine.from_engine_args(AsyncEngineArgs(model='microsoft/Phi-3-mini-4k-instruct'))
+
+ class GenerationRequest(BaseModel):
+     prompt: str
+     max_tokens: int = 100
+     temperature: float = 0.7
+
+ async def generate_stream(prompt: str, max_tokens: int, temperature: float):
+     sampling_params = SamplingParams(
+         temperature=temperature,
+         max_tokens=max_tokens
+     )
+
+     async for output in engine.generate(prompt, sampling_params, str(uuid.uuid4())):  # each call needs a unique request_id
+         yield f"data: {json.dumps({'text': output.outputs[0].text})}\n\n"
+
+     yield "data: [DONE]\n\n"
+
+ @app.post("/generate-stream")
+ async def generate_text(request: Request):
+     try:
+         data = await request.json()
+         gen_request = GenerationRequest(**data)
+
+         return StreamingResponse(
+             generate_stream(gen_request.prompt, gen_request.max_tokens, gen_request.temperature),
+             media_type="text/event-stream"
+         )
+     except Exception as e:
+         return StreamingResponse(
+             iter([f"data: {json.dumps({'error': str(e)})}\n\n"]),
+             media_type="text/event-stream"
+         )
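For reference, here is a minimal sketch of a client consuming the new /generate-stream endpoint. It is not part of the commit: the base URL (http://localhost:8000) and the choice of httpx as HTTP client are assumptions; point the URL at wherever the FastAPI app is actually served.

# Hypothetical streaming client for the /generate-stream endpoint defined above.
# The base URL is an assumption; adjust it to wherever the server runs.
import asyncio
import json

import httpx

async def main():
    payload = {"prompt": "Write a haiku about GPUs", "max_tokens": 64, "temperature": 0.7}
    async with httpx.AsyncClient(timeout=None) as client:
        async with client.stream("POST", "http://localhost:8000/generate-stream", json=payload) as response:
            async for line in response.aiter_lines():
                if not line.startswith("data: "):
                    continue  # skip blank separator lines between SSE events
                data = line[len("data: "):]
                if data == "[DONE]":
                    break
                event = json.loads(data)
                # Each event carries either the text generated so far or an error message
                print(event.get("text", event.get("error")))

if __name__ == "__main__":
    asyncio.run(main())

Note that each server-sent event carries the cumulative text generated so far (that is how vLLM's RequestOutput reports streaming results), so the last event before [DONE] holds the full completion.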