deploy_vLLM / app.py
from fastapi import FastAPI, Request
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
from vllm import AsyncEngineArgs, AsyncLLMEngine, SamplingParams
import json
import uuid

app = FastAPI()

# Initialize the AsyncLLMEngine.
# Replace the model name below with the path or Hub ID of the model you want to serve.
engine = AsyncLLMEngine.from_engine_args(
    AsyncEngineArgs(model='microsoft/Phi-3-mini-4k-instruct')
)


class GenerationRequest(BaseModel):
    prompt: str
    max_tokens: int = 100
    temperature: float = 0.7


async def generate_stream(prompt: str, max_tokens: int, temperature: float):
    sampling_params = SamplingParams(
        temperature=temperature,
        max_tokens=max_tokens
    )
    # Each request needs a unique ID; engine.generate() is an async generator,
    # so partial results stream back as they are produced.
    request_id = str(uuid.uuid4())
    async for output in engine.generate(prompt, sampling_params, request_id):
        # output.outputs[0].text holds the full text generated so far.
        yield f"data: {json.dumps({'text': output.outputs[0].text})}\n\n"
    yield "data: [DONE]\n\n"


@app.post("/generate-stream")
async def generate_text(request: Request):
    try:
        data = await request.json()
        gen_request = GenerationRequest(**data)
        return StreamingResponse(
            generate_stream(gen_request.prompt, gen_request.max_tokens, gen_request.temperature),
            media_type="text/event-stream"
        )
    except Exception as e:
        return StreamingResponse(
            iter([f"data: {json.dumps({'error': str(e)})}\n\n"]),
            media_type="text/event-stream"
        )
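
For reference, here is a minimal client sketch (not part of the original file) showing how the SSE stream from /generate-stream can be consumed. It assumes the app above is saved as app.py and served locally, e.g. with "uvicorn app:app --host 0.0.0.0 --port 8000"; the URL, port, prompt, and the stream_generate helper name are illustrative.

# client.py - illustrative streaming client for the /generate-stream endpoint
import json
import requests

def stream_generate(prompt: str, url: str = "http://localhost:8000/generate-stream"):
    payload = {"prompt": prompt, "max_tokens": 100, "temperature": 0.7}
    with requests.post(url, json=payload, stream=True) as response:
        response.raise_for_status()
        for line in response.iter_lines(decode_unicode=True):
            # SSE events arrive as "data: {...}" lines separated by blank lines.
            if not line or not line.startswith("data: "):
                continue
            data = line[len("data: "):]
            if data == "[DONE]":
                break
            yield json.loads(data).get("text", "")

if __name__ == "__main__":
    # Note: the server sends the cumulative text generated so far in each event.
    for text in stream_generate("Explain what vLLM is in one sentence."):
        print(text)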