# Streaming text-generation API: FastAPI front end over a vLLM AsyncLLMEngine.
# (File-viewer residue stripped from this header: "File size: 1,439 Bytes",
# blame revisions d809ddf / 48d8d65 / ef8deae, and a 1-47 line-number gutter.)
import asyncio
import json
import uuid

from fastapi import FastAPI, Request
from fastapi.responses import StreamingResponse
from pydantic import BaseModel, Field
from vllm import AsyncLLMEngine, SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs

app = FastAPI()

# Build the shared async engine once, at import time.
# AsyncLLMEngine is constructed through from_engine_args(); it does not
# provide a from_pretrained() factory. The model may be a Hugging Face hub
# name (as here) or a local path.
engine = AsyncLLMEngine.from_engine_args(
    AsyncEngineArgs(model='microsoft/Phi-3-mini-4k-instruct')
)

class GenerationRequest(BaseModel):
    """JSON body accepted by the ``/generate-stream`` endpoint.

    The numeric bounds are enforced here so that an out-of-range value is
    rejected while the request is being parsed, instead of surfacing later
    when the values are handed to ``SamplingParams`` after the streaming
    response has already started.
    """

    # Text to complete; required, no default.
    prompt: str
    # Upper bound on the number of generated tokens; must be at least 1.
    max_tokens: int = Field(100, ge=1)
    # Sampling temperature; must not be negative.
    temperature: float = Field(0.7, ge=0.0)

async def generate_stream(prompt: str, max_tokens: int, temperature: float):
    """Stream a completion for *prompt* as Server-Sent Events frames.

    Args:
        prompt: Text handed to the engine as the generation prompt.
        max_tokens: Token limit forwarded to ``SamplingParams``.
        temperature: Sampling temperature forwarded to ``SamplingParams``.

    Yields:
        One ``data: {"text": ...}`` frame per engine update, followed by a
        final ``data: [DONE]`` frame. If building the sampling parameters or
        running generation raises, a single ``data: {"error": ...}`` frame is
        emitted instead and the stream ends without ``[DONE]``, matching the
        error frame produced by the ``/generate-stream`` handler.

    NOTE(review): each frame carries ``output.outputs[0].text`` unchanged;
    with vLLM's default output mode this appears to be the full text so far
    rather than a delta -- confirm before changing client-side parsing.
    """
    # The third positional argument of engine.generate() is the request id,
    # not a streaming flag. It has to be unique per in-flight request, so a
    # constant (the original passed True) makes concurrent requests collide.
    request_id = uuid.uuid4().hex

    try:
        sampling_params = SamplingParams(
            temperature=temperature,
            max_tokens=max_tokens,
        )
        async for output in engine.generate(prompt, sampling_params, request_id):
            yield f"data: {json.dumps({'text': output.outputs[0].text})}\n\n"
    except Exception as e:
        # The HTTP response has already started, so the failure can only be
        # reported in-band. Cancellation (a BaseException) is not caught and
        # still propagates.
        yield f"data: {json.dumps({'error': str(e)})}\n\n"
        return

    yield "data: [DONE]\n\n"

@app.post("/generate-stream")
async def generate_text(request: Request):
    """Handle ``POST /generate-stream`` and reply with an SSE stream.

    The raw JSON body is read from *request* and validated by constructing a
    ``GenerationRequest``. On success the response streams the frames
    produced by ``generate_stream``. If reading, validating, or building the
    response raises, the reply is a one-frame stream of the form
    ``data: {"error": "<message>"}``.
    """
    sse_media_type = "text/event-stream"

    try:
        payload = await request.json()
        params = GenerationRequest(**payload)
        event_source = generate_stream(
            params.prompt,
            params.max_tokens,
            params.temperature,
        )
        return StreamingResponse(event_source, media_type=sse_media_type)
    except Exception as e:
        # Errors are reported in-band as a single SSE frame.
        error_frames = [f"data: {json.dumps({'error': str(e)})}\n\n"]
        return StreamingResponse(iter(error_frames), media_type=sse_media_type)