File size: 2,043 Bytes
d809ddf
 
 
 
1d83e4f
d809ddf
 
b210a93
48d8d65
 
 
d809ddf
 
1d83e4f
 
 
0306c33
 
 
ae23345
a959d74
0306c33
1d83e4f
 
d809ddf
 
 
 
 
 
 
 
 
 
 
b210a93
 
d809ddf
b210a93
d809ddf
 
 
 
c7bb7b5
 
 
 
 
d809ddf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ef8deae
 
48d8d65
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
from fastapi import FastAPI, Request
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
from vllm import AsyncLLMEngine, SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs
import asyncio
import json
import uuid

# ASGI application object; the route decorators below register handlers on it.
app = FastAPI()

# Build the single shared vLLM async engine at import time; generate_stream()
# submits every request to this instance.
# To serve a different model, change the `model` argument below.
engine = AsyncLLMEngine.from_engine_args(
    AsyncEngineArgs(
        model='microsoft/Phi-3-mini-4k-instruct',
        max_num_batched_tokens=512,    # Reduced for T4
        max_num_seqs=16,               # Reduced for T4
        gpu_memory_utilization=0.85,   # Slightly increased, adjust if needed
        # Context length is capped at 512 tokens here, not the 4k the model
        # name advertises.
        # NOTE(review): presumably prompt + generated tokens must fit in this
        # budget -- confirm against the vLLM docs.
        max_model_len=512,
        enforce_eager=True,            # Disable CUDA graph
        dtype='half',                  # Use half precision
    )
)

class GenerationRequest(BaseModel):
    """JSON body accepted by the ``/generate-stream`` endpoint."""

    # Text handed to the engine as the prompt.
    prompt: str
    # Forwarded to SamplingParams(max_tokens=...); 100 when the client omits it.
    max_tokens: int = 100
    # Forwarded to SamplingParams(temperature=...); 0.7 when the client omits it.
    temperature: float = 0.7

async def generate_stream(prompt: str, max_tokens: int, temperature: float):
    """Stream a completion for *prompt* as Server-Sent Events.

    Every yielded string is one SSE frame (``data: ...`` followed by a blank
    line). While the engine produces output, each frame carries a JSON object
    ``{"text": ...}`` taken from the first candidate of the latest engine
    output. On success the stream ends with a literal ``data: [DONE]`` frame.
    If building the sampling parameters or running the engine fails, a single
    ``{"error": ...}`` frame is emitted instead and the stream ends without
    ``[DONE]``.

    Args:
        prompt: Text passed to the engine as the prompt.
        max_tokens: Forwarded to ``SamplingParams(max_tokens=...)``.
        temperature: Forwarded to ``SamplingParams(temperature=...)``.

    Yields:
        SSE-formatted strings as described above.
    """
    import logging  # local import so the file-level import block stays untouched

    try:
        # Constructed inside the try on purpose: this generator body only runs
        # once the response is already being streamed, so a rejected parameter
        # value can no longer be reported by the endpoint's own error handling.
        sampling_params = SamplingParams(
            temperature=temperature,
            max_tokens=max_tokens,
        )
        # The engine requires a unique id per in-flight request.
        request_id = str(uuid.uuid4())

        async for output in engine.generate(prompt, sampling_params, request_id=request_id):
            # NOTE(review): vLLM appears to report the full text generated so
            # far in each output, so frames are cumulative rather than deltas
            # -- confirm this is what clients expect.
            yield f"data: {json.dumps({'text': output.outputs[0].text})}\n\n"
    except Exception as exc:
        # Cancellation (client disconnect) is a BaseException and is not
        # caught here, so it still propagates to the engine and the server.
        logging.getLogger(__name__).exception("Generation failed")
        # Same frame shape the endpoint uses for request-validation errors.
        yield f"data: {json.dumps({'error': str(exc)})}\n\n"
        return

    yield "data: [DONE]\n\n"

@app.get("/")
def greet_json():
    """Serve the fixed hello-world JSON payload at the root path."""
    greeting = {"Hello": "World!"}
    return greeting


@app.post("/generate-stream")
async def generate_text(request: Request):
    """Validate the JSON body and stream the generated text as SSE.

    The body is parsed by hand rather than declared as a typed parameter, so
    malformed input is answered in the same ``text/event-stream`` format as a
    successful call: a single ``data: {"error": ...}`` frame.

    Args:
        request: Incoming request whose JSON body must match
            ``GenerationRequest``.

    Returns:
        A ``StreamingResponse`` with media type ``text/event-stream`` carrying
        either the generation stream or one error frame.
    """
    try:
        data = await request.json()
        gen_request = GenerationRequest(**data)
    except (ValueError, TypeError) as exc:
        # ValueError covers json.JSONDecodeError (invalid JSON) and pydantic's
        # ValidationError (wrong or missing fields); TypeError is raised by
        # ``**data`` when the body is valid JSON but not an object.
        # Anything else is a server-side bug and is left to propagate.
        return StreamingResponse(
            iter([f"data: {json.dumps({'error': str(exc)})}\n\n"]),
            media_type="text/event-stream",
        )

    # Building the response cannot fail on bad input, so it stays outside the
    # try: the generator body only starts running once streaming begins.
    return StreamingResponse(
        generate_stream(gen_request.prompt, gen_request.max_tokens, gen_request.temperature),
        media_type="text/event-stream",
    )