VLLM Integration
#44 · opened by ssingh22
Could we somehow integrate this with vLLM? That would make it much faster.
@ssingh22 Here is example code for an inference API in a multi-GPU environment using vLLM (==0.5.0.post1) and FastAPI.
```python
import os
import asyncio
from typing import List

from vllm import LLM
from fastapi import FastAPI, Body
from fastapi.middleware.cors import CORSMiddleware

# Set the tensor parallel size from environment variables
TENSOR_PARALLEL_SIZE = int(os.getenv("TENSOR_PARALLEL_SIZE", 1))


class E5Mistral7bInstruct:
    def __init__(self):
        # Initialize the model with the specified tensor parallel size
        self.model = LLM(
            model="intfloat/e5-mistral-7b-instruct",
            enforce_eager=True,
            tensor_parallel_size=TENSOR_PARALLEL_SIZE,
        )

    async def embedding(self, sentences_batch: List[str]):
        # Run the blocking encode call in a worker thread so the
        # event loop stays responsive
        request_outputs = await asyncio.to_thread(
            self.model.encode,
            sentences_batch,
            use_tqdm=False,
        )
        # Extract the raw embedding vectors so the response is
        # JSON-serializable
        return [output.outputs.embedding for output in request_outputs]


# Create an instance of the embedding model
embedding_model = E5Mistral7bInstruct()

# Initialize the FastAPI app
ROOT_PATH = os.getenv("ROOT_PATH", "")
app = FastAPI(root_path=ROOT_PATH)

# Configure CORS middleware
app.add_middleware(
    CORSMiddleware,
    allow_origins=[
        "http://localhost",
        "http://localhost:8080",
    ],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


@app.post("/text-embeddings")
async def generate(
    sentences: List[str] = Body(
        ["What is the meaning of life?"],
        title="Input Sentences List",
        description="List of sentences to generate embeddings for",
    ),
):
    # Return the embeddings for the provided sentences
    return await embedding_model.embedding(sentences)
```
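If it helps, here is a minimal client sketch for calling the endpoint. It assumes the server file is saved as `app.py` and launched with something like `TENSOR_PARALLEL_SIZE=2 uvicorn app:app --host 0.0.0.0 --port 8000`; the module name, host, and port are assumptions, not part of the code above.

```python
# Minimal client sketch; host/port are assumptions matching a local
# `uvicorn app:app --port 8000` launch.
import requests

resp = requests.post(
    "http://localhost:8000/text-embeddings",
    json=["What is the meaning of life?", "How do I bake bread?"],
)
resp.raise_for_status()

# One embedding vector per input sentence
embeddings = resp.json()
print(len(embeddings), len(embeddings[0]))
```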
Awesome!
ssingh22 changed discussion status to closed