vLLM Integration

#44
by ssingh22 - opened

Could we somehow integrate this with vLLM?

That would make inference much faster.

@ssingh22 Here is some example code for an inference API in a multi-GPU environment, using vLLM (==0.5.0.post1) and FastAPI:

import os
import asyncio
import torch
from vllm import LLM
from fastapi import FastAPI, Body
from fastapi.middleware.cors import CORSMiddleware
from typing import List

# Tensor parallel size handed to vLLM; taken from the environment, 1 when unset
TENSOR_PARALLEL_SIZE = int(os.environ.get("TENSOR_PARALLEL_SIZE", 1))

class E5Mistral7bInstruct:
    """Wrapper around a vLLM ``LLM`` loaded with intfloat/e5-mistral-7b-instruct."""

    def __init__(self):
        # Load the model once, using the module-level tensor parallel size
        self.model = LLM(
            model="intfloat/e5-mistral-7b-instruct",
            enforce_eager=True,
            tensor_parallel_size=TENSOR_PARALLEL_SIZE,
        )

    async def embedding(self, sentences_batch: List[str]):
        """Encode ``sentences_batch`` without blocking the event loop.

        The synchronous ``self.model.encode`` call is run in a worker thread
        through ``asyncio.to_thread`` with the progress bar disabled
        (``use_tqdm=False``). Whatever ``encode`` returns is passed back to
        the caller unchanged.
        """
        # NOTE(review): overlapping requests each run encode on their own
        # worker thread - confirm the vLLM LLM object tolerates concurrent calls.
        encode = self.model.encode
        return await asyncio.to_thread(encode, sentences_batch, use_tqdm=False)

# Load the embedding model once at import time; the endpoint below reuses it
embedding_model = E5Mistral7bInstruct()

# Build the FastAPI app; ROOT_PATH comes from the environment ("" when unset)
ROOT_PATH = os.environ.get("ROOT_PATH", "")
app = FastAPI(root_path=ROOT_PATH)

# Register CORS middleware for the two local origins
app.add_middleware(
    CORSMiddleware,
    allow_origins=["http://localhost", "http://localhost:8080"],
    allow_methods=["*"],
    allow_headers=["*"],
    allow_credentials=True,
)



@app.post("/text-embeddings")
async def generate(
    sentences: List[str] = Body(
        ["What is the meaning of life?"],
        title="Input Sentences List",
        description="List of sentences to generate embeddings for",
    ),
):
    """Generate embeddings for a list of sentences.

    The request body is a list of strings; when no body is sent, the
    single default sentence declared above is used. The list is forwarded
    to ``embedding_model.embedding`` and its result is returned as the
    response.
    """
    return await embedding_model.embedding(sentences)
ssingh22 changed discussion status to closed

Sign up or log in to comment