blip / src /embedder.py
amezi's picture
more minor bugs fixed
2e8783e
raw
history blame
1.19 kB
import torch
import numpy as np
from transformers import AutoProcessor, AutoModel
import decord
class InternVLEmbedder:
    """Produce unit-length video and text embeddings from an InternVL checkpoint.

    The model and processor are loaded once at construction time (with
    ``trust_remote_code=True``) and the model is moved to CUDA when available,
    otherwise it stays on the CPU.

    NOTE(review): ``embed_video`` and ``embed_text`` call
    ``model.get_video_features`` / ``model.get_text_features``; these must be
    provided by the checkpoint's remote code -- confirm they exist for the
    selected model.
    """

    # Checkpoint used when the caller does not pass ``model_name``.
    DEFAULT_MODEL_NAME = "OpenGVLab/InternVL2_5-1B-MPO"

    def __init__(self, model_name=None):
        """Load the model and its processor.

        Args:
            model_name: Hugging Face model id or local path. Defaults to
                ``DEFAULT_MODEL_NAME`` so existing ``InternVLEmbedder()``
                callers keep their behaviour.
        """
        if model_name is None:
            model_name = self.DEFAULT_MODEL_NAME
        self.model_name = model_name
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = AutoModel.from_pretrained(
            model_name, trust_remote_code=True
        ).to(self.device)
        self.processor = AutoProcessor.from_pretrained(
            model_name, trust_remote_code=True
        )

    @staticmethod
    def _l2_normalize(vector):
        """Return *vector* divided by its L2 norm.

        The norm is taken over the whole array (``np.linalg.norm`` with no
        axis). A zero-norm input is returned unchanged instead of being
        divided by zero, which would fill the result with NaNs.
        """
        norm = np.linalg.norm(vector)
        if norm == 0:
            return vector
        return vector / norm

    def embed_video(self, video_path, num_frames=8):
        """Embed a video by sampling evenly spaced frames.

        Args:
            video_path: Path handed to ``decord.VideoReader``.
            num_frames: How many frames to sample between the first and the
                last frame (inclusive). Videos shorter than this yield
                repeated indices.

        Returns:
            The model's video features as a NumPy array scaled to unit L2
            norm (left unscaled if the norm is zero).

        Raises:
            ValueError: If ``num_frames`` is less than 1 or the video has no
                frames.
        """
        if num_frames < 1:
            raise ValueError(f"num_frames must be >= 1, got {num_frames}")

        vr = decord.VideoReader(video_path)
        total_frames = len(vr)
        if total_frames == 0:
            # linspace(0, -1, n) would otherwise produce negative indices.
            raise ValueError(f"video has no frames: {video_path!r}")

        indices = np.linspace(0, total_frames - 1, num_frames).astype(int)
        frames = np.stack([vr[int(i)].asnumpy() for i in indices])

        # Stacked frames are 4-D; move the last axis to position 1 and add a
        # leading batch axis (presumably (T, H, W, C) -> (1, T, C, H, W)).
        # NOTE(review): frames are passed with their decoded dtype and no
        # processor normalisation -- confirm the model expects that.
        tensor = (
            torch.tensor(frames).permute(0, 3, 1, 2).unsqueeze(0).to(self.device)
        )
        with torch.no_grad():
            features = self.model.get_video_features(tensor)
        video_vector = features.squeeze(0).cpu().numpy()
        return self._l2_normalize(video_vector)

    def embed_text(self, text):
        """Embed a single text string.

        Args:
            text: The string to embed; it is wrapped in a one-element batch
                for the processor.

        Returns:
            The model's text features as a NumPy array scaled to unit L2
            norm (left unscaled if the norm is zero).
        """
        inputs = self.processor(text=[text], return_tensors="pt").to(self.device)
        with torch.no_grad():
            features = self.model.get_text_features(**inputs)
        text_vector = features.squeeze(0).cpu().numpy()
        return self._l2_normalize(text_vector)