|
import torch |
|
import numpy as np |
|
from transformers import AutoProcessor, AutoModel |
|
import decord |
|
|
|
class InternVLEmbedder:
    """Compute L2-normalised video and text embeddings with an InternVL checkpoint.

    The model and processor are loaded once in ``__init__`` and reused by
    ``embed_video`` and ``embed_text``.

    NOTE(review): this class calls ``get_video_features`` and
    ``get_text_features`` on the loaded model. Whether the checkpoint's
    remote code actually provides those methods cannot be verified from this
    file -- confirm against the model card before relying on it.
    """

    DEFAULT_MODEL_NAME = "OpenGVLab/InternVL2_5-1B-MPO"
    DEFAULT_NUM_FRAMES = 8

    def __init__(self, model_name=DEFAULT_MODEL_NAME, device=None):
        """Load the model and its processor.

        Args:
            model_name: Hugging Face hub id (or local path) of the checkpoint.
                Defaults to the checkpoint this class was written for.
            device: Torch device string. When ``None``, ``"cuda"`` is used if
                available, otherwise ``"cpu"``.
        """
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = device

        # trust_remote_code=True executes Python shipped with the checkpoint;
        # only point model_name at sources you trust.
        self.model = AutoModel.from_pretrained(
            model_name, trust_remote_code=True
        ).to(self.device)
        self.processor = AutoProcessor.from_pretrained(
            model_name, trust_remote_code=True
        )

    @staticmethod
    def _l2_normalize(vector):
        """Scale ``vector`` by its norm; a zero-norm input is returned as-is."""
        norm = np.linalg.norm(vector)
        if norm == 0:
            # Dividing would fill the result with NaN/inf; a zero vector is
            # the only sensible "normalised" form of a zero vector.
            return vector
        return vector / norm

    def embed_video(self, video_path, num_frames=DEFAULT_NUM_FRAMES):
        """Embed a video from ``num_frames`` evenly spaced frames.

        Args:
            video_path: Path handed to ``decord.VideoReader``.
            num_frames: How many frames to sample between the first and the
                last frame (inclusive). Short videos repeat frames.

        Returns:
            The model's video features as a NumPy array, divided by their
            norm (unchanged if the norm is zero).

        Raises:
            ValueError: If ``num_frames`` is below 1 or the video has no
                frames.
        """
        if num_frames < 1:
            raise ValueError(f"num_frames must be >= 1, got {num_frames}")

        reader = decord.VideoReader(video_path)
        total_frames = len(reader)
        if total_frames == 0:
            # linspace(0, -1, n) would otherwise produce negative indices.
            raise ValueError(f"video has no frames: {video_path!r}")

        indices = np.linspace(0, total_frames - 1, num_frames).astype(int)
        frames = np.stack([reader[int(i)].asnumpy() for i in indices])

        # Move the last axis to position 1 and add a leading batch axis.
        # Assumes decord yields (height, width, channels) frames -- TODO confirm.
        # NOTE(review): frames are passed without dtype conversion or
        # normalisation; confirm the model expects raw pixel values.
        batch = torch.tensor(frames).permute(0, 3, 1, 2).unsqueeze(0)
        batch = batch.to(self.device)

        with torch.no_grad():
            features = self.model.get_video_features(batch)

        video_vector = features.squeeze(0).cpu().numpy()
        return self._l2_normalize(video_vector)

    def embed_text(self, text):
        """Embed a single text string.

        Args:
            text: The string to embed; it is passed to the processor as a
                one-element batch.

        Returns:
            The model's text features as a NumPy array, divided by their
            norm (unchanged if the norm is zero).
        """
        inputs = self.processor(text=[text], return_tensors="pt").to(self.device)

        with torch.no_grad():
            features = self.model.get_text_features(**inputs)

        text_vector = features.squeeze(0).cpu().numpy()
        return self._l2_normalize(text_vector)
|
|