import numpy as np
import torch

import decord
from transformers import AutoModel, AutoProcessor


class InternVLEmbedder:
    """Embed videos and text into L2-normalized vectors using InternVL.

    Loads the InternVL model (remote code) and its processor once at
    construction, then exposes `embed_video` / `embed_text` which return
    unit-norm numpy vectors suitable for cosine-similarity retrieval.
    """

    # Default checkpoint; hoisted so the ID is not duplicated and callers
    # can swap in a different InternVL variant without editing the class.
    DEFAULT_MODEL = "OpenGVLab/InternVL2_5-1B-MPO"

    def __init__(self, model_name: str = DEFAULT_MODEL):
        """Load model + processor onto GPU if available, else CPU.

        Args:
            model_name: Hugging Face model ID (must support
                `get_video_features` / `get_text_features` via remote code).
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = AutoModel.from_pretrained(
            model_name, trust_remote_code=True
        ).to(self.device)
        # Fix: switch to inference mode so dropout/batch-norm layers are
        # deterministic; `from_pretrained` alone does not guarantee this
        # for remote-code models.
        self.model.eval()
        self.processor = AutoProcessor.from_pretrained(
            model_name, trust_remote_code=True
        )

    def embed_video(self, video_path: str, num_frames: int = 8) -> np.ndarray:
        """Return a unit-norm embedding for the video at `video_path`.

        Args:
            video_path: Path to a video file readable by decord.
            num_frames: Number of frames sampled uniformly across the video
                (was a hard-coded 8; default preserves old behavior).

        Returns:
            1-D numpy vector with unit L2 norm.
        """
        vr = decord.VideoReader(video_path)
        # Uniformly sample frame indices over the full clip; duplicates are
        # possible for very short videos, matching the original behavior.
        indices = np.linspace(0, len(vr) - 1, num_frames).astype(int)
        frames = np.stack([vr[i].asnumpy() for i in indices])
        # NOTE(review): frames are raw uint8 HWC with no resize/normalize —
        # presumably the remote-code `get_video_features` preprocesses
        # internally; confirm, otherwise run frames through self.processor.
        # `from_numpy` avoids the extra copy `torch.tensor` would make.
        tensor = (
            torch.from_numpy(frames)
            .permute(0, 3, 1, 2)  # (T, H, W, C) -> (T, C, H, W)
            .unsqueeze(0)         # add batch dim -> (1, T, C, H, W)
            .to(self.device)
        )
        with torch.no_grad():
            video_vector = (
                self.model.get_video_features(tensor).squeeze(0).cpu().numpy()
            )
        # NOTE(review): a zero vector would yield NaNs here; kept as-is to
        # preserve the original contract.
        return video_vector / np.linalg.norm(video_vector)

    def embed_text(self, text: str) -> np.ndarray:
        """Return a unit-norm embedding for `text`.

        Args:
            text: Query string to embed.

        Returns:
            1-D numpy vector with unit L2 norm.
        """
        inputs = self.processor(text=[text], return_tensors="pt").to(self.device)
        with torch.no_grad():
            text_vector = (
                self.model.get_text_features(**inputs).squeeze(0).cpu().numpy()
            )
        return text_vector / np.linalg.norm(text_vector)