File size: 1,267 Bytes
58ac08a 61b5ef6 58ac08a 61b5ef6 fb3036a 61b5ef6 58ac08a 61b5ef6 58ac08a 61b5ef6 58ac08a 61b5ef6 58ac08a 61b5ef6 58ac08a 61b5ef6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 |
import torch
import numpy as np
from transformers import XCLIPProcessor, XCLIPModel
from decord import VideoReader, cpu
class XCLIPEmbedder:
    """Produce L2-normalized video and text embeddings with an X-CLIP model.

    Video and text vectors live in the same joint embedding space, so the
    dot product (cosine similarity, given unit norm) between a video
    embedding and a text embedding is a meaningful relevance score.
    """

    def __init__(self, model_name="microsoft/xclip-large-patch14", device=None):
        """Load the X-CLIP model and its processor.

        Args:
            model_name: Hugging Face model identifier to load.
            device: Optional torch device (string or torch.device). Defaults
                to CUDA when available, otherwise CPU.
        """
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device)
        self.model = XCLIPModel.from_pretrained(model_name).to(self.device)
        # Inference only: make eval mode explicit (disables dropout etc.).
        self.model.eval()
        self.processor = XCLIPProcessor.from_pretrained(model_name)

    def embed_video(self, video_path, num_frames=8):
        """Return a unit-norm embedding for the video at *video_path*.

        Args:
            video_path: Path to a video file readable by decord.
            num_frames: Number of frames to sample, evenly spaced over the
                clip. Defaults to 8, the frame count X-CLIP checkpoints
                are trained with.

        Returns:
            1-D numpy array of video features, L2-normalized.

        Raises:
            ValueError: If the video contains no frames.
        """
        vr = VideoReader(video_path, ctx=cpu(0))
        if len(vr) == 0:
            # Without this guard, linspace over [0, -1] yields bogus indices
            # and fails deep inside decord with a cryptic error.
            raise ValueError(f"Video has no frames: {video_path}")
        # Evenly spaced indices spanning the whole clip (first to last frame).
        frame_indices = np.linspace(0, len(vr) - 1, num=num_frames, dtype=int)
        video_frames = vr.get_batch(frame_indices).asnumpy()
        inputs = self.processor(
            videos=list(video_frames), return_tensors="pt", padding=True
        ).to(self.device)
        with torch.no_grad():
            video_features = (
                self.model.get_video_features(**inputs).squeeze(0).cpu().numpy()
            )
        # Normalize so downstream cosine similarity reduces to a dot product.
        return video_features / np.linalg.norm(video_features)

    def embed_text(self, text):
        """Return a unit-norm embedding for a single text string.

        Args:
            text: The query/caption string to embed.

        Returns:
            1-D numpy array of text features, L2-normalized.
        """
        inputs = self.processor(
            text=[text], return_tensors="pt", padding=True
        ).to(self.device)
        with torch.no_grad():
            text_features = (
                self.model.get_text_features(**inputs).squeeze(0).cpu().numpy()
            )
        return text_features / np.linalg.norm(text_features)
|