import cv2
import pafy
import torch
import torchvision.transforms as transforms
from PIL import Image
from transformers import BigBirdForImageCaptioning, BigBirdTokenizer

# Source video. VIDEO_ID is a placeholder; substitute a real YouTube ID.
video_url = "https://www.youtube.com/watch?v=VIDEO_ID"

# Look the video up through pafy, then select a stream with mp4 as the
# preferred container type; the stream's URL is used further down.
video = pafy.new(video_url)
best = video.getbest(preftype="mp4")

# The captioning model and its tokenizer are loaded from the same checkpoint
# name, so the name is spelled out only once.
# NOTE(review): `BigBirdForImageCaptioning` and this checkpoint name do not
# appear to be part of the public `transformers` API / model hub -- confirm
# they exist in the installed version before relying on this script.
_CHECKPOINT = 'bigbird-roberta-image-captioning'

model = BigBirdForImageCaptioning.from_pretrained(_CHECKPOINT)
tokenizer = BigBirdTokenizer.from_pretrained(_CHECKPOINT)

# Per-channel statistics passed to Normalize (these match the commonly used
# ImageNet mean/std values).
_NORM_MEAN = [0.485, 0.456, 0.406]
_NORM_STD = [0.229, 0.224, 0.225]

# Frame preprocessing: resize to 224x224, convert the PIL image to a tensor,
# then normalize each channel.
image_transforms = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=_NORM_MEAN, std=_NORM_STD),
    ]
)

# Open the selected stream and print one generated caption per decoded frame.
cap = cv2.VideoCapture(best.url)

try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            # read() returned no frame (end of stream or a read failure).
            break

        # OpenCV delivers frames in BGR order; convert to RGB before
        # wrapping the array as a PIL image for the transform pipeline.
        image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

        # Preprocess, then add a leading dimension of size 1
        # (presumably the batch axis `generate` expects -- confirm).
        image = image_transforms(image).unsqueeze(0)

        caption_ids = model.generate(image)
        caption = tokenizer.decode(caption_ids[0], skip_special_tokens=True)

        print(caption)
finally:
    # Always free the capture handle, even when preprocessing, generation
    # or decoding raises part-way through the stream.
    cap.release()