"""Print a generated caption for every frame of a YouTube video.

The script resolves an MP4 stream for ``video_url`` with pafy, decodes it
frame by frame with OpenCV, and feeds each frame through a Hugging Face
captioning model, printing the decoded caption for each frame.
"""

import cv2
import pafy
import torch
import torchvision.transforms as transforms
from PIL import Image
from transformers import BigBirdForImageCaptioning, BigBirdTokenizer

# NOTE(review): confirm that the installed `transformers` release actually
# exposes `BigBirdForImageCaptioning` and that the checkpoint id used below
# resolves on the Hub; neither can be verified from this file alone.

# URL of the YouTube video
video_url = 'https://www.youtube.com/watch?v=VIDEO_ID'

# Get the best available stream for the video
video = pafy.new(video_url)
best = video.getbest(preftype='mp4')
if best is None:
    # getbest() yields None when no stream matches the preferred type;
    # fail here with a clear message instead of an AttributeError on `.url`.
    raise RuntimeError(f"No mp4 stream available for {video_url}")

# Set up the Hugging Face model and tokenizer
model = BigBirdForImageCaptioning.from_pretrained('bigbird-roberta-image-captioning')
tokenizer = BigBirdTokenizer.from_pretrained('bigbird-roberta-image-captioning')

# Set up the image transformation pipeline: resize to 224x224, convert to a
# tensor, then normalize each channel.
# NOTE(review): the mean/std values are presumably the ImageNet statistics;
# confirm they match what the checkpoint was trained with.
image_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])

# Open the video stream
cap = cv2.VideoCapture(best.url)

try:
    # Loop through the frames of the video
    while cap.isOpened():
        # Read the next frame
        ret, frame = cap.read()

        # If we've reached the end of the video, break out of the loop
        if not ret:
            break

        # OpenCV delivers BGR arrays; convert to RGB before building the
        # PIL image, then apply the image transformation pipeline
        image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        image = image_transforms(image)

        # Add batch dimension to the image tensor
        image = image.unsqueeze(0)

        # Generate a caption for the image using the model and tokenizer
        caption_ids = model.generate(image)
        caption = tokenizer.decode(caption_ids[0], skip_special_tokens=True)

        # Print the caption for the frame
        print(caption)
finally:
    # Release the capture even if decoding or captioning raised
    cap.release()