tubegpt / app.py
bidit's picture
Test App
51971b3
raw
history blame
1.62 kB
import torch
import torchvision.transforms as transforms
import cv2
from PIL import Image
from transformers import BigBirdForImageCaptioning, BigBirdTokenizer
import pafy
# --- Video source -----------------------------------------------------------
# The literal 'VIDEO_ID' in this URL looks like a placeholder that has to be
# swapped for a real video id before the script can run -- TODO confirm.
video_url = 'https://www.youtube.com/watch?v=VIDEO_ID'

# Look the video up through pafy and ask it for its best MP4 stream; the
# stream's `.url` is what gets opened for frame reading further down.
video = pafy.new(video_url)
best = video.getbest(preftype='mp4')

# --- Captioning model and tokenizer -----------------------------------------
# Both objects are loaded from the same checkpoint name.
# NOTE(review): verify that `BigBirdForImageCaptioning` and this checkpoint
# actually exist in the installed `transformers` release / on the Hub.
_CHECKPOINT = 'bigbird-roberta-image-captioning'
model = BigBirdForImageCaptioning.from_pretrained(_CHECKPOINT)
tokenizer = BigBirdTokenizer.from_pretrained(_CHECKPOINT)

# --- Frame preprocessing ----------------------------------------------------
# Every frame is resized to a fixed square, converted to a tensor, and then
# normalised per channel. The mean/std values match the widely used ImageNet
# statistics -- presumably what the model expects; confirm against the
# checkpoint's preprocessing config.
_FRAME_SIZE = (224, 224)
_CHANNEL_MEAN = [0.485, 0.456, 0.406]
_CHANNEL_STD = [0.229, 0.224, 0.225]

image_transforms = transforms.Compose([
    transforms.Resize(_FRAME_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(mean=_CHANNEL_MEAN, std=_CHANNEL_STD),
])
# Open the resolved stream URL with OpenCV.
cap = cv2.VideoCapture(best.url)
try:
    # Caption frames one by one for as long as the capture stays open.
    # If the stream could not be opened at all, the loop body never runs.
    while cap.isOpened():
        # Read the next frame; `ret` is falsy once no frame could be read
        # (end of the video or a read failure), which ends the loop.
        ret, frame = cap.read()
        if not ret:
            break

        # The frame is converted from BGR to RGB before being wrapped as a
        # PIL image, then run through the preprocessing pipeline.
        image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        image = image_transforms(image)

        # Add a leading batch dimension so the model receives a batch of one.
        image = image.unsqueeze(0)

        # Generate caption token ids and decode the first (only) sequence,
        # dropping special tokens from the text.
        caption_ids = model.generate(image)
        caption = tokenizer.decode(caption_ids[0], skip_special_tokens=True)

        # Print the caption for this frame.
        print(caption)
finally:
    # Always release the capture handle, even if preprocessing, generation,
    # or decoding raised part-way through the video.
    cap.release()