bidit committed
Commit 51971b3
1 Parent(s): a931c26
Files changed (1)
  1. app.py +53 -0
app.py ADDED
@@ -0,0 +1,53 @@
+ import torch
+ import torchvision.transforms as transforms
+ import cv2
+ from PIL import Image
+ from transformers import VisionEncoderDecoderModel, AutoTokenizer
+ import pafy
+
+ # URL of the YouTube video
+ video_url = 'https://www.youtube.com/watch?v=VIDEO_ID'
+
+ # Get the best available stream for the video
+ video = pafy.new(video_url)
+ best = video.getbest(preftype='mp4')
+
+ # Set up the Hugging Face image-captioning model (ViT encoder + GPT-2 decoder) and its tokenizer
+ model = VisionEncoderDecoderModel.from_pretrained('nlpconnect/vit-gpt2-image-captioning')
+ tokenizer = AutoTokenizer.from_pretrained('nlpconnect/vit-gpt2-image-captioning')
+
+ # Set up the image transformation pipeline (224x224 input, normalized to [-1, 1] as the ViT encoder expects)
+ image_transforms = transforms.Compose([
+     transforms.Resize((224, 224)),
+     transforms.ToTensor(),
+     transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
+ ])
+
+ # Open the video stream with OpenCV
+ cap = cv2.VideoCapture(best.url)
+
+ # Loop through the frames of the video
+ while cap.isOpened():
+     # Read the next frame
+     ret, frame = cap.read()
+
+     # If we've reached the end of the video, break out of the loop
+     if not ret:
+         break
+
+     # Convert the frame to a PIL image and apply the image transformation pipeline
+     image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+     image = image_transforms(image)
+
+     # Add a batch dimension to the image tensor
+     image = image.unsqueeze(0)
+
+     # Generate a caption for the frame and decode it to text
+     caption_ids = model.generate(image)
+     caption = tokenizer.decode(caption_ids[0], skip_special_tokens=True)
+
+     # Print the caption for the frame
+     print(caption)
+
+ # Release the video capture
+ cap.release()
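
Captioning every single frame is slow and yields long runs of near-identical captions. The sketch below shows one way to sample roughly one frame per second instead; it is an illustrative assumption, not part of app.py, and it reuses the cap, model, tokenizer, image_transforms, Image and cv2 names set up above.

# Sketch: caption roughly one frame per second instead of every frame.
# Reuses cap, model, tokenizer, image_transforms, Image and cv2 from app.py above.
fps = cap.get(cv2.CAP_PROP_FPS) or 30   # fall back to 30 fps if the stream does not report it
frame_interval = int(fps)               # sample about one frame per second
frame_index = 0

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    if frame_index % frame_interval == 0:
        image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        pixel_values = image_transforms(image).unsqueeze(0)
        caption_ids = model.generate(pixel_values)
        print(tokenizer.decode(caption_ids[0], skip_special_tokens=True))
    frame_index += 1

Sampling by frame index keeps the change local to the read loop; seeking with cap.set(cv2.CAP_PROP_POS_MSEC, ...) would avoid decoding skipped frames, but it tends to behave less predictably on network streams.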