bidit committed
Commit 51971b3
1 Parent(s): a931c26
Files changed (1)
  1. app.py +53 -0
app.py ADDED
@@ -0,0 +1,53 @@
+ import torch
+ import torchvision.transforms as transforms
+ import cv2
+ from PIL import Image
+ from transformers import VisionEncoderDecoderModel, AutoTokenizer
+ import pafy
+
+ # URL of the YouTube video
+ video_url = 'https://www.youtube.com/watch?v=VIDEO_ID'
+
+ # Get the best available stream for the video
+ video = pafy.new(video_url)
+ best = video.getbest(preftype='mp4')
+
+ # Set up the Hugging Face image-captioning model (ViT encoder + GPT-2 decoder) and its tokenizer
+ model = VisionEncoderDecoderModel.from_pretrained('nlpconnect/vit-gpt2-image-captioning')
+ tokenizer = AutoTokenizer.from_pretrained('nlpconnect/vit-gpt2-image-captioning')
+
+ # Set up the image transformation pipeline (224x224 input, normalized to [-1, 1] as the ViT encoder expects)
+ image_transforms = transforms.Compose([
+     transforms.Resize((224, 224)),
+     transforms.ToTensor(),
+     transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
+ ])
+
+ # Open the video stream with OpenCV
+ cap = cv2.VideoCapture(best.url)
+
+ # Loop through the frames of the video
+ while cap.isOpened():
+     # Read the next frame
+     ret, frame = cap.read()
+
+     # If we've reached the end of the video, break out of the loop
+     if not ret:
+         break
+
+     # Convert the frame to a PIL image and apply the image transformation pipeline
+     image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+     image = image_transforms(image)
+
+     # Add a batch dimension to the image tensor
+     image = image.unsqueeze(0)
+
+     # Generate a caption for the frame and decode it to text
+     caption_ids = model.generate(image)
+     caption = tokenizer.decode(caption_ids[0], skip_special_tokens=True)
+
+     # Print the caption for the frame
+     print(caption)
+
+ # Release the video capture
+ cap.release()
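
Captioning every single frame is slow and yields long runs of near-identical captions. The sketch below shows one way to sample roughly one frame per second instead; it is an illustrative assumption, not part of app.py, and it reuses the cap, model, tokenizer, image_transforms, Image and cv2 names set up above.

# Sketch: caption roughly one frame per second instead of every frame.
# Reuses cap, model, tokenizer, image_transforms, Image and cv2 from app.py above.
fps = cap.get(cv2.CAP_PROP_FPS) or 30   # fall back to 30 fps if the stream does not report it
frame_interval = int(fps)               # sample about one frame per second
frame_index = 0

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    if frame_index % frame_interval == 0:
        image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        pixel_values = image_transforms(image).unsqueeze(0)
        caption_ids = model.generate(pixel_values)
        print(tokenizer.decode(caption_ids[0], skip_special_tokens=True))
    frame_index += 1

Sampling by frame index keeps the change local to the read loop; seeking with cap.set(cv2.CAP_PROP_POS_MSEC, ...) would avoid decoding skipped frames, but it tends to behave less predictably on network streams.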