File size: 2,230 Bytes
51971b3
 
 
 
 
e504141
ffe333d
2becb9d
ffe333d
 
 
 
 
 
 
 
 
f08e044
ffe333d
 
e2f7104
ffe333d
 
 
 
 
 
 
 
 
 
 
 
e007b42
ffe333d
e007b42
 
ffe333d
 
 
 
 
 
 
 
 
e504141
 
 
ffe333d
2becb9d
b6a0d48
2becb9d
 
 
 
 
 
 
 
 
 
5132433
51971b3
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import torch
import torchvision.transforms as transforms
import cv2
from PIL import Image
import pafy
import gradio as gra
from transformers import BlipProcessor, BlipForConditionalGeneration
import youtube_dl

# Hugging Face model ID: BLIP image-captioning model, base variant.
model_id = "Salesforce/blip-image-captioning-base"



# Load the captioning model and its matching preprocessor once at import
# time; caption_youtube() reuses these module-level objects for every frame.
model = BlipForConditionalGeneration.from_pretrained(model_id)
processor = BlipProcessor.from_pretrained(model_id)

def caption_youtube(url):
    """Caption every frame of the pre-downloaded video with BLIP.

    NOTE(review): the ``url`` argument is currently ignored — the function
    always reads ``'video.mp4'``, which the module-level youtube_dl call
    downloads at startup. Kept as-is to preserve existing behavior.

    Parameters
    ----------
    url : str
        Text from the Gradio input box (unused for now).

    Returns
    -------
    str
        All per-frame captions joined with newlines. (The original
        returned None, which left the Gradio text output permanently
        empty — returning the captions fixes that.)
    """
    # Open the video file downloaded at module import time.
    cap = cv2.VideoCapture('video.mp4')
    captions = []
    try:
        # Loop through the frames of the video.
        while cap.isOpened():
            ret, frame = cap.read()
            # If we've reached the end of the video, stop.
            if not ret:
                break

            # OpenCV decodes frames as BGR; BLIP's processor expects an
            # RGB PIL image, so convert before preprocessing.
            image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            inputs = processor(image, return_tensors="pt")
            out = model.generate(**inputs)
            caption = processor.decode(out[0], skip_special_tokens=True)

            # Keep the per-frame console trace the original produced.
            print(caption)
            captions.append(caption)
    finally:
        # Release the capture handle even if captioning raises mid-video
        # (the original leaked it on any exception).
        cap.release()

    return "\n".join(captions)

def user_greeting(name):
    """Return a short welcome message addressed to *name*."""
    return f"Hi! {name} Welcome to your first Gradio application!😎"
    
# YouTube video to caption at startup.
video_url = 'https://www.youtube.com/watch?v=orbkg5JH9C8'

# Download the clip to 'video.mp4' so caption_youtube() can read it
# from disk (it does not use its url argument).
ydl_opts = {
    'format': 'best',
    'outtmpl': 'video.mp4',
}
with youtube_dl.YoutubeDL(ydl_opts) as downloader:
    downloader.download([video_url])

# Wire the captioning function into a text-in/text-out Gradio UI and serve it.
app = gra.Interface(fn=caption_youtube, inputs='text', outputs="text")
app.launch()