import torch
import torchvision.transforms as transforms
import cv2
from PIL import Image
import pafy
import gradio as gra
from transformers import BlipProcessor, BlipForConditionalGeneration
import youtube_dl
# Set up the Hugging Face BLIP captioning model and its processor
model_id = "Salesforce/blip-image-captioning-base"
model = BlipForConditionalGeneration.from_pretrained(model_id)
processor = BlipProcessor.from_pretrained(model_id)
def caption_youtube(url):
    # Open the downloaded video file (note: the `url` argument supplied by the
    # Gradio text box is not used yet; the clip is downloaded once at module level below)
    cap = cv2.VideoCapture('video.mp4')
    captions = []
    # Loop through the frames of the video
    while cap.isOpened():
        # Read the next frame
        ret, frame = cap.read()
        # If we've reached the end of the video, break out of the loop
        if not ret:
            break
        # Convert the frame from OpenCV's BGR layout to an RGB PIL image
        image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        # image = image_transforms(image)
        # # Add batch dimension to the image tensor
        # image = image.unsqueeze(0)
        # Caption the frame with BLIP
        inputs = processor(image, return_tensors="pt")
        out = model.generate(**inputs)
        caption = processor.decode(out[0], skip_special_tokens=True)
        # Print the caption for the frame and keep it for the final output
        print(caption)
        captions.append(caption)
    # Release the video file
    cap.release()
    # Return one caption per line so the Gradio text output shows them all
    return "\n".join(captions)
# Simple greeting function from the Gradio quickstart (not wired into the interface below)
def user_greeting(name):
    return "Hi! " + name + " Welcome to your first Gradio application!😎"
# URL of the YouTube video
video_url = 'https://www.youtube.com/watch?v=orbkg5JH9C8'
# Download the video using youtube_dl and extract the frames using OpenCV
ydl_opts = {
    'format': 'best',
    'outtmpl': 'video.mp4'
}
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
    ydl.download([video_url])
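# Gradio UI: the text box is meant to take a YouTube URL; caption_youtube() currently
# captions the pre-downloaded video.mp4 and returns the per-frame captions as text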
app = gra.Interface(fn=caption_youtube, inputs='text', outputs="text")
app.launch()