import cv2
from PIL import Image
import gradio as gra
from transformers import BlipProcessor, BlipForConditionalGeneration
import youtube_dl

model_id = "Salesforce/blip-image-captioning-base"

# Set up the Hugging Face BLIP captioning model and its processor
model = BlipForConditionalGeneration.from_pretrained(model_id)
processor = BlipProcessor.from_pretrained(model_id)


def caption_youtube(url):
    # Download the requested video with youtube_dl to a local file.
    # Example input URL: https://www.youtube.com/watch?v=orbkg5JH9C8
    # (youtube_dl skips the download if video.mp4 already exists.)
    ydl_opts = {
        'format': 'best',
        'outtmpl': 'video.mp4'
    }
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])

    # Open the downloaded video file with OpenCV
    cap = cv2.VideoCapture('video.mp4')

    captions = []

    # Loop through the frames of the video
    while cap.isOpened():
        # Read the next frame
        ret, frame = cap.read()

        # If we've reached the end of the video, break out of the loop
        if not ret:
            break

        # OpenCV returns BGR arrays; convert the frame to an RGB PIL image
        image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

        # Generate a caption for the frame with BLIP
        inputs = processor(image, return_tensors="pt")
        out = model.generate(**inputs)
        caption = processor.decode(out[0], skip_special_tokens=True)

        # Print the caption for the frame and keep it for the Gradio output
        print(caption)
        captions.append(caption)

    # Release the video file
    cap.release()

    # Return one caption per line so Gradio can display the result
    return "\n".join(captions)


# Greeting function from the introductory Gradio example (not used by the captioning app)
def user_greeting(name):
    return "Hi! " + name + " Welcome to your first Gradio application!😎"


# Build the Gradio app: the user types a YouTube URL and gets back the frame captions
app = gra.Interface(fn=caption_youtube, inputs='text', outputs="text")
app.launch()
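
# ---------------------------------------------------------------------------
# Optional smoke test (a sketch, not part of the app): because app.launch()
# blocks, you can temporarily comment it out above and call caption_youtube()
# directly to verify the download-and-caption pipeline before using the UI.
# The URL below is the same example video referenced in the comments; any
# public YouTube URL should work.
#
#   captions = caption_youtube('https://www.youtube.com/watch?v=orbkg5JH9C8')
#   print(captions)
# ---------------------------------------------------------------------------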