import torch
import torchvision.transforms as transforms
import cv2
from PIL import Image
import pafy
import gradio as gra
from transformers import BlipProcessor, BlipForConditionalGeneration
import youtube_dl
# Set up the Hugging Face BLIP captioning model and its processor
model_id = "Salesforce/blip-image-captioning-base"
model = BlipForConditionalGeneration.from_pretrained(model_id)
processor = BlipProcessor.from_pretrained(model_id)
def caption_youtube(url):
    # Open the downloaded video file (note: the `url` argument supplied by the
    # Gradio text box is not used yet; the clip is downloaded once at module level below)
    cap = cv2.VideoCapture('video.mp4')
    captions = []
    # Loop through the frames of the video
    while cap.isOpened():
        # Read the next frame
        ret, frame = cap.read()
        # If we've reached the end of the video, break out of the loop
        if not ret:
            break
        # Convert the frame from OpenCV's BGR layout to an RGB PIL image
        image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        # image = image_transforms(image)
        # # Add batch dimension to the image tensor
        # image = image.unsqueeze(0)
        # Caption the frame with BLIP
        inputs = processor(image, return_tensors="pt")
        out = model.generate(**inputs)
        caption = processor.decode(out[0], skip_special_tokens=True)
        # Print the caption for the frame and keep it for the final output
        print(caption)
        captions.append(caption)
    # Release the video file
    cap.release()
    # Return one caption per line so the Gradio text output shows them all
    return "\n".join(captions)
# Simple greeting function from the Gradio quickstart (not wired into the interface below)
def user_greeting(name):
    return "Hi! " + name + " Welcome to your first Gradio application!😎"
# URL of the YouTube video
video_url = 'https://www.youtube.com/watch?v=orbkg5JH9C8'
# Download the video using youtube_dl and extract the frames using OpenCV
ydl_opts = {
    'format': 'best',
    'outtmpl': 'video.mp4'
}
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
    ydl.download([video_url])
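# Gradio UI: the text box is meant to take a YouTube URL; caption_youtube() currently
# captions the pre-downloaded video.mp4 and returns the per-frame captions as text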
app = gra.Interface(fn=caption_youtube, inputs='text', outputs="text")
app.launch()