# tubegpt / app.py
import torch
import cv2
from PIL import Image
import gradio as gra
from transformers import BlipProcessor, BlipForConditionalGeneration
import youtube_dl
# Set up the Hugging Face BLIP captioning model
model_id = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(model_id)
model = BlipForConditionalGeneration.from_pretrained(model_id)
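
# Optional sketch, not in the original app: move the model to a GPU when one
# is available. The captioning loop below sends its inputs to model.device,
# so it works on either CPU or GPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)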
def caption_youtube(url):
    # Download the requested video with youtube_dl. (youtube_dl is largely
    # unmaintained; yt-dlp exposes the same YoutubeDL interface if needed.)
    ydl_opts = {
        'format': 'best',
        'outtmpl': 'video.mp4'
    }
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])

    # Open the downloaded video file
    cap = cv2.VideoCapture('video.mp4')

    captions = []
    frame_idx = 0
    # Captioning every frame is prohibitively slow, so sample one frame out of
    # every sample_every (an assumption; the original looped over all frames)
    sample_every = 100

    # Loop through the frames of the video
    while cap.isOpened():
        # Read the next frame
        ret, frame = cap.read()
        # If we've reached the end of the video, break out of the loop
        if not ret:
            break

        if frame_idx % sample_every == 0:
            # Convert the BGR OpenCV frame to an RGB PIL image
            image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            # Generate and decode a caption for the frame
            inputs = processor(image, return_tensors="pt").to(model.device)
            out = model.generate(**inputs)
            captions.append(processor.decode(out[0], skip_special_tokens=True))
        frame_idx += 1

    # Release the video file and return one caption per line
    cap.release()
    return "\n".join(captions)
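
# Minimal single-image version of the same BLIP call, shown for reference.
# The path argument is hypothetical; this helper is not used by the Gradio app.
def caption_image(path):
    image = Image.open(path).convert("RGB")
    inputs = processor(image, return_tensors="pt").to(model.device)
    out = model.generate(**inputs)
    return processor.decode(out[0], skip_special_tokens=True)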
# Leftover handler from the Gradio quickstart; not wired into the interface below
def user_greeting(name):
    return "Hi " + name + "! Welcome to your first Gradio application!😎"
# Example input for the interface below:
# https://www.youtube.com/watch?v=orbkg5JH9C8
# Wire the captioning function into a simple text-in / text-out Gradio UI
app = gra.Interface(fn=caption_youtube, inputs='text', outputs='text')
app.launch()
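# app.launch(share=True) would additionally expose a temporary public URL;
# the default call above matches the behavior of this script.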