import cv2
from PIL import Image
import gradio as gra
from transformers import BlipProcessor, BlipForConditionalGeneration
import youtube_dl

model_id = "Salesforce/blip-image-captioning-base"

# Set up the Hugging Face BLIP captioning model and its processor
model = BlipForConditionalGeneration.from_pretrained(model_id)
processor = BlipProcessor.from_pretrained(model_id)


def caption_youtube(url):
    # Download the requested video with youtube_dl to a local file.
    # Example input URL: https://www.youtube.com/watch?v=orbkg5JH9C8
    # (youtube_dl skips the download if video.mp4 already exists.)
    ydl_opts = {
        'format': 'best',
        'outtmpl': 'video.mp4'
    }
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])

    # Open the downloaded video file with OpenCV
    cap = cv2.VideoCapture('video.mp4')

    captions = []

    # Loop through the frames of the video
    while cap.isOpened():
        # Read the next frame
        ret, frame = cap.read()

        # If we've reached the end of the video, break out of the loop
        if not ret:
            break

        # OpenCV returns BGR arrays; convert the frame to an RGB PIL image
        image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

        # Generate a caption for the frame with BLIP
        inputs = processor(image, return_tensors="pt")
        out = model.generate(**inputs)
        caption = processor.decode(out[0], skip_special_tokens=True)

        # Print the caption for the frame and keep it for the Gradio output
        print(caption)
        captions.append(caption)

    # Release the video file
    cap.release()

    # Return one caption per line so Gradio can display the result
    return "\n".join(captions)


# Greeting function from the introductory Gradio example (not used by the captioning app)
def user_greeting(name):
    return "Hi! " + name + " Welcome to your first Gradio application!😎"


# Build the Gradio app: the user types a YouTube URL and gets back the frame captions
app = gra.Interface(fn=caption_youtube, inputs='text', outputs="text")
app.launch()
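
# ---------------------------------------------------------------------------
# Optional smoke test (a sketch, not part of the app): because app.launch()
# blocks, you can temporarily comment it out above and call caption_youtube()
# directly to verify the download-and-caption pipeline before using the UI.
# The URL below is the same example video referenced in the comments; any
# public YouTube URL should work.
#
#   captions = caption_youtube('https://www.youtube.com/watch?v=orbkg5JH9C8')
#   print(captions)
# ---------------------------------------------------------------------------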