video-qa / app.py
Thao Pham
allow for creating index
527422b
raw
history blame
8.02 kB
import gradio as gr
import time
import re
import video_utils
import utils
import embed
import rag
import os
import uuid
import numpy as np
import pinecone
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer
from transformers import AutoImageProcessor, AutoModel
from transformers import BlipProcessor, BlipForConditionalGeneration
from dotenv import load_dotenv
load_dotenv()  # Load from .env

# Folder under which every processed video gets its own sub-directory.
UPLOAD_FOLDER = 'uploads'
# Title of the video currently being discussed; set by chat() once a URL has
# been processed, and None until then.
video_name = None

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

# init models
TEXT_MODEL = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
VISION_MODEL_PROCESSOR = AutoImageProcessor.from_pretrained('facebook/dinov2-small')
VISION_MODEL = AutoModel.from_pretrained('facebook/dinov2-small')
VLM_PROCESSOR = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
VLM = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# init index
pc = Pinecone(
    api_key=PINECONE_API_KEY
)

# Connect to an index, creating it first when it does not exist yet.
index_name = "multimodal-minilm"
if index_name not in pc.list_indexes().names():
    # Index creation goes through the client instance and needs a deployment
    # spec; the module-level pinecone.create_index() helper is not available
    # in the client version that exports Pinecone/ServerlessSpec.
    # NOTE(review): cloud/region defaults are an assumption - override them
    # with PINECONE_CLOUD / PINECONE_REGION if the project lives elsewhere.
    pc.create_index(
        name=index_name,
        dimension=384,  # same dimension check_exist_before_upsert queries with
        metric="cosine",
        spec=ServerlessSpec(
            cloud=os.getenv("PINECONE_CLOUD", "aws"),
            region=os.getenv("PINECONE_REGION", "us-east-1"),
        ),
    )
INDEX = pc.Index(index_name)

# Order matters to consumers of MODEL_STACK (embed.indexing, rag.answer_question).
MODEL_STACK = [TEXT_MODEL, VISION_MODEL, VISION_MODEL_PROCESSOR, VLM, VLM_PROCESSOR]
def is_valid_youtube_url(url):
    """
    Tell whether ``url`` starts like a YouTube video link.

    The scheme and ``www.`` prefix are optional; the host must be
    ``youtube.com`` or ``youtu.be``, optionally followed by one of
    ``watch?v=``, ``embed/``, ``v/`` or ``shorts/``, and then an
    11-character video id. Only the beginning of the string is checked,
    so trailing text after the id is accepted.
    Returns True on a match, False otherwise.
    """
    pattern = (
        r"^(https?://)?(www\.)?(youtube\.com|youtu\.be)/"
        r"(watch\?v=|embed/|v/|shorts/)?([a-zA-Z0-9_-]{11})"
    )
    return re.match(pattern, url) is not None
def check_exist_before_upsert(index, video_path):
    """Report whether a video's vectors already appear to be in the index.

    The expected number of vectors is three per extracted frame (image
    embedding, caption embedding, transcript embedding). The frames are the
    ``.jpg`` files stored in the same directory as the video file.

    Args:
        index: Object exposing ``query(vector=..., top_k=..., filter=...)``
            whose result supports ``result["matches"]``.
        video_path: Path to the video file; also the value matched against
            the ``video_path`` metadata field of the stored vectors.

    Returns:
        True when the index returns at least the expected number of vectors
        for this video, False otherwise.
    """
    # Count frames in the video's own folder. Splitting the path on '/' and
    # taking the first component would point at the top-level upload folder,
    # which holds no frames and would make the threshold 0 (always "exists").
    frames_dir = os.path.dirname(video_path) or '.'
    frame_count = sum(1 for name in os.listdir(frames_dir) if name.endswith('.jpg'))
    threshold = frame_count * 3  # image embeds, caption embeds, transcript embeds

    dimension = 384
    res = index.query(
        vector=[0] * dimension,  # Dummy vector (not used for filtering)
        top_k=10000,  # Set a high value to retrieve as many matches as possible
        filter={"video_path": video_path},  # Filter by video_path
    )

    # Count the number of matching vectors
    return len(res["matches"]) >= threshold
def chat(message, history):
    """Gradio chat handler: ingest a YouTube video or answer a question about it.

    This is a generator. It yields the updated ``history`` after every
    progress message so the UI can refresh while long steps run.

    * If the text starts with ``https://`` it is treated as a video URL:
      the URL is validated, then the video is downloaded, transcribed,
      split into frames, captioned, indexed and summarised. Each on-disk
      artefact (video, audio, transcript, metadata, captions) is reused when
      its file already exists; the summary is rewritten on every run. On
      success the module-level ``video_name`` is set to the video title.
    * Otherwise the text is treated as a question about the most recently
      processed video and answered through ``rag.answer_question``.

    Args:
        message: Dict with a ``'text'`` entry and a ``'files'`` list holding
            at most one image path.
        history: List of ``(user_message, bot_message)`` pairs; ``None`` is
            treated as an empty list.

    Yields:
        The ``history`` list after each appended entry.
    """
    import json  # local import: only used to reload cached frame metadata

    global video_name

    image_input_path = None
    if len(message['files']) > 0:
        assert len(message['files']) == 1
        image_input_path = message['files'][0]
    message = message['text']
    if history is None:
        history = []

    if message.startswith("https://"):
        # Check valid URL
        history.append((message, f"Checking if your provided URL at {message} is valid..."))
        yield history
        if not is_valid_youtube_url(message):
            history.append((None, "❌ Invalid YouTube URL. Please try again."))
            yield history
            return

        # Check metadata
        history.append((None, "✅ URL is valid! Fetching video metadata..."))
        yield history
        video_metadata = video_utils.get_video_metdata(message)
        history.append((None, f"The video you want to process is: \nTitle: {video_metadata['title']} published by {video_metadata['author']} on {video_metadata['publish_date']}."))
        yield history

        history.append((None, "⏳ Downloading video..."))
        yield history
        # NOTE(review): the raw title is used as a directory name; titles
        # containing path separators would change the folder layout.
        output_folder_path = os.path.join(UPLOAD_FOLDER, video_metadata['title'])
        path_to_video = os.path.join(output_folder_path, "video.mp4")
        if not os.path.exists(path_to_video):
            path_to_video = utils.download_video(message, path=output_folder_path)

        history.append((None, "⏳ Transcribing video..."))
        yield history
        path_to_audio_file = os.path.join(output_folder_path, "audio.mp3")
        if not os.path.exists(path_to_audio_file):
            path_to_audio_file = video_utils.extract_audio(path_to_video, output_folder_path)
        path_to_generated_transcript = os.path.join(output_folder_path, "transcript.vtt")
        if not os.path.exists(path_to_generated_transcript):
            path_to_generated_transcript = video_utils.transcribe_video(path_to_audio_file, output_folder_path)

        # extract frames and metadata
        metadatas_path = os.path.join(output_folder_path, 'metadatas.json')
        metadatas = None
        if not os.path.exists(metadatas_path):
            metadatas = video_utils.extract_and_save_frames_and_metadata(
                path_to_video=path_to_video,
                path_to_transcript=path_to_generated_transcript,
                path_to_save_extracted_frames=output_folder_path,
                path_to_save_metadatas=output_folder_path,
            )

        history.append((None, "⏳ Captioning video..."))
        yield history
        caption_path = os.path.join(output_folder_path, 'captions.json')
        if not os.path.exists(caption_path):
            if metadatas is None:
                # metadatas.json was cached by an earlier run while the
                # captions were not; reload it instead of hitting an
                # undefined name.
                # Assumes the file holds the same structure that
                # extract_and_save_frames_and_metadata returns - TODO confirm.
                with open(metadatas_path, encoding="utf-8") as f:
                    metadatas = json.load(f)
            # NOTE(review): os.listdir order is arbitrary - confirm that
            # get_video_caption does not rely on frame order.
            video_frames = [
                os.path.join(output_folder_path, elem)
                for elem in os.listdir(output_folder_path)
                if elem.endswith('.jpg')
            ]
            video_utils.get_video_caption(video_frames, metadatas, output_folder_path, vlm=VLM, vlm_processor=VLM_PROCESSOR)

        history.append((None, "⏳ Indexing..."))
        yield history
        index_exist = check_exist_before_upsert(INDEX, path_to_video)
        print(index_exist)
        if not index_exist:
            embed.indexing(INDEX, MODEL_STACK, metadatas_path)

        # summarizing video
        video_summary = rag.summarize_video(metadatas_path)
        with open(os.path.join(output_folder_path, "summary.txt"), "w", encoding="utf-8") as f:
            f.write(video_summary)

        history.append((None, f"Video processing complete! You can now ask me questions about the video {video_metadata['title']}!"))
        yield history
        # Only remember the video once every processing step has succeeded.
        video_name = video_metadata['title']
    else:
        history.append((message, None))
        yield history
        if video_name is None:
            history.append((None, "You need to insert video URL before asking questions."))
            yield history
            return

        output_folder_path = f"{UPLOAD_FOLDER}/{video_name}"
        metadatas_path = os.path.join(output_folder_path, 'metadatas.json')
        # Collapse the stored summary onto one line, keeping a space between
        # lines so words at line breaks are not fused together.
        with open(os.path.join(output_folder_path, 'summary.txt'), encoding="utf-8") as f:
            stripped_lines = (ln.strip() for ln in f)
            video_summary = " ".join(ln for ln in stripped_lines if ln)

        video_path = os.path.join(output_folder_path, 'video.mp4')
        answer = rag.answer_question(INDEX, MODEL_STACK, metadatas_path, video_summary, video_path, message, image_input_path)
        history.append((None, answer))
        yield history
def clear_chat(history=None):
    """Return a fresh chat history containing only the greeting message.

    Args:
        history: Ignored. It defaults to ``None`` so the function can be used
            as an event handler that is wired without any input components,
            which is how the Clear button registers it.

    Returns:
        A new list with a single ``(None, greeting)`` entry; the list passed
        in, if any, is left untouched.
    """
    return [(None, "Please input a Youtube URL to get started!")]
def main():
    """Build the Gradio chat interface and launch it.

    The layout is a chatbot pre-filled with a greeting, a multimodal textbox
    accepting uploaded images, and a row with a Send button (runs ``chat``)
    and a Clear button (runs ``clear_chat``). The module-level ``video_name``
    is reset to None before the app starts.
    """
    global video_name

    greeting = [(None, "Please input a Youtube URL to get started!")]
    with gr.Blocks() as demo:
        chat_window = gr.Chatbot(value=greeting)
        user_input = gr.MultimodalTextbox(file_types=['image'], sources=['upload'])
        with gr.Row():
            with gr.Column():
                send_button = gr.Button("Send")
                send_button.click(chat, [user_input, chat_window], chat_window)
            with gr.Column():
                clear_button = gr.Button("Clear")  # Clear button
                # Clear chat history when clear button is clicked
                clear_button.click(clear_chat, [], chat_window)

    video_name = None
    demo.launch()
# Start the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()