Spaces:

tpha4308
/

video-qa

Sleeping

video-qa / app.py

Thao Pham

allow for creating index

527422b 3 months ago

8.02 kB

	import gradio as gr
	import time
	import re
	import video_utils
	import utils
	import embed
	import rag
	import os
	import uuid
	import numpy as np
	import pinecone
	from pinecone import Pinecone, ServerlessSpec
	from sentence_transformers import SentenceTransformer
	from transformers import AutoImageProcessor, AutoModel
	from transformers import BlipProcessor, BlipForConditionalGeneration
	from dotenv import load_dotenv

	load_dotenv()  # Load from .env

	# Root folder under which each processed video gets its own sub-folder.
	UPLOAD_FOLDER = 'uploads'
	# Title of the most recently processed video; None until one has been ingested.
	# NOTE(review): this is module-level state, so it is not per-user/per-session.
	video_name = None

	OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
	PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

	# init models
	TEXT_MODEL = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
	VISION_MODEL_PROCESSOR = AutoImageProcessor.from_pretrained('facebook/dinov2-small')
	VISION_MODEL = AutoModel.from_pretrained('facebook/dinov2-small')

	VLM_PROCESSOR = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
	VLM = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

	# init index
	pc = Pinecone(api_key=PINECONE_API_KEY)

	# Connect to the index, creating it first if this account does not have it yet.
	index_name = "multimodal-minilm"
	if index_name not in pc.list_indexes().names():
	    # Index creation goes through the client instance and needs an explicit
	    # spec; the module-level ``pinecone.create_index`` helper is not part of
	    # the client-object API used everywhere else in this file.
	    # Cloud/region can be overridden through the environment; the defaults
	    # below are an assumption -- adjust them to match your Pinecone project.
	    pc.create_index(
	        name=index_name,
	        dimension=384,  # must match the size of the vectors that get upserted
	        metric="cosine",
	        spec=ServerlessSpec(
	            cloud=os.getenv("PINECONE_CLOUD", "aws"),
	            region=os.getenv("PINECONE_REGION", "us-east-1"),
	        ),
	    )
	INDEX = pc.Index(index_name)

	# Bundle of models handed to the embed/rag helpers; the order is part of the
	# contract with those modules, so do not reorder.
	MODEL_STACK = [TEXT_MODEL, VISION_MODEL, VISION_MODEL_PROCESSOR, VLM, VLM_PROCESSOR]


	# Compiled once at import time instead of on every call.
	# Group 3 is the host, group 4 the optional path prefix, group 5 the
	# 11-character video id. The alternations use a bare ``|``: an escaped
	# ``\|`` would match a literal pipe character and reject every real URL.
	_YOUTUBE_URL_RE = re.compile(
	    r"^(https?://)?(www\.)?(youtube\.com|youtu\.be)/"
	    r"(watch\?v=|embed/|v/|shorts/)?([a-zA-Z0-9_-]{11})"
	)


	def is_valid_youtube_url(url):
	    """
	    Checks if the given URL is a valid YouTube video URL.

	    Accepted shapes are ``youtube.com`` / ``youtu.be`` hosts (scheme and
	    ``www.`` optional) followed by an optional ``watch?v=``, ``embed/``,
	    ``v/`` or ``shorts/`` prefix and an 11-character video id. Only the
	    start of the string is checked, so trailing query parameters are allowed.

	    Returns True if valid, False otherwise (including for non-string input).
	    """
	    if not isinstance(url, str):
	        return False
	    return _YOUTUBE_URL_RE.match(url) is not None


	def check_exist_before_upsert(index, video_path):
	    """
	    Tell whether the index already holds all vectors expected for a video.

	    The expected count is three vectors per extracted frame (image embed,
	    caption embed, transcript embed), where frames are the ``.jpg`` files
	    that sit in the same folder as the video file.

	    Args:
	        index: object exposing ``query(vector=..., top_k=..., filter=...)``
	            whose result supports ``res["matches"]``.
	        video_path: path of the video file; also the value matched against
	            the ``video_path`` metadata field of the stored vectors.

	    Returns True if at least the expected number of vectors is found,
	    False otherwise.
	    """
	    # Frames live next to the video. Taking the first path component instead
	    # would point at the shared upload root, find no frames there, and make
	    # the check pass unconditionally.
	    frames_dir = os.path.dirname(video_path) or '.'
	    num_frames = sum(1 for name in os.listdir(frames_dir) if name.endswith('.jpg'))
	    threshold = num_frames * 3  # image embeds, caption embeds, transcript embeds

	    dimension = 384
	    top_k = 10000  # Set a high value to retrieve as many matches as possible
	    res = index.query(
	        vector=[0] * dimension,  # Dummy vector (not used for filtering)
	        top_k=top_k,
	        filter={"video_path": video_path},  # Filter by video_path
	    )

	    # Count the number of matching vectors. The query cannot return more than
	    # top_k of them, so cap the expectation to keep very long videos from
	    # being re-indexed on every run.
	    num_existing_vectors = len(res["matches"])
	    return num_existing_vectors >= min(threshold, top_k)


	def _process_video_url(url, history):
	    """Validate, download, transcribe, caption, index and summarise the video at *url*.

	    Generator: appends a progress entry to *history* before each stage and
	    yields the updated list. Every intermediate artifact is cached in the
	    video's folder and reused when it already exists on disk.
	    """
	    global video_name
	    import json  # local import: only needed to reload cached frame metadata

	    # Check valid URL
	    history.append((url, f"Checking if your provided URL at {url} is valid..."))
	    yield history

	    if not is_valid_youtube_url(url):
	        history.append((None, "❌ Invalid YouTube URL. Please try again."))
	        yield history
	        return

	    # Check metadata
	    history.append((None, "✅ URL is valid! Fetching video metadata..."))
	    yield history

	    video_metadata = video_utils.get_video_metdata(url)
	    title = video_metadata['title']
	    history.append((None, f"The video you want to process is: \nTitle: {title} published by {video_metadata['author']} on {video_metadata['publish_date']}."))
	    yield history

	    history.append((None, "⏳ Downloading video..."))
	    yield history

	    # NOTE(review): the title is used verbatim as a directory name; titles
	    # containing path separators would need sanitising -- confirm upstream.
	    output_folder_path = os.path.join(UPLOAD_FOLDER, title)
	    path_to_video = os.path.join(output_folder_path, "video.mp4")
	    if not os.path.exists(path_to_video):
	        path_to_video = utils.download_video(url, path=output_folder_path)

	    history.append((None, "⏳ Transcribing video..."))
	    yield history
	    path_to_audio_file = os.path.join(output_folder_path, "audio.mp3")
	    if not os.path.exists(path_to_audio_file):
	        path_to_audio_file = video_utils.extract_audio(path_to_video, output_folder_path)

	    path_to_generated_transcript = os.path.join(output_folder_path, "transcript.vtt")
	    if not os.path.exists(path_to_generated_transcript):
	        path_to_generated_transcript = video_utils.transcribe_video(path_to_audio_file, output_folder_path)

	    # extract frames and metadata
	    metadatas_path = os.path.join(output_folder_path, 'metadatas.json')
	    metadatas = None
	    if not os.path.exists(metadatas_path):
	        metadatas = video_utils.extract_and_save_frames_and_metadata(
	            path_to_video=path_to_video,
	            path_to_transcript=path_to_generated_transcript,
	            path_to_save_extracted_frames=output_folder_path,
	            path_to_save_metadatas=output_folder_path,
	        )

	    history.append((None, "⏳ Captioning video..."))
	    yield history

	    caption_path = os.path.join(output_folder_path, 'captions.json')
	    if not os.path.exists(caption_path):
	        if metadatas is None:
	            # Frames were extracted on an earlier run but captioning never
	            # finished, so the in-memory metadata does not exist yet.
	            # Assumes metadatas.json stores the same structure that
	            # extract_and_save_frames_and_metadata returns -- TODO confirm.
	            with open(metadatas_path, encoding="utf-8") as f:
	                metadatas = json.load(f)
	        video_frames = [
	            os.path.join(output_folder_path, elem)
	            for elem in os.listdir(output_folder_path)
	            if elem.endswith('.jpg')
	        ]
	        video_utils.get_video_caption(video_frames, metadatas, output_folder_path, vlm=VLM, vlm_processor=VLM_PROCESSOR)

	    history.append((None, "⏳ Indexing..."))
	    yield history
	    index_exist = check_exist_before_upsert(INDEX, path_to_video)
	    print(index_exist)
	    if not index_exist:
	        embed.indexing(INDEX, MODEL_STACK, metadatas_path)

	    # summarizing video
	    video_summary = rag.summarize_video(metadatas_path)
	    with open(os.path.join(output_folder_path, "summary.txt"), "w", encoding="utf-8") as f:
	        f.write(video_summary)

	    # Record the active video before the final yield so the state is set even
	    # if the consumer stops iterating right after the last update.
	    video_name = title

	    history.append((None, f"Video processing complete! You can now ask me questions about the video {title}!"))
	    yield history


	def _answer_question(question, image_input_path, history):
	    """Answer *question* about the most recently processed video.

	    Generator: echoes the question into *history*, then yields again with
	    either the answer or a hint that no video has been processed yet.
	    """
	    history.append((question, None))
	    yield history

	    if video_name is None:
	        history.append((None, "You need to insert video URL before asking questions."))
	        yield history
	        return

	    output_folder_path = f"{UPLOAD_FOLDER}/{video_name}"
	    metadatas_path = os.path.join(output_folder_path, 'metadatas.json')

	    # Collapse the stored summary into a single line: each line is stripped
	    # and the pieces are concatenated without a separator.
	    with open(f'./{output_folder_path}/summary.txt', encoding="utf-8") as f:
	        video_summary = "".join(line.strip() for line in f)
	    video_path = os.path.join(output_folder_path, 'video.mp4')
	    answer = rag.answer_question(INDEX, MODEL_STACK, metadatas_path, video_summary, video_path, question, image_input_path)

	    history.append((None, answer))
	    yield history


	def chat(message, history):
	    """Chat handler: ingest a YouTube URL or answer a question about the last video.

	    Args:
	        message: mapping with a ``'text'`` string and a ``'files'`` list
	            (at most one attached image path is supported).
	        history: list of ``(user_text, bot_text)`` tuples, or None.

	    Yields:
	        The updated history list after every progress step, so the UI can
	        stream status messages while long-running stages execute.

	    Text starting with ``https://`` is treated as a video URL to process;
	    anything else is treated as a question about the last processed video.
	    """
	    if history is None:
	        history = []

	    files = message['files']
	    image_input_path = None
	    if files:
	        # Report unsupported input to the user instead of asserting: asserts
	        # are stripped under ``python -O`` and surface as opaque UI errors.
	        if len(files) > 1:
	            history.append((message['text'], "Please attach at most one image per message."))
	            yield history
	            return
	        image_input_path = files[0]

	    text = message['text']

	    if text.startswith("https://"):
	        yield from _process_video_url(text, history)
	    else:
	        yield from _answer_question(text, image_input_path, history)

	def clear_chat(history=None):
	    """Reset the conversation to the initial greeting and forget the active video.

	    Args:
	        history: ignored; optional so the function can be wired as an event
	            callback either with or without the chatbot as an input.

	    Returns:
	        A fresh history list holding only the greeting message. The list
	        passed in (if any) is left untouched.
	    """
	    # The greeting asks for a new URL, so drop the previously processed video;
	    # otherwise questions after "Clear" would still target the old one.
	    global video_name
	    video_name = None
	    return [(None, "Please input a Youtube URL to get started!")]

	def main():
	    """Build the chat UI (chatbot, multimodal textbox, Send/Clear buttons) and launch it."""
	    # Start with no active video. This runs once while the UI is being built,
	    # not when a button is clicked.
	    global video_name
	    video_name = None

	    initial_messages = [(None, "Please input a Youtube URL to get started!")]

	    with gr.Blocks() as demo:
	        chatbot = gr.Chatbot(value=initial_messages)
	        msg = gr.MultimodalTextbox(file_types=['image'], sources=['upload'])

	        with gr.Row():
	            with gr.Column():
	                submit = gr.Button("Send")
	                submit.click(chat, [msg, chatbot], chatbot)

	            with gr.Column():
	                clear = gr.Button("Clear")  # Clear button
	                # Clear chat history when clear button is clicked. The chatbot
	                # is passed as input because clear_chat takes a history
	                # argument; wiring it with no inputs would call it with none.
	                clear.click(clear_chat, [chatbot], chatbot)

	    demo.launch()


	if __name__ == "__main__":
	    main()