Thao Pham committed on
Commit d50ce1c · 1 Parent(s): 1d5379f

First commit

Files changed (7)
  1. .gitignore +5 -0
  2. app.py +213 -0
  3. embed.py +62 -0
  4. rag.py +270 -0
  5. requirements.txt +17 -0
  6. utils.py +91 -0
  7. video_utils.py +156 -0
.gitignore ADDED
@@ -0,0 +1,5 @@
+ .vscode
+ .env
+ __pycache__
+ tmp
+ uploads
app.py ADDED
@@ -0,0 +1,213 @@
+ import gradio as gr
+ import time
+ import re
+ import video_utils
+ import utils
+ import embed
+ import rag
+ import os
+ import uuid
+ import numpy as np
+ from pinecone import Pinecone, ServerlessSpec
+ from sentence_transformers import SentenceTransformer
+ from transformers import AutoImageProcessor, AutoModel
+ from transformers import BlipProcessor, BlipForConditionalGeneration
+ from dotenv import load_dotenv
+
+ load_dotenv()  # Load from .env
+
+ UPLOAD_FOLDER = 'uploads'
+ video_name = None
+
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+ PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
+
+ # init models
+ TEXT_MODEL = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+ VISION_MODEL_PROCESSOR = AutoImageProcessor.from_pretrained('facebook/dinov2-small')
+ VISION_MODEL = AutoModel.from_pretrained('facebook/dinov2-small')
+
+ VLM_PROCESSOR = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
+ VLM = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
+
+ # init index
+ pc = Pinecone(
+     api_key=PINECONE_API_KEY
+ )
+ # Connect to an index
+ index_name = "mutlimodal-minilm"
+ INDEX = pc.Index(index_name)
+ MODEL_STACK = [TEXT_MODEL, VISION_MODEL, VISION_MODEL_PROCESSOR, VLM, VLM_PROCESSOR]
+
+
+ def is_valid_youtube_url(url):
+     """
+     Checks if the given URL is a valid YouTube video URL.
+
+     Returns True if valid, False otherwise.
+     """
+     youtube_regex = re.compile(
+         r"^(https?://)?(www\.)?(youtube\.com|youtu\.be)/"
+         r"(watch\?v=|embed/|v/|shorts/)?([a-zA-Z0-9_-]{11})"
+     )
+
+     match = youtube_regex.match(url)
+     return bool(match)
+
+
+ def check_exist_before_upsert(index, video_path):
+     # each extracted frame yields three vectors: image embeds, caption embeds, transcript embeds
+     video_folder = os.path.dirname(video_path)
+     threshold = len([elem for elem in os.listdir(video_folder) if elem.endswith('.jpg')]) * 3
+
+     dimension = 384
+     res = index.query(
+         vector=[0]*dimension,  # Dummy vector (not used for filtering)
+         top_k=10000,  # Set a high value to retrieve as many matches as possible
+         filter={"video_path": video_path}  # Filter by video_path
+     )
+
+     # Count the number of matching vectors
+     num_existing_vectors = len(res["matches"])
+
+     if num_existing_vectors >= threshold:
+         return True
+     return False
+
+
+ def chat(message, history):
+     image_input_path = None
+     if len(message['files']) > 0:
+         assert len(message['files']) == 1
+         image_input_path = message['files'][0]
+
+     message = message['text']
+
+     if history is None:
+         history = []
+
+     if message.startswith("https://"):
+         # Check valid URL
+         history.append((message, f"Checking if your provided URL at {message} is valid..."))
+         yield history
+
+         valid = is_valid_youtube_url(message)
+         if not valid:
+             history.append((None, "❌ Invalid YouTube URL. Please try again."))
+             yield history
+             return
+
+         # Check metadata
+         history.append((None, "✅ URL is valid! Fetching video metadata..."))
+         yield history
+
+         video_metadata = video_utils.get_video_metadata(message)
+         history.append((None, f"The video you want to process is: \nTitle: {video_metadata['title']} published by {video_metadata['author']} on {video_metadata['publish_date']}."))
+         yield history
+
+         history.append((None, "⏳ Downloading video..."))
+         yield history
+
+         output_folder_path = os.path.join(UPLOAD_FOLDER, video_metadata['title'])
+         path_to_video = os.path.join(output_folder_path, "video.mp4")
+         if not os.path.exists(path_to_video):
+             path_to_video = utils.download_video(message, path=output_folder_path)
+
+         history.append((None, "⏳ Transcribing video..."))
+         yield history
+         path_to_audio_file = os.path.join(output_folder_path, "audio.mp3")
+         if not os.path.exists(path_to_audio_file):
+             path_to_audio_file = video_utils.extract_audio(path_to_video, output_folder_path)
+
+         path_to_generated_transcript = os.path.join(output_folder_path, "transcript.vtt")
+         if not os.path.exists(path_to_generated_transcript):
+             path_to_generated_transcript = video_utils.transcribe_video(path_to_audio_file, output_folder_path)
+
+         # extract frames and metadata
+         metadatas_path = os.path.join(output_folder_path, 'metadatas.json')
+         if not os.path.exists(metadatas_path):
+             metadatas = video_utils.extract_and_save_frames_and_metadata(path_to_video=path_to_video,
+                                                                          path_to_transcript=path_to_generated_transcript,
+                                                                          path_to_save_extracted_frames=output_folder_path,
+                                                                          path_to_save_metadatas=output_folder_path)
+         else:
+             # frames were extracted on a previous run; reload their metadata for the captioning step
+             metadatas = utils.load_json_file(metadatas_path)
+
+         history.append((None, "⏳ Captioning video..."))
+         yield history
+
+         caption_path = os.path.join(output_folder_path, 'captions.json')
+         if not os.path.exists(caption_path):
+             video_frames = [os.path.join(output_folder_path, elem) for elem in os.listdir(output_folder_path) if elem.endswith('.jpg')]
+             metadatas_path = video_utils.get_video_caption(video_frames, metadatas, output_folder_path, vlm=VLM, vlm_processor=VLM_PROCESSOR)
+
+         history.append((None, "⏳ Indexing..."))
+         yield history
+         index_exist = check_exist_before_upsert(INDEX, path_to_video)
+         print(index_exist)
+         if not index_exist:
+             embed.indexing(INDEX, MODEL_STACK, metadatas_path)
+
+         # summarizing video
+         video_summary = rag.summarize_video(metadatas_path)
+         with open(os.path.join(output_folder_path, "summary.txt"), "w") as f:
+             f.write(video_summary)
+
+         history.append((None, f"Video processing complete! You can now ask me questions about the video {video_metadata['title']}!"))
+         yield history
+
+         global video_name
+         video_name = video_metadata['title']
+     else:
+         history.append((message, None))
+         yield history
+
+         if video_name is None:
+             history.append((None, "You need to provide a video URL before asking questions."))
+             yield history
+             return
+
+         output_folder_path = f"{UPLOAD_FOLDER}/{video_name}"
+         metadatas_path = os.path.join(output_folder_path, 'metadatas.json')
+
+         video_summary = ''
+         with open(f'./{output_folder_path}/summary.txt') as f:
+             while True:
+                 ln = f.readline()
+                 if ln == '':
+                     break
+                 video_summary += ln.strip()
+         video_path = os.path.join(output_folder_path, 'video.mp4')
+         answer = rag.answer_question(INDEX, MODEL_STACK, metadatas_path, video_summary, video_path, message, image_input_path)
+
+         history.append((None, answer))
+         yield history
+
+ def clear_chat(history):
+     history = []
+     history.append((None, "Please input a YouTube URL to get started!"))
+     return history
+
+ def main():
+     initial_messages = [(None, "Please input a YouTube URL to get started!")]
+
+     with gr.Blocks() as demo:
+         chatbot = gr.Chatbot(value=initial_messages)
+         msg = gr.MultimodalTextbox(file_types=['image'], sources=['upload'])
+
+         with gr.Row():
+             with gr.Column():
+                 submit = gr.Button("Send")
+                 submit.click(chat, [msg, chatbot], chatbot)
+
+             with gr.Column():
+                 clear = gr.Button("Clear")  # Clear button
+                 # Clear chat history when clear button is clicked
+                 clear.click(clear_chat, [], chatbot)
+
+     global video_name
+     video_name = None
+
+     demo.launch()
+
+ if __name__ == "__main__":
+     main()
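
Note that app.py only connects to the "mutlimodal-minilm" index; nothing in this commit creates it. A one-off setup sketch is shown below, where cloud, region, and metric are assumptions, while the dimension must be 384 to match all-MiniLM-L6-v2 and dinov2-small:

# One-off index setup sketch (not part of this commit); cloud/region/metric are assumptions.
import os
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec

load_dotenv()
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
index_name = "mutlimodal-minilm"  # same (typo'd) name that app.py connects to

if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # all-MiniLM-L6-v2 and dinov2-small both produce 384-d vectors
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )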
embed.py ADDED
@@ -0,0 +1,62 @@
+ from sentence_transformers import SentenceTransformer
+ from transformers import AutoImageProcessor, AutoModel
+ from tqdm import tqdm
+ from PIL import Image
+ from typing import Iterator, TextIO, List, Dict, Any, Optional, Sequence, Union
+ import numpy as np
+ import uuid
+ from utils import load_json_file
+
+ def embed_texts(text_ls:List[str], text_model=None):
+     if text_model is None:
+         text_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+
+     text_embeddings = []
+     for i, text in enumerate(tqdm(text_ls, desc="Embedding text")):
+         embeds = text_model.encode(text)
+         text_embeddings.append(embeds)
+     return np.array(text_embeddings)
+
+
+ def embed_images(image_path_ls:List[str], vision_model=None, vision_model_processor=None):
+     if vision_model is None or vision_model_processor is None:
+         vision_model_processor = AutoImageProcessor.from_pretrained('facebook/dinov2-small')
+         vision_model = AutoModel.from_pretrained('facebook/dinov2-small')
+
+     image_embeds_ls = []
+     for i, frame in enumerate(tqdm(image_path_ls, desc="Embedding image")):
+         frame = Image.open(frame)
+         # TODO: add device here
+         inputs = vision_model_processor(images=frame, return_tensors="pt")
+         outputs = vision_model(**inputs)
+         image_embeds_ls.append(outputs.pooler_output)
+     return np.array([elem.squeeze().detach().numpy() for elem in image_embeds_ls])
+
+
+ def indexing(index, model_stack, vid_metadata_path):
+     text_model, vision_model, vision_model_processor, _, _ = model_stack
+
+     # read metadata file
+     vid_metadata = load_json_file(vid_metadata_path)
+
+     # embed transcripts
+     vid_trans = [frame['transcript'] for frame in vid_metadata]
+     transcript_embeddings = embed_texts(text_ls=vid_trans, text_model=text_model)
+
+     # embed captions
+     vid_captions = [frame['caption'] for frame in vid_metadata]
+     caption_embeddings = embed_texts(text_ls=vid_captions, text_model=text_model)
+
+     # embed frames
+     vid_img_paths = [vid['extracted_frame_path'] for vid in vid_metadata]
+     frame_embeddings = embed_images(vid_img_paths, vision_model, vision_model_processor)
+
+     for ls in [transcript_embeddings, caption_embeddings, frame_embeddings]:
+         # Prepare metadata
+         vectors = [
+             (str(uuid.uuid4()), emb.tolist(), meta)  # Generate unique IDs
+             for emb, meta in zip(ls, vid_metadata)
+         ]
+         # Upsert vectors into Pinecone
+         index.upsert(vectors)
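
For reference, indexing() can also be driven outside the Gradio app once metadatas.json contains transcripts, captions, and frame paths. A rough sketch with a placeholder path (the VLM slots of the model stack are unused by this function):

# Standalone indexing sketch (placeholder path); mirrors how app.py calls embed.indexing.
import os
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer
from transformers import AutoImageProcessor, AutoModel
import embed

index = Pinecone(api_key=os.getenv("PINECONE_API_KEY")).Index("mutlimodal-minilm")
text_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
vision_processor = AutoImageProcessor.from_pretrained("facebook/dinov2-small")
vision_model = AutoModel.from_pretrained("facebook/dinov2-small")

# indexing() only uses the first three entries, so the VLM slots can stay None here.
model_stack = [text_model, vision_model, vision_processor, None, None]
embed.indexing(index, model_stack, "uploads/Some Video Title/metadatas.json")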
rag.py ADDED
@@ -0,0 +1,270 @@
+ from langchain.prompts import PromptTemplate
+ from langchain_community.chat_models import ChatOpenAI
+ from langchain.chains import LLMChain
+ from PIL import Image
+ import os
+ from utils import load_json_file, str2time
+ from openai import OpenAI
+ import base64
+
+ def get_smallest_timestamp(timestamps):
+     assert len(timestamps) > 0
+
+     timestamps_in_ms = [str2time(elem) for elem in timestamps]
+
+     smallest_timestamp_in_ms = timestamps_in_ms[0]
+     smallest_timestamp = timestamps[0]
+     for i, elem in enumerate(timestamps_in_ms):
+         if elem < smallest_timestamp_in_ms:
+             smallest_timestamp_in_ms = elem
+             smallest_timestamp = timestamps[i]
+     return smallest_timestamp
+
+ def generate(query, context, relevant_timestamps=None):
+     prompt = PromptTemplate(input_variables=["question", "context"], template="You're a helpful LLM assistant in answering questions regarding a video. The given contexts are segments relevant to the question; please answer the question. Do not refer to segments. Context: {context}, question: {question} \nA:")
+
+     llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)
+     chain = LLMChain(llm=llm, prompt=prompt)
+     response = chain.run(question=query, context=context)
+
+     if relevant_timestamps is not None and len(relevant_timestamps) > 0:
+         # get smallest timestamp = earliest mention
+         smallest_timestamp = get_smallest_timestamp(relevant_timestamps)
+         response += f' {smallest_timestamp}'
+     return response
+
+
+ def check_relevance(query, relevant_metadatas):
+     transcripts = [frame['transcript'] for frame in relevant_metadatas]
+     captions = [frame['caption'] for frame in relevant_metadatas]
+     timestamps = [frame['start_time'] for frame in relevant_metadatas]
+
+     context = ""
+     for i in range(len(transcripts)):
+         context += f"Segment {i}: transcript={transcripts[i]} caption={captions[i]}\n"
+     # print(context)
+
+     prompt = PromptTemplate(input_variables=["question", "context"], template="""
+     You are a grader assessing the relevance of a retrieved video segment to a user question. \n
+     If the video segment contains keyword(s) or semantic meaning related to the question, grade it as relevant. \n
+     Give a binary 'yes' or 'no' score to indicate whether the video segment is relevant to the question. \n
+     Answer in a string, separated by commas. For example: if four segments are provided, answer: yes,no,no,yes. \n
+     Question: {question} Context: {context}\n A:""")
+
+     # query = "What are the books mentioned in the video?"
+     llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)
+     chain = LLMChain(llm=llm, prompt=prompt)
+     response = chain.run(question=query, context=context)
+     # print(response)
+
+     relevance_response = response.split(',')
+
+     actual_relevant_context = ""
+     relevant_timestamps = []
+     for i, relevance_check in enumerate(relevance_response):
+         if relevance_check.strip() == 'yes':
+             actual_relevant_context += f"Segment {i}: transcript={transcripts[i]} caption={captions[i]}\n"
+             relevant_timestamps.append(timestamps[i])
+     return actual_relevant_context, relevant_timestamps
+
+
+ def retrieve_segments_from_timestamp(metadatas, timestamps):
+     relevant_segments = []
+
+     for timestamp in timestamps:
+         time_to_find_ms = str2time(timestamp)
+         buffer = 5000  # 5 seconds before and after
+
+         for segment in metadatas:
+             start = str2time(segment['start_time'])
+             end = str2time(segment['end_time'])
+             if start <= time_to_find_ms + buffer and end >= time_to_find_ms - buffer:
+                 relevant_segments.append(segment)
+
+     return relevant_segments
+
+
+ def check_timestamps(query):
+     prompt = PromptTemplate(input_variables=["question"], template="You're a helpful LLM assistant. You're good at detecting any timestamps provided in a query. Please detect the question and timestamps in the following question and separate them by commas, such as question,timestamp1,timestamp2 if timestamps are provided, else just the question. Question: {question} \nA:")
+
+     llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)
+     chain = LLMChain(llm=llm, prompt=prompt)
+     response = chain.run(question=query)
+
+     timestamps = []
+     if len(response.split(',')) > 1:
+         query = response.split(',')[0].strip()
+         timestamps = [f"00:{elem.strip()}.00" for elem in response.split(',')[1:]]
+
+     return query, timestamps
+
+ def retrieve_by_embedding(index, video_path, query, text_model):
+     print(query)
+     query_embedding = text_model.encode(query)
+
+     res = index.query(vector=query_embedding.tolist(), top_k=5, filter={"video_path": {"$eq": video_path}})
+
+     metadatas = []
+     for id_, match_ in enumerate(res['matches']):
+         result = index.fetch(ids=[match_['id']])
+
+         # Extract the vector data
+         vector_data = result.vectors.get(match_['id'], {})
+
+         # Extract metadata
+         metadata = vector_data.metadata
+         metadatas.append(metadata)
+
+     return metadatas
+
+ def self_reflection(query, answer, summary):
+     prompt = PromptTemplate(input_variables=["summary", "question", "answer"], template="You're a helpful LLM assistant. You're good at determining if the provided answer is satisfactory for a question relating to a video. You have access to the video summary as follows: {summary}. Given a pair of question and answer, give the answer's satisfactory score as either yes or no. Question: {question}, Answer: {answer} \nA:")
+
+     llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)
+     chain = LLMChain(llm=llm, prompt=prompt)
+     response = chain.run(summary=summary, question=query, answer=answer)
+     return response
+
+
+ def get_full_transcript(metadatas):
+     # metadatas = webvtt.read(path_to_transcript)
+     transcripts = [frame['transcript'] for frame in metadatas]
+
+     full_text = ''
+     for idx, transcript in enumerate(transcripts):
+         text = transcript.strip().replace("  ", " ")
+         full_text += f"{text} "
+
+     full_text = full_text.strip()
+     return full_text
+
+ def summarize_video(metadatas_path:str):
+     metadatas = load_json_file(metadatas_path)
+
+     # get full transcript
+     transcript = get_full_transcript(metadatas)
+     prompt = PromptTemplate(input_variables=["transcript"], template="You're a helpful LLM assistant. Please provide a summary of the video given its full transcript: {transcript} \nA:")
+
+     llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)
+     chain = LLMChain(llm=llm, prompt=prompt)
+     response = chain.run(transcript=transcript)
+     return response
+
+ def answer_wrt_timestamp(query, context):
+     prompt = PromptTemplate(input_variables=["question", "context"], template="""
+     You're a helpful LLM assistant. Given a question and a timestamp, I have retrieved the relevant context as follows. Please answer the question using the information provided in the context. Question: {question}, context: {context} \n
+     For example: Question="What happens at 4:20?" Caption="a person is standing up" Transcript="I have to go" Appropriate Answer="At 4:20, a person is standing up and saying he has to go."
+     A:""")
+     llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)
+     chain = LLMChain(llm=llm, prompt=prompt)
+     response = chain.run(question=query, context=context)
+     return response
+
+
+ def answer_question(index, model_stack, metadatas_path, video_summary:str, video_path:str, query:str, image_input_path:str=None):
+     metadatas = load_json_file(metadatas_path)
+     if image_input_path is not None:
+         return answer_image_question(index, model_stack, metadatas, video_summary, video_path, query, image_input_path)
+
+     # check if a timestamp was provided
+     query, timestamps = check_timestamps(query)
+
+     if len(timestamps) > 0:
+         # retrieve by timestamps
+         relevant_segments_metadatas = retrieve_segments_from_timestamp(metadatas, timestamps)
+         transcripts = [frame['transcript'] for frame in relevant_segments_metadatas]
+         captions = [frame['caption'] for frame in relevant_segments_metadatas]
+         context = ""
+         for i in range(len(transcripts)):
+             context += f"Segment {i}: transcript={transcripts[i]} caption={captions[i]}\n"
+         # print(context)
+         return answer_wrt_timestamp(query, context)
+     else:
+         # retrieve by embedding
+         relevant_segments_metadatas = retrieve_by_embedding(index, video_path, query, model_stack[0])
+
+         # check relevance
+         actual_relevant_context, relevant_timestamps = check_relevance(query, relevant_segments_metadatas)
+         # relevant_timestamps = [frame['start_time'] for frame in relevant_segments_metadatas]
+         # print(actual_relevant_context)
+
+         # generate
+         answer = generate(query, actual_relevant_context, relevant_timestamps)
+         # print(answer)
+
+         # self-reflection
+         reflect = self_reflection(query, answer, video_summary)
+
+         # print("Reflect", reflect)
+         if reflect.strip().lower() == 'no':
+             answer = generate(query, f"{actual_relevant_context}\nSummary={video_summary}")
+
+         return answer
+
+ def retrieve_segments_by_image_embedding(index, video_path, model_stack, image_query_path):
+     image_query = Image.open(image_query_path)
+     _, vision_model, vision_model_processor, _, _ = model_stack
+     inputs = vision_model_processor(images=image_query, return_tensors="pt")
+     outputs = vision_model(**inputs)
+     image_query_embeds = outputs.pooler_output
+
+     # pooler_output has shape (1, dim); flatten it so Pinecone receives a flat vector
+     res = index.query(vector=image_query_embeds.squeeze(0).detach().tolist(), top_k=5, filter={"video_path": {"$eq": video_path}})
+
+     metadatas = []
+     for id_, match_ in enumerate(res['matches']):
+         result = index.fetch(ids=[match_['id']])
+
+         # Extract the vector data
+         vector_data = result.vectors.get(match_['id'], {})
+
+         # Extract metadata
+         metadata = vector_data.metadata
+         metadatas.append(metadata)
+
+     return metadatas
+
+
+ def answer_image_question(index, model_stack, metadatas, video_summary:str, video_path:str, query:str, image_query_path:str=None):
+     # search segments by image
+     relevant_segments = retrieve_segments_by_image_embedding(index, video_path, model_stack, image_query_path)
+
+     # generate an answer using those segments
+     return generate_w_image(query, image_query_path, relevant_segments)
+
+
+ def encode_image(image_path):
+     with open(image_path, "rb") as image_file:
+         return base64.b64encode(image_file.read()).decode("utf-8")
+
+
+ def generate_w_image(query:str, image_query_path:str, relevant_metadatas):
+     base64_image = encode_image(image_query_path)
+     transcripts = [frame['transcript'] for frame in relevant_metadatas]
+     captions = [frame['caption'] for frame in relevant_metadatas]
+     # timestamps = [frame['start_time'] for frame in relevant_metadatas]
+
+     context = ""
+     for i in range(len(transcripts)):
+         context += f"Segment {i}: transcript={transcripts[i]} caption={captions[i]}\n"
+     # print(context)
+
+     client = OpenAI()
+     response = client.chat.completions.create(
+         model="gpt-4o-mini",
+         messages=[
+             {"role": "user", "content": [
+                 {"type": "text", "text": f"Here is some context about the image: {context}"},  # Add context here
+                 {"type": "text", "text": f"You are a helpful LLM assistant. You are good at answering questions about a video given an image. Given the image and the context surrounding the frames most correlated with it, please answer the question. Question: {query}"},
+                 {"type": "image_url", "image_url": {
+                     "url": f"data:image/png;base64,{base64_image}"
+                 }
+                 }
+             ]}
+         ],
+         temperature=0.0,
+         max_tokens=100,
+     )
+
+     response = response.choices[0].message.content
+     # print(response)
+     return response
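
Assuming a video has already been processed by app.py (so summary.txt and metadatas.json exist under its uploads folder and its vectors are in Pinecone), the retrieval pipeline can be exercised directly. A sketch with placeholder paths, with OPENAI_API_KEY and PINECONE_API_KEY loaded from .env:

# Direct usage sketch (placeholder folder); requires the video to have been processed already.
import os
from dotenv import load_dotenv
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer
import rag

load_dotenv()
index = Pinecone(api_key=os.getenv("PINECONE_API_KEY")).Index("mutlimodal-minilm")
text_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
model_stack = [text_model, None, None, None, None]  # vision/VLM slots unused for text-only queries

folder = "uploads/Some Video Title"  # placeholder folder created by app.py
metadatas_path = os.path.join(folder, "metadatas.json")
with open(os.path.join(folder, "summary.txt")) as f:
    summary = f.read()

answer = rag.answer_question(index, model_stack, metadatas_path, summary,
                             os.path.join(folder, "video.mp4"),
                             "What is the video about?")
print(answer)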
requirements.txt ADDED
@@ -0,0 +1,17 @@
+ openai-whisper
+ webvtt-py
+ pytubefix
+ sentence-transformers
+ pinecone
+ gradio
+ moviepy
+ youtube-transcript-api
+ pytube
+ ffmpeg-python
+ ffmpeg
+ opencv-python
+ langchain_yt_dlp
+ langchain_community
+ langchain
+ python-dotenv
+ transformers==4.49.0
+ numpy==1.26.4
+ openai==1.68.2
utils.py ADDED
@@ -0,0 +1,91 @@
+ import os
+ from io import StringIO, BytesIO
+ from typing import Iterator, TextIO, List, Dict, Any, Optional, Sequence, Union
+ import base64
+ import glob
+ from tqdm import tqdm
+ from pytubefix import YouTube, Stream
+ import cv2
+ import json
+
+ # Taken from the course: https://www.deeplearning.ai/short-courses/multimodal-rag-chat-with-videos/
+ def getSubs(segments: Iterator[dict], format: str, maxLineWidth: int=-1) -> str:
+     # NOTE: write_vtt / write_srt are assumed to be provided by the course helpers
+     # (older openai-whisper releases exposed similar writers in whisper.utils); they are not defined in this commit.
+     segmentStream = StringIO()
+
+     if format == 'vtt':
+         write_vtt(segments, file=segmentStream, maxLineWidth=maxLineWidth)
+     elif format == 'srt':
+         write_srt(segments, file=segmentStream, maxLineWidth=maxLineWidth)
+     else:
+         raise Exception("Unknown format " + format)
+
+     segmentStream.seek(0)
+     return segmentStream.read()
+
+ def download_video(video_url, path='/tmp/'):
+     print(f'Getting video information for {video_url}')
+     if not video_url.startswith('http'):
+         return os.path.join(path, video_url)
+
+     filepath = glob.glob(os.path.join(path, '*.mp4'))
+     if len(filepath) > 0:
+         return filepath[0]
+
+     def progress_callback(stream: Stream, data_chunk: bytes, bytes_remaining: int) -> None:
+         pbar.update(len(data_chunk))
+
+     yt = YouTube(video_url, on_progress_callback=progress_callback)
+     stream = yt.streams.filter(progressive=True, file_extension='mp4', res='720p').desc().first()
+     if stream is None:
+         stream = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first()
+     if not os.path.exists(path):
+         os.makedirs(path)
+
+     filepath = os.path.join(path, stream.default_filename)
+     if not os.path.exists(filepath):
+         print('Downloading video from YouTube...')
+         pbar = tqdm(desc='Downloading video from YouTube', total=stream.filesize, unit="bytes")
+         stream.download(path)
+         pbar.close()
+     return filepath
+
+ # a helper function that converts a time string in webvtt format into a time in milliseconds
+ def str2time(strtime):
+     # strip the character " if it exists
+     strtime = strtime.strip('"')
+     # get hours, minutes, seconds from the time string
+     hrs, mins, seconds = [float(c) for c in strtime.split(':')]
+     # get the corresponding time as total seconds
+     total_seconds = hrs * 60**2 + mins * 60 + seconds
+     total_milliseconds = total_seconds * 1000
+     return total_milliseconds
+
+ # Resizes an image while maintaining its aspect ratio
+ def maintain_aspect_ratio_resize(image, width=None, height=None, inter=cv2.INTER_AREA):
+     # Grab the image size and initialize dimensions
+     dim = None
+     (h, w) = image.shape[:2]
+
+     # Return original image if no need to resize
+     if width is None and height is None:
+         return image
+
+     # We are resizing height if width is None
+     if width is None:
+         # Calculate the ratio of the height and construct the dimensions
+         r = height / float(h)
+         dim = (int(w * r), height)
+     # We are resizing width if height is None
+     else:
+         # Calculate the ratio of the width and construct the dimensions
+         r = width / float(w)
+         dim = (width, int(h * r))
+
+     # Return the resized image
+     return cv2.resize(image, dim, interpolation=inter)
+
+ def load_json_file(file_path):
+     # Open the JSON file in read mode
+     with open(file_path, 'r') as file:
+         data = json.load(file)
+     return data
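
getSubs above calls write_vtt and write_srt, which this commit never defines or imports, so it raises a NameError as written. Below is a minimal sketch of helpers that could back it if added to utils.py above getSubs, modeled on the writers that older openai-whisper releases shipped in whisper.utils (an assumption; the maxLineWidth argument is accepted but line wrapping is not implemented):

# Minimal sketch (assumption): VTT/SRT writers compatible with getSubs above.
def _format_timestamp(seconds: float, decimal: str = '.') -> str:
    # hours are always included so that utils.str2time can parse the resulting timestamps
    ms = round(seconds * 1000)
    hrs, ms = divmod(ms, 3_600_000)
    mins, ms = divmod(ms, 60_000)
    secs, ms = divmod(ms, 1_000)
    return f"{hrs:02d}:{mins:02d}:{secs:02d}{decimal}{ms:03d}"

def write_vtt(transcript, file, maxLineWidth: int = -1):
    print("WEBVTT\n", file=file)
    for segment in transcript:
        print(f"{_format_timestamp(segment['start'])} --> {_format_timestamp(segment['end'])}\n"
              f"{segment['text'].strip().replace('-->', '->')}\n", file=file)

def write_srt(transcript, file, maxLineWidth: int = -1):
    for i, segment in enumerate(transcript, start=1):
        print(f"{i}\n"
              f"{_format_timestamp(segment['start'], decimal=',')} --> "
              f"{_format_timestamp(segment['end'], decimal=',')}\n"
              f"{segment['text'].strip().replace('-->', '->')}\n", file=file)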
video_utils.py ADDED
@@ -0,0 +1,156 @@
+ from typing import Iterator, TextIO, List, Dict, Any, Optional, Sequence, Union
+ from utils import getSubs, str2time, maintain_aspect_ratio_resize
+ from moviepy import VideoFileClip
+ import whisper
+ import os
+ import cv2
+ import webvtt
+ from PIL import Image
+ from tqdm import tqdm
+ import json
+ from langchain_yt_dlp.youtube_loader import YoutubeLoaderDL
+ from transformers import BlipProcessor, BlipForConditionalGeneration
+
+
+ # get video metadata
+ def get_video_metadata(video_url:str):
+     docs = YoutubeLoaderDL.from_youtube_url(video_url, add_video_info=True).load()
+     return docs[0].metadata
+
+ # extract audio
+ def extract_audio(path_to_video:str, output_folder:str):
+     video_name = os.path.basename(path_to_video).replace('.mp4', '')
+
+     # declare where to save the .mp3 audio
+     path_to_extracted_audio_file = os.path.join(output_folder, f'{video_name}.mp3')
+
+     # extract the mp3 audio track from the mp4 video file
+     clip = VideoFileClip(path_to_video)
+     clip.audio.write_audiofile(path_to_extracted_audio_file)
+     return path_to_extracted_audio_file
+
+
+ # Get video transcript
+ def transcribe_video(path_to_extracted_audio_file, output_folder, whisper_model=None):
+     # load model
+     if whisper_model is None:
+         whisper_model = whisper.load_model("small")
+     options = dict(task="translate", best_of=1, language='en')
+     results = whisper_model.transcribe(path_to_extracted_audio_file, **options)
+
+     vtt = getSubs(results["segments"], "vtt")
+     # path to save the generated transcript
+     video_name = os.path.basename(path_to_extracted_audio_file).replace('.mp3', '')
+     path_to_generated_transcript = os.path.join(output_folder, f'{video_name}.vtt')
+
+     # write transcription to file
+     with open(path_to_generated_transcript, 'w') as f:
+         f.write(vtt)
+     return path_to_generated_transcript
+
+
+ # get video frames & metadata
+ def extract_and_save_frames_and_metadata(
+         path_to_video,
+         path_to_transcript,
+         path_to_save_extracted_frames,
+         path_to_save_metadatas):
+
+     # metadatas will store the metadata of all extracted frames
+     metadatas = []
+
+     # load video using cv2
+     video = cv2.VideoCapture(path_to_video)
+     # load transcript using webvtt
+     trans = webvtt.read(path_to_transcript)
+
+     # iterate the transcript file
+     # for each video segment specified in the transcript file
+     for idx, transcript in enumerate(trans):
+         # get the start time and end time in milliseconds
+         start_time_ms = str2time(transcript.start)
+         end_time_ms = str2time(transcript.end)
+         # get the time in ms exactly
+         # in the middle of start time and end time
+         mid_time_ms = (end_time_ms + start_time_ms) / 2
+         # get the transcript, remove the next-line symbol
+         text = transcript.text.replace("\n", ' ')
+         # get the frame at the middle time
+         video.set(cv2.CAP_PROP_POS_MSEC, mid_time_ms)
+         success, frame = video.read()
+         if success:
+             # if the frame is extracted successfully, resize it
+             image = maintain_aspect_ratio_resize(frame, height=350)
+             # save frame as JPEG file
+             img_fname = f'frame_{idx}.jpg'
+             img_fpath = os.path.join(
+                 path_to_save_extracted_frames, img_fname
+             )
+             cv2.imwrite(img_fpath, image)
+
+             # prepare the metadata
+             metadata = {
+                 'extracted_frame_path': img_fpath,
+                 'transcript': text,
+                 'video_segment_id': idx,
+                 'video_path': path_to_video,
+                 'start_time': transcript.start,
+                 'end_time': transcript.end
+             }
+             metadatas.append(metadata)
+         else:
+             print(f"ERROR! Cannot extract frame: idx = {idx}")
+
+     # merge neighboring transcripts to mitigate disjointed segments
+     metadatas = update_transcript(metadatas)
+
+     # save metadata of all extracted frames
+     fn = os.path.join(path_to_save_metadatas, 'metadatas.json')
+     with open(fn, 'w') as outfile:
+         json.dump(metadatas, outfile)
+     return metadatas
+
+
+ def update_transcript(vid_metadata, n=7):
+     vid_trans = [frame['transcript'] for frame in vid_metadata]
+     updated_vid_trans = [
+         ' '.join(vid_trans[i-int(n/2) : i+int(n/2)]) if i-int(n/2) >= 0 else
+         ' '.join(vid_trans[0 : i + int(n/2)]) for i in range(len(vid_trans))
+     ]
+
+     # also write the updated transcripts back into the metadata
+     for i in range(len(updated_vid_trans)):
+         vid_metadata[i]['transcript'] = updated_vid_trans[i]
+     return vid_metadata
+
+
+ # get video captions
+ def get_video_caption(path_to_video_frames: List, metadatas, output_folder_path:str, vlm=None, vlm_processor=None):
+     if vlm is None or vlm_processor is None:
+         vlm_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
+         vlm = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
+
+     frame_caption = {}
+     for i, frame_path in enumerate(tqdm(path_to_video_frames, desc="Captioning frames")):
+         frame = Image.open(frame_path)
+         inputs = vlm_processor(frame, return_tensors="pt")
+
+         out = vlm.generate(**inputs)
+         caption = vlm_processor.decode(out[0], skip_special_tokens=True)
+         frame_caption[frame_path] = caption
+
+     caption_out_path = os.path.join(output_folder_path, 'captions.json')
+     with open(caption_out_path, 'w') as outfile:
+         json.dump(frame_caption, outfile)
+
+     # save video captions to metadata
+     for frame_metadata in metadatas:
+         frame_metadata['caption'] = frame_caption[frame_metadata['extracted_frame_path']]
+
+     metadatas_out_path = os.path.join(output_folder_path, 'metadatas.json')
+     with open(metadatas_out_path, 'w') as outfile:
+         json.dump(metadatas, outfile)
+     return metadatas_out_path
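
Taken together with utils.download_video, the helpers above form the offline preprocessing pipeline that app.py drives from the chat handler. A condensed sketch of the call order, with a placeholder URL:

# Preprocessing order sketch (placeholder URL); app.py runs the same sequence per video.
import os
import utils
import video_utils

url = "https://www.youtube.com/watch?v=VIDEO_ID"  # placeholder
meta = video_utils.get_video_metadata(url)
folder = os.path.join("uploads", meta["title"])

video_path = utils.download_video(url, path=folder)
audio_path = video_utils.extract_audio(video_path, folder)
vtt_path = video_utils.transcribe_video(audio_path, folder)  # loads whisper "small" if no model is passed

metadatas = video_utils.extract_and_save_frames_and_metadata(
    path_to_video=video_path,
    path_to_transcript=vtt_path,
    path_to_save_extracted_frames=folder,
    path_to_save_metadatas=folder,
)
frames = [os.path.join(folder, f) for f in os.listdir(folder) if f.endswith(".jpg")]
video_utils.get_video_caption(frames, metadatas, folder)  # writes captions.json and updates metadatas.json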