File size: 7,814 Bytes
d50ce1c
 
 
 
 
 
 
5de0912
d50ce1c
 
 
527422b
d50ce1c
 
 
 
 
 
 
 
 
5de0912
d50ce1c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
527422b
 
5de0912
 
 
 
 
d50ce1c
527422b
d50ce1c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5de0912
 
 
 
d50ce1c
 
5de0912
 
 
 
 
 
d50ce1c
 
 
 
 
 
5de0912
d50ce1c
5de0912
d50ce1c
 
5de0912
 
d50ce1c
5de0912
d50ce1c
 
5de0912
d50ce1c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5de0912
d50ce1c
 
5de0912
 
d50ce1c
 
 
 
5de0912
 
d50ce1c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5de0912
d50ce1c
 
 
5de0912
d50ce1c
 
 
5de0912
d50ce1c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
import gradio as gr
import time 
import re
import video_utils
import utils
import embed
import rag
import shutil
import os
import uuid
import numpy as np
import pinecone
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer
from transformers import AutoImageProcessor, AutoModel
from transformers import BlipProcessor, BlipForConditionalGeneration
from dotenv import load_dotenv

load_dotenv()  # Load API keys and other settings from a local .env file

# Folder where uploaded videos and their derived artifacts (frames, audio,
# transcript, metadata, captions, summary) are stored.
UPLOAD_FOLDER = 'uploads'
# Name of the most recently processed video; set by chat() after ingestion
# and read by later question-answering turns.
global_video_name = None

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") 
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY") 

# init models 
# Text embedder; its output dimension must match the index dimension (384).
TEXT_MODEL = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# Vision backbone + preprocessor used for frame embeddings.
VISION_MODEL_PROCESSOR = AutoImageProcessor.from_pretrained('facebook/dinov2-small')
VISION_MODEL = AutoModel.from_pretrained('facebook/dinov2-small')

# Captioning VLM (BLIP) used to describe extracted video frames.
VLM_PROCESSOR = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
VLM = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# init index 
pc = Pinecone(
    api_key=PINECONE_API_KEY
)
# Connect to an index
index_name = "multimodal-minilm"
# Create the serverless index on first run; dimension 384 matches TEXT_MODEL.
if index_name not in pc.list_indexes().names():
    pc.create_index(index_name, dimension=384, metric="cosine",
                    spec=ServerlessSpec(
                            cloud="aws",
                            region="us-east-1"
                        ))
INDEX = pc.Index(index_name)

# Bundle passed to embed/rag helpers so they share the same model instances.
MODEL_STACK = [TEXT_MODEL, VISION_MODEL, VISION_MODEL_PROCESSOR, VLM, VLM_PROCESSOR]


def is_valid_youtube_url(url):
    """
    Decide whether `url` looks like a YouTube video URL.

    Accepts youtube.com and youtu.be hosts, with an optional scheme and
    optional "www.", followed by an optional watch/embed/v/shorts path
    segment and an 11-character video id.

    Returns:
        bool: True when the URL matches, False otherwise.
    """
    pattern = (
        r"^(https?://)?(www\.)?(youtube\.com|youtu\.be)/"
        r"(watch\?v=|embed/|v/|shorts/)?([a-zA-Z0-9_-]{11})"
    )
    return re.match(pattern, url) is not None


def check_exist_before_upsert(index, video_path):
    """
    Return True when `video_path` already has a full set of vectors in
    `index`, meaning indexing can be skipped.

    The expected vector count is 3 per extracted frame (image embedding,
    caption embedding, transcript embedding), where frames are the .jpg
    files stored in the same folder as the video file.

    Args:
        index: Pinecone index handle supporting `.query(...)`.
        video_path: Path to the video, e.g. "uploads/<name>/video.mp4".

    Returns:
        bool: True if the index already holds at least the expected number
        of vectors for this video_path, False otherwise.
    """
    # Frames live next to the video file. The original code used
    # video_path.split('/')[0], which listed the top-level upload folder
    # (no .jpg files there), so the threshold was always 0 and the function
    # always reported "already indexed" — new videos were never indexed.
    frame_dir = os.path.dirname(video_path)
    num_frames = len([f for f in os.listdir(frame_dir) if f.endswith('.jpg')])
    threshold = num_frames * 3  # image embeds, caption embeds, transcript embeds

    # No extracted frames means this video cannot be fully indexed yet.
    if threshold == 0:
        return False

    dimension = 384
    res = index.query(
        vector=[0] * dimension,  # dummy vector (only the filter matters)
        top_k=10000,  # high value to retrieve as many matches as possible
        filter={"video_path": video_path}  # restrict to this video's vectors
    )

    # Indexed iff at least one full set of vectors is already present.
    return len(res["matches"]) >= threshold


def chat(message, history):
    """
    Gradio chat handler (generator): ingests an uploaded video or answers a
    question about the previously uploaded one.

    Args:
        message: Gradio MultimodalTextbox payload — a dict with 'text' and
            'files' (at most one file: a .jpg image or a .mp4 video).
        history: Current chatbot history as a list of (user, bot) tuples,
            or None on the first call.

    Yields:
        Updated history lists, so the UI streams progress messages.
    """
    global global_video_name

    image_input_path = None
    video_name, video_input_path = None, None

    if len(message['files']) > 0:
        # The UI is expected to attach at most one file per message.
        assert len(message['files']) == 1

        if message['files'][0].endswith('.jpg'):
            image_input_path = message['files'][0]
        elif message['files'][0].endswith('.mp4'):
            video_input_path = message['files'][0]
            video_name = os.path.basename(video_input_path).split('.mp4')[0]

    message = message['text']

    if history is None:
        history = []

    if video_name is not None:
        # --- Video ingestion pipeline (each step is cached on disk) ---
        history.append((None, f"✅ Video uploaded successfully! Your video's title is {video_name}..."))
        yield history

        output_folder_path = os.path.join(UPLOAD_FOLDER, video_name)
        os.makedirs(output_folder_path, exist_ok=True)

        path_to_video = os.path.join(output_folder_path, "video.mp4")
        if not os.path.exists(path_to_video):
            shutil.move(video_input_path, path_to_video)

        history.append((None, "⏳ Transcribing video..."))
        yield history
        path_to_audio_file = os.path.join(output_folder_path, "audio.mp3")
        if not os.path.exists(path_to_audio_file):
            path_to_audio_file = video_utils.extract_audio(path_to_video, output_folder_path)

        path_to_generated_transcript = os.path.join(output_folder_path, "transcript.vtt")
        if not os.path.exists(path_to_generated_transcript):
            path_to_generated_transcript = video_utils.transcribe_video(path_to_audio_file, output_folder_path)

        # Extract frames and per-frame metadata (cached in metadatas.json).
        metadatas_path = os.path.join(output_folder_path, 'metadatas.json')
        if not os.path.exists(metadatas_path):
            metadatas = video_utils.extract_and_save_frames_and_metadata(
                path_to_video=path_to_video,
                path_to_transcript=path_to_generated_transcript,
                path_to_save_extracted_frames=output_folder_path,
                path_to_save_metadatas=output_folder_path)
        else:
            # Bug fix: re-runs previously raised NameError here — `metadatas`
            # was only bound when the extraction step actually ran. Reload the
            # cached metadata instead (assumes extract_and_save_... wrote this
            # same JSON file — TODO confirm against video_utils).
            import json
            with open(metadatas_path) as f:
                metadatas = json.load(f)

        history.append((None, "⏳ Captioning video..."))
        yield history

        caption_path = os.path.join(output_folder_path, 'captions.json')
        if not os.path.exists(caption_path):
            video_frames = [os.path.join(output_folder_path, elem)
                            for elem in os.listdir(output_folder_path) if elem.endswith('.jpg')]
            video_utils.get_video_caption(video_frames, metadatas, output_folder_path,
                                          vlm=VLM, vlm_processor=VLM_PROCESSOR)

        history.append((None, "⏳ Indexing..."))
        yield history
        if not check_exist_before_upsert(INDEX, path_to_video):
            embed.indexing(INDEX, MODEL_STACK, metadatas_path)

        # Summarize once and cache to summary.txt for later Q&A turns.
        video_summary = rag.summarize_video(metadatas_path)
        with open(os.path.join(output_folder_path, "summary.txt"), "w") as f:
            f.write(video_summary)

        history.append((None, f"Video processing complete! You can now ask me questions about the video {video_name}!"))
        yield history

        # Remember the active video for subsequent question turns.
        global_video_name = video_name
    else:
        # --- Question answering about the last uploaded video ---
        history.append((message, None))
        yield history

        if global_video_name is None:
            history.append((None, "You need to upload a video before asking questions."))
            yield history
            return

        # Bug fix: this branch previously built the path from the local
        # `video_name` (always None here), yielding "uploads/None" and
        # breaking every Q&A turn. Use the remembered global instead.
        output_folder_path = os.path.join(UPLOAD_FOLDER, global_video_name)
        metadatas_path = os.path.join(output_folder_path, 'metadatas.json')

        with open(os.path.join(output_folder_path, 'summary.txt')) as f:
            video_summary = ''.join(line.strip() for line in f)

        video_path = os.path.join(output_folder_path, 'video.mp4')
        answer = rag.answer_question(INDEX, MODEL_STACK, metadatas_path, video_summary,
                                     video_path, message, image_input_path)

        history.append((None, answer))
        yield history

def clear_chat(history):
    """Reset the chatbot to its initial greeting.

    The incoming history is discarded; a fresh list containing only the
    "upload a video" prompt is returned for the Chatbot component.
    """
    return [(None, "Please upload a video to get started!")]

def main():
    """Build and launch the Gradio chat UI.

    Wires a Chatbot pane to `chat` (Send button) and `clear_chat` (Clear
    button). The multimodal textbox accepts image files and .mp4 videos
    via upload.
    """
    initial_messages = [(None, "Please upload a video to get started!")]

    with gr.Blocks() as demo:
        chatbot = gr.Chatbot(value=initial_messages)
        msg = gr.MultimodalTextbox(file_types=['image', '.mp4'], sources=['upload'])

        with gr.Row():
            with gr.Column():
                submit = gr.Button("Send")
                submit.click(chat, [msg, chatbot], chatbot)

            with gr.Column():
                clear = gr.Button("Clear")
                # Reset the chat history when the clear button is clicked.
                clear.click(clear_chat, [], chatbot)
                # NOTE(review): a leftover `global video_name = None` used to
                # live here; nothing reads that global (chat() tracks upload
                # state via global_video_name), so it was removed as dead code.

    demo.launch()

if __name__ == "__main__":
    main()