Update app.py
Browse files
app.py
CHANGED
@@ -40,7 +40,7 @@ def extract_frames(video_in, output_format='.jpg'):
|
|
40 |
# Adjust interval to video length
|
41 |
video_clip = VideoFileClip(video_in)
|
42 |
if video_clip.duration <= 5:
|
43 |
-
interval =
|
44 |
else :
|
45 |
interval = 24
|
46 |
|
@@ -165,9 +165,11 @@ def llm_process(user_prompt):
|
|
165 |
|
166 |
def infer(video_in):
|
167 |
# Extract frames from a video
|
|
|
168 |
frame_files = extract_frames(video_in)
|
169 |
|
170 |
# Process each extracted frame and collect results in a list
|
|
|
171 |
processed_texts = []
|
172 |
for frame_file in frame_files:
|
173 |
text = process_image(frame_file)
|
@@ -184,6 +186,7 @@ def infer(video_in):
|
|
184 |
print(extracted_audio)
|
185 |
|
186 |
# Get description of audio content
|
|
|
187 |
audio_content_described = get_salmonn(extracted_audio)
|
188 |
else :
|
189 |
audio_content_described = "Video has no sound."
|
@@ -195,6 +198,7 @@ def infer(video_in):
|
|
195 |
print(formatted_captions)
|
196 |
|
197 |
# Send formatted captions to LLM
|
|
|
198 |
video_description_from_llm = llm_process(formatted_captions)
|
199 |
|
200 |
return video_description_from_llm
|
@@ -213,7 +217,12 @@ div#video-text textarea {
|
|
213 |
with gr.Blocks(css=css) as demo :
|
214 |
with gr.Column(elem_id="col-container"):
|
215 |
gr.HTML("""
|
216 |
-
<h2 style="text-align: center;">Soft
|
|
|
|
|
|
|
|
|
|
|
217 |
""")
|
218 |
with gr.Row():
|
219 |
with gr.Column():
|
|
|
40 |
# Adjust interval to video length
|
41 |
video_clip = VideoFileClip(video_in)
|
42 |
if video_clip.duration <= 5:
|
43 |
+
interval = 6
|
44 |
else :
|
45 |
interval = 24
|
46 |
|
|
|
165 |
|
166 |
def infer(video_in):
|
167 |
# Extract frames from a video
|
168 |
+
gr.Info("Extracting frames...")
|
169 |
frame_files = extract_frames(video_in)
|
170 |
|
171 |
# Process each extracted frame and collect results in a list
|
172 |
+
gr.Info("Captioning frames ...")
|
173 |
processed_texts = []
|
174 |
for frame_file in frame_files:
|
175 |
text = process_image(frame_file)
|
|
|
186 |
print(extracted_audio)
|
187 |
|
188 |
# Get description of audio content
|
189 |
+
gr.Info("Getting audio description from extracted sound ...")
|
190 |
audio_content_described = get_salmonn(extracted_audio)
|
191 |
else :
|
192 |
audio_content_described = "Video has no sound."
|
|
|
198 |
print(formatted_captions)
|
199 |
|
200 |
# Send formatted captions to LLM
|
201 |
+
gr.Info("Try to provide a video understanding with provided elements ...")
|
202 |
video_description_from_llm = llm_process(formatted_captions)
|
203 |
|
204 |
return video_description_from_llm
|
|
|
217 |
with gr.Blocks(css=css) as demo :
|
218 |
with gr.Column(elem_id="col-container"):
|
219 |
gr.HTML("""
|
220 |
+
<h2 style="text-align: center;">Soft Video Understanding</h2>
|
221 |
+
<p style="text-align: center;">
|
222 |
+
An experiment to try to achieve what I call "soft video understanding" with available open-source models. <br />
|
223 |
+
We use moondream1 to caption extracted frames, salmonn to analyze extracted audio, then send visual and audio details to Zephyr, which is instructed to summarize what it understood.
|
224 |
+
Instructions prompt is available for further discussion with the Community.
|
225 |
+
</p>
|
226 |
""")
|
227 |
with gr.Row():
|
228 |
with gr.Column():
|