Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -10,7 +10,7 @@ from io import BytesIO
|
|
10 |
from openai import OpenAI
|
11 |
import whisper
|
12 |
from google.cloud import vision
|
13 |
-
|
14 |
# st.set_page_config(layout="wide")
|
15 |
|
16 |
load_dotenv()
|
@@ -196,13 +196,13 @@ def search_keyword(keyword, frame_texts):
|
|
196 |
|
197 |
|
198 |
# Function to generate description for video frames
|
199 |
-
def generate_description(base64_frames):
|
200 |
try:
|
201 |
prompt_messages = [
|
202 |
{
|
203 |
"role": "user",
|
204 |
"content": [
|
205 |
-
|
206 |
*map(lambda x: {"image": x, "resize": 428}, base64_frames),
|
207 |
],
|
208 |
},
|
@@ -212,10 +212,24 @@ def generate_description(base64_frames):
|
|
212 |
messages=prompt_messages,
|
213 |
max_tokens=3000,
|
214 |
)
|
215 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
216 |
except Exception as e:
|
217 |
print(f"Error in generate_description: {e}")
|
218 |
-
return None
|
219 |
|
220 |
def generate_overall_description(transcript_text, video_description):
|
221 |
try:
|
@@ -251,8 +265,21 @@ with col1:
|
|
251 |
keyword = st.text_input("Enter a keyword to filter the frames (optional):")
|
252 |
extract_frames_button = st.button("Extract Frames")
|
253 |
uploaded_video = st.file_uploader("Or upload a video file (MP4):", type=["mp4"])
|
254 |
-
|
255 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
256 |
# Slider to select the number of seconds for extraction
|
257 |
seconds = st.slider("Select the number of seconds for extraction:", min_value=1, max_value=60, value=10)
|
258 |
|
@@ -375,7 +402,7 @@ with col1:
|
|
375 |
|
376 |
# Get consolidated description for all frames
|
377 |
if ffmpeg_output:
|
378 |
-
description = generate_description(base64_frames)
|
379 |
if description:
|
380 |
st.markdown("**Frame Description:**")
|
381 |
st.write(description)
|
@@ -391,7 +418,7 @@ with col1:
|
|
391 |
|
392 |
# Get the transcript from whisper
|
393 |
transcript_text = get_transcript_from_audio(audio_tempfile.name)
|
394 |
-
description = generate_description(base64_frames)
|
395 |
# Generate overall description using transcript and video description
|
396 |
overall_description = generate_overall_description(transcript_text, description)
|
397 |
if overall_description:
|
@@ -429,6 +456,7 @@ with col1:
|
|
429 |
n_frames = len(frame_bytes_list)
|
430 |
base64_frames = [base64.b64encode(b'\xff\xd8' + frame_bytes).decode('utf-8') for frame_bytes in frame_bytes_list]
|
431 |
|
|
|
432 |
categories_results = []
|
433 |
frame_texts = {}
|
434 |
|
@@ -439,6 +467,7 @@ with col1:
|
|
439 |
col1, col2 = st.columns([3, 2])
|
440 |
with col1:
|
441 |
frame_bytes = base64.b64decode(frame_base64)
|
|
|
442 |
st.image(Image.open(BytesIO(frame_bytes)), caption=f'Frame {idx + 1}', use_column_width=True)
|
443 |
with col2:
|
444 |
st.write(f"Extracted Text: {extracted_text}")
|
@@ -482,7 +511,7 @@ with col1:
|
|
482 |
|
483 |
# Get consolidated description for all frames
|
484 |
if ffmpeg_output:
|
485 |
-
description = generate_description(base64_frames)
|
486 |
if description:
|
487 |
st.markdown("**Frame Description:**")
|
488 |
st.write(description)
|
@@ -503,7 +532,59 @@ with col1:
|
|
503 |
|
504 |
# Get the transcript from whisper
|
505 |
transcript_text = get_transcript_from_audio(audio_tempfile.name)
|
506 |
-
description = generate_description(base64_frames)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
507 |
# Generate overall description using transcript and video description
|
508 |
overall_description = generate_overall_description(transcript_text, description)
|
509 |
if overall_description:
|
|
|
10 |
from openai import OpenAI
|
11 |
import whisper
|
12 |
from google.cloud import vision
|
13 |
+
import re
|
14 |
# st.set_page_config(layout="wide")
|
15 |
|
16 |
load_dotenv()
|
|
|
196 |
|
197 |
|
198 |
# Function to generate description for video frames
|
199 |
+
def generate_description(base64_frames,prompt):
|
200 |
try:
|
201 |
prompt_messages = [
|
202 |
{
|
203 |
"role": "user",
|
204 |
"content": [
|
205 |
+
prompt ,
|
206 |
*map(lambda x: {"image": x, "resize": 428}, base64_frames),
|
207 |
],
|
208 |
},
|
|
|
212 |
messages=prompt_messages,
|
213 |
max_tokens=3000,
|
214 |
)
|
215 |
+
description = response.choices[0].message.content
|
216 |
+
|
217 |
+
# Use regular expression to find frame numbers
|
218 |
+
frame_numbers = re.findall(r'Frames\s*:\s*(\d+(?:,\s*\d+)*)', response.choices[0].message.content)
|
219 |
+
|
220 |
+
# Convert the string of numbers into a list of integers
|
221 |
+
if frame_numbers:
|
222 |
+
frame_numbers = [int(num) for num in frame_numbers[0].split(',')]
|
223 |
+
else:
|
224 |
+
frame_numbers = []
|
225 |
+
|
226 |
+
print("Frame numbers to extract:", frame_numbers)
|
227 |
+
|
228 |
+
return description, frame_numbers
|
229 |
+
|
230 |
except Exception as e:
|
231 |
print(f"Error in generate_description: {e}")
|
232 |
+
return None, []
|
233 |
|
234 |
def generate_overall_description(transcript_text, video_description):
|
235 |
try:
|
|
|
265 |
keyword = st.text_input("Enter a keyword to filter the frames (optional):")
|
266 |
extract_frames_button = st.button("Extract Frames")
|
267 |
uploaded_video = st.file_uploader("Or upload a video file (MP4):", type=["mp4"])
|
268 |
+
prompt1 = "keyword is " + st.text_input("Enter a keyword for analysis:")
|
269 |
+
prompt2 = "1. Generate a description for this sequence of video frames in about 90 words. 2.Return the following:\
|
270 |
+
i. List of objects in the video \
|
271 |
+
ii. Any restrictive content or sensitive content and if so which frame. \
|
272 |
+
iii. The frames is supposed to contain news content and we want to detect non-news content such as an advertisement. \
|
273 |
+
So analyze specifically for any indications that the content might be promotional or an advertisement. \
|
274 |
+
Find the most portions of a video related to the keyword. \
|
275 |
+
The output will be targeted towards social media (like TikTok or Reels) or to news broadcasts. \
|
276 |
+
For the provided frames return the frames related to the keyword\
|
277 |
+
I am trying to fill these frames for a TikTok video. \
|
278 |
+
Hence while selecting the frames keep that in mind. \
|
279 |
+
You do not have to give me the script of the Tiktok video. \
|
280 |
+
Just return the most interesting frames in a sequence that will come for a tiktok video. \
|
281 |
+
List all frame numbers separated by commas at the end like this for eg, Frames : 1,2,4,7,9"
|
282 |
+
# Join the instruction text and the keyword sentence with a newline; plain
# concatenation would fuse the trailing example "...7,9" with "keyword is ...".
prompt = prompt2 + "\n" + prompt1
|
283 |
# Slider to select the number of seconds for extraction
|
284 |
seconds = st.slider("Select the number of seconds for extraction:", min_value=1, max_value=60, value=10)
|
285 |
|
|
|
402 |
|
403 |
# Get consolidated description for all frames
|
404 |
if ffmpeg_output:
|
405 |
+
# generate_description returns (description, frame_numbers). Keep only the
# text here so the truthiness check and st.write below see a string (or None
# on error) instead of an always-truthy tuple.
description, _ = generate_description(base64_frames,prompt)
|
406 |
if description:
|
407 |
st.markdown("**Frame Description:**")
|
408 |
st.write(description)
|
|
|
418 |
|
419 |
# Get the transcript from whisper
|
420 |
transcript_text = get_transcript_from_audio(audio_tempfile.name)
|
421 |
+
# generate_description returns (description, frame_numbers). Unpack it so
# generate_overall_description receives the description text, not a tuple.
description, _ = generate_description(base64_frames,prompt)
|
422 |
# Generate overall description using transcript and video description
|
423 |
overall_description = generate_overall_description(transcript_text, description)
|
424 |
if overall_description:
|
|
|
456 |
n_frames = len(frame_bytes_list)
|
457 |
base64_frames = [base64.b64encode(b'\xff\xd8' + frame_bytes).decode('utf-8') for frame_bytes in frame_bytes_list]
|
458 |
|
459 |
+
frame_dict = {}
|
460 |
categories_results = []
|
461 |
frame_texts = {}
|
462 |
|
|
|
467 |
col1, col2 = st.columns([3, 2])
|
468 |
with col1:
|
469 |
frame_bytes = base64.b64decode(frame_base64)
|
470 |
+
frame_dict[idx + 1] = frame_bytes
|
471 |
st.image(Image.open(BytesIO(frame_bytes)), caption=f'Frame {idx + 1}', use_column_width=True)
|
472 |
with col2:
|
473 |
st.write(f"Extracted Text: {extracted_text}")
|
|
|
511 |
|
512 |
# Get consolidated description for all frames
|
513 |
if ffmpeg_output:
|
514 |
+
description,frame_numbers = generate_description(base64_frames,prompt)
|
515 |
if description:
|
516 |
st.markdown("**Frame Description:**")
|
517 |
st.write(description)
|
|
|
532 |
|
533 |
# Get the transcript from whisper
|
534 |
transcript_text = get_transcript_from_audio(audio_tempfile.name)
|
535 |
+
# generate_description returns (description, frame_numbers). Unpack both so
# the description passed to generate_overall_description is the text itself
# and frame_numbers is always bound before the frame-selection check below.
description, frame_numbers = generate_description(base64_frames,prompt)
|
536 |
+
|
537 |
+
if frame_numbers:
|
538 |
+
print("Frame numbers to extract:", frame_numbers) # Check frame numbers
|
539 |
+
|
540 |
+
# Create a mapping from original frame numbers to sequential numbers
|
541 |
+
frame_mapping = {}
|
542 |
+
new_frame_numbers = []
|
543 |
+
for idx, frame_number in enumerate(sorted(frame_numbers)):
|
544 |
+
frame_mapping[frame_number] = idx + 1
|
545 |
+
new_frame_numbers.append(idx + 1)
|
546 |
+
|
547 |
+
print("New frame numbers:", new_frame_numbers)
|
548 |
+
print("Frame mapping:", frame_mapping)
|
549 |
+
|
550 |
+
# Create a temporary directory to store images
|
551 |
+
with tempfile.TemporaryDirectory() as temp_dir:
|
552 |
+
image_paths = []
|
553 |
+
for frame_number in frame_numbers:
|
554 |
+
if frame_number in frame_dict:
|
555 |
+
frame_path = os.path.join(temp_dir, f'frame_{frame_mapping[frame_number]:03}.jpg') # Updated file naming
|
556 |
+
image_paths.append(frame_path)
|
557 |
+
with open(frame_path, 'wb') as f:
|
558 |
+
f.write(frame_dict[frame_number])
|
559 |
+
|
560 |
+
#image = Image.open(BytesIO(frame_bytes))
|
561 |
+
#st.image(image, caption='Selected Frame', use_column_width=True)
|
562 |
+
#with open(frame_path, "rb") as file:
|
563 |
+
# btn = st.download_button(
|
564 |
+
# label="Download Frame",
|
565 |
+
# data=file,
|
566 |
+
# file_name=f'frame_{frame_number}.jpg',
|
567 |
+
# mime="image/jpeg"
|
568 |
+
# )
|
569 |
+
# Once all selected frames are saved as images, create a video from them using FFmpeg
|
570 |
+
video_output_path = os.path.join(temp_dir, 'output5.mp4')
|
571 |
+
framerate = 1 # Adjust framerate based on the number of frames
|
572 |
+
ffmpeg_command = [
|
573 |
+
'ffmpeg',
|
574 |
+
'-framerate', str(framerate), # Set framerate based on the number of frames
|
575 |
+
'-i', os.path.join(temp_dir, 'frame_%03d.jpg'), # Input pattern for all frame files
|
576 |
+
'-c:v', 'libx264',
|
577 |
+
'-pix_fmt', 'yuv420p',
|
578 |
+
video_output_path
|
579 |
+
]
|
580 |
+
|
581 |
+
print("FFmpeg command:", ' '.join(ffmpeg_command)) # Debug FFmpeg command
|
582 |
+
|
583 |
+
subprocess.run(ffmpeg_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
584 |
+
|
585 |
+
# Display or provide a download link for the created video
|
586 |
+
st.header("Final Video")
|
587 |
+
st.video(video_output_path)
|
588 |
# Generate overall description using transcript and video description
|
589 |
overall_description = generate_overall_description(transcript_text, description)
|
590 |
if overall_description:
|