from typing import List
from utils import getSubs, str2time, maintain_aspect_ratio_resize
from moviepy import VideoFileClip
import whisper
import os
import cv2
import webvtt
from PIL import Image
from tqdm import tqdm
import json
from langchain_yt_dlp.youtube_loader import YoutubeLoaderDL
from transformers import BlipProcessor, BlipForConditionalGeneration

# get video metadata
def get_video_metadata(video_url: str):
    docs = YoutubeLoaderDL.from_youtube_url(video_url, add_video_info=True).load()
    return docs[0].metadata
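
# Usage sketch for get_video_metadata (the URL is a placeholder, and the exact
# metadata keys returned depend on the langchain_yt_dlp version; both are assumptions):
#   meta = get_video_metadata('https://www.youtube.com/watch?v=<VIDEO_ID>')
#   print(meta.get('title'), meta.get('author'))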

# extract audio
def extract_audio(path_to_video: str, output_folder: str):
    video_name = os.path.basename(path_to_video).replace('.mp4', '')
    # save the .mp3 audio under the video's base name so downstream steps
    # (e.g. transcribe_video) can recover the name from the audio path
    path_to_extracted_audio_file = os.path.join(output_folder, f'{video_name}.mp3')
    # extract the mp3 audio track from the mp4 video file
    clip = VideoFileClip(path_to_video)
    clip.audio.write_audiofile(path_to_extracted_audio_file)
    return path_to_extracted_audio_file

# get video transcript
def transcribe_video(path_to_extracted_audio_file, output_folder, whisper_model=None):
    # load a default model if none was supplied
    if whisper_model is None:
        whisper_model = whisper.load_model("tiny")
    options = dict(task="translate", best_of=1, language='en', verbose=True)
    results = whisper_model.transcribe(path_to_extracted_audio_file, **options)
    vtt = getSubs(results["segments"], "vtt")
    # path to save the generated transcript; reuse the audio file's base name
    # (which extract_audio sets to the video's base name)
    video_name = os.path.splitext(os.path.basename(path_to_extracted_audio_file))[0]
    path_to_generated_transcript = os.path.join(output_folder, f'{video_name}.vtt')
    # write transcription to file
    with open(path_to_generated_transcript, 'w') as f:
        f.write(vtt)
    return path_to_generated_transcript
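
# Usage sketch chaining the two steps above (the paths are illustrative
# assumptions, not part of the original module):
#   audio_path = extract_audio('./video.mp4', './output')
#   vtt_path = transcribe_video(audio_path, './output')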

# get video frames & metadata
def extract_and_save_frames_and_metadata(
        path_to_video,
        path_to_transcript,
        path_to_save_extracted_frames,
        path_to_save_metadatas):
    # metadatas will store the metadata of all extracted frames
    metadatas = []
    # load video using cv2
    video = cv2.VideoCapture(path_to_video)
    # load transcript using webvtt
    trans = webvtt.read(path_to_transcript)
    # iterate over the transcript file:
    # one pass per video segment specified in the transcript
    for idx, transcript in enumerate(trans):
        # get the start and end times in milliseconds
        start_time_ms = str2time(transcript.start)
        end_time_ms = str2time(transcript.end)
        # get the time in ms exactly in the middle of start and end
        mid_time_ms = (end_time_ms + start_time_ms) / 2
        # get the transcript text, removing newlines
        text = transcript.text.replace("\n", ' ')
        # seek to the middle of the segment and grab that frame
        video.set(cv2.CAP_PROP_POS_MSEC, mid_time_ms)
        success, frame = video.read()
        if success:
            # if the frame is extracted successfully, resize it
            image = maintain_aspect_ratio_resize(frame, height=350)
            # save frame as JPEG file
            img_fname = f'frame_{idx}.jpg'
            img_fpath = os.path.join(
                path_to_save_extracted_frames, img_fname
            )
            cv2.imwrite(img_fpath, image)
            # prepare the metadata
            metadata = {
                'extracted_frame_path': img_fpath,
                'transcript': text,
                'video_segment_id': idx,
                'video_path': path_to_video,
                'start_time': transcript.start,
                'end_time': transcript.end,
            }
            metadatas.append(metadata)
        else:
            print(f"ERROR! Cannot extract frame: idx = {idx}")
    # release the capture handle once all segments are processed
    video.release()
    # merge each transcript with its neighbors to reduce the problem
    # of disjointed per-segment transcripts
    metadatas = update_transcript(metadatas)
    # save metadata of all extracted frames
    fn = os.path.join(path_to_save_metadatas, 'metadatas.json')
    with open(fn, 'w') as outfile:
        json.dump(metadatas, outfile)
    return metadatas
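
# Usage sketch (the folders are illustrative assumptions and must exist,
# e.g. via os.makedirs, before the call):
#   metadatas = extract_and_save_frames_and_metadata(
#       './output/video.mp4', './output/video.vtt',
#       './output/frames', './output')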

def update_transcript(vid_metadata, n=7):
    # merge each segment's transcript with up to n//2 neighbors on each side
    # so that every frame carries enough surrounding context
    vid_trans = [frame['transcript'] for frame in vid_metadata]
    updated_vid_trans = [
        ' '.join(vid_trans[max(0, i - n // 2): i + n // 2])
        for i in range(len(vid_trans))
    ]
    # write the merged transcripts back into the metadata
    for i in range(len(updated_vid_trans)):
        vid_metadata[i]['transcript'] = updated_vid_trans[i]
    return vid_metadata
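
# Worked example of the window above with n=7 (n // 2 == 3): for segment
# i=5 the merged transcript covers vid_trans[2:8], i.e. segments 2..7;
# for i=1, max(0, ...) clamps the window to vid_trans[0:4], segments 0..3.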

# get video captions
def get_video_caption(path_to_video_frames: List[str], metadatas, output_folder_path: str,
                      vlm=None, vlm_processor=None):
    # fall back to a default BLIP captioning model if none was supplied
    if vlm is None or vlm_processor is None:
        vlm_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
        vlm = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
    frame_caption = {}
    for frame_path in tqdm(path_to_video_frames, desc="Captioning frames"):
        frame = Image.open(frame_path).convert('RGB')
        inputs = vlm_processor(frame, return_tensors="pt")
        out = vlm.generate(**inputs)
        caption = vlm_processor.decode(out[0], skip_special_tokens=True)
        frame_caption[frame_path] = caption
    # save all captions keyed by frame path
    caption_out_path = os.path.join(output_folder_path, 'captions.json')
    with open(caption_out_path, 'w') as outfile:
        json.dump(frame_caption, outfile)
    # also attach each caption to the matching frame's metadata
    for frame_metadata in metadatas:
        frame_metadata['caption'] = frame_caption[frame_metadata['extracted_frame_path']]
    metadatas_out_path = os.path.join(output_folder_path, 'metadatas.json')
    with open(metadatas_out_path, 'w') as outfile:
        json.dump(metadatas, outfile)
    return metadatas_out_path
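
# End-to-end sketch tying the steps together. The paths below are
# illustrative assumptions (the module itself defines no CLI or entry point),
# and the video is assumed to be downloaded already.
if __name__ == "__main__":
    output_folder = './output'
    path_to_video = './output/video.mp4'
    frames_folder = os.path.join(output_folder, 'frames')
    os.makedirs(frames_folder, exist_ok=True)

    audio_path = extract_audio(path_to_video, output_folder)
    transcript_path = transcribe_video(audio_path, output_folder)
    metadatas = extract_and_save_frames_and_metadata(
        path_to_video, transcript_path, frames_folder, output_folder
    )
    frame_paths = [m['extracted_frame_path'] for m in metadatas]
    get_video_caption(frame_paths, metadatas, output_folder)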