# video-qa / video_utils.py
from typing import Iterator, TextIO, List, Dict, Any, Optional, Sequence, Union
from utils import getSubs, str2time, maintain_aspect_ratio_resize
from moviepy import VideoFileClip
import whisper
import os
import cv2
import webvtt
from PIL import Image
from tqdm import tqdm
import json
from langchain_yt_dlp.youtube_loader import YoutubeLoaderDL
from transformers import BlipProcessor, BlipForConditionalGeneration

# get video metadata
def get_video_metdata(video_url: str):
    docs = YoutubeLoaderDL.from_youtube_url(video_url, add_video_info=True).load()
    return docs[0].metadata
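
# Usage sketch (hypothetical URL; assumes the loader exposes fields such as
# 'title' and 'author' in the document metadata):
#   metadata = get_video_metdata("https://www.youtube.com/watch?v=<video_id>")
#   print(metadata.get("title"), metadata.get("author"))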

# extract audio
def extract_audio(path_to_video: str, output_folder: str):
    video_name = os.path.basename(path_to_video).replace('.mp4', '')
    # declare where to save the .mp3 audio
    path_to_extracted_audio_file = os.path.join(output_folder, 'audio.mp3')
    # extract the mp3 audio track from the mp4 video file
    clip = VideoFileClip(path_to_video)
    clip.audio.write_audiofile(path_to_extracted_audio_file)
    return path_to_extracted_audio_file
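
# Usage sketch (hypothetical paths; the output folder must already exist):
#   path_to_audio = extract_audio("videos/lecture.mp4", "outputs/")
#   # -> "outputs/audio.mp3"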

# Get video transcript
def transcribe_video(path_to_extracted_audio_file, output_folder, whisper_model=None):
    # load a default Whisper model if none was passed in
    if whisper_model is None:
        whisper_model = whisper.load_model("tiny")
    options = dict(task="translate", best_of=1, language='en', verbose=True)
    results = whisper_model.transcribe(path_to_extracted_audio_file, **options)
    vtt = getSubs(results["segments"], "vtt")
    # path to save the generated transcript
    # (name it after the audio file, since the video path is not available here)
    video_name = os.path.basename(path_to_extracted_audio_file).replace('.mp3', '')
    path_to_generated_transcript = os.path.join(output_folder, f'{video_name}.vtt')
    # write the transcription to file
    with open(path_to_generated_transcript, 'w') as f:
        f.write(vtt)
    return path_to_generated_transcript
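
# Usage sketch (hypothetical paths). A larger Whisper checkpoint such as
# "base" or "small" can be passed in to trade speed for accuracy:
#   model = whisper.load_model("base")
#   path_to_vtt = transcribe_video("outputs/audio.mp3", "outputs/", whisper_model=model)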

# get video frames & metadata
def extract_and_save_frames_and_metadata(
        path_to_video,
        path_to_transcript,
        path_to_save_extracted_frames,
        path_to_save_metadatas):
    # metadatas will store the metadata of all extracted frames
    metadatas = []
    # load video using cv2
    video = cv2.VideoCapture(path_to_video)
    # load transcript using webvtt
    trans = webvtt.read(path_to_transcript)
    # iterate over the transcript file,
    # one iteration per video segment specified in the transcript
    for idx, transcript in enumerate(trans):
        # get the start time and end time in milliseconds
        start_time_ms = str2time(transcript.start)
        end_time_ms = str2time(transcript.end)
        # get the time in ms exactly in the middle of start time and end time
        mid_time_ms = (end_time_ms + start_time_ms) / 2
        # get the transcript text, removing newline characters
        text = transcript.text.replace("\n", ' ')
        # grab the frame at the middle time
        video.set(cv2.CAP_PROP_POS_MSEC, mid_time_ms)
        success, frame = video.read()
        if success:
            # if the frame is extracted successfully, resize it
            image = maintain_aspect_ratio_resize(frame, height=350)
            # save the frame as a JPEG file
            img_fname = f'frame_{idx}.jpg'
            img_fpath = os.path.join(
                path_to_save_extracted_frames, img_fname
            )
            cv2.imwrite(img_fpath, image)
            # prepare the metadata
            metadata = {
                'extracted_frame_path': img_fpath,
                'transcript': text,
                'video_segment_id': idx,
                'video_path': path_to_video,
                'start_time': transcript.start,
                'end_time': transcript.end
            }
            metadatas.append(metadata)
        else:
            print(f"ERROR! Cannot extract frame: idx = {idx}")
    # merge each segment's transcript with its neighbours to reduce
    # the problem of short, disjointed transcripts per frame
    metadatas = update_transcript(metadatas)
    # save the metadata of all extracted frames
    fn = os.path.join(path_to_save_metadatas, 'metadatas.json')
    with open(fn, 'w') as outfile:
        json.dump(metadatas, outfile)
    return metadatas
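
# Usage sketch (hypothetical paths; both output folders must already exist):
#   metadatas = extract_and_save_frames_and_metadata(
#       "videos/lecture.mp4",
#       "outputs/audio.vtt",
#       "outputs/frames/",
#       "outputs/",
#   )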

def update_transcript(vid_metadata, n=7):
    # merge each frame's transcript with its neighbouring segments
    # (a sliding window of roughly n segments) to give it more context
    vid_trans = [frame['transcript'] for frame in vid_metadata]
    updated_vid_trans = [
        ' '.join(vid_trans[i - int(n / 2): i + int(n / 2)]) if i - int(n / 2) >= 0 else
        ' '.join(vid_trans[0: i + int(n / 2)]) for i in range(len(vid_trans))
    ]
    # write the updated transcripts back into the metadata
    for i in range(len(updated_vid_trans)):
        vid_metadata[i]['transcript'] = updated_vid_trans[i]
    return vid_metadata
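
# For example, with the default n=7 the transcript of frame i is replaced by
# segments i-3 through i+2 joined together (clamped to the start of the list),
# so each frame's transcript carries the context of its neighbouring segments.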

# get video caption
def get_video_caption(path_to_video_frames: List, metadatas, output_folder_path: str, vlm=None, vlm_processor=None):
    # load a default BLIP captioning model if none was passed in
    if vlm is None or vlm_processor is None:
        vlm_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
        vlm = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
    frame_caption = {}
    for i, frame_path in enumerate(tqdm(path_to_video_frames, desc="Captioning frames")):
        frame = Image.open(frame_path)
        inputs = vlm_processor(frame, return_tensors="pt")
        out = vlm.generate(**inputs)
        caption = vlm_processor.decode(out[0], skip_special_tokens=True)
        frame_caption[frame_path] = caption
    caption_out_path = os.path.join(output_folder_path, 'captions.json')
    with open(caption_out_path, 'w') as outfile:
        json.dump(frame_caption, outfile)
    # save the video captions back into the metadata
    for frame_metadata in metadatas:
        frame_metadata['caption'] = frame_caption[frame_metadata['extracted_frame_path']]
    metadatas_out_path = os.path.join(output_folder_path, 'metadatas.json')
    with open(metadatas_out_path, 'w') as outfile:
        json.dump(metadatas, outfile)
    return metadatas_out_path
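
# End-to-end sketch of how these helpers chain together, assuming hypothetical
# local paths; this is only an illustrative driver, not part of the app itself.
if __name__ == "__main__":
    video_path = "videos/lecture.mp4"          # hypothetical input video
    output_folder = "outputs/"                 # hypothetical output location
    frames_folder = os.path.join(output_folder, "frames")
    os.makedirs(frames_folder, exist_ok=True)

    audio_path = extract_audio(video_path, output_folder)
    transcript_path = transcribe_video(audio_path, output_folder)
    metadatas = extract_and_save_frames_and_metadata(
        video_path, transcript_path, frames_folder, output_folder
    )
    frame_paths = [m["extracted_frame_path"] for m in metadatas]
    metadatas_path = get_video_caption(frame_paths, metadatas, output_folder)
    print(f"Pipeline finished, metadata written to {metadatas_path}")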