from typing import List
from utils import getSubs, str2time, maintain_aspect_ratio_resize
from moviepy import VideoFileClip
import whisper
import os
import cv2
import webvtt
from PIL import Image
from tqdm import tqdm
import json
from langchain_yt_dlp.youtube_loader import YoutubeLoaderDL
from transformers import BlipProcessor, BlipForConditionalGeneration


# get video metadata
def get_video_metdata(video_url: str):
    docs = YoutubeLoaderDL.from_youtube_url(video_url, add_video_info=True).load()
    return docs[0].metadata


# extract audio
def extract_audio(path_to_video: str, output_folder: str):
    # declare where to save the .mp3 audio
    path_to_extracted_audio_file = os.path.join(output_folder, 'audio.mp3')
    # extract the mp3 audio track from the mp4 video file
    clip = VideoFileClip(path_to_video)
    clip.audio.write_audiofile(path_to_extracted_audio_file)
    return path_to_extracted_audio_file


# get video transcript
def transcribe_video(path_to_extracted_audio_file, output_folder, whisper_model=None):
    # load model
    if whisper_model is None:
        whisper_model = whisper.load_model("tiny")
    options = dict(task="translate", best_of=1, language='en', verbose=True)
    results = whisper_model.transcribe(path_to_extracted_audio_file, **options)
    vtt = getSubs(results["segments"], "vtt")
    # path to save the generated transcript, named after the extracted audio file
    audio_name = os.path.basename(path_to_extracted_audio_file).replace('.mp3', '')
    path_to_generated_transcript = os.path.join(output_folder, f'{audio_name}.vtt')
    # write transcription to file
    with open(path_to_generated_transcript, 'w') as f:
        f.write(vtt)
    return path_to_generated_transcript


# get video frames & metadata
def extract_and_save_frames_and_metadata(
        path_to_video,
        path_to_transcript,
        path_to_save_extracted_frames,
        path_to_save_metadatas):
    # metadatas will store the metadata of all extracted frames
    metadatas = []

    # load video using cv2
    video = cv2.VideoCapture(path_to_video)
    # load transcript using webvtt
    trans = webvtt.read(path_to_transcript)

    # iterate over the transcript file,
    # one iteration per video segment specified in the transcript
    for idx, transcript in enumerate(trans):
        # get the start time and end time in milliseconds
        start_time_ms = str2time(transcript.start)
        end_time_ms = str2time(transcript.end)
        # get the time in ms exactly in the middle of start time and end time
        mid_time_ms = (end_time_ms + start_time_ms) / 2
        # get the transcript text, removing newline characters
        text = transcript.text.replace("\n", ' ')

        # grab the frame at the middle time
        video.set(cv2.CAP_PROP_POS_MSEC, mid_time_ms)
        success, frame = video.read()
        if success:
            # if the frame is extracted successfully, resize it
            image = maintain_aspect_ratio_resize(frame, height=350)
            # save frame as a JPEG file
            img_fname = f'frame_{idx}.jpg'
            img_fpath = os.path.join(path_to_save_extracted_frames, img_fname)
            cv2.imwrite(img_fpath, image)

            # prepare the metadata
            metadata = {
                'extracted_frame_path': img_fpath,
                'transcript': text,
                'video_segment_id': idx,
                'video_path': path_to_video,
                'start_time': transcript.start,
                'end_time': transcript.end,
            }
            metadatas.append(metadata)
        else:
            print(f"ERROR! Cannot extract frame: idx = {idx}")

    # release the video capture handle
    video.release()

    # merge neighbouring segments so each frame's transcript is not disjointed
    metadatas = update_transcript(metadatas)

    # save metadata of all extracted frames
    fn = os.path.join(path_to_save_metadatas, 'metadatas.json')
    with open(fn, 'w') as outfile:
        json.dump(metadatas, outfile)
    return metadatas


def update_transcript(vid_metadata, n=7):
    vid_trans = [frame['transcript'] for frame in vid_metadata]
    half = n // 2
    # join each segment's transcript with its neighbours inside a sliding window
    updated_vid_trans = [
        ' '.join(vid_trans[i - half: i + half]) if i - half >= 0
        else ' '.join(vid_trans[0: i + half])
        for i in range(len(vid_trans))
    ]
    # write the merged transcripts back into the metadata
    for i in range(len(updated_vid_trans)):
        vid_metadata[i]['transcript'] = updated_vid_trans[i]
    return vid_metadata


# get video captions
def get_video_caption(path_to_video_frames: List[str], metadatas, output_folder_path: str,
                      vlm=None, vlm_processor=None):
    # load the default BLIP captioning model if none is provided
    if vlm is None or vlm_processor is None:
        vlm_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
        vlm = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

    # caption every extracted frame
    frame_caption = {}
    for frame_path in tqdm(path_to_video_frames, desc="Captioning frames"):
        frame = Image.open(frame_path)
        inputs = vlm_processor(frame, return_tensors="pt")
        out = vlm.generate(**inputs)
        caption = vlm_processor.decode(out[0], skip_special_tokens=True)
        frame_caption[frame_path] = caption

    caption_out_path = os.path.join(output_folder_path, 'captions.json')
    with open(caption_out_path, 'w') as outfile:
        json.dump(frame_caption, outfile)

    # save the captions back into the frame metadata
    for frame_metadata in metadatas:
        frame_metadata['caption'] = frame_caption[frame_metadata['extracted_frame_path']]

    metadatas_out_path = os.path.join(output_folder_path, 'metadatas.json')
    with open(metadatas_out_path, 'w') as outfile:
        json.dump(metadatas, outfile)
    return metadatas_out_path
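

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): the URL and paths below are placeholder
# assumptions, not part of the module. It shows how the functions above would
# typically be chained into one preprocessing pipeline: fetch metadata ->
# extract audio -> transcribe -> extract frames -> caption frames.
if __name__ == "__main__":
    video_url = "https://www.youtube.com/watch?v=<VIDEO_ID>"  # hypothetical URL
    path_to_video = "data/video.mp4"                          # assumed local copy of the video
    output_folder = "data/output"                             # assumed output directory
    frames_folder = os.path.join(output_folder, "frames")
    os.makedirs(frames_folder, exist_ok=True)

    video_metadata = get_video_metdata(video_url)
    audio_path = extract_audio(path_to_video, output_folder)
    transcript_path = transcribe_video(audio_path, output_folder)
    frame_metadatas = extract_and_save_frames_and_metadata(
        path_to_video, transcript_path, frames_folder, output_folder)
    frame_paths = [m['extracted_frame_path'] for m in frame_metadatas]
    get_video_caption(frame_paths, frame_metadatas, output_folder)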