from typing import List
from utils import getSubs, str2time, maintain_aspect_ratio_resize
from moviepy import VideoFileClip
import whisper
import os
import cv2
import webvtt
from PIL import Image
from tqdm import tqdm
import json
from langchain_yt_dlp.youtube_loader import YoutubeLoaderDL
from transformers import BlipProcessor, BlipForConditionalGeneration


# get video metadata
def get_video_metadata(video_url: str):
    docs = YoutubeLoaderDL.from_youtube_url(video_url, add_video_info=True).load()
    return docs[0].metadata
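
# Usage sketch: '<video_id>' is a placeholder, and the 'title' field is
# assumed to be present in the yt-dlp metadata.
# metadata = get_video_metadata('https://www.youtube.com/watch?v=<video_id>')
# print(metadata.get('title'))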

# extract audio
def extract_audio(path_to_video: str, output_folder: str):
    video_name = os.path.basename(path_to_video).replace('.mp4', '')

    # declare where to save the extracted .mp3 audio
    path_to_extracted_audio_file = os.path.join(output_folder, f'{video_name}.mp3')

    # extract the mp3 audio track from the mp4 video file
    clip = VideoFileClip(path_to_video)
    clip.audio.write_audiofile(path_to_extracted_audio_file)
    return path_to_extracted_audio_file
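
# Usage sketch (hypothetical paths; the output folder must already exist):
# audio_path = extract_audio('./videos/lecture.mp4', './output')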


# Get video transcript
def transcribe_video(path_to_extracted_audio_file: str, output_folder: str, whisper_model=None):
    # load a default model if none was provided
    if whisper_model is None:
        whisper_model = whisper.load_model("tiny")
    options = dict(task="translate", best_of=1, language='en', verbose=True)
    results = whisper_model.transcribe(path_to_extracted_audio_file, **options)

    # convert the segments to WebVTT subtitles
    vtt = getSubs(results["segments"], "vtt")

    # path to save the generated transcript
    video_name = os.path.basename(path_to_extracted_audio_file).replace('.mp3', '')
    path_to_generated_transcript = os.path.join(output_folder, f'{video_name}.vtt')

    # write transcription to file
    with open(path_to_generated_transcript, 'w') as f:
        f.write(vtt)
    return path_to_generated_transcript
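
# Usage sketch: passing a preloaded model avoids reloading it on every call.
# model = whisper.load_model('tiny')
# vtt_path = transcribe_video(audio_path, './output', whisper_model=model)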


# get video frames & metadata
def extract_and_save_frames_and_metadata(
        path_to_video,
        path_to_transcript,
        path_to_save_extracted_frames,
        path_to_save_metadatas):

    # metadatas will store the metadata of all extracted frames
    metadatas = []

    # load video using cv2
    video = cv2.VideoCapture(path_to_video)
    # load transcript using webvtt
    trans = webvtt.read(path_to_transcript)

    # iterate transcript file
    # for each video segment specified in the transcript file
    for idx, transcript in enumerate(trans):
        # get the start and end time in milliseconds
        start_time_ms = str2time(transcript.start)
        end_time_ms = str2time(transcript.end)
        # get the time in ms exactly in the middle of start and end
        mid_time_ms = (end_time_ms + start_time_ms) / 2
        # get the transcript text, replacing newlines with spaces
        text = transcript.text.replace("\n", ' ')
        # get frame at the middle time
        video.set(cv2.CAP_PROP_POS_MSEC, mid_time_ms)
        success, frame = video.read()
        if success:
            # if the frame is extracted successfully, resize it
            image = maintain_aspect_ratio_resize(frame, height=350)
            # save frame as JPEG file
            img_fname = f'frame_{idx}.jpg'
            img_fpath = os.path.join(
                path_to_save_extracted_frames, img_fname
            )
            cv2.imwrite(img_fpath, image)

            # prepare the metadata
            metadata = {
                'extracted_frame_path': img_fpath,
                'transcript': text,
                'video_segment_id': idx,
                'video_path': path_to_video,
                'start_time': transcript.start,
                'end_time': transcript.end
            }
            metadatas.append(metadata)
        else:
            print(f"ERROR! Cannot extract frame: idx = {idx}")

    # merge neighbouring segments into each frame's transcript so the
    # per-frame text is less disjointed
    metadatas = update_transcript(metadatas)

    # save metadata of all extracted frames
    fn = os.path.join(path_to_save_metadatas, 'metadatas.json')
    with open(fn, 'w') as outfile:
        json.dump(metadatas, outfile)
    return metadatas
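
# Usage sketch (hypothetical paths; both directories must already exist):
# metadatas = extract_and_save_frames_and_metadata(
#     './videos/lecture.mp4', vtt_path, './output/frames', './output')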


def update_transcript(vid_metadata, n=7):
    # join each frame's transcript with its neighbouring segments
    # (a sliding window of roughly n segments, clipped at the start)
    half = n // 2
    vid_trans = [frame['transcript'] for frame in vid_metadata]
    updated_vid_trans = [
        ' '.join(vid_trans[max(i - half, 0) : i + half])
        for i in range(len(vid_trans))
    ]

    # write the updated transcripts back into the metadata
    for i, trans in enumerate(updated_vid_trans):
        vid_metadata[i]['transcript'] = trans
    return vid_metadata
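
# Worked example: with n=7 (half window of 3), e.g.
# update_transcript([{'transcript': str(i)} for i in range(10)], n=7)
# frame 5's transcript becomes segments 2..7 joined: '2 3 4 5 6 7';
# frames near the start get a shorter, left-clipped window.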


# get video caption
def get_video_caption(path_to_video_frames: List[str], metadatas, output_folder_path: str, vlm=None, vlm_processor=None):
    # load a default BLIP captioning model if none was provided
    if vlm is None or vlm_processor is None:
        vlm_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
        vlm = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

    # caption every extracted frame
    frame_caption = {}
    for frame_path in tqdm(path_to_video_frames, desc="Captioning frames"):
        frame = Image.open(frame_path)
        inputs = vlm_processor(frame, return_tensors="pt")

        out = vlm.generate(**inputs)
        caption = vlm_processor.decode(out[0], skip_special_tokens=True)
        frame_caption[frame_path] = caption

    caption_out_path = os.path.join(output_folder_path, 'captions.json')
    with open(caption_out_path, 'w') as outfile:
        json.dump(frame_caption, outfile)
    
    # attach each caption to its frame's metadata
    for frame_metadata in metadatas:
        frame_metadata['caption'] = frame_caption[frame_metadata['extracted_frame_path']]

    metadatas_out_path = os.path.join(output_folder_path, 'metadatas.json')
    with open(metadatas_out_path, 'w') as outfile:
        json.dump(metadatas, outfile)
    return metadatas_out_path
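

# End-to-end sketch of how these helpers compose. The input path below is a
# hypothetical placeholder; point it at a real .mp4 before running.
if __name__ == '__main__':
    path_to_video = './videos/lecture.mp4'  # placeholder input video
    output_folder = './output'
    frames_folder = os.path.join(output_folder, 'frames')
    os.makedirs(frames_folder, exist_ok=True)

    audio_path = extract_audio(path_to_video, output_folder)
    vtt_path = transcribe_video(audio_path, output_folder)
    metadatas = extract_and_save_frames_and_metadata(
        path_to_video, vtt_path, frames_folder, output_folder)
    frame_paths = [m['extracted_frame_path'] for m in metadatas]
    get_video_caption(frame_paths, metadatas, output_folder)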