File size: 6,730 Bytes
76934e1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import sys
import copy
import librosa
import logging
import argparse
import numpy as np
import soundfile as sf
import moviepy.editor as mpy
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from subtitle_utils import generate_srt, generate_srt_clip
from trans_utils import pre_proc, proc, write_state, load_state

from moviepy.editor import *
from moviepy.video.tools.subtitles import SubtitlesClip


class VideoClipper():
    """Clip audio/video segments by matching destination text against ASR output.

    Workflow: run ``recog`` (audio) or ``video_recog`` (video) to build a
    recognition ``state``, then call ``clip`` / ``video_clip`` with the target
    text to cut out the matching periods.

    NOTE(review): the code assumes 16 kHz mono audio throughout
    (16 samples per millisecond, hence the ``* 16`` offset scaling) —
    confirm against the ASR pipeline's expected sample rate.
    """

    def __init__(self, asr_pipeline):
        # asr_pipeline: a modelscope ASR pipeline whose result dict exposes
        # 'text', 'text_postprocessed', 'time_stamp' and 'sentences'.
        logging.warning("Initializing VideoClipper.")
        self.asr_pipeline = asr_pipeline

    def recog(self, audio_input, state=None):
        """Run speech recognition on raw audio.

        Args:
            audio_input: tuple ``(sample_rate, samples)`` with samples as an ndarray.
            state: optional dict carried across calls; created when ``None``
                   (avoids the mutable-default-argument pitfall).

        Returns:
            ``(recognized_text, srt_string, state)`` where ``state`` gains the
            keys ``audio_input``, ``recog_res_raw``, ``timestamp``, ``sentences``.
        """
        if state is None:
            state = {}
        state['audio_input'] = audio_input
        _, data = audio_input
        # The pipeline expects float64 samples.
        data = data.astype(np.float64)
        rec_result = self.asr_pipeline(audio_in=data)
        state['recog_res_raw'] = rec_result['text_postprocessed']
        state['timestamp'] = rec_result['time_stamp']
        state['sentences'] = rec_result['sentences']
        res_text = rec_result['text']
        res_srt = generate_srt(rec_result['sentences'])
        return res_text, res_srt, state

    def clip(self, dest_text, start_ost, end_ost, state):
        """Cut the audio periods whose recognized text matches ``dest_text``.

        Args:
            dest_text: target text; multiple targets are separated by ``'#'``.
            start_ost: start offset in milliseconds applied to each period.
            end_ost: end offset in milliseconds applied to each period.
            state: dict produced by ``recog``.

        Returns:
            ``((sr, clipped_audio), message, clip_srt)``. When nothing matches,
            the raw audio is returned unchanged.
        """
        # get from state
        audio_input = state['audio_input']
        recog_res_raw = state['recog_res_raw']
        timestamp = state['timestamp']
        sentences = state['sentences']
        sr, data = audio_input
        data = data.astype(np.float64)

        all_ts = []
        for _dest_text in dest_text.split('#'):
            _dest_text = pre_proc(_dest_text)
            for _ts in proc(recog_res_raw, timestamp, _dest_text):
                all_ts.append(_ts)
        ts = all_ts
        srt_index = 0
        clip_srt = ""
        # BUGFIX: default to the raw audio so res_audio is always bound —
        # previously a no-match input raised UnboundLocalError at the return,
        # despite the message promising "return raw speech".
        res_audio = data
        if len(ts):
            start, end = ts[0]
            # Offsets are in ms; at 16 kHz that is 16 samples/ms. Clamp to the
            # valid sample range so user offsets cannot index out of bounds.
            start = min(max(0, start + start_ost * 16), len(data))
            end = min(max(0, end + end_ost * 16), len(data))
            res_audio = data[start:end]
            start_end_info = "from {} to {}".format(start / 16000, end / 16000)
            srt_clip, _, srt_index = generate_srt_clip(sentences, start / 16000.0, end / 16000.0, begin_index=srt_index)
            clip_srt += srt_clip
            for _ts in ts[1:]:  # multiple sentence input or multiple output matched
                start, end = _ts
                start = min(max(0, start + start_ost * 16), len(data))
                end = min(max(0, end + end_ost * 16), len(data))
                # BUGFIX: report seconds (not raw sample indices), consistent
                # with the first period above.
                start_end_info += ", from {} to {}".format(start / 16000, end / 16000)
                # BUGFIX: start/end already include the ms offsets — slicing
                # with the offsets added again applied them twice.
                res_audio = np.concatenate([res_audio, data[start:end]], -1)
                srt_clip, _, srt_index = generate_srt_clip(sentences, start / 16000.0, end / 16000.0, begin_index=srt_index - 1)
                clip_srt += srt_clip
            message = "{} periods found in the speech: ".format(len(ts)) + start_end_info
        else:
            message = "No period found in the speech, return raw speech. You may check the recognition result and try other destination text."
        return (sr, res_audio), message, clip_srt

    def video_recog(self, vedio_filename):
        """Extract the audio track of a video file and run recognition on it.

        Side effects: writes ``<name>.wav`` next to the input video.

        Args:
            vedio_filename: path to the input video (extension assumed 3 chars,
                            e.g. ``.mp4`` — the slicing below depends on that).

        Returns:
            same triple as ``recog``; ``state`` additionally carries
            ``vedio_filename``, ``clip_video_file`` and the open ``video`` clip.
        """
        clip_video_file = vedio_filename[:-4] + '_clip.mp4'
        video = mpy.VideoFileClip(vedio_filename)
        audio_file = vedio_filename[:-3] + 'wav'
        video.audio.write_audiofile(audio_file)
        # Use the keyword form: positional `sr` is removed in librosa >= 0.10.
        wav = librosa.load(audio_file, sr=16000)[0]
        state = {
            'vedio_filename': vedio_filename,
            'clip_video_file': clip_video_file,
            'video': video,
        }
        return self.recog((16000, wav), state)

    def video_clip(self, dest_text, start_ost, end_ost, state, font_size=32, font_color='white', add_sub=False):
        """Cut the video periods whose recognized text matches ``dest_text``.

        Side effects: writes the clipped video to ``state['clip_video_file']``.

        Args:
            dest_text: target text; multiple targets are separated by ``'#'``.
            start_ost / end_ost: start/end offsets in milliseconds.
            state: dict produced by ``video_recog``.
            font_size / font_color: subtitle rendering options.
            add_sub: when True, burn subtitles into the clip.

        Returns:
            ``(clip_video_file, message, clip_srt)``; on no match the original
            filename is returned and nothing is written.
        """
        # get from state
        recog_res_raw = state['recog_res_raw']
        timestamp = state['timestamp']
        sentences = state['sentences']
        video = state['video']
        clip_video_file = state['clip_video_file']
        vedio_filename = state['vedio_filename']

        all_ts = []
        srt_index = 0
        for _dest_text in dest_text.split('#'):
            _dest_text = pre_proc(_dest_text)
            for _ts in proc(recog_res_raw, timestamp, _dest_text):
                all_ts.append(_ts)
        ts = all_ts
        clip_srt = ""
        # Hoisted out of the loops: the subtitle factory does not depend on
        # per-period data, so one lambda serves every period.
        generator = lambda txt: TextClip(txt, font='./font/STHeitiMedium.ttc', fontsize=font_size, color=font_color)
        if len(ts):
            # Timestamps are in samples at 16 kHz; offsets are in ms.
            start, end = ts[0][0] / 16000, ts[0][1] / 16000
            start, end = start + start_ost / 1000.0, end + end_ost / 1000.0
            video_clip = video.subclip(start, end)
            start_end_info = "from {} to {}".format(start, end)
            srt_clip, subs, srt_index = generate_srt_clip(sentences, start, end, begin_index=srt_index)
            clip_srt += srt_clip
            if add_sub:
                subtitles = SubtitlesClip(subs, generator)
                video_clip = CompositeVideoClip([video_clip, subtitles.set_pos(('center', 'bottom'))])
            concate_clip = [video_clip]
            for _ts in ts[1:]:
                start, end = _ts[0] / 16000, _ts[1] / 16000
                start, end = start + start_ost / 1000.0, end + end_ost / 1000.0
                _video_clip = video.subclip(start, end)
                start_end_info += ", from {} to {}".format(start, end)
                srt_clip, subs, srt_index = generate_srt_clip(sentences, start, end, begin_index=srt_index - 1)
                clip_srt += srt_clip
                if add_sub:
                    subtitles = SubtitlesClip(subs, generator)
                    _video_clip = CompositeVideoClip([_video_clip, subtitles.set_pos(('center', 'bottom'))])
                # copy.copy keeps each period's clip independent before concat.
                concate_clip.append(copy.copy(_video_clip))
            message = "{} periods found in the audio: ".format(len(ts)) + start_end_info
            logging.warning("Concating...")
            if len(concate_clip) > 1:
                video_clip = concatenate_videoclips(concate_clip)
            video_clip.write_videofile(clip_video_file)
        else:
            # Nothing matched: hand back the untouched source video.
            clip_video_file = vedio_filename
            message = "No period found in the audio, return raw speech. You may check the recognition result and try other destination text."
        return clip_video_file, message, clip_srt