Raushan-123 committed on
Commit 0156cb1
1 Parent(s): 102f976

Upload 3 files

Files changed (3)
  1. app.py +313 -0
  2. packages.txt +2 -0
  3. requirements.txt +8 -0
app.py ADDED
@@ -0,0 +1,313 @@
+ import gradio as gr
+ import json
+ from difflib import Differ
+ import ffmpeg
+ import os
+ from pathlib import Path
+ import time
+ import aiohttp
+ import asyncio
+
+
+ # Set to True if you're using the Hugging Face Inference API https://huggingface.co/inference-api
+ API_BACKEND = True
+ # MODEL = 'facebook/wav2vec2-large-960h-lv60-self'
+ # MODEL = "facebook/wav2vec2-large-960h"
+ MODEL = "facebook/wav2vec2-base-960h"
+ # MODEL = "patrickvonplaten/wav2vec2-large-960h-lv60-self-4-gram"
+ if API_BACKEND:
+     from dotenv import load_dotenv
+     import base64
+     import asyncio
+     load_dotenv(Path(".env"))
+
+     HF_TOKEN = os.environ["HF_TOKEN"]
+     headers = {"Authorization": f"Bearer {HF_TOKEN}"}
+     API_URL = f'https://api-inference.huggingface.co/models/{MODEL}'
+
+ else:
+     import torch
+     from transformers import pipeline
+
+     # is cuda available?
+     cuda = torch.device(
+         'cuda:0') if torch.cuda.is_available() else torch.device('cpu')
+     device = 0 if torch.cuda.is_available() else -1
+     speech_recognizer = pipeline(
+         task="automatic-speech-recognition",
+         model=f'{MODEL}',
+         tokenizer=f'{MODEL}',
+         framework="pt",
+         device=device,
+     )
+
+ videos_out_path = Path("./videos_out")
+ videos_out_path.mkdir(parents=True, exist_ok=True)
+
+ samples_data = sorted(Path('examples').glob('*.json'))
+ SAMPLES = []
+ for file in samples_data:
+     with open(file) as f:
+         sample = json.load(f)
+         SAMPLES.append(sample)
+ VIDEOS = list(map(lambda x: [x['video']], SAMPLES))
+
+ total_inferences_since_reboot = 415
+ total_cuts_since_reboot = 1539
+
+
+ async def speech_to_text(video_file_path):
+     """
+     Takes a video path, converts it to audio, and transcribes the audio channel to text with character timestamps.
+
+     Uses the https://huggingface.co/tasks/automatic-speech-recognition pipeline
+     """
+     global total_inferences_since_reboot
+     if video_file_path is None:
+         raise ValueError("Error: no video input")
+
+     video_path = Path(video_file_path)
+     try:
+         # convert video to 16 kHz mono audio, piped into memory as WAV
+         audio_memory, _ = ffmpeg.input(video_path).output(
+             '-', format="wav", ac=1, ar='16k').overwrite_output().global_args('-loglevel', 'quiet').run(capture_stdout=True)
+     except Exception as e:
+         raise RuntimeError("Error converting video to audio")
+
+     ping("speech_to_text")
+     last_time = time.time()
+     if API_BACKEND:
+         # Using the Inference API https://huggingface.co/inference-api
+         # retry, because the model may still be loading
+         for i in range(10):
+             for tries in range(4):
+                 print(f'Transcribing from API attempt {tries}')
+                 try:
+                     inference_reponse = await query_api(audio_memory)
+                     print(inference_reponse)
+                     transcription = inference_reponse["text"].lower()
+                     timestamps = [[chunk["text"].lower(), chunk["timestamp"][0], chunk["timestamp"][1]]
+                                   for chunk in inference_reponse['chunks']]
+
+                     total_inferences_since_reboot += 1
+                     print("\n\ntotal_inferences_since_reboot: ",
+                           total_inferences_since_reboot, "\n\n")
+                     return (transcription, transcription, timestamps)
+                 except Exception as e:
+                     print(e)
+                     if 'error' in inference_reponse and 'estimated_time' in inference_reponse:
+                         wait_time = inference_reponse['estimated_time']
+                         print("Waiting for model to load....", wait_time)
+                         # wait for the model to load,
+                         # plus 5 seconds for certainty
+                         await asyncio.sleep(wait_time + 5.0)
+                     elif 'error' in inference_reponse:
+                         raise RuntimeError("Error Fetching API",
+                                            inference_reponse['error'])
+                     else:
+                         break
+             else:
+                 raise RuntimeError(inference_reponse, "Error Fetching API")
+     else:
+
+         try:
+             print('Transcribing via local model')
+             output = speech_recognizer(
+                 audio_memory, return_timestamps="char", chunk_length_s=10, stride_length_s=(4, 2))
+
+             transcription = output["text"].lower()
+             timestamps = [[chunk["text"].lower(), chunk["timestamp"][0].tolist(), chunk["timestamp"][1].tolist()]
+                           for chunk in output['chunks']]
+             total_inferences_since_reboot += 1
+
+             print("\n\ntotal_inferences_since_reboot: ",
+                   total_inferences_since_reboot, "\n\n")
+             return (transcription, transcription, timestamps)
+         except Exception as e:
+             raise RuntimeError("Error running inference with the local model", e)
+
+
+ async def cut_timestamps_to_video(video_in, transcription, text_in, timestamps):
+     """
+     Given the original video input, the text transcription + timestamps,
+     and the edited text, cuts the video segments into a single output video
+     """
+     global total_cuts_since_reboot
+
+     video_path = Path(video_in)
+     video_file_name = video_path.stem
+     if video_in is None or text_in is None or transcription is None:
+         raise ValueError("Inputs undefined")
+
+     d = Differ()
+     # compare the original transcription with the edited text
+     diff_chars = d.compare(transcription, text_in)
+     # remove all text additions from the diff
+     filtered = list(filter(lambda x: x[0] != '+', diff_chars))
+
+     # filter timestamps to be removed
+     # timestamps_to_cut = [b for (a,b) in zip(filtered, timestamps_var) if a[0]== '-' ]
+     # return diff tokens and the cut video
+
+     # group character timestamps so there are fewer cuts
+     idx = 0
+     grouped = {}
+     for (a, b) in zip(filtered, timestamps):
+         if a[0] != '-':
+             if idx in grouped:
+                 grouped[idx].append(b)
+             else:
+                 grouped[idx] = []
+                 grouped[idx].append(b)
+         else:
+             idx += 1
+
+     # after grouping, take the start of the first and the end of the last timestamp in each group
+     timestamps_to_cut = [[v[0][1], v[-1][2]] for v in grouped.values()]
+
+     between_str = '+'.join(
+         map(lambda t: f'between(t,{t[0]},{t[1]})', timestamps_to_cut))
+
+     if timestamps_to_cut:
+         video_file = ffmpeg.input(video_in)
+         video = video_file.video.filter(
+             "select", f'({between_str})').filter("setpts", "N/FRAME_RATE/TB")
+         audio = video_file.audio.filter(
+             "aselect", f'({between_str})').filter("asetpts", "N/SR/TB")
+
+         output_video = f'./videos_out/{video_file_name}.mp4'
+         ffmpeg.concat(video, audio, v=1, a=1).output(
+             output_video).overwrite_output().global_args('-loglevel', 'quiet').run()
+     else:
+         output_video = video_in
+
+     tokens = [(token[2:], token[0] if token[0] != " " else None)
+               for token in filtered]
+
+     total_cuts_since_reboot += 1
+     ping("video_cuts")
+     print("\n\ntotal_cuts_since_reboot: ", total_cuts_since_reboot, "\n\n")
+     return (tokens, output_video)
+
+
+ async def query_api(audio_bytes: bytes):
+     """
+     Query the Hugging Face Inference API for the Automatic Speech Recognition task
+     """
+     payload = json.dumps({
+         "inputs": base64.b64encode(audio_bytes).decode("utf-8"),
+         "parameters": {
+             "return_timestamps": "char",
+             "chunk_length_s": 10,
+             "stride_length_s": [4, 2]
+         },
+         "options": {"use_gpu": False}
+     }).encode("utf-8")
+     async with aiohttp.ClientSession() as session:
+         async with session.post(API_URL, headers=headers, data=payload) as response:
+             print("API Response: ", response.status)
+             if response.headers['Content-Type'] == 'application/json':
+                 return await response.json()
+             elif response.headers['Content-Type'] == 'application/octet-stream':
+                 return await response.read()
+             elif response.headers['Content-Type'] == 'text/plain':
+                 return await response.text()
+             else:
+                 raise RuntimeError("Error Fetching API")
+
+
+ def ping(name):
+     url = f'https://huggingface.co/api/telemetry/spaces/radames/edit-video-by-editing-text/{name}'
+     print("ping: ", url)
+
+     async def req():
+         async with aiohttp.ClientSession() as session:
+             async with session.get(url) as response:
+                 print("pong: ", response.status)
+     asyncio.create_task(req())
+
+
+ # ---- Gradio Layout -----
+ video_in = gr.Video(label="Video file", elem_id="video-container")
+ text_in = gr.Textbox(label="Transcription", lines=10, interactive=True)
+ video_out = gr.Video(label="Video Out")
+ diff_out = gr.HighlightedText(label="Cuts Diffs", combine_adjacent=True)
+ examples = gr.Dataset(components=[video_in], samples=VIDEOS, type="index")
+
+ css = """
+ #cut_btn, #reset_btn { align-self:stretch; }
+ #\\31 3 { max-width: 540px; }
+ .output-markdown {max-width: 65ch !important;}
+ #video-container{
+     max-width: 40rem;
+ }
+ """
+ with gr.Blocks(css=css) as demo:
+     transcription_var = gr.State()
+     timestamps_var = gr.State()
+     with gr.Row():
+         with gr.Column():
+             gr.Markdown("""
+             # Edit Video By Editing Text
+             This project is a quick proof of concept of a simple video editor where the edits
+             are made by editing the audio transcription.
+             Using the [Huggingface Automatic Speech Recognition Pipeline](https://huggingface.co/tasks/automatic-speech-recognition)
+             with a fine-tuned [Wav2Vec2 model using Connectionist Temporal Classification (CTC)](https://huggingface.co/facebook/wav2vec2-large-960h-lv60-self)
+             you can predict not only the text transcription but also the [character- or word-based timestamps](https://huggingface.co/docs/transformers/v4.19.2/en/main_classes/pipelines#transformers.AutomaticSpeechRecognitionPipeline.__call__.return_timestamps)
+             """)
+
+     with gr.Row():
+
+         examples.render()
+
+         def load_example(id):
+             video = SAMPLES[id]['video']
+             transcription = SAMPLES[id]['transcription'].lower()
+             timestamps = SAMPLES[id]['timestamps']
+
+             return (video, transcription, transcription, timestamps)
+
+         examples.click(
+             load_example,
+             inputs=[examples],
+             outputs=[video_in, text_in, transcription_var, timestamps_var],
+             queue=False)
+     with gr.Row():
+         with gr.Column():
+             video_in.render()
+             transcribe_btn = gr.Button("Transcribe Audio")
+             transcribe_btn.click(speech_to_text, [video_in], [
+                 text_in, transcription_var, timestamps_var])
+
+     with gr.Row():
+         gr.Markdown("""
+         ### Now edit as text
+         After running the video transcription, you can make cuts to the text below (only cuts, not additions!)""")
+
+     with gr.Row():
+         with gr.Column():
+             text_in.render()
+             with gr.Row():
+                 cut_btn = gr.Button("Cut to video", elem_id="cut_btn")
+                 # send audio path and hidden variables
+                 cut_btn.click(cut_timestamps_to_video, [
+                     video_in, transcription_var, text_in, timestamps_var], [diff_out, video_out])
+
+                 reset_transcription = gr.Button(
+                     "Reset to last transcription", elem_id="reset_btn")
+                 reset_transcription.click(
+                     lambda x: x, transcription_var, text_in)
+         with gr.Column():
+             video_out.render()
+             diff_out.render()
+     with gr.Row():
+         gr.Markdown("""
+         #### Video Credits
+
+         1. [Cooking](https://vimeo.com/573792389)
+         1. [Shia LaBeouf "Just Do It"](https://www.youtube.com/watch?v=n2lTxIk_Dr0)
+         1. [Mark Zuckerberg & Yuval Noah Harari in Conversation](https://www.youtube.com/watch?v=Boj9eD0Wug8)
+         """)
+ demo.queue()
+ if __name__ == "__main__":
+     demo.launch(debug=True)
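
Note: `app.py` loads sample metadata from `examples/*.json`, which is not included in this commit, and when `API_BACKEND` is enabled it also expects an `HF_TOKEN` environment variable (for example via a `.env` file). Below is a minimal sketch of what one sample file presumably looks like, inferred from how `load_example` and `cut_timestamps_to_video` consume it; the file name and values are hypothetical.

```python
import json
from pathlib import Path

# Hypothetical sample entry. The keys mirror what app.py reads:
# 'video' is the clip listed in the gr.Dataset, 'transcription' is the
# transcript text, and 'timestamps' holds [character, start_s, end_s]
# triples as produced by the ASR pipeline with return_timestamps="char".
sample = {
    "video": "examples/cooking.mp4",   # hypothetical path
    "transcription": "hi friends",     # hypothetical transcript
    "timestamps": [
        ["h", 0.01, 0.05],
        ["i", 0.05, 0.09],
        [" ", 0.09, 0.12],
        # ... one entry per character of the transcription
    ],
}

Path("examples").mkdir(exist_ok=True)
with open("examples/cooking.json", "w") as f:
    json.dump(sample, f)
```
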
packages.txt ADDED
@@ -0,0 +1,2 @@
+ libsndfile1
+ ffmpeg
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ torch
+ transformers
+ gradio==3.35.2
+ datasets
+ librosa
+ ffmpeg-python
+ python-dotenv
+ aiohttp