Commit ecbf7df · Update app.py
Parent: 5933369

app.py CHANGED
@@ -212,7 +212,7 @@ def get_youtube(video_url):
    abs_video_path = ydl.prepare_filename(info)
    ydl.process_info(info)

-   print("
+   print("Sucesso ao baixar o vídeo")
    print(abs_video_path)
    return abs_video_path
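The only change in this hunk is the success message; the surrounding get_youtube helper resolves the output path with prepare_filename and then downloads the file with process_info. A minimal sketch of that yt-dlp pattern, with illustrative options since the Space's real ydl_opts are not shown in this diff:

import yt_dlp

def download_youtube(video_url: str) -> str:
    ydl_opts = {"format": "mp4", "outtmpl": "%(id)s.%(ext)s"}  # illustrative options
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(video_url, download=False)
        abs_video_path = ydl.prepare_filename(info)  # path the download will be written to
        ydl.process_info(info)                       # performs the actual download
    print(abs_video_path)
    return abs_video_path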
@@ -241,7 +241,7 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_spe
    _,file_ending = os.path.splitext(f'{video_file_path}')
    print(f'file enging is {file_ending}')
    audio_file = video_file_path.replace(file_ending, ".wav")
-   print("
+   print("A iniciar a conversão para WAV")
    os.system(f'ffmpeg -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{audio_file}"')

    # Get duration
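The new log line announces the WAV conversion done on the next line with os.system. A hedged alternative sketch using subprocess.run, which raises on a non-zero ffmpeg exit instead of ignoring it (the helper name is hypothetical):

import subprocess

def convert_to_wav(video_file_path: str, audio_file: str) -> None:
    # Same flags as the diff: 16 kHz sample rate, mono, 16-bit PCM.
    cmd = [
        "ffmpeg", "-y", "-i", video_file_path,
        "-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le",
        audio_file,
    ]
    subprocess.run(cmd, check=True)  # raises CalledProcessError if ffmpeg fails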
@@ -249,7 +249,7 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_spe
    frames = f.getnframes()
    rate = f.getframerate()
    duration = frames / float(rate)
-   print(f"
+   print(f"Conversão para WAV concluída, duração do arquivo de áudio.: {duration}")

    # Transcribe audio
    options = dict(language=selected_source_lang, beam_size=5, best_of=5)
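The duration reported by the new log line is frames divided by frame rate, read from the WAV header via the wave module. A self-contained sketch (the helper name is hypothetical):

import contextlib
import wave

def wav_duration_seconds(audio_file: str) -> float:
    with contextlib.closing(wave.open(audio_file, "r")) as f:
        frames = f.getnframes()
        rate = f.getframerate()
        return frames / float(rate)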
@@ -266,9 +266,9 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_spe
    chunk["text"] = segment_chunk.text
    segments.append(chunk)
    i += 1
-   print("
+   print("transcrição de audio com fast whisper terminada")
    except Exception as e:
-   raise RuntimeError("
+   raise RuntimeError("Erro a converter o filme para audio")

    try:
    # Create embedding
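The segments assembled in this hunk come from faster-whisper's transcription generator. A minimal sketch of that loop; the model size, device and file name are assumptions, since the Space selects them elsewhere:

from faster_whisper import WhisperModel

model = WhisperModel("base", device="cpu", compute_type="int8")  # illustrative settings
options = dict(language="en", beam_size=5, best_of=5)
segment_generator, info = model.transcribe("audio.wav", **options)

segments = []
for segment_chunk in segment_generator:
    segments.append({
        "start": segment_chunk.start,
        "end": segment_chunk.end,
        "text": segment_chunk.text,
    })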
@@ -296,7 +296,7 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_spe
    score = silhouette_score(embeddings, clustering.labels_, metric='euclidean')
    score_num_speakers[num_speakers] = score
    best_num_speaker = max(score_num_speakers, key=lambda x:score_num_speakers[x])
-   print(f"
+   print(f"O número estimado de participantes: {best_num_speaker} com pontuação de {score_num_speakers[best_num_speaker]} ")
    else:
    best_num_speaker = num_speakers
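The new log line reports the speaker count picked by silhouette score. The surrounding selection logic, in sketch form; the candidate range is an assumption:

import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

def estimate_num_speakers(embeddings: np.ndarray, min_speakers: int = 2, max_speakers: int = 8) -> int:
    score_num_speakers = {}
    for num_speakers in range(min_speakers, max_speakers + 1):
        clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
        score = silhouette_score(embeddings, clustering.labels_, metric='euclidean')
        score_num_speakers[num_speakers] = score
    # Keep the candidate with the highest silhouette score.
    return max(score_num_speakers, key=lambda x: score_num_speakers[x])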
@@ -304,7 +304,7 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_spe
    clustering = AgglomerativeClustering(best_num_speaker).fit(embeddings)
    labels = clustering.labels_
    for i in range(len(segments)):
-   segments[i]["speaker"] = '
+   segments[i]["speaker"] = 'Participante ' + str(labels[i] + 1)

    # Make output
    objects = {
@@ -333,9 +333,9 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_spe
    gpu_utilization = gpu_utilization[0] if len(gpu_utilization) > 0 else 0
    gpu_memory = gpu_memory[0] if len(gpu_memory) > 0 else 0
    system_info = f"""
-   *
-   *
-   *GPU
+   *Memoria: {memory.total / (1024 * 1024 * 1024):.2f}GB, utilizado: {memory.percent}%, disponivel: {memory.available / (1024 * 1024 * 1024):.2f}GB.*
+   *Tempo de processamento: {time_diff:.5} segundos.*
+   *Utilização de GPU: {gpu_utilization}%, Memoria de GPU: {gpu_memory}MiB.*
    """
    save_path = "output/transcript_result.csv"
    df_results = pd.DataFrame(objects)
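The rewritten system_info block interpolates memory, time_diff, gpu_utilization and gpu_memory, which are computed earlier in the function and not shown in this diff. A hedged sketch of how such values are commonly gathered with psutil and GPUtil:

import time
import psutil
import GPUtil

time_start = time.time()
# ... transcription and clustering work happens here ...
time_diff = time.time() - time_start

memory = psutil.virtual_memory()                    # exposes .total, .percent, .available
gpus = GPUtil.getGPUs()
gpu_utilization = [gpu.load * 100 for gpu in gpus]  # utilisation in percent, one entry per GPU
gpu_memory = [gpu.memoryUsed for gpu in gpus]       # used memory in MiB, one entry per GPU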
@@ -343,7 +343,7 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_spe
    return df_results, system_info, save_path

    except Exception as e:
-   raise RuntimeError("
+   raise RuntimeError("Erro a correr a inferência com um modelo local", e)


    # ---- Gradio Layout -----
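The rewritten raise passes the caught exception as a second argument to RuntimeError. An alternative sketch, with a hypothetical stand-in for the real pipeline, that chains the exception instead so the original traceback stays attached:

def do_transcription(video_file_path):
    raise ValueError("demo failure")  # stand-in for the real transcription pipeline

def run_local_inference(video_file_path):
    try:
        return do_transcription(video_file_path)
    except Exception as e:
        # "raise ... from e" links the original exception to the new one
        raise RuntimeError("Erro a correr a inferência com um modelo local") from e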
@@ -367,23 +367,23 @@ with demo:
    with gr.Tab("Whisper speaker diarization"):
    gr.Markdown('''
    <div>
-   <h1 style='text-align: center'>Whisper
-
-
+   <h1 style='text-align: center'>Whisper diarização com participantes</h1>
+   Este espaço usa os modelos whisper <a href='https://github.com/openai/whisper' target='_blank'><b>OpenAI</b></a> with <a href='https://github.com/guillaumekln/faster-whisper' target='_blank'><b>CTranslate2</b></a> which is a fast inference engine for Transformer models to recognize the speech (4 times faster than original openai model with same accuracy)
+   e o modelo ECAPA-TDNN de <a href='https://github.com/speechbrain/speechbrain' target='_blank'><b>SpeechBrain</b></a> para codificar e identificar participantes
    </div>
    ''')

    with gr.Row():
    gr.Markdown('''
-   ###
-   ##### 1.
-   ##### 2.
-   ##### 3.
+   ### Transcreva o link do youtube usando OpenAI Whisper
+   ##### 1. Usando o modelo Whisper da Open AI para separar o áudio em segmentos e gerar transcrições.
+   ##### 2. Gerando embeddings para cada segmento.
+   ##### 3. Aplicando clustering aglomerativo nos embeddings para identificar o participante de cada segmento.
    ''')

    with gr.Row():
    gr.Markdown('''
-   ###
+   ### Pode testar com os seguintes exemplos:
    ''')
    examples = gr.Examples(examples=
    [ "https://www.youtube.com/watch?v=j7BfEzAFuYc&t=32s",
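The new description says each segment is embedded with SpeechBrain's ECAPA-TDNN model and the embeddings are clustered to assign speakers. How the Space computes those embeddings is not part of this diff; a minimal sketch of one common way to obtain ECAPA-TDNN speaker embeddings with SpeechBrain:

import torchaudio
from speechbrain.pretrained import EncoderClassifier

# ECAPA-TDNN speaker encoder published by SpeechBrain on the Hugging Face Hub.
encoder = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb")

signal, sample_rate = torchaudio.load("audio.wav")  # the app converts audio to 16 kHz mono WAV
embedding = encoder.encode_batch(signal)            # tensor of shape (batch, 1, 192)
embedding = embedding.squeeze(1).detach().cpu().numpy()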
@@ -395,7 +395,7 @@ with demo:
    with gr.Row():
    with gr.Column():
    youtube_url_in.render()
-   download_youtube_btn = gr.Button("
+   download_youtube_btn = gr.Button("Descarregar video do Youtube")
    download_youtube_btn.click(get_youtube, [youtube_url_in], [
    video_in])
    print(video_in)
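The renamed button is wired to get_youtube through click(fn, inputs, outputs). A minimal, self-contained sketch of that wiring pattern; the component types here are illustrative, not taken from the Space:

import gradio as gr

def get_youtube(video_url):
    return "video.mp4"  # stand-in: the real function downloads the video and returns its path

with gr.Blocks() as demo:
    youtube_url_in = gr.Textbox(label="YouTube URL")
    video_in = gr.Video()
    download_youtube_btn = gr.Button("Descarregar video do Youtube")
    # The URL textbox feeds get_youtube; its return value fills the video component.
    download_youtube_btn.click(get_youtube, [youtube_url_in], [video_in])

demo.launch()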
@@ -406,14 +406,14 @@
    video_in.render()
    with gr.Column():
    gr.Markdown('''
-   #####
-   #####
-   #####
+   ##### Aqui você pode iniciar o processo de transcrição.
+   ##### Por favor, selecione o idioma de origem para a transcrição.
+   ##### Você pode selecionar uma faixa de números estimados de participantes.
    ''')
    selected_source_lang.render()
    selected_whisper_model.render()
    number_speakers.render()
-   transcribe_btn = gr.Button("
+   transcribe_btn = gr.Button("Transcrever audio com diarização")
    transcribe_btn.click(speech_to_text,
    [video_in, selected_source_lang, selected_whisper_model, number_speakers],
    [transcription_df, system_info, download_transcript]
@@ -430,45 +430,8 @@ with demo:
    download_transcript.render()
    transcription_df.render()
    system_info.render()
-   gr.Markdown('''<center><img src='https://visitor-badge.glitch.me/badge?page_id=WhisperDiarizationSpeakers' alt='visitor badge'><a href="https://opensource.org/licenses/Apache-2.0"><img src='https://img.shields.io/badge/License-Apache_2.0-blue.svg' alt='License: Apache 2.0'></center>''')



-   with gr.Tab("Whisper Transcribe Japanese Audio"):
-   gr.Markdown(f'''
-   <div>
-   <h1 style='text-align: center'>Whisper Transcribe Japanese Audio</h1>
-   </div>
-   Transcribe long-form microphone or audio inputs with the click of a button! The fine-tuned
-   checkpoint <a href='https://huggingface.co/{MODEL_NAME}' target='_blank'><b>{MODEL_NAME}</b></a> to transcribe audio files of arbitrary length.
-   ''')
-   microphone = gr.inputs.Audio(source="microphone", type="filepath", optional=True)
-   upload = gr.inputs.Audio(source="upload", type="filepath", optional=True)
-   transcribe_btn = gr.Button("Transcribe Audio")
-   text_output = gr.Textbox()
-   with gr.Row():
-   gr.Markdown('''
-   ### You can test by following examples:
-   ''')
-   examples = gr.Examples(examples=
-   [ "sample1.wav",
-   "sample2.wav",
-   ],
-   label="Examples", inputs=[upload])
-   transcribe_btn.click(transcribe, [microphone, upload], outputs=text_output)
-
-   with gr.Tab("Whisper Transcribe Japanese YouTube"):
-   gr.Markdown(f'''
-   <div>
-   <h1 style='text-align: center'>Whisper Transcribe Japanese YouTube</h1>
-   </div>
-   Transcribe long-form YouTube videos with the click of a button! The fine-tuned checkpoint:
-   <a href='https://huggingface.co/{MODEL_NAME}' target='_blank'><b>{MODEL_NAME}</b></a> to transcribe audio files of arbitrary length.
-   ''')
-   youtube_link = gr.Textbox(label="Youtube url", lines=1, interactive=True)
-   yt_transcribe_btn = gr.Button("Transcribe YouTube")
-   text_output2 = gr.Textbox()
-   html_output = gr.Markdown()
-   yt_transcribe_btn.click(yt_transcribe, [youtube_link], outputs=[html_output, text_output2])

    demo.launch(debug=True)