pedromsfaria committed on
Commit ecbf7df · 1 Parent(s): 5933369

Update app.py

Files changed (1)
  1. app.py +24 -61
app.py CHANGED
@@ -212,7 +212,7 @@ def get_youtube(video_url):
  abs_video_path = ydl.prepare_filename(info)
  ydl.process_info(info)

- print("Success download video")
+ print("Sucesso ao baixar o vídeo")
  print(abs_video_path)
  return abs_video_path
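For context on the hunk above: `prepare_filename(info)` returns the path the downloader will write to. A minimal, hedged sketch of that download step follows; the diff does not show app.py's downloader setup, so the `yt_dlp` import and the empty options dict are assumptions (`youtube_dl` exposes the same calls):

```python
from yt_dlp import YoutubeDL

# Placeholder options; app.py's actual ydl configuration is not visible in this diff.
ydl_opts = {}

with YoutubeDL(ydl_opts) as ydl:
    # extract_info(..., download=True) fetches metadata and downloads in one call;
    # prepare_filename(info) reports the output path, as in the hunk above.
    info = ydl.extract_info("https://www.youtube.com/watch?v=j7BfEzAFuYc&t=32s", download=True)
    abs_video_path = ydl.prepare_filename(info)

print(abs_video_path)
```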
@@ -241,7 +241,7 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_spe
  _,file_ending = os.path.splitext(f'{video_file_path}')
  print(f'file enging is {file_ending}')
  audio_file = video_file_path.replace(file_ending, ".wav")
- print("starting conversion to wav")
+ print("A iniciar a conversão para WAV")
  os.system(f'ffmpeg -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{audio_file}"')

  # Get duration
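Aside on this hunk: the ffmpeg flags (`-ar 16000 -ac 1 -c:a pcm_s16le`) produce the 16 kHz mono 16-bit WAV that the downstream models expect. A hedged sketch of the same conversion via `subprocess.run` instead of `os.system`, which avoids shell quoting of file names; the `convert_to_wav` helper and file names are hypothetical, not part of app.py:

```python
import subprocess

def convert_to_wav(video_file_path: str, audio_file: str) -> None:
    # Same flags as app.py: 16 kHz sample rate, mono, signed 16-bit PCM; -y overwrites.
    subprocess.run(
        ["ffmpeg", "-y", "-i", video_file_path,
         "-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le", audio_file],
        check=True,
    )

convert_to_wav("input.mp4", "input.wav")  # placeholder file names
```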
@@ -249,7 +249,7 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_spe
  frames = f.getnframes()
  rate = f.getframerate()
  duration = frames / float(rate)
- print(f"conversion to wav ready, duration of audio file: {duration}")
+ print(f"Conversão para WAV concluída, duração do arquivo de áudio.: {duration}")

  # Transcribe audio
  options = dict(language=selected_source_lang, beam_size=5, best_of=5)
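The `options` dict above (fixed source language, beam_size=5, best_of=5) feeds the faster-whisper/CTranslate2 model mentioned in the Space description. A minimal sketch of how that library is typically called; the model size, file name, and language here are placeholders rather than values taken from app.py:

```python
from faster_whisper import WhisperModel

# Placeholder checkpoint; the Space lets the user pick the Whisper model size.
model = WhisperModel("base", device="cpu", compute_type="int8")

# Same decoding options as the diff: language, beam_size=5, best_of=5.
segments, info = model.transcribe("audio.wav", language="en", beam_size=5, best_of=5)

for segment in segments:
    print(f"[{segment.start:.2f} -> {segment.end:.2f}] {segment.text}")
```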
@@ -266,9 +266,9 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_spe
  chunk["text"] = segment_chunk.text
  segments.append(chunk)
  i += 1
- print("transcribe audio done with fast whisper")
+ print("transcrição de audio com fast whisper terminada")
  except Exception as e:
- raise RuntimeError("Error converting video to audio")
+ raise RuntimeError("Erro a converter o filme para audio")

  try:
  # Create embedding
@@ -296,7 +296,7 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_spe
  score = silhouette_score(embeddings, clustering.labels_, metric='euclidean')
  score_num_speakers[num_speakers] = score
  best_num_speaker = max(score_num_speakers, key=lambda x:score_num_speakers[x])
- print(f"The best number of speakers: {best_num_speaker} with {score_num_speakers[best_num_speaker]} score")
+ print(f"O número estimado de participantes: {best_num_speaker} com pontuação de {score_num_speakers[best_num_speaker]} ")
  else:
  best_num_speaker = num_speakers
@@ -304,7 +304,7 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_spe
  clustering = AgglomerativeClustering(best_num_speaker).fit(embeddings)
  labels = clustering.labels_
  for i in range(len(segments)):
- segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)
+ segments[i]["speaker"] = 'Participante ' + str(labels[i] + 1)

  # Make output
  objects = {
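The two hunks above pick the speaker count by silhouette score and then label each segment from the clustering. A self-contained sketch of that logic, with random vectors standing in for the per-segment ECAPA-TDNN embeddings (the 192-dimension size and the 2-5 candidate range are illustrative assumptions):

```python
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

# Placeholder embeddings: one vector per transcript segment.
rng = np.random.default_rng(0)
embeddings = rng.normal(size=(20, 192))

# Score each candidate speaker count with the silhouette coefficient, as in app.py.
score_num_speakers = {}
for num_speakers in range(2, 6):
    clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
    score_num_speakers[num_speakers] = silhouette_score(
        embeddings, clustering.labels_, metric='euclidean')
best_num_speaker = max(score_num_speakers, key=score_num_speakers.get)

# Re-fit with the winning count and tag each segment, mirroring the second hunk.
labels = AgglomerativeClustering(best_num_speaker).fit(embeddings).labels_
speakers = ['Participante ' + str(label + 1) for label in labels]
print(best_num_speaker, speakers[:5])
```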
@@ -333,9 +333,9 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_spe
  gpu_utilization = gpu_utilization[0] if len(gpu_utilization) > 0 else 0
  gpu_memory = gpu_memory[0] if len(gpu_memory) > 0 else 0
  system_info = f"""
- *Memory: {memory.total / (1024 * 1024 * 1024):.2f}GB, used: {memory.percent}%, available: {memory.available / (1024 * 1024 * 1024):.2f}GB.*
- *Processing time: {time_diff:.5} seconds.*
- *GPU Utilization: {gpu_utilization}%, GPU Memory: {gpu_memory}MiB.*
+ *Memoria: {memory.total / (1024 * 1024 * 1024):.2f}GB, utilizado: {memory.percent}%, disponivel: {memory.available / (1024 * 1024 * 1024):.2f}GB.*
+ *Tempo de processamento: {time_diff:.5} segundos.*
+ *Utilização de GPU: {gpu_utilization}%, Memoria de GPU: {gpu_memory}MiB.*
  """
  save_path = "output/transcript_result.csv"
  df_results = pd.DataFrame(objects)
@@ -343,7 +343,7 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_spe
  return df_results, system_info, save_path

  except Exception as e:
- raise RuntimeError("Error Running inference with local model", e)
+ raise RuntimeError("Erro a correr a inferência com um modelo local", e)


  # ---- Gradio Layout -----
@@ -367,23 +367,23 @@ with demo:
  with gr.Tab("Whisper speaker diarization"):
  gr.Markdown('''
  <div>
- <h1 style='text-align: center'>Whisper speaker diarization</h1>
- This space uses Whisper models from <a href='https://github.com/openai/whisper' target='_blank'><b>OpenAI</b></a> with <a href='https://github.com/guillaumekln/faster-whisper' target='_blank'><b>CTranslate2</b></a> which is a fast inference engine for Transformer models to recognize the speech (4 times faster than original openai model with same accuracy)
- and ECAPA-TDNN model from <a href='https://github.com/speechbrain/speechbrain' target='_blank'><b>SpeechBrain</b></a> to encode and clasify speakers
+ <h1 style='text-align: center'>Whisper diarização com participantes</h1>
+ Este espaço usa os modelos whisper <a href='https://github.com/openai/whisper' target='_blank'><b>OpenAI</b></a> with <a href='https://github.com/guillaumekln/faster-whisper' target='_blank'><b>CTranslate2</b></a> which is a fast inference engine for Transformer models to recognize the speech (4 times faster than original openai model with same accuracy)
+ e o modelo ECAPA-TDNN de <a href='https://github.com/speechbrain/speechbrain' target='_blank'><b>SpeechBrain</b></a> para codificar e identificar participantes
  </div>
  ''')

  with gr.Row():
  gr.Markdown('''
- ### Transcribe youtube link using OpenAI Whisper
- ##### 1. Using Open AI's Whisper model to seperate audio into segments and generate transcripts.
- ##### 2. Generating speaker embeddings for each segments.
- ##### 3. Applying agglomerative clustering on the embeddings to identify the speaker for each segment.
+ ### Transcreva o link do youtube usando OpenAI Whisper
+ ##### 1. Usando o modelo Whisper da Open AI para separar o áudio em segmentos e gerar transcrições.
+ ##### 2. Gerando embeddings para cada segmento.
+ ##### 3. Aplicando clustering aglomerativo nos embeddings para identificar o participante de cada segmento.
  ''')

  with gr.Row():
  gr.Markdown('''
- ### You can test by following examples:
+ ### Pode testar com os seguintes exemplos:
  ''')
  examples = gr.Examples(examples=
  [ "https://www.youtube.com/watch?v=j7BfEzAFuYc&t=32s",
@@ -395,7 +395,7 @@ with demo:
  with gr.Row():
  with gr.Column():
  youtube_url_in.render()
- download_youtube_btn = gr.Button("Download Youtube video")
+ download_youtube_btn = gr.Button("Descarregar video do Youtube")
  download_youtube_btn.click(get_youtube, [youtube_url_in], [
  video_in])
  print(video_in)
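The hunk above only relabels the button; the wiring stays `download_youtube_btn.click(get_youtube, [youtube_url_in], [video_in])`. A stripped-down sketch of that Gradio pattern; the stub `get_youtube` and the use of textboxes in place of app.py's actual components are stand-ins for illustration:

```python
import gradio as gr

def get_youtube(url: str) -> str:
    # Stand-in for app.py's downloader; it would return the downloaded file path.
    return f"(would download) {url}"

with gr.Blocks() as demo:
    youtube_url_in = gr.Textbox(label="YouTube URL")
    video_in = gr.Textbox(label="Downloaded video path")
    download_youtube_btn = gr.Button("Descarregar video do Youtube")
    # Click wiring matches the diff: inputs=[youtube_url_in], outputs=[video_in].
    download_youtube_btn.click(get_youtube, [youtube_url_in], [video_in])

demo.launch()
```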
@@ -406,14 +406,14 @@ with demo:
  video_in.render()
  with gr.Column():
  gr.Markdown('''
- ##### Here you can start the transcription process.
- ##### Please select the source language for transcription.
- ##### You can select a range of assumed numbers of speakers.
+ ##### Aqui você pode iniciar o processo de transcrição.
+ ##### Por favor, selecione o idioma de origem para a transcrição.
+ ##### Você pode selecionar uma faixa de números estimados de participantes.
  ''')
  selected_source_lang.render()
  selected_whisper_model.render()
  number_speakers.render()
- transcribe_btn = gr.Button("Transcribe audio and diarization")
+ transcribe_btn = gr.Button("Transcrever audio com diarização")
  transcribe_btn.click(speech_to_text,
  [video_in, selected_source_lang, selected_whisper_model, number_speakers],
  [transcription_df, system_info, download_transcript]
@@ -430,45 +430,8 @@ with demo:
  download_transcript.render()
  transcription_df.render()
  system_info.render()
- gr.Markdown('''<center><img src='https://visitor-badge.glitch.me/badge?page_id=WhisperDiarizationSpeakers' alt='visitor badge'><a href="https://opensource.org/licenses/Apache-2.0"><img src='https://img.shields.io/badge/License-Apache_2.0-blue.svg' alt='License: Apache 2.0'></center>''')



- with gr.Tab("Whisper Transcribe Japanese Audio"):
- gr.Markdown(f'''
- <div>
- <h1 style='text-align: center'>Whisper Transcribe Japanese Audio</h1>
- </div>
- Transcribe long-form microphone or audio inputs with the click of a button! The fine-tuned
- checkpoint <a href='https://huggingface.co/{MODEL_NAME}' target='_blank'><b>{MODEL_NAME}</b></a> to transcribe audio files of arbitrary length.
- ''')
- microphone = gr.inputs.Audio(source="microphone", type="filepath", optional=True)
- upload = gr.inputs.Audio(source="upload", type="filepath", optional=True)
- transcribe_btn = gr.Button("Transcribe Audio")
- text_output = gr.Textbox()
- with gr.Row():
- gr.Markdown('''
- ### You can test by following examples:
- ''')
- examples = gr.Examples(examples=
- [ "sample1.wav",
- "sample2.wav",
- ],
- label="Examples", inputs=[upload])
- transcribe_btn.click(transcribe, [microphone, upload], outputs=text_output)
-
- with gr.Tab("Whisper Transcribe Japanese YouTube"):
- gr.Markdown(f'''
- <div>
- <h1 style='text-align: center'>Whisper Transcribe Japanese YouTube</h1>
- </div>
- Transcribe long-form YouTube videos with the click of a button! The fine-tuned checkpoint:
- <a href='https://huggingface.co/{MODEL_NAME}' target='_blank'><b>{MODEL_NAME}</b></a> to transcribe audio files of arbitrary length.
- ''')
- youtube_link = gr.Textbox(label="Youtube url", lines=1, interactive=True)
- yt_transcribe_btn = gr.Button("Transcribe YouTube")
- text_output2 = gr.Textbox()
- html_output = gr.Markdown()
- yt_transcribe_btn.click(yt_transcribe, [youtube_link], outputs=[html_output, text_output2])

  demo.launch(debug=True)
 