pedromsfaria committed on
Commit ecbf7df · 1 Parent(s): 5933369

Update app.py

Files changed (1)
  1. app.py +24 -61
app.py CHANGED
@@ -212,7 +212,7 @@ def get_youtube(video_url):
  abs_video_path = ydl.prepare_filename(info)
  ydl.process_info(info)

- print("Success download video")
+ print("Sucesso ao baixar o vídeo")
  print(abs_video_path)
  return abs_video_path
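For context on the hunk above: `prepare_filename(info)` returns the path the downloader will write to. A minimal, hedged sketch of that download step follows; the diff does not show app.py's downloader setup, so the `yt_dlp` import and the empty options dict are assumptions (`youtube_dl` exposes the same calls):

```python
from yt_dlp import YoutubeDL

# Placeholder options; app.py's actual ydl configuration is not visible in this diff.
ydl_opts = {}

with YoutubeDL(ydl_opts) as ydl:
    # extract_info(..., download=True) fetches metadata and downloads in one call;
    # prepare_filename(info) reports the output path, as in the hunk above.
    info = ydl.extract_info("https://www.youtube.com/watch?v=j7BfEzAFuYc&t=32s", download=True)
    abs_video_path = ydl.prepare_filename(info)

print(abs_video_path)
```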
@@ -241,7 +241,7 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_spe
  _,file_ending = os.path.splitext(f'{video_file_path}')
  print(f'file enging is {file_ending}')
  audio_file = video_file_path.replace(file_ending, ".wav")
- print("starting conversion to wav")
+ print("A iniciar a conversão para WAV")
  os.system(f'ffmpeg -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{audio_file}"')

  # Get duration
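Aside on this hunk: the ffmpeg flags (`-ar 16000 -ac 1 -c:a pcm_s16le`) produce the 16 kHz mono 16-bit WAV that the downstream models expect. A hedged sketch of the same conversion via `subprocess.run` instead of `os.system`, which avoids shell quoting of file names; the `convert_to_wav` helper and file names are hypothetical, not part of app.py:

```python
import subprocess

def convert_to_wav(video_file_path: str, audio_file: str) -> None:
    # Same flags as app.py: 16 kHz sample rate, mono, signed 16-bit PCM; -y overwrites.
    subprocess.run(
        ["ffmpeg", "-y", "-i", video_file_path,
         "-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le", audio_file],
        check=True,
    )

convert_to_wav("input.mp4", "input.wav")  # placeholder file names
```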
@@ -249,7 +249,7 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_spe
  frames = f.getnframes()
  rate = f.getframerate()
  duration = frames / float(rate)
- print(f"conversion to wav ready, duration of audio file: {duration}")
+ print(f"Conversão para WAV concluída, duração do arquivo de áudio.: {duration}")

  # Transcribe audio
  options = dict(language=selected_source_lang, beam_size=5, best_of=5)
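The `options` dict above (fixed source language, beam_size=5, best_of=5) feeds the faster-whisper/CTranslate2 model mentioned in the Space description. A minimal sketch of how that library is typically called; the model size, file name, and language here are placeholders rather than values taken from app.py:

```python
from faster_whisper import WhisperModel

# Placeholder checkpoint; the Space lets the user pick the Whisper model size.
model = WhisperModel("base", device="cpu", compute_type="int8")

# Same decoding options as the diff: language, beam_size=5, best_of=5.
segments, info = model.transcribe("audio.wav", language="en", beam_size=5, best_of=5)

for segment in segments:
    print(f"[{segment.start:.2f} -> {segment.end:.2f}] {segment.text}")
```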
@@ -266,9 +266,9 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_spe
  chunk["text"] = segment_chunk.text
  segments.append(chunk)
  i += 1
- print("transcribe audio done with fast whisper")
+ print("transcrição de audio com fast whisper terminada")
  except Exception as e:
- raise RuntimeError("Error converting video to audio")
+ raise RuntimeError("Erro a converter o filme para audio")

  try:
  # Create embedding
@@ -296,7 +296,7 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_spe
  score = silhouette_score(embeddings, clustering.labels_, metric='euclidean')
  score_num_speakers[num_speakers] = score
  best_num_speaker = max(score_num_speakers, key=lambda x:score_num_speakers[x])
- print(f"The best number of speakers: {best_num_speaker} with {score_num_speakers[best_num_speaker]} score")
+ print(f"O número estimado de participantes: {best_num_speaker} com pontuação de {score_num_speakers[best_num_speaker]} ")
  else:
  best_num_speaker = num_speakers
@@ -304,7 +304,7 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_spe
  clustering = AgglomerativeClustering(best_num_speaker).fit(embeddings)
  labels = clustering.labels_
  for i in range(len(segments)):
- segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)
+ segments[i]["speaker"] = 'Participante ' + str(labels[i] + 1)

  # Make output
  objects = {
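The two hunks above pick the speaker count by silhouette score and then label each segment from the clustering. A self-contained sketch of that logic, with random vectors standing in for the per-segment ECAPA-TDNN embeddings (the 192-dimension size and the 2-5 candidate range are illustrative assumptions):

```python
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

# Placeholder embeddings: one vector per transcript segment.
rng = np.random.default_rng(0)
embeddings = rng.normal(size=(20, 192))

# Score each candidate speaker count with the silhouette coefficient, as in app.py.
score_num_speakers = {}
for num_speakers in range(2, 6):
    clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
    score_num_speakers[num_speakers] = silhouette_score(
        embeddings, clustering.labels_, metric='euclidean')
best_num_speaker = max(score_num_speakers, key=score_num_speakers.get)

# Re-fit with the winning count and tag each segment, mirroring the second hunk.
labels = AgglomerativeClustering(best_num_speaker).fit(embeddings).labels_
speakers = ['Participante ' + str(label + 1) for label in labels]
print(best_num_speaker, speakers[:5])
```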
@@ -333,9 +333,9 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_spe
  gpu_utilization = gpu_utilization[0] if len(gpu_utilization) > 0 else 0
  gpu_memory = gpu_memory[0] if len(gpu_memory) > 0 else 0
  system_info = f"""
- *Memory: {memory.total / (1024 * 1024 * 1024):.2f}GB, used: {memory.percent}%, available: {memory.available / (1024 * 1024 * 1024):.2f}GB.*
- *Processing time: {time_diff:.5} seconds.*
- *GPU Utilization: {gpu_utilization}%, GPU Memory: {gpu_memory}MiB.*
+ *Memoria: {memory.total / (1024 * 1024 * 1024):.2f}GB, utilizado: {memory.percent}%, disponivel: {memory.available / (1024 * 1024 * 1024):.2f}GB.*
+ *Tempo de processamento: {time_diff:.5} segundos.*
+ *Utilização de GPU: {gpu_utilization}%, Memoria de GPU: {gpu_memory}MiB.*
  """
  save_path = "output/transcript_result.csv"
  df_results = pd.DataFrame(objects)
@@ -343,7 +343,7 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_spe
  return df_results, system_info, save_path

  except Exception as e:
- raise RuntimeError("Error Running inference with local model", e)
+ raise RuntimeError("Erro a correr a inferência com um modelo local", e)


  # ---- Gradio Layout -----
@@ -367,23 +367,23 @@ with demo:
  with gr.Tab("Whisper speaker diarization"):
  gr.Markdown('''
  <div>
- <h1 style='text-align: center'>Whisper speaker diarization</h1>
- This space uses Whisper models from <a href='https://github.com/openai/whisper' target='_blank'><b>OpenAI</b></a> with <a href='https://github.com/guillaumekln/faster-whisper' target='_blank'><b>CTranslate2</b></a> which is a fast inference engine for Transformer models to recognize the speech (4 times faster than original openai model with same accuracy)
- and ECAPA-TDNN model from <a href='https://github.com/speechbrain/speechbrain' target='_blank'><b>SpeechBrain</b></a> to encode and clasify speakers
+ <h1 style='text-align: center'>Whisper diarização com participantes</h1>
+ Este espaço usa os modelos whisper <a href='https://github.com/openai/whisper' target='_blank'><b>OpenAI</b></a> with <a href='https://github.com/guillaumekln/faster-whisper' target='_blank'><b>CTranslate2</b></a> which is a fast inference engine for Transformer models to recognize the speech (4 times faster than original openai model with same accuracy)
+ e o modelo ECAPA-TDNN de <a href='https://github.com/speechbrain/speechbrain' target='_blank'><b>SpeechBrain</b></a> para codificar e identificar participantes
  </div>
  ''')

  with gr.Row():
  gr.Markdown('''
- ### Transcribe youtube link using OpenAI Whisper
- ##### 1. Using Open AI's Whisper model to seperate audio into segments and generate transcripts.
- ##### 2. Generating speaker embeddings for each segments.
- ##### 3. Applying agglomerative clustering on the embeddings to identify the speaker for each segment.
+ ### Transcreva o link do youtube usando OpenAI Whisper
+ ##### 1. Usando o modelo Whisper da Open AI para separar o áudio em segmentos e gerar transcrições.
+ ##### 2. Gerando embeddings para cada segmento.
+ ##### 3. Aplicando clustering aglomerativo nos embeddings para identificar o participante de cada segmento.
  ''')

  with gr.Row():
  gr.Markdown('''
- ### You can test by following examples:
+ ### Pode testar com os seguintes exemplos:
  ''')
  examples = gr.Examples(examples=
  [ "https://www.youtube.com/watch?v=j7BfEzAFuYc&t=32s",
@@ -395,7 +395,7 @@ with demo:
  with gr.Row():
  with gr.Column():
  youtube_url_in.render()
- download_youtube_btn = gr.Button("Download Youtube video")
+ download_youtube_btn = gr.Button("Descarregar video do Youtube")
  download_youtube_btn.click(get_youtube, [youtube_url_in], [
  video_in])
  print(video_in)
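The hunk above only relabels the button; the wiring stays `download_youtube_btn.click(get_youtube, [youtube_url_in], [video_in])`. A stripped-down sketch of that Gradio pattern; the stub `get_youtube` and the use of textboxes in place of app.py's actual components are stand-ins for illustration:

```python
import gradio as gr

def get_youtube(url: str) -> str:
    # Stand-in for app.py's downloader; it would return the downloaded file path.
    return f"(would download) {url}"

with gr.Blocks() as demo:
    youtube_url_in = gr.Textbox(label="YouTube URL")
    video_in = gr.Textbox(label="Downloaded video path")
    download_youtube_btn = gr.Button("Descarregar video do Youtube")
    # Click wiring matches the diff: inputs=[youtube_url_in], outputs=[video_in].
    download_youtube_btn.click(get_youtube, [youtube_url_in], [video_in])

demo.launch()
```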
@@ -406,14 +406,14 @@ with demo:
  video_in.render()
  with gr.Column():
  gr.Markdown('''
- ##### Here you can start the transcription process.
- ##### Please select the source language for transcription.
- ##### You can select a range of assumed numbers of speakers.
+ ##### Aqui você pode iniciar o processo de transcrição.
+ ##### Por favor, selecione o idioma de origem para a transcrição.
+ ##### Você pode selecionar uma faixa de números estimados de participantes.
  ''')
  selected_source_lang.render()
  selected_whisper_model.render()
  number_speakers.render()
- transcribe_btn = gr.Button("Transcribe audio and diarization")
+ transcribe_btn = gr.Button("Transcrever audio com diarização")
  transcribe_btn.click(speech_to_text,
  [video_in, selected_source_lang, selected_whisper_model, number_speakers],
  [transcription_df, system_info, download_transcript]
@@ -430,45 +430,8 @@ with demo:
  download_transcript.render()
  transcription_df.render()
  system_info.render()
- gr.Markdown('''<center><img src='https://visitor-badge.glitch.me/badge?page_id=WhisperDiarizationSpeakers' alt='visitor badge'><a href="https://opensource.org/licenses/Apache-2.0"><img src='https://img.shields.io/badge/License-Apache_2.0-blue.svg' alt='License: Apache 2.0'></center>''')



- with gr.Tab("Whisper Transcribe Japanese Audio"):
- gr.Markdown(f'''
- <div>
- <h1 style='text-align: center'>Whisper Transcribe Japanese Audio</h1>
- </div>
- Transcribe long-form microphone or audio inputs with the click of a button! The fine-tuned
- checkpoint <a href='https://huggingface.co/{MODEL_NAME}' target='_blank'><b>{MODEL_NAME}</b></a> to transcribe audio files of arbitrary length.
- ''')
- microphone = gr.inputs.Audio(source="microphone", type="filepath", optional=True)
- upload = gr.inputs.Audio(source="upload", type="filepath", optional=True)
- transcribe_btn = gr.Button("Transcribe Audio")
- text_output = gr.Textbox()
- with gr.Row():
- gr.Markdown('''
- ### You can test by following examples:
- ''')
- examples = gr.Examples(examples=
- [ "sample1.wav",
- "sample2.wav",
- ],
- label="Examples", inputs=[upload])
- transcribe_btn.click(transcribe, [microphone, upload], outputs=text_output)
-
- with gr.Tab("Whisper Transcribe Japanese YouTube"):
- gr.Markdown(f'''
- <div>
- <h1 style='text-align: center'>Whisper Transcribe Japanese YouTube</h1>
- </div>
- Transcribe long-form YouTube videos with the click of a button! The fine-tuned checkpoint:
- <a href='https://huggingface.co/{MODEL_NAME}' target='_blank'><b>{MODEL_NAME}</b></a> to transcribe audio files of arbitrary length.
- ''')
- youtube_link = gr.Textbox(label="Youtube url", lines=1, interactive=True)
- yt_transcribe_btn = gr.Button("Transcribe YouTube")
- text_output2 = gr.Textbox()
- html_output = gr.Markdown()
- yt_transcribe_btn.click(yt_transcribe, [youtube_link], outputs=[html_output, text_output2])

  demo.launch(debug=True)
 