pedromsfaria committed
Commit 0b903bc · 1 Parent(s): 06b46e5

Update app.py

Files changed (1): app.py (+128 -94)
app.py CHANGED
@@ -9,7 +9,6 @@ import re
  import time
  import os
  import numpy as np
- import openai
  from sklearn.cluster import AgglomerativeClustering
  from sklearn.metrics import silhouette_score

@@ -150,22 +149,6 @@ embedding_model = PretrainedSpeakerEmbedding(
  "speechbrain/spkrec-ecapa-voxceleb",
  device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))

- def summarize_text(text):
- response = openai.Completion.create(
- engine="text-davinci-003",
- prompt=f"Please summarize the following text: {text}",
- max_tokens=100
- )
- return response.choices[0].text
-
- def emotion_analysis(text):
- response = openai.Completion.create(
- engine="text-davinci-003",
- prompt=f"Please interpret the emotions in the following text: {text}",
- max_tokens=100
- )
- return response.choices[0].text
-
  def transcribe(microphone, file_upload):
  warn_output = ""
  if (microphone is not None) and (file_upload is not None):
@@ -234,79 +217,133 @@ def get_youtube(video_url):
  return abs_video_path

  def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_speakers):
+ """
+ # Transcreva o link do youtube usando OpenAI Whisper
+
+ NOTA: Este modelo foi adaptado por Pedro Faria, para exemplo para a Biometrid, não deve ser usado para outros fins.
+
+ 1. Usando o modelo Whisper da Open AI para separar áudio em segmentos e gerar transcrições.
+ 2. Gerando embeddings de alto-falante para cada segmento.
+ 3. Aplicando clustering aglomerativo nos embeddings para identificar o falante de cada segmento.
+
+ O reconhecimento de fala é baseado em modelos do OpenAI Whisper https://github.com/openai/whisper
+ Speaker diarization model and pipeline from by https://github.com/pyannote/pyannote-audio
+ Modelo de diarização de alto-falante e pipeline desenvolvido por https://github.com/pyannote/pyannote-audio
+ """
+
+ # model = whisper.load_model(whisper_model)
+ # model = WhisperModel(whisper_model, device="cuda", compute_type="int8_float16")
  model = WhisperModel(whisper_model, compute_type="int8")
  time_start = time.time()
- if video_file_path is None:
+ if(video_file_path == None):
  raise ValueError("Error no video input")
-
- _, file_ending = os.path.splitext(f'{video_file_path}')
- audio_file = video_file_path.replace(file_ending, ".wav")
- os.system(f'ffmpeg -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{audio_file}"')
+ print(video_file_path)
+
+ try:
+ # Read and convert youtube video
+ _,file_ending = os.path.splitext(f'{video_file_path}')
+ print(f'file enging is {file_ending}')
+ audio_file = video_file_path.replace(file_ending, ".wav")
+ print("A iniciar a conversão para WAV")
+ os.system(f'ffmpeg -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{audio_file}"')
+
+ # Get duration
+ with contextlib.closing(wave.open(audio_file,'r')) as f:
+ frames = f.getnframes()
+ rate = f.getframerate()
+ duration = frames / float(rate)
+ print(f"Conversão para WAV concluída, duração do arquivo de áudio.: {duration}")
+
+ # Transcribe audio
+ options = dict(language=selected_source_lang, beam_size=5, best_of=5)
+ transcribe_options = dict(task="transcribe", **options)
+ segments_raw, info = model.transcribe(audio_file, **transcribe_options)
+
+ # Convert back to original openai format
+ segments = []
+ i = 0
+ for segment_chunk in segments_raw:
+ chunk = {}
+ chunk["start"] = segment_chunk.start
+ chunk["end"] = segment_chunk.end
+ chunk["text"] = segment_chunk.text
+ segments.append(chunk)
+ i += 1
+ print("transcrição de audio com fast whisper terminada")
+ except Exception as e:
+ raise RuntimeError("Erro a converter o filme para audio")
+
+ try:
+ # Create embedding
+ def segment_embedding(segment):
+ audio = Audio()
+ start = segment["start"]
+ # Whisper overshoots the end timestamp in the last segment
+ end = min(duration, segment["end"])
+ clip = Segment(start, end)
+ waveform, sample_rate = audio.crop(audio_file, clip)
+ return embedding_model(waveform[None])
+
+ embeddings = np.zeros(shape=(len(segments), 192))
+ for i, segment in enumerate(segments):
+ embeddings[i] = segment_embedding(segment)
+ embeddings = np.nan_to_num(embeddings)
+ print(f'Embedding shape: {embeddings.shape}')
+
+ if num_speakers == 0:
+ # Find the best number of speakers
+ score_num_speakers = {}

- with contextlib.closing(wave.open(audio_file, 'r')) as f:
- frames = f.getnframes()
- rate = f.getframerate()
- duration = frames / float(rate)
-
- options = dict(language=selected_source_lang, beam_size=5, best_of=5)
- transcribe_options = dict(task="transcribe", **options)
- segments_raw, info = model.transcribe(audio_file, **transcribe_options)
-
- segments = []
- for segment_chunk in segments_raw:
- chunk = {"start": segment_chunk.start, "end": segment_chunk.end, "text": segment_chunk.text}
- segments.append(chunk)
-
- embeddings = np.zeros(shape=(len(segments), 192))
- for i, segment in enumerate(segments):
- embeddings[i] = segment_embedding(segment)
- embeddings = np.nan_to_num(embeddings)
-
- best_num_speaker = num_speakers if num_speakers != 0 else max(range(2, 10 + 1), key=lambda n: silhouette_score(embeddings, AgglomerativeClustering(n).fit(embeddings).labels_, metric='euclidean'))
- clustering = AgglomerativeClustering(best_num_speaker).fit(embeddings)
- labels = clustering.labels_
- for i in range(len(segments)):
- segments[i]["speaker"] = 'Participante ' + str(labels[i] + 1)
-
- objects = {
- 'Start': [],
- 'End': [],
- 'Speaker': [],
- 'Text': []
- }
- text = ''
- for (i, segment) in enumerate(segments):
- if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
- objects['Start'].append(str(convert_time(segment["start"])))
- objects['Speaker'].append(segment["speaker"])
- if i != 0:
- objects['End'].append(str(convert_time(segments[i - 1]["end"])))
- objects['Text'].append(text)
- text = ''
+ for num_speakers in range(2, 10+1):
+ clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
+ score = silhouette_score(embeddings, clustering.labels_, metric='euclidean')
+ score_num_speakers[num_speakers] = score
+ best_num_speaker = max(score_num_speakers, key=lambda x:score_num_speakers[x])
+ print(f"O número estimado de participantes: {best_num_speaker} com pontuação de {score_num_speakers[best_num_speaker]} ")
+ else:
+ best_num_speaker = num_speakers
+
+ # Assign speaker label
+ clustering = AgglomerativeClustering(best_num_speaker).fit(embeddings)
+ labels = clustering.labels_
+ for i in range(len(segments)):
+ segments[i]["speaker"] = 'Participante ' + str(labels[i] + 1)
+
+ # Make output
+ objects = {
+ 'Start' : [],
+ 'End': [],
+ 'Speaker': [],
+ 'Text': []
+ }
+ text = ''
+ for (i, segment) in enumerate(segments):
+ if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
+ objects['Start'].append(str(convert_time(segment["start"])))
+ objects['Speaker'].append(segment["speaker"])
+ if i != 0:
+ objects['End'].append(str(convert_time(segments[i - 1]["end"])))
+ objects['Text'].append(text)
+ text = ''
  text += segment["text"] + ' '
- objects['End'].append(str(convert_time(segments[i - 1]["end"])))
- objects['Text'].append(text)
-
- transcription = " ".join(objects['Text'])
- summary = summarize_text(transcription)
- emotions = emotion_analysis(transcription)
-
- time_end = time.time()
- time_diff = time_end - time_start
- memory = psutil.virtual_memory()
- gpu_utilization, gpu_memory = GPUInfo.gpu_usage()
- gpu_utilization = gpu_utilization[0] if len(gpu_utilization) > 0 else 0
- gpu_memory = gpu_memory[0] if len(gpu_memory) > 0 else 0
- system_info = f"""
- *Memoria: {memory.total / (1024 * 1024 * 1024):.2f}GB, utilizado: {memory.percent}%, disponivel: {memory.available / (1024 * 1024 * 1024):.2f}GB.*
- *Tempo de processamento: {time_diff:.5} segundos.*
- *Utilização de GPU: {gpu_utilization}%, Memoria de GPU: {gpu_memory}MiB.*
- """
- save_path = "output/transcript_result.csv"
- df_results = pd.DataFrame(objects)
- df_results.to_csv(save_path, index=False, encoding="utf-8")
- return df_results, system_info, save_path, summary, emotions
-
+ objects['End'].append(str(convert_time(segments[i - 1]["end"])))
+ objects['Text'].append(text)
+
+ time_end = time.time()
+ time_diff = time_end - time_start
+ memory = psutil.virtual_memory()
+ gpu_utilization, gpu_memory = GPUInfo.gpu_usage()
+ gpu_utilization = gpu_utilization[0] if len(gpu_utilization) > 0 else 0
+ gpu_memory = gpu_memory[0] if len(gpu_memory) > 0 else 0
+ system_info = f"""
+ *Memoria: {memory.total / (1024 * 1024 * 1024):.2f}GB, utilizado: {memory.percent}%, disponivel: {memory.available / (1024 * 1024 * 1024):.2f}GB.*
+ *Tempo de processamento: {time_diff:.5} segundos.*
+ *Utilização de GPU: {gpu_utilization}%, Memoria de GPU: {gpu_memory}MiB.*
+ """
+ save_path = "output/transcript_result.csv"
+ df_results = pd.DataFrame(objects)
+ df_results.to_csv(save_path, index=False, encoding="utf-8")
+ return df_results, system_info, save_path

  except Exception as e:
  raise RuntimeError("Erro a correr a inferência com um modelo local", e)
@@ -321,8 +358,6 @@ memory = psutil.virtual_memory()
  selected_source_lang = gr.Dropdown(choices=source_language_list, type="value", value="pt", label="Linguagem detectada no vídeo", interactive=True)
  selected_whisper_model = gr.Dropdown(choices=whisper_models, type="value", value="large-v2", label="Modelo Whisper selecionado", interactive=True)
  number_speakers = gr.Number(precision=0, value=2, label="Insira o número de participantes para obter melhores resultados. Se o valor for 0, o modelo encontrará automaticamente a melhor quantidade.", interactive=True)
- summary_text = gr.Textbox(label="Resumo da Transcrição", readonly=True)
- emotion_analysis_text = gr.Textbox(label="Análise de Emoções", readonly=True)
  system_info = gr.Markdown(f"*Memoria: {memory.total / (1024 * 1024 * 1024):.2f}GB, utilizado: {memory.percent}%, disponível: {memory.available / (1024 * 1024 * 1024):.2f}GB*")
  download_transcript = gr.File(label="Download transcript")
  transcription_df = gr.DataFrame(value=df_init,label="Dataframe da transcrição", row_count=(0, "dynamic"), max_rows = 10, wrap=True, overflow_row_behaviour='paginate')
@@ -383,9 +418,9 @@ with demo:
  number_speakers.render()
  transcribe_btn = gr.Button("Transcrever audio com diarização")
  transcribe_btn.click(speech_to_text,
- [video_in, selected_source_lang, selected_whisper_model, number_speakers],
- [transcription_df, system_info, download_transcript, summary_text, emotion_analysis_text]
- )
+ [video_in, selected_source_lang, selected_whisper_model, number_speakers],
+ [transcription_df, system_info, download_transcript]
+ )

  with gr.Row():
  gr.Markdown('''
@@ -399,9 +434,8 @@ with demo:
  transcription_df.render()
  system_info.render()

- with gr.Row():
- summary_text.render()
- emotion_analysis_text.render()
+


- demo.launch(debug=True)
+ demo.launch(debug=True)
+
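For reference, the speaker-count search this commit restores in speech_to_text — cluster the per-segment embeddings for k = 2..10 and keep the k with the highest silhouette score — can be exercised in isolation. A minimal sketch, with random 192-dimensional vectors standing in for the speechbrain/spkrec-ecapa-voxceleb embeddings the app actually computes:

import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

rng = np.random.default_rng(0)
embeddings = rng.normal(size=(40, 192))  # placeholder for real per-segment speaker embeddings

score_num_speakers = {}
for k in range(2, 10 + 1):
    labels = AgglomerativeClustering(k).fit(embeddings).labels_
    score_num_speakers[k] = silhouette_score(embeddings, labels, metric="euclidean")

best_num_speaker = max(score_num_speakers, key=score_num_speakers.get)
print(f"Estimated speakers: {best_num_speaker} (silhouette {score_num_speakers[best_num_speaker]:.3f})")

On random vectors the score is meaningless; the sketch only shows the selection loop that replaces the single-expression version removed by this commit.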
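The Start/End/Speaker/Text table is built by collapsing consecutive segments that share a speaker label into one turn. A self-contained sketch of that grouping, with hard-coded segments and a trivial formatter standing in for the app's convert_time helper:

import pandas as pd

segments = [
    {"start": 0.0, "end": 2.1, "speaker": "Participante 1", "text": "Olá."},
    {"start": 2.1, "end": 4.0, "speaker": "Participante 1", "text": "Tudo bem?"},
    {"start": 4.0, "end": 6.5, "speaker": "Participante 2", "text": "Tudo, obrigado."},
]

def fmt(seconds):  # stand-in for convert_time
    return f"{seconds:.1f}s"

objects = {"Start": [], "End": [], "Speaker": [], "Text": []}
text = ""
for i, segment in enumerate(segments):
    # a new row starts whenever the speaker changes
    if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
        objects["Start"].append(fmt(segment["start"]))
        objects["Speaker"].append(segment["speaker"])
        if i != 0:
            objects["End"].append(fmt(segments[i - 1]["end"]))
            objects["Text"].append(text)
            text = ""
    text += segment["text"] + " "
objects["End"].append(fmt(segments[-1]["end"]))
objects["Text"].append(text)

print(pd.DataFrame(objects))

The real function then writes this frame to output/transcript_result.csv and returns it to the Gradio DataFrame and download components.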
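The WAV conversion still goes through os.system with the ffmpeg flags shown in the diff (16 kHz, mono, 16-bit PCM). A possible alternative, not what the committed code does, is subprocess.run, which avoids shell quoting of the file paths and raises if ffmpeg exits with a non-zero status:

import subprocess

def to_wav(video_file_path: str, audio_file: str) -> None:
    # -y overwrites an existing output file; the remaining flags match the commit
    subprocess.run(
        ["ffmpeg", "-y", "-i", video_file_path,
         "-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le", audio_file],
        check=True,
    )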