sepal committed on
Commit 60bd8ec · 1 Parent(s): 2b1b04f

Change architecture and implement basic demo

Files changed (1)
  1. app.py +41 -30
app.py CHANGED
@@ -2,11 +2,13 @@ import os
 from dotenv import load_dotenv
 import gradio as gr
 import numpy as np
+import pandas as pd
 import torch
 from pyannote.audio import Pipeline
 from pydub import AudioSegment
 from mimetypes import MimeTypes
 import whisper
+import tempfile
 
 load_dotenv()
 
@@ -21,37 +23,20 @@ You need to set it via an .env or environment variable HG_ACCESS_TOKEN'''
     exit(1)
 
 
-def diarization(audio_file: tuple[int, np.array]) -> np.array:
+def diarization(audio) -> pd.DataFrame:
     """
-    Receives a tuple with the sample rate and audio data and returns the
-    a numpy array containing the audio segments, track names and speakers for
-    each segment.
+    Receives a pydub AudioSegment and returns a pandas DataFrame with all segments.
     """
-    waveform = torch.tensor(audio_file[1].astype(np.float32, order='C')).reshape(1,-1)
-    audio_data = {
-        "waveform": waveform,
-        "sample_rate": audio_file[0]
-    }
+    audio.export("/tmp/dz.wav", format="wav")
+    diarization = pipeline("/tmp/dz.wav")
+    return pd.DataFrame(list(diarization.itertracks(yield_label=True)), columns=["Segment", "Trackname", "Speaker"])
 
-    diarization = pipeline(audio_data)
-
-    return np.array(list(diarization.itertracks(yield_label=True)))
-
-def combine_segments(segments: np.array) -> np.array:
-    new_arr = []
-    prev_label = None
-    for row in segments:
-        if prev_label is None or row[2] != prev_label:
-            new_arr.append(row)
-            prev_label = row[2]
-        else:
-            new_arr[-1][0] = new_arr[-1][0] | row[0]
-            new_arr[-1][1] = new_arr[-1][1]
-            new_arr[-1][2] = prev_label
-    return np.array(new_arr)
 
-def split_audio(audio_file: tuple[int, np.array], segments):
-    pass
+def combine_segments(df):
+    grouped_df = df.groupby((df['Speaker'] != df['Speaker'].shift()).cumsum())
+    return grouped_df.agg({'Segment': lambda x: x.min() | x.max(),
+                           'Trackname': 'first',
+                           'Speaker': 'first'})
 
 
 def prep_audio(audio_segment):
@@ -63,11 +48,37 @@ def prep_audio(audio_segment):
     audio_data = audio_segment.set_channels(1).set_frame_rate(16000)
     return np.array(audio_data.get_array_of_samples()).flatten().astype(np.float32) / 32768.0
 
+def transcribe_row(row, audio):
+    segment = audio[row.start_ms:row.end_ms]
+    data = prep_audio(segment)
+    return whisper_ml.transcribe(data)['text']
+
+
+def combine_transcription(segments):
+    text = ""
+    for _, row in segments.iterrows():
+        text += f"[{row.Speaker}]: {row.text}\n"
+
+    return text
+
 def transcribe(audio_file: str) -> str:
     audio = AudioSegment.from_file(audio_file)
-
-    audio_data = prep_audio(audio)
-    return whisper_ml.transcribe(audio_data)['text']
+    print("diarization")
+    df = diarization(audio)
+
+    print("combining segments")
+    df = combine_segments(df)
+
+    df['start'] = df.Segment.apply(lambda x: x.start)
+    df['end'] = df.Segment.apply(lambda x: x.end)
+
+    df['start_ms'] = df.Segment.apply(lambda x: int(x.start*1000))
+    df['end_ms'] = df.Segment.apply(lambda x: int(x.end*1000))
+
+    print("transcribing segments")
+    df['text'] = df.apply(lambda x: transcribe_row(x, audio), axis=1)
+
+    return combine_transcription(df)
 
 
 demo = gr.Interface(
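
The least obvious step in the new pipeline is how combine_segments() merges consecutive rows that share a speaker: comparing each Speaker value with its predecessor via shift() and taking the cumulative sum of the change flags produces a run id that groupby() can aggregate over. Below is a minimal sketch of that idiom on made-up diarization rows, with plain start/end floats and placeholder transcript text standing in for the pyannote Segment objects and Whisper output used in the actual app:

import pandas as pd

# Hypothetical diarization output: four segments, the first two from the same speaker.
df = pd.DataFrame({
    "start":   [0.0, 2.1, 3.0, 5.4],
    "end":     [2.1, 3.0, 5.4, 6.2],
    "Speaker": ["SPEAKER_00", "SPEAKER_00", "SPEAKER_01", "SPEAKER_00"],
})

# A new run starts whenever the speaker changes; cumsum() turns the change
# flags into run ids (1, 1, 2, 3), which groupby() then collapses.
run_id = (df["Speaker"] != df["Speaker"].shift()).cumsum()
merged = df.groupby(run_id).agg({"start": "min", "end": "max", "Speaker": "first"})

# Placeholder text instead of per-segment Whisper output, formatted the same
# way combine_transcription() builds the final transcript.
merged["text"] = ["hello there", "hi, how are you", "fine, thanks"]
print("".join(f"[{row.Speaker}]: {row.text}\n" for _, row in merged.iterrows()))

In the committed code the Segment column holds pyannote Segment objects, so a run is merged with x.min() | x.max() (segment union) rather than separate start/end aggregations.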