sepal committed
Commit 2b1b04f · 1 Parent(s): f8052a8

Switch to pydub audio conversion and implement basic transcription

Files changed (1):
  1. app.py +17 -5
app.py CHANGED
@@ -6,6 +6,7 @@ import torch
 from pyannote.audio import Pipeline
 from pydub import AudioSegment
 from mimetypes import MimeTypes
+import whisper
 
 load_dotenv()
 
@@ -13,6 +14,7 @@ hg_token = os.getenv("HG_ACCESS_TOKEN")
13
 
14
  if hg_token != None:
15
  pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=hg_token)
 
16
  else:
17
  print('''No hugging face access token set.
18
  You need to set it via an .env or environment variable HG_ACCESS_TOKEN''')
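For reference, a minimal sketch of the openai-whisper calls this commit relies on; "sample.wav" is a placeholder path, not a file from this repo:

import whisper

model = whisper.load_model("base")       # downloads and caches the checkpoint on first use
result = model.transcribe("sample.wav")  # accepts a file path or a 16 kHz float32 numpy array
print(result["text"])                    # result is a dict; it also carries "segments" and "language"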
@@ -52,15 +54,25 @@ def split_audio(audio_file: tuple[int, np.array], segments):
     pass
 
 
-def transcribe(audio_file: tuple[int, np.array]) -> str:
-    segments = diarization(audio_file)
-    segments = combine_segments(segments)
-    return segments
+def prep_audio(audio_segment):
+    """
+    This function preps a pydub AudioSegment for a ml model.
+
+    Both pyannote audio and whisper require mono audio with a 16khz rate as float32.
+    """
+    audio_data = audio_segment.set_channels(1).set_frame_rate(16000)
+    return np.array(audio_data.get_array_of_samples()).flatten().astype(np.float32) / 32768.0
+
+def transcribe(audio_file: str) -> str:
+    audio = AudioSegment.from_file(audio_file)
+
+    audio_data = prep_audio(audio)
+    return whisper_ml.transcribe(audio_data)['text']
 
 
 demo = gr.Interface(
     fn=transcribe,
-    inputs=gr.Audio(type="numpy"),
+    inputs=gr.Audio(type="filepath"),
     outputs="text",
 )
 
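The 32768.0 divisor in prep_audio assumes 16-bit source samples. A sketch of the same conversion outside the app, with "speech.mp3" as a hypothetical input:

from pydub import AudioSegment
import numpy as np

seg = AudioSegment.from_file("speech.mp3")        # any format ffmpeg can decode
mono = seg.set_channels(1).set_frame_rate(16000)  # downmix and resample, as prep_audio does
assert mono.sample_width == 2                     # int16 samples; other widths need another scale
samples = np.array(mono.get_array_of_samples())
audio = samples.astype(np.float32) / 32768.0      # int16 range [-32768, 32767] -> [-1.0, 1.0)

If a decoded file arrived as 8-bit or 32-bit audio, sample_width would differ and the fixed 32768.0 scale would be off; pydub exposes sample_width to guard against that.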
 
 
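The input change from type="numpy" to type="filepath" alters what Gradio hands the callback: a temp-file path (str) instead of a (sample_rate, numpy array) tuple, which is what lets AudioSegment.from_file do the decoding. A minimal sketch; echo_path is a hypothetical stand-in for transcribe:

import gradio as gr

def echo_path(path: str) -> str:
    # with type="filepath", Gradio passes the uploaded or recorded audio
    # as a temporary file path rather than a (sample_rate, array) tuple
    return f"got: {path}"

demo = gr.Interface(fn=echo_path, inputs=gr.Audio(type="filepath"), outputs="text")
# demo.launch()  # uncomment to serve the interface locally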