akadriu committed on
Commit
864b253
·
verified ·
1 Parent(s): 78ed731

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +9 -4
app.py CHANGED
@@ -14,9 +14,13 @@ def transcribe(audio):
14
  # If audio is a tuple, extract the NumPy array and the sampling rate
15
  if isinstance(audio, tuple):
16
  audio_input, sr = audio
 
 
 
17
  else:
18
  # Otherwise, load the file from the path
19
- audio_input, sr = librosa.load(audio, sr=16000)
 
20
 
21
  # Ensure the sample rate is what the processor expects
22
  if sr != 16000:
@@ -27,12 +31,13 @@ def transcribe(audio):
27
  predicted_ids = model.generate(input_features)
28
  transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
29
  text = transcription[0] # Decode returns a list
30
- return text
 
31
 
32
  iface = gr.Interface(
33
  fn=transcribe,
34
- inputs=gr.Audio(), # No need for 'source' or 'type'
35
- outputs="text",
36
  title="Whisper Medium Shqip",
37
  description="Realtime demo for Sq speech recognition using a fine-tuned Whisper medium model.",
38
  )
 
14
  # If audio is a tuple, extract the NumPy array and the sampling rate
15
  if isinstance(audio, tuple):
16
  audio_input, sr = audio
17
+ # Save the audio back to a file for playback
18
+ audio_file = "temp.wav"
19
+ librosa.output.write_wav(audio_file, audio_input, sr)
20
  else:
21
  # Otherwise, load the file from the path
22
+ audio_file = audio
23
+ audio_input, sr = librosa.load(audio_file, sr=16000)
24
 
25
  # Ensure the sample rate is what the processor expects
26
  if sr != 16000:
 
31
  predicted_ids = model.generate(input_features)
32
  transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
33
  text = transcription[0] # Decode returns a list
34
+
35
+ return audio_file, text
36
 
37
  iface = gr.Interface(
38
  fn=transcribe,
39
+ inputs=gr.Audio(), # Audio input
40
+ outputs=[gr.Audio(), "text"], # Return the audio file and transcription text
41
  title="Whisper Medium Shqip",
42
  description="Realtime demo for Sq speech recognition using a fine-tuned Whisper medium model.",
43
  )