akadriu committed on
Commit eda98d9 · verified · 1 Parent(s): 642de66

Update app.py

Files changed (1)
  1. app.py +22 -9
app.py CHANGED
@@ -1,30 +1,43 @@
-from transformers import pipeline
+import os
+from transformers import WhisperProcessor, WhisperForConditionalGeneration
 import gradio as gr
 import librosa
 import numpy as np
-import os
-
-from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline
 
+# Fetch the token from the environment
 hf_token = os.getenv("HUGGINGFACE_HUB_TOKEN")
+
+# Load the processor and model using the token for authentication
 processor = WhisperProcessor.from_pretrained("akadriu/whisper-medium-sq", token=hf_token)
 model = WhisperForConditionalGeneration.from_pretrained("akadriu/whisper-medium-sq", token=hf_token)
 
 def transcribe(audio):
-    audio_input, _ = librosa.load(audio, sr=16000)
+    if isinstance(audio, tuple):
+        # Gradio provides audio as (sample_rate, data) when using the microphone
+        sr, audio_input = audio
+    else:
+        # Load the file if it's a filepath
+        audio_input, sr = librosa.load(audio, sr=16000)
+
+    # Resample if the sample rate is not 16000
+    if sr != 16000:
+        audio_input = librosa.resample(audio_input, orig_sr=sr, target_sr=16000)
+
+    # Process and transcribe the audio
     input_features = processor(audio_input, sampling_rate=16000, return_tensors="pt").input_features
     predicted_ids = model.generate(input_features)
     transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
-    text = transcription
-    return text
+
+    return transcription[0]  # Decode returns a list
 
+# Create the Gradio interface
 iface = gr.Interface(
     fn=transcribe,
-    inputs=gr.Audio(type="numpy"),
+    inputs=gr.Audio(),
     outputs="text",
     title="Whisper Medium Shqip",
     description="Realtime demo for Sq speech recognition using a fine-tuned Whisper medium model.",
 )
 
-iface.launch(share=True)
+iface.launch(share=True)
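
For reference, a minimal, self-contained sketch of the input handling this commit adds to transcribe(): it mirrors the tuple-vs-filepath branching and the 16 kHz resampling without loading the Whisper model. The normalize_audio helper name and the synthetic microphone input below are illustrative, not part of the committed code.

# Sketch of the input normalization introduced in this commit (names are illustrative).
import numpy as np
import librosa

def normalize_audio(audio):
    if isinstance(audio, tuple):
        # Microphone input from Gradio arrives as (sample_rate, data)
        sr, audio_input = audio
        audio_input = np.asarray(audio_input, dtype=np.float32)
    else:
        # Filepath input: librosa resamples to 16 kHz while loading
        audio_input, sr = librosa.load(audio, sr=16000)
    # Resample anything that is not already 16 kHz, as the updated app.py does
    if sr != 16000:
        audio_input = librosa.resample(audio_input, orig_sr=sr, target_sr=16000)
    return audio_input

# One second of audio recorded at 44.1 kHz comes out as 16000 samples at 16 kHz
fake_mic_input = (44100, np.zeros(44100, dtype=np.float32))
print(normalize_audio(fake_mic_input).shape)  # (16000,)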