akadriu committed
Commit 1b6f227 · verified · 1 Parent(s): fd885a8

Update app.py

Files changed (1)
  1. app.py +38 -41
app.py CHANGED
@@ -1,49 +1,46 @@
 import os
-from transformers import WhisperProcessor, WhisperForConditionalGeneration
+from transformers import pipeline
 import gradio as gr
-import librosa
-import numpy as np
+
 
 # Fetch the token from the environment
 hf_token = os.getenv("HUGGINGFACE_HUB_TOKEN")
+model_id = "akadriu/whisper-medium-sq" # update with your model id
+pipe = pipeline("automatic-speech-recognition", model=model_id, token=hf_token)
+
+def transcribe_speech(filepath):
+    output = pipe(
+        filepath,
+        max_new_tokens=256,
+        generate_kwargs={
+            "task": "transcribe",
+            "language": "albanian",
+        }, # update with the language you've fine-tuned on
+        chunk_length_s=30,
+        batch_size=8,
+    )
+    return output["text"]
+
+import gradio as gr
 
-# Load the processor and model using the token for authentication
-processor = WhisperProcessor.from_pretrained("akadriu/whisper-medium-sq", token=hf_token)
-model = WhisperForConditionalGeneration.from_pretrained("akadriu/whisper-medium-sq", token=hf_token)
-
-def transcribe(audio):
-    if isinstance(audio, tuple):
-        # Gradio provides audio as (sample_rate, data) when using the microphone
-        sr, audio_input = audio
-    else:
-        # Load the file if it's a filepath
-        audio_input, sr = librosa.load(audio, sr=16000)
-
-    # Convert audio to floating-point if necessary
-    if audio_input.dtype != np.float32:
-        audio_input = audio_input.astype(np.float32)
-
-    # Resample if the sample rate is not 16000
-    if sr != 16000:
-        audio_input = librosa.resample(audio_input, orig_sr=sr, target_sr=16000)
-
-    # Process and transcribe the audio
-    input_features = processor(audio_input, sampling_rate=16000, return_tensors="pt").input_features
-
-    # Generate predictions
-    predicted_ids = model.generate(input_features)
-    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
-
-    text = transcription
-    return text
-
-# Create the Gradio interface
-iface = gr.Interface(
-    fn=transcribe,
-    inputs=gr.Audio(),
-    outputs="text",
-    title="Whisper Medium Shqip",
-    description="Realtime demo for Sq speech recognition using a fine-tuned Whisper medium model.",
+demo = gr.Blocks()
+
+mic_transcribe = gr.Interface(
+    fn=transcribe_speech,
+    inputs=gr.Audio(sources="microphone", type="filepath"),
+    outputs=gr.outputs.Textbox(),
+)
+
+file_transcribe = gr.Interface(
+    fn=transcribe_speech,
+    inputs=gr.Audio(sources="upload", type="filepath"),
+    outputs=gr.outputs.Textbox(),
 )
 
-iface.launch(share=True)
+with demo:
+    gr.TabbedInterface(
+        [mic_transcribe, file_transcribe],
+        ["Transcribe Microphone", "Transcribe Audio File"],
+    )
+
+demo.launch(debug=True)
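One compatibility note on the new app.py: it builds the interface outputs with gr.outputs.Textbox(), which was deprecated in Gradio 3.x and removed in Gradio 4.0, while the sources= keyword on gr.Audio is the Gradio 4 spelling. The sketch below is only an illustration of how the same two-tab demo could be written against a Gradio 4.x runtime with a recent transformers release; it is not the committed code. The two assumed changes are gr.Textbox() in place of gr.outputs.Textbox() and sources passed as a list; everything else mirrors the lines added in this commit.

# Illustrative sketch only; not the committed app.py. Assumes Gradio 4.x
# (where the gradio.outputs module no longer exists) and a recent transformers.
import os

import gradio as gr
from transformers import pipeline

# Token from the environment; only needed if the model repo is private or gated.
hf_token = os.getenv("HUGGINGFACE_HUB_TOKEN")

model_id = "akadriu/whisper-medium-sq"
pipe = pipeline("automatic-speech-recognition", model=model_id, token=hf_token)


def transcribe_speech(filepath):
    # The pipeline decodes the file, resamples it to 16 kHz, and transcribes it
    # in 30-second chunks, so no manual librosa/numpy preprocessing is needed.
    output = pipe(
        filepath,
        generate_kwargs={
            "task": "transcribe",
            "language": "albanian",
            "max_new_tokens": 256,
        },
        chunk_length_s=30,
        batch_size=8,
    )
    return output["text"]


demo = gr.Blocks()

mic_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs=gr.Textbox(),  # gr.Textbox replaces the removed gr.outputs.Textbox
)

file_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(sources=["upload"], type="filepath"),
    outputs=gr.Textbox(),
)

with demo:
    gr.TabbedInterface(
        [mic_transcribe, file_transcribe],
        ["Transcribe Microphone", "Transcribe Audio File"],
    )

demo.launch(debug=True)

Swapping the manual WhisperProcessor / WhisperForConditionalGeneration calls for the automatic-speech-recognition pipeline is what lets this commit drop librosa and numpy: decoding, resampling to 16 kHz, and long-form chunking are handled inside the pipeline itself.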