akadriu committed
Commit 1b6f227 · verified · 1 Parent(s): fd885a8

Update app.py

Files changed (1)
  1. app.py +38 -41
app.py CHANGED
@@ -1,49 +1,46 @@
 import os
-from transformers import WhisperProcessor, WhisperForConditionalGeneration
+from transformers import pipeline
 import gradio as gr
-import librosa
-import numpy as np
+
 
 # Fetch the token from the environment
 hf_token = os.getenv("HUGGINGFACE_HUB_TOKEN")
+model_id = "akadriu/whisper-medium-sq" # update with your model id
+pipe = pipeline("automatic-speech-recognition", model=model_id, token=hf_token)
+
+def transcribe_speech(filepath):
+    output = pipe(
+        filepath,
+        max_new_tokens=256,
+        generate_kwargs={
+            "task": "transcribe",
+            "language": "albanian",
+        }, # update with the language you've fine-tuned on
+        chunk_length_s=30,
+        batch_size=8,
+    )
+    return output["text"]
+
+import gradio as gr
 
-# Load the processor and model using the token for authentication
-processor = WhisperProcessor.from_pretrained("akadriu/whisper-medium-sq", token=hf_token)
-model = WhisperForConditionalGeneration.from_pretrained("akadriu/whisper-medium-sq", token=hf_token)
-
-def transcribe(audio):
-    if isinstance(audio, tuple):
-        # Gradio provides audio as (sample_rate, data) when using the microphone
-        sr, audio_input = audio
-    else:
-        # Load the file if it's a filepath
-        audio_input, sr = librosa.load(audio, sr=16000)
-
-    # Convert audio to floating-point if necessary
-    if audio_input.dtype != np.float32:
-        audio_input = audio_input.astype(np.float32)
-
-    # Resample if the sample rate is not 16000
-    if sr != 16000:
-        audio_input = librosa.resample(audio_input, orig_sr=sr, target_sr=16000)
-
-    # Process and transcribe the audio
-    input_features = processor(audio_input, sampling_rate=16000, return_tensors="pt").input_features
-
-    # Generate predictions
-    predicted_ids = model.generate(input_features)
-    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
-
-    text = transcription
-    return text
-
-# Create the Gradio interface
-iface = gr.Interface(
-    fn=transcribe,
-    inputs=gr.Audio(),
-    outputs="text",
-    title="Whisper Medium Shqip",
-    description="Realtime demo for Sq speech recognition using a fine-tuned Whisper medium model.",
+demo = gr.Blocks()
+
+mic_transcribe = gr.Interface(
+    fn=transcribe_speech,
+    inputs=gr.Audio(sources="microphone", type="filepath"),
+    outputs=gr.outputs.Textbox(),
+)
+
+file_transcribe = gr.Interface(
+    fn=transcribe_speech,
+    inputs=gr.Audio(sources="upload", type="filepath"),
+    outputs=gr.outputs.Textbox(),
 )
 
-iface.launch(share=True)
+with demo:
+    gr.TabbedInterface(
+        [mic_transcribe, file_transcribe],
+        ["Transcribe Microphone", "Transcribe Audio File"],
+    )
+
+demo.launch(debug=True)
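One compatibility note on the new app.py: it builds the interface outputs with gr.outputs.Textbox(), which was deprecated in Gradio 3.x and removed in Gradio 4.0, while the sources= keyword on gr.Audio is the Gradio 4 spelling. The sketch below is only an illustration of how the same two-tab demo could be written against a Gradio 4.x runtime with a recent transformers release; it is not the committed code. The two assumed changes are gr.Textbox() in place of gr.outputs.Textbox() and sources passed as a list; everything else mirrors the lines added in this commit.

# Illustrative sketch only; not the committed app.py. Assumes Gradio 4.x
# (where the gradio.outputs module no longer exists) and a recent transformers.
import os

import gradio as gr
from transformers import pipeline

# Token from the environment; only needed if the model repo is private or gated.
hf_token = os.getenv("HUGGINGFACE_HUB_TOKEN")

model_id = "akadriu/whisper-medium-sq"
pipe = pipeline("automatic-speech-recognition", model=model_id, token=hf_token)


def transcribe_speech(filepath):
    # The pipeline decodes the file, resamples it to 16 kHz, and transcribes it
    # in 30-second chunks, so no manual librosa/numpy preprocessing is needed.
    output = pipe(
        filepath,
        generate_kwargs={
            "task": "transcribe",
            "language": "albanian",
            "max_new_tokens": 256,
        },
        chunk_length_s=30,
        batch_size=8,
    )
    return output["text"]


demo = gr.Blocks()

mic_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs=gr.Textbox(),  # gr.Textbox replaces the removed gr.outputs.Textbox
)

file_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(sources=["upload"], type="filepath"),
    outputs=gr.Textbox(),
)

with demo:
    gr.TabbedInterface(
        [mic_transcribe, file_transcribe],
        ["Transcribe Microphone", "Transcribe Audio File"],
    )

demo.launch(debug=True)

Swapping the manual WhisperProcessor / WhisperForConditionalGeneration calls for the automatic-speech-recognition pipeline is what lets this commit drop librosa and numpy: decoding, resampling to 16 kHz, and long-form chunking are handled inside the pipeline itself.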