Spaces:

alakxender
/

dhivehi-mms-demo

Running on Zero

App Files Files Community

alakxender committed on Feb 7

Commit

6430b7c

1 Parent(s): 0c6a355

i

Browse files

Files changed (2) hide show

app.py +86 -4
requirements.txt +3 -0

app.py CHANGED Viewed

@@ -1,7 +1,89 @@
 import gradio as gr
-def greet(name):
-    return "Hello " + name + "!!"
-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
-demo.launch()

 import gradio as gr
+from transformers import Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM
+import torch
+import torchaudio
+import numpy as np
+# Device and dtype configuration
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+# Load model and processor with LM
+processor = Wav2Vec2ProcessorWithLM.from_pretrained("alakxender/wav2vec2-large-mms-1b-dv-syn-md")
+model = Wav2Vec2ForCTC.from_pretrained(
+    "alakxender/wav2vec2-large-mms-1b-dv-syn-md",
+    torch_dtype=torch_dtype
+).to(device)
+MAX_LENGTH = 120 # 2 minutes
+MIN_LENGTH = 1 # 1 second
+def transcribe(audio_file):
+    try:
+        # Load audio file
+        waveform, sample_rate = torchaudio.load(audio_file)
+        # Move waveform to the correct device
+        waveform = waveform.to(device)
+        # Get the duration of the audio
+        duration = waveform.shape[1] / sample_rate
+        # Check if the audio is too short or too long
+        if duration < MIN_LENGTH or duration > MAX_LENGTH:
+            return f"Audio duration is too short or too long. Duration: {duration} seconds"
+        # Resample if necessary
+        if sample_rate != 16000:
+            resampler = torchaudio.transforms.Resample(sample_rate, 16000).to(device)
+            waveform = resampler(waveform)
+        # Convert to mono if stereo
+        if waveform.shape[0] > 1:
+            waveform = waveform.mean(dim=0, keepdim=True)
+        # Move to CPU for numpy conversion
+        waveform = waveform.cpu()
+        audio_input = waveform.squeeze().numpy()
+        # Ensure audio input is float32
+        if audio_input.dtype != np.float32:
+            audio_input = audio_input.astype(np.float32)
+        # Process audio input
+        input_values = processor(
+            audio_input,
+            sampling_rate=16_000,
+            return_tensors="pt"
+        ).input_values.to(device)
+        # Convert to float16 if using CUDA
+        if torch_dtype == torch.float16:
+            input_values = input_values.half()
+        # Generate transcription
+        with torch.no_grad():
+            logits = model(input_values).logits
+        # Use language model for decoding
+        transcription = processor.decode(logits[0].cpu().numpy())
+        # Return the transcription in lowercase
+        print(transcription)
+        return transcription[0].lower()
+    except Exception as e:
+        return f"Error during transcription: {str(e)}"
+# Create Gradio interface
+iface = gr.Interface(
+    fn=transcribe,
+    inputs=gr.Audio(type="filepath"),
+    outputs="text",
+    title="Dhivehi Speech Recognition with Language Model",
+    description="Upload an audio file to transcribe Dhivehi speech to text using language model enhanced decoding."
+)
+# Launch the interface
+if __name__ == "__main__":
+    iface.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+transformers
+torchaudio
+pyctcdecode