vakyansh commited on
Commit
647230d
·
1 Parent(s): 27cd801

Adding hindi conformer model

Browse files
Files changed (4) hide show
  1. app.py +81 -0
  2. packages.txt +2 -0
  3. pre-requirements.txt +2 -0
  4. requirements.txt +2 -0
app.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import librosa
4
+ import soundfile
5
+ import nemo.collections.asr as nemo_asr
6
+ import tempfile
7
+ import os
8
+ import uuid
9
+ import wget
10
model_url = 'https://storage.googleapis.com/vakyansh-open-models/conformer_models/hindi/filtered_v1_ssl_2022-07-08_19-43-25/Conformer-CTC-BPE-Large.nemo'
model_path = os.path.basename(model_url)  # 'Conformer-CTC-BPE-Large.nemo'

# Only fetch the checkpoint if it is not already on disk — it is large and
# wget.download() would otherwise re-download it on every app restart.
if not os.path.exists(model_path):
    wget.download(model_url)

SAMPLE_RATE = 16000  # Hz; the rate the Conformer model was trained on

# Load the local .nemo checkpoint.
# BUG FIX: from_pretrained() expects the *name* of a pretrained model hosted
# on NGC/HF, not a local file path — a downloaded .nemo archive must be loaded
# with restore_from(). ASRModel.restore_from() dispatches to the concrete
# model class recorded in the checkpoint config (here a Conformer-CTC model,
# not the RNNT class the original code used).
model = nemo_asr.models.ASRModel.restore_from(model_path)
model.change_decoding_strategy(None)  # use the checkpoint's default decoding
model.eval()  # inference mode: disable dropout etc.
19
+
20
+
21
def process_audio_file(file_path):
    """Load *file_path* as a mono float32 waveform at SAMPLE_RATE.

    Args:
        file_path: path to any audio file librosa/soundfile can decode.

    Returns:
        1-D numpy array of samples at SAMPLE_RATE, down-mixed to mono.
    """
    # BUG FIX: the original called librosa.resample(data, sr, SAMPLE_RATE)
    # with positional arguments, which were deprecated in librosa 0.8 and
    # removed in 0.10 (raises TypeError: resample() takes 0 positional
    # arguments). Passing sr= and mono= to librosa.load performs the
    # resampling and mono down-mix in a single, version-safe call.
    data, _ = librosa.load(file_path, sr=SAMPLE_RATE, mono=True)
    return data
32
+
33
+
34
def transcribe(microphone_audio, uploaded_audio):
    """Transcribe speech from the microphone recording or the uploaded file.

    When both inputs are supplied, the microphone recording takes precedence
    and a warning is prepended to the returned transcript. When neither is
    supplied, an error message is returned instead of a transcript.
    """
    warn_output = ""
    if microphone_audio and uploaded_audio:
        warn_output = ("WARNING: You've uploaded an audio file and used the microphone. "
                       "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n")

    # Guard clause: nothing to transcribe at all.
    if not microphone_audio and not uploaded_audio:
        return "ERROR: You have to either use the microphone or upload an audio file"

    # Microphone wins whenever it is present; otherwise fall back to the upload.
    audio_file = microphone_audio or uploaded_audio

    audio_data = process_audio_file(audio_file)

    with tempfile.TemporaryDirectory() as tmpdir:
        # Write the normalized waveform to a uniquely-named temporary WAV file
        # so the model can read it from disk.
        wav_path = os.path.join(tmpdir, f'audio_{uuid.uuid4()}.wav')
        soundfile.write(wav_path, audio_data, SAMPLE_RATE)

        transcriptions = model.transcribe([wav_path])

    # RNNT models return a (best_hypotheses, all_hypotheses) pair; keep only
    # the best hypotheses so indexing below works for both model families.
    if isinstance(transcriptions, tuple) and len(transcriptions) == 2:
        transcriptions = transcriptions[0]

    return warn_output + transcriptions[0]
66
+
67
+
68
# Build and launch the Gradio demo UI.
# BUG FIX: the title/description claimed an *English Conformer Transducer*,
# but the checkpoint downloaded above is Vakyansh's *Hindi Conformer-CTC*
# model (see model_url and the commit message) — labels corrected to match.
iface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.inputs.Audio(source="microphone", type='filepath', optional=True),
        gr.inputs.Audio(source="upload", type='filepath', optional=True),
    ],
    outputs="text",
    layout="horizontal",
    theme="huggingface",
    title="Vakyansh Conformer CTC Large - Hindi",
    description="Demo for Hindi speech recognition using a Conformer CTC model",
    allow_flagging='never',
)
iface.launch(enable_queue=True)
packages.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ ffmpeg
2
+ libsndfile1
pre-requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ Cython
2
+ torch
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ nemo_toolkit[asr]
2
+ wget