Norphel committed on
Commit 44c8041 · verified · 1 Parent(s): bb5cc1d

Update app.py

Files changed (1)
  1. app.py +86 -52
app.py CHANGED
@@ -1,59 +1,93 @@
  import numpy as np
  import torch
- import gradio as gr
- from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
  import librosa

- # Load ASR model & processor
- asr_model_id = "Norphel/wav2vec2-large-mms-1b-dzo-colab"
- asr_model = Wav2Vec2ForCTC.from_pretrained(asr_model_id, target_lang="dzo")
- asr_processor = Wav2Vec2Processor.from_pretrained(asr_model_id)
- asr_processor.tokenizer.set_target_lang("dzo")

- # Use CPU if no GPU is available
  device = "cuda" if torch.cuda.is_available() else "cpu"
- asr_model.to(device)
-
- # Function to process audio & generate text
- def generate_text(audio):
-     if audio is None:
-         return "No audio received"
-
-     sr, data = audio  # Unpack the tuple (sample rate, numpy array)
-     print(f"Original sample rate: {sr}, dtype: {data.dtype}")
-
-     # Convert to float32
-     data = data.astype(np.float32)
-
-     # Resample to 16kHz if necessary
-     target_sr = 16000
-     if sr != target_sr:
-         data = librosa.resample(data, orig_sr=sr, target_sr=target_sr)
-         sr = target_sr
-
-     print(f"Processed sample rate: {sr}, dtype: {data.dtype}")
-
-     # Tokenize and run inference
-     inputs = asr_processor(data, sampling_rate=sr, return_tensors="pt", padding=True)
-
      with torch.no_grad():
-         outputs = asr_model(**inputs).logits
-     pred_ids = torch.argmax(outputs, dim=-1)[0]
-
-     # Decode the prediction
-     return asr_processor.decode(pred_ids)
-
- # Ensure we get a NumPy array from Gradio
- input_audio = gr.Audio(
-     sources=["microphone"],
-     type="numpy",  # Ensures function gets (sr, np.ndarray)
- )
-
- demo = gr.Interface(
-     fn=generate_text,
-     inputs=input_audio,
-     outputs="text"
- )
-
- if __name__ == "__main__":
-     demo.launch()

+ import streamlit as st
+ import soundfile as sf
  import numpy as np
  import torch
  import librosa
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
+ from transformers import VitsModel, AutoTokenizer
+ import tempfile

+ st.title("Dzongkha Speech-to-Text")

+ # Check if a GPU is available
  device = "cuda" if torch.cuda.is_available() else "cpu"
+ st.write(f"Using device: {device.upper()}")
+
+ # Load the model only once (for performance)
+ @st.cache_resource
+ def load_asr_model():
+     model_id = "Norphel/wav2vec2-large-mms-1b-dzo-colab"
+     model = Wav2Vec2ForCTC.from_pretrained(model_id).to(device)  # Use CPU or GPU
+     processor = Wav2Vec2Processor.from_pretrained(model_id)
+     return model, processor
+
+ @st.cache_resource
+ def load_translation_model():
+     model = AutoModelForSeq2SeqLM.from_pretrained("Norphel/Dz_en", token="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")
+     tokenizer = AutoTokenizer.from_pretrained("Norphel/Dz_en", token="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")
+     return model, tokenizer
+
+ @st.cache_resource
+ def load_tts_model():
+     model = VitsModel.from_pretrained("Norphel/MMS-TTS-Dzo-N3")
+     tokenizer = AutoTokenizer.from_pretrained("Norphel/MMS-TTS-Dzo-N3")
+     return model, tokenizer
+
+ def generate_voice(text):
+     inputs = tts_tokenizer(text, return_tensors="pt")
      with torch.no_grad():
+         output = tts_model(**inputs).waveform
+     return output
+
+ def translate(text):
+     inputs = translation_tokenizer(text, return_tensors="pt", padding=True, truncation=True).input_ids.to(device)  # Move inputs to GPU
+     translation_model.to(device)  # Move model to GPU
+     outputs = translation_model.generate(inputs, max_new_tokens=512)
+     decoded_output = translation_tokenizer.decode(outputs[0], skip_special_tokens=True)
+     return decoded_output
+
+ # Corrected function to load the ASR model
+ asr_model, processor = load_asr_model()
+ translation_model, translation_tokenizer = load_translation_model()
+ tts_model, tts_tokenizer = load_tts_model()
+
+ # Audio Recording Widget
+ audio_value = st.audio_input("Record a voice message")
+
+ if audio_value:
+     st.audio(audio_value, format="audio/wav")
+
+     # Save the uploaded audio to a temporary file
+     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
+         temp_file.write(audio_value.getvalue())
+         temp_filename = temp_file.name
+
+     # Read audio file using soundfile
+     with sf.SoundFile(temp_filename) as audio_file:
+         sample_rate = audio_file.samplerate
+         dtype = audio_file.subtype  # Example: PCM_16
+
+     st.write(f"Original Sample Rate: {sample_rate} Hz")
+     st.write(f"Data Type: {dtype}")
+
+     # Convert to 16kHz Float32
+     with sf.SoundFile(temp_filename) as audio_file:
+         audio_data = audio_file.read(dtype="float32")
+
+     if sample_rate != 16000:
+         audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
+
+     # Run Speech-to-Text
+     def generate_text(audio):
+         input_dict = processor(audio, sampling_rate=16000, return_tensors="pt", padding=True)
+         logits = asr_model(input_dict.input_values.to(device)).logits
+         pred_ids = torch.argmax(logits, dim=-1)[0]
+         return processor.decode(pred_ids)
+
+     # Get Transcription
+     transcription = generate_text(audio_data)
+     translation = translate(transcription)
+     audio = generate_voice(transcription)
+     st.write(translation)
+     st.audio(audio)
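
For playback of the TTS result, st.audio generally expects a NumPy array together with an explicit sample_rate when it is given raw samples rather than a file-like object, so the waveform tensor returned by generate_voice usually needs a small conversion step. A minimal sketch is shown below; the helper name play_tts_output and the use of tts_model.config.sampling_rate are illustrative assumptions, not code from this commit.

import numpy as np
import streamlit as st

# Hypothetical helper (not part of the commit): convert the (1, num_samples)
# torch waveform from VitsModel(...).waveform into a float32 NumPy array and
# pass the model's sampling rate so st.audio plays it back at the right speed.
def play_tts_output(waveform, sampling_rate):
    audio_np = waveform.squeeze().cpu().numpy().astype(np.float32)
    st.audio(audio_np, sample_rate=sampling_rate)

# Example usage inside the app (names taken from the diff above):
# play_tts_output(generate_voice(transcription), tts_model.config.sampling_rate)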