Spaces:

nvidia
/

multilingual-voice-4B-demo

Sleeping

App Files Files Community

huckiyang committed on Mar 11

Commit

d7c422e

1 Parent(s): c4dfa25

Revert "last try to fix it"

Browse files

This reverts commit c4dfa25a3ab77abdad19c048916ca0eb11509801.

Files changed (1) hide show

app.py +61 -31

app.py CHANGED Viewed

@@ -131,21 +131,45 @@ def transcribe_audio(audio_file, language):
     # Perform ASR
     text, *_ = model(speech)[0]
-    # Also get translation to English if not already in English
-    translation = ""
-    if language != "<eng>":
-        # Set task to speech translation to English
-        model.task_sym = "<st_en>"
-        # Keep the source language the same
-        # Perform speech translation
-        translation, *_ = model(speech)[0]
     # Clean up temporary file if created
     if isinstance(audio_file, str) and audio_file.startswith(tempfile.gettempdir()):
         os.unlink(audio_file)
-    return text, translation
 # Function to handle English transcription
 def transcribe_english(audio_file):
@@ -160,22 +184,22 @@ def transcribe_chinese(audio_file, chinese_variant="Traditional"):
         audio_file: Path to the audio file
         chinese_variant: Either "Simplified" or "Traditional"
     """
-    # First get the base transcription and translation
-    text, translation = transcribe_audio(audio_file, "<zho>")
     # Convert between simplified and traditional Chinese if needed
     if chinese_variant == "Traditional":
         # Convert simplified to traditional
         # Use s2t for more complete conversion from Simplified to Traditional
         cc = OpenCC('s2t')  # s2t
-        text = cc.convert(text)
-    elif chinese_variant == "Simplified" and not text.isascii():
         # If the text contains non-ASCII characters, it might be traditional
         # Convert traditional to simplified just to be safe
         cc = OpenCC('t2s')  # t2s: Traditional to Simplified
-        text = cc.convert(text)
-    return text, translation
 # Function to handle Japanese transcription
 def transcribe_japanese(audio_file):
@@ -285,14 +309,14 @@ with demo:
                 # Special handling for Chinese with variant selection
                 if lang == "Mandarin" and chinese_variant:
-                    transcription, translation = transcribe_chinese(audio, chinese_variant)
                 else:
-                    result = transcribe_audio(audio, lang_map.get(lang, "<eng>"))
-                    if lang == "English":
-                        transcription = result
-                        translation = ""
-                    else:
-                        transcription, translation = result
                 return transcription, translation, gr.update(visible=(lang != "English"))
@@ -389,7 +413,9 @@ with demo:
             # Update the click function to include the Chinese variant and translation
             def transcribe_chinese_with_variant(audio_file, variant):
-                return transcribe_chinese(audio_file, variant)
             zh_button.click(
                 fn=transcribe_chinese_with_variant,
@@ -431,7 +457,8 @@ with demo:
                 )
             def transcribe_and_translate_japanese(audio_file):
-                transcription, translation = transcribe_japanese(audio_file)
                 return transcription, translation
             jp_button.click(
@@ -474,8 +501,8 @@ with demo:
                 )
             def transcribe_and_translate_korean(audio_file):
-                transcription, translation = transcribe_korean(audio_file)
                 return transcription, translation
             kr_button.click(
@@ -518,7 +545,8 @@ with demo:
                 )
             def transcribe_and_translate_thai(audio_file):
-                transcription, translation = transcribe_thai(audio_file)
                 return transcription, translation
             th_button.click(
@@ -561,7 +589,8 @@ with demo:
                 )
             def transcribe_and_translate_italian(audio_file):
-                transcription,translation = transcribe_italian(audio_file)
                 return transcription, translation
             it_button.click(
@@ -604,7 +633,8 @@ with demo:
                 )
             def transcribe_and_translate_german(audio_file):
-                transcription, translation = transcribe_german(audio_file)
                 return transcription, translation
             de_button.click(

     # Perform ASR
     text, *_ = model(speech)[0]
     # Clean up temporary file if created
     if isinstance(audio_file, str) and audio_file.startswith(tempfile.gettempdir()):
         os.unlink(audio_file)
+    return text
+# New function for speech translation to English
def translate_to_english(audio_file, source_language):
    """Process the audio input and return its English translation.

    Args:
        audio_file: One of
            * ``None`` - nothing was provided;
            * a ``(sample_rate, samples)`` tuple (microphone recording),
              which is written to a temporary ``.wav`` file first;
            * anything else is handed to ``librosa.load`` as-is
              (the callers in this file pass a file path).
        source_language: Value assigned to ``model.lang_sym`` (the callers
            pass symbols such as ``"<zho>"``). ``None`` leaves the model's
            current language setting untouched.

    Returns:
        The first element of the model's top result, or a prompt message
        when ``audio_file`` is ``None``.
    """
    if audio_file is None:
        return "Please upload an audio file or record audio."

    # If audio is a tuple (from microphone recording), persist it to disk.
    if isinstance(audio_file, tuple):
        sr, audio_data = audio_file
        # Only reserve the name inside the ``with``; the handle is closed
        # before writing because reopening a NamedTemporaryFile by name
        # while it is still open is not portable across platforms.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
            temp_path = temp_audio.name
        sf.write(temp_path, audio_data, sr)
        audio_file = temp_path

    try:
        # Load and resample the audio file to 16kHz
        speech, _ = librosa.load(audio_file, sr=16000)

        # Set task to speech translation to English.
        # NOTE(review): task_sym/lang_sym are left modified on the shared
        # model after this call - confirm other entry points reset them.
        model.task_sym = "<st_en>"
        if source_language is not None:
            model.lang_sym = source_language

        # Perform speech translation
        translation, *_ = model(speech)[0]
    finally:
        # Clean up the temporary file even when loading or inference fails.
        # Same condition as before: any string path under the temp dir.
        if isinstance(audio_file, str) and audio_file.startswith(tempfile.gettempdir()):
            try:
                os.unlink(audio_file)
            except FileNotFoundError:
                # Already gone; there is nothing left to clean up.
                pass

    return translation
 # Function to handle English transcription
 def transcribe_english(audio_file):
         audio_file: Path to the audio file
         chinese_variant: Either "Simplified" or "Traditional"
     """
+    # First get the base transcription
+    asr_text = transcribe_audio(audio_file, "<zho>")
     # Convert between simplified and traditional Chinese if needed
     if chinese_variant == "Traditional":
         # Convert simplified to traditional
         # Use s2t for more complete conversion from Simplified to Traditional
         cc = OpenCC('s2t')  # s2t
+        asr_text = cc.convert(asr_text)
+    elif chinese_variant == "Simplified" and not asr_text.isascii():
         # If the text contains non-ASCII characters, it might be traditional
         # Convert traditional to simplified just to be safe
         cc = OpenCC('t2s')  # t2s: Traditional to Simplified
+        asr_text = cc.convert(asr_text)
+    return asr_text
 # Function to handle Japanese transcription
 def transcribe_japanese(audio_file):
                 # Special handling for Chinese with variant selection
                 if lang == "Mandarin" and chinese_variant:
+                    transcription = transcribe_chinese(audio, chinese_variant)
                 else:
+                    transcription = transcribe_audio(audio, lang_map.get(lang, "<eng>"))
+                # Get translation if not English
+                translation = ""
+                if lang != "English":
+                    translation = translate_to_english(audio, lang_map.get(lang, "<eng>"))
                 return transcription, translation, gr.update(visible=(lang != "English"))
             # Update the click function to include the Chinese variant and translation
             def transcribe_chinese_with_variant(audio_file, variant):
+                transcription = transcribe_chinese(audio_file, variant)
+                translation = translate_to_english(audio_file, "<zho>")
+                return transcription, translation
             zh_button.click(
                 fn=transcribe_chinese_with_variant,
                 )
             def transcribe_and_translate_japanese(audio_file):
+                transcription = transcribe_japanese(audio_file)
+                translation = translate_to_english(audio_file, "<jpn>")
                 return transcription, translation
             jp_button.click(
                 )
             def transcribe_and_translate_korean(audio_file):
+                transcription = transcribe_korean(audio_file)
+                translation = translate_to_english(audio_file, "<kor>")
                 return transcription, translation
             kr_button.click(
                 )
             def transcribe_and_translate_thai(audio_file):
+                transcription = transcribe_thai(audio_file)
+                translation = translate_to_english(audio_file, "<tha>")
                 return transcription, translation
             th_button.click(
                 )
             def transcribe_and_translate_italian(audio_file):
+                transcription = transcribe_italian(audio_file)
+                translation = translate_to_english(audio_file, "<ita>")
                 return transcription, translation
             it_button.click(
                 )
             def transcribe_and_translate_german(audio_file):
+                transcription = transcribe_german(audio_file)
+                translation = translate_to_english(audio_file, "<deu>")
                 return transcription, translation
             de_button.click(