Spaces:

nvidia
/

multilingual-voice-4B-demo

Runtime error

App Files Files Community

huckiyang committed on Mar 11

Commit

c4dfa25

1 Parent(s): afe4a7c

last try to fix it

Browse files

Files changed (1) hide show

app.py +31 -61

app.py CHANGED Viewed

@@ -131,45 +131,21 @@ def transcribe_audio(audio_file, language):
     # Perform ASR
     text, *_ = model(speech)[0]
-    # Clean up temporary file if created
-    if isinstance(audio_file, str) and audio_file.startswith(tempfile.gettempdir()):
-        os.unlink(audio_file)
-    return text
-# New function for speech translation to English
-def translate_to_english(audio_file, source_language):
-    """Process the audio file and return the English translation"""
-    if audio_file is None:
-        return "Please upload an audio file or record audio."
-    # If audio is a tuple (from microphone recording)
-    if isinstance(audio_file, tuple):
-        sr, audio_data = audio_file
-        # Create a temporary file to save the audio
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
-            temp_path = temp_audio.name
-            sf.write(temp_path, audio_data, sr)
-            audio_file = temp_path
-    # Load and resample the audio file to 16kHz
-    speech, _ = librosa.load(audio_file, sr=16000)
-    # Set task to speech translation to English
-    model.task_sym = "<st_en>"
-    # Set source language
-    if source_language != None:
-        model.lang_sym = source_language
-    # Perform speech translation
-    translation, *_ = model(speech)[0]
     # Clean up temporary file if created
     if isinstance(audio_file, str) and audio_file.startswith(tempfile.gettempdir()):
         os.unlink(audio_file)
-    return translation
 # Function to handle English transcription
 def transcribe_english(audio_file):
@@ -184,22 +160,22 @@ def transcribe_chinese(audio_file, chinese_variant="Traditional"):
         audio_file: Path to the audio file
         chinese_variant: Either "Simplified" or "Traditional"
     """
-    # First get the base transcription
-    asr_text = transcribe_audio(audio_file, "<zho>")
     # Convert between simplified and traditional Chinese if needed
     if chinese_variant == "Traditional":
         # Convert simplified to traditional
         # Use s2t for more complete conversion from Simplified to Traditional
         cc = OpenCC('s2t')  # s2t
-        asr_text = cc.convert(asr_text)
-    elif chinese_variant == "Simplified" and not asr_text.isascii():
         # If the text contains non-ASCII characters, it might be traditional
         # Convert traditional to simplified just to be safe
         cc = OpenCC('t2s')  # t2s: Traditional to Simplified
-        asr_text = cc.convert(asr_text)
-    return asr_text
 # Function to handle Japanese transcription
 def transcribe_japanese(audio_file):
@@ -309,14 +285,14 @@ with demo:
                 # Special handling for Chinese with variant selection
                 if lang == "Mandarin" and chinese_variant:
-                    transcription = transcribe_chinese(audio, chinese_variant)
                 else:
-                    transcription = transcribe_audio(audio, lang_map.get(lang, "<eng>"))
-                # Get translation if not English
-                translation = ""
-                if lang != "English":
-                    translation = translate_to_english(audio, lang_map.get(lang, "<eng>"))
                 return transcription, translation, gr.update(visible=(lang != "English"))
@@ -413,9 +389,7 @@ with demo:
             # Update the click function to include the Chinese variant and translation
             def transcribe_chinese_with_variant(audio_file, variant):
-                transcription = transcribe_chinese(audio_file, variant)
-                translation = translate_to_english(audio_file, "<zho>")
-                return transcription, translation
             zh_button.click(
                 fn=transcribe_chinese_with_variant,
@@ -457,8 +431,7 @@ with demo:
                 )
             def transcribe_and_translate_japanese(audio_file):
-                transcription = transcribe_japanese(audio_file)
-                translation = translate_to_english(audio_file, "<jpn>")
                 return transcription, translation
             jp_button.click(
@@ -501,8 +474,8 @@ with demo:
                 )
             def transcribe_and_translate_korean(audio_file):
-                transcription = transcribe_korean(audio_file)
-                translation = translate_to_english(audio_file, "<kor>")
                 return transcription, translation
             kr_button.click(
@@ -545,8 +518,7 @@ with demo:
                 )
             def transcribe_and_translate_thai(audio_file):
-                transcription = transcribe_thai(audio_file)
-                translation = translate_to_english(audio_file, "<tha>")
                 return transcription, translation
             th_button.click(
@@ -589,8 +561,7 @@ with demo:
                 )
             def transcribe_and_translate_italian(audio_file):
-                transcription = transcribe_italian(audio_file)
-                translation = translate_to_english(audio_file, "<ita>")
                 return transcription, translation
             it_button.click(
@@ -633,8 +604,7 @@ with demo:
                 )
             def transcribe_and_translate_german(audio_file):
-                transcription = transcribe_german(audio_file)
-                translation = translate_to_english(audio_file, "<deu>")
                 return transcription, translation
             de_button.click(

     # Perform ASR
     text, *_ = model(speech)[0]
+    # Also get translation to English if not already in English
+    translation = ""
+    if language != "<eng>":
+        # Set task to speech translation to English
+        model.task_sym = "<st_en>"
+        # Keep the source language the same
+        # Perform speech translation
+        translation, *_ = model(speech)[0]
     # Clean up temporary file if created
     if isinstance(audio_file, str) and audio_file.startswith(tempfile.gettempdir()):
         os.unlink(audio_file)
+    return text, translation
 # Function to handle English transcription
 def transcribe_english(audio_file):
         audio_file: Path to the audio file
         chinese_variant: Either "Simplified" or "Traditional"
     """
+    # First get the base transcription and translation
+    text, translation = transcribe_audio(audio_file, "<zho>")
     # Convert between simplified and traditional Chinese if needed
     if chinese_variant == "Traditional":
         # Convert simplified to traditional
         # Use s2t for more complete conversion from Simplified to Traditional
         cc = OpenCC('s2t')  # s2t
+        text = cc.convert(text)
+    elif chinese_variant == "Simplified" and not text.isascii():
         # If the text contains non-ASCII characters, it might be traditional
         # Convert traditional to simplified just to be safe
         cc = OpenCC('t2s')  # t2s: Traditional to Simplified
+        text = cc.convert(text)
+    return text, translation
 # Function to handle Japanese transcription
 def transcribe_japanese(audio_file):
                 # Special handling for Chinese with variant selection
                 if lang == "Mandarin" and chinese_variant:
+                    transcription, translation = transcribe_chinese(audio, chinese_variant)
                 else:
+                    result = transcribe_audio(audio, lang_map.get(lang, "<eng>"))
+                    if lang == "English":
+                        transcription = result
+                        translation = ""
+                    else:
+                        transcription, translation = result
                 return transcription, translation, gr.update(visible=(lang != "English"))
             # Update the click function to include the Chinese variant and translation
             def transcribe_chinese_with_variant(audio_file, variant):
+                return transcribe_chinese(audio_file, variant)
             zh_button.click(
                 fn=transcribe_chinese_with_variant,
                 )
             def transcribe_and_translate_japanese(audio_file):
+                transcription, translation = transcribe_japanese(audio_file)
                 return transcription, translation
             jp_button.click(
                 )
             def transcribe_and_translate_korean(audio_file):
+                transcription, translation = transcribe_korean(audio_file)
                 return transcription, translation
             kr_button.click(
                 )
             def transcribe_and_translate_thai(audio_file):
+                transcription, translation = transcribe_thai(audio_file)
                 return transcription, translation
             th_button.click(
                 )
             def transcribe_and_translate_italian(audio_file):
+                transcription,translation = transcribe_italian(audio_file)
                 return transcription, translation
             it_button.click(
                 )
             def transcribe_and_translate_german(audio_file):
+                transcription, translation = transcribe_german(audio_file)
                 return transcription, translation
             de_button.click(