Spaces:

owaski-demo
/

Demo

Sleeping

App Files Community

owaski committed on Feb 28, 2023

Commit

4d5c730

1 Parent(s): 63a63e1

add azure tts

Browse files

Files changed (3) hide show

app.py +36 -25
packages.txt +5 -1
requirements.txt +2 -1

app.py CHANGED Viewed

@@ -41,29 +41,28 @@ os.system("wget https://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/emnlp2
 #           sed -i 's/package_data={/# package_data={/g' setup.py; \
 #           pip install ./; cd ..")
-os.system("git clone https://github.com/DigitalPhonetics/IMS-Toucan.git")
-sys.path.append('./IMS-Toucan')
-os.system("cd IMS-Toucan; pip install --no-cache-dir -r requirements.txt")
-os.system("python run_model_downloader.py; cd ..")
-from InferenceInterfaces.PortaSpeechInterface import PortaSpeechInterface
-cwd = os.getcwd()
-os.chdir('./IMS-Toucan')
-tts = PortaSpeechInterface(device='cpu', tts_model_path='Meta')
-os.chdir(cwd)
 # azure tts
-os.system("apt-get update")
-os.system("apt-get install build-essential libssl-dev libasound2 wget -y")
-os.system("add-apt-repository universe; \
-          apt-get install apt-transport-https -y; \
-          apt-get update -y; \
-          apt-get install dotnet-sdk-6.0 -y")
-os.system("dotnet tool install --global Microsoft.CognitiveServices.Speech.CLI")
-os.system("export PATH=\"$PATH:/home/user/.dotnet/tools\"")
-os.system("spx config @key --set 2d1847f4151f4f94ae06d0b620533936")
-os.system("spx config @region --set eastus")
 # The predict function. audio, language and mic_audio are all parameters directly passed by gradio
@@ -137,11 +136,23 @@ def predict(audio, src_language, tgt_language, mic_audio=None):
         translation = (' '.join(r.readline().split(' ')[3:])).strip()
     # tts
-    tts.set_language(tgt_language)
-    tts.read_to_file(text_list=[translation], file_location='output.wav')
     # Returns the text
-    return transcript, translation, 'output.wav'
@@ -165,8 +176,8 @@ gr.Interface(
                             'Russian',
                             'French',
                             'Detect Language'], type="value", default='English', label="Select the language of input"),
-        gr.inputs.Dropdown([# 'Arabic',
-                            # 'Chinese',
                             'English',
                             'Spanish',
                             'Russian',

 #           sed -i 's/package_data={/# package_data={/g' setup.py; \
 #           pip install ./; cd ..")
+# os.system("git clone https://github.com/DigitalPhonetics/IMS-Toucan.git")
+# sys.path.append('./IMS-Toucan')
+# os.system("cd IMS-Toucan; pip install --no-cache-dir -r requirements.txt")
+# os.system("python run_model_downloader.py; cd ..")
+# from InferenceInterfaces.PortaSpeechInterface import PortaSpeechInterface
+# cwd = os.getcwd()
+# os.chdir('./IMS-Toucan')
+# tts = PortaSpeechInterface(device='cpu', tts_model_path='Meta')
+# os.chdir(cwd)
 # azure tts
+# os.system("2d1847f4151f4f94ae06d0b620533936")
+# os.system("eastus")
+lang2voice = {
+    "zh": "zh-CN-XiaoxiaoNeural",
+    "ar": "ar-EG-SalmaNeural",
+    "en": "en-US-JennyNeural",
+    "es": "es-ES-AbrilNeural",
+    "ru": "ru-RU-DariyaNeural",
+    "fr": "fr-FR-AlainNeural",
+}
 # The predict function. audio, language and mic_audio are all parameters directly passed by gradio
         translation = (' '.join(r.readline().split(' ')[3:])).strip()
     # tts
+    # tts.set_language(tgt_language)
+    # tts.read_to_file(text_list=[translation], file_location='output.wav')
+    # azure tts
+    import azure.cognitiveservices.speech as speechsdk
+    speech_key = "2d1847f4151f4f94ae06d0b620533936"
+    service_region = "eastus"
+    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
+    # Note: the voice setting will not overwrite the voice element in input SSML.
+    speech_config.speech_synthesis_voice_name = lang2voice[tgt_language]
+    audio_config = speechsdk.audio.AudioOutputConfig(filename="output.wav")
+    speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)
+    speech_synthesizer.speak_text(translation)
     # Returns the text
+    return transcript, translation, "output.wav"
                             'Russian',
                             'French',
                             'Detect Language'], type="value", default='English', label="Select the language of input"),
+        gr.inputs.Dropdown(['Arabic',
+                            'Chinese',
                             'English',
                             'Spanish',
                             'Russian',

packages.txt CHANGED Viewed

@@ -1,3 +1,7 @@
 espeak
 libasound-dev
-libportaudio2

 espeak
 libasound-dev
+libportaudio2
+build-essential
+libssl-dev
+libasound2
+wget

requirements.txt CHANGED Viewed

@@ -8,4 +8,5 @@ kytea
 six
 phonemizer
 sentencepiece
-hanziconv

 six
 phonemizer
 sentencepiece
+hanziconv
+azure-cognitiveservices-speech