owaski committed on
Commit
4d5c730
·
1 Parent(s): 63a63e1

add azure tts

Browse files
Files changed (3) hide show
  1. app.py +36 -25
  2. packages.txt +5 -1
  3. requirements.txt +2 -1
app.py CHANGED
@@ -41,29 +41,28 @@ os.system("wget https://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/emnlp2
41
  # sed -i 's/package_data={/# package_data={/g' setup.py; \
42
  # pip install ./; cd ..")
43
 
44
- os.system("git clone https://github.com/DigitalPhonetics/IMS-Toucan.git")
45
- sys.path.append('./IMS-Toucan')
46
- os.system("cd IMS-Toucan; pip install --no-cache-dir -r requirements.txt")
47
- os.system("python run_model_downloader.py; cd ..")
48
 
49
- from InferenceInterfaces.PortaSpeechInterface import PortaSpeechInterface
50
- cwd = os.getcwd()
51
- os.chdir('./IMS-Toucan')
52
- tts = PortaSpeechInterface(device='cpu', tts_model_path='Meta')
53
- os.chdir(cwd)
54
 
55
  # azure tts
56
- os.system("apt-get update")
57
- os.system("apt-get install build-essential libssl-dev libasound2 wget -y")
58
- os.system("add-apt-repository universe; \
59
- apt-get install apt-transport-https -y; \
60
- apt-get update -y; \
61
- apt-get install dotnet-sdk-6.0 -y")
62
- os.system("dotnet tool install --global Microsoft.CognitiveServices.Speech.CLI")
63
- os.system("export PATH=\"$PATH:/home/user/.dotnet/tools\"")
64
-
65
- os.system("spx config @key --set 2d1847f4151f4f94ae06d0b620533936")
66
- os.system("spx config @region --set eastus")
67
 
68
 
69
  # The predict function. audio, language and mic_audio are all parameters directly passed by gradio
@@ -137,11 +136,23 @@ def predict(audio, src_language, tgt_language, mic_audio=None):
137
  translation = (' '.join(r.readline().split(' ')[3:])).strip()
138
 
139
  # tts
140
- tts.set_language(tgt_language)
141
- tts.read_to_file(text_list=[translation], file_location='output.wav')
 
 
 
 
 
 
 
 
 
 
 
 
142
 
143
  # Returns the text
144
- return transcript, translation, 'output.wav'
145
 
146
 
147
 
@@ -165,8 +176,8 @@ gr.Interface(
165
  'Russian',
166
  'French',
167
  'Detect Language'], type="value", default='English', label="Select the language of input"),
168
- gr.inputs.Dropdown([# 'Arabic',
169
- # 'Chinese',
170
  'English',
171
  'Spanish',
172
  'Russian',
 
41
  # sed -i 's/package_data={/# package_data={/g' setup.py; \
42
  # pip install ./; cd ..")
43
 
44
+ # os.system("git clone https://github.com/DigitalPhonetics/IMS-Toucan.git")
45
+ # sys.path.append('./IMS-Toucan')
46
+ # os.system("cd IMS-Toucan; pip install --no-cache-dir -r requirements.txt")
47
+ # os.system("python run_model_downloader.py; cd ..")
48
 
49
+ # from InferenceInterfaces.PortaSpeechInterface import PortaSpeechInterface
50
+ # cwd = os.getcwd()
51
+ # os.chdir('./IMS-Toucan')
52
+ # tts = PortaSpeechInterface(device='cpu', tts_model_path='Meta')
53
+ # os.chdir(cwd)
54
 
55
  # azure tts
56
+ # os.system("2d1847f4151f4f94ae06d0b620533936")
57
+ # os.system("eastus")
58
+ lang2voice = {
59
+ "zh": "zh-CN-XiaoxiaoNeural",
60
+ "ar": "ar-EG-SalmaNeural",
61
+ "en": "en-US-JennyNeural",
62
+ "es": "es-ES-AbrilNeural",
63
+ "ru": "ru-RU-DariyaNeural",
64
+ "fr": "fr-FR-AlainNeural",
65
+ }
 
66
 
67
 
68
  # The predict function. audio, language and mic_audio are all parameters directly passed by gradio
 
136
  translation = (' '.join(r.readline().split(' ')[3:])).strip()
137
 
138
  # tts
139
+ # tts.set_language(tgt_language)
140
+ # tts.read_to_file(text_list=[translation], file_location='output.wav')
141
+
142
+ # azure tts
143
+ import azure.cognitiveservices.speech as speechsdk
144
+ speech_key = "2d1847f4151f4f94ae06d0b620533936"
145
+ service_region = "eastus"
146
+
147
+ speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
148
+ # Note: the voice setting will not overwrite the voice element in input SSML.
149
+ speech_config.speech_synthesis_voice_name = lang2voice[tgt_language]
150
+ audio_config = speechsdk.audio.AudioOutputConfig(filename="output.wav")
151
+ speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)
152
+ speech_synthesizer.speak_text(translation)
153
 
154
  # Returns the text
155
+ return transcript, translation, "output.wav"
156
 
157
 
158
 
 
176
  'Russian',
177
  'French',
178
  'Detect Language'], type="value", default='English', label="Select the language of input"),
179
+ gr.inputs.Dropdown(['Arabic',
180
+ 'Chinese',
181
  'English',
182
  'Spanish',
183
  'Russian',
packages.txt CHANGED
@@ -1,3 +1,7 @@
1
  espeak
2
  libasound-dev
3
- libportaudio2
 
 
 
 
 
1
  espeak
2
  libasound-dev
3
+ libportaudio2
4
+ build-essential
5
+ libssl-dev
6
+ libasound2
7
+ wget
requirements.txt CHANGED
@@ -8,4 +8,5 @@ kytea
8
  six
9
  phonemizer
10
  sentencepiece
11
- hanziconv
 
 
8
  six
9
  phonemizer
10
  sentencepiece
11
+ hanziconv
12
+ azure-cognitiveservices-speech