owaski committed on
Commit
87d6e1d
·
1 Parent(s): 566d6f4
Files changed (2) hide show
  1. app.py +28 -5
  2. requirements.txt +2 -1
app.py CHANGED
@@ -7,6 +7,7 @@ import whisper
7
  # the model we are using for ASR, options are small, medium, large and largev2 (large and largev2 don't fit on huggingface cpu)
8
  model = whisper.load_model("small")
9
 
 
10
 
11
  # A table to look up all the languages
12
  language_id_lookup = {
@@ -43,6 +44,13 @@ tts_model_name = {
43
  "fr": "facebook/tts_transformer-fr-cv7_css10"
44
  }
45
 
 
 
 
 
 
 
 
46
 
47
  # The predict function. audio, language and mic_audio are all parameters directly passed by gradio
48
  # which means they are user inputted. They are specified in gr.inputs[] block at the bottom. The
@@ -119,16 +127,31 @@ def predict(audio, src_language, tgt_language, mic_audio=None):
119
  from fairseq.models.text_to_speech.hub_interface import TTSHubInterface
120
  tts_models, tts_cfg, tts_task = load_model_ensemble_and_task_from_hf_hub(
121
  tts_model_name[tgt_language],
122
- arg_overrides={"vocoder": "hifigan", "fp16": True}
123
  )
124
- tts_model = tts_models[0]
 
125
  TTSHubInterface.update_cfg_with_data_cfg(tts_cfg, tts_task.data_cfg)
126
- tts_generator = tts_task.build_generator(tts_model, tts_cfg)
127
  tts_sample = TTSHubInterface.get_model_input(tts_task, translation)
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  wav, rate = TTSHubInterface.get_prediction(tts_task, tts_model, tts_generator, tts_sample)
 
129
 
130
  # Returns the text
131
- return transcript, translation, wav
132
 
133
 
134
 
@@ -166,7 +189,7 @@ gr.Interface(
166
  outputs=[
167
  gr.Text(label="Transcript"),
168
  gr.Text(label="Translation"),
169
- gr.outputs.Audio(type="numpy", label="Translation Speech")
170
  ],
171
  title=title,
172
  description=description,
 
7
  # the model we are using for ASR, options are small, medium, large and largev2 (large and largev2 don't fit on huggingface cpu)
8
  model = whisper.load_model("small")
9
 
10
+ import torch
11
 
12
  # A table to look up all the languages
13
  language_id_lookup = {
 
44
  "fr": "facebook/tts_transformer-fr-cv7_css10"
45
  }
46
 
47
+ os.system("git clone https://github.com/Kyubyong/g2pC.git")
48
+ os.system("cd g2pC; sed -i 's/pkuseg/spacy_pkuseg/g' setup.py; \
49
+ sed -i 's/import pkuseg/import spacy_pkuseg as pkuseg/g' g2pc/g2pc.py; \
50
+ sed -i 's/package_data={/# package_data={/g' setup.py; \
51
+ pip install ./; cd ..")
52
+
53
+
54
 
55
  # The predict function. audio, language and mic_audio are all parameters directly passed by gradio
56
  # which means they are user inputted. They are specified in gr.inputs[] block at the bottom. The
 
127
  from fairseq.models.text_to_speech.hub_interface import TTSHubInterface
128
  tts_models, tts_cfg, tts_task = load_model_ensemble_and_task_from_hf_hub(
129
  tts_model_name[tgt_language],
130
+ arg_overrides={"vocoder": "hifigan", "fp16": False}
131
  )
132
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
133
+ tts_model = tts_models[0].to(device)
134
  TTSHubInterface.update_cfg_with_data_cfg(tts_cfg, tts_task.data_cfg)
135
+ tts_generator = tts_task.build_generator(tts_models, tts_cfg)
136
  tts_sample = TTSHubInterface.get_model_input(tts_task, translation)
137
+
138
+
139
+
140
+ tts_sample = {
141
+ 'net_input': {
142
+ 'src_tokens': tts_sample['net_input']['src_tokens'].to(device),
143
+ 'src_lengths': tts_sample['net_input']['src_lengths'].to(device),
144
+ 'prev_output_tokens': None,
145
+ },
146
+ 'target_lengths': None,
147
+ 'speaker': tts_sample['speaker'].to(device)
148
+ }
149
+
150
  wav, rate = TTSHubInterface.get_prediction(tts_task, tts_model, tts_generator, tts_sample)
151
+ wav = wav.cpu().numpy()
152
 
153
  # Returns the text
154
+ return transcript, translation, (rate, wav)
155
 
156
 
157
 
 
189
  outputs=[
190
  gr.Text(label="Transcript"),
191
  gr.Text(label="Translation"),
192
+ gr.Audio(label="Translation Speech")
193
  ],
194
  title=title,
195
  description=description,
requirements.txt CHANGED
@@ -7,4 +7,5 @@ sacremoses
7
  kytea
8
  six
9
  phonemizer
10
- sentencepiece
 
 
7
  kytea
8
  six
9
  phonemizer
10
+ sentencepiece
11
+ hanziconv