Replaced Encodec with Vocos
app.py CHANGED
@@ -44,8 +44,8 @@ text_tokenizer = PhonemeBpeTokenizer(tokenizer_path="./utils/g2p/bpe_69.json")
 text_collater = get_text_token_collater()
 
 device = torch.device("cpu")
-if torch.cuda.is_available():
-    device = torch.device("cuda", 0)
+# if torch.cuda.is_available():
+#     device = torch.device("cuda", 0)
 
 # VALL-E-X model
 model = VALLE(
@@ -141,17 +141,18 @@ def make_npz_prompt(name, uploaded_audio, recorded_audio, transcript_content):
 
     if transcript_content == "":
         lang_pr, text_pr = transcribe_one(wav_pr, sr)
+        lang_token = lang2token[lang_pr]
+        text_pr = lang_token + text_pr + lang_token
     else:
         lang_pr = langid.classify(str(transcript_content))[0]
         lang_token = lang2token[lang_pr]
+        transcript_content = transcript_content.replace("\n", "")
         text_pr = f"{lang_token}{str(transcript_content)}{lang_token}"
     # tokenize audio
     encoded_frames = tokenize_audio(audio_tokenizer, (wav_pr, sr))
     audio_tokens = encoded_frames[0][0].transpose(2, 1).cpu().numpy()
 
     # tokenize text
-    lang_token = lang2token[lang_pr]
-    text_pr = lang_token + text_pr + lang_token
     phonemes, _ = text_tokenizer.tokenize(text=f"{text_pr}".strip())
     text_tokens, enroll_x_lens = text_collater(
         [
@@ -193,16 +194,20 @@ def infer_from_audio(text, language, accent, audio_prompt, record_audio_prompt,
 
     if transcript_content == "":
         lang_pr, text_pr = transcribe_one(wav_pr, sr)
+        lang_token = lang2token[lang_pr]
+        text_pr = lang_token + text_pr + lang_token
     else:
         lang_pr = langid.classify(str(transcript_content))[0]
+        text_pr = transcript_content.replace("\n", "")
         lang_token = lang2token[lang_pr]
-        text_pr = f"{lang_token}{str(transcript_content)}{lang_token}"
+        text_pr = lang_token + text_pr + lang_token
 
     if language == 'auto-detect':
         lang_token = lang2token[langid.classify(text)[0]]
     else:
         lang_token = langdropdown2token[language]
     lang = token2lang[lang_token]
+    text = text.replace("\n", "")
     text = lang_token + text + lang_token
 
     if lang_pr not in ['ja', 'zh', 'en']:
@@ -223,8 +228,6 @@ def infer_from_audio(text, language, accent, audio_prompt, record_audio_prompt,
 
     enroll_x_lens = None
     if text_pr:
-        lang_token = lang2token[lang_pr]
-        text_pr = lang_token + text_pr + lang_token
         text_prompts, _ = text_tokenizer.tokenize(text=f"{text_pr}".strip())
         text_prompts, enroll_x_lens = text_collater(
             [
@@ -266,6 +269,7 @@ def infer_from_prompt(text, language, accent, preset_prompt, prompt_file):
     else:
         lang_token = langdropdown2token[language]
     lang = token2lang[lang_token]
+    text = text.replace("\n", "")
     text = lang_token + text + lang_token
 
     # load prompt
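Apart from commenting out the CUDA device selection so the model stays on CPU, the recurring change in these hunks is prompt-text normalization: newlines are stripped and the text is wrapped in its language token inside the branch that builds it, instead of being wrapped a second time later. A minimal sketch of that normalization, assuming lang2token maps langid codes to bracket tokens such as [EN] (the mapping values and the helper name are assumptions, not part of this diff):

lang2token = {"en": "[EN]", "ja": "[JA]", "zh": "[ZH]"}  # assumed mapping; the real one is defined elsewhere in app.py

def wrap_prompt(transcript: str, lang_pr: str) -> str:
    # Mirror the new code: strip newlines first, then wrap the text in its language token exactly once.
    text = transcript.replace("\n", "")
    lang_token = lang2token[lang_pr]
    return f"{lang_token}{text}{lang_token}"

print(wrap_prompt("hello\nworld", "en"))  # -> [EN]helloworld[EN]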
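The commit title says EnCodec was replaced with Vocos for decoding, but that part of app.py is outside the hunks shown here. A minimal sketch of what such a swap typically looks like, assuming the vocos package and its pretrained charactr/vocos-encodec-24khz checkpoint (everything in this snippet is an assumption about the change, not taken from the diff):

import torch
from vocos import Vocos

# Hypothetical decode path: reconstruct a waveform from EnCodec codebook indices with Vocos
# instead of calling the EnCodec decoder.
vocos = Vocos.from_pretrained("charactr/vocos-encodec-24khz")

def decode_with_vocos(codes: torch.Tensor) -> torch.Tensor:
    # codes: (n_quantizers, n_frames) integer EnCodec codebook indices, e.g. 8 codebooks at 6 kbps
    features = vocos.codes_to_features(codes)
    # bandwidth_id=2 selects the 6 kbps setting of this pretrained checkpoint
    return vocos.decode(features, bandwidth_id=torch.tensor([2]))

Vocos is a vocoder trained to render EnCodec tokens directly to audio, so it can drop in where the EnCodec decoder was previously called.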