Spaces:

ShoukanLabs
/

Vokan

Running on Zero

App Files Files Community

ButterCream committed on Oct 3, 2024

Commit

417a076

1 Parent(s): a835dc1

readd lain fix

Browse files

Files changed (2) hide show

app.py +12 -5
requirements.txt +1 -2

app.py CHANGED Viewed

@@ -5,13 +5,14 @@ import re
 import numpy as np
 from scipy.io.wavfile import write
 import nltk
-from VoPho.engine import Phonemizer
 nltk.download('punkt')
 from nltk.tokenize import word_tokenize
 import torch
 INTRO = """
 <style>
@@ -93,6 +94,13 @@ theme = gr.themes.Soft(
     block_background_fill='*neutral_50'
 )
 def split_and_recombine_text(text, desired_length=200, max_length=300):
     """Split text it into chunks of a desired length trying to keep sentences intact."""
@@ -165,12 +173,11 @@ def split_and_recombine_text(text, desired_length=200, max_length=300):
     return rv
-engine = Phonemizer()
 def text_to_phonemes(text):
     text = text.strip()
     print("Text before phonemization: ", text)
-    ps = engine.phonemize(text)
     print("Text after phonemization: ", ps)
     ps = word_tokenize(ps[0])
     ps = ' '.join(ps)
@@ -211,7 +218,7 @@ def generate(audio_path, ins, speed, alpha, beta, embedding, steps=200):
         thresh = np.percentile(np.abs(synthaud), 95)
         CUT_SAMPLES = 20000  # max samples to cut, in practice only 4-6k are actually cut
         lead_percent = 0.008
-        trail_percent = 0.0085
         # Leading artefact removal
@@ -252,7 +259,7 @@ def generate(audio_path, ins, speed, alpha, beta, embedding, steps=200):
 other_tts = tts.StyleTTS2(model_checkpoint_path='./epoch_2nd_00012.pth', config_path="models/config_ft.yml")
 if torch.cuda.is_available():
-    other_tts.devuce = "cuda"
 else:
     other_tts.device = "cpu"

 import numpy as np
 from scipy.io.wavfile import write
 import nltk
 nltk.download('punkt')
 from nltk.tokenize import word_tokenize
 import torch
+import phonemizer  # en-us
 INTRO = """
 <style>
     block_background_fill='*neutral_50'
 )
+# eventually swap to something else
+global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us',
+                                                     preserve_punctuation=True,
+                                                     with_stress=True,
+                                                     language_switch="remove-flags",
+                                                     tie=False)
 def split_and_recombine_text(text, desired_length=200, max_length=300):
     """Split text it into chunks of a desired length trying to keep sentences intact."""
     return rv
 def text_to_phonemes(text):
     text = text.strip()
     print("Text before phonemization: ", text)
+    ps = global_phonemizer.phonemize([text])
     print("Text after phonemization: ", ps)
     ps = word_tokenize(ps[0])
     ps = ' '.join(ps)
         thresh = np.percentile(np.abs(synthaud), 95)
         CUT_SAMPLES = 20000  # max samples to cut, in practice only 4-6k are actually cut
         lead_percent = 0.008
+        trail_percent = 0.009
         # Leading artefact removal
 other_tts = tts.StyleTTS2(model_checkpoint_path='./epoch_2nd_00012.pth', config_path="models/config_ft.yml")
 if torch.cuda.is_available():
+    other_tts.device = "cuda"
 else:
     other_tts.device = "cpu"

requirements.txt CHANGED Viewed

@@ -23,5 +23,4 @@ gradio
 spaces
 gruut
 txtsplit
-scipy
-VoPho

 spaces
 gruut
 txtsplit
+scipy