Spaces:

ShoukanLabs
/

Vokan

Running on Zero

App Files Community

ButterCream committed on Oct 9, 2024

Commit

b07d516

1 Parent(s): 417a076

upgrade to VoPho - resolved all issues

Browse files

Files changed (2) hide show

app.py +5 -18
requirements.txt +4 -3

app.py CHANGED Viewed

@@ -5,14 +5,10 @@ import re
 import numpy as np
 from scipy.io.wavfile import write
 import nltk
-nltk.download('punkt')
-from nltk.tokenize import word_tokenize
 import torch
-import phonemizer  # en-us
 INTRO = """
 <style>
@@ -94,13 +90,6 @@ theme = gr.themes.Soft(
     block_background_fill='*neutral_50'
 )
-# eventually swap to something else
-global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us',
-                                                     preserve_punctuation=True,
-                                                     with_stress=True,
-                                                     language_switch="remove-flags",
-                                                     tie=False)
 def split_and_recombine_text(text, desired_length=200, max_length=300):
     """Split text it into chunks of a desired length trying to keep sentences intact."""
@@ -173,15 +162,13 @@ def split_and_recombine_text(text, desired_length=200, max_length=300):
     return rv
 def text_to_phonemes(text):
     text = text.strip()
     print("Text before phonemization: ", text)
-    ps = global_phonemizer.phonemize([text])
     print("Text after phonemization: ", ps)
-    ps = word_tokenize(ps[0])
-    ps = ' '.join(ps)
-    print("Final text after tokenization: ", ps)
     return ps
@@ -218,7 +205,7 @@ def generate(audio_path, ins, speed, alpha, beta, embedding, steps=200):
         thresh = np.percentile(np.abs(synthaud), 95)
         CUT_SAMPLES = 20000  # max samples to cut, in practice only 4-6k are actually cut
         lead_percent = 0.008
-        trail_percent = 0.009
         # Leading artefact removal
@@ -295,4 +282,4 @@ with gr.Blocks(theme=theme, js=js_func) as clone:
 if __name__ == "__main__":
     # demo.queue(api_open=False, max_size=15).launch(show_api=False)
-    clone.queue(api_open=False, max_size=15).launch(show_api=False)

 import numpy as np
 from scipy.io.wavfile import write
 import nltk
+from VoPho.engine import Phonemizer
 import torch
 INTRO = """
 <style>
     block_background_fill='*neutral_50'
 )
 def split_and_recombine_text(text, desired_length=200, max_length=300):
     """Split text it into chunks of a desired length trying to keep sentences intact."""
     return rv
+engine = Phonemizer()
 def text_to_phonemes(text):
     text = text.strip()
     print("Text before phonemization: ", text)
+    ps = engine.phonemize(text)
     print("Text after phonemization: ", ps)
     return ps
         thresh = np.percentile(np.abs(synthaud), 95)
         CUT_SAMPLES = 20000  # max samples to cut, in practice only 4-6k are actually cut
         lead_percent = 0.008
+        trail_percent = 0.0085
         # Leading artefact removal
 if __name__ == "__main__":
     # demo.queue(api_open=False, max_size=15).launch(show_api=False)
+    clone.queue(api_open=False, max_size=15).launch(show_api=False)

requirements.txt CHANGED Viewed

@@ -1,7 +1,7 @@
 SoundFile
-torchaudio==2.1.2
 munch
-torch==2.1.2
 pydub
 pyyaml
 librosa
@@ -23,4 +23,5 @@ gradio
 spaces
 gruut
 txtsplit
-scipy

 SoundFile
+torchaudio==2.2.0
 munch
+torch==2.2.0
 pydub
 pyyaml
 librosa
 spaces
 gruut
 txtsplit
+scipy
+VoPho==0.0.8