ButterCream committed on
Commit
b07d516
·
1 Parent(s): 417a076

upgrade to VoPho - resolved all issues

Browse files
Files changed (2) hide show
  1. app.py +5 -18
  2. requirements.txt +4 -3
app.py CHANGED
@@ -5,14 +5,10 @@ import re
5
  import numpy as np
6
  from scipy.io.wavfile import write
7
  import nltk
8
-
9
- nltk.download('punkt')
10
- from nltk.tokenize import word_tokenize
11
 
12
  import torch
13
 
14
- import phonemizer # en-us
15
-
16
  INTRO = """
17
  <style>
18
 
@@ -94,13 +90,6 @@ theme = gr.themes.Soft(
94
  block_background_fill='*neutral_50'
95
  )
96
 
97
- # eventually swap to something else
98
- global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us',
99
- preserve_punctuation=True,
100
- with_stress=True,
101
- language_switch="remove-flags",
102
- tie=False)
103
-
104
 
105
  def split_and_recombine_text(text, desired_length=200, max_length=300):
106
  """Split text it into chunks of a desired length trying to keep sentences intact."""
@@ -173,15 +162,13 @@ def split_and_recombine_text(text, desired_length=200, max_length=300):
173
 
174
  return rv
175
 
 
176
 
177
  def text_to_phonemes(text):
178
  text = text.strip()
179
  print("Text before phonemization: ", text)
180
- ps = global_phonemizer.phonemize([text])
181
  print("Text after phonemization: ", ps)
182
- ps = word_tokenize(ps[0])
183
- ps = ' '.join(ps)
184
- print("Final text after tokenization: ", ps)
185
  return ps
186
 
187
 
@@ -218,7 +205,7 @@ def generate(audio_path, ins, speed, alpha, beta, embedding, steps=200):
218
  thresh = np.percentile(np.abs(synthaud), 95)
219
  CUT_SAMPLES = 20000 # max samples to cut, in practice only 4-6k are actually cut
220
  lead_percent = 0.008
221
- trail_percent = 0.009
222
 
223
 
224
  # Leading artefact removal
@@ -295,4 +282,4 @@ with gr.Blocks(theme=theme, js=js_func) as clone:
295
 
296
  if __name__ == "__main__":
297
  # demo.queue(api_open=False, max_size=15).launch(show_api=False)
298
- clone.queue(api_open=False, max_size=15).launch(show_api=False)
 
5
  import numpy as np
6
  from scipy.io.wavfile import write
7
  import nltk
8
+ from VoPho.engine import Phonemizer
 
 
9
 
10
  import torch
11
 
 
 
12
  INTRO = """
13
  <style>
14
 
 
90
  block_background_fill='*neutral_50'
91
  )
92
 
 
 
 
 
 
 
 
93
 
94
  def split_and_recombine_text(text, desired_length=200, max_length=300):
95
  """Split text it into chunks of a desired length trying to keep sentences intact."""
 
162
 
163
  return rv
164
 
165
+ engine = Phonemizer()
166
 
167
  def text_to_phonemes(text):
168
  text = text.strip()
169
  print("Text before phonemization: ", text)
170
+ ps = engine.phonemize(text)
171
  print("Text after phonemization: ", ps)
 
 
 
172
  return ps
173
 
174
 
 
205
  thresh = np.percentile(np.abs(synthaud), 95)
206
  CUT_SAMPLES = 20000 # max samples to cut, in practice only 4-6k are actually cut
207
  lead_percent = 0.008
208
+ trail_percent = 0.0085
209
 
210
 
211
  # Leading artefact removal
 
282
 
283
  if __name__ == "__main__":
284
  # demo.queue(api_open=False, max_size=15).launch(show_api=False)
285
+ clone.queue(api_open=False, max_size=15).launch(show_api=False)
requirements.txt CHANGED
@@ -1,7 +1,7 @@
1
  SoundFile
2
- torchaudio==2.1.2
3
  munch
4
- torch==2.1.2
5
  pydub
6
  pyyaml
7
  librosa
@@ -23,4 +23,5 @@ gradio
23
  spaces
24
  gruut
25
  txtsplit
26
- scipy
 
 
1
  SoundFile
2
+ torchaudio==2.2.0
3
  munch
4
+ torch==2.2.0
5
  pydub
6
  pyyaml
7
  librosa
 
23
  spaces
24
  gruut
25
  txtsplit
26
+ scipy
27
+ VoPho==0.0.8