Dionyssos committed on
Commit 1c1f53d · 1 Parent(s): fd9e569

wav of voices

This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. README.md +5 -5
  2. app.py +221 -62
  3. espeak_util.py +206 -0
  4. requirements.txt +11 -3
  5. wav/af_ZA_google-nwu_0184.wav +0 -0
  6. wav/af_ZA_google-nwu_1919.wav +0 -0
  7. wav/af_ZA_google-nwu_2418.wav +0 -0
  8. wav/af_ZA_google-nwu_6590.wav +0 -0
  9. wav/af_ZA_google-nwu_7130.wav +0 -0
  10. wav/af_ZA_google-nwu_7214.wav +0 -0
  11. wav/af_ZA_google-nwu_8148.wav +0 -0
  12. wav/af_ZA_google-nwu_8924.wav +0 -0
  13. wav/af_ZA_google-nwu_8963.wav +0 -0
  14. wav/bn_multi_00737.wav +0 -0
  15. wav/bn_multi_00779.wav +0 -0
  16. wav/bn_multi_01232.wav +0 -0
  17. wav/bn_multi_01701.wav +0 -0
  18. wav/bn_multi_03042.wav +0 -0
  19. wav/bn_multi_0834.wav +0 -0
  20. wav/bn_multi_1010.wav +0 -0
  21. wav/bn_multi_3108.wav +0 -0
  22. wav/bn_multi_3713.wav +0 -0
  23. wav/bn_multi_3958.wav +0 -0
  24. wav/bn_multi_4046.wav +0 -0
  25. wav/bn_multi_4811.wav +0 -0
  26. wav/bn_multi_5958.wav +0 -0
  27. wav/bn_multi_9169.wav +0 -0
  28. wav/bn_multi_rm.wav +0 -0
  29. wav/de_DE_m-ailabs_angela_merkel.wav +0 -0
  30. wav/de_DE_m-ailabs_eva_k.wav +0 -0
  31. wav/de_DE_m-ailabs_karlsson.wav +0 -0
  32. wav/de_DE_m-ailabs_ramona_deininger.wav +0 -0
  33. wav/de_DE_m-ailabs_rebecca_braunert_plunkett.wav +0 -0
  34. wav/de_DE_thorsten-emotion_amused.wav +0 -0
  35. wav/el_GR_rapunzelina.wav +0 -0
  36. wav/en_UK_apope.wav +0 -0
  37. wav/en_US_cmu_arctic_aew.wav +0 -0
  38. wav/en_US_cmu_arctic_aup.wav +0 -0
  39. wav/en_US_cmu_arctic_awb.wav +0 -0
  40. wav/en_US_cmu_arctic_awbrms.wav +0 -0
  41. wav/en_US_cmu_arctic_axb.wav +0 -0
  42. wav/en_US_cmu_arctic_bdl.wav +0 -0
  43. wav/en_US_cmu_arctic_clb.wav +0 -0
  44. wav/en_US_cmu_arctic_eey.wav +0 -0
  45. wav/en_US_cmu_arctic_fem.wav +0 -0
  46. wav/en_US_cmu_arctic_gka.wav +0 -0
  47. wav/en_US_cmu_arctic_jmk.wav +0 -0
  48. wav/en_US_cmu_arctic_ksp.wav +0 -0
  49. wav/en_US_cmu_arctic_ljm.wav +0 -0
  50. wav/en_US_cmu_arctic_lnh.wav +0 -0
README.md CHANGED
@@ -6,13 +6,13 @@ colorTo: gray
 sdk: gradio
 sdk_version: 5.41.1
 app_file: app.py
-pinned: false
+short_description: TTS for CPU
 license: cc-by-nc-4.0
 tags:
-- age
-- gender
-- expression
-- audio
+- non-AR
+- affective
+- shift
+- tts
 ---

 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,22 +1,25 @@
+# -*- coding: utf-8 -*-
 import typing
 import types # fusion of forward() of Wav2Vec2
 import gradio as gr
 import matplotlib.pyplot as plt
 import numpy as np
-import spaces
+import os
 import torch
 import torch.nn as nn
 from transformers import Wav2Vec2Processor
 from transformers.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2Model
 from transformers.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2PreTrainedModel
-
 import audiofile
+import unicodedata
+import textwrap
+from tts import StyleTTS2
 import audresample


 device = 0 if torch.cuda.is_available() else "cpu"
 duration = 2 # limit processing of audio
-age_gender_model_name = "audeering/wav2vec2-large-robust-24-ft-age-gender"
+age_gender_model_name = "audeering/wav2vec2-large-robust-6-ft-age-gender"
 expression_model_name = "audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim"
@@ -167,7 +170,7 @@ class ExpressionModel(Wav2Vec2PreTrainedModel):


 # Load models from hub
-age_gender_processor = Wav2Vec2Processor.from_pretrained(age_gender_model_name)
+
 age_gender_model = AgeGenderModel.from_pretrained(age_gender_model_name)
 expression_processor = Wav2Vec2Processor.from_pretrained(expression_model_name)
 expression_model = ExpressionModel.from_pretrained(expression_model_name)
@@ -206,12 +209,9 @@ def process_func(x: np.ndarray, sampling_rate: int) -> typing.Tuple[str, dict, str]:
         },
         expression_file,
     )
+

-
-@spaces.GPU
-def recognize(input_file: str) -> typing.Tuple[str, dict, str]:
-    # sampling_rate, signal = input_microphone
-    # signal = signal.astype(np.float32, order="C") / 32768.0
+def recognize(input_file):
     if input_file is None:
         raise gr.Error(
             "No audio file submitted! "
@@ -227,50 +227,6 @@ def recognize(input_file: str) -> typing.Tuple[str, dict, str]:
     return process_func(signal, target_rate)


-def plot_expression_RIGID(arousal, dominance, valence):
-    r"""3D pixel plot of arousal, dominance, valence."""
-    # Voxels per dimension
-    voxels = 7
-    # Create voxel grid
-    x, y, z = np.indices((voxels + 1, voxels + 1, voxels + 1))
-    voxel = (
-        (x == round(arousal * voxels))
-        & (y == round(dominance * voxels))
-        & (z == round(valence * voxels))
-    )
-    projection = (
-        (x == round(arousal * voxels))
-        & (y == round(dominance * voxels))
-        & (z < round(valence * voxels))
-    )
-    colors = np.empty((voxel | projection).shape, dtype=object)
-    colors[voxel] = "#fcb06c"
-    colors[projection] = "#fed7a9"
-    ax = plt.figure().add_subplot(projection='3d')
-    ax.voxels(voxel | projection, facecolors=colors, edgecolor='k')
-    ax.set_xlim([0, voxels])
-    ax.set_ylim([0, voxels])
-    ax.set_zlim([0, voxels])
-    ax.set_aspect("equal")
-    ax.set_xlabel("arousal", fontsize="large", labelpad=0)
-    ax.set_ylabel("dominance", fontsize="large", labelpad=0)
-    ax.set_zlabel("valence", fontsize="large", labelpad=0)
-    ax.set_xticks(
-        list(range(voxels + 1)),
-        labels=[0, None, None, None, None, None, None, 1],
-        verticalalignment="bottom",
-    )
-    ax.set_yticks(
-        list(range(voxels + 1)),
-        labels=[0, None, None, None, None, None, None, 1],
-        verticalalignment="bottom",
-    )
-    ax.set_zticks(
-        list(range(voxels + 1)),
-        labels=[0, None, None, None, None, None, None, 1],
-        verticalalignment="top",
-    )
-
 def explode(data):
     """
     Expands a 3D array by creating gaps between voxels.
@@ -282,6 +238,18 @@ def explode(data):
     retval[::2, ::2, ::2] = data
     return retval

+
+def explode(data):
+    """
+    Expands a 3D array by adding new voxels between existing ones.
+    This is used to create the gaps in the 3D plot.
+    """
+    shape = data.shape
+    new_shape = (2 * shape[0] - 1, 2 * shape[1] - 1, 2 * shape[2] - 1)
+    new_data = np.zeros(new_shape, dtype=data.dtype)
+    new_data[::2, ::2, ::2] = data
+    return new_data
+
 def plot_expression(arousal, dominance, valence):
     '''_h = cuda tensor (N_PIX, N_PIX, N_PIX)'''
@@ -302,7 +270,8 @@ def plot_expression(arousal, dominance, valence):
     y[:, 1::2, :] += 1
     z[:, :, 1::2] += 1

-    ax = plt.figure().add_subplot(projection='3d')
+    fig = plt.figure()
+    ax = fig.add_subplot(projection='3d')

     f_2 = np.ones([2 * N_PIX - 1,
                    2 * N_PIX - 1,
@@ -313,7 +282,6 @@

     f_2[:, :, :, 3] = f_2[:, :, :, 3].clip(.01, .74)

-    print(f_2.shape, 'f_2 AAAA')
     ecolors_2 = f_2

     ax.voxels(x, y, z, filled_2, facecolors=f_2, edgecolors=.006 * ecolors_2)
@@ -348,10 +316,156 @@
     ax.set_xlim(0, N_PIX)
     ax.set_ylim(0, N_PIX)
     ax.set_zlim(0, N_PIX)
-
-    #plt.show()
-
-    # ------
+    # plt.show()
+
+# TTS
+VOICES = [f'wav/{vox}' for vox in os.listdir('wav')]
+_tts = StyleTTS2().to('cpu')
+
+def only_greek_or_only_latin(text, lang='grc'):
+    '''
+    str: The converted string in the specified target script.
+         Characters not found in any mapping are preserved as is.
+         Latin accented characters in the input (e.g., 'É', 'ü') will
+         be preserved in their lowercase form (e.g., 'é', 'ü') if
+         converting to Latin.
+    '''
+
+    # --- Mapping Dictionaries ---
+    # Keys are in lowercase as input text is case-folded.
+    # If the output needs to maintain original casing, additional logic is required.
+
+    latin_to_greek_map = {
+        'a': 'α', 'b': 'β', 'g': 'γ', 'd': 'δ', 'e': 'ε',
+        'ch': 'τσο', # Example of a multi-character Latin sequence
+        'z': 'ζ', 'h': 'χ', 'i': 'ι', 'k': 'κ', 'l': 'λ',
+        'm': 'μ', 'n': 'ν', 'x': 'ξ', 'o': 'ο', 'p': 'π',
+        'v': 'β', 'sc': 'σκ', 'r': 'ρ', 's': 'σ', 't': 'τ',
+        'u': 'ου', 'f': 'φ', 'c': 'σ', 'w': 'β', 'y': 'γ',
+    }
+
+    greek_to_latin_map = {
+        'ου': 'ou', # Prioritize common diphthongs/digraphs
+        'α': 'a', 'β': 'v', 'γ': 'g', 'δ': 'd', 'ε': 'e',
+        'ζ': 'z', 'η': 'i', 'θ': 'th', 'ι': 'i', 'κ': 'k',
+        'λ': 'l', 'μ': 'm', 'ν': 'n', 'ξ': 'x', 'ο': 'o',
+        'π': 'p', 'ρ': 'r', 'σ': 's', 'τ': 't', 'υ': 'y', # 'y' is a common transliteration for upsilon
+        'φ': 'f', 'χ': 'ch', 'ψ': 'ps', 'ω': 'o',
+        'ς': 's', # Final sigma
+    }
+
+    cyrillic_to_latin_map = {
+        'а': 'a', 'б': 'b', 'в': 'v', 'г': 'g', 'д': 'd', 'е': 'e', 'ё': 'yo', 'ж': 'zh',
+        'з': 'z', 'и': 'i', 'й': 'y', 'к': 'k', 'л': 'l', 'м': 'm', 'н': 'n', 'о': 'o',
+        'п': 'p', 'р': 'r', 'с': 's', 'т': 't', 'у': 'u', 'ф': 'f', 'х': 'kh', 'ц': 'ts',
+        'ч': 'ch', 'ш': 'sh', 'щ': 'shch', 'ъ': '', 'ы': 'y', 'ь': '', 'э': 'e', 'ю': 'yu',
+        'я': 'ya',
+    }
+
+    # Direct Cyrillic to Greek mapping based on phonetic similarity.
+    # These are approximations and may not be universally accepted transliterations.
+    cyrillic_to_greek_map = {
+        'а': 'α', 'б': 'β', 'в': 'β', 'г': 'γ', 'д': 'δ', 'е': 'ε', 'ё': 'ιο', 'ж': 'ζ',
+        'з': 'ζ', 'и': 'ι', 'й': 'ι', 'κ': 'κ', 'λ': 'λ', 'м': 'μ', 'н': 'ν', 'о': 'ο',
+        'π': 'π', 'ρ': 'ρ', 'σ': 'σ', 'τ': 'τ', 'у': 'ου', 'ф': 'φ', 'х': 'χ', 'ц': 'τσ',
+        'ч': 'τσ', # or τζ depending on desired sound
+        'ш': 'σ', 'щ': 'σ', # approximations
+        'ъ': '', 'ы': 'ι', 'ь': '', 'э': 'ε', 'ю': 'ιου',
+        'я': 'ια',
+    }
+
+    # Convert the input text to lowercase, preserving accents for Latin characters.
+    # casefold() is used for more robust caseless matching across Unicode characters.
+    lowercased_text = text.lower() #casefold()
+    output_chars = []
+    current_index = 0
+
+    if lang == 'grc':
+        # Combine all relevant maps for direct lookup to Greek
+        conversion_map = {**latin_to_greek_map, **cyrillic_to_greek_map}
+
+        # Sort keys by length in reverse order to handle multi-character sequences first
+        sorted_source_keys = sorted(
+            list(latin_to_greek_map.keys()) + list(cyrillic_to_greek_map.keys()),
+            key=len,
+            reverse=True
+        )
+
+        while current_index < len(lowercased_text):
+            found_conversion = False
+            for key in sorted_source_keys:
+                if lowercased_text.startswith(key, current_index):
+                    output_chars.append(conversion_map[key])
+                    current_index += len(key)
+                    found_conversion = True
+                    break
+            if not found_conversion:
+                # If no specific mapping found, append the character as is.
+                # This handles unmapped characters and already Greek characters.
+                output_chars.append(lowercased_text[current_index])
+                current_index += 1
+        return ''.join(output_chars)
+
+    else: # Default to 'lat' conversion
+        # Combine Greek to Latin and Cyrillic to Latin maps.
+        # Cyrillic map keys will take precedence in case of overlap if defined after Greek.
+        combined_to_latin_map = {**greek_to_latin_map, **cyrillic_to_latin_map}
+
+        # Sort all relevant source keys by length in reverse for replacement
+        sorted_source_keys = sorted(
+            list(greek_to_latin_map.keys()) + list(cyrillic_to_latin_map.keys()),
+            key=len,
+            reverse=True
+        )
+
+        while current_index < len(lowercased_text):
+            found_conversion = False
+            for key in sorted_source_keys:
+                if lowercased_text.startswith(key, current_index):
+                    latin_equivalent = combined_to_latin_map[key]

+                    # Strip accents ONLY if the source character was from the Greek map.
+                    # This preserves accents on original Latin characters (like 'é')
+                    # and allows for intentional accent stripping from Greek transliterations.
+                    if key in greek_to_latin_map:
+                        normalized_latin = unicodedata.normalize('NFD', latin_equivalent)
+                        stripped_latin = ''.join(c for c in normalized_latin if not unicodedata.combining(c))
+                        output_chars.append(stripped_latin)
+                    else:
+                        output_chars.append(latin_equivalent)

+                    current_index += len(key)
+                    found_conversion = True
+                    break
+
+            if not found_conversion:
+                # If no conversion happened from Greek or Cyrillic, append the character as is.
+                # This preserves existing Latin characters (including accented ones from input),
+                # numbers, punctuation, and other symbols.
+                output_chars.append(lowercased_text[current_index])
+                current_index += 1
+
+        return ''.join(output_chars)
+
+
+def other_tts(text='Hallov worlds Far over the',
+              ref_s='wav/af_ZA_google-nwu_0184.wav'):
+
+    text = only_greek_or_only_latin(text, lang='eng')
+
+    x = _tts.inference(text, ref_s=ref_s)[0, 0, :].cpu().numpy()
+
+    # x /= np.abs(x).max() + 1e-7 ~ Volume normalisation @api.py:tts_multi_sentence() OR demo.py
+
+    tmp_file = f'_speech.wav' # N x clients (cleanup vs tmp file / client)
+
+    audiofile.write(tmp_file, x, 24000)
+
+    return tmp_file
+
+
+def update_selected_voice(voice_filename):
+    return 'wav/' + voice_filename + '.wav'


 description = (
@@ -366,7 +480,52 @@ description = (
 )

 with gr.Blocks() as demo:
-    with gr.Tab(label="Speech analysis"):
+    with gr.Tab(label="other TTS"):
+
+        selected_voice = gr.State(value='wav/en_US_m-ailabs_mary_ann.wav')
+
+        with gr.Row():
+            voice_info = gr.Markdown(f'TTS vox : `{selected_voice.value}`')
+
+        # Main input and output components
+        with gr.Row():
+            text_input = gr.Textbox(
+                label="Enter text for TTS:",
+                placeholder="Type your message here...",
+                lines=4,
+                value="Farover the misty mountains cold too dungeons deep and caverns old.",
+            )
+            generate_button = gr.Button("Generate Audio", variant="primary")
+
+        output_audio = gr.Audio(label="TTS Output")
+
+        with gr.Column():
+            voice_buttons = []
+            for i in range(0, len(VOICES), 7):
+                with gr.Row():
+                    for voice_filename in VOICES[i:i+7]:
+                        voice_filename = voice_filename[4:-4] # drop wav/ for visibility
+                        button = gr.Button(voice_filename)
+
+                        button.click(
+                            fn=update_selected_voice,
+                            inputs=[gr.Textbox(value=voice_filename, visible=False)],
+                            outputs=[selected_voice]
+                        )
+                        button.click(
+                            fn=lambda v=voice_filename: f"TTS Vox = `{v}`",
+                            inputs=None,
+                            outputs=voice_info
+                        )
+                        voice_buttons.append(button)
+
+        generate_button.click(
+            fn=other_tts,
+            inputs=[text_input, selected_voice],
+            outputs=output_audio
+        )
+
+    with gr.Tab(label="Speech Analysis"):
         with gr.Row():
             with gr.Column():
                 gr.Markdown(description)
@@ -378,10 +537,10 @@ with gr.Blocks() as demo:
             )
             gr.Examples(
                 [
-                    "female-46-neutral.wav",
-                    "female-20-happy.wav",
-                    "male-60-angry.wav",
-                    "male-27-sad.wav",
+                    "wav/female-46-neutral.wav",
+                    "wav/female-20-happy.wav",
+                    "wav/male-60-angry.wav",
+                    "wav/male-27-sad.wav",
                 ],
                 [input],
                 label="Examples from CREMA-D, ODbL v1.0 license",
espeak_util.py ADDED
@@ -0,0 +1,206 @@
+import platform
+import subprocess
+import shutil
+from pathlib import Path
+import os
+from typing import Optional, Tuple
+from phonemizer.backend.espeak.wrapper import EspeakWrapper
+
+
+class EspeakConfig:
+    """Utility class for configuring espeak-ng library and binary."""
+
+    @staticmethod
+    def find_espeak_binary() -> tuple[bool, Optional[str]]:
+        """
+        Find espeak-ng binary using multiple methods.
+
+        Returns:
+            tuple: (bool indicating if espeak is available, path to espeak binary if found)
+        """
+        # Common binary names
+        binary_names = ["espeak-ng", "espeak"]
+        if platform.system() == "Windows":
+            binary_names = ["espeak-ng.exe", "espeak.exe"]
+
+        # Common installation directories for Linux
+        linux_paths = [
+            "/usr/bin",
+            "/usr/local/bin",
+            "/usr/lib/espeak-ng",
+            "/usr/local/lib/espeak-ng",
+            "/opt/espeak-ng/bin",
+        ]
+
+        # First check if it's in PATH
+        for name in binary_names:
+            espeak_path = shutil.which(name)
+            if espeak_path:
+                return True, espeak_path
+
+        # For Linux, check common installation directories
+        if platform.system() == "Linux":
+            for directory in linux_paths:
+                for name in binary_names:
+                    path = Path(directory) / name
+                    if path.exists():
+                        return True, str(path)
+
+        # Try running the command directly as a last resort
+        try:
+            subprocess.run(
+                ["espeak-ng", "--version"],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                check=True,
+            )
+            return True, "espeak-ng"
+        except (subprocess.SubprocessError, FileNotFoundError):
+            pass
+
+        return False, None
+
+    @staticmethod
+    def find_library_path() -> Optional[str]:
+        """
+        Find the espeak-ng library using multiple search methods.
+
+        Returns:
+            Optional[str]: Path to the library if found, None otherwise
+        """
+        system = platform.system()
+
+        if system == "Linux":
+            lib_names = ["libespeak-ng.so", "libespeak-ng.so.1"]
+            common_paths = [
+                # Debian/Ubuntu paths
+                "/usr/lib/x86_64-linux-gnu",
+                "/usr/lib/aarch64-linux-gnu",  # For ARM64
+                "/usr/lib/arm-linux-gnueabihf",  # For ARM32
+                "/usr/lib",
+                "/usr/local/lib",
+                # Fedora/RHEL paths
+                "/usr/lib64",
+                "/usr/lib32",
+                # Common additional paths
+                "/usr/lib/espeak-ng",
+                "/usr/local/lib/espeak-ng",
+                "/opt/espeak-ng/lib",
+            ]
+
+            # Check common locations first
+            for path in common_paths:
+                for lib_name in lib_names:
+                    lib_path = Path(path) / lib_name
+                    if lib_path.exists():
+                        return str(lib_path)
+
+            # Search system library paths
+            try:
+                # Use ldconfig to find the library
+                result = subprocess.run(
+                    ["ldconfig", "-p"], capture_output=True, text=True, check=True
+                )
+                for line in result.stdout.splitlines():
+                    if "libespeak-ng.so" in line:
+                        # Extract path from ldconfig output
+                        return line.split("=>")[-1].strip()
+            except (subprocess.SubprocessError, FileNotFoundError):
+                pass
+
+        elif system == "Darwin":  # macOS
+            common_paths = [
+                Path("/opt/homebrew/lib/libespeak-ng.dylib"),
+                Path("/usr/local/lib/libespeak-ng.dylib"),
+                *list(
+                    Path("/opt/homebrew/Cellar/espeak-ng").glob(
+                        "*/lib/libespeak-ng.dylib"
+                    )
+                ),
+                *list(
+                    Path("/usr/local/Cellar/espeak-ng").glob("*/lib/libespeak-ng.dylib")
+                ),
+            ]
+
+            for path in common_paths:
+                if path.exists():
+                    return str(path)
+
+        elif system == "Windows":
+            common_paths = [
+                Path(os.environ.get("PROGRAMFILES", "C:\\Program Files"))
+                / "eSpeak NG"
+                / "libespeak-ng.dll",
+                Path(os.environ.get("PROGRAMFILES(X86)", "C:\\Program Files (x86)"))
+                / "eSpeak NG"
+                / "libespeak-ng.dll",
+                *[
+                    Path(p) / "libespeak-ng.dll"
+                    for p in os.environ.get("PATH", "").split(os.pathsep)
+                ],
+            ]
+
+            for path in common_paths:
+                if path.exists():
+                    return str(path)
+
+        return None
+
+    @classmethod
+    def configure_espeak(cls) -> Tuple[bool, str]:
+        """
+        Configure espeak-ng for use with the phonemizer.
+
+        Returns:
+            Tuple[bool, str]: (Success status, Status message)
+        """
+        # First check if espeak binary is available
+        espeak_available, espeak_path = cls.find_espeak_binary()
+        if not espeak_available:
+            raise FileNotFoundError(
+                "Could not find espeak-ng binary. Please install espeak-ng:\n"
+                "Ubuntu/Debian: sudo apt-get install espeak-ng espeak-ng-data\n"
+                "Fedora: sudo dnf install espeak-ng\n"
+                "Arch: sudo pacman -S espeak-ng\n"
+                "MacOS: brew install espeak-ng\n"
+                "Windows: Download from https://github.com/espeak-ng/espeak-ng/releases"
+            )
+
+        # Find the library
+        library_path = cls.find_library_path()
+        if not library_path:
+            # On Linux, we might not need to explicitly set the library path
+            if platform.system() == "Linux":
+                return True, f"Using system espeak-ng installation at: {espeak_path}"
+            else:
+                raise FileNotFoundError(
+                    "Could not find espeak-ng library. Please ensure espeak-ng is properly installed."
+                )
+
+        # Try to set the library path
+        try:
+            EspeakWrapper.set_library(library_path)
+            return True, f"Successfully configured espeak-ng library at: {library_path}"
+        except Exception as e:
+            if platform.system() == "Linux":
+                # On Linux, try to continue without explicit library path
+                return True, f"Using system espeak-ng installation at: {espeak_path}"
+            else:
+                raise RuntimeError(f"Failed to configure espeak-ng library: {str(e)}")
+
+
+def setup_espeak():
+    """
+    Set up espeak-ng for use with the phonemizer.
+    Raises appropriate exceptions if setup fails.
+    """
+    try:
+        success, message = EspeakConfig.configure_espeak()
+        print(message)
+    except Exception as e:
+        print(f"Error configuring espeak-ng: {str(e)}")
+        raise
+
+
+# Replace the original set_espeak_library function with this
+set_espeak_library = setup_espeak
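
espeak_util.py exists so the phonemizer backend can locate the espeak-ng binary and shared library before any text is phonemized. A hedged usage sketch, assuming espeak-ng and the phonemizer package are installed; phonemize() is phonemizer's standard entry point, not something this commit adds:

from phonemizer import phonemize

from espeak_util import set_espeak_library

# Locate and register espeak-ng; raises FileNotFoundError with
# per-platform install hints if the binary cannot be found.
set_espeak_library()

# With espeak-ng configured, phonemization works as usual.
print(phonemize("Far over the misty mountains cold",
                language="en-us", backend="espeak"))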
requirements.txt CHANGED
@@ -1,5 +1,13 @@
-audiofile
-audresample
-matplotlib
 torch
+nltk
+pydantic==2.10.6
+librosa
 transformers
+phonemizer
+audiofile
+matplotlib
+numpy<2.0.0
+gradio==5.27.0
+audresample
+
+
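
Note the pins: gradio==5.27.0 here versus sdk_version: 5.41.1 in the README front matter, a mismatch worth reconciling when reproducing the environment locally. A small, illustrative sanity check that mirrors the pins above and nothing more:

# Illustrative check that a local environment matches the pinned versions.
from importlib.metadata import version

assert version("gradio") == "5.27.0", version("gradio")
assert version("pydantic") == "2.10.6", version("pydantic")

import numpy
assert int(numpy.__version__.split(".")[0]) < 2, "requirements pin numpy<2.0.0"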
wav/af_ZA_google-nwu_0184.wav ADDED
Binary file (92 kB).

wav/af_ZA_google-nwu_1919.wav ADDED
Binary file (92 kB).

wav/af_ZA_google-nwu_2418.wav ADDED
Binary file (92 kB).

wav/af_ZA_google-nwu_6590.wav ADDED
Binary file (92 kB).

wav/af_ZA_google-nwu_7130.wav ADDED
Binary file (92 kB).

wav/af_ZA_google-nwu_7214.wav ADDED
Binary file (92 kB).

wav/af_ZA_google-nwu_8148.wav ADDED
Binary file (92 kB).

wav/af_ZA_google-nwu_8924.wav ADDED
Binary file (92 kB).

wav/af_ZA_google-nwu_8963.wav ADDED
Binary file (92 kB).

wav/bn_multi_00737.wav ADDED
Binary file (92 kB).

wav/bn_multi_00779.wav ADDED
Binary file (92 kB).

wav/bn_multi_01232.wav ADDED
Binary file (92 kB).

wav/bn_multi_01701.wav ADDED
Binary file (92 kB).

wav/bn_multi_03042.wav ADDED
Binary file (92 kB).

wav/bn_multi_0834.wav ADDED
Binary file (92 kB).

wav/bn_multi_1010.wav ADDED
Binary file (92 kB).

wav/bn_multi_3108.wav ADDED
Binary file (92 kB).

wav/bn_multi_3713.wav ADDED
Binary file (92 kB).

wav/bn_multi_3958.wav ADDED
Binary file (92 kB).

wav/bn_multi_4046.wav ADDED
Binary file (92 kB).

wav/bn_multi_4811.wav ADDED
Binary file (92 kB).

wav/bn_multi_5958.wav ADDED
Binary file (92 kB).

wav/bn_multi_9169.wav ADDED
Binary file (92 kB).

wav/bn_multi_rm.wav ADDED
Binary file (92 kB).

wav/de_DE_m-ailabs_angela_merkel.wav ADDED
Binary file (90.7 kB).

wav/de_DE_m-ailabs_eva_k.wav ADDED
Binary file (92.7 kB).

wav/de_DE_m-ailabs_karlsson.wav ADDED
Binary file (92.7 kB).

wav/de_DE_m-ailabs_ramona_deininger.wav ADDED
Binary file (91.2 kB).

wav/de_DE_m-ailabs_rebecca_braunert_plunkett.wav ADDED
Binary file (91.2 kB).

wav/de_DE_thorsten-emotion_amused.wav ADDED
Binary file (92 kB).

wav/el_GR_rapunzelina.wav ADDED
Binary file (92 kB).

wav/en_UK_apope.wav ADDED
Binary file (92 kB).

wav/en_US_cmu_arctic_aew.wav ADDED
Binary file (92 kB).

wav/en_US_cmu_arctic_aup.wav ADDED
Binary file (94.3 kB).

wav/en_US_cmu_arctic_awb.wav ADDED
Binary file (92 kB).

wav/en_US_cmu_arctic_awbrms.wav ADDED
Binary file (92.7 kB).

wav/en_US_cmu_arctic_axb.wav ADDED
Binary file (92 kB).

wav/en_US_cmu_arctic_bdl.wav ADDED
Binary file (94.8 kB).

wav/en_US_cmu_arctic_clb.wav ADDED
Binary file (92 kB).

wav/en_US_cmu_arctic_eey.wav ADDED
Binary file (95.3 kB).

wav/en_US_cmu_arctic_fem.wav ADDED
Binary file (94.8 kB).

wav/en_US_cmu_arctic_gka.wav ADDED
Binary file (95.3 kB).

wav/en_US_cmu_arctic_jmk.wav ADDED
Binary file (93.2 kB).

wav/en_US_cmu_arctic_ksp.wav ADDED
Binary file (92 kB).

wav/en_US_cmu_arctic_ljm.wav ADDED
Binary file (94.3 kB).

wav/en_US_cmu_arctic_lnh.wav ADDED
Binary file (94.8 kB).