wav of voices

This view is limited to 50 files because it contains too many changes.
- README.md +5 -5
- app.py +221 -62
- espeak_util.py +206 -0
- requirements.txt +11 -3
- wav/af_ZA_google-nwu_0184.wav +0 -0
- wav/af_ZA_google-nwu_1919.wav +0 -0
- wav/af_ZA_google-nwu_2418.wav +0 -0
- wav/af_ZA_google-nwu_6590.wav +0 -0
- wav/af_ZA_google-nwu_7130.wav +0 -0
- wav/af_ZA_google-nwu_7214.wav +0 -0
- wav/af_ZA_google-nwu_8148.wav +0 -0
- wav/af_ZA_google-nwu_8924.wav +0 -0
- wav/af_ZA_google-nwu_8963.wav +0 -0
- wav/bn_multi_00737.wav +0 -0
- wav/bn_multi_00779.wav +0 -0
- wav/bn_multi_01232.wav +0 -0
- wav/bn_multi_01701.wav +0 -0
- wav/bn_multi_03042.wav +0 -0
- wav/bn_multi_0834.wav +0 -0
- wav/bn_multi_1010.wav +0 -0
- wav/bn_multi_3108.wav +0 -0
- wav/bn_multi_3713.wav +0 -0
- wav/bn_multi_3958.wav +0 -0
- wav/bn_multi_4046.wav +0 -0
- wav/bn_multi_4811.wav +0 -0
- wav/bn_multi_5958.wav +0 -0
- wav/bn_multi_9169.wav +0 -0
- wav/bn_multi_rm.wav +0 -0
- wav/de_DE_m-ailabs_angela_merkel.wav +0 -0
- wav/de_DE_m-ailabs_eva_k.wav +0 -0
- wav/de_DE_m-ailabs_karlsson.wav +0 -0
- wav/de_DE_m-ailabs_ramona_deininger.wav +0 -0
- wav/de_DE_m-ailabs_rebecca_braunert_plunkett.wav +0 -0
- wav/de_DE_thorsten-emotion_amused.wav +0 -0
- wav/el_GR_rapunzelina.wav +0 -0
- wav/en_UK_apope.wav +0 -0
- wav/en_US_cmu_arctic_aew.wav +0 -0
- wav/en_US_cmu_arctic_aup.wav +0 -0
- wav/en_US_cmu_arctic_awb.wav +0 -0
- wav/en_US_cmu_arctic_awbrms.wav +0 -0
- wav/en_US_cmu_arctic_axb.wav +0 -0
- wav/en_US_cmu_arctic_bdl.wav +0 -0
- wav/en_US_cmu_arctic_clb.wav +0 -0
- wav/en_US_cmu_arctic_eey.wav +0 -0
- wav/en_US_cmu_arctic_fem.wav +0 -0
- wav/en_US_cmu_arctic_gka.wav +0 -0
- wav/en_US_cmu_arctic_jmk.wav +0 -0
- wav/en_US_cmu_arctic_ksp.wav +0 -0
- wav/en_US_cmu_arctic_ljm.wav +0 -0
- wav/en_US_cmu_arctic_lnh.wav +0 -0
README.md
CHANGED
@@ -6,13 +6,13 @@ colorTo: gray
 sdk: gradio
 sdk_version: 5.41.1
 app_file: app.py
-
+short_description: TTS for CPU
 license: cc-by-nc-4.0
 tags:
--
--
--
--
+- non-AR
+- affective
+- shift
+- tts
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
CHANGED
@@ -1,22 +1,25 @@
+# -*- coding: utf-8 -*-
 import typing
 import types  # fusion of forward() of Wav2Vec2
 import gradio as gr
 import matplotlib.pyplot as plt
 import numpy as np
-import
+import os
 import torch
 import torch.nn as nn
 from transformers import Wav2Vec2Processor
 from transformers.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2Model
 from transformers.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2PreTrainedModel
-
 import audiofile
+import unicodedata
+import textwrap
+from tts import StyleTTS2
 import audresample
 
 
 device = 0 if torch.cuda.is_available() else "cpu"
 duration = 2  # limit processing of audio
-age_gender_model_name = "audeering/wav2vec2-large-robust-
+age_gender_model_name = "audeering/wav2vec2-large-robust-6-ft-age-gender"
 expression_model_name = "audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim"
 
 
@@ -167,7 +170,7 @@ class ExpressionModel(Wav2Vec2PreTrainedModel):
 
 
 # Load models from hub
-
+
 age_gender_model = AgeGenderModel.from_pretrained(age_gender_model_name)
 expression_processor = Wav2Vec2Processor.from_pretrained(expression_model_name)
 expression_model = ExpressionModel.from_pretrained(expression_model_name)
@@ -206,12 +209,9 @@ def process_func(x: np.ndarray, sampling_rate: int) -> typing.Tuple[str, dict, str]:
         },
         expression_file,
     )
+
 
-
-@spaces.GPU
-def recognize(input_file: str) -> typing.Tuple[str, dict, str]:
-    # sampling_rate, signal = input_microphone
-    # signal = signal.astype(np.float32, order="C") / 32768.0
+def recognize(input_file):
     if input_file is None:
         raise gr.Error(
             "No audio file submitted! "
@@ -227,50 +227,6 @@ def recognize(input_file):
     return process_func(signal, target_rate)
 
 
-def plot_expression_RIGID(arousal, dominance, valence):
-    r"""3D pixel plot of arousal, dominance, valence."""
-    # Voxels per dimension
-    voxels = 7
-    # Create voxel grid
-    x, y, z = np.indices((voxels + 1, voxels + 1, voxels + 1))
-    voxel = (
-        (x == round(arousal * voxels))
-        & (y == round(dominance * voxels))
-        & (z == round(valence * voxels))
-    )
-    projection = (
-        (x == round(arousal * voxels))
-        & (y == round(dominance * voxels))
-        & (z < round(valence * voxels))
-    )
-    colors = np.empty((voxel | projection).shape, dtype=object)
-    colors[voxel] = "#fcb06c"
-    colors[projection] = "#fed7a9"
-    ax = plt.figure().add_subplot(projection='3d')
-    ax.voxels(voxel | projection, facecolors=colors, edgecolor='k')
-    ax.set_xlim([0, voxels])
-    ax.set_ylim([0, voxels])
-    ax.set_zlim([0, voxels])
-    ax.set_aspect("equal")
-    ax.set_xlabel("arousal", fontsize="large", labelpad=0)
-    ax.set_ylabel("dominance", fontsize="large", labelpad=0)
-    ax.set_zlabel("valence", fontsize="large", labelpad=0)
-    ax.set_xticks(
-        list(range(voxels + 1)),
-        labels=[0, None, None, None, None, None, None, 1],
-        verticalalignment="bottom",
-    )
-    ax.set_yticks(
-        list(range(voxels + 1)),
-        labels=[0, None, None, None, None, None, None, 1],
-        verticalalignment="bottom",
-    )
-    ax.set_zticks(
-        list(range(voxels + 1)),
-        labels=[0, None, None, None, None, None, None, 1],
-        verticalalignment="top",
-    )
-
 def explode(data):
     """
     Expands a 3D array by creating gaps between voxels.
@@ -282,6 +238,18 @@ def explode(data):
     retval[::2, ::2, ::2] = data
     return retval
 
+
+def explode(data):
+    """
+    Expands a 3D array by adding new voxels between existing ones.
+    This is used to create the gaps in the 3D plot.
+    """
+    shape = data.shape
+    new_shape = (2 * shape[0] - 1, 2 * shape[1] - 1, 2 * shape[2] - 1)
+    new_data = np.zeros(new_shape, dtype=data.dtype)
+    new_data[::2, ::2, ::2] = data
+    return new_data
+
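For reference, a quick sanity check of the rewritten explode() (illustrative, not part of the commit): the 2*n-1 output shape leaves a one-voxel gap between the original entries.

    import numpy as np

    data = np.ones((2, 2, 2))
    out = explode(data)
    print(out.shape)                   # (3, 3, 3)
    print(out[0, 0, 0], out[1, 0, 0])  # 1.0 0.0  (originals at even indices, gaps stay zero)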
 def plot_expression(arousal, dominance, valence):
     '''_h = cuda tensor (N_PIX, N_PIX, N_PIX)'''
 
@@ -302,7 +270,8 @@ def plot_expression(arousal, dominance, valence):
     y[:, 1::2, :] += 1
     z[:, :, 1::2] += 1
 
-
+    fig = plt.figure()
+    ax = fig.add_subplot(projection='3d')
 
     f_2 = np.ones([2 * N_PIX - 1,
                    2 * N_PIX - 1,
@@ -313,7 +282,6 @@ def plot_expression(arousal, dominance, valence):
 
     f_2[:, :, :, 3] = f_2[:, :, :, 3].clip(.01, .74)
 
-    print(f_2.shape, 'f_2 AAAA')
     ecolors_2 = f_2
 
     ax.voxels(x, y, z, filled_2, facecolors=f_2, edgecolors=.006 * ecolors_2)
@@ -348,10 +316,156 @@ def plot_expression(arousal, dominance, valence):
     ax.set_xlim(0, N_PIX)
     ax.set_ylim(0, N_PIX)
     ax.set_zlim(0, N_PIX)
+    # plt.show()
 
-
 
-
+
+# TTS
+VOICES = [f'wav/{vox}' for vox in os.listdir('wav')]
+_tts = StyleTTS2().to('cpu')
+
+def only_greek_or_only_latin(text, lang='grc'):
+    '''
+    str: The converted string in the specified target script.
+         Characters not found in any mapping are preserved as is.
+         Latin accented characters in the input (e.g., 'É', 'ü') will
+         be preserved in their lowercase form (e.g., 'é', 'ü') if
+         converting to Latin.
+    '''
+
+    # --- Mapping Dictionaries ---
+    # Keys are in lowercase as input text is case-folded.
+    # If the output needs to maintain original casing, additional logic is required.
+
+    latin_to_greek_map = {
+        'a': 'α', 'b': 'β', 'g': 'γ', 'd': 'δ', 'e': 'ε',
+        'ch': 'τσο',  # Example of a multi-character Latin sequence
+        'z': 'ζ', 'h': 'χ', 'i': 'ι', 'k': 'κ', 'l': 'λ',
+        'm': 'μ', 'n': 'ν', 'x': 'ξ', 'o': 'ο', 'p': 'π',
+        'v': 'β', 'sc': 'σκ', 'r': 'ρ', 's': 'σ', 't': 'τ',
+        'u': 'ου', 'f': 'φ', 'c': 'σ', 'w': 'β', 'y': 'γ',
+    }
+
+    greek_to_latin_map = {
+        'ου': 'ou',  # Prioritize common diphthongs/digraphs
+        'α': 'a', 'β': 'v', 'γ': 'g', 'δ': 'd', 'ε': 'e',
+        'ζ': 'z', 'η': 'i', 'θ': 'th', 'ι': 'i', 'κ': 'k',
+        'λ': 'l', 'μ': 'm', 'ν': 'n', 'ξ': 'x', 'ο': 'o',
+        'π': 'p', 'ρ': 'r', 'σ': 's', 'τ': 't', 'υ': 'y',  # 'y' is a common transliteration for upsilon
+        'φ': 'f', 'χ': 'ch', 'ψ': 'ps', 'ω': 'o',
+        'ς': 's',  # Final sigma
+    }
+
+    cyrillic_to_latin_map = {
+        'а': 'a', 'б': 'b', 'в': 'v', 'г': 'g', 'д': 'd', 'е': 'e', 'ё': 'yo', 'ж': 'zh',
+        'з': 'z', 'и': 'i', 'й': 'y', 'к': 'k', 'л': 'l', 'м': 'm', 'н': 'n', 'о': 'o',
+        'п': 'p', 'р': 'r', 'с': 's', 'т': 't', 'у': 'u', 'ф': 'f', 'х': 'kh', 'ц': 'ts',
+        'ч': 'ch', 'ш': 'sh', 'щ': 'shch', 'ъ': '', 'ы': 'y', 'ь': '', 'э': 'e', 'ю': 'yu',
+        'я': 'ya',
+    }
+
+    # Direct Cyrillic to Greek mapping based on phonetic similarity.
+    # These are approximations and may not be universally accepted transliterations.
+    cyrillic_to_greek_map = {
+        'а': 'α', 'б': 'β', 'в': 'β', 'г': 'γ', 'д': 'δ', 'е': 'ε', 'ё': 'ιο', 'ж': 'ζ',
+        'з': 'ζ', 'и': 'ι', 'й': 'ι', 'к': 'κ', 'л': 'λ', 'м': 'μ', 'н': 'ν', 'о': 'ο',
+        'п': 'π', 'р': 'ρ', 'с': 'σ', 'т': 'τ', 'у': 'ου', 'ф': 'φ', 'х': 'χ', 'ц': 'τσ',
+        'ч': 'τσ',  # or τζ depending on desired sound
+        'ш': 'σ', 'щ': 'σ',  # approximations
+        'ъ': '', 'ы': 'ι', 'ь': '', 'э': 'ε', 'ю': 'ιου',
+        'я': 'ια',
+    }
+
+    # Convert the input text to lowercase, preserving accents for Latin characters.
+    # casefold() is used for more robust caseless matching across Unicode characters.
+    lowercased_text = text.lower()  # casefold()
+    output_chars = []
+    current_index = 0
+
+    if lang == 'grc':
+        # Combine all relevant maps for direct lookup to Greek
+        conversion_map = {**latin_to_greek_map, **cyrillic_to_greek_map}
+
+        # Sort keys by length in reverse order to handle multi-character sequences first
+        sorted_source_keys = sorted(
+            list(latin_to_greek_map.keys()) + list(cyrillic_to_greek_map.keys()),
+            key=len,
+            reverse=True
+        )
+
+        while current_index < len(lowercased_text):
+            found_conversion = False
+            for key in sorted_source_keys:
+                if lowercased_text.startswith(key, current_index):
+                    output_chars.append(conversion_map[key])
+                    current_index += len(key)
+                    found_conversion = True
+                    break
+            if not found_conversion:
+                # If no specific mapping found, append the character as is.
+                # This handles unmapped characters and already Greek characters.
+                output_chars.append(lowercased_text[current_index])
+                current_index += 1
+        return ''.join(output_chars)
+
+    else:  # Default to 'lat' conversion
+        # Combine Greek to Latin and Cyrillic to Latin maps.
+        # Cyrillic map keys will take precedence in case of overlap if defined after Greek.
+        combined_to_latin_map = {**greek_to_latin_map, **cyrillic_to_latin_map}
+
+        # Sort all relevant source keys by length in reverse for replacement
+        sorted_source_keys = sorted(
+            list(greek_to_latin_map.keys()) + list(cyrillic_to_latin_map.keys()),
+            key=len,
+            reverse=True
+        )
+
+        while current_index < len(lowercased_text):
+            found_conversion = False
+            for key in sorted_source_keys:
+                if lowercased_text.startswith(key, current_index):
+                    latin_equivalent = combined_to_latin_map[key]
+
+                    # Strip accents ONLY if the source character was from the Greek map.
+                    # This preserves accents on original Latin characters (like 'é')
+                    # and allows for intentional accent stripping from Greek transliterations.
+                    if key in greek_to_latin_map:
+                        normalized_latin = unicodedata.normalize('NFD', latin_equivalent)
+                        stripped_latin = ''.join(c for c in normalized_latin if not unicodedata.combining(c))
+                        output_chars.append(stripped_latin)
+                    else:
+                        output_chars.append(latin_equivalent)
+
+                    current_index += len(key)
+                    found_conversion = True
+                    break
+
+            if not found_conversion:
+                # If no conversion happened from Greek or Cyrillic, append the character as is.
+                # This preserves existing Latin characters (including accented ones from input),
+                # numbers, punctuation, and other symbols.
+                output_chars.append(lowercased_text[current_index])
+                current_index += 1
+
+        return ''.join(output_chars)
+
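To make the behaviour above concrete, a few illustrative calls (expected outputs derived from the maps as written; not part of the commit). Keys are tried longest-first, so multi-character sequences such as 'ch' win over their single-letter prefixes:

    only_greek_or_only_latin('chaos', lang='grc')   # 'ch' -> 'τσο', then 'a', 'o', 's' -> 'τσοαοσ'
    only_greek_or_only_latin('γεια', lang='lat')    # -> 'geia'
    only_greek_or_only_latin('привет', lang='lat')  # -> 'privet'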
+def other_tts(text='Hallov worlds Far over the',
+              ref_s='wav/af_ZA_google-nwu_0184.wav'):
+
+    text = only_greek_or_only_latin(text, lang='eng')
+
+    x = _tts.inference(text, ref_s=ref_s)[0, 0, :].cpu().numpy()
+
+    # x /= np.abs(x).max() + 1e-7 ~ Volume normalisation @api.py:tts_multi_sentence() OR demo.py
+
+    tmp_file = f'_speech.wav'  # N x clients (cleanup vs tmp file / client)
+
+    audiofile.write(tmp_file, x, 24000)
+
+    return tmp_file
+
+
+def update_selected_voice(voice_filename):
+    return 'wav/' + voice_filename + '.wav'
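Assuming the tts.StyleTTS2 module imported at the top of app.py is available, other_tts() can be smoke-tested outside Gradio (hypothetical local check, not part of the commit):

    wav_path = other_tts('Far over the misty mountains cold',
                         ref_s='wav/en_US_cmu_arctic_awb.wav')
    # writes and returns '_speech.wav' at 24 kHz; note all clients share this one
    # temp file, the trade-off flagged in the "N x clients" comment above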
 
 
 description = (
@@ -366,7 +480,52 @@ description = (
 )
 
 with gr.Blocks() as demo:
-    with gr.Tab(label="
+    with gr.Tab(label="other TTS"):
+
+        selected_voice = gr.State(value='wav/en_US_m-ailabs_mary_ann.wav')
+
+        with gr.Row():
+            voice_info = gr.Markdown(f'TTS vox : `{selected_voice.value}`')
+
+        # Main input and output components
+        with gr.Row():
+            text_input = gr.Textbox(
+                label="Enter text for TTS:",
+                placeholder="Type your message here...",
+                lines=4,
+                value="Farover the misty mountains cold too dungeons deep and caverns old.",
+            )
+            generate_button = gr.Button("Generate Audio", variant="primary")
+
+        output_audio = gr.Audio(label="TTS Output")
+
+        with gr.Column():
+            voice_buttons = []
+            for i in range(0, len(VOICES), 7):
+                with gr.Row():
+                    for voice_filename in VOICES[i:i+7]:
+                        voice_filename = voice_filename[4:-4]  # drop wav/ for visibility
+                        button = gr.Button(voice_filename)
+
+                        button.click(
+                            fn=update_selected_voice,
+                            inputs=[gr.Textbox(value=voice_filename, visible=False)],
+                            outputs=[selected_voice]
+                        )
+                        button.click(
+                            fn=lambda v=voice_filename: f"TTS Vox = `{v}`",
+                            inputs=None,
+                            outputs=voice_info
+                        )
+                        voice_buttons.append(button)
+
+        generate_button.click(
+            fn=other_tts,
+            inputs=[text_input, selected_voice],
+            outputs=output_audio
+        )
+
+    with gr.Tab(label="Speech Analysis"):
         with gr.Row():
             with gr.Column():
                 gr.Markdown(description)
@@ -378,10 +537,10 @@ with gr.Blocks() as demo:
             )
             gr.Examples(
                 [
-                    "female-46-neutral.wav",
-                    "female-20-happy.wav",
-                    "male-60-angry.wav",
-                    "male-27-sad.wav",
+                    "wav/female-46-neutral.wav",
+                    "wav/female-20-happy.wav",
+                    "wav/male-60-angry.wav",
+                    "wav/male-27-sad.wav",
                 ],
                 [input],
                 label="Examples from CREMA-D, ODbL v1.0 license",
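One detail worth noting in the button wiring above: the second click handler binds the voice name through a lambda default argument (v=voice_filename). Without that default, Python's late-binding closures would make every button report the loop's last voice; a minimal illustration (not part of the commit):

    callbacks = [lambda: name for name in ('alice', 'bob')]
    print([f() for f in callbacks])             # ['bob', 'bob']   (late binding)
    callbacks = [lambda n=name: n for name in ('alice', 'bob')]
    print([f() for f in callbacks])             # ['alice', 'bob'] (bound per iteration)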
espeak_util.py
ADDED
@@ -0,0 +1,206 @@
import platform
import subprocess
import shutil
from pathlib import Path
import os
from typing import Optional, Tuple
from phonemizer.backend.espeak.wrapper import EspeakWrapper


class EspeakConfig:
    """Utility class for configuring espeak-ng library and binary."""

    @staticmethod
    def find_espeak_binary() -> tuple[bool, Optional[str]]:
        """
        Find espeak-ng binary using multiple methods.

        Returns:
            tuple: (bool indicating if espeak is available, path to espeak binary if found)
        """
        # Common binary names
        binary_names = ["espeak-ng", "espeak"]
        if platform.system() == "Windows":
            binary_names = ["espeak-ng.exe", "espeak.exe"]

        # Common installation directories for Linux
        linux_paths = [
            "/usr/bin",
            "/usr/local/bin",
            "/usr/lib/espeak-ng",
            "/usr/local/lib/espeak-ng",
            "/opt/espeak-ng/bin",
        ]

        # First check if it's in PATH
        for name in binary_names:
            espeak_path = shutil.which(name)
            if espeak_path:
                return True, espeak_path

        # For Linux, check common installation directories
        if platform.system() == "Linux":
            for directory in linux_paths:
                for name in binary_names:
                    path = Path(directory) / name
                    if path.exists():
                        return True, str(path)

        # Try running the command directly as a last resort
        try:
            subprocess.run(
                ["espeak-ng", "--version"],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                check=True,
            )
            return True, "espeak-ng"
        except (subprocess.SubprocessError, FileNotFoundError):
            pass

        return False, None

    @staticmethod
    def find_library_path() -> Optional[str]:
        """
        Find the espeak-ng library using multiple search methods.

        Returns:
            Optional[str]: Path to the library if found, None otherwise
        """
        system = platform.system()

        if system == "Linux":
            lib_names = ["libespeak-ng.so", "libespeak-ng.so.1"]
            common_paths = [
                # Debian/Ubuntu paths
                "/usr/lib/x86_64-linux-gnu",
                "/usr/lib/aarch64-linux-gnu",  # For ARM64
                "/usr/lib/arm-linux-gnueabihf",  # For ARM32
                "/usr/lib",
                "/usr/local/lib",
                # Fedora/RHEL paths
                "/usr/lib64",
                "/usr/lib32",
                # Common additional paths
                "/usr/lib/espeak-ng",
                "/usr/local/lib/espeak-ng",
                "/opt/espeak-ng/lib",
            ]

            # Check common locations first
            for path in common_paths:
                for lib_name in lib_names:
                    lib_path = Path(path) / lib_name
                    if lib_path.exists():
                        return str(lib_path)

            # Search system library paths
            try:
                # Use ldconfig to find the library
                result = subprocess.run(
                    ["ldconfig", "-p"], capture_output=True, text=True, check=True
                )
                for line in result.stdout.splitlines():
                    if "libespeak-ng.so" in line:
                        # Extract path from ldconfig output
                        return line.split("=>")[-1].strip()
            except (subprocess.SubprocessError, FileNotFoundError):
                pass

        elif system == "Darwin":  # macOS
            common_paths = [
                Path("/opt/homebrew/lib/libespeak-ng.dylib"),
                Path("/usr/local/lib/libespeak-ng.dylib"),
                *list(
                    Path("/opt/homebrew/Cellar/espeak-ng").glob(
                        "*/lib/libespeak-ng.dylib"
                    )
                ),
                *list(
                    Path("/usr/local/Cellar/espeak-ng").glob("*/lib/libespeak-ng.dylib")
                ),
            ]

            for path in common_paths:
                if path.exists():
                    return str(path)

        elif system == "Windows":
            common_paths = [
                Path(os.environ.get("PROGRAMFILES", "C:\\Program Files"))
                / "eSpeak NG"
                / "libespeak-ng.dll",
                Path(os.environ.get("PROGRAMFILES(X86)", "C:\\Program Files (x86)"))
                / "eSpeak NG"
                / "libespeak-ng.dll",
                *[
                    Path(p) / "libespeak-ng.dll"
                    for p in os.environ.get("PATH", "").split(os.pathsep)
                ],
            ]

            for path in common_paths:
                if path.exists():
                    return str(path)

        return None

    @classmethod
    def configure_espeak(cls) -> Tuple[bool, str]:
        """
        Configure espeak-ng for use with the phonemizer.

        Returns:
            Tuple[bool, str]: (Success status, Status message)
        """
        # First check if espeak binary is available
        espeak_available, espeak_path = cls.find_espeak_binary()
        if not espeak_available:
            raise FileNotFoundError(
                "Could not find espeak-ng binary. Please install espeak-ng:\n"
                "Ubuntu/Debian: sudo apt-get install espeak-ng espeak-ng-data\n"
                "Fedora: sudo dnf install espeak-ng\n"
                "Arch: sudo pacman -S espeak-ng\n"
                "MacOS: brew install espeak-ng\n"
                "Windows: Download from https://github.com/espeak-ng/espeak-ng/releases"
            )

        # Find the library
        library_path = cls.find_library_path()
        if not library_path:
            # On Linux, we might not need to explicitly set the library path
            if platform.system() == "Linux":
                return True, f"Using system espeak-ng installation at: {espeak_path}"
            else:
                raise FileNotFoundError(
                    "Could not find espeak-ng library. Please ensure espeak-ng is properly installed."
                )

        # Try to set the library path
        try:
            EspeakWrapper.set_library(library_path)
            return True, f"Successfully configured espeak-ng library at: {library_path}"
        except Exception as e:
            if platform.system() == "Linux":
                # On Linux, try to continue without explicit library path
                return True, f"Using system espeak-ng installation at: {espeak_path}"
            else:
                raise RuntimeError(f"Failed to configure espeak-ng library: {str(e)}")


def setup_espeak():
    """
    Set up espeak-ng for use with the phonemizer.
    Raises appropriate exceptions if setup fails.
    """
    try:
        success, message = EspeakConfig.configure_espeak()
        print(message)
    except Exception as e:
        print(f"Error configuring espeak-ng: {str(e)}")
        raise


# Replace the original set_espeak_library function with this
set_espeak_library = setup_espeak
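A minimal usage sketch for the new module (illustrative, not part of the commit), assuming espeak-ng is installed and phonemizer is importable; the text and language code are arbitrary examples:

    from espeak_util import set_espeak_library
    from phonemizer import phonemize

    set_espeak_library()  # locate espeak-ng and register its library with phonemizer
    print(phonemize('far over the misty mountains', language='en-us', backend='espeak'))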
requirements.txt
CHANGED
@@ -1,5 +1,13 @@
-audiofile
-audresample
-matplotlib
 torch
+nltk
+pydantic==2.10.6
+librosa
 transformers
+phonemizer
+audiofile
+matplotlib
+numpy<2.0.0
+gradio==5.27.0
+audresample
+
+
wav/af_ZA_google-nwu_0184.wav ADDED Binary file (92 kB)
wav/af_ZA_google-nwu_1919.wav ADDED Binary file (92 kB)
wav/af_ZA_google-nwu_2418.wav ADDED Binary file (92 kB)
wav/af_ZA_google-nwu_6590.wav ADDED Binary file (92 kB)
wav/af_ZA_google-nwu_7130.wav ADDED Binary file (92 kB)
wav/af_ZA_google-nwu_7214.wav ADDED Binary file (92 kB)
wav/af_ZA_google-nwu_8148.wav ADDED Binary file (92 kB)
wav/af_ZA_google-nwu_8924.wav ADDED Binary file (92 kB)
wav/af_ZA_google-nwu_8963.wav ADDED Binary file (92 kB)
wav/bn_multi_00737.wav ADDED Binary file (92 kB)
wav/bn_multi_00779.wav ADDED Binary file (92 kB)
wav/bn_multi_01232.wav ADDED Binary file (92 kB)
wav/bn_multi_01701.wav ADDED Binary file (92 kB)
wav/bn_multi_03042.wav ADDED Binary file (92 kB)
wav/bn_multi_0834.wav ADDED Binary file (92 kB)
wav/bn_multi_1010.wav ADDED Binary file (92 kB)
wav/bn_multi_3108.wav ADDED Binary file (92 kB)
wav/bn_multi_3713.wav ADDED Binary file (92 kB)
wav/bn_multi_3958.wav ADDED Binary file (92 kB)
wav/bn_multi_4046.wav ADDED Binary file (92 kB)
wav/bn_multi_4811.wav ADDED Binary file (92 kB)
wav/bn_multi_5958.wav ADDED Binary file (92 kB)
wav/bn_multi_9169.wav ADDED Binary file (92 kB)
wav/bn_multi_rm.wav ADDED Binary file (92 kB)
wav/de_DE_m-ailabs_angela_merkel.wav ADDED Binary file (90.7 kB)
wav/de_DE_m-ailabs_eva_k.wav ADDED Binary file (92.7 kB)
wav/de_DE_m-ailabs_karlsson.wav ADDED Binary file (92.7 kB)
wav/de_DE_m-ailabs_ramona_deininger.wav ADDED Binary file (91.2 kB)
wav/de_DE_m-ailabs_rebecca_braunert_plunkett.wav ADDED Binary file (91.2 kB)
wav/de_DE_thorsten-emotion_amused.wav ADDED Binary file (92 kB)
wav/el_GR_rapunzelina.wav ADDED Binary file (92 kB)
wav/en_UK_apope.wav ADDED Binary file (92 kB)
wav/en_US_cmu_arctic_aew.wav ADDED Binary file (92 kB)
wav/en_US_cmu_arctic_aup.wav ADDED Binary file (94.3 kB)
wav/en_US_cmu_arctic_awb.wav ADDED Binary file (92 kB)
wav/en_US_cmu_arctic_awbrms.wav ADDED Binary file (92.7 kB)
wav/en_US_cmu_arctic_axb.wav ADDED Binary file (92 kB)
wav/en_US_cmu_arctic_bdl.wav ADDED Binary file (94.8 kB)
wav/en_US_cmu_arctic_clb.wav ADDED Binary file (92 kB)
wav/en_US_cmu_arctic_eey.wav ADDED Binary file (95.3 kB)
wav/en_US_cmu_arctic_fem.wav ADDED Binary file (94.8 kB)
wav/en_US_cmu_arctic_gka.wav ADDED Binary file (95.3 kB)
wav/en_US_cmu_arctic_jmk.wav ADDED Binary file (93.2 kB)
wav/en_US_cmu_arctic_ksp.wav ADDED Binary file (92 kB)
wav/en_US_cmu_arctic_ljm.wav ADDED Binary file (94.3 kB)
wav/en_US_cmu_arctic_lnh.wav ADDED Binary file (94.8 kB)