Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -15,46 +15,56 @@ def text_to_speech_with_emotion(text, emotion, language='en'):
|
|
15 |
fp.seek(0)
|
16 |
audio = AudioSegment.from_mp3(fp)
|
17 |
|
18 |
-
# Adjust audio based on emotion
|
19 |
if emotion == "Happy":
|
20 |
-
audio = audio.speedup(playback_speed=1.
|
21 |
elif emotion == "Sad":
|
22 |
-
audio = audio.speedup(playback_speed=0.
|
23 |
elif emotion == "Angry":
|
24 |
-
audio = audio +
|
|
|
|
|
|
|
25 |
|
26 |
return audio
|
27 |
|
28 |
-
def
|
29 |
-
sr =
|
30 |
t = np.linspace(0, duration, int(sr * duration), False)
|
31 |
|
32 |
if emotion == "Happy":
|
33 |
freq = 440 # A4 note
|
34 |
-
audio = np.sin(2 * np.pi * freq * t) * np.exp(-
|
35 |
elif emotion == "Sad":
|
36 |
freq = 294 # D4 note
|
37 |
-
audio = np.sin(2 * np.pi * freq * t) * np.exp(-
|
38 |
elif emotion == "Angry":
|
39 |
freq = 392 # G4 note
|
40 |
-
audio = np.sign(np.sin(2 * np.pi * freq * t)) * np.exp(-
|
41 |
else: # Neutral
|
42 |
freq = 329 # E4 note
|
43 |
-
audio = np.sin(2 * np.pi * freq * t) * np.exp(-
|
44 |
|
45 |
-
audio = np.concatenate([audio, np.zeros(int(sr * 0.5))]) # Add 0.5s silence
|
46 |
audio = audio / np.max(np.abs(audio)) # Normalize
|
47 |
-
return AudioSegment(audio.tobytes(),
|
|
|
48 |
|
49 |
def generate_dialogue_and_sound(text, emotion, language):
|
50 |
speech = text_to_speech_with_emotion(text, emotion, language)
|
51 |
-
sound =
|
|
|
|
|
|
|
52 |
|
53 |
# Combine speech and sound
|
54 |
-
combined = speech.overlay(sound, position=0)
|
|
|
|
|
|
|
|
|
55 |
|
56 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
|
57 |
-
combined.export(fp.name, format="mp3")
|
58 |
return fp.name
|
59 |
|
60 |
# Gradio interface
|
@@ -66,8 +76,8 @@ iface = gr.Interface(
|
|
66 |
gr.Dropdown(["en", "es", "fr", "de", "it"], label="Language", value="en")
|
67 |
],
|
68 |
outputs=gr.Audio(label="Generated Dialogue with Emotion"),
|
69 |
-
title="Emotional Dialogue
|
70 |
-
description="Generate dialogues with
|
71 |
)
|
72 |
|
73 |
iface.launch()
|
|
|
15 |
fp.seek(0)
|
16 |
audio = AudioSegment.from_mp3(fp)
|
17 |
|
18 |
+
# Adjust audio based on emotion (more subtle adjustments)
|
19 |
if emotion == "Happy":
|
20 |
+
audio = audio.speedup(playback_speed=1.1)
|
21 |
elif emotion == "Sad":
|
22 |
+
audio = audio.speedup(playback_speed=0.9)
|
23 |
elif emotion == "Angry":
|
24 |
+
audio = audio + 2 # Increase volume slightly
|
25 |
+
|
26 |
+
# Apply a high-pass filter to reduce low-frequency noise
|
27 |
+
audio = audio.high_pass_filter(80)
|
28 |
|
29 |
return audio
|
30 |
|
31 |
+
def generate_subtle_emotion_sound(emotion, duration):
    """Synthesize a short decaying tone that hints at the given emotion.

    Args:
        emotion: "Happy", "Sad" or "Angry"; any other value is treated as
            neutral.
        duration: Length of the tone in seconds.

    Returns:
        A mono, 16-bit, 44.1 kHz ``AudioSegment`` holding the tone,
        peak-normalized before conversion to PCM.
    """
    sr = 44100  # Higher sample rate for better quality
    t = np.linspace(0, duration, int(sr * duration), False)

    # Each emotion picks a pitch and how quickly the tone fades out.
    if emotion == "Happy":
        freq, decay = 440, 3  # A4 note
    elif emotion == "Sad":
        freq, decay = 294, 2  # D4 note
    elif emotion == "Angry":
        freq, decay = 392, 4  # G4 note
    else:  # Neutral
        freq, decay = 329, 3  # E4 note

    wave = np.sin(2 * np.pi * freq * t)
    if emotion == "Angry":
        # Square wave instead of a sine for a harsher timbre.
        wave = np.sign(wave)
    audio = wave * np.exp(-decay * t)

    # Normalize, but skip the division for an empty or all-zero signal
    # (e.g. duration == 0), where np.max would raise or divide by zero.
    peak = np.max(np.abs(audio)) if audio.size else 0.0
    if peak > 0:
        audio = audio / peak

    # AudioSegment treats raw data as signed integer PCM, so float samples
    # must be scaled to 16-bit integers; passing float32 bytes with
    # sample_width=4 would be decoded as 32-bit integers, i.e. noise.
    pcm = (audio * 32767).astype(np.int16)
    return AudioSegment(pcm.tobytes(),
                        frame_rate=sr, sample_width=2, channels=1)
|
51 |
|
52 |
def generate_dialogue_and_sound(text, emotion, language):
    """Render text as speech, mix in an emotion cue, and save it as an MP3.

    Args:
        text: The dialogue text passed to ``text_to_speech_with_emotion``.
        emotion: Emotion label forwarded to both the speech and the cue
            generators.
        language: Language code forwarded to the speech generator.

    Returns:
        Path of a temporary ``.mp3`` file containing the mixed audio. The
        file is created with ``delete=False``, so it is not removed here.
    """
    speech = text_to_speech_with_emotion(text, emotion, language)
    sound = generate_subtle_emotion_sound(emotion, 1)  # 1 second of subtle emotion sound

    # Reduce the volume of the sound effect
    sound = sound - 12  # Reduce volume by 12 dB

    # Combine speech and sound
    # NOTE(review): gain_during_overlay applies to the base segment (the
    # speech), not to the overlaid cue - confirm that ducking the speech by
    # 12 dB while the cue plays is intended.
    combined = speech.overlay(sound, position=0, gain_during_overlay=-12)

    # Apply some final processing
    combined = combined.compress_dynamic_range()  # Smooth out volume differences
    combined = combined.normalize()  # Normalize the final audio

    # Reserve a unique path, then close the handle before exporting: the
    # path of a still-open NamedTemporaryFile cannot be reopened on every
    # platform.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
        output_path = fp.name

    # export() returns the open output file object; close it so the handle
    # is not leaked on every request.
    combined.export(output_path, format="mp3", bitrate="192k").close()  # Higher bitrate for better quality
    return output_path
|
69 |
|
70 |
# Gradio interface
|
|
|
76 |
gr.Dropdown(["en", "es", "fr", "de", "it"], label="Language", value="en")
|
77 |
],
|
78 |
outputs=gr.Audio(label="Generated Dialogue with Emotion"),
|
79 |
+
title="Improved Emotional Dialogue Generator",
|
80 |
+
description="Generate clear dialogues with subtle emotional cues."
|
81 |
)
|
82 |
|
83 |
# Start serving the Gradio interface held in `iface`.
iface.launch()
|