Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,13 +1,12 @@
|
|
1 |
import gradio as gr
|
2 |
from gtts import gTTS
|
3 |
-
import numpy as np
|
4 |
-
import soundfile as sf
|
5 |
-
from pydub import AudioSegment
|
6 |
import io
|
7 |
import tempfile
|
8 |
-
import
|
|
|
9 |
|
10 |
def text_to_speech_with_emotion(text, emotion, language='en'):
|
|
|
11 |
tts = gTTS(text=text, lang=language, slow=False)
|
12 |
|
13 |
with io.BytesIO() as fp:
|
@@ -15,69 +14,45 @@ def text_to_speech_with_emotion(text, emotion, language='en'):
|
|
15 |
fp.seek(0)
|
16 |
audio = AudioSegment.from_mp3(fp)
|
17 |
|
18 |
-
# Adjust audio based on emotion
|
19 |
if emotion == "Happy":
|
20 |
-
audio = audio.speedup(playback_speed=1.
|
|
|
21 |
elif emotion == "Sad":
|
22 |
-
audio = audio.speedup(playback_speed=0.
|
|
|
23 |
elif emotion == "Angry":
|
24 |
-
audio = audio
|
|
|
|
|
25 |
|
26 |
-
# Apply
|
27 |
-
audio = audio.
|
|
|
28 |
|
29 |
return audio
|
30 |
|
31 |
-
def
|
32 |
-
|
33 |
-
t = np.linspace(0, duration, int(sr * duration), False)
|
34 |
-
|
35 |
-
if emotion == "Happy":
|
36 |
-
freq = 440 # A4 note
|
37 |
-
audio = np.sin(2 * np.pi * freq * t) * np.exp(-3 * t)
|
38 |
-
elif emotion == "Sad":
|
39 |
-
freq = 294 # D4 note
|
40 |
-
audio = np.sin(2 * np.pi * freq * t) * np.exp(-2 * t)
|
41 |
-
elif emotion == "Angry":
|
42 |
-
freq = 392 # G4 note
|
43 |
-
audio = np.sign(np.sin(2 * np.pi * freq * t)) * np.exp(-4 * t)
|
44 |
-
else: # Neutral
|
45 |
-
freq = 329 # E4 note
|
46 |
-
audio = np.sin(2 * np.pi * freq * t) * np.exp(-3 * t)
|
47 |
-
|
48 |
-
audio = audio / np.max(np.abs(audio)) # Normalize
|
49 |
-
return AudioSegment(audio.astype(np.float32).tobytes(),
|
50 |
-
frame_rate=sr, sample_width=4, channels=1)
|
51 |
-
|
52 |
-
def generate_dialogue_and_sound(text, emotion, language):
|
53 |
-
speech = text_to_speech_with_emotion(text, emotion, language)
|
54 |
-
sound = generate_subtle_emotion_sound(emotion, 1) # 1 second of subtle emotion sound
|
55 |
-
|
56 |
-
# Reduce the volume of the sound effect
|
57 |
-
sound = sound - 12 # Reduce volume by 12 dB
|
58 |
-
|
59 |
-
# Combine speech and sound
|
60 |
-
combined = speech.overlay(sound, position=0, gain_during_overlay=-12)
|
61 |
|
62 |
-
#
|
63 |
-
|
64 |
-
combined = combined.normalize() # Normalize the final audio
|
65 |
|
66 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
|
67 |
-
|
68 |
return fp.name
|
69 |
|
70 |
# Gradio interface
|
71 |
iface = gr.Interface(
|
72 |
-
fn=
|
73 |
inputs=[
|
74 |
-
gr.Textbox(label="Enter
|
75 |
-
gr.Radio(["
|
76 |
gr.Dropdown(["en", "es", "fr", "de", "it"], label="Language", value="en")
|
77 |
],
|
78 |
-
outputs=gr.Audio(label="Generated
|
79 |
-
title="
|
80 |
-
description="Generate
|
81 |
)
|
82 |
|
83 |
iface.launch()
|
|
|
1 |
import gradio as gr
|
2 |
from gtts import gTTS
|
|
|
|
|
|
|
3 |
import io
|
4 |
import tempfile
|
5 |
+
from pydub import AudioSegment
|
6 |
+
import numpy as np
|
7 |
|
8 |
def text_to_speech_with_emotion(text, emotion, language='en'):
    """Synthesize *text* with gTTS and post-process it to convey *emotion*.

    Parameters
    ----------
    text : str
        Text to speak.
    emotion : str
        One of "Happy", "Sad", "Angry"; any other value (e.g. "Neutral")
        leaves the speech unmodified apart from the final enhancements.
    language : str
        gTTS language code (default ``'en'``).

    Returns
    -------
    AudioSegment
        The processed speech audio.
    """

    def _resample(seg, factor):
        # pydub has no pitch_shift() method (the original code called one
        # and crashed with AttributeError).  The standard pydub technique is
        # a "tape speed" change: reinterpret the raw samples at a scaled
        # frame rate, then resample back to the original rate so players
        # see a normal file.  This multiplies BOTH tempo and pitch by
        # *factor* (factor = 2 ** (semitones / 12) for a pitch interval).
        scaled_rate = int(seg.frame_rate * factor)
        shifted = seg._spawn(seg.raw_data, overrides={'frame_rate': scaled_rate})
        return shifted.set_frame_rate(seg.frame_rate)

    # Generate base speech
    tts = gTTS(text=text, lang=language, slow=False)

    with io.BytesIO() as fp:
        # Render the MP3 into the in-memory buffer, then decode it.
        tts.write_to_fp(fp)
        fp.seek(0)
        audio = AudioSegment.from_mp3(fp)

    # Adjust audio based on emotion
    if emotion == "Happy":
        audio = audio.speedup(playback_speed=1.15)
        audio = _resample(audio, 2.0 ** (1.0 / 12.0))   # ~ +1 semitone
    elif emotion == "Sad":
        # pydub's speedup() only supports factors > 1.0, so slow down by
        # resampling instead; the accompanying pitch drop (~ -2.8 semitones
        # at 0.85) suits the sad affect the original pitch_shift(-1) aimed for.
        audio = _resample(audio, 0.85)
    elif emotion == "Angry":
        audio = audio.speedup(playback_speed=1.1)
        audio = audio + 3  # Increase volume slightly (+3 dB)
    # Neutral emotion remains unchanged

    # Apply some subtle enhancements: tame peaks, then remove rumble.
    audio = audio.compress_dynamic_range(threshold=-15, ratio=2.0, attack=5, release=50)
    audio = audio.high_pass_filter(80)  # Remove very low frequencies

    return audio
|
34 |
|
35 |
+
def generate_emotional_speech(text, emotion, language):
    """Produce emotion-adjusted speech for *text* and return the path to an MP3 file.

    Thin wrapper used as the Gradio handler: synthesizes the speech,
    normalizes the level, and persists the result to a temporary file
    so the UI can stream it back.
    """
    processed = text_to_speech_with_emotion(text, emotion, language)

    # Normalize the final audio so output level is consistent across emotions.
    processed = processed.normalize()

    # delete=False keeps the file on disk after the handler returns,
    # which Gradio needs in order to serve it.
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
    with tmp:
        processed.export(tmp.name, format="mp3", bitrate="192k")  # 192k for quality
    return tmp.name
|
44 |
|
45 |
# Gradio interface: build the input components first, then wire them up.
text_input = gr.Textbox(label="Enter text for speech")
emotion_input = gr.Radio(["Neutral", "Happy", "Sad", "Angry"], label="Emotion", value="Neutral")
language_input = gr.Dropdown(["en", "es", "fr", "de", "it"], label="Language", value="en")

iface = gr.Interface(
    fn=generate_emotional_speech,
    inputs=[text_input, emotion_input, language_input],
    outputs=gr.Audio(label="Generated Emotional Speech"),
    title="Clean Emotional Text-to-Speech Generator",
    description="Generate clean speech with emotional variations without background sounds.",
)

iface.launch()
|