Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -5,6 +5,17 @@ import os
|
|
5 |
from gtts import gTTS
|
6 |
from pydub import AudioSegment
|
7 |
from pydub.generators import WhiteNoise, Sine
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
|
9 |
def text_to_speech_with_emotion(text, lang, emotion):
|
10 |
try:
|
@@ -13,51 +24,63 @@ def text_to_speech_with_emotion(text, lang, emotion):
|
|
13 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
|
14 |
tts.save(fp.name)
|
15 |
|
16 |
-
# Load the audio file and apply simple emotion effects
|
17 |
audio = AudioSegment.from_mp3(fp.name)
|
|
|
18 |
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
audio = audio.speedup(playback_speed=0.9)
|
23 |
-
elif emotion == "Angry":
|
24 |
-
audio = audio + 5 # Increase volume
|
25 |
-
audio = audio.compress_dynamic_range(threshold=-15.0, ratio=3.0, attack=5.0, release=50.0)
|
26 |
|
27 |
-
|
28 |
-
|
|
|
29 |
except Exception as e:
|
30 |
return None, f"Error in speech generation: {str(e)}"
|
31 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
def generate_sound_effect(description, duration):
|
33 |
try:
|
34 |
-
sample_rate = 44100
|
35 |
-
channels = 2
|
36 |
duration_ms = int(duration * 1000)
|
37 |
|
38 |
if "rain" in description.lower():
|
39 |
-
sound = WhiteNoise().to_audio_segment(duration=duration_ms)
|
40 |
-
sound = sound.apply_gain(-10) # Make it softer
|
41 |
elif "car horn" in description.lower():
|
42 |
-
sound =
|
43 |
-
sound = sound.append(AudioSegment.silent(duration=
|
44 |
-
sound = sound * 3
|
45 |
elif "wind" in description.lower():
|
46 |
-
sound = WhiteNoise().to_audio_segment(duration=duration_ms)
|
47 |
-
sound = sound.apply_gain(-15) # Make it softer
|
48 |
-
sound = sound.low_pass_filter(1000) # Remove high frequencies
|
49 |
elif "bird" in description.lower():
|
50 |
-
|
51 |
-
sound =
|
52 |
-
sound = sound
|
53 |
-
|
|
|
|
|
|
|
|
|
|
|
54 |
else:
|
55 |
-
#
|
56 |
-
|
|
|
57 |
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
except Exception as e:
|
62 |
return None, f"Error in sound effect generation: {str(e)}"
|
63 |
|
@@ -107,7 +130,7 @@ def evaluate_emotion(ref_audio, gen_audio, uttwise_score=False):
|
|
107 |
|
108 |
# Gradio interface
|
109 |
with gr.Blocks() as iface:
|
110 |
-
gr.Markdown("#
|
111 |
|
112 |
with gr.Tab("Text-to-Speech"):
|
113 |
text_input = gr.Textbox(label="Enter text for speech generation")
|
@@ -118,7 +141,7 @@ with gr.Blocks() as iface:
|
|
118 |
speech_message = gr.Textbox(label="Message")
|
119 |
|
120 |
with gr.Tab("Sound Effect Generation"):
|
121 |
-
sfx_input = gr.Textbox(label="Enter description for sound effect (e.g., 'rain', 'car horn', 'wind', 'bird')")
|
122 |
sfx_duration = gr.Slider(minimum=1, maximum=10, value=3, label="Duration (seconds)")
|
123 |
sfx_button = gr.Button("Generate Sound Effect")
|
124 |
sfx_output = gr.Audio(label="Generated Sound Effect")
|
|
|
5 |
from gtts import gTTS
|
6 |
from pydub import AudioSegment
|
7 |
from pydub.generators import WhiteNoise, Sine
|
8 |
+
import random
|
9 |
+
|
10 |
+
def _change_playback_rate(audio, ratio):
    """Return ``audio`` replayed ``ratio`` times faster by resampling.

    The raw samples are reinterpreted at a scaled frame rate and then
    converted back to the input's frame rate, so tempo and pitch move
    together (as when a tape is played faster or slower).

    Args:
        audio: A pydub ``AudioSegment``.
        ratio: Playback-rate multiplier; values above 1 speed up and raise
            the pitch, values below 1 slow down and lower the pitch.

    Returns:
        A new segment with the same frame rate as ``audio``.
    """
    retimed = audio._spawn(
        audio.raw_data,
        overrides={"frame_rate": round(audio.frame_rate * ratio)},
    )
    # Convert back so downstream effects and export see the original rate.
    return retimed.set_frame_rate(audio.frame_rate)


def apply_emotion(audio, emotion):
    """Apply a simple emotion-flavoured effect to a speech segment.

    Args:
        audio: A pydub ``AudioSegment`` holding the synthesized speech.
        emotion: ``"Happy"``, ``"Sad"`` or ``"Angry"``; any other value is
            treated as neutral.

    Returns:
        The processed segment, or ``audio`` itself for a neutral emotion.
    """
    # NOTE: pydub's AudioSegment has no ``pitch_shift`` method, so the
    # previous "Happy"/"Sad" branches raised AttributeError. Resampling
    # shifts tempo and pitch in the same direction in one step.
    if emotion == "Happy":
        # 10% faster and correspondingly higher pitched.
        return _change_playback_rate(audio, 1.1)
    if emotion == "Sad":
        # 10% slower and correspondingly lower pitched. ``speedup`` is not
        # used here: it shortens audio by dropping chunks and is not meant
        # for factors below 1.0.
        return _change_playback_rate(audio, 0.9)
    if emotion == "Angry":
        # Slightly faster, dynamics squashed, then made louder.
        faster = audio.speedup(playback_speed=1.05)
        compressed = faster.compress_dynamic_range(threshold=-15.0, ratio=4.0)
        return compressed.apply_gain(5)
    # Neutral (or unrecognised): leave the audio untouched.
    return audio
|
19 |
|
20 |
def text_to_speech_with_emotion(text, lang, emotion):
|
21 |
try:
|
|
|
24 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
|
25 |
tts.save(fp.name)
|
26 |
|
|
|
27 |
audio = AudioSegment.from_mp3(fp.name)
|
28 |
+
audio = apply_emotion(audio, emotion)
|
29 |
|
30 |
+
# Improve audio quality
|
31 |
+
audio = audio.compress_dynamic_range(threshold=-20.0, ratio=2.0)
|
32 |
+
audio = audio.normalize()
|
|
|
|
|
|
|
|
|
33 |
|
34 |
+
output_path = tempfile.mktemp(suffix=".mp3")
|
35 |
+
audio.export(output_path, format="mp3", bitrate="192k")
|
36 |
+
return output_path, f"Speech generated successfully with {emotion} emotion"
|
37 |
except Exception as e:
|
38 |
return None, f"Error in speech generation: {str(e)}"
|
39 |
|
40 |
+
def generate_complex_sound(base_freq, duration_ms, harmonics=3, noise_level=0.1):
    """Synthesize a mono 16-bit tone made of harmonics plus Gaussian noise.

    Args:
        base_freq: Fundamental frequency in Hz.
        duration_ms: Length of the sound in milliseconds.
        harmonics: Highest harmonic number to include; harmonic ``i`` is
            added with amplitude ``1 / i``.
        noise_level: Standard deviation of the Gaussian noise mixed in
            before normalisation.

    Returns:
        An ``AudioSegment`` at 44100 Hz, 2 bytes per sample, one channel.
        A zero (or negative) duration yields an empty segment.
    """
    sample_rate = 44100
    # Clamp so a non-positive duration gives an empty signal, not an error.
    sample_count = max(0, int(sample_rate * duration_ms / 1000))
    t = np.linspace(0, duration_ms / 1000, sample_count, False)
    audio = np.sin(2 * np.pi * base_freq * t)

    for i in range(2, harmonics + 1):
        audio += (1 / i) * np.sin(2 * np.pi * (base_freq * i) * t)

    audio += np.random.normal(0, noise_level, audio.shape)

    # Scale to the full int16 range. Skip scaling for an empty or all-zero
    # signal: np.max fails on zero-size input and dividing by a zero peak
    # would produce NaNs.
    peak = np.max(np.abs(audio)) if audio.size else 0.0
    if peak > 0:
        audio = audio / peak * 32767
    samples = audio.astype(np.int16)
    return AudioSegment(
        samples.tobytes(),
        frame_rate=sample_rate,
        sample_width=2,
        channels=1,
    )
|
53 |
+
|
54 |
def generate_sound_effect(description, duration):
    """Generate a synthetic sound effect and write it to a temporary WAV file.

    The effect is chosen by keyword match on ``description`` (checked in
    this order: rain, car horn, wind, bird, ocean, thunder); anything else
    produces a random harmonic tone.

    Args:
        description: Free-text description of the desired effect.
        duration: Requested length in seconds.

    Returns:
        ``(path, message)`` on success, where ``path`` is the WAV file, or
        ``(None, error_message)`` if anything fails.
    """
    try:
        duration_ms = int(duration * 1000)
        keywords = description.lower()

        if "rain" in keywords:
            sound = WhiteNoise().to_audio_segment(duration=duration_ms).apply_gain(-10)
        elif "car horn" in keywords:
            # NOTE(review): three fixed 200 ms honks with 100 ms gaps; this
            # branch ignores the requested duration - confirm that is intended.
            sound = generate_complex_sound(440, 200, harmonics=5, noise_level=0.05)
            sound = sound.append(AudioSegment.silent(duration=100))
            sound = sound * 3
        elif "wind" in keywords:
            sound = WhiteNoise().to_audio_segment(duration=duration_ms).apply_gain(-15).low_pass_filter(1000)
        elif "bird" in keywords:
            # One 100 ms chirp at a random pitch followed by 200 ms of
            # silence, repeated twice per requested second.
            chirp = generate_complex_sound(random.uniform(2000, 4000), 100, harmonics=2, noise_level=0.02)
            sound = chirp + AudioSegment.silent(duration=200)
            sound = sound * int(duration * 2)
        elif "ocean" in keywords:
            sound = WhiteNoise().to_audio_segment(duration=duration_ms).apply_gain(-20).low_pass_filter(500)
        elif "thunder" in keywords:
            # A loud 500 ms noise burst with fades, padded with silence.
            sound = WhiteNoise().to_audio_segment(duration=500).apply_gain(10)
            sound = sound.fade_in(100).fade_out(300)
            sound = sound + AudioSegment.silent(duration=max(0, duration_ms - 500))
        else:
            # Generate a more complex default sound
            base_freq = random.uniform(200, 800)
            sound = generate_complex_sound(base_freq, duration_ms, harmonics=4, noise_level=0.1)

        # tempfile.mktemp is deprecated and race-prone; create the file
        # atomically instead and keep it (delete=False) for the caller.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as out_file:
            output_path = out_file.name
        # Export after the handle is closed so the path can be reopened.
        sound.export(output_path, format="wav")
        return output_path, f"Sound effect generated for '{description}'"
    except Exception as e:
        # UI boundary: report the failure as a message instead of raising.
        return None, f"Error in sound effect generation: {str(e)}"
|
86 |
|
|
|
130 |
|
131 |
# Gradio interface
|
132 |
with gr.Blocks() as iface:
|
133 |
+
gr.Markdown("# Enhanced TTS and Sound Generation Tool")
|
134 |
|
135 |
with gr.Tab("Text-to-Speech"):
|
136 |
text_input = gr.Textbox(label="Enter text for speech generation")
|
|
|
141 |
speech_message = gr.Textbox(label="Message")
|
142 |
|
143 |
with gr.Tab("Sound Effect Generation"):
|
144 |
+
sfx_input = gr.Textbox(label="Enter description for sound effect (e.g., 'rain', 'car horn', 'wind', 'bird', 'ocean', 'thunder')")
|
145 |
sfx_duration = gr.Slider(minimum=1, maximum=10, value=3, label="Duration (seconds)")
|
146 |
sfx_button = gr.Button("Generate Sound Effect")
|
147 |
sfx_output = gr.Audio(label="Generated Sound Effect")
|