Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,13 +1,12 @@
|
|
1 |
import gradio as gr
|
2 |
from gtts import gTTS
|
3 |
-
import numpy as np
|
4 |
-
import soundfile as sf
|
5 |
-
from pydub import AudioSegment
|
6 |
import io
|
7 |
import tempfile
|
8 |
-
import
|
|
|
9 |
|
10 |
def text_to_speech_with_emotion(text, emotion, language='en'):
|
|
|
11 |
tts = gTTS(text=text, lang=language, slow=False)
|
12 |
|
13 |
with io.BytesIO() as fp:
|
@@ -15,69 +14,45 @@ def text_to_speech_with_emotion(text, emotion, language='en'):
|
|
15 |
fp.seek(0)
|
16 |
audio = AudioSegment.from_mp3(fp)
|
17 |
|
18 |
-
# Adjust audio based on emotion
|
19 |
if emotion == "Happy":
|
20 |
-
audio = audio.speedup(playback_speed=1.
|
|
|
21 |
elif emotion == "Sad":
|
22 |
-
audio = audio.speedup(playback_speed=0.
|
|
|
23 |
elif emotion == "Angry":
|
24 |
-
audio = audio
|
|
|
|
|
25 |
|
26 |
-
# Apply
|
27 |
-
audio = audio.
|
|
|
28 |
|
29 |
return audio
|
30 |
|
31 |
-
def
|
32 |
-
|
33 |
-
t = np.linspace(0, duration, int(sr * duration), False)
|
34 |
-
|
35 |
-
if emotion == "Happy":
|
36 |
-
freq = 440 # A4 note
|
37 |
-
audio = np.sin(2 * np.pi * freq * t) * np.exp(-3 * t)
|
38 |
-
elif emotion == "Sad":
|
39 |
-
freq = 294 # D4 note
|
40 |
-
audio = np.sin(2 * np.pi * freq * t) * np.exp(-2 * t)
|
41 |
-
elif emotion == "Angry":
|
42 |
-
freq = 392 # G4 note
|
43 |
-
audio = np.sign(np.sin(2 * np.pi * freq * t)) * np.exp(-4 * t)
|
44 |
-
else: # Neutral
|
45 |
-
freq = 329 # E4 note
|
46 |
-
audio = np.sin(2 * np.pi * freq * t) * np.exp(-3 * t)
|
47 |
-
|
48 |
-
audio = audio / np.max(np.abs(audio)) # Normalize
|
49 |
-
return AudioSegment(audio.astype(np.float32).tobytes(),
|
50 |
-
frame_rate=sr, sample_width=4, channels=1)
|
51 |
-
|
52 |
-
def generate_dialogue_and_sound(text, emotion, language):
|
53 |
-
speech = text_to_speech_with_emotion(text, emotion, language)
|
54 |
-
sound = generate_subtle_emotion_sound(emotion, 1) # 1 second of subtle emotion sound
|
55 |
-
|
56 |
-
# Reduce the volume of the sound effect
|
57 |
-
sound = sound - 12 # Reduce volume by 12 dB
|
58 |
-
|
59 |
-
# Combine speech and sound
|
60 |
-
combined = speech.overlay(sound, position=0, gain_during_overlay=-12)
|
61 |
|
62 |
-
#
|
63 |
-
|
64 |
-
combined = combined.normalize() # Normalize the final audio
|
65 |
|
66 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
|
67 |
-
|
68 |
return fp.name
|
69 |
|
70 |
# Gradio interface
|
71 |
iface = gr.Interface(
|
72 |
-
fn=
|
73 |
inputs=[
|
74 |
-
gr.Textbox(label="Enter
|
75 |
-
gr.Radio(["
|
76 |
gr.Dropdown(["en", "es", "fr", "de", "it"], label="Language", value="en")
|
77 |
],
|
78 |
-
outputs=gr.Audio(label="Generated
|
79 |
-
title="
|
80 |
-
description="Generate
|
81 |
)
|
82 |
|
83 |
iface.launch()
|
|
|
1 |
import gradio as gr
|
2 |
from gtts import gTTS
|
|
|
|
|
|
|
3 |
import io
|
4 |
import tempfile
|
5 |
+
from pydub import AudioSegment
|
6 |
+
import numpy as np
|
7 |
|
8 |
def text_to_speech_with_emotion(text, emotion, language='en'):
    """Synthesize *text* with gTTS and post-process it to convey *emotion*.

    Parameters
    ----------
    text : str
        Text to speak.
    emotion : str
        One of "Happy", "Sad", "Angry"; any other value (e.g. "Neutral")
        leaves the speech unmodified apart from the final enhancements.
    language : str
        gTTS language code (default ``'en'``).

    Returns
    -------
    AudioSegment
        The processed speech audio.
    """

    def _resample(seg, factor):
        # pydub has no pitch_shift() method (the original code called one
        # and crashed with AttributeError).  The standard pydub technique is
        # a "tape speed" change: reinterpret the raw samples at a scaled
        # frame rate, then resample back to the original rate so players
        # see a normal file.  This multiplies BOTH tempo and pitch by
        # *factor* (factor = 2 ** (semitones / 12) for a pitch interval).
        scaled_rate = int(seg.frame_rate * factor)
        shifted = seg._spawn(seg.raw_data, overrides={'frame_rate': scaled_rate})
        return shifted.set_frame_rate(seg.frame_rate)

    # Generate base speech
    tts = gTTS(text=text, lang=language, slow=False)

    with io.BytesIO() as fp:
        # Render the MP3 into the in-memory buffer, then decode it.
        tts.write_to_fp(fp)
        fp.seek(0)
        audio = AudioSegment.from_mp3(fp)

    # Adjust audio based on emotion
    if emotion == "Happy":
        audio = audio.speedup(playback_speed=1.15)
        audio = _resample(audio, 2.0 ** (1.0 / 12.0))   # ~ +1 semitone
    elif emotion == "Sad":
        # pydub's speedup() only supports factors > 1.0, so slow down by
        # resampling instead; the accompanying pitch drop (~ -2.8 semitones
        # at 0.85) suits the sad affect the original pitch_shift(-1) aimed for.
        audio = _resample(audio, 0.85)
    elif emotion == "Angry":
        audio = audio.speedup(playback_speed=1.1)
        audio = audio + 3  # Increase volume slightly (+3 dB)
    # Neutral emotion remains unchanged

    # Apply some subtle enhancements: tame peaks, then remove rumble.
    audio = audio.compress_dynamic_range(threshold=-15, ratio=2.0, attack=5, release=50)
    audio = audio.high_pass_filter(80)  # Remove very low frequencies

    return audio
|
34 |
|
35 |
+
def generate_emotional_speech(text, emotion, language):
    """Produce emotion-adjusted speech for *text* and return the path to an MP3 file.

    Thin wrapper used as the Gradio handler: synthesizes the speech,
    normalizes the level, and persists the result to a temporary file
    so the UI can stream it back.
    """
    processed = text_to_speech_with_emotion(text, emotion, language)

    # Normalize the final audio so output level is consistent across emotions.
    processed = processed.normalize()

    # delete=False keeps the file on disk after the handler returns,
    # which Gradio needs in order to serve it.
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
    with tmp:
        processed.export(tmp.name, format="mp3", bitrate="192k")  # 192k for quality
    return tmp.name
|
44 |
|
45 |
# Gradio interface: build the input components first, then wire them up.
text_input = gr.Textbox(label="Enter text for speech")
emotion_input = gr.Radio(["Neutral", "Happy", "Sad", "Angry"], label="Emotion", value="Neutral")
language_input = gr.Dropdown(["en", "es", "fr", "de", "it"], label="Language", value="en")

iface = gr.Interface(
    fn=generate_emotional_speech,
    inputs=[text_input, emotion_input, language_input],
    outputs=gr.Audio(label="Generated Emotional Speech"),
    title="Clean Emotional Text-to-Speech Generator",
    description="Generate clean speech with emotional variations without background sounds.",
)

iface.launch()
|