Spaces:

capradeepgujaran
/

VoiceOversV3

Sleeping

App Files Community

capradeepgujaran committed on Oct 7, 2024

Commit

46acd9e

verified ·

1 Parent(s): 82519df

Update app.py

Browse files

Files changed (1) hide show

app.py +95 -46

app.py CHANGED Viewed

@@ -1,58 +1,107 @@
 import gradio as gr
-from gtts import gTTS
 import io
 import tempfile
-from pydub import AudioSegment
-import numpy as np
-def text_to_speech_with_emotion(text, emotion, language='en'):
-    # Generate base speech
-    tts = gTTS(text=text, lang=language, slow=False)
-    with io.BytesIO() as fp:
-        tts.write_to_fp(fp)
-        fp.seek(0)
-        audio = AudioSegment.from_mp3(fp)
-    # Adjust audio based on emotion
     if emotion == "Happy":
-        audio = audio.speedup(playback_speed=1.15)
-        audio = audio.pitch_shift(semitones=1)
     elif emotion == "Sad":
-        audio = audio.speedup(playback_speed=0.85)
-        audio = audio.pitch_shift(semitones=-1)
     elif emotion == "Angry":
-        audio = audio.speedup(playback_speed=1.1)
-        audio = audio + 3  # Increase volume slightly
-    # Neutral emotion remains unchanged
-    # Apply some subtle enhancements
-    audio = audio.compress_dynamic_range(threshold=-15, ratio=2.0, attack=5, release=50)
-    audio = audio.high_pass_filter(80)  # Remove very low frequencies
-    return audio
-def generate_emotional_speech(text, emotion, language):
-    audio = text_to_speech_with_emotion(text, emotion, language)
-    # Normalize the final audio
-    audio = audio.normalize()
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
-        audio.export(fp.name, format="mp3", bitrate="192k")  # Higher bitrate for better quality
-        return fp.name
 # Gradio interface
-iface = gr.Interface(
-    fn=generate_emotional_speech,
-    inputs=[
-        gr.Textbox(label="Enter text for speech"),
-        gr.Radio(["Neutral", "Happy", "Sad", "Angry"], label="Emotion", value="Neutral"),
-        gr.Dropdown(["en", "es", "fr", "de", "it"], label="Language", value="en")
-    ],
-    outputs=gr.Audio(label="Generated Emotional Speech"),
-    title="Clean Emotional Text-to-Speech Generator",
-    description="Generate clean speech with emotional variations without background sounds."
-)
 iface.launch()

 import gradio as gr
+from transformers import AutoProcessor, MusicgenForConditionalGeneration
+import torch
+from TTS.api import TTS
+import scipy
+import numpy as np
+from pydub import AudioSegment
 import io
 import tempfile
+# Initialize TTS model
+try:
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    tts = TTS("tts_models/en/ljspeech/tacotron2-DDC").to(device)
+except Exception as e:
+    print(f"Error initializing TTS model: {e}")
+    tts = None
+# Initialize Musicgen model for sound generation
+try:
+    processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
+    model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
+    model.to(device)
+except Exception as e:
+    print(f"Error initializing Musicgen model: {e}")
+    processor = None
+    model = None
+def apply_emotion(audio, emotion):
+    audio_segment = AudioSegment(audio.tobytes(), frame_rate=22050, sample_width=2, channels=1)
     if emotion == "Happy":
+        audio_segment = audio_segment.pitch_shift(1).speedup(playback_speed=1.1)
     elif emotion == "Sad":
+        audio_segment = audio_segment.pitch_shift(-1).speedup(playback_speed=0.9)
     elif emotion == "Angry":
+        audio_segment = audio_segment.pitch_shift(0.5).speedup(playback_speed=1.05)
+    return np.array(audio_segment.get_array_of_samples())
+def generate_speech(text, emotion):
+    try:
+        if tts is not None:
+            speech = tts.tts(text=text)
+            speech_with_emotion = apply_emotion(speech, emotion)
+            # Improve audio quality
+            audio_segment = AudioSegment(speech_with_emotion.tobytes(), frame_rate=22050, sample_width=2, channels=1)
+            audio_segment = audio_segment.compress_dynamic_range()
+            audio_segment = audio_segment.normalize()
+            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as fp:
+                audio_segment.export(fp.name, format="wav")
+                return (gr.Audio(value=fp.name), "Speech generated successfully")
+        else:
+            return (None, "TTS model not available. Check logs for initialization error.")
+    except Exception as e:
+        return (None, f"Error in speech generation: {str(e)}")
+def generate_sound(text):
+    try:
+        if processor is not None and model is not None:
+            inputs = processor(
+                text=[text],
+                padding=True,
+                return_tensors="pt",
+            ).to(device)
+            audio_values = model.generate(**inputs, max_new_tokens=512)  # Increased tokens for longer audio
+            audio_data = audio_values[0, 0].cpu().numpy()
+            # Improve audio quality
+            audio_segment = AudioSegment(
+                audio_data.tobytes(),
+                frame_rate=model.config.audio_encoder.sampling_rate,
+                sample_width=2,
+                channels=1
+            )
+            audio_segment = audio_segment.compress_dynamic_range()
+            audio_segment = audio_segment.normalize()
+            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as fp:
+                audio_segment.export(fp.name, format="wav")
+                return (gr.Audio(value=fp.name), "Sound generated successfully")
+        else:
+            return (None, "Musicgen model not available. Check logs for initialization error.")
+    except Exception as e:
+        return (None, f"Error in sound generation: {str(e)}")
 # Gradio interface
+with gr.Blocks() as iface:
+    gr.Markdown("# Enhanced Text-to-Speech and Text-to-Sound Generation Tool")
+    with gr.Tab("Text-to-Speech"):
+        text_input = gr.Textbox(label="Enter text for speech generation")
+        emotion_input = gr.Dropdown(["Neutral", "Happy", "Sad", "Angry"], label="Select Emotion", value="Neutral")
+        speech_button = gr.Button("Generate Speech")
+        speech_output = gr.Audio(label="Generated Speech")
+        speech_message = gr.Textbox(label="Message")
+    with gr.Tab("Text-to-Sound"):
+        sound_input = gr.Textbox(label="Enter text description for sound generation")
+        sound_button = gr.Button("Generate Sound")
+        sound_output = gr.Audio(label="Generated Sound")
+        sound_message = gr.Textbox(label="Message")
+    speech_button.click(generate_speech, inputs=[text_input, emotion_input], outputs=[speech_output, speech_message])
+    sound_button.click(generate_sound, inputs=[sound_input], outputs=[sound_output, sound_message])
 iface.launch()