Spaces:

capradeepgujaran
/

VoiceOversV3

Sleeping

App Files Files Community

capradeepgujaran committed on Oct 7, 2024

Commit

427214b

verified ·

1 Parent(s): ab2d014

Update app.py

Browse files

Files changed (1) hide show

app.py +64 -77

app.py CHANGED Viewed

@@ -1,107 +1,94 @@
 import gradio as gr
-from transformers import AutoProcessor, MusicgenForConditionalGeneration
 import torch
-from TTS.api import TTS
-import scipy
 import numpy as np
-from pydub import AudioSegment
-import io
 import tempfile
-# Initialize TTS model
 try:
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    tts = TTS("tts_models/en/ljspeech/tacotron2-DDC").to(device)
 except Exception as e:
-    print(f"Error initializing TTS model: {e}")
-    tts = None
-# Initialize Musicgen model for sound generation
-try:
-    processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
-    model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
-    model.to(device)
-except Exception as e:
-    print(f"Error initializing Musicgen model: {e}")
-    processor = None
-    model = None
-def apply_emotion(audio, emotion):
-    audio_segment = AudioSegment(audio.tobytes(), frame_rate=22050, sample_width=2, channels=1)
     if emotion == "Happy":
-        audio_segment = audio_segment.pitch_shift(1).speedup(playback_speed=1.1)
     elif emotion == "Sad":
-        audio_segment = audio_segment.pitch_shift(-1).speedup(playback_speed=0.9)
     elif emotion == "Angry":
-        audio_segment = audio_segment.pitch_shift(0.5).speedup(playback_speed=1.05)
-    return np.array(audio_segment.get_array_of_samples())
-def generate_speech(text, emotion):
-    try:
-        if tts is not None:
-            speech = tts.tts(text=text)
-            speech_with_emotion = apply_emotion(speech, emotion)
-            # Improve audio quality
-            audio_segment = AudioSegment(speech_with_emotion.tobytes(), frame_rate=22050, sample_width=2, channels=1)
-            audio_segment = audio_segment.compress_dynamic_range()
-            audio_segment = audio_segment.normalize()
-            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as fp:
-                audio_segment.export(fp.name, format="wav")
-                return (gr.Audio(value=fp.name), "Speech generated successfully")
-        else:
-            return (None, "TTS model not available. Check logs for initialization error.")
-    except Exception as e:
-        return (None, f"Error in speech generation: {str(e)}")
-def generate_sound(text):
-    try:
-        if processor is not None and model is not None:
-            inputs = processor(
-                text=[text],
-                padding=True,
-                return_tensors="pt",
-            ).to(device)
-            audio_values = model.generate(**inputs, max_new_tokens=512)  # Increased tokens for longer audio
-            audio_data = audio_values[0, 0].cpu().numpy()
-            # Improve audio quality
-            audio_segment = AudioSegment(
-                audio_data.tobytes(),
-                frame_rate=model.config.audio_encoder.sampling_rate,
-                sample_width=2,
-                channels=1
-            )
-            audio_segment = audio_segment.compress_dynamic_range()
-            audio_segment = audio_segment.normalize()
-            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as fp:
-                audio_segment.export(fp.name, format="wav")
-                return (gr.Audio(value=fp.name), "Sound generated successfully")
-        else:
-            return (None, "Musicgen model not available. Check logs for initialization error.")
-    except Exception as e:
-        return (None, f"Error in sound generation: {str(e)}")
 # Gradio interface
 with gr.Blocks() as iface:
-    gr.Markdown("# Enhanced Text-to-Speech and Text-to-Sound Generation Tool")
     with gr.Tab("Text-to-Speech"):
         text_input = gr.Textbox(label="Enter text for speech generation")
         emotion_input = gr.Dropdown(["Neutral", "Happy", "Sad", "Angry"], label="Select Emotion", value="Neutral")
         speech_button = gr.Button("Generate Speech")
         speech_output = gr.Audio(label="Generated Speech")
         speech_message = gr.Textbox(label="Message")
-    with gr.Tab("Text-to-Sound"):
-        sound_input = gr.Textbox(label="Enter text description for sound generation")
         sound_button = gr.Button("Generate Sound")
         sound_output = gr.Audio(label="Generated Sound")
         sound_message = gr.Textbox(label="Message")
-    speech_button.click(generate_speech, inputs=[text_input, emotion_input], outputs=[speech_output, speech_message])
-    sound_button.click(generate_sound, inputs=[sound_input], outputs=[sound_output, sound_message])
 iface.launch()

 import gradio as gr
+import pyttsx3
 import torch
+import torchaudio
+from torch import nn
 import numpy as np
 import tempfile
+import os
+# Initialize TTS engine
 try:
+    engine = pyttsx3.init()
 except Exception as e:
+    print(f"Error initializing TTS engine: {e}")
+    engine = None
+class SimpleWaveformGenerator(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.frequency = nn.Parameter(torch.tensor(440.0))
+    def forward(self, t):
+        return torch.sin(2 * np.pi * self.frequency * t)
+def text_to_speech_with_emotion(text, emotion, lang='en'):
+    if engine is None:
+        return None, "TTS engine not initialized correctly."
+    # Set voice properties based on emotion
     if emotion == "Happy":
+        engine.setProperty('rate', 175)
+        engine.setProperty('pitch', 75)
     elif emotion == "Sad":
+        engine.setProperty('rate', 125)
+        engine.setProperty('pitch', 25)
     elif emotion == "Angry":
+        engine.setProperty('rate', 150)
+        engine.setProperty('pitch', 100)
+    else:  # Neutral
+        engine.setProperty('rate', 150)
+        engine.setProperty('pitch', 50)
+    # Generate speech
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as fp:
+        engine.save_to_file(text, fp.name)
+        engine.runAndWait()
+        return fp.name, "Speech generated successfully"
+def generate_sound(description):
+    duration = 3  # seconds
+    sample_rate = 44100
+    t = torch.linspace(0, duration, int(sample_rate * duration))
+    generator = SimpleWaveformGenerator()
+    if "high" in description.lower():
+        generator.frequency.data = torch.tensor(880.0)
+    elif "low" in description.lower():
+        generator.frequency.data = torch.tensor(220.0)
+    with torch.no_grad():
+        audio = generator(t)
+    audio = audio / audio.abs().max()
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as fp:
+        torchaudio.save(fp.name, audio.unsqueeze(0), sample_rate)
+        return fp.name, "Sound generated successfully"
 # Gradio interface
 with gr.Blocks() as iface:
+    gr.Markdown("# Reliable Text-to-Speech and Sound Generation Tool")
     with gr.Tab("Text-to-Speech"):
         text_input = gr.Textbox(label="Enter text for speech generation")
         emotion_input = gr.Dropdown(["Neutral", "Happy", "Sad", "Angry"], label="Select Emotion", value="Neutral")
+        lang_input = gr.Dropdown(["en"], label="Select Language", value="en")
         speech_button = gr.Button("Generate Speech")
         speech_output = gr.Audio(label="Generated Speech")
         speech_message = gr.Textbox(label="Message")
+    with gr.Tab("Sound Generation"):
+        sound_input = gr.Textbox(label="Enter sound description (e.g., 'high', 'low', or leave blank for middle)")
         sound_button = gr.Button("Generate Sound")
         sound_output = gr.Audio(label="Generated Sound")
         sound_message = gr.Textbox(label="Message")
+    speech_button.click(text_to_speech_with_emotion,
+                        inputs=[text_input, emotion_input, lang_input],
+                        outputs=[speech_output, speech_message])
+    sound_button.click(generate_sound,
+                       inputs=[sound_input],
+                       outputs=[sound_output, sound_message])
 iface.launch()