Spaces:

capradeepgujaran
/

VoiceOversV3

Running

App Files Files Community

capradeepgujaran committed on Oct 7, 2024

Commit

b7effce

verified ·

1 Parent(s): bb3964e

Update app.py

Browse files

Files changed (1) hide show

app.py +55 -34

app.py CHANGED Viewed

@@ -1,52 +1,73 @@
 import gradio as gr
-from transformers import pipeline
 import torch
 from TTS.api import TTS
-# Initialize Whisper for speech recognition
-asr = pipeline("automatic-speech-recognition", model="openai/whisper-base")
 # Initialize TTS model
-# Note: We're using a try-except block to handle potential issues with GPU availability
 try:
     device = "cuda" if torch.cuda.is_available() else "cpu"
-    tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
 except Exception as e:
     print(f"Error initializing TTS model: {e}")
     tts = None
-def generate_voiceover(audio_file, emotion):
     try:
-        # Transcribe audio using Whisper
-        result = asr(audio_file)
-        transcription = result["text"]
-        # Generate voice over with selected emotion
         if tts is not None:
-            tts_audio = tts.tts(text=transcription, speaker_wav="path/to/speaker/reference.wav", language="en")
-            return (gr.Audio(value=tts_audio, type="numpy"), transcription)
         else:
             return (None, "TTS model not available. Check logs for initialization error.")
     except Exception as e:
-        return (None, f"Error: {str(e)}")
 # Gradio interface
-iface = gr.Interface(
-    fn=generate_voiceover,
-    inputs=[
-        gr.Audio(type="filepath", label="Upload Audio"),
-        gr.Dropdown(["Happy", "Sad", "Angry", "Neutral"], label="Select Emotion")
-    ],
-    outputs=[
-        gr.Audio(label="Generated Voiceover"),
-        gr.Textbox(label="Transcription/Error Message")
-    ],
-    title="Voice Over Generator with Emotion Control",
-    description="Upload an audio file, select an emotion, and generate a voice over."
-)
-# This line is crucial for Hugging Face Spaces
-iface.launch()
-if __name__ == "__main__":
-    iface.launch()

 import gradio as gr
+from transformers import pipeline, AutoProcessor, MusicgenForConditionalGeneration
 import torch
 from TTS.api import TTS
+import scipy
 # Initialize TTS model
 try:
+    # Prefer the GPU when torch reports one; otherwise run on CPU.
     device = "cuda" if torch.cuda.is_available() else "cpu"
+    tts = TTS("tts_models/en/ljspeech/tacotron2-DDC").to(device)
 except Exception as e:
+    # Leave tts as None so the module still loads; generate_speech checks
+    # for None and returns a message instead of raising.
     print(f"Error initializing TTS model: {e}")
     tts = None
+# Initialize Musicgen model for sound generation
+# NOTE(review): unlike the TTS model above, this model is not moved to
+# `device` here, so it stays wherever from_pretrained places it.
+try:
+    processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
+    model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
+except Exception as e:
+    # Same fallback as for TTS: generate_sound checks both names for None.
+    print(f"Error initializing Musicgen model: {e}")
+    processor = None
+    model = None
+def generate_speech(text, emotion):
+    """Synthesize speech for ``text`` with the module-level ``tts`` model.
+
+    Args:
+        text: Text to speak. Empty or whitespace-only input is rejected with
+            a message instead of being passed to the model.
+        emotion: Value of the emotion dropdown. Currently unused: the model
+            call below takes no emotion argument.
+
+    Returns:
+        An ``(audio, message)`` pair for the two Gradio outputs. ``audio`` is
+        ``(sample_rate, samples)`` on success and ``None`` on any failure;
+        ``message`` describes the outcome.
+    """
+    # Imported locally because the file's top-level imports do not include
+    # numpy.
+    import numpy as np
+
+    if not text or not text.strip():
+        return (None, "Please enter some text for speech generation.")
     try:
         if tts is not None:
+            # Note: emotion parameter is not used in this basic example
+            # You may need a different TTS model or post-processing to incorporate emotion
+            speech = tts.tts(text=text)
+            # Read the output rate from the loaded model rather than assuming
+            # 22050 Hz; fall back to 22050 if the attribute is missing.
+            synthesizer = getattr(tts, "synthesizer", None)
+            sample_rate = getattr(synthesizer, "output_sample_rate", None) or 22050
+            # The synthesizer may hand back a plain Python list; gr.Audio
+            # needs a numpy array for numeric output.
+            samples = np.asarray(speech, dtype=np.float32)
+            # Return the raw value; Gradio routes it to the output component.
+            return ((sample_rate, samples), "Speech generated successfully")
         else:
             return (None, "TTS model not available. Check logs for initialization error.")
     except Exception as e:
+        return (None, f"Error in speech generation: {str(e)}")
+def generate_sound(text):
+    """Generate audio from a text description with the Musicgen model.
+
+    Args:
+        text: Description of the sound to generate. Empty or whitespace-only
+            input is rejected with a message.
+
+    Returns:
+        An ``(audio, message)`` pair for the two Gradio outputs. ``audio`` is
+        the path of a newly written WAV file on success and ``None`` on any
+        failure; ``message`` describes the outcome.
+    """
+    import tempfile
+
+    # Import the submodule explicitly: a bare ``import scipy`` does not
+    # reliably make ``scipy.io.wavfile`` available as an attribute.
+    from scipy.io import wavfile
+
+    if not text or not text.strip():
+        return (None, "Please enter a text description for sound generation.")
+    try:
+        if processor is not None and model is not None:
+            inputs = processor(
+                text=[text],
+                padding=True,
+                return_tensors="pt",
+            )
+            audio_values = model.generate(**inputs, max_new_tokens=256)
+            sampling_rate = model.config.audio_encoder.sampling_rate
+            # detach/cpu make the conversion safe wherever the tensor lives.
+            waveform = audio_values[0, 0].detach().cpu().numpy()
+            # One file per request: a shared "output.wav" would be
+            # overwritten when two users generate at the same time.
+            # The file is not deleted here because Gradio reads it after
+            # this function returns.
+            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as wav_file:
+                wav_path = wav_file.name
+            wavfile.write(wav_path, rate=sampling_rate, data=waveform)
+            return (wav_path, "Sound generated successfully")
+        else:
+            return (None, "Musicgen model not available. Check logs for initialization error.")
+    except Exception as e:
+        return (None, f"Error in sound generation: {str(e)}")
 # Gradio interface
+with gr.Blocks() as iface:
+    # Heading rendered above both tabs.
+    gr.Markdown("# Text-to-Speech and Text-to-Sound Generation Tool")
+
+    # Tab 1: text plus an emotion choice in, synthesized speech out.
+    with gr.Tab("Text-to-Speech"):
+        text_input = gr.Textbox(label="Enter text for speech generation")
+        emotion_input = gr.Dropdown(
+            ["Happy", "Sad", "Angry", "Neutral"],
+            label="Select Emotion",
+        )
+        speech_button = gr.Button("Generate Speech")
+        speech_output = gr.Audio(label="Generated Speech")
+        speech_message = gr.Textbox(label="Message")
+
+    # Tab 2: free-text description in, generated sound out.
+    with gr.Tab("Text-to-Sound"):
+        sound_input = gr.Textbox(label="Enter text description for sound generation")
+        sound_button = gr.Button("Generate Sound")
+        sound_output = gr.Audio(label="Generated Sound")
+        sound_message = gr.Textbox(label="Message")
+
+    # Each handler returns (audio, message), matching its two outputs.
+    speech_button.click(
+        generate_speech,
+        inputs=[text_input, emotion_input],
+        outputs=[speech_output, speech_message],
+    )
+    sound_button.click(
+        generate_sound,
+        inputs=[sound_input],
+        outputs=[sound_output, sound_message],
+    )
+
+# Start the Gradio server when this file is executed.
+iface.launch()