capradeepgujaran committed (verified)
Commit df63b30 · Parent: d0ff432

Update app.py

Files changed (1):
  1. app.py +24 -95
app.py CHANGED
@@ -2,24 +2,12 @@ import gradio as gr
 import numpy as np
 import tempfile
 import os
-import torch
-import soundfile as sf
-from diffusers import StableAudioPipeline
 from openai import OpenAI
-import base64
 
 # Initialize OpenAI client
 client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
+import soundfile as sf
 
-# We'll initialize StableAudio only when it's needed to save memory
-pipe = None
-
-def initialize_stable_audio():
-    global pipe
-    if pipe is None:
-        pipe = StableAudioPipeline.from_pretrained("stabilityai/stable-audio-open-1.0", torch_dtype=torch.float16)
-        pipe = pipe.to("cuda" if torch.cuda.is_available() else "cpu")
-
 def text_to_speech_with_emotion(text, voice, model):
     try:
         response = client.audio.speech.create(
@@ -37,79 +25,33 @@ def text_to_speech_with_emotion(text, voice, model):
     except Exception as e:
         return None, f"Error in speech generation: {str(e)}"
 
-def generate_sound(prompt, negative_prompt, seed, inference_steps, duration, waveforms):
+def generate_simple_sound(description, duration, frequency):
     try:
-        initialize_stable_audio()
+        sample_rate = 44100
+        t = np.linspace(0, duration, int(sample_rate * duration), False)
 
-        # Set the seed for reproducibility
-        generator = torch.Generator("cuda" if torch.cuda.is_available() else "cpu").manual_seed(seed)
-
-        # Run the audio generation
-        audio = pipe(
-            prompt,
-            negative_prompt=negative_prompt,
-            num_inference_steps=inference_steps,
-            audio_length_in_s=duration,
-            num_waveforms_per_prompt=waveforms,
-            generator=generator,
-        ).audios
+        if "rain" in description.lower():
+            audio = np.random.normal(0, 0.1, len(t))
+        elif "wind" in description.lower():
+            audio = np.sin(2 * np.pi * frequency * t) * np.random.normal(1, 0.1, len(t))
+        elif "bird" in description.lower():
+            audio = np.sin(2 * np.pi * frequency * t) * np.exp(-0.5 * t)
+            audio = np.tile(audio, int(duration / 0.5))[:len(t)]
+        else:
+            audio = np.sin(2 * np.pi * frequency * t)
 
-        # Get the output and save to a file
-        output = audio[0].T.float().cpu().numpy()
+        audio = audio / np.max(np.abs(audio))
+
         output_path = tempfile.mktemp(suffix=".wav")
-        sf.write(output_path, output, pipe.vae.sampling_rate)
+        sf.write(output_path, audio, sample_rate)
 
-        return output_path, f"Sound generated for prompt: '{prompt}'"
+        return output_path, f"Simple sound generated for '{description}'"
     except Exception as e:
         return None, f"Error in sound generation: {str(e)}"
 
-# Placeholder functions for emotion evaluation
-def emo2vec_sim(ref_paths, gen_paths):
-    return [(ref, gen, np.random.random(), np.random.random()) for ref, gen in zip(ref_paths, gen_paths)]
-
-def arousal_valence_sim(ref_paths, gen_paths):
-    return [(ref, gen, np.random.random(), np.random.random()) for ref, gen in zip(ref_paths, gen_paths)]
-
-def evaluate_emotion(ref_audio, gen_audio, uttwise_score=False):
-    try:
-        ref_paths = [ref_audio]
-        gen_paths = [gen_audio]
-        arousal_valance_results = arousal_valence_sim(ref_paths, gen_paths)
-        emo2vec_results = emo2vec_sim(ref_paths, gen_paths)
-
-        scores = [0] * 4
-
-        for arousal_valance_result, emo2vec_result in zip(arousal_valance_results, emo2vec_results):
-            emo2vec_sim_utt = emo2vec_result[2]
-            emo2vec_sim_frame = emo2vec_result[3]
-            arousal_valance_sim_utt = arousal_valance_result[2]
-            arousal_valance_sim_frame = arousal_valance_result[3]
-
-            scores[0] += emo2vec_sim_utt
-            scores[1] += emo2vec_sim_frame
-            scores[2] += arousal_valance_sim_utt
-            scores[3] += arousal_valance_sim_frame
-
-        scores = [score / len(ref_paths) for score in scores]
-
-        if uttwise_score:
-            result = {
-                "emo2vec_sim_utt": f"{float(scores[0]):.3f}",
-                "arousal_valance_sim_utt": f"{float(scores[2]):.3f}",
-            }
-        else:
-            result = {
-                "emo2vec_sim_frame": f"{float(scores[1]):.3f}",
-                "arousal_valance_sim_frame": f"{float(scores[3]):.3f}",
-            }
-
-        return result
-    except Exception as e:
-        return {"error": str(e)}
-
 # Gradio interface
 with gr.Blocks() as iface:
-    gr.Markdown("# OpenAI TTS and StableAudio Sound Generation Tool")
+    gr.Markdown("# OpenAI TTS and Simple Sound Generation Tool")
 
     with gr.Tab("Text-to-Speech"):
         text_input = gr.Textbox(label="Enter text for speech generation")
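
Review note on the new "bird" branch: `audio` already spans the full clip, so `np.tile(audio, int(duration / 0.5))[:len(t)]` repeats the whole array and then slices back to the original length, i.e. the tile is a no-op. If the intent was a chirp that repeats every 0.5 s, a sketch along these lines would do it. The 0.5 s repeat rate is taken from the existing `duration / 0.5`; the `-8` decay constant and the helper name `bird_chirps` are assumptions for illustration, chosen so each chirp fades within its window.

    import numpy as np

    def bird_chirps(duration, frequency, sample_rate=44100):
        # Build one short decaying chirp, then tile it to fill the clip.
        chirp_len = int(sample_rate * 0.5)            # 0.5 s per chirp
        chirp_t = np.linspace(0, 0.5, chirp_len, False)
        chirp = np.sin(2 * np.pi * frequency * chirp_t) * np.exp(-8 * chirp_t)  # -8: assumed decay
        total_len = int(sample_rate * duration)
        reps = int(np.ceil(total_len / chirp_len))    # enough repeats to cover the clip
        return np.tile(chirp, reps)[:total_len]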
@@ -119,32 +61,19 @@ with gr.Blocks() as iface:
         speech_output = gr.Audio(label="Generated Speech")
         speech_message = gr.Textbox(label="Message")
 
-    with gr.Tab("StableAudio Sound Generation"):
-        prompt_input = gr.Textbox(label="Text Prompt", placeholder="Describe the sound you'd like to generate...")
-        negative_prompt_input = gr.Textbox(label="Negative Prompt", placeholder="Describe what you don't want in the sound...")
-        seed_input = gr.Slider(label="Seed", minimum=0, maximum=10000, step=1, value=0)
-        inference_steps_input = gr.Slider(label="Inference Steps", minimum=50, maximum=500, step=10, value=200)
-        duration_input = gr.Slider(label="Duration (seconds)", minimum=1.0, maximum=30.0, step=0.5, value=10.0)
-        waveforms_input = gr.Slider(label="Number of Waveforms", minimum=1, maximum=5, step=1, value=1)
+    with gr.Tab("Simple Sound Generation"):
+        prompt_input = gr.Textbox(label="Sound Description", placeholder="Describe the sound (e.g., rain, wind, bird)...")
+        duration_input = gr.Slider(label="Duration (seconds)", minimum=1.0, maximum=30.0, step=0.5, value=5.0)
+        frequency_input = gr.Slider(label="Base Frequency (Hz)", minimum=20, maximum=2000, step=10, value=440)
         sound_button = gr.Button("Generate Sound")
         sound_output = gr.Audio(label="Generated Sound")
         sound_message = gr.Textbox(label="Message")
 
-    with gr.Tab("Emotion Evaluation"):
-        ref_audio_input = gr.Audio(label="Reference Audio")
-        gen_audio_input = gr.Audio(label="Generated Audio")
-        uttwise_score_input = gr.Checkbox(label="Use utterance-wise score")
-        evaluate_button = gr.Button("Evaluate Emotion")
-        evaluation_output = gr.JSON(label="Evaluation Results")
-
     speech_button.click(text_to_speech_with_emotion,
                         inputs=[text_input, voice_input, model_input],
                         outputs=[speech_output, speech_message])
-    sound_button.click(generate_sound,
-                       inputs=[prompt_input, negative_prompt_input, seed_input, inference_steps_input, duration_input, waveforms_input],
+    sound_button.click(generate_simple_sound,
+                       inputs=[prompt_input, duration_input, frequency_input],
                        outputs=[sound_output, sound_message])
-    evaluate_button.click(evaluate_emotion,
-                          inputs=[ref_audio_input, gen_audio_input, uttwise_score_input],
-                          outputs=[evaluation_output])
 
     iface.launch()
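
A further note: both the old and new versions write output through `tempfile.mktemp`, which has long been deprecated in the standard library because another process can claim the returned name before the file is created. A minimal sketch of the usual replacement, using `tempfile.NamedTemporaryFile` with `delete=False` so the file survives for Gradio to serve; the helper name `save_wav` is ours, for illustration only.

    import tempfile
    import soundfile as sf

    def save_wav(audio, sample_rate):
        # NamedTemporaryFile creates the file atomically, avoiding the mktemp race;
        # delete=False keeps it on disk so the Gradio Audio component can read it.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            output_path = f.name
        sf.write(output_path, audio, sample_rate)
        return output_path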
 
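Finally, a quick round-trip check of the synthesizer's output. This is a sketch under one assumption: `generate_simple_sound` has been pulled into scope by hand (pasted into a REPL or moved to its own module), since importing app.py as written would also construct the OpenAI client and call `iface.launch()` at module level.

    import soundfile as sf

    # generate_simple_sound brought into scope separately (see assumption above)
    path, message = generate_simple_sound("rain", duration=5.0, frequency=440)
    audio, sr = sf.read(path)
    print(message)                                # "Simple sound generated for 'rain'"
    print(f"{len(audio) / sr:.2f} s at {sr} Hz")  # expect ~5.00 s at 44100 Hz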