capradeepgujaran committed (verified)
Commit df63b30 · Parent: d0ff432

Update app.py

Files changed (1):
  1. app.py +24 -95
app.py CHANGED
@@ -2,24 +2,12 @@ import gradio as gr
 import numpy as np
 import tempfile
 import os
-import torch
-import soundfile as sf
-from diffusers import StableAudioPipeline
 from openai import OpenAI
-import base64
 
 # Initialize OpenAI client
 client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
+import soundfile as sf
 
-# We'll initialize StableAudio only when it's needed to save memory
-pipe = None
-
-def initialize_stable_audio():
-    global pipe
-    if pipe is None:
-        pipe = StableAudioPipeline.from_pretrained("stabilityai/stable-audio-open-1.0", torch_dtype=torch.float16)
-        pipe = pipe.to("cuda" if torch.cuda.is_available() else "cpu")
-
 def text_to_speech_with_emotion(text, voice, model):
     try:
         response = client.audio.speech.create(
@@ -37,79 +25,33 @@ def text_to_speech_with_emotion(text, voice, model):
     except Exception as e:
         return None, f"Error in speech generation: {str(e)}"
 
-def generate_sound(prompt, negative_prompt, seed, inference_steps, duration, waveforms):
+def generate_simple_sound(description, duration, frequency):
     try:
-        initialize_stable_audio()
+        sample_rate = 44100
+        t = np.linspace(0, duration, int(sample_rate * duration), False)
 
-        # Set the seed for reproducibility
-        generator = torch.Generator("cuda" if torch.cuda.is_available() else "cpu").manual_seed(seed)
-
-        # Run the audio generation
-        audio = pipe(
-            prompt,
-            negative_prompt=negative_prompt,
-            num_inference_steps=inference_steps,
-            audio_length_in_s=duration,
-            num_waveforms_per_prompt=waveforms,
-            generator=generator,
-        ).audios
+        if "rain" in description.lower():
+            audio = np.random.normal(0, 0.1, len(t))
+        elif "wind" in description.lower():
+            audio = np.sin(2 * np.pi * frequency * t) * np.random.normal(1, 0.1, len(t))
+        elif "bird" in description.lower():
+            audio = np.sin(2 * np.pi * frequency * t) * np.exp(-0.5 * t)
+            audio = np.tile(audio, int(duration / 0.5))[:len(t)]
+        else:
+            audio = np.sin(2 * np.pi * frequency * t)
 
-        # Get the output and save to a file
-        output = audio[0].T.float().cpu().numpy()
+        audio = audio / np.max(np.abs(audio))
+
         output_path = tempfile.mktemp(suffix=".wav")
-        sf.write(output_path, output, pipe.vae.sampling_rate)
+        sf.write(output_path, audio, sample_rate)
 
-        return output_path, f"Sound generated for prompt: '{prompt}'"
+        return output_path, f"Simple sound generated for '{description}'"
     except Exception as e:
         return None, f"Error in sound generation: {str(e)}"
 
-# Placeholder functions for emotion evaluation
-def emo2vec_sim(ref_paths, gen_paths):
-    return [(ref, gen, np.random.random(), np.random.random()) for ref, gen in zip(ref_paths, gen_paths)]
-
-def arousal_valence_sim(ref_paths, gen_paths):
-    return [(ref, gen, np.random.random(), np.random.random()) for ref, gen in zip(ref_paths, gen_paths)]
-
-def evaluate_emotion(ref_audio, gen_audio, uttwise_score=False):
-    try:
-        ref_paths = [ref_audio]
-        gen_paths = [gen_audio]
-        arousal_valance_results = arousal_valence_sim(ref_paths, gen_paths)
-        emo2vec_results = emo2vec_sim(ref_paths, gen_paths)
-
-        scores = [0] * 4
-
-        for arousal_valance_result, emo2vec_result in zip(arousal_valance_results, emo2vec_results):
-            emo2vec_sim_utt = emo2vec_result[2]
-            emo2vec_sim_frame = emo2vec_result[3]
-            arousal_valance_sim_utt = arousal_valance_result[2]
-            arousal_valance_sim_frame = arousal_valance_result[3]
-
-            scores[0] += emo2vec_sim_utt
-            scores[1] += emo2vec_sim_frame
-            scores[2] += arousal_valance_sim_utt
-            scores[3] += arousal_valance_sim_frame
-
-        scores = [score / len(ref_paths) for score in scores]
-
-        if uttwise_score:
-            result = {
-                "emo2vec_sim_utt": f"{float(scores[0]):.3f}",
-                "arousal_valance_sim_utt": f"{float(scores[2]):.3f}",
-            }
-        else:
-            result = {
-                "emo2vec_sim_frame": f"{float(scores[1]):.3f}",
-                "arousal_valance_sim_frame": f"{float(scores[3]):.3f}",
-            }
-
-        return result
-    except Exception as e:
-        return {"error": str(e)}
-
 # Gradio interface
 with gr.Blocks() as iface:
-    gr.Markdown("# OpenAI TTS and StableAudio Sound Generation Tool")
+    gr.Markdown("# OpenAI TTS and Simple Sound Generation Tool")
 
     with gr.Tab("Text-to-Speech"):
         text_input = gr.Textbox(label="Enter text for speech generation")
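
Review note on the new "bird" branch: `audio` already spans the full clip, so `np.tile(audio, int(duration / 0.5))[:len(t)]` repeats the whole array and then slices back to the original length, i.e. the tile is a no-op. If the intent was a chirp that repeats every 0.5 s, a sketch along these lines would do it. The 0.5 s repeat rate is taken from the existing `duration / 0.5`; the `-8` decay constant and the helper name `bird_chirps` are assumptions for illustration, chosen so each chirp fades within its window.

    import numpy as np

    def bird_chirps(duration, frequency, sample_rate=44100):
        # Build one short decaying chirp, then tile it to fill the clip.
        chirp_len = int(sample_rate * 0.5)            # 0.5 s per chirp
        chirp_t = np.linspace(0, 0.5, chirp_len, False)
        chirp = np.sin(2 * np.pi * frequency * chirp_t) * np.exp(-8 * chirp_t)  # -8: assumed decay
        total_len = int(sample_rate * duration)
        reps = int(np.ceil(total_len / chirp_len))    # enough repeats to cover the clip
        return np.tile(chirp, reps)[:total_len]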
@@ -119,32 +61,19 @@ with gr.Blocks() as iface:
         speech_output = gr.Audio(label="Generated Speech")
         speech_message = gr.Textbox(label="Message")
 
-    with gr.Tab("StableAudio Sound Generation"):
-        prompt_input = gr.Textbox(label="Text Prompt", placeholder="Describe the sound you'd like to generate...")
-        negative_prompt_input = gr.Textbox(label="Negative Prompt", placeholder="Describe what you don't want in the sound...")
-        seed_input = gr.Slider(label="Seed", minimum=0, maximum=10000, step=1, value=0)
-        inference_steps_input = gr.Slider(label="Inference Steps", minimum=50, maximum=500, step=10, value=200)
-        duration_input = gr.Slider(label="Duration (seconds)", minimum=1.0, maximum=30.0, step=0.5, value=10.0)
-        waveforms_input = gr.Slider(label="Number of Waveforms", minimum=1, maximum=5, step=1, value=1)
+    with gr.Tab("Simple Sound Generation"):
+        prompt_input = gr.Textbox(label="Sound Description", placeholder="Describe the sound (e.g., rain, wind, bird)...")
+        duration_input = gr.Slider(label="Duration (seconds)", minimum=1.0, maximum=30.0, step=0.5, value=5.0)
+        frequency_input = gr.Slider(label="Base Frequency (Hz)", minimum=20, maximum=2000, step=10, value=440)
         sound_button = gr.Button("Generate Sound")
         sound_output = gr.Audio(label="Generated Sound")
         sound_message = gr.Textbox(label="Message")
 
-    with gr.Tab("Emotion Evaluation"):
-        ref_audio_input = gr.Audio(label="Reference Audio")
-        gen_audio_input = gr.Audio(label="Generated Audio")
-        uttwise_score_input = gr.Checkbox(label="Use utterance-wise score")
-        evaluate_button = gr.Button("Evaluate Emotion")
-        evaluation_output = gr.JSON(label="Evaluation Results")
-
     speech_button.click(text_to_speech_with_emotion,
                         inputs=[text_input, voice_input, model_input],
                         outputs=[speech_output, speech_message])
-    sound_button.click(generate_sound,
-                       inputs=[prompt_input, negative_prompt_input, seed_input, inference_steps_input, duration_input, waveforms_input],
+    sound_button.click(generate_simple_sound,
+                       inputs=[prompt_input, duration_input, frequency_input],
                        outputs=[sound_output, sound_message])
-    evaluate_button.click(evaluate_emotion,
-                          inputs=[ref_audio_input, gen_audio_input, uttwise_score_input],
-                          outputs=[evaluation_output])
 
     iface.launch()
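
A further note: both the old and new versions write output through `tempfile.mktemp`, which has long been deprecated in the standard library because another process can claim the returned name before the file is created. A minimal sketch of the usual replacement, using `tempfile.NamedTemporaryFile` with `delete=False` so the file survives for Gradio to serve; the helper name `save_wav` is ours, for illustration only.

    import tempfile
    import soundfile as sf

    def save_wav(audio, sample_rate):
        # NamedTemporaryFile creates the file atomically, avoiding the mktemp race;
        # delete=False keeps it on disk so the Gradio Audio component can read it.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            output_path = f.name
        sf.write(output_path, audio, sample_rate)
        return output_path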
 
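Finally, a quick round-trip check of the synthesizer's output. This is a sketch under one assumption: `generate_simple_sound` has been pulled into scope by hand (pasted into a REPL or moved to its own module), since importing app.py as written would also construct the OpenAI client and call `iface.launch()` at module level.

    import soundfile as sf

    # generate_simple_sound brought into scope separately (see assumption above)
    path, message = generate_simple_sound("rain", duration=5.0, frequency=440)
    audio, sr = sf.read(path)
    print(message)                                # "Simple sound generated for 'rain'"
    print(f"{len(audio) / sr:.2f} s at {sr} Hz")  # expect ~5.00 s at 44100 Hz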