Update app.py

app.py CHANGED
@@ -2,24 +2,12 @@ import gradio as gr
 import numpy as np
 import tempfile
 import os
-import torch
-import soundfile as sf
-from diffusers import StableAudioPipeline
 from openai import OpenAI
-import
+import soundfile as sf

 # Initialize OpenAI client
 client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

-# We'll initialize StableAudio only when it's needed to save memory
-pipe = None
-
-def initialize_stable_audio():
-    global pipe
-    if pipe is None:
-        pipe = StableAudioPipeline.from_pretrained("stabilityai/stable-audio-open-1.0", torch_dtype=torch.float16)
-        pipe = pipe.to("cuda" if torch.cuda.is_available() else "cpu")
-
 def text_to_speech_with_emotion(text, voice, model):
     try:
         response = client.audio.speech.create(
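Note on the removed path: the deleted code loaded stabilityai/stable-audio-open-1.0 lazily (the pipe = None / initialize_stable_audio() pattern above), and most of the deleted generation body is lost in this view; only its trailing num_waveforms_per_prompt= / generator= arguments survive in the next hunk. For orientation, a minimal sketch of the standard diffusers StableAudioPipeline recipe that code appears to have followed (adapted from the diffusers documentation, not recovered from this commit):

import torch
import soundfile as sf
from diffusers import StableAudioPipeline

# Load once and reuse, as the deleted initialize_stable_audio() did.
# float16 effectively assumes a CUDA device; the CPU fallback is very slow.
pipe = StableAudioPipeline.from_pretrained(
    "stabilityai/stable-audio-open-1.0", torch_dtype=torch.float16
)
pipe = pipe.to("cuda" if torch.cuda.is_available() else "cpu")

generator = torch.Generator(device=pipe.device).manual_seed(0)
audio = pipe(
    "gentle rain on a tin roof",  # illustrative prompt
    num_inference_steps=200,
    audio_end_in_s=10.0,
    num_waveforms_per_prompt=1,
    generator=generator,
).audios

# .audios is (batch, channels, samples); soundfile expects (samples, channels).
sf.write("out.wav", audio[0].T.float().cpu().numpy(), pipe.vae.sampling_rate)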
@@ -37,79 +25,33 @@ def text_to_speech_with_emotion(text, voice, model):
     except Exception as e:
         return None, f"Error in speech generation: {str(e)}"

-def
+def generate_simple_sound(description, duration, frequency):
     try:
-
+        sample_rate = 44100
+        t = np.linspace(0, duration, int(sample_rate * duration), False)

-
-
-
-
-
-
-
-
-
-            num_waveforms_per_prompt=waveforms,
-            generator=generator,
-        ).audios
+        if "rain" in description.lower():
+            audio = np.random.normal(0, 0.1, len(t))
+        elif "wind" in description.lower():
+            audio = np.sin(2 * np.pi * frequency * t) * np.random.normal(1, 0.1, len(t))
+        elif "bird" in description.lower():
+            audio = np.sin(2 * np.pi * frequency * t) * np.exp(-0.5 * t)
+            audio = np.tile(audio, int(duration / 0.5))[:len(t)]
+        else:
+            audio = np.sin(2 * np.pi * frequency * t)

-
-
+        audio = audio / np.max(np.abs(audio))
+
         output_path = tempfile.mktemp(suffix=".wav")
-        sf.write(output_path,
+        sf.write(output_path, audio, sample_rate)

-        return output_path, f"
+        return output_path, f"Simple sound generated for '{description}'"
     except Exception as e:
         return None, f"Error in sound generation: {str(e)}"

-# Placeholder functions for emotion evaluation
-def emo2vec_sim(ref_paths, gen_paths):
-    return [(ref, gen, np.random.random(), np.random.random()) for ref, gen in zip(ref_paths, gen_paths)]
-
-def arousal_valence_sim(ref_paths, gen_paths):
-    return [(ref, gen, np.random.random(), np.random.random()) for ref, gen in zip(ref_paths, gen_paths)]
-
-def evaluate_emotion(ref_audio, gen_audio, uttwise_score=False):
-    try:
-        ref_paths = [ref_audio]
-        gen_paths = [gen_audio]
-        arousal_valance_results = arousal_valence_sim(ref_paths, gen_paths)
-        emo2vec_results = emo2vec_sim(ref_paths, gen_paths)
-
-        scores = [0] * 4
-
-        for arousal_valance_result, emo2vec_result in zip(arousal_valance_results, emo2vec_results):
-            emo2vec_sim_utt = emo2vec_result[2]
-            emo2vec_sim_frame = emo2vec_result[3]
-            arousal_valance_sim_utt = arousal_valance_result[2]
-            arousal_valance_sim_frame = arousal_valance_result[3]
-
-            scores[0] += emo2vec_sim_utt
-            scores[1] += emo2vec_sim_frame
-            scores[2] += arousal_valance_sim_utt
-            scores[3] += arousal_valance_sim_frame
-
-        scores = [score / len(ref_paths) for score in scores]
-
-        if uttwise_score:
-            result = {
-                "emo2vec_sim_utt": f"{float(scores[0]):.3f}",
-                "arousal_valance_sim_utt": f"{float(scores[2]):.3f}",
-            }
-        else:
-            result = {
-                "emo2vec_sim_frame": f"{float(scores[1]):.3f}",
-                "arousal_valance_sim_frame": f"{float(scores[3]):.3f}",
-            }
-
-        return result
-    except Exception as e:
-        return {"error": str(e)}
-
 # Gradio interface
 with gr.Blocks() as iface:
-    gr.Markdown("# OpenAI TTS and
+    gr.Markdown("# OpenAI TTS and Simple Sound Generation Tool")

     with gr.Tab("Text-to-Speech"):
         text_input = gr.Textbox(label="Enter text for speech generation")
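One note on the added bird branch: audio already spans the whole clip, so np.tile(audio, int(duration / 0.5))[:len(t)] hands back the first len(t) samples unchanged, making the tiling a no-op. If the intent was a chirp repeating every 0.5 s, a hypothetical variant (not what this commit ships) would build one short chirp and tile that instead:

# One 0.5 s decaying chirp, repeated to fill the clip (hypothetical fix).
chirp_t = t[: int(sample_rate * 0.5)]
chirp = np.sin(2 * np.pi * frequency * chirp_t) * np.exp(-10 * chirp_t)
audio = np.tile(chirp, int(np.ceil(len(t) / len(chirp))))[: len(t)]

Separately, tempfile.mktemp() is documented as race-prone; tempfile.NamedTemporaryFile(suffix=".wav", delete=False) is the usual replacement when the path must outlive the function.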
@@ -119,32 +61,19 @@ with gr.Blocks() as iface:
         speech_output = gr.Audio(label="Generated Speech")
         speech_message = gr.Textbox(label="Message")

-    with gr.Tab("
-        prompt_input = gr.Textbox(label="
-
-
-        inference_steps_input = gr.Slider(label="Inference Steps", minimum=50, maximum=500, step=10, value=200)
-        duration_input = gr.Slider(label="Duration (seconds)", minimum=1.0, maximum=30.0, step=0.5, value=10.0)
-        waveforms_input = gr.Slider(label="Number of Waveforms", minimum=1, maximum=5, step=1, value=1)
+    with gr.Tab("Simple Sound Generation"):
+        prompt_input = gr.Textbox(label="Sound Description", placeholder="Describe the sound (e.g., rain, wind, bird)...")
+        duration_input = gr.Slider(label="Duration (seconds)", minimum=1.0, maximum=30.0, step=0.5, value=5.0)
+        frequency_input = gr.Slider(label="Base Frequency (Hz)", minimum=20, maximum=2000, step=10, value=440)
         sound_button = gr.Button("Generate Sound")
         sound_output = gr.Audio(label="Generated Sound")
         sound_message = gr.Textbox(label="Message")

-    with gr.Tab("Emotion Evaluation"):
-        ref_audio_input = gr.Audio(label="Reference Audio")
-        gen_audio_input = gr.Audio(label="Generated Audio")
-        uttwise_score_input = gr.Checkbox(label="Use utterance-wise score")
-        evaluate_button = gr.Button("Evaluate Emotion")
-        evaluation_output = gr.JSON(label="Evaluation Results")
-
     speech_button.click(text_to_speech_with_emotion,
                         inputs=[text_input, voice_input, model_input],
                         outputs=[speech_output, speech_message])
-    sound_button.click(
-                       inputs=[prompt_input,
+    sound_button.click(generate_simple_sound,
+                       inputs=[prompt_input, duration_input, frequency_input],
                        outputs=[sound_output, sound_message])

 iface.launch()
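To exercise the new handler without launching the UI, the function can be pasted into a REPL and driven directly (a minimal smoke test; importing app.py itself would call iface.launch() at module scope and block):

# Assumes generate_simple_sound and its imports are already defined in the session.
for desc in ("rain", "wind", "bird", "tone"):
    path, msg = generate_simple_sound(desc, duration=2.0, frequency=440.0)
    print(f"{desc}: {msg} ({path})")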