Spaces: Runtime error
Dhruv Diddi committed · e1d4069
1 Parent(s): a8c30fe
any text to stable diffusion
app.py
CHANGED
@@ -1,117 +1,19 @@
 import gradio as gr
-#import torch
-import whisper
 from datetime import datetime
 from PIL import Image
 import flag
 import os
-#MY_SECRET_TOKEN=os.environ.get('HF_TOKEN_SD')
-
-#from diffusers import StableDiffusionPipeline
 
 stable_diffusion = gr.Blocks.load(name="spaces/stabilityai/stable-diffusion")
 ### —————————————————————————————————————————
 
-title="
-
-### —————————————————————————————————————————
-
-whisper_model = whisper.load_model("small")
-
-#device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-
-#pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", use_auth_token=MY_SECRET_TOKEN)
-#pipe.to(device)
-
-### —————————————————————————————————————————
+title="Any Text to Stable Diffusion"
 
 def get_images(prompt):
     gallery_dir = stable_diffusion(prompt, fn_index=2)
     return [os.path.join(gallery_dir, img) for img in os.listdir(gallery_dir)]
 
 
-def magic_whisper_to_sd(audio, guidance_scale, nb_iterations, seed):
-
-    whisper_results = translate(audio)
-    prompt = whisper_results[2]
-    images = get_images(prompt)
-
-    return whisper_results[0], whisper_results[1], whisper_results[2], images
-
-#def diffuse(prompt, guidance_scale, nb_iterations, seed):
-#
-#    generator = torch.Generator(device=device).manual_seed(int(seed))
-#
-#    print("""
-#    —
-#    Sending prompt to Stable Diffusion ...
-#    —
-#    """)
-#    print("prompt: " + prompt)
-#    print("guidance scale: " + str(guidance_scale))
-#    print("inference steps: " + str(nb_iterations))
-#    print("seed: " + str(seed))
-#
-#    images_list = pipe(
-#        [prompt] * 2,
-#        guidance_scale=guidance_scale,
-#        num_inference_steps=nb_iterations,
-#        generator=generator
-#    )
-#
-#    images = []
-#
-#    safe_image = Image.open(r"unsafe.png")
-#
-#    for i, image in enumerate(images_list["sample"]):
-#        if(images_list["nsfw_content_detected"][i]):
-#            images.append(safe_image)
-#        else:
-#            images.append(image)
-#
-#
-#    print("Stable Diffusion has finished")
-#    print("———————————————————————————————————————————")
-#
-#    return images
-
-def translate(audio):
-    print("""
-    —
-    Sending audio to Whisper ...
-    —
-    """)
-    # current dateTime
-    now = datetime.now()
-    # convert to string
-    date_time_str = now.strftime("%Y-%m-%d %H:%M:%S")
-    print('DateTime String:', date_time_str)
-
-    audio = whisper.load_audio(audio)
-    audio = whisper.pad_or_trim(audio)
-
-    mel = whisper.log_mel_spectrogram(audio).to(whisper_model.device)
-
-    _, probs = whisper_model.detect_language(mel)
-
-    transcript_options = whisper.DecodingOptions(task="transcribe", fp16 = False)
-    translate_options = whisper.DecodingOptions(task="translate", fp16 = False)
-
-    transcription = whisper.decode(whisper_model, mel, transcript_options)
-    translation = whisper.decode(whisper_model, mel, translate_options)
-
-    print("language spoken: " + transcription.language)
-    print("transcript: " + transcription.text)
-    print("———————————————————————————————————————————")
-    print("translated: " + translation.text)
-    if transcription.language == "en":
-        tr_flag = flag.flag('GB')
-    else:
-        tr_flag = flag.flag(transcription.language)
-    return tr_flag, transcription.text, translation.text
-
-### —————————————————————————————————————————
-
 css = """
 .container {
     max-width: 880px;
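Reviewer note: after this commit, `get_images` is the only bridge to Stable Diffusion. A minimal self-contained sketch of the same pattern, assuming gradio 3.x, where `gr.Blocks.load` returns a callable proxy for the hosted Space; the `fn_index=2` endpoint and its directory-of-images return value are taken from the code above, not independently verified:

```python
import os

import gradio as gr

# Callable proxy for the hosted Space (gradio 3.x API).
stable_diffusion = gr.Blocks.load(name="spaces/stabilityai/stable-diffusion")

def get_images(prompt):
    # fn_index=2 selects the Space's text-to-image endpoint (per app.py);
    # the call blocks until inference finishes and returns a gallery directory.
    gallery_dir = stable_diffusion(prompt, fn_index=2)
    # Return file paths so a gallery component can render the images.
    return [os.path.join(gallery_dir, img) for img in os.listdir(gallery_dir)]

if __name__ == "__main__":
    print(get_images("a watercolor fox in the snow"))
```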
@@ -274,15 +176,14 @@ with gr.Blocks(css=css) as demo:
         with gr.Column():
             gr.HTML('''
                 <h1>
-
+                Any Text to Stable Diffusion
                 </h1>
                 <p style='text-align: center;'>
-                Ask stable diffusion
+                Ask stable diffusion in any language !
                 </p>
 
                 <p style='text-align: center;'>
-                This demo is
-                —
+                This demo is connected to StableDiffusion Space • Offered by ddiddi <br />
                 </p>
 
             ''')
@@ -291,35 +192,12 @@ with gr.Blocks(css=css) as demo:
 
             gr.Markdown(
                 """
-
-                ## 1. Record audio or Upload an audio file:
+                ## 1. Stable Diffusion Config
                 """
             )
 
-            with gr.Tab(label="Record audio input", elem_id="record_tab"):
-                with gr.Column():
-                    record_input = gr.Audio(
-                        source="microphone",
-                        type="filepath",
-                        show_label=False,
-                        elem_id="record_btn"
-                    )
-                    with gr.Row():
-                        audio_r_translate = gr.Button("Check Whisper first ?", elem_id="check_btn_1")
-                        audio_r_direct_sd = gr.Button("Magic Whisper › SD right now!", elem_id="magic_btn_1")
-
-            with gr.Tab(label="Upload audio input", elem_id="upload_tab"):
-                with gr.Column():
-                    upload_input = gr.Audio(
-                        source="upload",
-                        type="filepath",
-                        show_label=False,
-                        elem_id="upload_area"
-                    )
-                    with gr.Row():
-                        audio_u_translate = gr.Button("Check Whisper first ?", elem_id="check_btn_2")
-                        audio_u_direct_sd = gr.Button("Magic Whisper › SD right now!", elem_id="magic_btn_2")
+
 
             with gr.Accordion(label="Stable Diffusion Settings", elem_id="sd_settings", visible=False):
                 with gr.Row():
                     guidance_scale = gr.Slider(2, 15, value = 7, label = 'Guidance Scale')
@@ -328,28 +206,27 @@ with gr.Blocks(css=css) as demo:
 
             gr.Markdown(
                 """
-                ## 2.
+                ## 2. Enter prompt
                 """
             )
 
             with gr.Row():
 
                 transcripted_output = gr.Textbox(
-                    label="
+                    label="Enter prompt",
                     lines=3,
-                    elem_id="
+                    elem_id="transcript"
                 )
-
-
+
                 with gr.Column():
                     translated_output = gr.Textbox(
-                        label="
+                        label="in English",
                         lines=4,
                         elem_id="translated"
                     )
                     with gr.Row():
                         clear_btn = gr.Button(value="Clear")
-                        diffuse_btn = gr.Button(value="
+                        diffuse_btn = gr.Button(value="YES", elem_id="diffuse_btn")
 
             clear_btn.click(fn=lambda value: gr.update(value=""), inputs=clear_btn, outputs=translated_output)
 
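Reviewer note: the Clear wiring kept by this hunk uses a gradio 3.x idiom worth spelling out: a component (here the button itself) is passed as `inputs` just to satisfy the handler signature, and the handler returns `gr.update(...)` to change a property of the output component in place. A standalone sketch, with the same component names as the diff:

```python
import gradio as gr

with gr.Blocks() as demo:
    translated_output = gr.Textbox(label="in English", lines=4)
    clear_btn = gr.Button(value="Clear")
    # The lambda receives the button's value ("Clear") and ignores it;
    # gr.update(value="") blanks the textbox without recreating it.
    clear_btn.click(fn=lambda value: gr.update(value=""),
                    inputs=clear_btn,
                    outputs=translated_output)

demo.launch()
```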
@@ -362,8 +239,8 @@ with gr.Blocks(css=css) as demo:
 
 
             gr.Markdown("""
-                ## 3.
-                Inference time is about ~
+                ## 3. Stable Diffusion Results
+                Inference time is about ~30-40 seconds
                 """
             )
 
@@ -371,11 +248,8 @@ with gr.Blocks(css=css) as demo:
 
 
             gr.Markdown("""
-                ###
-
-                <strong>Whisper</strong> is a general-purpose speech recognition model.<br /><br />
-                It is trained on a large dataset of diverse audio and is also a multi-task model that can perform multilingual speech recognition as well as speech translation and language identification. <br />
-                —
+                ### Resources
+
                 </p>
                 <p style='font-size: 1em;line-height: 1.5em;'>
                 <strong>Stable Diffusion</strong> is a state of the art text-to-image model that generates images from text.
@@ -406,49 +280,6 @@ with gr.Blocks(css=css) as demo:
 
             """, elem_id="about")
 
-            audio_r_translate.click(translate,
-                inputs = record_input,
-                outputs = [
-                    language_detected_output,
-                    transcripted_output,
-                    translated_output
-                ])
-
-            audio_u_translate.click(translate,
-                inputs = upload_input,
-                outputs = [
-                    language_detected_output,
-                    transcripted_output,
-                    translated_output
-                ])
-
-            audio_r_direct_sd.click(magic_whisper_to_sd,
-                inputs = [
-                    record_input,
-                    guidance_scale,
-                    nb_iterations,
-                    seed
-                ],
-                outputs = [
-                    language_detected_output,
-                    transcripted_output,
-                    translated_output,
-                    sd_output
-                ])
-
-            audio_u_direct_sd.click(magic_whisper_to_sd,
-                inputs = [
-                    upload_input,
-                    guidance_scale,
-                    nb_iterations,
-                    seed
-                ],
-                outputs = [
-                    language_detected_output,
-                    transcripted_output,
-                    translated_output,
-                    sd_output
-                ])
 
             diffuse_btn.click(get_images,
                 inputs = [
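Reviewer note on the deletions above: each removed `.click` mapped a multi-value return onto a list of output components positionally, e.g. the deleted `translate` returned `(flag, transcript, translation)`, which filled `language_detected_output`, `transcripted_output`, and `translated_output` in that order. A minimal sketch of that positional mapping; the handler and component names here are illustrative stand-ins for the removed Whisper path:

```python
import gradio as gr

def fake_translate(audio_path):
    # Stand-in for the removed Whisper call; the three returned values
    # are assigned positionally to the three outputs listed below.
    return "en", "transcript text", "translated text"

with gr.Blocks() as demo:
    audio_in = gr.Audio(type="filepath")
    lang_out = gr.Textbox(label="language detected")
    transcript_out = gr.Textbox(label="transcript")
    translation_out = gr.Textbox(label="translation")
    check_btn = gr.Button("Check Whisper first ?")
    check_btn.click(fake_translate,
                    inputs=audio_in,
                    outputs=[lang_out, transcript_out, translation_out])
```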
@@ -456,12 +287,6 @@ with gr.Blocks(css=css) as demo:
                 ],
                 outputs = sd_output
             )
-            gr.HTML('''
-                <div class="footer">
-                    <p>Whisper by <a href="https://github.com/openai/whisper" target="_blank">OpenAI</a> - Stable Diffusion by <a href="https://huggingface.co/CompVis" target="_blank">CompVis</a> and <a href="https://huggingface.co/stabilityai" target="_blank">Stability AI</a>
-                    </p>
-                </div>
-            ''')
 
 
 if __name__ == "__main__":
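Reviewer note: taken together, the hunks reduce the app to a text box feeding `get_images`. A sketch of the post-commit shape under stated assumptions: the component wired into `diffuse_btn.click` falls between hunks and is assumed here to be the prompt textbox, and the `sd_output` widget is assumed to be a `gr.Gallery` (neither appears in the diff):

```python
import os

import gradio as gr

stable_diffusion = gr.Blocks.load(name="spaces/stabilityai/stable-diffusion")

def get_images(prompt):
    gallery_dir = stable_diffusion(prompt, fn_index=2)
    return [os.path.join(gallery_dir, img) for img in os.listdir(gallery_dir)]

with gr.Blocks() as demo:
    # "transcripted_output" in the diff; relabeled "Enter prompt" by this commit.
    prompt_box = gr.Textbox(label="Enter prompt", lines=3)
    diffuse_btn = gr.Button(value="YES", elem_id="diffuse_btn")
    sd_output = gr.Gallery()  # assumption: widget type not shown in the hunks
    diffuse_btn.click(get_images, inputs=[prompt_box], outputs=sd_output)

if __name__ == "__main__":
    demo.launch()
```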