Spaces:
Runtime error
Update app.py
app.py CHANGED
@@ -1,49 +1,41 @@
 import whisper
-model = whisper.load_model("base")
-model.device
-
 import gradio as gr
-
 from keybert import KeyBERT
 import random as r
 from diffusers import StableDiffusionPipeline, EulerDiscreteScheduler
 import torch
-
-model_id = 'prompthero/midjourney-v4-diffusion' #"stabilityai/stable-diffusion-2"
-
-# Use the Euler scheduler here instead
-scheduler = EulerDiscreteScheduler.from_pretrained(model_id, subfolder="scheduler")
-pipe = StableDiffusionPipeline.from_pretrained(model_id , torch_dtype=torch.float16) #pipe = StableDiffusionPipeline.from_pretrained(model_id, scheduler=scheduler, revision="fp16", torch_dtype=torch.float16)
-pipe = pipe.to("cuda")
-
-# from IPython.display import Image
 from PIL import Image
 import time
 import matplotlib.pyplot as plt
 import numpy as np
 import PIL
 
-
+model = whisper.load_model("base")
+model.device
+
+model_id = 'prompthero/midjourney-v4-diffusion' #"stabilityai/stable-diffusion-2"
+
+scheduler = EulerDiscreteScheduler.from_pretrained(model_id, subfolder="scheduler")
+pipe = StableDiffusionPipeline.from_pretrained(model_id , torch_dtype=torch.float16) #pipe = StableDiffusionPipeline.from_pretrained(model_id, scheduler=scheduler, revision="fp16", torch_dtype=torch.float16)
+pipe = pipe.to("cuda")
 
 def transcribe(audio,prompt_num,user_keywords):
 
-
+
     audio1 = whisper.load_audio(audio)
     audio1 = whisper.pad_or_trim(audio1)
 
-
+
     mel = whisper.log_mel_spectrogram(audio1).to(model.device)
 
-
+
     _, probs = model.detect_language(mel)
     print(f"Detected language: {max(probs, key=probs.get)}")
 
-    # decode the audio
     options = whisper.DecodingOptions()
     result = whisper.decode(model, mel, options)
     print(result.text)
 
-    # model = whisper.load_model("base")
     audio2 = whisper.load_audio(audio)
     final_result = model.transcribe(audio2)
     print(final_result["text"])
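Note that the Euler scheduler is created above but never passed to the pipeline; only the commented-out variant wires it in. A minimal sketch of how the scheduler would actually be attached, using the same diffusers calls:

from diffusers import StableDiffusionPipeline, EulerDiscreteScheduler
import torch

model_id = 'prompthero/midjourney-v4-diffusion'
scheduler = EulerDiscreteScheduler.from_pretrained(model_id, subfolder="scheduler")
# passing scheduler= makes the pipeline use Euler instead of the checkpoint's default
pipe = StableDiffusionPipeline.from_pretrained(model_id, scheduler=scheduler, torch_dtype=torch.float16)
pipe = pipe.to("cuda")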
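The transcribe() function above follows the standard openai-whisper flow: detect the language on a padded or trimmed 30-second window, decode that window, then run a full transcription of the original audio. A condensed sketch of the same steps (the file path is a placeholder):

import whisper

model = whisper.load_model("base")

def transcribe_file(path):
    audio = whisper.pad_or_trim(whisper.load_audio(path))           # 30-second window
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    _, probs = model.detect_language(mel)
    print(f"Detected language: {max(probs, key=probs.get)}")
    result = whisper.decode(model, mel, whisper.DecodingOptions())   # decode the window
    print(result.text)
    return model.transcribe(whisper.load_audio(path))["text"]        # full transcription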
@@ -53,7 +45,6 @@ def transcribe(audio,prompt_num,user_keywords):
 
 def keywords(text,prompt_num,user_keywords):
 
-    # ub = UrlBuilder("demo.imgix.net")
 
     kw_model = KeyBERT()
     a = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 3), stop_words=None)
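keywords() starts from KeyBERT's default extractor, which returns scored keyphrases for the transcribed text. A small self-contained sketch of that call (the input sentence is made up):

from keybert import KeyBERT

kw_model = KeyBERT()
# returns a list of (phrase, score) tuples; phrases here may be one to three words long
phrases = kw_model.extract_keywords(
    "an oil painting of a castle on a hill at sunset",
    keyphrase_ngram_range=(1, 3),
    stop_words=None,
)
print(phrases)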
@@ -83,12 +74,8 @@ def keywords(text,prompt_num,user_keywords):
     my_list = user_keywords.split(',')
     print(my_list)
 
-    # for i in range(len(my_list)):
-    # sentence.append(my_list[i])
-
-    # numb = 5
     for i in range(len(my_list)):
-
+
 
         sentence.append("mdjrny-v4 style")
 
@@ -122,12 +109,11 @@ def keywords(text,prompt_num,user_keywords):
         sentence.append(r.choice(set_2))
         sentence.append(r.choice(set_3))
 
-
+
         sentence.append(r.choice(style_prompts))
 
         print("sentence: ", sentence)
 
-        # Formatting Data as comma-delimited for Mid Journey
         myprompt = ', '.join(str(e) for e in sentence)
         sentence = []
         print("prompt: ",myprompt)
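The loop above builds each prompt by collecting fragments into sentence (the mdjrny-v4 trigger token, the user's comma-separated keywords, and randomly chosen style snippets) and then joining them comma-delimited. A simplified sketch, with the keyword and style lists invented for illustration:

import random as r

user_keywords = "castle, sunset"                                # example input
style_prompts = ["highly detailed", "trending on artstation"]   # stand-in for the Space's style sets

sentence = ["mdjrny-v4 style"]                                  # trigger token for the midjourney-v4 checkpoint
sentence += [k.strip() for k in user_keywords.split(",")]
sentence.append(r.choice(style_prompts))

myprompt = ', '.join(str(e) for e in sentence)
print("prompt: ", myprompt)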
@@ -135,38 +121,25 @@ def keywords(text,prompt_num,user_keywords):
 
         count += 1
 
-    print("no. of prompts: ", len(generated_prompts))
-    print("generated prompts: ", generated_prompts)
-
     count = 0
     images = []
-    # np_images = []
-    while count != int(len(generated_prompts)):
 
+    while count != int(len(generated_prompts)):
         for i in generated_prompts:
             count += 1
-            print(i)
             image = pipe(i, height=768, width=768, guidance_scale = 10).images[0]
-            # image.save("/content/drive/MyDrive/ColabNotebooks/GeneratedImages/" + "sd_image_" +str(count)+ ".png")
             images.append(image)
 
-
-    # pick the image which is the smallest, and resize the others to match it (can be arbitrary image shape here)
     min_shape = sorted( [(np.sum(i.size), i.size ) for i in images])[0][1]
     imgs_comb = np.hstack([i.resize(min_shape) for i in images])
-
-    # save that beautiful picture
     imgs_comb = Image.fromarray( imgs_comb)
-
-
-
-    # return imgs_comb #for combined image
+
     return images
 
 speech_text = gr.Interface(fn=transcribe, inputs=[gr.Audio(source="microphone", type="filepath"),gr.Number(placeholder = "Number of Images to be generated (int): "),gr.Textbox(placeholder = "Additional keywords (comma delimitied): ")], outputs=["text","number","text"], title = 'Speech to Image Generator', enable_queue=True)
 text_prompts = gr.Interface(fn=keywords, inputs=["text","number","text"], outputs=gr.Gallery(label="Generated images", show_label=False, elem_id="gallery").style(grid=[2], height="auto"), title = 'Speech to Image Generator', enable_queue=True)
 
-gr.Series(speech_text,text_prompts).launch(inline = False,
+gr.Series(speech_text,text_prompts).launch(inline = False, enable_queue=True).queue()
 
 
 
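Image generation loops over the collected prompts, runs the Stable Diffusion pipeline once per prompt, and tiles the results into a single horizontal strip by resizing everything to the smallest image. A trimmed-down sketch, assuming the pipe object loaded at the top of app.py and an illustrative prompt list:

import numpy as np
from PIL import Image

generated_prompts = ["mdjrny-v4 style, castle, sunset"]     # illustrative
images = []
for p in generated_prompts:
    images.append(pipe(p, height=768, width=768, guidance_scale=10).images[0])

# resize to the smallest image, then stack horizontally into one strip
min_shape = sorted((np.sum(i.size), i.size) for i in images)[0][1]
strip = Image.fromarray(np.hstack([np.asarray(i.resize(min_shape)) for i in images]))
strip.save("combined.png")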
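gr.Series chains the two Interfaces so that whatever transcribe returns is fed straight into keywords. Conceptually it behaves like the single function below; this is only a sketch, since the return statement of transcribe (the text plus the passed-through number and keyword values) is not shown in this diff:

def speech_to_images(audio, prompt_num, user_keywords):
    text, num, kw = transcribe(audio, prompt_num, user_keywords)   # assumed return order
    return keywords(text, num, kw)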