Spaces:
Runtime error
Update app.py
app.py
CHANGED
@@ -21,21 +21,23 @@ pipe = pipe.to("cuda")

def transcribe(audio,prompt_num,user_keywords):

-
+    # load audio and pad/trim it to fit 30 seconds
    audio1 = whisper.load_audio(audio)
    audio1 = whisper.pad_or_trim(audio1)

-
+    # make log-Mel spectrogram and move to the same device as the model
    mel = whisper.log_mel_spectrogram(audio1).to(model.device)

-
+    # detect the spoken language
    _, probs = model.detect_language(mel)
    print(f"Detected language: {max(probs, key=probs.get)}")

+    # decode the audio
    options = whisper.DecodingOptions()
    result = whisper.decode(model, mel, options)
    print(result.text)

+    # model = whisper.load_model("base")
    audio2 = whisper.load_audio(audio)
    final_result = model.transcribe(audio2)
    print(final_result["text"])
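
Reviewer note: the new comments track the standard openai-whisper flow used here. For reference, a minimal standalone sketch of the same two code paths (the "base" model name mirrors the commented-out hint above, and the sample filename is an assumption, not taken from this repo):

import whisper

model = whisper.load_model("base")  # assumed; see the commented-out line in the diff

# low-level path: language detection + decoding on a padded 30-second window
audio = whisper.pad_or_trim(whisper.load_audio("sample.wav"))
mel = whisper.log_mel_spectrogram(audio).to(model.device)
_, probs = model.detect_language(mel)
print("Detected language:", max(probs, key=probs.get))
print(whisper.decode(model, mel, whisper.DecodingOptions()).text)

# high-level path: transcribe the whole file in one call
print(model.transcribe("sample.wav")["text"])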
@@ -47,6 +49,11 @@ def keywords(text,prompt_num,user_keywords):

    transcription = text

+    # ub = UrlBuilder("demo.imgix.net")
+
+
+
+
    kw_model = KeyBERT()
    a = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 3), stop_words=None)
    set_1 = [i[0] for i in a]
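
Reviewer note: KeyBERT's extract_keywords returns (phrase, score) tuples, which is why the code keeps only i[0]. A minimal sketch of the same call (the example text is made up):

from keybert import KeyBERT

kw_model = KeyBERT()
pairs = kw_model.extract_keywords(
    "a sunlit atrium with exposed timber and hanging gardens",
    keyphrase_ngram_range=(1, 3),  # allow phrases of one to three words
    stop_words=None,
)
keywords = [phrase for phrase, score in pairs]  # drop the relevance scores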
@@ -75,8 +82,12 @@ def keywords(text,prompt_num,user_keywords):
    my_list = user_keywords.split(',')
    print(my_list)

+    # for i in range(len(my_list)):
+    # sentence.append(my_list[i])
+
+    # numb = 5
    for i in range(len(my_list)):
-
+        # print("keyword_pool",keyword_pool, len(keyword_pool))

        sentence.append("mdjrny-v4 style")

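
Reviewer note: user_keywords.split(',') keeps any whitespace around each entry, so " sunset" and "sunset" end up as different prompt tokens. If that matters, a one-line cleanup (a suggestion, not part of this commit):

my_list = [k.strip() for k in user_keywords.split(',') if k.strip()]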
@@ -111,10 +122,34 @@ def keywords(text,prompt_num,user_keywords):
        sentence.append(r.choice(set_3))


+        # rand1 = r.randint(0,numb)
+
+        # rand2 = r.randint(0,numb)
+        # if rand2 == rand1:
+        # rand2 = r.randint(0,numb)
+
+        # rand3 = r.randint(0,numb)
+        # if rand3 == rand1 or rand3 == rand2:
+        # rand3 = r.randint(0,numb)
+
+        # rand4 = r.randint(0,numb)
+        # if rand4 == rand1 or rand4 == rand2 or rand4 == rand3:
+        # rand4 = r.randint(0,numb)
+
+        # word_1 = keyword_pool[rand1]
+        # word_2 = keyword_pool[rand2]
+        # word_3 = keyword_pool[rand3]
+        # word_4 = keyword_pool[rand4]
+
+        # sentence.append(word_1 +", "+ word_2+", " + word_3+", " + word_4)
+
+
+        # Add Style Tail Prompt
        sentence.append(r.choice(style_prompts))

        print("sentence: ", sentence)

+        # Formatting Data as comma-delimited for Mid Journey
        myprompt = ', '.join(str(e) for e in sentence)
        sentence = []
        print("prompt: ",myprompt)
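
Reviewer note: the commented-out rand1..rand4 block re-rolls randint to avoid duplicate indices but can still collide after one retry. If it is ever revived, random.sample guarantees four distinct picks in one call (a sketch, assuming keyword_pool has at least four entries):

import random as r

word_1, word_2, word_3, word_4 = r.sample(keyword_pool, 4)
sentence.append(", ".join([word_1, word_2, word_3, word_4]))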
@@ -122,18 +157,23 @@ def keywords(text,prompt_num,user_keywords):

        count += 1

+    print("no. of prompts: ", len(generated_prompts))
+    print("generated prompts: ", generated_prompts)
+
    count = 0
    images = []
-
+    # np_images = []
    while count != int(len(generated_prompts)):
+
        for i in generated_prompts:
            count += 1
+            print(i)
            image = pipe(i, height=768, width=768, guidance_scale = 10).images[0]
            images.append(image)

-    min_shape = sorted( [(np.sum(i.size), i.size ) for i in images])[0][1]
-    imgs_comb = np.hstack([i.resize(min_shape) for i in images])
-    imgs_comb = Image.fromarray( imgs_comb)
+    # min_shape = sorted( [(np.sum(i.size), i.size ) for i in images])[0][1]
+    # imgs_comb = np.hstack([i.resize(min_shape) for i in images])
+    # imgs_comb = Image.fromarray( imgs_comb)

    return images,transcription,keyword_pool,generated_prompts

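
Reviewer note: the now-commented contact-sheet code still works if it is ever needed again; np.hstack coerces the resized PIL images to arrays before stacking. A self-contained sketch of the same idea:

import numpy as np
from PIL import Image

def hstack_images(images):
    # resize everything to the smallest image so the row can be stacked
    min_shape = sorted((np.sum(im.size), im.size) for im in images)[0][1]
    combined = np.hstack([np.asarray(im.resize(min_shape)) for im in images])
    return Image.fromarray(combined)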
@@ -144,8 +184,7 @@ def keywords(text,prompt_num,user_keywords):

speech_text = gr.Interface(fn=transcribe, inputs=[gr.Audio(source="microphone", type="filepath"),gr.Number(label = "Number of Images to be generated (int): "),gr.Textbox(label = "Additional keywords (comma delimitied): ")], outputs=["text","number","text"], theme = "darkhuggingface", title = 'Speech-to-Image-Generator', enable_queue=True)
text_prompts = gr.Interface(fn=keywords, inputs=["text","number","text"], outputs=[gr.Gallery(label="Generated images", show_label=True, elem_id="gallery").style(grid=[2], height="auto"),gr.TextArea(label="Transcription"),gr.TextArea(label="Keywords"),gr.TextArea(label="Generated Prompts")],theme = "darkhuggingface", title = 'Speech-to-Image-Generator', enable_queue=True)

-
#gr.Series(speech_text,text_prompts).launch(auth = ('PWuser','speechtotextPW'), auth_message = "Welcome to Perkins&Will i/o's Synthesia Tool. Use cases: Ideation/Brainstorming tool - Have it running in the background in a conference, brainstorming session, discussion to create contextually relevant visualizations for moodboarding, to spark more conversations, interactions and inspiration. | Aprameya Pandit | February 2023 | ",inline = False, enable_queue=True).queue()
-gr.Series(speech_text,text_prompts).launch(
+gr.Series(speech_text,text_prompts).launch(enable_queue=True).queue()

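
Reviewer note: the removed line ended mid-call (launch( with no closing parenthesis), which would raise a SyntaxError at startup and is the likely cause of this Space's "Runtime error" badge, so completing the call is the substantive fix here. One caveat on the replacement: in Gradio 3.x, launch() returns a (server app, local URL, share URL) tuple rather than the interface, so chaining .queue() after launch() may itself fail at runtime; the usual pattern queues first (a suggestion, untested against whatever Gradio version this Space pins):

gr.Series(speech_text, text_prompts).queue().launch()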