apruvd committed
Commit 9648631 · 1 Parent(s): 971bc80

Update app.py

Files changed (1)
app.py +49 -10
app.py CHANGED
@@ -21,21 +21,23 @@ pipe = pipe.to("cuda")
 
 def transcribe(audio,prompt_num,user_keywords):
 
-
+    # load audio and pad/trim it to fit 30 seconds
     audio1 = whisper.load_audio(audio)
     audio1 = whisper.pad_or_trim(audio1)
 
-
+    # make log-Mel spectrogram and move to the same device as the model
     mel = whisper.log_mel_spectrogram(audio1).to(model.device)
 
-
+    # detect the spoken language
     _, probs = model.detect_language(mel)
     print(f"Detected language: {max(probs, key=probs.get)}")
 
+    # decode the audio
     options = whisper.DecodingOptions()
     result = whisper.decode(model, mel, options)
     print(result.text)
 
+    # model = whisper.load_model("base")
     audio2 = whisper.load_audio(audio)
     final_result = model.transcribe(audio2)
     print(final_result["text"])
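
Note: the comments added in this hunk track the standard openai-whisper flow. For reference, a minimal standalone sketch of the same calls (the "base" model name and the audio path are assumptions, not part of this commit; app.py loads its model elsewhere):

import whisper

model = whisper.load_model("base")          # assumed model size, not from this diff
audio = whisper.load_audio("speech.wav")    # hypothetical input path
audio = whisper.pad_or_trim(audio)          # fit Whisper's 30-second window

mel = whisper.log_mel_spectrogram(audio).to(model.device)
_, probs = model.detect_language(mel)       # probs maps language code -> probability
print(f"Detected language: {max(probs, key=probs.get)}")

result = whisper.decode(model, mel, whisper.DecodingOptions())
print(result.text)

The hunk's trailing lines then run model.transcribe() on the untrimmed audio, which does its own chunking, so the 30-second decode above effectively serves as a preview of the full transcription.
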
@@ -47,6 +49,11 @@ def keywords(text,prompt_num,user_keywords):
 
     transcription = text
 
+    # ub = UrlBuilder("demo.imgix.net")
+
+
+
+
     kw_model = KeyBERT()
     a = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 3), stop_words=None)
     set_1 = [i[0] for i in a]
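
For reference, KeyBERT's extract_keywords returns (phrase, score) tuples, which is why the context line set_1 = [i[0] for i in a] keeps only the phrases. A minimal sketch (the sample text is made up):

from keybert import KeyBERT

kw_model = KeyBERT()
text = "We want a sunlit atrium with exposed timber and planted terraces."  # made-up sample
pairs = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 3), stop_words=None)
phrases = [p[0] for p in pairs]   # keep phrases, drop the relevance scores
print(phrases)
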
@@ -75,8 +82,12 @@ def keywords(text,prompt_num,user_keywords):
     my_list = user_keywords.split(',')
     print(my_list)
 
+    # for i in range(len(my_list)):
+    #     sentence.append(my_list[i])
+
+    # numb = 5
     for i in range(len(my_list)):
-
+        # print("keyword_pool",keyword_pool, len(keyword_pool))
 
         sentence.append("mdjrny-v4 style")
 
@@ -111,10 +122,34 @@ def keywords(text,prompt_num,user_keywords):
         sentence.append(r.choice(set_3))
 
 
+        # rand1 = r.randint(0,numb)
+
+        # rand2 = r.randint(0,numb)
+        # if rand2 == rand1:
+        #     rand2 = r.randint(0,numb)
+
+        # rand3 = r.randint(0,numb)
+        # if rand3 == rand1 or rand3 == rand2:
+        #     rand3 = r.randint(0,numb)
+
+        # rand4 = r.randint(0,numb)
+        # if rand4 == rand1 or rand4 == rand2 or rand4 == rand3:
+        #     rand4 = r.randint(0,numb)
+
+        # word_1 = keyword_pool[rand1]
+        # word_2 = keyword_pool[rand2]
+        # word_3 = keyword_pool[rand3]
+        # word_4 = keyword_pool[rand4]
+
+        # sentence.append(word_1 +", "+ word_2+", " + word_3+", " + word_4)
+
+
+        # Add Style Tail Prompt
         sentence.append(r.choice(style_prompts))
 
         print("sentence: ", sentence)
 
+        # Formatting Data as comma-delimited for Mid Journey
         myprompt = ', '.join(str(e) for e in sentence)
         sentence = []
         print("prompt: ",myprompt)
@@ -122,18 +157,23 @@ def keywords(text,prompt_num,user_keywords):
 
         count += 1
 
+    print("no. of prompts: ", len(generated_prompts))
+    print("generated prompts: ", generated_prompts)
+
    count = 0
    images = []
-
+    # np_images = []
    while count != int(len(generated_prompts)):
+
        for i in generated_prompts:
            count += 1
+            print(i)
            image = pipe(i, height=768, width=768, guidance_scale = 10).images[0]
            images.append(image)
 
-    min_shape = sorted( [(np.sum(i.size), i.size ) for i in images])[0][1]
-    imgs_comb = np.hstack([i.resize(min_shape) for i in images])
-    imgs_comb = Image.fromarray( imgs_comb)
+    # min_shape = sorted( [(np.sum(i.size), i.size ) for i in images])[0][1]
+    # imgs_comb = np.hstack([i.resize(min_shape) for i in images])
+    # imgs_comb = Image.fromarray( imgs_comb)
 
    return images,transcription,keyword_pool,generated_prompts
 
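
One reviewer-style note on this hunk: the while count != int(len(generated_prompts)) wrapper is redundant, since the inner for loop already visits every prompt exactly once and drives count to len(generated_prompts), so the while exits after a single pass. A plain loop is equivalent; a minimal sketch (the checkpoint id is an assumption, since the pipe setup sits outside this diff, but the call signature matches the hunk):

import torch
from diffusers import StableDiffusionPipeline

# assumed checkpoint; app.py builds `pipe` before this function runs
pipe = StableDiffusionPipeline.from_pretrained("prompthero/openjourney", torch_dtype=torch.float16)
pipe = pipe.to("cuda")

generated_prompts = ["mdjrny-v4 style, soft daylight, octane render"]   # made-up example
images = []
for prompt in generated_prompts:
    image = pipe(prompt, height=768, width=768, guidance_scale=10).images[0]
    images.append(image)

The three lines commented out in the hunk are the remnants of a contact-sheet feature (resize every image to the smallest one and np.hstack them into a single strip); returning the image list to a gr.Gallery makes that unnecessary.
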
@@ -144,8 +184,7 @@ def keywords(text,prompt_num,user_keywords):
 speech_text = gr.Interface(fn=transcribe, inputs=[gr.Audio(source="microphone", type="filepath"),gr.Number(label = "Number of Images to be generated (int): "),gr.Textbox(label = "Additional keywords (comma delimitied): ")], outputs=["text","number","text"], theme = "darkhuggingface", title = 'Speech-to-Image-Generator', enable_queue=True)
 text_prompts = gr.Interface(fn=keywords, inputs=["text","number","text"], outputs=[gr.Gallery(label="Generated images", show_label=True, elem_id="gallery").style(grid=[2], height="auto"),gr.TextArea(label="Transcription"),gr.TextArea(label="Keywords"),gr.TextArea(label="Generated Prompts")],theme = "darkhuggingface", title = 'Speech-to-Image-Generator', enable_queue=True)
 
-
 #gr.Series(speech_text,text_prompts).launch(auth = ('PWuser','speechtotextPW'), auth_message = "Welcome to Perkins&Will i/o's Synthesia Tool. Use cases: Ideation/Brainstorming tool - Have it running in the background in a conference, brainstorming session, discussion to create contextually relevant visualizations for moodboarding, to spark more conversations, interactions and inspiration. | Aprameya Pandit | February 2023 | ",inline = False, enable_queue=True).queue()
-gr.Series(speech_text,text_prompts).launch( enable_queue=True).queue()
+gr.Series(speech_text,text_prompts).launch(enable_queue=True).queue()
 
 
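
A last hedged note on the launch line: in the Gradio 3.x API this app targets (gr.Audio(source=...), .style(grid=...)), queuing is normally enabled before launching, because Interface.launch() returns server handles rather than the interface, so chaining .queue() after it is fragile. The usual ordering would be something like this sketch, not a tested fix:

demo = gr.Series(speech_text, text_prompts)
demo.queue()     # enable the request queue first
demo.launch()    # then start the server

Whether the extra enable_queue=True flag is still needed depends on the pinned Gradio version.
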