apruvd committed on
Commit
c7880a2
·
1 Parent(s): 1be5702

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -43
app.py CHANGED
@@ -1,49 +1,41 @@
1
  import whisper
2
- model = whisper.load_model("base")
3
- model.device
4
-
5
  import gradio as gr
6
-
7
  from keybert import KeyBERT
8
  import random as r
9
  from diffusers import StableDiffusionPipeline, EulerDiscreteScheduler
10
  import torch
11
-
12
- model_id = 'prompthero/midjourney-v4-diffusion' #"stabilityai/stable-diffusion-2"
13
-
14
- # Use the Euler scheduler here instead
15
- scheduler = EulerDiscreteScheduler.from_pretrained(model_id, subfolder="scheduler")
16
- pipe = StableDiffusionPipeline.from_pretrained(model_id , torch_dtype=torch.float16) #pipe = StableDiffusionPipeline.from_pretrained(model_id, scheduler=scheduler, revision="fp16", torch_dtype=torch.float16)
17
- pipe = pipe.to("cuda")
18
-
19
- # from IPython.display import Image
20
  from PIL import Image
21
  import time
22
  import matplotlib.pyplot as plt
23
  import numpy as np
24
  import PIL
25
 
26
- # import cv2
 
 
 
 
 
 
 
27
 
28
  def transcribe(audio,prompt_num,user_keywords):
29
 
30
- # load audio and pad/trim it to fit 30 seconds
31
  audio1 = whisper.load_audio(audio)
32
  audio1 = whisper.pad_or_trim(audio1)
33
 
34
- # make log-Mel spectrogram and move to the same device as the model
35
  mel = whisper.log_mel_spectrogram(audio1).to(model.device)
36
 
37
- # detect the spoken language
38
  _, probs = model.detect_language(mel)
39
  print(f"Detected language: {max(probs, key=probs.get)}")
40
 
41
- # decode the audio
42
  options = whisper.DecodingOptions()
43
  result = whisper.decode(model, mel, options)
44
  print(result.text)
45
 
46
- # model = whisper.load_model("base")
47
  audio2 = whisper.load_audio(audio)
48
  final_result = model.transcribe(audio2)
49
  print(final_result["text"])
@@ -53,7 +45,6 @@ def transcribe(audio,prompt_num,user_keywords):
53
 
54
  def keywords(text,prompt_num,user_keywords):
55
 
56
- # ub = UrlBuilder("demo.imgix.net")
57
 
58
  kw_model = KeyBERT()
59
  a = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 3), stop_words=None)
@@ -83,12 +74,8 @@ def keywords(text,prompt_num,user_keywords):
83
  my_list = user_keywords.split(',')
84
  print(my_list)
85
 
86
- # for i in range(len(my_list)):
87
- # sentence.append(my_list[i])
88
-
89
- # numb = 5
90
  for i in range(len(my_list)):
91
- # print("keyword_pool",keyword_pool, len(keyword_pool))
92
 
93
  sentence.append("mdjrny-v4 style")
94
 
@@ -122,12 +109,11 @@ def keywords(text,prompt_num,user_keywords):
122
  sentence.append(r.choice(set_2))
123
  sentence.append(r.choice(set_3))
124
 
125
- # Add Style Tail Prompt
126
  sentence.append(r.choice(style_prompts))
127
 
128
  print("sentence: ", sentence)
129
 
130
- # Formatting Data as comma-delimited for Mid Journey
131
  myprompt = ', '.join(str(e) for e in sentence)
132
  sentence = []
133
  print("prompt: ",myprompt)
@@ -135,38 +121,25 @@ def keywords(text,prompt_num,user_keywords):
135
 
136
  count += 1
137
 
138
- print("no. of prompts: ", len(generated_prompts))
139
- print("generated prompts: ", generated_prompts)
140
-
141
  count = 0
142
  images = []
143
- # np_images = []
144
- while count != int(len(generated_prompts)):
145
 
 
146
  for i in generated_prompts:
147
  count += 1
148
- print(i)
149
  image = pipe(i, height=768, width=768, guidance_scale = 10).images[0]
150
- # image.save("/content/drive/MyDrive/ColabNotebooks/GeneratedImages/" + "sd_image_" +str(count)+ ".png")
151
  images.append(image)
152
 
153
-
154
- # pick the image which is the smallest, and resize the others to match it (can be arbitrary image shape here)
155
  min_shape = sorted( [(np.sum(i.size), i.size ) for i in images])[0][1]
156
  imgs_comb = np.hstack([i.resize(min_shape) for i in images])
157
-
158
- # save that beautiful picture
159
  imgs_comb = Image.fromarray( imgs_comb)
160
- # imgs_comb.save("/content/drive/MyDrive/ColabNotebooks/GeneratedImages/" + "Combined.png")
161
-
162
-
163
- # return imgs_comb #for combined image
164
  return images
165
 
166
  speech_text = gr.Interface(fn=transcribe, inputs=[gr.Audio(source="microphone", type="filepath"),gr.Number(placeholder = "Number of Images to be generated (int): "),gr.Textbox(placeholder = "Additional keywords (comma delimitied): ")], outputs=["text","number","text"], title = 'Speech to Image Generator', enable_queue=True)
167
  text_prompts = gr.Interface(fn=keywords, inputs=["text","number","text"], outputs=gr.Gallery(label="Generated images", show_label=False, elem_id="gallery").style(grid=[2], height="auto"), title = 'Speech to Image Generator', enable_queue=True)
168
 
169
- gr.Series(speech_text,text_prompts).launch(inline = False, share=True, enable_queue=True).queue()
170
 
171
 
172
 
 
1
  import whisper
 
 
 
2
  import gradio as gr
 
3
  from keybert import KeyBERT
4
  import random as r
5
  from diffusers import StableDiffusionPipeline, EulerDiscreteScheduler
6
  import torch
 
 
 
 
 
 
 
 
 
7
  from PIL import Image
8
  import time
9
  import matplotlib.pyplot as plt
10
  import numpy as np
11
  import PIL
12
 
13
+ model = whisper.load_model("base")
14
+ model.device
15
+
16
+ model_id = 'prompthero/midjourney-v4-diffusion' #"stabilityai/stable-diffusion-2"
17
+
18
+ scheduler = EulerDiscreteScheduler.from_pretrained(model_id, subfolder="scheduler")
19
+ pipe = StableDiffusionPipeline.from_pretrained(model_id , torch_dtype=torch.float16) #pipe = StableDiffusionPipeline.from_pretrained(model_id, scheduler=scheduler, revision="fp16", torch_dtype=torch.float16)
20
+ pipe = pipe.to("cuda")
21
 
22
  def transcribe(audio,prompt_num,user_keywords):
23
 
24
+
25
  audio1 = whisper.load_audio(audio)
26
  audio1 = whisper.pad_or_trim(audio1)
27
 
28
+
29
  mel = whisper.log_mel_spectrogram(audio1).to(model.device)
30
 
31
+
32
  _, probs = model.detect_language(mel)
33
  print(f"Detected language: {max(probs, key=probs.get)}")
34
 
 
35
  options = whisper.DecodingOptions()
36
  result = whisper.decode(model, mel, options)
37
  print(result.text)
38
 
 
39
  audio2 = whisper.load_audio(audio)
40
  final_result = model.transcribe(audio2)
41
  print(final_result["text"])
 
45
 
46
  def keywords(text,prompt_num,user_keywords):
47
 
 
48
 
49
  kw_model = KeyBERT()
50
  a = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 3), stop_words=None)
 
74
  my_list = user_keywords.split(',')
75
  print(my_list)
76
 
 
 
 
 
77
  for i in range(len(my_list)):
78
+
79
 
80
  sentence.append("mdjrny-v4 style")
81
 
 
109
  sentence.append(r.choice(set_2))
110
  sentence.append(r.choice(set_3))
111
 
112
+
113
  sentence.append(r.choice(style_prompts))
114
 
115
  print("sentence: ", sentence)
116
 
 
117
  myprompt = ', '.join(str(e) for e in sentence)
118
  sentence = []
119
  print("prompt: ",myprompt)
 
121
 
122
  count += 1
123
 
 
 
 
124
  count = 0
125
  images = []
 
 
126
 
127
+ while count != int(len(generated_prompts)):
128
  for i in generated_prompts:
129
  count += 1
 
130
  image = pipe(i, height=768, width=768, guidance_scale = 10).images[0]
 
131
  images.append(image)
132
 
 
 
133
  min_shape = sorted( [(np.sum(i.size), i.size ) for i in images])[0][1]
134
  imgs_comb = np.hstack([i.resize(min_shape) for i in images])
 
 
135
  imgs_comb = Image.fromarray( imgs_comb)
136
+
 
 
 
137
  return images
138
 
139
  speech_text = gr.Interface(fn=transcribe, inputs=[gr.Audio(source="microphone", type="filepath"),gr.Number(placeholder = "Number of Images to be generated (int): "),gr.Textbox(placeholder = "Additional keywords (comma delimitied): ")], outputs=["text","number","text"], title = 'Speech to Image Generator', enable_queue=True)
140
  text_prompts = gr.Interface(fn=keywords, inputs=["text","number","text"], outputs=gr.Gallery(label="Generated images", show_label=False, elem_id="gallery").style(grid=[2], height="auto"), title = 'Speech to Image Generator', enable_queue=True)
141
 
142
+ gr.Series(speech_text,text_prompts).launch(inline = False, enable_queue=True).queue()
143
 
144
 
145