apruvd committed on
Commit 916d940 · 1 Parent(s): e5835a1

Creating app.py

Files changed (1)
  1. app.py +172 -0
app.py ADDED
import whisper

# load the Whisper "base" speech-recognition model
model = whisper.load_model("base")
print(model.device)
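
# "base" is one of Whisper's smaller checkpoints (~74M parameters); "small", "medium",
# or "large" transcribe more accurately at the cost of load time and VRAM.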

import gradio as gr

from keybert import KeyBERT
import random as r
from diffusers import StableDiffusionPipeline, EulerDiscreteScheduler
import torch

model_id = 'prompthero/midjourney-v4-diffusion'  # alternative: "stabilityai/stable-diffusion-2"

# build an Euler scheduler for this model
scheduler = EulerDiscreteScheduler.from_pretrained(model_id, subfolder="scheduler")
# note: the scheduler is constructed but not passed in here; add scheduler=scheduler
# to from_pretrained below to actually use it
pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)
pipe = pipe.to("cuda")
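
# prompthero/midjourney-v4-diffusion is a Stable Diffusion fine-tune on Midjourney v4
# images; the "mdjrny-v4 style" token appended to each prompt below is its trigger
# phrase. Loading in float16 roughly halves GPU memory use.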

from PIL import Image
import numpy as np

def transcribe(audio, prompt_num, user_keywords):

    # load audio and pad/trim it to fit 30 seconds
    audio1 = whisper.load_audio(audio)
    audio1 = whisper.pad_or_trim(audio1)

    # make a log-Mel spectrogram and move it to the same device as the model
    mel = whisper.log_mel_spectrogram(audio1).to(model.device)

    # detect the spoken language
    _, probs = model.detect_language(mel)
    print(f"Detected language: {max(probs, key=probs.get)}")

    # decode the 30-second clip, mainly as a sanity check
    options = whisper.DecodingOptions()
    result = whisper.decode(model, mel, options)
    print(result.text)

    # transcribe the full, untrimmed recording for the actual prompt text
    audio2 = whisper.load_audio(audio)
    final_result = model.transcribe(audio2)
    print(final_result["text"])
    return final_result["text"], int(prompt_num), user_keywords
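
# The three return values (transcript, image count, extra keywords) line up with the
# "text", "number", "text" inputs of the keywords() interface chained after this one.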


def keywords(text, prompt_num, user_keywords):

    # four KeyBERT extraction passes: plain similarity, Max Sum, and MMR at high
    # and low diversity
    kw_model = KeyBERT()
    a = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 3), stop_words=None)
    set_1 = [i[0] for i in a]
    b = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 3), stop_words='english',
                                  use_maxsum=True, nr_candidates=20, top_n=5)
    set_2 = [i[0] for i in b]
    c = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 3), stop_words='english',
                                  use_mmr=True, diversity=0.7)
    set_3 = [i[0] for i in c]
    d = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 3), stop_words='english',
                                  use_mmr=True, diversity=0.2)
    set_4 = [i[0] for i in d]
    keyword_pool = set_1 + set_2 + set_3 + set_4
    print("keywords: ", keyword_pool, "length: ", len(keyword_pool))

    generated_prompts = []

    # style "tails"; one of these is appended to the end of every prompt
    style_prompts = [
        "perfect shading, soft studio lighting, ultra-realistic, photorealistic, octane render, cinematic lighting, hdr, in-frame, 4k, 8k, edge lighting",
        "detailed, colourful, psychedelic, unreal engine, octane render, blender effect",
        "mechanical features, cybernetic eyes, baroque, rococo, anodized titanium highly detailed mechanisms, gears, fiber, cogs, bulbs, wires, cables, 70mm, Canon EOS 6D Mark II, 4k, 35mm (FX, Full-Frame), f/2.5, extremely detailed, very high details, photorealistic, hi res, hdr, UHD, hyper-detailed, ultra-realistic, vibrant, centered, vivid colors, Wide angle, zoom out",
        "detailed, soft ambiance, japanese influence, unreal engine 5, octane render",
        "perfect shading, soft studio lighting, ultra-realistic, photorealistic, octane render, cinematic lighting, hdr, in-frame, 4k, 8k, edge lighting --v 4",
    ]

    my_list = user_keywords.split(',')
    print(my_list)

    for _ in range(int(prompt_num)):

        sentence = []

        # start every prompt with the checkpoint's trigger token
        sentence.append("mdjrny-v4 style")

        # then the user's own keywords
        for kw in my_list:
            sentence.append(kw)

        # randomly favour one extraction strategy: take two phrases from it, plus
        # one from each of the other three (assumes each keyword set is non-empty)
        favoured = r.choice([set_1, set_2, set_3, set_4])
        sentence.append(r.choice(favoured))
        sentence.append(r.choice(favoured))
        for s in (set_1, set_2, set_3, set_4):
            if s is not favoured:
                sentence.append(r.choice(s))

        # add a style tail prompt
        sentence.append(r.choice(style_prompts))

        print("sentence: ", sentence)

        # format as a comma-delimited, Midjourney-style prompt string
        myprompt = ', '.join(str(e) for e in sentence)
        print("prompt: ", myprompt)
        generated_prompts.append(myprompt)

    print("no. of prompts: ", len(generated_prompts))
    print("generated prompts: ", generated_prompts)

    # generate one image per prompt and save each to disk
    images = []
    for count, prompt in enumerate(generated_prompts, start=1):
        print(prompt)
        image = pipe(prompt, height=768, width=768, guidance_scale=10).images[0]
        image.save("/content/drive/MyDrive/ColabNotebooks/GeneratedImages/" + "sd_image_" + str(count) + ".png")
        images.append(image)

    # pick the smallest image and resize the others to match it, so the row
    # heights agree before stacking side by side
    min_shape = sorted([(np.sum(i.size), i.size) for i in images])[0][1]
    imgs_comb = np.hstack([i.resize(min_shape) for i in images])

    # save the combined strip as a single picture
    imgs_comb = Image.fromarray(imgs_comb)
    imgs_comb.save("/content/drive/MyDrive/ColabNotebooks/GeneratedImages/" + "Combined.png")

    # return imgs_comb  # to return the combined image instead
    return images
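
# keywords() returns the list of PIL images so the Gallery output can render them;
# the combined strip is only written to disk.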

speech_text = gr.Interface(fn=transcribe,
                           inputs=[gr.Audio(source="microphone", type="filepath"),
                                   gr.Number(placeholder="Number of Images to be generated (int): "),
                                   gr.Textbox(placeholder="Additional keywords (comma delimited): ")],
                           outputs=["text", "number", "text"],
                           title='Speech to Image Generator')
text_prompts = gr.Interface(fn=keywords,
                            inputs=["text", "number", "text"],
                            outputs=gr.Gallery(label="Generated images", show_label=False, elem_id="gallery").style(grid=[2], height="auto"),
                            title='Speech to Image Generator')

# gr.Series (from gradio.mix) pipes each interface's outputs into the next interface's
# inputs; queue() must be applied before launch(), which does not return the app object
gr.Series(speech_text, text_prompts).queue().launch(inline=False, share=True, debug=True)
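
# Usage: speak into the microphone, set the image count and optional extra keywords,
# then feed the transcript through the second interface to get the image gallery.
# This wiring targets the Gradio 3.x-era API; gradio.mix was dropped in later releases.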