calebaryee321 committed on
Commit
3c53ea4
·
1 Parent(s): 89f557e

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +54 -0
app.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import time
3
+ import sounddevice as sd
4
+ import soundfile as sf
5
+ import time
6
+ import whisper
7
+ from diffusers import StableDiffusionPipeline, EulerDiscreteScheduler
8
+
9
+
10
+
11
def SpeechToText(audio):
    """Transcribe a recorded audio file to text with OpenAI Whisper.

    Args:
        audio: Path to the audio file to transcribe, or ``None`` when
            nothing was recorded.

    Returns:
        The decoded text, or an empty string when ``audio`` is ``None``.
    """
    if audio is None:
        return ""

    model = whisper.load_model("base")

    # Load the waveform and fit it to the length Whisper expects.
    waveform = whisper.pad_or_trim(whisper.load_audio(audio))

    # Make the log-Mel spectrogram and move it to the same device as the model.
    mel = whisper.log_mel_spectrogram(waveform).to(model.device)

    # No separate language-detection pass: its result was never used, and
    # with DecodingOptions.language left unset whisper.decode() detects the
    # spoken language on its own.
    options = whisper.DecodingOptions(fp16=False)
    result = whisper.decode(model, mel, options)
    return result.text
28
+
29
+
30
def img_Generation(text, num_inference_steps=150):
    """Generate an image from a text prompt with Stable Diffusion 2.

    Args:
        text: Prompt handed to the diffusion pipeline.
        num_inference_steps: Number of denoising steps to run. Defaults to
            150, the value this function always used before.

    Returns:
        The first image produced by the pipeline. The same image is also
        written to ``img_1.png`` in the current working directory.
    """
    # torch is needed for the dtype below but is not among the module-level
    # imports, so bring it into scope here.
    import torch

    print(text)
    model_id = "stabilityai/stable-diffusion-2"

    # Use the Euler scheduler here instead of the pipeline's default one.
    scheduler = EulerDiscreteScheduler.from_pretrained(
        model_id, subfolder="scheduler"
    )
    pipe = StableDiffusionPipeline.from_pretrained(
        model_id,
        scheduler=scheduler,
        revision="fp16",
        torch_dtype=torch.float16,
    )
    pipe = pipe.to("cuda")

    image = pipe(text, num_inference_steps=num_inference_steps).images[0]
    image.save("img_1.png")

    return image
42
+
43
+
44
def transcribe(audio):
    """Turn a spoken recording into an image (speech -> text -> image).

    Args:
        audio: Path to the recorded audio file, or ``None`` when nothing
            was recorded.

    Returns:
        The image generated from the transcribed text, or ``None`` when the
        transcription is empty.
    """
    text = SpeechToText(audio)
    if not text.strip():
        # Nothing was recorded or recognised: skip the expensive diffusion
        # run instead of rendering an image for an empty prompt.
        return None

    return img_Generation(text)
49
+
50
+
51
# Build the microphone -> image demo and start serving it.
# `share` expects a boolean; the previous string "True" only worked because
# any non-empty string is truthy.
gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs="image",
    description="A Speech to Image Generation App Using OpenAI's Whisper",
    title="Whisper2IMG",
).launch(share=True)