fffiloni commited on
Commit
0d33acd
·
1 Parent(s): 02a9629

Create new file

Files changed (1)
  1. app.py +41 -0
app.py ADDED
@@ -0,0 +1,41 @@
+ import gradio as gr
+ import numpy as np
+ import torch
+
+ from diffusers import DiffusionPipeline
+ from transformers import (
+     WhisperForConditionalGeneration,
+     WhisperProcessor,
+ )
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ # Whisper transcribes the spoken prompt; transcription happens inside the custom pipeline
+ model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small").to(device)
+ processor = WhisperProcessor.from_pretrained("openai/whisper-small")
+
+ # Community "speech_to_image_diffusion" pipeline: speech -> Whisper transcription -> Stable Diffusion image
+ diffuser_pipeline = DiffusionPipeline.from_pretrained(
+     "CompVis/stable-diffusion-v1-4",
+     custom_pipeline="speech_to_image_diffusion",
+     speech_model=model,
+     speech_processor=processor,
+     revision="fp16",
+     torch_dtype=torch.float16,
+ )
+
+ diffuser_pipeline.enable_attention_slicing()
+ diffuser_pipeline = diffuser_pipeline.to(device)
+
+ # --------------------------------------------------
+ # GRADIO SETUP
+
+ audio_input = gr.Audio(source="microphone", type="numpy")
+ image_output = gr.Image()
+
+ def speech_to_text(audio_sample):
+     # gr.Audio with type="numpy" passes the recording as a (sample_rate, int16 array) tuple
+     sample_rate, speech_data = audio_sample
+     # normalize to float32 in [-1, 1], as expected by the Whisper feature extractor
+     speech_data = speech_data.astype(np.float32) / 32768.0
+     output = diffuser_pipeline(speech_data)
+     return output.images[0]
+
+ demo = gr.Interface(fn=speech_to_text, inputs=audio_input, outputs=image_output)
+ demo.launch()