thianfoo committed
Commit 2f29448 · verified · 1 Parent(s): 4440dfa

Update app.py

Files changed (1):
  1. app.py +72 -68

app.py CHANGED
@@ -1,12 +1,14 @@
- '''import gradio as gr
+ import gradio as gr
  import numpy as np
  import torch
+ import random
+ from diffusers import DiffusionPipeline
  from datasets import load_dataset
  from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline

  device = "cuda:0" if torch.cuda.is_available() else "cpu"

- title = "GenAI Audio Demo"
+ title = "GenAI StoryTeller"
  description = """
  Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in English. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and Microsoft's
  [SpeechT5 TTS](https://huggingface.co/microsoft/speecht5_tts) model for text-to-speech:
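Note: the `asr_pipe` that `translate()` calls in the next hunk is defined outside the changed lines. A minimal sketch of how that stage is typically wired up with the Whisper Base checkpoint named in the description (the variable names here are assumptions, not part of this commit):

```python
import torch
from transformers import pipeline

device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Assumed definition of asr_pipe: Whisper with task="translate"
# transcribes speech in any input language directly into English text.
asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
outputs = asr_pipe("speech.wav", max_new_tokens=256, generate_kwargs={"task": "translate"})
print(outputs["text"])  # English translation of the source speech
```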
@@ -24,6 +26,27 @@ vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)

  embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
  speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
+
+ # Load the diffusion pipeline for image generation
+ if torch.cuda.is_available():
+     torch.cuda.max_memory_allocated(device=device)
+     pipe = DiffusionPipeline.from_pretrained("stabilityai/sdxl-turbo", torch_dtype=torch.float16, variant="fp16", use_safetensors=True)
+     pipe.enable_xformers_memory_efficient_attention()
+     pipe = pipe.to(device)
+ else:
+     pipe = DiffusionPipeline.from_pretrained("stabilityai/sdxl-turbo", use_safetensors=True)
+     pipe = pipe.to(device)
+
+ if torch.cuda.is_available():
+     power_device = "GPU"
+ else:
+     power_device = "CPU"
+
+ # Limit the seed range and image size
+ MAX_SEED = np.iinfo(np.int32).max
+ MAX_IMAGE_SIZE = 1024
+
+ # Speech GenAI
  # Function for translating different languages using pretrained models
  def translate(audio):
      outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate"})
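The hunk above pairs a CMU ARCTIC x-vector with the SpeechT5 models loaded earlier in the file; only the `vocoder` line is visible as hunk context, so the `processor` and `model` definitions below are assumptions consistent with the imports. A sketch of how the pieces combine:

```python
import torch
from datasets import load_dataset
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor

processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
# A single 512-dim x-vector fixes the synthetic speaker's voice
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

inputs = processor(text="Hello, world.", return_tensors="pt")
# generate_speech returns a float waveform sampled at 16 kHz
speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
```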
@@ -48,62 +71,8 @@ def text_to_speech(text):
      synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
      return 16000, synthesised_speech

- demo = gr.Blocks()
-
- # Mic translation using the microphone as input
- mic_translate = gr.Interface(
-     fn=speech_to_speech_translation,
-     inputs=gr.Audio(source="microphone", type="filepath"),
-     outputs=gr.Audio(label="Generated Speech", type="numpy"),
-     title=title,
-     description=description,
- )
-
- # File translation using uploaded files as input
- file_translate = gr.Interface(
-     fn=speech_to_speech_translation,
-     inputs=gr.Audio(source="upload", type="filepath"),
-     outputs=gr.Audio(label="Generated Speech", type="numpy"),
-     examples=[["./english.wav"], ["./chinese.wav"]],
-     title=title,
-     description=description,
- )
-
- # Text translation using text as input
- text_translate = gr.Interface(
-     fn=text_to_speech,
-     inputs="textbox",
-     outputs=gr.Audio(label="Generated Speech", type="numpy"),
-     title=title,
-     description=description
- )
-
- # Showcase the demo's features in separate tabs
- with demo:
-     gr.TabbedInterface([mic_translate, file_translate, text_translate], ["Microphone", "Audio File", "Text to Speech"])
-
- demo.launch()'''
-
- import gradio as gr
- import numpy as np
- import random
- from diffusers import DiffusionPipeline
- import torch
-
- device = "cuda" if torch.cuda.is_available() else "cpu"
-
- if torch.cuda.is_available():
-     torch.cuda.max_memory_allocated(device=device)
-     pipe = DiffusionPipeline.from_pretrained("stabilityai/sdxl-turbo", torch_dtype=torch.float16, variant="fp16", use_safetensors=True)
-     pipe.enable_xformers_memory_efficient_attention()
-     pipe = pipe.to(device)
- else:
-     pipe = DiffusionPipeline.from_pretrained("stabilityai/sdxl-turbo", use_safetensors=True)
-     pipe = pipe.to(device)
-
- MAX_SEED = np.iinfo(np.int32).max
- MAX_IMAGE_SIZE = 1024
-
+ # Image GenAI
+ # Text to Image
  def infer(prompt, negative_prompt, seed, randomize_seed, width, height, guidance_scale, num_inference_steps):

      if randomize_seed:
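The two context lines at the top of this hunk do the audio post-processing: Gradio's `type="numpy"` audio output expects a `(sample_rate, waveform)` tuple, and multiplying a [-1, 1] float waveform by 32767 converts it to 16-bit PCM. A tiny self-contained check:

```python
import numpy as np

float_wave = np.array([0.0, 0.5, -0.5, 1.0], dtype=np.float32)
pcm16 = (float_wave * 32767).astype(np.int16)  # [0, 16383, -16383, 32767]
audio_out = (16000, pcm16)  # 16 kHz sample rate, as returned by text_to_speech()
```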
@@ -124,11 +93,10 @@ def infer(prompt, negative_prompt, seed, randomize_seed, width, height, guidance_scale, num_inference_steps):
      return image

  examples = [
-     "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
-     "An astronaut riding a green horse",
-     "A delicious ceviche cheesecake slice",
+     "Dog licking ice cream",
  ]

+ # CSS
  css="""
  #col-container {
      margin: 0 auto;
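`infer()`'s body is elided between the last two hunks. In the stock SDXL-Turbo text-to-image template the old file was based on, the `if randomize_seed:` guard shown above draws a fresh seed below `MAX_SEED` and feeds a seeded generator to the pipeline; a sketch of that body under this assumption:

```python
    if randomize_seed:
        seed = random.randint(0, MAX_SEED)  # MAX_SEED = np.iinfo(np.int32).max
    generator = torch.Generator().manual_seed(seed)  # reproducible for a fixed seed

    image = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        guidance_scale=guidance_scale,
        num_inference_steps=num_inference_steps,
        width=width,
        height=height,
        generator=generator,
    ).images[0]

    return image
```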
@@ -136,16 +104,41 @@ css="""
  }
  """

- if torch.cuda.is_available():
-     power_device = "GPU"
- else:
-     power_device = "CPU"
+ demo = gr.Blocks()
+
+ # Mic translation using the microphone as input
+ mic_translate = gr.Interface(
+     fn=speech_to_speech_translation,
+     inputs=gr.Audio(source="microphone", type="filepath"),
+     outputs=gr.Audio(label="Generated Speech", type="numpy"),
+     title=title,
+     description=description,
+ )
+
+ # File translation using uploaded files as input
+ file_translate = gr.Interface(
+     fn=speech_to_speech_translation,
+     inputs=gr.Audio(source="upload", type="filepath"),
+     outputs=gr.Audio(label="Generated Speech", type="numpy"),
+     examples=[["./english.wav"], ["./chinese.wav"]],
+     title=title,
+     description=description,
+ )

- with gr.Blocks(css=css) as demo:
+ # Text translation using text as input
+ text_translate = gr.Interface(
+     fn=text_to_speech,
+     inputs="textbox",
+     outputs=gr.Audio(label="Generated Speech", type="numpy"),
+     title=title,
+     description=description
+ )
+
+ with gr.Blocks(css=css) as image:

      with gr.Column(elem_id="col-container"):
          gr.Markdown(f"""
-         # Text-to-Image Gradio Template
+         # Text-to-Image
          Currently running on {power_device}.
          """)
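A compatibility note on the `gr.Audio(source=...)` calls moved in this hunk: the `source=` keyword is Gradio 3.x API. If the Space runs Gradio 4 (an assumption; the commit does not pin a version), the equivalent components would be spelled:

```python
mic_in = gr.Audio(sources=["microphone"], type="filepath")
file_in = gr.Audio(sources=["upload"], type="filepath")
```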
@@ -229,4 +222,15 @@ with gr.Blocks(css=css) as demo:
          outputs = [result]
      )

- demo.queue().launch()
+ # Text to Image interface
+ image_generation = gr.Interface(
+     fn=infer,
+     inputs=[prompt, negative_prompt, seed, randomize_seed, width, height, guidance_scale, num_inference_steps],
+     outputs=[result]
+ )
+
+ # Showcase the demo's features in separate tabs
+ with demo:
+     gr.TabbedInterface([mic_translate, file_translate, text_translate, image_generation], ["Microphone", "Audio File", "Text to Speech", "Text to Image"])
+
+ demo.launch()
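`image_generation` reuses `prompt`, `seed`, `result`, and the other components instantiated inside `with gr.Blocks(css=css) as image:`; reusing already-rendered components in a second interface is fragile in Gradio. A self-contained alternative that gives the tab its own components (a hypothetical restructuring, not what this commit does):

```python
image_generation = gr.Interface(
    fn=infer,
    inputs=[
        gr.Textbox(label="Prompt"),
        gr.Textbox(label="Negative prompt"),
        gr.Slider(0, MAX_SEED, step=1, value=0, label="Seed"),
        gr.Checkbox(value=True, label="Randomize seed"),
        gr.Slider(256, MAX_IMAGE_SIZE, step=32, value=512, label="Width"),
        gr.Slider(256, MAX_IMAGE_SIZE, step=32, value=512, label="Height"),
        gr.Slider(0.0, 10.0, value=0.0, label="Guidance scale"),
        gr.Slider(1, 12, step=1, value=2, label="Inference steps"),
    ],
    outputs=gr.Image(label="Generated Image"),
)

gr.TabbedInterface(
    [mic_translate, file_translate, text_translate, image_generation],
    ["Microphone", "Audio File", "Text to Speech", "Text to Image"],
).launch()
```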