Spaces:

thianfoo
/

GenAI_StoryTeller

Running

App Files Files Community

thianfoo committed on Jul 2, 2024

Commit

2f29448

verified ·

1 Parent(s): 4440dfa

Update app.py

Browse files

Files changed (1) hide show

app.py +72 -68

app.py CHANGED Viewed

@@ -1,12 +1,14 @@
-'''import gradio as gr
 import numpy as np
 import torch
 from datasets import load_dataset
 from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
-title = "GenAI Audio Demo"
 description = """
 Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in English. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and Microsoft's
 [SpeechT5 TTS](https://huggingface.co/microsoft/speecht5_tts) model for text-to-speech:
@@ -24,6 +26,27 @@ vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(devic
 embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
 speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
 # Function for translating speech in a different language using pretrained models
 def translate(audio):
     outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate"})
@@ -48,62 +71,8 @@ def text_to_speech(text):
     synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
     return 16000, synthesised_speech
-demo = gr.Blocks()
-# Mic translation using microphone as the input
-mic_translate = gr.Interface(
-    fn=speech_to_speech_translation,
-    inputs=gr.Audio(source="microphone", type="filepath"),
-    outputs=gr.Audio(label="Generated Speech", type="numpy"),
-    title=title,
-    description=description,
-)
-# File translation using uploaded files as input
-file_translate = gr.Interface(
-    fn=speech_to_speech_translation,
-    inputs=gr.Audio(source="upload", type="filepath"),
-    outputs=gr.Audio(label="Generated Speech", type="numpy"),
-    examples=[["./english.wav"], ["./chinese.wav"]],
-    title=title,
-    description=description,
-)
- # Text translation using text as input
-text_translate = gr.Interface(
-    fn=text_to_speech,
-    inputs="textbox",
-    outputs=gr.Audio(label="Generated Speech", type="numpy"),
-    title=title,
-    description=description
-)
-# Showcase the demo using different tabs of the different features
-with demo:
-    gr.TabbedInterface([mic_translate, file_translate, text_translate], ["Microphone", "Audio File", "Text to Speech"])
-demo.launch()'''
-import gradio as gr
-import numpy as np
-import random
-from diffusers import DiffusionPipeline
-import torch
-device = "cuda" if torch.cuda.is_available() else "cpu"
-if torch.cuda.is_available():
-    torch.cuda.max_memory_allocated(device=device)
-    pipe = DiffusionPipeline.from_pretrained("stabilityai/sdxl-turbo", torch_dtype=torch.float16, variant="fp16", use_safetensors=True)
-    pipe.enable_xformers_memory_efficient_attention()
-    pipe = pipe.to(device)
-else:
-    pipe = DiffusionPipeline.from_pretrained("stabilityai/sdxl-turbo", use_safetensors=True)
-    pipe = pipe.to(device)
-MAX_SEED = np.iinfo(np.int32).max
-MAX_IMAGE_SIZE = 1024
 def infer(prompt, negative_prompt, seed, randomize_seed, width, height, guidance_scale, num_inference_steps):
     if randomize_seed:
@@ -124,11 +93,10 @@ def infer(prompt, negative_prompt, seed, randomize_seed, width, height, guidance
     return image
 examples = [
-    "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
-    "An astronaut riding a green horse",
-    "A delicious ceviche cheesecake slice",
 ]
 css="""
 #col-container {
     margin: 0 auto;
@@ -136,16 +104,41 @@ css="""
 }
 """
-if torch.cuda.is_available():
-    power_device = "GPU"
-else:
-    power_device = "CPU"
-with gr.Blocks(css=css) as demo:
     with gr.Column(elem_id="col-container"):
         gr.Markdown(f"""
-        # Text-to-Image Gradio Template
         Currently running on {power_device}.
         """)
@@ -229,4 +222,15 @@ with gr.Blocks(css=css) as demo:
         outputs = [result]
     )
-demo.queue().launch()

+import gradio as gr
 import numpy as np
 import torch
+import random
+from diffusers import DiffusionPipeline
 from datasets import load_dataset
 from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
+title = "GenAI StoryTeller"
 description = """
 Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in English. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and Microsoft's
 [SpeechT5 TTS](https://huggingface.co/microsoft/speecht5_tts) model for text-to-speech:
 embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
 speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
+# Load diffusion pipeline for image generation
+if torch.cuda.is_available():
+    torch.cuda.max_memory_allocated(device=device)
+    pipe = DiffusionPipeline.from_pretrained("stabilityai/sdxl-turbo", torch_dtype=torch.float16, variant="fp16", use_safetensors=True)
+    pipe.enable_xformers_memory_efficient_attention()
+    pipe = pipe.to(device)
+else:
+    pipe = DiffusionPipeline.from_pretrained("stabilityai/sdxl-turbo", use_safetensors=True)
+    pipe = pipe.to(device)
+if torch.cuda.is_available():
+    power_device = "GPU"
+else:
+    power_device = "CPU"
+# Upper bounds for the seed value and the image size
+MAX_SEED = np.iinfo(np.int32).max
+MAX_IMAGE_SIZE = 1024
+# Speech GenAI
 # Function for translating speech in a different language using pretrained models
 def translate(audio):
     outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate"})
     synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
     return 16000, synthesised_speech
+# Image GenAI
+# Text to Image
 def infer(prompt, negative_prompt, seed, randomize_seed, width, height, guidance_scale, num_inference_steps):
     if randomize_seed:
     return image
 examples = [
+    "Dog licking ice cream",
 ]
+# CSS
 css="""
 #col-container {
     margin: 0 auto;
 }
 """
+demo = gr.Blocks()
+# Mic translation using microphone as the input
+mic_translate = gr.Interface(
+    fn=speech_to_speech_translation,
+    inputs=gr.Audio(source="microphone", type="filepath"),
+    outputs=gr.Audio(label="Generated Speech", type="numpy"),
+    title=title,
+    description=description,
+)
+# File translation using uploaded files as input
+file_translate = gr.Interface(
+    fn=speech_to_speech_translation,
+    inputs=gr.Audio(source="upload", type="filepath"),
+    outputs=gr.Audio(label="Generated Speech", type="numpy"),
+    examples=[["./english.wav"], ["./chinese.wav"]],
+    title=title,
+    description=description,
+)
+ # Text-to-speech using text as input
+text_translate = gr.Interface(
+    fn=text_to_speech,
+    inputs="textbox",
+    outputs=gr.Audio(label="Generated Speech", type="numpy"),
+    title=title,
+    description=description
+)
+with gr.Blocks(css=css) as image:
     with gr.Column(elem_id="col-container"):
         gr.Markdown(f"""
+        # Text-to-Image
         Currently running on {power_device}.
         """)
         outputs = [result]
     )
+# Text to Image interface
+image_generation = gr.Interface(
+    fn=infer,
+    inputs=[prompt, negative_prompt, seed, randomize_seed, width, height, guidance_scale, num_inference_steps],
+    outputs=[result]
+)
+# Showcase the demo using different tabs of the different features
+with demo:
+    gr.TabbedInterface([mic_translate, file_translate, text_translate, image_generation], ["Microphone", "Audio File", "Text to Speech", "Text to Image"])
+demo.launch()