Spaces:

sagar007
/

Multimodal_App

Build error

App Files Files Community

sagar007 committed on Aug 25, 2024

Commit

1b8f6f0

verified ·

1 Parent(s): d3fde93

Update app.py

Browse files

Files changed (1) hide show

app.py +33 -23

app.py CHANGED Viewed

@@ -5,7 +5,9 @@ import gradio as gr
 from threading import Thread
 from PIL import Image
 import subprocess
-import spaces  # Add this import
 # Install flash-attention
 subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
@@ -96,6 +98,24 @@ def process_vision_query(image, text_input):
     generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
     response = vision_processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
     return response
 # Custom CSS
 custom_css = """
@@ -134,8 +154,8 @@ custom_suggestions = """
         <p>Analyze Images with Vision Model</p>
     </div>
     <div class="suggestion">
-        <span class="suggestion-icon">🤖</span>
-        <p>Get AI-generated responses</p>
     </div>
     <div class="suggestion">
         <span class="suggestion-icon">🔍</span>
@@ -158,33 +178,23 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Base().set(
     gr.HTML(custom_suggestions)
     with gr.Tab("Text Model (Phi-3.5-mini)"):
-        chatbot = gr.Chatbot(height=400)
-        msg = gr.Textbox(label="Message", placeholder="Type your message here...")
-        with gr.Accordion("Advanced Options", open=False):
-            system_prompt = gr.Textbox(value="You are a helpful assistant", label="System Prompt")
-            temperature = gr.Slider(minimum=0, maximum=1, step=0.1, value=0.8, label="Temperature")
-            max_new_tokens = gr.Slider(minimum=128, maximum=8192, step=1, value=1024, label="Max new tokens")
-            top_p = gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=1.0, label="top_p")
-            top_k = gr.Slider(minimum=1, maximum=20, step=1, value=20, label="top_k")
-        submit_btn = gr.Button("Submit", variant="primary")
-        clear_btn = gr.Button("Clear Chat", variant="secondary")
-        submit_btn.click(stream_text_chat, [msg, chatbot, system_prompt, temperature, max_new_tokens, top_p, top_k], [chatbot])
-        clear_btn.click(lambda: None, None, chatbot, queue=False)
     with gr.Tab("Vision Model (Phi-3.5-vision)"):
         with gr.Row():
             with gr.Column(scale=1):
-                vision_input_img = gr.Image(label="Upload an Image", type="pil")
-                vision_text_input = gr.Textbox(label="Ask a question about the image", placeholder="What do you see in this image?")
-                vision_submit_btn = gr.Button("Analyze Image", variant="primary")
             with gr.Column(scale=1):
-                vision_output_text = gr.Textbox(label="AI Analysis", lines=10)
-        vision_submit_btn.click(process_vision_query, [vision_input_img, vision_text_input], [vision_output_text])
-    gr.HTML("<footer>Powered by Phi 3.5 Multimodal AI</footer>")
 if __name__ == "__main__":
     demo.launch()

 from threading import Thread
 from PIL import Image
 import subprocess
+import spaces
+from parler_tts import ParlerTTSForConditionalGeneration
+import soundfile as sf
 # Install flash-attention
 subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
     generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
     response = vision_processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
     return response
+# Load the Parler-TTS model and its tokenizer once at module import so every
+# request reuses them; use the first CUDA device when one is available,
+# otherwise fall back to the CPU.
+tts_device = "cuda:0" if torch.cuda.is_available() else "cpu"
+tts_model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-large-v1").to(tts_device)
+tts_tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-large-v1")
+@spaces.GPU
+def generate_speech(prompt, description):
+    """Generate speech for `prompt` and return the path of the WAV file.
+
+    Args:
+        prompt: The text to be spoken.
+        description: Free-text description of the desired voice; it is
+            tokenized and passed to the model as `input_ids`.
+
+    Returns:
+        Path to a newly created WAV file holding the generated audio,
+        written at the model's configured sampling rate.
+    """
+    import tempfile  # local import: only this function needs it
+
+    input_ids = tts_tokenizer(description, return_tensors="pt").input_ids.to(tts_device)
+    prompt_input_ids = tts_tokenizer(prompt, return_tensors="pt").input_ids.to(tts_device)
+    generation = tts_model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
+    # Move the waveform off the device and drop singleton dimensions before writing.
+    audio_arr = generation.cpu().numpy().squeeze()
+    # Use a unique file per call: a single fixed filename would let two
+    # overlapping requests overwrite each other's audio before it is served.
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
+        output_path = tmp.name
+    sf.write(output_path, audio_arr, tts_model.config.sampling_rate)
+    return output_path
 # Custom CSS
 custom_css = """
         <p>Analyze Images with Vision Model</p>
     </div>
     <div class="suggestion">
+        <span class="suggestion-icon">🔊</span>
+        <p>Generate Speech with Parler-TTS</p>
     </div>
     <div class="suggestion">
         <span class="suggestion-icon">🔍</span>
     gr.HTML(custom_suggestions)
     with gr.Tab("Text Model (Phi-3.5-mini)"):
+        # ... (previous text model code remains the same)
     with gr.Tab("Vision Model (Phi-3.5-vision)"):
+        # ... (previous vision model code remains the same)
+    with gr.Tab("Text-to-Speech (Parler-TTS)"):
         with gr.Row():
             with gr.Column(scale=1):
+                tts_prompt = gr.Textbox(label="Text to Speak", placeholder="Enter the text you want to convert to speech...")
+                tts_description = gr.Textbox(label="Voice Description", value="A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up.", lines=3)
+                tts_submit_btn = gr.Button("Generate Speech", variant="primary")
             with gr.Column(scale=1):
+                tts_output_audio = gr.Audio(label="Generated Speech")
+        tts_submit_btn.click(generate_speech, inputs=[tts_prompt, tts_description], outputs=[tts_output_audio])
+    gr.HTML("<footer>Powered by Phi 3.5 Multimodal AI and Parler-TTS</footer>")
 if __name__ == "__main__":
     demo.launch()