Spaces: Running on Zero
Update app.py
app.py
CHANGED
@@ -13,8 +13,8 @@ import soundfile as sf
 subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 
 # Constants
-TITLE = "<h1><center>Phi 3.5 Multimodal (Text + Vision)</center></h1>"
-DESCRIPTION = "# Phi-3.5 Multimodal Demo (Text + Vision)"
+TITLE = "<h1><center>Phi 3.5 Multimodal (Text + Vision + Speech)</center></h1>"
+DESCRIPTION = "# Phi-3.5 Multimodal Demo (Text + Vision + Speech)"
 
 # Model configurations
 TEXT_MODEL_ID = "microsoft/Phi-3.5-mini-instruct"
@@ -48,6 +48,11 @@ vision_model = AutoModelForCausalLM.from_pretrained(
 
 vision_processor = AutoProcessor.from_pretrained(VISION_MODEL_ID, trust_remote_code=True)
 
+# Load Parler-TTS model
+tts_device = "cuda:0" if torch.cuda.is_available() else "cpu"
+tts_model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-large-v1").to(tts_device)
+tts_tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-large-v1")
+
 # Helper functions
 @spaces.GPU
 def stream_text_chat(message, history, system_prompt, temperature=0.8, max_new_tokens=1024, top_p=1.0, top_k=20):
@@ -82,7 +87,7 @@ def stream_text_chat(message, history, system_prompt, temperature=0.8, max_new_t
         buffer += new_text
         yield history + [[message, buffer]]
 
-@spaces.GPU
+@spaces.GPU
 def process_vision_query(image, text_input):
     prompt = f"<|user|>\n<|image_1|>\n{text_input}<|end|>\n<|assistant|>\n"
     image = Image.fromarray(image).convert("RGB")
@@ -98,11 +103,6 @@ def process_vision_query(image, text_input):
     generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
     response = vision_processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
     return response
-
-# Load Parler-TTS model
-tts_device = "cuda:0" if torch.cuda.is_available() else "cpu"
-tts_model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-large-v1").to(tts_device)
-tts_tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-large-v1")
 
 @spaces.GPU
 def generate_speech(prompt, description):
@@ -138,7 +138,7 @@ footer { text-align: center; margin-top: 2rem; color: #64748b;}
 custom_header = """
 <div id="custom-header">
     <h1><span class="blue">Phi 3.5</span> <span class="pink">Multimodal Assistant</span></h1>
-    <h2>Text and Vision AI at Your Service</h2>
+    <h2>Text, Vision, and Speech AI at Your Service</h2>
 </div>
 """
 
@@ -178,10 +178,31 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Base().set(
     gr.HTML(custom_suggestions)
 
     with gr.Tab("Text Model (Phi-3.5-mini)"):
-
+        chatbot = gr.Chatbot(height=400)
+        msg = gr.Textbox(label="Message", placeholder="Type your message here...")
+        with gr.Accordion("Advanced Options", open=False):
+            system_prompt = gr.Textbox(value="You are a helpful assistant", label="System Prompt")
+            temperature = gr.Slider(minimum=0, maximum=1, step=0.1, value=0.8, label="Temperature")
+            max_new_tokens = gr.Slider(minimum=128, maximum=8192, step=1, value=1024, label="Max new tokens")
+            top_p = gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=1.0, label="top_p")
+            top_k = gr.Slider(minimum=1, maximum=20, step=1, value=20, label="top_k")
+
+        submit_btn = gr.Button("Submit", variant="primary")
+        clear_btn = gr.Button("Clear Chat", variant="secondary")
+
+        submit_btn.click(stream_text_chat, [msg, chatbot, system_prompt, temperature, max_new_tokens, top_p, top_k], [chatbot])
+        clear_btn.click(lambda: None, None, chatbot, queue=False)
 
     with gr.Tab("Vision Model (Phi-3.5-vision)"):
-
+        with gr.Row():
+            with gr.Column(scale=1):
+                vision_input_img = gr.Image(label="Upload an Image", type="pil")
+                vision_text_input = gr.Textbox(label="Ask a question about the image", placeholder="What do you see in this image?")
+                vision_submit_btn = gr.Button("Analyze Image", variant="primary")
+            with gr.Column(scale=1):
+                vision_output_text = gr.Textbox(label="AI Analysis", lines=10)
+
+        vision_submit_btn.click(process_vision_query, [vision_input_img, vision_text_input], [vision_output_text])
 
     with gr.Tab("Text-to-Speech (Parler-TTS)"):
         with gr.Row():
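Note on the relocated Parler-TTS objects: this commit only moves the tts_device / tts_model / tts_tokenizer setup above the helper functions; the body of generate_speech sits outside the diff hunks. For context, a minimal sketch of how these objects are typically used with the standard parler-tts API follows; the actual body in app.py may differ.

    @spaces.GPU
    def generate_speech(prompt, description):
        # Tokenize the voice description (speaker style) and the text to speak.
        input_ids = tts_tokenizer(description, return_tensors="pt").input_ids.to(tts_device)
        prompt_input_ids = tts_tokenizer(prompt, return_tensors="pt").input_ids.to(tts_device)

        # Generate a waveform; (sampling_rate, array) is the tuple gr.Audio accepts.
        generation = tts_model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
        audio_arr = generation.cpu().numpy().squeeze()
        return tts_model.config.sampling_rate, audio_arr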
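The new submit_btn.click wiring streams stream_text_chat's generator output into the chatbot. The diff shows only the tail of that function (buffer += new_text; yield history + [[message, buffer]]); below is a sketch of the usual transformers streaming pattern behind it, assuming the module-level text_model and text_tokenizer loaded earlier in app.py. Treat it as illustrative, not the exact implementation.

    from threading import Thread
    from transformers import TextIteratorStreamer

    @spaces.GPU
    def stream_text_chat(message, history, system_prompt, temperature=0.8, max_new_tokens=1024, top_p=1.0, top_k=20):
        # Rebuild the conversation in the format expected by the chat template.
        conversation = [{"role": "system", "content": system_prompt}]
        for user, assistant in history:
            conversation += [{"role": "user", "content": user},
                             {"role": "assistant", "content": assistant}]
        conversation.append({"role": "user", "content": message})

        input_ids = text_tokenizer.apply_chat_template(
            conversation, add_generation_prompt=True, return_tensors="pt"
        ).to(text_model.device)

        # Run generation in a background thread and stream tokens to the UI.
        streamer = TextIteratorStreamer(text_tokenizer, skip_prompt=True, skip_special_tokens=True)
        generate_kwargs = dict(input_ids=input_ids, streamer=streamer,
                               max_new_tokens=max_new_tokens, do_sample=temperature > 0,
                               temperature=temperature, top_p=top_p, top_k=top_k)
        Thread(target=text_model.generate, kwargs=generate_kwargs).start()

        buffer = ""
        for new_text in streamer:
            buffer += new_text
            yield history + [[message, buffer]]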