sagar007 committed
Commit ccb9319 · verified · 1 Parent(s): 15cd21c

Update app.py

Files changed (1):
  app.py +31 -20
app.py CHANGED
@@ -69,15 +69,15 @@ def load_sarvam():
 @spaces.GPU
 def load_vision_model():
     try:
-        model_id = "microsoft/Phi-3.5-vision-instruct"
-        model = AutoModelForCausalLM.from_pretrained(
-            model_id, trust_remote_code=True, torch_dtype=torch.float16, use_flash_attention_2=False
-        )
-        processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True, num_crops=16)
-        return model, processor
+        model_id = "microsoft/phi-2" # Changed to phi-2 as it's more widely available
+        model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, torch_dtype=torch.float16)
+        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+        image_processor = AutoImageProcessor.from_pretrained("microsoft/resnet-50")
+        return model, tokenizer, image_processor
     except Exception as e:
         print(f"Error loading vision model: {e}")
-        return None, None
+        return None, None, None
+

 # Process audio input within a GPU-decorated function
 @spaces.GPU
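
Note: microsoft/phi-2 is a text-only causal LM, and the ResNet-50 AutoImageProcessor produces pixel values that phi-2's forward pass has no input for, so the loader now returns a mismatched triple. A dedicated captioning model would keep the model and processor paired. A minimal sketch, assuming the Space could instead use Salesforce/blip-image-captioning-base; this is not the committed code:

    import torch
    from transformers import BlipProcessor, BlipForConditionalGeneration

    def load_caption_model():
        # Matched model/processor pair from one captioning checkpoint
        # (assumed checkpoint; mirrors load_vision_model's error handling).
        try:
            model_id = "Salesforce/blip-image-captioning-base"
            processor = BlipProcessor.from_pretrained(model_id)
            model = BlipForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16)
            return model, processor
        except Exception as e:
            print(f"Error loading caption model: {e}")
            return None, None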
@@ -94,24 +94,35 @@ def process_audio_input(audio, whisper_processor, whisper_model):
     except Exception as e:
         return f"Error processing audio: {str(e)}. Please type your message instead."

-# Process image input
+# Updated process_image_input function
 @spaces.GPU
-def process_image_input(image, vision_model, vision_processor):
-    if vision_model is None or vision_processor is None:
+def process_image_input(image, vision_model, tokenizer, image_processor):
+    if vision_model is None or tokenizer is None or image_processor is None:
         return "Error: Vision model is not available."

     try:
-        # Add a generic prompt for image description
-        prompt = "Describe this image in detail."
+        # Process the image
+        image = Image.open(io.BytesIO(image)).convert('RGB')
+        image_features = image_processor(images=image, return_tensors="pt")["pixel_values"].to(vision_model.device)

-        inputs = vision_processor(images=image, text=prompt, return_tensors="pt")
-        inputs = {k: v.to(vision_model.device) for k, v in inputs.items()}
+        # Create a prompt
+        prompt = "Describe this image in detail:\n"
+        input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(vision_model.device)

+        # Generate text
         with torch.no_grad():
-            outputs = vision_model.generate(**inputs, max_new_tokens=512, do_sample=True, top_k=50, top_p=0.95)
+            outputs = vision_model.generate(
+                input_ids,
+                max_new_tokens=100,
+                do_sample=True,
+                top_k=50,
+                top_p=0.95,
+                num_return_sequences=1,
+                image_features=image_features
+            )

-        generated_text = vision_processor.batch_decode(outputs, skip_special_tokens=True)[0]
-        return generated_text
+        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        return generated_text.replace(prompt, "") # Remove the prompt from the output
     except Exception as e:
         return f"Error processing image: {str(e)}"
 # Generate response within a GPU-decorated function
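
Note: phi-2's generate() has no image_features parameter (recent transformers versions reject unused model kwargs), so this call fails at runtime and the except branch returns the error string; the image never reaches the model. Image.open(io.BytesIO(image)) also assumes raw bytes, while gr.Image normally delivers a PIL image, NumPy array, or filepath. With the BLIP pair sketched above, captioning would reduce to the following; this sketch assumes gr.Image(type="pil") and is not the committed code:

    import torch

    def caption_image(image, model, processor):
        # `image` is assumed to be a PIL.Image, e.g. from gr.Image(type="pil").
        if model is None or processor is None:
            return "Error: Vision model is not available."
        try:
            inputs = processor(images=image.convert("RGB"), return_tensors="pt").to(model.device, torch.float16)
            with torch.no_grad():
                output_ids = model.generate(**inputs, max_new_tokens=100)
            return processor.decode(output_ids[0], skip_special_tokens=True)
        except Exception as e:
            return f"Error processing image: {str(e)}"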
@@ -168,17 +179,16 @@ def detect_language(text):
 @spaces.GPU
 def indic_vision_assistant(input_type, audio_input, text_input, image_input):
     try:
-        # Load models within the GPU-decorated function
         whisper_processor, whisper_model = load_whisper()
         sarvam_pipe = load_sarvam()
-        vision_model, vision_processor = load_vision_model()
+        vision_model, tokenizer, image_processor = load_vision_model()

         if input_type == "audio" and audio_input is not None:
             transcription = process_audio_input(audio_input, whisper_processor, whisper_model)
         elif input_type == "text" and text_input:
             transcription = text_input
         elif input_type == "image" and image_input is not None:
-            transcription = process_image_input(image_input, vision_model, vision_processor)
+            transcription = process_image_input(image_input, vision_model, tokenizer, image_processor)
         else:
             return "Please provide either audio, text, or image input.", "No input provided.", None

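Note: indic_vision_assistant still reloads Whisper, Sarvam, and the vision stack on every request. A minimal sketch of a process-level cache, assuming the load_* functions are safe to call once and reuse (whether this interacts cleanly with the @spaces.GPU decorators on the loaders is an assumption worth testing):

    from functools import lru_cache

    @lru_cache(maxsize=1)
    def get_models():
        # Build each stack once per process; later calls return the cached tuple.
        whisper_processor, whisper_model = load_whisper()
        sarvam_pipe = load_sarvam()
        vision_model, tokenizer, image_processor = load_vision_model()
        return (whisper_processor, whisper_model, sarvam_pipe,
                vision_model, tokenizer, image_processor)

indic_vision_assistant would then unpack get_models() instead of calling the three loaders on each request.
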
@@ -191,6 +201,7 @@ def indic_vision_assistant(input_type, audio_input, text_input, image_input):
         error_message = f"An error occurred: {str(e)}"
         return error_message, error_message, None

+
 # Custom CSS
 custom_css = """
 body {