Spaces: Running on Zero

Update app.py

app.py CHANGED
@@ -69,14 +69,23 @@ def load_sarvam():
 @spaces.GPU
 def load_vision_model():
     try:
-
-
-
-
-
+        print("Starting to load vision model...")
+        model_id = "microsoft/Phi-3.5-vision-instruct"
+        print(f"Loading model from {model_id}")
+        model = AutoModelForCausalLM.from_pretrained(
+            model_id,
+            trust_remote_code=True,
+            torch_dtype=torch.float16,
+            use_flash_attention_2=False
+        )
+        print("Model loaded successfully")
+        print("Loading processor...")
+        processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True, num_crops=16)
+        print("Processor loaded successfully")
+        return model, processor
     except Exception as e:
-        print(f"
-        return None, None
+        print(f"Detailed error in loading vision model: {str(e)}")
+        return None, None


 # Process audio input within a GPU-decorated function
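The new loader pins the model to microsoft/Phi-3.5-vision-instruct and returns a (model, processor) pair, falling back to (None, None) on any exception. As a minimal standalone sketch of the happy path (without the @spaces.GPU wrapper or the progress prints, and assuming a transformers version that accepts this model's remote code), it boils down to:

    import torch
    from transformers import AutoModelForCausalLM, AutoProcessor

    model_id = "microsoft/Phi-3.5-vision-instruct"
    # trust_remote_code is required: Phi-3.5-vision ships custom modeling code.
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        trust_remote_code=True,
        torch_dtype=torch.float16,   # half precision to fit GPU memory
        use_flash_attention_2=False, # avoid the flash-attn dependency on Zero
    )
    # num_crops=16 matches the diff; it controls the processor's image tiling.
    processor = AutoProcessor.from_pretrained(
        model_id, trust_remote_code=True, num_crops=16
    )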
@@ -96,33 +105,28 @@ def process_audio_input(audio, whisper_processor, whisper_model):

 # Updated process_image_input function
 @spaces.GPU
-def process_image_input(image, vision_model, tokenizer):
-    if vision_model is None or tokenizer is None:
+def process_image_input(image, vision_model, processor):
+    if vision_model is None or processor is None:
         return "Error: Vision model is not available."

     try:
         # Process the image
         image = Image.open(io.BytesIO(image)).convert('RGB')
-
-
-        # Create a prompt
-        prompt = "Describe this image in detail:\n"
-        input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(vision_model.device)
+        inputs = processor(images=image, return_tensors="pt").to(vision_model.device)

         # Generate text
         with torch.no_grad():
             outputs = vision_model.generate(
-                input_ids,
+                **inputs,
                 max_new_tokens=100,
                 do_sample=True,
                 top_k=50,
                 top_p=0.95,
-                num_return_sequences=1
-                image_features=image_features
+                num_return_sequences=1
             )

-        generated_text =
-        return generated_text
+        generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
+        return generated_text
     except Exception as e:
         return f"Error processing image: {str(e)}"
 # Generate response within a GPU-decorated function
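Since the rewritten process_image_input decodes raw bytes via Image.open(io.BytesIO(image)), callers must hand it an encoded image, not a PIL object. A minimal hedged driver, assuming the two functions from this file are importable and using an assumed local file name:

    # Hypothetical smoke test for the updated function; "sample.jpg" is an
    # assumed local file, not something the Space provides.
    with open("sample.jpg", "rb") as f:
        image_bytes = f.read()

    vision_model, processor = load_vision_model()
    print(process_image_input(image_bytes, vision_model, processor))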
@@ -181,14 +185,14 @@ def indic_vision_assistant(input_type, audio_input, text_input, image_input):
     try:
         whisper_processor, whisper_model = load_whisper()
         sarvam_pipe = load_sarvam()
-        vision_model, tokenizer = load_vision_model()
+        vision_model, processor = load_vision_model()

         if input_type == "audio" and audio_input is not None:
             transcription = process_audio_input(audio_input, whisper_processor, whisper_model)
         elif input_type == "text" and text_input:
             transcription = text_input
         elif input_type == "image" and image_input is not None:
-            transcription = process_image_input(image_input, vision_model, tokenizer)
+            transcription = process_image_input(image_input, vision_model, processor)
         else:
             return "Please provide either audio, text, or image input.", "No input provided.", None

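The wiring change keeps the same graceful-degradation contract: load_vision_model() returns (None, None) on failure, and the guard at the top of process_image_input turns that into an error string instead of a crash. A sketch of that failure path:

    # If loading failed, image requests degrade to an error message.
    vision_model, processor = load_vision_model()
    if vision_model is None or processor is None:
        result = process_image_input(b"", vision_model, processor)
        # result == "Error: Vision model is not available."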
@@ -201,7 +205,6 @@ def indic_vision_assistant(input_type, audio_input, text_input, image_input):
         error_message = f"An error occurred: {str(e)}"
         return error_message, error_message, None

-
 # Custom CSS
 custom_css = """
 body {