Final_Assignment_Template

Sleeping

App Files Files Community

huytofu92 committed on May 20

Commit

3098bb4

1 Parent(s): 5f9faf7

Fix image shape issue

Browse files

Files changed (1) hide show

vlm_tools.py +46 -20

vlm_tools.py CHANGED Viewed

@@ -11,14 +11,14 @@ from PIL import Image
 from langchain_core.tools import tool as langchain_tool
 from smolagents.tools import Tool, tool
-def pre_processing(image: str, input_size=(416, 416))->np.ndarray:
     """
     Pre-process an image for YOLO model
     Args:
         image: The image in base64 format to process
         input_size: The size to which the image should be resized
     Returns:
-        The pre-processed image as a numpy array
     """
     try:
         # Decode base64 image
@@ -41,9 +41,16 @@ def pre_processing(image: str, input_size=(416, 416))->np.ndarray:
         if img is None:
             raise ValueError("Failed to resize image")
         # Convert BGR to RGB and normalize
-        img = img[:, :, ::-1].transpose(2, 0, 1)  # BGR to RGB, to CHW
-        img = np.expand_dims(img, axis=0)
         img = img.astype(np.float32) / 255.0  # Normalize to [0, 1]
         return img, original_shape
@@ -121,12 +128,12 @@ def get_image_from_file_path(file_path: str)->str:
     """
     try:
         # Debug prints for original path
-        print(f"Original file_path: {file_path}")
-        print(f"Original path exists: {os.path.exists(file_path)}")
-        if os.path.exists(file_path):
-            print(f"Original path is file: {os.path.isfile(file_path)}")
-            print(f"Original path permissions: {oct(os.stat(file_path).st_mode)[-3:]}")
-            print(f"Original path absolute: {os.path.abspath(file_path)}")
         # Try reading with cv2
         img = cv2.imread(file_path)
@@ -148,12 +155,12 @@ def get_image_from_file_path(file_path: str)->str:
             adjusted_path = os.path.join(current_file_dir, file_path)
             # Debug prints for adjusted path
-            print(f"Adjusted file_path: {adjusted_path}")
-            print(f"Adjusted path exists: {os.path.exists(adjusted_path)}")
-            if os.path.exists(adjusted_path):
-                print(f"Adjusted path is file: {os.path.isfile(adjusted_path)}")
-                print(f"Adjusted path permissions: {oct(os.stat(adjusted_path).st_mode)[-3:]}")
-                print(f"Adjusted path absolute: {os.path.abspath(adjusted_path)}")
             # Try reading with cv2
             img = cv2.imread(adjusted_path)
@@ -305,6 +312,11 @@ class ObjectDetectionTool(Tool):
             self.onnx_path = onnx_path
             self.onnx_model = onnxruntime.InferenceSession(self.onnx_path)
             # Load class labels
             self.classes = [
                 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat',
@@ -332,14 +344,28 @@ class ObjectDetectionTool(Tool):
                     # Preprocess the image
                     img, original_shape = pre_processing(image)
                     # Create blob and run inference
-                    blob = dnn.blobFromImage(img[0], 0.00392, (416, 416), (0, 0, 0), True, crop=False)
-                    onnx_input = {self.onnx_model.get_inputs()[0].name: blob}
                     onnx_output = self.onnx_model.run(None, onnx_input)
                     # Handle shape mismatch by transposing if needed
-                    if onnx_output[0].shape[1] == 255:  # If in NCHW format
-                        onnx_output = [onnx_output[0].transpose(0, 2, 3, 1)]  # Convert to NHWC
                     # Post-process the output
                     objects = post_processing(onnx_output, self.classes, original_shape)

 from langchain_core.tools import tool as langchain_tool
 from smolagents.tools import Tool, tool
+def pre_processing(image: str, input_size=(416, 416))->tuple:
     """
     Pre-process an image for YOLO model
     Args:
         image: The image in base64 format to process
         input_size: The size to which the image should be resized
     Returns:
+        tuple: (processed_image, original_shape)
     """
     try:
         # Decode base64 image
         if img is None:
             raise ValueError("Failed to resize image")
+        # Ensure image is in BGR format (3 channels)
+        if len(img.shape) == 2:  # If grayscale
+            img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
+        elif img.shape[2] == 4:  # If RGBA
+            img = cv2.cvtColor(img, cv2.COLOR_RGBA2BGR)
         # Convert BGR to RGB and normalize
+        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # More reliable than array slicing
+        img = img.transpose(2, 0, 1)  # HWC to CHW
+        img = np.expand_dims(img, axis=0)  # Add batch dimension
         img = img.astype(np.float32) / 255.0  # Normalize to [0, 1]
         return img, original_shape
     """
     try:
         # Debug prints for original path
+        # print(f"Original file_path: {file_path}")
+        # print(f"Original path exists: {os.path.exists(file_path)}")
+        # if os.path.exists(file_path):
+        #     print(f"Original path is file: {os.path.isfile(file_path)}")
+        #     print(f"Original path permissions: {oct(os.stat(file_path).st_mode)[-3:]}")
+        #     print(f"Original path absolute: {os.path.abspath(file_path)}")
         # Try reading with cv2
         img = cv2.imread(file_path)
             adjusted_path = os.path.join(current_file_dir, file_path)
             # Debug prints for adjusted path
+            # print(f"Adjusted file_path: {adjusted_path}")
+            # print(f"Adjusted path exists: {os.path.exists(adjusted_path)}")
+            # if os.path.exists(adjusted_path):
+            #     print(f"Adjusted path is file: {os.path.isfile(adjusted_path)}")
+            #     print(f"Adjusted path permissions: {oct(os.stat(adjusted_path).st_mode)[-3:]}")
+            #     print(f"Adjusted path absolute: {os.path.abspath(adjusted_path)}")
             # Try reading with cv2
             img = cv2.imread(adjusted_path)
             self.onnx_path = onnx_path
             self.onnx_model = onnxruntime.InferenceSession(self.onnx_path)
+            # Get model input details
+            self.input_name = self.onnx_model.get_inputs()[0].name
+            self.input_shape = self.onnx_model.get_inputs()[0].shape
+            print(f"Model input shape: {self.input_shape}")
             # Load class labels
             self.classes = [
                 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat',
                     # Preprocess the image
                     img, original_shape = pre_processing(image)
+                    # Verify input shape
+                    if len(img.shape) != 4:  # Should be NCHW
+                        raise ValueError(f"Invalid input shape: {img.shape}, expected NCHW format")
                     # Create blob and run inference
+                    blob = cv2.dnn.blobFromImage(
+                        img[0].transpose(1, 2, 0),  # Convert back to HWC for blobFromImage
+                        1/255.0,  # Scale factor
+                        (416, 416),  # Size
+                        (0, 0, 0),  # Mean
+                        True,  # SwapRB
+                        crop=False
+                    )
+                    # Run inference
+                    onnx_input = {self.input_name: blob}
                     onnx_output = self.onnx_model.run(None, onnx_input)
                     # Handle shape mismatch by transposing if needed
+                    if len(onnx_output[0].shape) == 4:  # If in NCHW format
+                        if onnx_output[0].shape[1] == 255:  # If channels first
+                            onnx_output = [onnx_output[0].transpose(0, 2, 3, 1)]  # Convert to NHWC
                     # Post-process the output
                     objects = post_processing(onnx_output, self.classes, original_shape)