pdich2085
/

new-blip

@@ -13,21 +13,66 @@ class EndpointHandler():
             "Salesforce/blip-image-captioning-large"
         ).to(device)
         self.model.eval()
     def __call__(self, image_data: str) -> dict:
         """Caption a base64-encoded image with the BLIP model.

         Args:
             image_data: Base64-encoded image bytes. A dict payload is also
                 accepted; the encoded image is then read from its "inputs"
                 key. NOTE(review): the "inputs" key is assumed from the
                 Hugging Face custom-handler convention -- confirm against
                 the actual request body.

         Returns:
             {"caption": <text>} on success, or
             {"caption": "", "error": <message>} if any step fails.
         """
         try:
             # base64.b64decode rejects a dict with "argument should be a
             # bytes-like object or ASCII string, not 'dict'", so unwrap a
             # dict payload before decoding.
             if isinstance(image_data, dict):
                 if "inputs" not in image_data:
                     raise ValueError('payload dict has no "inputs" key')
                 image_data = image_data["inputs"]
             raw_image = Image.open(BytesIO(base64.b64decode(image_data))).convert("RGB")
             processed_input = self.processor(raw_image, return_tensors="pt").to(device)
             # Inference only: no gradients are needed.
             with torch.no_grad():
                 out = self.model.generate(**processed_input)
             # batch_decode returns one string per generated sequence; a
             # single image yields a single caption.
             caption = self.processor.batch_decode(out, skip_special_tokens=True)[0]
             return {"caption": caption}
         except Exception as e:
             # Best-effort boundary: report the failure in the response
             # instead of raising to the caller.
             print(f"Error during processing: {str(e)}")
             return {"caption": "", "error": str(e)}

             "Salesforce/blip-image-captioning-large"
         ).to(device)
         self.model.eval()
+        self.max_length = 16
+        self.num_beams = 4
     def __call__(self, image_data: str) -> dict:
         """Generate a caption for a base64-encoded image.

         Generation uses the limits stored on the instance
         (self.max_length and self.num_beams).

         Args:
             image_data: Base64-encoded image bytes. A dict payload is also
                 accepted; the encoded image is then taken from its "inputs"
                 key. NOTE(review): the "inputs" key is assumed from the
                 Hugging Face custom-handler convention -- confirm against
                 the actual request body.

         Returns:
             {"caption": <text>} on success, or
             {"caption": "", "error": <message>} if any step fails.
         """
         try:
             # Passing a dict straight to base64.b64decode raises "argument
             # should be a bytes-like object or ASCII string, not 'dict'",
             # so unwrap a dict payload first.
             if isinstance(image_data, dict):
                 if "inputs" not in image_data:
                     raise ValueError('payload dict has no "inputs" key')
                 image_data = image_data["inputs"]
             # Decode the base64 string into a PIL image. convert("RGB")
             # already guarantees RGB mode, so no further mode check is
             # needed.
             raw_image = Image.open(BytesIO(base64.b64decode(image_data))).convert("RGB")
             # Extract pixel values and move them to the device.
             pixel_values = self.processor(raw_image, return_tensors="pt").pixel_values.to(device)
             # Generate the caption.
             gen_kwargs = {"max_length": self.max_length, "num_beams": self.num_beams}
             output_ids = self.model.generate(pixel_values, **gen_kwargs)
             caption = self.processor.batch_decode(output_ids, skip_special_tokens=True)[0]
             return {"caption": caption}
         except Exception as e:
             # Best-effort boundary: report the failure in the response
             # instead of raising to the caller.
             print(f"Error during processing: {str(e)}")
             return {"caption": "", "error": str(e)}
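+# === The code below loads and runs, but requests fail with the following error:
+# == "error": "argument should be a bytes-like object or ASCII string, not 'dict'"
+# == (base64.b64decode raises this when image_data arrives as a dict instead of a base64 string.)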
+# from PIL import Image
+# import torch
+# import base64
+# from io import BytesIO
+# from transformers import BlipForConditionalGeneration, BlipProcessor
+# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+# class EndpointHandler():
+#     def __init__(self, path=""):
+#         self.processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
+#         self.model = BlipForConditionalGeneration.from_pretrained(
+#             "Salesforce/blip-image-captioning-large"
+#         ).to(device)
+#         self.model.eval()
+#     def __call__(self, image_data: str) -> dict:
+#         try:
+#             raw_image = Image.open(BytesIO(base64.b64decode(image_data))).convert("RGB")
+#             processed_input = self.processor(raw_image, return_tensors="pt").to(device)
+#             with torch.no_grad():
+#                 out = self.model.generate(**processed_input)
+#             caption = self.processor.batch_decode(out, skip_special_tokens=True)[0]
+#             return {"caption": caption}
+#         except Exception as e:
+#             print(f"Error during processing: {str(e)}")
+#             return {"caption": "", "error": str(e)}