Vidensogende committed on
Commit
692e318
·
1 Parent(s): 4134b65

updated handler to accept and process multiple images

Browse files
Files changed (1) hide show
  1. handler.py +36 -8
handler.py CHANGED
@@ -1,22 +1,50 @@
1
  import requests
2
  from PIL import Image
3
- from transformers import Blip2Processor, Blip2ForConditionalGeneration
4
- from typing import Dict, List, Any
5
  import torch
 
6
 
7
  class EndpointHandler():
8
  def __init__(self, path=""):
9
- self.processor = Blip2Processor.from_pretrained("Salesforce/blip-image-captioning-large")
10
- self.model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
11
 
12
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
13
  self.model.to(self.device)
14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
16
- image = data.pop("inputs", data)
 
 
 
 
 
 
 
17
 
18
- processed = self.processor(images=image, return_tensors="pt").to(self.device)
 
 
 
19
 
20
- out = self.model.generate(**processed)
 
 
 
21
 
22
- return self.processor.decode(out[0], skip_special_tokens=True)
 
 
 
1
  import requests
2
  from PIL import Image
3
+ from transformers import BlipProcessor, BlipForConditionalGeneration
 
4
  import torch
5
+ from typing import Dict, List, Any
6
 
7
class EndpointHandler():
    """Inference endpoint handler that captions one or more images with BLIP.

    Expects a payload of the form::

        {"image_urls": "<url>" | ["<url>", ...],
         "texts": "<prompt>" | ["<prompt>", ...]}   # optional

    and returns a list of ``{"image_url": ..., "caption": ...}`` dicts
    (or a single-element ``[{"error": ...}]`` list on failure).
    """

    def __init__(self, path=""):
        # NOTE(review): `path` is ignored; weights are always pulled from the
        # Hub by model id — confirm this is intended for the deployment setup.
        self.processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
        self.model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")

        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)

    def process_single_image(self, img_url, text=None):
        """Download one image and return its generated caption.

        Args:
            img_url: HTTP(S) URL of the image to caption.
            text: Optional prompt for conditional captioning; ``None`` (or any
                falsy value) selects unconditional captioning.

        Returns:
            The decoded caption string.
        """
        # Timeout prevents a stalled download from hanging the endpoint.
        response = requests.get(img_url, stream=True, timeout=30)
        raw_image = Image.open(response.raw).convert('RGB')
        if text:
            # Conditional image captioning (text prompt steers the caption).
            inputs = self.processor(raw_image, text, return_tensors="pt").to(self.device)
        else:
            # Unconditional image captioning.
            inputs = self.processor(raw_image, return_tensors="pt").to(self.device)

        out = self.model.generate(**inputs)
        return self.processor.decode(out[0], skip_special_tokens=True)

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Caption every image in ``data`` and return one result dict each.

        Args:
            data: Request payload; must contain ``"image_urls"`` (str or
                list of str) and may contain ``"texts"`` (str or list of str).

        Returns:
            A list of ``{"image_url", "caption"}`` dicts, or
            ``[{"error": <message>}]`` if the request could not be processed.
        """
        try:
            img_urls = data.get("image_urls")
            if img_urls is None:
                # Explicit message instead of the opaque TypeError from
                # len(None) the original produced.
                return [{"error": "missing required key 'image_urls'"}]

            # BUGFIX: normalize a bare string to a one-element list BEFORE
            # building the default texts. The original computed
            # [None] * len(img_urls) on the raw string (one None per
            # character) and then wrapped that whole list as the single
            # image's conditioning text.
            if isinstance(img_urls, str):
                img_urls = [img_urls]

            texts = data.get("texts")  # Texts are optional for conditional captioning
            if texts is None:
                texts = [None] * len(img_urls)
            elif isinstance(texts, str):
                texts = [texts]
            # Pad missing prompts with None so a short `texts` list no longer
            # silently truncates the image list via zip().
            if len(texts) < len(img_urls):
                texts = texts + [None] * (len(img_urls) - len(texts))

            captions = []
            for img_url, text in zip(img_urls, texts):
                caption = self.process_single_image(img_url, text)
                captions.append({"image_url": img_url, "caption": caption})

            return captions
        except Exception as e:
            # Boundary handler: report the failure to the caller instead of
            # crashing the endpoint.
            print(f"Error processing data: {e}")
            return [{"error": str(e)}]
47
 
48
def get_pipeline(model_dir, task):
    """Factory hook for the inference toolkit.

    Builds and returns an :class:`EndpointHandler` for ``model_dir``;
    the ``task`` argument is accepted for interface compatibility but unused.
    """
    handler = EndpointHandler(model_dir)
    return handler