davanstrien (HF staff) committed
Commit bf4ec84 · verified · 1 Parent(s): d9af75a

Update handler.py

Files changed (1)
  1. handler.py +39 -23
handler.py CHANGED
@@ -4,44 +4,60 @@ from PIL import Image
 import requests
 import torch
 
-
 class EndpointHandler:
     def __init__(self, path=""):
         self.processor = AutoProcessor.from_pretrained(
-            path, trust_remote_code=True, torch_dtype="auto", device_map="auto"
+            path,
+            trust_remote_code=True,
+            torch_dtype='auto',
+            device_map='auto'
         )
         self.model = AutoModelForCausalLM.from_pretrained(
-            path, trust_remote_code=True, torch_dtype="auto", device_map="auto"
+            path,
+            trust_remote_code=True,
+            torch_dtype='auto',
+            device_map='auto'
         )
 
     def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
         # Extract inputs from the request data
-        image_url = data.get("image_url")
-        text_prompt = data.get("text_prompt", "Describe this image.")
+        inputs = data.get("inputs", {})
+        image_url = inputs.get("image_url")
+        text_prompt = inputs.get("text_prompt", "Describe this image.")
+
+        if not image_url:
+            return [{"error": "No image_url provided in inputs"}]
 
         # Download and process the image
-        image = Image.open(requests.get(image_url, stream=True).raw)
-        if image.mode != "RGB":
-            image = image.convert("RGB")
+        try:
+            image = Image.open(requests.get(image_url, stream=True).raw)
+            if image.mode != "RGB":
+                image = image.convert("RGB")
+        except Exception as e:
+            return [{"error": f"Failed to load image: {str(e)}"}]
 
         # Process the image and text
-        inputs = self.processor.process(images=[image], text=text_prompt)
+        inputs = self.processor.process(
+            images=[image],
+            text=text_prompt
+        )
 
         # Move inputs to the correct device and make a batch of size 1
         inputs = {k: v.to(self.model.device).unsqueeze(0) for k, v in inputs.items()}
 
         # Generate output
-        with torch.autocast(device_type="cuda", enabled=True, dtype=torch.bfloat16):
-            output = self.model.generate_from_batch(
-                inputs,
-                GenerationConfig(max_new_tokens=200, stop_strings="<|endoftext|>"),
-                tokenizer=self.processor.tokenizer,
-            )
-
-        # Decode the generated tokens
-        generated_tokens = output[0, inputs["input_ids"].size(1):]
-        generated_text = self.processor.tokenizer.decode(
-            generated_tokens, skip_special_tokens=True
-        )
-
-        return [{"generated_text": generated_text}]
+        try:
+            with torch.autocast(device_type="cuda", enabled=True, dtype=torch.bfloat16):
+                output = self.model.generate_from_batch(
+                    inputs,
+                    GenerationConfig(max_new_tokens=200, stop_strings="<|endoftext|>"),
+                    tokenizer=self.processor.tokenizer
+                )
+
+            # Decode the generated tokens
+            generated_tokens = output[0, inputs['input_ids'].size(1):]
+            generated_text = self.processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
+
+            return [{"generated_text": generated_text}]
+        except Exception as e:
+            return [{"error": f"Error during generation: {str(e)}"}]