fredaddy committed
Commit cd5795f · verified · 1 Parent(s): b1cc8b6

Update handler.py

Files changed (1):
  1. handler.py +22 -34
handler.py CHANGED
@@ -1,45 +1,33 @@
-#Handler.py file needed
-
 from PIL import Image
 import torch
-from transformers import AutoProcessor, AutoModelForVision2Seq
+from transformers import AutoModel, AutoTokenizer
 
 class ModelHandler:
     def __init__(self):
-        self.model = None
-        self.processor = None
-
-    def initialize(self, model_dir):
-        # Load the processor and model
-        self.processor = AutoProcessor.from_pretrained(model_dir)
-        self.model = AutoModelForVision2Seq.from_pretrained(model_dir)
+        # Load the model and tokenizer with appropriate weights
+        self.model = AutoModel.from_pretrained(
+            'openbmb/MiniCPM-V-2_6',
+            trust_remote_code=True,
+            attn_implementation='sdpa',
+            torch_dtype=torch.bfloat16
+        ).eval().cuda()
+
+        self.tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-V-2_6', trust_remote_code=True)
 
     def preprocess(self, inputs):
-        # Process the input image
-        image = Image.open(inputs["image"].file)
-        pixel_values = self.processor(images=image, return_tensors="pt").pixel_values
-
-        # Process the text context (if provided)
-        text_context = inputs.get("text_context", "")
-        if text_context:
-            context_inputs = self.processor(text=text_context, return_tensors="pt").input_ids
-        else:
-            context_inputs = None
-
-        return pixel_values, context_inputs
+        # Preprocess image input
+        image = Image.open(inputs['image'].file).convert('RGB')
+        question = inputs.get("question", "What is in the image?")
+        msgs = [{'role': 'user', 'content': [image, question]}]
+        return msgs
 
-    def inference(self, pixel_values, context_inputs=None):
-        # Run inference on the image with or without text context
-        with torch.no_grad():
-            if context_inputs is not None:
-                outputs = self.model.generate(pixel_values, input_ids=context_inputs)
-            else:
-                outputs = self.model.generate(pixel_values)
-        return outputs
+    def inference(self, msgs):
+        # Run inference on the model
+        result = self.model.chat(image=None, msgs=msgs, tokenizer=self.tokenizer)
+        return result
 
-    def postprocess(self, outputs):
-        # Decode the output to text
-        decoded_text = self.processor.batch_decode(outputs, skip_special_tokens=True)
-        return {"digitized_text": decoded_text[0]}
+    def postprocess(self, result):
+        # Postprocess the output from the model
+        return {"generated_text": result}
 
 service = ModelHandler()
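
For reference, a minimal driver sketch showing how the updated handler's stages chain together. This is not part of the commit: the FileInput wrapper and run helper are hypothetical, mimicking the file-like inputs['image'].file object that preprocess expects, and it requires a CUDA GPU because __init__ calls .cuda().

# Hypothetical driver for the updated handler (sketch, not part of the commit).
from dataclasses import dataclass
from typing import BinaryIO

from handler import service  # module-level ModelHandler instance defined above

@dataclass
class FileInput:
    file: BinaryIO  # preprocess() reads inputs['image'].file

def run(image_path: str, question: str) -> dict:
    # Chain the handler stages: preprocess -> inference -> postprocess.
    with open(image_path, "rb") as f:
        msgs = service.preprocess({"image": FileInput(file=f), "question": question})
        result = service.inference(msgs)
    return service.postprocess(result)

# Example call:
# print(run("sample.jpg", "What is in the image?"))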