Visual Document Retrieval · Transformers · Safetensors · ColPali · English · pretraining
adrish committed · Commit 9931aed · Parent: 0d89ced

updated the code

Files changed (1)
  1. handler.py +44 -27
handler.py CHANGED
@@ -9,74 +9,91 @@ from typing import Dict, Any, List
 class EndpointHandler:
     def __init__(self, model_path: str = None):
         """
-        Initialize the endpoint handler using the ColPali retrieval model.
+        Initialize the endpoint handler using the ColPali model for OCR extraction.
         If no model path is provided, it defaults to 'vidore/colpali-v1.3-hf'.
         """
         if model_path is None:
             model_path = os.path.dirname(os.path.realpath(__file__))
         try:
+            # Use GPU if available, otherwise CPU.
             self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-            # Use the specialized ColPaliForRetrieval class.
+            # Load the specialized ColPali model (designed for retrieval but repurposed here for OCR generation).
             self.model = ColPaliForRetrieval.from_pretrained(
                 model_path,
                 device_map="cuda" if torch.cuda.is_available() else "cpu",
                 trust_remote_code=True,
                 torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
             ).to(self.device)
-            # Use the specialized ColPaliProcessor.
+            # Load the processor that handles image preprocessing.
             self.processor = ColPaliProcessor.from_pretrained(model_path, trust_remote_code=True)
         except Exception as e:
             raise RuntimeError(f"Error loading model or processor: {e}")
 
     def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
         """
-        Process the input data, run inference using the ColPali retrieval model,
-        and return the outputs.
+        Process the input data for OCR extraction.
 
         Expects a dictionary with an "inputs" key containing a list of dictionaries.
-        Each dictionary should have:
-          - "image": a base64-encoded image string.
-          - "prompt": (optional) a text prompt (default is used if missing).
+        Each dictionary must have an "image" key with a base64-encoded image string.
+        For OCR extraction, no text prompt is provided.
         """
         try:
             inputs_list = data.get("inputs", [])
-            config = data.get("config", {})
-
             if not inputs_list or not isinstance(inputs_list, list):
-                return {"error": "Inputs should be a list of dictionaries with 'image' and optionally 'prompt' keys."}
+                return {"error": "Inputs should be a list of dictionaries with an 'image' key."}
 
             images: List[Image.Image] = []
-            texts: List[str] = []
-
             for item in inputs_list:
                 image_b64 = item.get("image")
                 if not image_b64:
                     return {"error": "One of the input items is missing 'image' data."}
                 try:
-                    # Decode the base64-encoded image and convert to RGB.
+                    # Decode the base64 string and convert to an RGB PIL image.
                     image = Image.open(io.BytesIO(base64.b64decode(image_b64))).convert("RGB")
                     images.append(image)
                 except Exception as e:
                     return {"error": f"Failed to decode one of the images: {e}"}
-                # Use the provided prompt or a default prompt.
-                prompt = item.get("prompt", "Describe the image content in detail.")
-                texts.append(prompt)
 
-            # Prepare inputs with the ColPali processor.
+            # Process only images with the processor (to avoid the text+image conflict).
             model_inputs = self.processor(
                 images=images,
-                text=texts,
-                padding=True,
                 return_tensors="pt",
+                padding=True,
             ).to(self.device)
 
-            # For retrieval, we call the model directly rather than using generate().
-            outputs = self.model(**model_inputs)
-            # Assuming that the model returns logits or retrieval scores,
-            # we extract and convert them to lists.
-            retrieval_scores = outputs.logits.tolist() if hasattr(outputs, "logits") else outputs
+            # Manually create a dummy text prompt by inserting a beginning-of-sequence token.
+            # This is necessary to trigger text generation even though no prompt is provided.
+            bos_token_id = (
+                self.processor.tokenizer.bos_token_id
+                or self.processor.tokenizer.cls_token_id
+                or self.processor.tokenizer.pad_token_id
+            )
+            if bos_token_id is None:
+                raise RuntimeError("No BOS token found in the tokenizer.")
+            batch_size = model_inputs["pixel_values"].shape[0]
+            dummy_input_ids = torch.full((batch_size, 1), bos_token_id, dtype=torch.long).to(self.device)
+            model_inputs["input_ids"] = dummy_input_ids
+
+            # Generation parameters (can be overridden via the "config" field).
+            config = data.get("config", {})
+            max_new_tokens = config.get("max_new_tokens", 256)
+            temperature = config.get("temperature", 0.8)
+            num_return_sequences = config.get("num_return_sequences", 1)
+            do_sample = bool(config.get("do_sample", True))
+
+            # Call generate on the model using the image-only inputs augmented with the dummy text.
+            outputs = self.model.generate(
+                **model_inputs,
+                max_new_tokens=max_new_tokens,
+                temperature=temperature,
+                num_return_sequences=num_return_sequences,
+                do_sample=do_sample,
+            )
+
+            # Decode generated tokens into text using the processor's tokenizer.
+            text_output = self.processor.tokenizer.batch_decode(outputs, skip_special_tokens=True)
 
-            return {"responses": retrieval_scores}
+            return {"responses": text_output}
 
         except Exception as e:
             return {"error": f"Unexpected error: {e}"}
@@ -87,7 +104,7 @@ _service = EndpointHandler()
 def handle(data, context):
     """
     Entry point for the Hugging Face dedicated inference endpoint.
-    Processes the input data and returns the model's outputs.
+    Processes input data and returns the extracted OCR text.
     """
     try:
         if data is None:
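Below is a minimal sketch of how a client could exercise the updated handler locally, for example before deploying this commit to an endpoint. The image path, the payload values, and the greedy-decoding override are illustrative assumptions, not part of the commit; only the "inputs"/"config" request schema comes from handler.py above.

example_client.py (illustrative):

import base64

from handler import EndpointHandler

# Encode a document page as base64, matching the "image" field the handler expects.
# "page.png" is a placeholder path for this sketch.
with open("page.png", "rb") as f:
    image_b64 = base64.b64encode(f.read()).decode("utf-8")

# With no model_path argument, the handler loads the model files from its own directory.
handler = EndpointHandler()

payload = {
    "inputs": [{"image": image_b64}],  # one dict per image; no prompt needed after this commit
    "config": {  # optional overrides read by __call__
        "max_new_tokens": 128,
        "do_sample": False,  # greedy decoding instead of the handler's sampled default
    },
}

result = handler(payload)
print(result.get("responses") or result.get("error"))

Overriding do_sample to False is usually a safer choice for OCR-style extraction than the handler's sampled default, since sampled generations can drift from the page text.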