Update handler.py
handler.py CHANGED (+14 -4)
@@ -3,23 +3,29 @@ from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
 from PIL import Image
 import requests
 import torch
+import gc
 
 class EndpointHandler:
     def __init__(self, path=""):
         self.processor = AutoProcessor.from_pretrained(
             path,
             trust_remote_code=True,
-            torch_dtype=
+            torch_dtype=torch.bfloat16,
             device_map='auto'
         )
         self.model = AutoModelForCausalLM.from_pretrained(
             path,
             trust_remote_code=True,
-            torch_dtype=
-            device_map='auto'
+            torch_dtype=torch.bfloat16,
+            device_map='auto',
+            low_cpu_mem_usage=True
         )
 
     def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
+        # Clear CUDA cache
+        torch.cuda.empty_cache()
+        gc.collect()
+
         # Extract inputs from the request data
         inputs = data.get("inputs", {})
         image_url = inputs.get("image_url")
@@ -38,7 +44,7 @@ class EndpointHandler:
 
         # Process the image and text
         try:
-            with torch.cuda.amp.autocast(enabled=True):
+            with torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16):
                 inputs = self.processor.process(
                     images=[image],
                     text=text_prompt
@@ -58,6 +64,10 @@ class EndpointHandler:
             generated_tokens = output[0, inputs['input_ids'].size(1):]
             generated_text = self.processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
 
+            # Clear CUDA cache again
+            torch.cuda.empty_cache()
+            gc.collect()
+
             return [{"generated_text": generated_text}]
         except Exception as e:
             return [{"error": f"Error during generation: {str(e)}"}]
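Note: on recent PyTorch releases (2.4 and later), torch.cuda.amp.autocast is deprecated in favor of the device-generic torch.amp API. If the endpoint image ships a new enough PyTorch, the equivalent call would be a one-line swap; a sketch, not part of this commit:

    # Device-generic equivalent of torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16)
    # on PyTorch >= 2.4, where the torch.cuda.amp alias is deprecated:
    with torch.amp.autocast("cuda", enabled=True, dtype=torch.bfloat16):
        ...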
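For a quick local sanity check of the updated handler, a minimal driver might look like the sketch below. The model path and image URL are placeholders, and the "text" key for the prompt is an assumption -- only "image_url" is visible in this diff:

    # Hypothetical smoke test for EndpointHandler; the path, URL, and
    # "text" prompt key are assumptions, not taken from this commit.
    from handler import EndpointHandler

    handler = EndpointHandler(path="./model")  # placeholder model path
    response = handler({
        "inputs": {
            "image_url": "https://example.com/cat.jpg",  # placeholder image
            "text": "Describe this image.",              # hypothetical key
        }
    })
    print(response)  # [{"generated_text": "..."}] on success, [{"error": "..."}] on failure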