Spaces:

xzerus
/

internvl2.5

Running

App Files Community

xzerus committed on Dec 21, 2024

Commit

1258570

verified ·

1 Parent(s): c5e37aa

Update app.py

Browse files

Files changed (1) hide show

app.py +6 -5

app.py CHANGED Viewed

@@ -9,7 +9,7 @@ import logging
 logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
 # Device Configuration
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 # ImageNet normalization values
 IMAGENET_MEAN = (0.485, 0.456, 0.406)
@@ -28,7 +28,7 @@ def build_transform(input_size):
 def preprocess_image(image, input_size=448):
     """Preprocess the image to the required format."""
     transform = build_transform(input_size)
-    tensor_image = transform(image).unsqueeze(0).to(torch.float32 if device == "cpu" else torch.bfloat16).to(device)
     return tensor_image
 # Load the model and tokenizer
@@ -36,7 +36,7 @@ logging.info("Loading model from Hugging Face Hub...")
 model_path = "OpenGVLab/InternVL2_5-1B"
 model = AutoModel.from_pretrained(
     model_path,
-    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
     trust_remote_code=True,
 ).to(device).eval()
@@ -55,13 +55,14 @@ def describe_image(image):
         pixel_values = preprocess_image(image, input_size=448)
         prompt = "<image>\nExtract text from the image, respond with only the extracted text."
         response = model.chat(
             tokenizer=tokenizer,
             pixel_values=pixel_values,
             question=prompt,
             history=None,
             return_history=False,
-            generation_config=dict(max_new_tokens=512, do_sample=True)
         )
         return response
     except Exception as e:
@@ -78,4 +79,4 @@ interface = gr.Interface(
 )
 if __name__ == "__main__":
-    interface.launch(server_name="0.0.0.0", server_port=7860)

 logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
 # Device Configuration
+device = torch.device("cpu")  # Force CPU usage
 # ImageNet normalization values
 IMAGENET_MEAN = (0.485, 0.456, 0.406)
 def preprocess_image(image, input_size=448):
     """Preprocess the image to the required format."""
     transform = build_transform(input_size)
+    tensor_image = transform(image).unsqueeze(0).to(torch.float32).to(device)  # Use float32 for CPU
     return tensor_image
 # Load the model and tokenizer
 model_path = "OpenGVLab/InternVL2_5-1B"
 model = AutoModel.from_pretrained(
     model_path,
+    torch_dtype=torch.float32,  # Use float32 for CPU compatibility
     trust_remote_code=True,
 ).to(device).eval()
         pixel_values = preprocess_image(image, input_size=448)
         prompt = "<image>\nExtract text from the image, respond with only the extracted text."
+        # Perform inference
         response = model.chat(
             tokenizer=tokenizer,
             pixel_values=pixel_values,
             question=prompt,
             history=None,
             return_history=False,
+            generation_config=dict(max_new_tokens=512, do_sample=True),
         )
         return response
     except Exception as e:
 )
 if __name__ == "__main__":
+    interface.launch(server_name="0.0.0.0", server_port=7860, share=True)