xzerus committed
Commit 2b71a80 · verified · 1 Parent(s): 9e7777f

Update app.py

Files changed (1)
  1. app.py +34 -69
app.py CHANGED
@@ -1,81 +1,46 @@
  import torch
- import torchvision.transforms as T
  from PIL import Image
- from transformers import AutoModel, AutoTokenizer
+ from transformers import AutoModel, CLIPImageProcessor
  import gradio as gr
- import logging
-
- # Setup logging
- logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
-
- # Device Configuration
- device = torch.device("cpu")  # Force CPU usage
-
- # ImageNet normalization values
- IMAGENET_MEAN = (0.485, 0.456, 0.406)
- IMAGENET_STD = (0.229, 0.224, 0.225)
-
- def build_transform(input_size):
-     """Build preprocessing pipeline for images."""
-     transform = T.Compose([
-         T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
-         T.Resize((input_size, input_size), interpolation=T.InterpolationMode.BICUBIC),
-         T.ToTensor(),
-         T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
-     ])
-     return transform
-
- def preprocess_image(image, input_size=448):
-     """Preprocess the image to the required format."""
-     transform = build_transform(input_size)
-     tensor_image = transform(image).unsqueeze(0).to(torch.float32)  # Use float32 for CPU
-     return tensor_image
-
- # Load the model and tokenizer
- logging.info("Loading model from Hugging Face Hub...")
- model_path = "OpenGVLab/InternVL2_5-1B"
+
+ # Load the model
  model = AutoModel.from_pretrained(
-     model_path,
-     trust_remote_code=True,
- ).to(device).eval()
+     'OpenGVLab/InternViT-6B-448px-V1-5',
+     torch_dtype=torch.bfloat16,
+     low_cpu_mem_usage=True,
+     trust_remote_code=True
+ ).cuda().eval()

- tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=False)
+ # Load the image processor
+ image_processor = CLIPImageProcessor.from_pretrained('OpenGVLab/InternViT-6B-448px-V1-5')

- # Add the `<image>` token if missing
- if "<image>" not in tokenizer.get_vocab():
-     tokenizer.add_tokens(["<image>"])
-     model.resize_token_embeddings(len(tokenizer))  # Resize model embeddings
-
- assert "<image>" in tokenizer.get_vocab(), "Error: `<image>` token is missing from tokenizer vocabulary."
-
- def describe_image(image):
-     """Generate a description for the uploaded image."""
+ # Define the function to process the image and generate outputs
+ def process_image(image):
      try:
-         pixel_values = preprocess_image(image, input_size=448)
-         prompt = "<image>\nExtract text from the image, respond with only the extracted text."
-
-         # Perform inference
-         response = model.chat(
-             tokenizer=tokenizer,
-             pixel_values=pixel_values,
-             question=prompt,
-             history=None,
-             return_history=False,
-             generation_config=dict(max_new_tokens=512, do_sample=True),
-         )
-         return response
+         # Convert uploaded image to RGB
+         image = image.convert('RGB')
+
+         # Preprocess the image
+         pixel_values = image_processor(images=image, return_tensors='pt').pixel_values
+         pixel_values = pixel_values.to(torch.bfloat16).cuda()
+
+         # Run the model
+         outputs = model(pixel_values)
+
+         # Assuming the model returns embeddings or features
+         return f"Output Shape: {outputs.last_hidden_state.shape}"
      except Exception as e:
-         logging.error(f"Error during processing: {e}")
-         return f"Error: {e}"
-
- # Gradio Interface
- interface = gr.Interface(
-     fn=describe_image,
-     inputs=gr.Image(type="pil"),
-     outputs=gr.Textbox(label="Extracted Text", lines=10, interactive=False),
-     title="Image to Text",
-     description="Upload an image to extract text using the pretrained model.",
+         return f"Error: {str(e)}"
+
+ # Create the Gradio interface
+ demo = gr.Interface(
+     fn=process_image,  # Function to process the input
+     inputs=gr.Image(type="pil"),  # Accepts images as input
+     outputs=gr.Textbox(label="Model Output"),  # Displays model output
+     title="InternViT Demo",
+     description="Upload an image to process it using the InternViT model from OpenGVLab."
  )

+ # Launch the demo
  if __name__ == "__main__":
-     interface.launch(server_name="0.0.0.0", server_port=7860)
+     demo.launch(server_name="0.0.0.0", server_port=7860)
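
A note on the updated script: it calls .cuda() unconditionally, so it will raise at startup on CPU-only hardware, whereas the version it replaces forced CPU. Below is a minimal, hypothetical sketch of a device-agnostic variant of the same load-and-infer path; the CPU/float32 fallback and the "example.jpg" path are illustrative assumptions, not part of this commit.

    import torch
    from PIL import Image
    from transformers import AutoModel, CLIPImageProcessor

    # Assumption: fall back to CPU and float32 when no GPU is available
    # (the committed code requires CUDA and bfloat16).
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    dtype = torch.bfloat16 if device.type == "cuda" else torch.float32

    model = AutoModel.from_pretrained(
        "OpenGVLab/InternViT-6B-448px-V1-5",
        torch_dtype=dtype,
        low_cpu_mem_usage=True,
        trust_remote_code=True,
    ).to(device).eval()

    image_processor = CLIPImageProcessor.from_pretrained("OpenGVLab/InternViT-6B-448px-V1-5")

    # Smoke test with a local file; "example.jpg" is an illustrative placeholder.
    image = Image.open("example.jpg").convert("RGB")
    pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(dtype).to(device)

    with torch.no_grad():
        outputs = model(pixel_values)

    # InternViT is a vision encoder, so the output is a feature map rather than
    # text; the committed app reports last_hidden_state's shape the same way.
    print(outputs.last_hidden_state.shape)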