import torch
from PIL import Image
import requests
from io import BytesIO
import gradio as gr
import os
import sys
import time
import warnings

# Suppress warnings
warnings.filterwarnings("ignore")

print("Starting InternVL2 with Llama3-76B initialization...")
print(f"Python version: {sys.version}")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

# Set up environment for CUDA: cap allocator block size to reduce fragmentation
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"


# Check GPU availability
def check_gpu():
    if not torch.cuda.is_available():
        print("CUDA is not available. This application requires GPU acceleration.")
        return False
    try:
        # Test the GPU with a simple operation
        test_tensor = torch.rand(10, device="cuda")
        _ = test_tensor + test_tensor
        print(f"GPU is available: {torch.cuda.get_device_name(0)}")
        return True
    except Exception as e:
        print(f"Error initializing GPU: {str(e)}")
        return False


# Global flag for GPU availability
USE_GPU = check_gpu()

# Import InternVL dependencies
try:
    from transformers import AutoModel, CLIPImageProcessor
    HAS_TRANSFORMERS = True
    print("Successfully imported transformers")
except ImportError as e:
    print(f"Error importing transformers: {str(e)}")
    HAS_TRANSFORMERS = False

# Model handles (populated by load_models)
internvit_model = None
llama_model = None
processor = None


def load_models():
    global internvit_model, llama_model, processor
    if not USE_GPU:
        print("Cannot load models without GPU")
        return False
    if not HAS_TRANSFORMERS:
        print("Cannot load models without transformers")
        return False
    try:
        print("Loading InternViT-6B model for visual feature extraction...")
        # Following the model card instructions for InternViT-6B: the checkpoint
        # ships custom modeling code (trust_remote_code=True) and a CLIP-style
        # image processor; bfloat16 keeps the 6B encoder within a single GPU's memory.
        processor = CLIPImageProcessor.from_pretrained("OpenGVLab/InternViT-6B-224px")
        internvit_model = AutoModel.from_pretrained(
            "OpenGVLab/InternViT-6B-224px",
            torch_dtype=torch.bfloat16,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
        ).to("cuda").eval()
        print("InternViT-6B model loaded successfully!")

        # For demonstration purposes, we'll just extract visual features for now.
        # In a real implementation, we would load Llama3-76B here.
        print("Note: Llama3-76B model loading is commented out for this demonstration")
        # llama_model = ...
        return True
    except Exception as e:
        print(f"Error loading models: {str(e)}")
        return False


# Load models on startup
MODELS_LOADED = load_models()


def process_image(image_path, sample_url=None):
    """Process an image using InternViT-6B for feature extraction."""
    # Load the image: prefer an uploaded image, fall back to the sample URL
    if sample_url and image_path is None:
        response = requests.get(sample_url, timeout=30)
        response.raise_for_status()
        image = Image.open(BytesIO(response.content))
        print(f"Loaded sample image from URL: {sample_url}")
    elif isinstance(image_path, str):
        image = Image.open(image_path)
    else:
        image = image_path

    if image is None:
        return "No image provided"
    if not MODELS_LOADED:
        return "Models failed to load. Please check the logs."

    # CLIP-style processors expect 3-channel RGB input
    image = image.convert("RGB")
    try:
        # Start timing
        start_time = time.time()

        # Process the image through the visual encoder
        print("Processing image through InternViT-6B...")
        inputs = processor(images=image, return_tensors="pt")
        if USE_GPU:
            # Match the model's device and bfloat16 weights
            inputs = {k: v.to(device="cuda", dtype=torch.bfloat16) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = internvit_model(**inputs)

        # Extract image features
        image_features = outputs.last_hidden_state
        pooled_output = outputs.pooler_output

        # In a real implementation, we would pass these features to Llama3-76B
        # (see the hypothetical sketch at the end of this file).
        # For now, we just report on the extracted features.
        feature_info = f"""
Image successfully processed through InternViT-6B:
- Last hidden state shape: {image_features.shape}
- Pooled output shape: {pooled_output.shape}

In a complete implementation, these visual features would be passed to
Llama3-76B for generating text responses about the image.

Note: This is a demonstration of visual feature extraction only.
"""
        # Calculate elapsed time
        elapsed = time.time() - start_time
        return f"{feature_info}\n\nProcessing completed in {elapsed:.2f} seconds."
    except Exception as e:
        return f"Error processing image: {str(e)}"


# Set up the Gradio interface
def create_interface():
    with gr.Blocks(title="InternVL2 with Llama3-76B") as demo:
        gr.Markdown("# InternVL2 Visual Feature Extraction Demo")
        gr.Markdown("## Using InternViT-6B for visual feature extraction")

        # System status
        status = "✅ Ready" if MODELS_LOADED else "❌ Models failed to load"
        gr.Markdown(f"### System Status: {status}")

        with gr.Row():
            with gr.Column():
                input_image = gr.Image(type="pil", label="Upload Image")
                sample_btn = gr.Button("Use Sample Image")
            with gr.Column():
                output_text = gr.Textbox(label="Results", lines=10)

        # Process button
        process_btn = gr.Button("Extract Visual Features")
        process_btn.click(
            fn=process_image,
            inputs=[input_image],
            outputs=output_text,
        )

        # Sample image button logic
        sample_image_url = "https://huggingface.co/OpenGVLab/InternVL2/resolve/main/assets/demo.jpg"

        def use_sample():
            return process_image(None, sample_image_url)

        sample_btn.click(
            fn=use_sample,
            inputs=[],
            outputs=output_text,
        )

        # Explanation
        gr.Markdown("""
## About This Demo

This demonstration shows how to use InternViT-6B for visual feature extraction,
following the instructions from the OpenGVLab/InternVL GitHub repository.

The application extracts visual features from the input image that would
typically be passed to a language model such as Llama3-76B. In a complete
implementation, these features would be used to generate text responses
about the image.
""")
    return demo


# Entry point
if __name__ == "__main__":
    demo = create_interface()
    demo.launch(share=False, server_name="0.0.0.0")
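

# ---------------------------------------------------------------------------
# Appendix: a minimal sketch of how the extracted visual features could be
# bridged to a language model. This is NOT the InternVL2/Llama3-76B
# implementation; the projector shape and the LLM hidden size (8192, typical
# of a Llama3-70B-class model) are assumptions for illustration only. Many
# VLMs use an MLP of this kind to map visual tokens into the LLM's embedding
# space, where they are prepended to the text embeddings as a "soft prompt"
# before generation.
def sketch_project_visual_features(image_features):
    """Hypothetical projector: visual tokens -> an assumed LLM embedding space."""
    import torch.nn as nn  # local import; torch is already imported above

    vit_hidden = image_features.shape[-1]  # e.g. 3200 for InternViT-6B
    llm_hidden = 8192                      # assumed LLM hidden size
    projector = nn.Sequential(
        nn.Linear(vit_hidden, llm_hidden),
        nn.GELU(),
        nn.Linear(llm_hidden, llm_hidden),
    ).to(image_features.device, image_features.dtype)
    # Untrained weights: this only demonstrates the shape of the bridge, not
    # meaningful alignment between the vision encoder and an LLM.
    with torch.no_grad():
        return projector(image_features)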