Spaces:

mknolan
/

internvl2-fixed

Paused

File size: 6,690 Bytes

284b864

import torch
from PIL import Image
import requests
from io import BytesIO
import gradio as gr
import os
import sys
import time
import warnings

# Silence all Python warnings raised from this point on, process-wide.
warnings.filterwarnings("ignore")

# Startup banner: log the interpreter and PyTorch versions and whether
# PyTorch can see a CUDA device.
print("Starting InternVL2 with Llama3-76B initialization...")
print(f"Python version: {sys.version}")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

# Configure the PyTorch CUDA caching allocator through its environment variable.
# NOTE(review): this is set after `import torch` and after the
# torch.cuda.is_available() call above -- confirm the allocator still reads
# the value at this point, or move the assignment above the torch import.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

# Check GPU availability
def check_gpu():
    """Report whether a working CUDA device can be used.

    Returns:
        bool: True when PyTorch reports CUDA as available and a small tensor
        addition on the device succeeds; False otherwise. The outcome is
        also printed.
    """
    cuda_ready = torch.cuda.is_available()
    if cuda_ready:
        try:
            # Smoke-test the device with a tiny allocation and one addition
            probe = torch.rand(10, device="cuda")
            _ = probe + probe
            device_name = torch.cuda.get_device_name(0)
            print(f"GPU is available: {device_name}")
        except Exception as gpu_error:
            print(f"Error initializing GPU: {str(gpu_error)}")
            return False
        return True

    print("CUDA is not available. This application requires GPU acceleration.")
    return False

# Global flag for GPU availability, evaluated once at import time;
# load_models() and process_image() consult it.
USE_GPU = check_gpu()

# Optional dependency: transformers supplies the Auto* loader classes
try:
    from transformers import AutoModel, AutoProcessor
except ImportError as import_error:
    print(f"Error importing transformers: {str(import_error)}")
    HAS_TRANSFORMERS = False
else:
    HAS_TRANSFORMERS = True
    print("Successfully imported transformers")

# Model handles; each stays None until load_models() assigns it
internvit_model, llama_model, processor = None, None, None

def load_models():
    """Load the InternViT-6B vision encoder and its image processor.

    On success the module-level ``processor`` and ``internvit_model`` globals
    are populated together; on failure neither is modified. ``llama_model`` is
    left untouched because the language model is not loaded in this
    demonstration.

    Returns:
        bool: True when both the processor and the model were loaded and the
        model was moved to CUDA; False when no GPU is usable, ``transformers``
        could not be imported, or loading raised an exception.
    """
    global internvit_model, llama_model, processor

    if not USE_GPU:
        print("Cannot load models without GPU")
        return False

    if not HAS_TRANSFORMERS:
        # Give a clear reason instead of a NameError on AutoProcessor below
        print("Cannot load models because transformers could not be imported")
        return False

    model_id = "OpenGVLab/InternViT-6B-224px"

    try:
        print("Loading InternViT-6B model for visual feature extraction...")

        # The checkpoint ships its own modeling code on the Hub, so
        # transformers must be explicitly allowed to execute it
        # (trust_remote_code=True, as in the model card's usage example).
        loaded_processor = AutoProcessor.from_pretrained(
            model_id, trust_remote_code=True
        )
        loaded_model = AutoModel.from_pretrained(
            model_id, trust_remote_code=True
        )

        # USE_GPU is known to be True here (see the guard above)
        loaded_model = loaded_model.to("cuda")

        # Publish both handles only once everything succeeded, so a failed
        # load never leaves a processor without its model.
        processor = loaded_processor
        internvit_model = loaded_model

        print("InternViT-6B model loaded successfully!")

        # For demonstration purposes, only visual features are extracted.
        # In a real implementation, Llama3-76B would be loaded here.
        print("Note: Llama3-76B model loading is commented out for this demonstration")
        # llama_model = ...

        return True
    except Exception as e:
        print(f"Error loading models: {str(e)}")
        return False

# Load models once at import time; the result gates process_image() and
# drives the status line shown in the UI.
MODELS_LOADED = load_models()

def _load_image(image_path, sample_url=None, timeout=30):
    """Return the image to process, or None when nothing was supplied.

    The sample URL is fetched only when no uploaded image is present. A string
    ``image_path`` is opened as a file; any other value is returned unchanged
    (assumed to already be an image object).

    Raises:
        requests.RequestException: the download failed, timed out, or
            returned an HTTP error status.
        OSError: the file or downloaded bytes could not be opened as an image.
    """
    if sample_url and not image_path:
        # Bound the wait and reject HTTP error pages before decoding them
        response = requests.get(sample_url, timeout=timeout)
        response.raise_for_status()
        image = Image.open(BytesIO(response.content))
        print(f"Loaded sample image from URL: {sample_url}")
        return image

    if isinstance(image_path, str):
        return Image.open(image_path)

    return image_path


def process_image(image_path, sample_url=None):
    """Process an image using InternViT-6B for feature extraction.

    Args:
        image_path: An uploaded image object, a filesystem path string, or
            None.
        sample_url: Optional URL to download an image from; used only when
            ``image_path`` is empty.

    Returns:
        str: A report with the shapes of the extracted features and the
        elapsed time, or a human-readable error message. Errors are returned
        as text rather than raised.
    """
    try:
        image = _load_image(image_path, sample_url)
    except (requests.RequestException, OSError) as e:
        return f"Error loading image: {str(e)}"

    if image is None:
        return "No image provided"

    if not MODELS_LOADED:
        return "Models failed to load. Please check the logs."

    try:
        # Start timing
        start_time = time.time()

        # Process image through the visual encoder
        print("Processing image through InternViT-6B...")
        # Normalise palette/RGBA/greyscale inputs to three channels before
        # preprocessing
        image = image.convert("RGB")
        inputs = processor(images=image, return_tensors="pt")
        if USE_GPU:
            inputs = {k: v.to("cuda") for k, v in inputs.items()}

        with torch.no_grad():
            outputs = internvit_model(**inputs)

        # Extract image features
        image_features = outputs.last_hidden_state
        pooled_output = outputs.pooler_output

        # In a real implementation, these features would be passed to
        # Llama3-76B. For now, only information about them is returned.
        feature_info = f"""
        Image successfully processed through InternViT-6B:
        - Last hidden state shape: {image_features.shape}
        - Pooled output shape: {pooled_output.shape}
        
        In a complete implementation, these visual features would be passed to Llama3-76B
        for generating text responses about the image.
        
        Note: This is a demonstration of visual feature extraction only.
        """

        # Calculate elapsed time
        elapsed = time.time() - start_time

        return f"{feature_info}\n\nProcessing completed in {elapsed:.2f} seconds."

    except Exception as e:
        return f"Error processing image: {str(e)}"

# Set up Gradio interface
def create_interface():
    """Assemble the Gradio Blocks UI for the feature-extraction demo.

    The layout has an image upload and a sample-image button in one column,
    a results textbox in the other, and an extract button below; both
    buttons write the text returned by ``process_image`` to the textbox.

    Returns:
        The constructed, not yet launched, ``gr.Blocks`` app.
    """
    # Status line reflects whether the startup model load succeeded
    if MODELS_LOADED:
        status = "✅ Ready"
    else:
        status = "❌ Models failed to load"

    # Remote image that the sample button runs through the same pipeline
    sample_image_url = "https://huggingface.co/OpenGVLab/InternVL2/resolve/main/assets/demo.jpg"

    def use_sample():
        return process_image(None, sample_image_url)

    about_text = """
        ## About This Demo
        
        This demonstration shows how to use InternViT-6B for visual feature extraction, 
        following the instructions from the OpenGVLab/InternVL GitHub repository.
        
        The application extracts visual features from the input image that would typically
        be passed to a language model like Llama3-76B. In a complete implementation,
        these features would be used to generate text responses about the image.
        """

    with gr.Blocks(title="InternVL2 with Llama3-76B") as app:
        # Headings and system status
        gr.Markdown("# InternVL2 Visual Feature Extraction Demo")
        gr.Markdown("## Using InternViT-6B for visual feature extraction")
        gr.Markdown(f"### System Status: {status}")

        # Two-column layout: inputs on the left, results on the right
        with gr.Row():
            with gr.Column():
                image_input = gr.Image(type="pil", label="Upload Image")
                sample_button = gr.Button("Use Sample Image")

            with gr.Column():
                results_box = gr.Textbox(label="Results", lines=10)

        extract_button = gr.Button("Extract Visual Features")

        # Event wiring: uploaded image first, then the sample image
        extract_button.click(fn=process_image, inputs=[image_input], outputs=results_box)
        sample_button.click(fn=use_sample, inputs=[], outputs=results_box)

        # Explanatory footer
        gr.Markdown(about_text)

    return app

# Script entry point: build the UI and serve it when run directly.
if __name__ == "__main__":
    demo = create_interface()
    # share=False keeps Gradio from creating a public share link;
    # server_name="0.0.0.0" listens on all network interfaces.
    demo.launch(share=False, server_name="0.0.0.0")