# Spaces: mknolan / cursor_slides_internvl2 (Paused)
# File size: 5,596 Bytes
# e59dc66

import gradio as gr
from PIL import Image
import os
import time
import numpy as np
import torch
import math

# Import lmdeploy for InternVL2 model
from lmdeploy import pipeline, TurbomindEngineConfig
from lmdeploy.vl import load_image

# Set environment variables
# Configures PyTorch's CUDA caching allocator through its environment variable.
# NOTE(review): presumably intended to reduce fragmentation with a 128 MB split
# limit — confirm it still takes effect given that torch is imported above.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

# Model configuration
# Model identifier handed to lmdeploy's pipeline() in load_internvl2_model().
MODEL_ID = "OpenGVLab/InternVL2-40B-AWQ"  # 4-bit quantized model
# Whether torch reports CUDA as available; analyze_image() checks this before
# calling torch.cuda.empty_cache() on its error path.
USE_GPU = torch.cuda.is_available()

# Global variables for model
# Holds the lmdeploy pipeline once load_internvl2_model() has created it;
# None means the model has not been loaded yet.
internvl2_pipeline = None

def load_internvl2_model():
    """Make sure the shared InternVL2 pipeline exists, building it on first use.

    The pipeline is stored in the module-level ``internvl2_pipeline`` so that
    later calls can reuse it without reloading.

    Returns:
        True if the pipeline is available (either it was already loaded or it
        was created by this call), False if creating it raised an exception.
    """
    global internvl2_pipeline

    # Only build the pipeline when no earlier call has done so.
    if internvl2_pipeline is None:
        print("Loading InternVL2 model...")
        try:
            # The engine has to be told the weights are in AWQ format.
            engine_cfg = TurbomindEngineConfig(model_format='awq')
            internvl2_pipeline = pipeline(
                MODEL_ID,
                backend_config=engine_cfg,
                log_level='INFO',
            )
            print("InternVL2 model loaded successfully!")
        except Exception as load_error:
            reason = str(load_error)
            print(f"Error loading InternVL2 model: {reason}")
            # Give a clearer hint for the out-of-memory failure.
            if "CUDA out of memory" in reason:
                print("Not enough GPU memory for the model")
            return False

    return True

def analyze_image(image, prompt):
    """Run the InternVL2 pipeline on one image and return the model's answer.

    Args:
        image: The picture to analyze: a numpy array, or an object that
            provides ``convert('RGB')`` (such as a PIL image). ``None`` is
            rejected up front, before the model is loaded.
        prompt: Instruction text sent to the model together with the image.

    Returns:
        The response text on success. When no image is given, the model
        cannot be loaded, or any step raises, a human-readable message is
        returned instead; exceptions are not propagated to the caller.
    """
    # Reject a missing image before paying for a model load; the message
    # matches the one process_image() shows for the same situation.
    if image is None:
        return "Please upload an image."

    try:
        # Make sure the model is loaded
        if not load_internvl2_model():
            return "Couldn't load InternVL2 model."

        # Normalize the input to an RGB PIL image.
        if isinstance(image, np.ndarray):
            image_pil = Image.fromarray(image).convert('RGB')
        else:
            image_pil = image.convert('RGB')

        # The pipeline is called with a single (prompt, image) pair.
        response = internvl2_pipeline((prompt, image_pil))
        return response.text

    except Exception as e:
        print(f"Error in image analysis: {str(e)}")
        # Try to clean up memory in case of error
        if USE_GPU:
            torch.cuda.empty_cache()
        return f"Error in image analysis: {str(e)}"

def process_image(image, analysis_type="general"):
    """Pick the prompt for the requested analysis type and run the model.

    Args:
        image: The uploaded image, or None when nothing was uploaded.
        analysis_type: "general", "text", "chart", "people" or "technical".
            Any other value falls back to the general description prompt.

    Returns:
        The analysis text followed by a line reporting how long the analysis
        took, or a request to upload an image when ``image`` is None.
    """
    if image is None:
        return "Please upload an image."

    fallback_prompt = "Describe this image in detail."
    # Ordered (type, prompt) pairs; the first matching type wins.
    prompt_table = (
        ("general", fallback_prompt),
        ("text", "What text can you see in this image? Please transcribe it accurately."),
        ("chart", "Analyze any charts, graphs or diagrams in this image in detail, including trends, data points, and conclusions."),
        ("people", "Describe the people in this image - their appearance, actions, and expressions."),
        ("technical", "Provide a technical analysis of this image, including object identification, spatial relationships, and any technical elements present."),
    )
    prompt = next(
        (text for kind, text in prompt_table if analysis_type == kind),
        fallback_prompt,
    )

    # Time only the model call, not the prompt selection above.
    began = time.time()
    analysis = analyze_image(image, prompt)
    return f"{analysis}\n\nAnalysis completed in {time.time() - began:.2f} seconds."

# Define the Gradio interface
def create_interface():
    """Build the Gradio Blocks UI for the image-analysis demo.

    The layout is a row with two columns: the image upload, analysis-type
    selector and submit button on the left, and the result textbox on the
    right. Below it is a legend of the analysis types and, when the sample
    image is present on disk, a set of cached examples.

    Returns:
        The constructed ``gr.Blocks`` app. It is not launched here; the
        caller does that.
    """
    # Single sample image shared by every example row.
    example_image = "data_temp/page_2.png"

    with gr.Blocks(title="Image Analysis with InternVL2") as demo:
        gr.Markdown("# Image Analysis with InternVL2-40B")
        gr.Markdown("Upload an image to analyze it using the InternVL2-40B model.")
        
        with gr.Row():
            with gr.Column(scale=1):
                input_image = gr.Image(type="pil", label="Upload Image")
                analysis_type = gr.Radio(
                    ["general", "text", "chart", "people", "technical"],
                    label="Analysis Type",
                    value="general"
                )
                submit_btn = gr.Button("Analyze Image")
            
            with gr.Column(scale=2):
                output_text = gr.Textbox(label="Analysis Result", lines=20)
        
        # Clicking the button sends the image and selected type to
        # process_image and shows its returned text in the result box.
        submit_btn.click(
            fn=process_image,
            inputs=[input_image, analysis_type],
            outputs=output_text
        )
        
        gr.Markdown("""
        ## Analysis Types
        - **General**: General description of the image
        - **Text**: Focus on identifying and transcribing text in the image
        - **Chart**: Detailed analysis of charts, graphs, and diagrams
        - **People**: Description of people, their appearance and actions
        - **Technical**: Technical analysis identifying objects and spatial relationships
        """)
        
        # Examples
        # Register the examples only when the sample image actually exists,
        # so a checkout without data_temp/ can still build the interface.
        # NOTE(review): with cache_examples=True Gradio pre-computes outputs by
        # running process_image on each example — confirm that cost is wanted.
        if os.path.isfile(example_image):
            gr.Examples(
                examples=[
                    [example_image, "general"],
                    [example_image, "text"],
                    [example_image, "chart"]
                ],
                inputs=[input_image, analysis_type],
                outputs=output_text,
                fn=process_image,
                cache_examples=True,
            )
        else:
            print(f"Example image not found at {example_image}; skipping examples.")
    
    return demo

# Main function
# Runs only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    # Create the Gradio interface
    # (create_interface() builds the Blocks app but does not start it.)
    demo = create_interface()
    
    # Launch the interface
    # share=False is passed explicitly, i.e. no public share link is requested.
    demo.launch(share=False)