Spaces:

mknolan
/

cursor_slides_internvl2

Paused

File size: 6,262 Bytes

e59dc66

import os
import torch
import io
from PIL import Image
import matplotlib.pyplot as plt
from transformers import AutoModelForCausalLM, AutoTokenizer, CLIPImageProcessor
import logging
import time

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Global variables to store the model and processors
model = None
processor = None
tokenizer = None

def describe_image(image_path=None, image_data=None, show_image=False):
    """
    Load an image and generate a description using Qwen2-VL-7B model.
    
    Args:
        image_path (str, optional): Path to the image file
        image_data (bytes, optional): Raw image data
        show_image (bool): Whether to display the image
        
    Returns:
        dict: Descriptions of the image
    """
    global model, processor, tokenizer
    
    # Initialize model if not already loaded
    if model is None or processor is None or tokenizer is None:
        load_model()
    
    # Check if we have valid input
    if image_path is None and image_data is None:
        return {"error": "No image provided"}
    
    try:
        # Load the image
        if image_path is not None:
            if not os.path.exists(image_path):
                return {"error": f"Image not found at {image_path}"}
            logger.info(f"Processing image from path: {image_path}")
            image = Image.open(image_path).convert('RGB')
        else:
            logger.info("Processing image from uploaded data")
            image = Image.open(io.BytesIO(image_data)).convert('RGB')
        
        # Display the image if requested (for local testing only)
        if show_image:
            plt.figure(figsize=(10, 8))
            plt.imshow(image)
            plt.axis('off')
            if image_path:
                plt.title(os.path.basename(image_path))
            plt.show()
        
        # Process the image
        logger.info("Generating descriptions...")
        
        # Process image and create inputs for various prompts
        pixel_values = processor(images=image, return_tensors="pt").to(model.device)
        
        # Basic description prompt
        prompt_basic = "Describe this image briefly."
        input_ids_basic = tokenizer(prompt_basic, return_tensors="pt").input_ids.to(model.device)
        
        # Detailed description prompt
        prompt_detailed = "Analyze this image in detail. Describe the main elements, any text visible, the colors, and the overall composition."
        input_ids_detailed = tokenizer(prompt_detailed, return_tensors="pt").input_ids.to(model.device)
        
        # Technical analysis prompt
        prompt_technical = "What can you tell me about the technical aspects of this image?"
        input_ids_technical = tokenizer(prompt_technical, return_tensors="pt").input_ids.to(model.device)
        
        # Generate outputs for each prompt
        # Basic description
        with torch.no_grad():
            output_basic = model.generate(
                input_ids=input_ids_basic,
                pixel_values=pixel_values.pixel_values,
                max_new_tokens=150,
                do_sample=False
            )
        basic_description = tokenizer.decode(output_basic[0], skip_special_tokens=True).replace(prompt_basic, "").strip()
        
        # Detailed description
        with torch.no_grad():
            output_detailed = model.generate(
                input_ids=input_ids_detailed,
                pixel_values=pixel_values.pixel_values,
                max_new_tokens=300,
                do_sample=False
            )
        detailed_description = tokenizer.decode(output_detailed[0], skip_special_tokens=True).replace(prompt_detailed, "").strip()
        
        # Technical analysis
        with torch.no_grad():
            output_technical = model.generate(
                input_ids=input_ids_technical,
                pixel_values=pixel_values.pixel_values,
                max_new_tokens=200,
                do_sample=False
            )
        technical_analysis = tokenizer.decode(output_technical[0], skip_special_tokens=True).replace(prompt_technical, "").strip()
        
        return {
            "success": True,
            "basic_description": basic_description,
            "detailed_description": detailed_description,
            "technical_analysis": technical_analysis
        }
        
    except Exception as e:
        logger.error(f"Error processing image: {str(e)}", exc_info=True)
        return {"error": f"Error generating description: {str(e)}"}

def load_model():
    """Load the model and related components"""
    global model, processor, tokenizer
    
    try:
        logger.info("Loading model...")
        model_id = "Qwen/Qwen2-VL-7B"
        
        # Use explicit processor class instead of AutoProcessor
        processor = CLIPImageProcessor.from_pretrained(model_id)
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        
        # Load model with 4-bit quantization to reduce memory requirements
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.bfloat16,
            load_in_4bit=True,
            device_map="auto"
        )
        logger.info("Model loaded successfully")
        return True
    except Exception as e:
        logger.error(f"Error loading model: {str(e)}", exc_info=True)
        return False

def main():
    """Run in command-line mode"""
    # Path to the image
    image_folder = "data_temp"
    image_name = "page_2.png"
    image_path = os.path.join(image_folder, image_name)
    
    # Get the description
    result = describe_image(image_path=image_path, show_image=True)
    
    # Print the results
    if "error" not in result:
        print("\n==== Image Description Results (Qwen2-VL-7B) ====")
        print(f"\nBasic Description:\n{result['basic_description']}")
        print(f"\nDetailed Description:\n{result['detailed_description']}")
        print(f"\nTechnical Analysis:\n{result['technical_analysis']}")
    else:
        print(result["error"])  # Print error message if there was an issue

if __name__ == "__main__":
    main()