Spaces:

mknolan
/

cursor_slides_internvl2

Paused

App Files Files Community

cursor_slides_internvl2 / app_internvl2.py

mknolan

Upload InternVL2 implementation

e59dc66 verified 4 months ago

raw

history blame

5.6 kB

	import gradio as gr
	from PIL import Image
	import os
	import time
	import numpy as np
	import torch
	import math

	# Import lmdeploy for InternVL2 model
	from lmdeploy import pipeline, TurbomindEngineConfig
	from lmdeploy.vl import load_image

	# Set environment variables
	# Configures PyTorch's CUDA caching allocator through its environment
	# variable. The plain assignment overwrites any value already present in
	# the process environment.
	# NOTE(review): presumably meant to limit allocator fragmentation; it must
	# be set before the first CUDA allocation to take effect - confirm.
	os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

	# Model configuration
	# Identifier passed to lmdeploy's pipeline() in load_internvl2_model().
	MODEL_ID = "OpenGVLab/InternVL2-40B-AWQ" # 4-bit quantized model
	# Evaluated once at import time; only consulted in analyze_image()'s error
	# path to decide whether to empty the CUDA cache.
	USE_GPU = torch.cuda.is_available()

	# Global variables for model
	# Holds the shared lmdeploy pipeline; None until load_internvl2_model()
	# succeeds for the first time.
	internvl2_pipeline = None

	def load_internvl2_model():
	    """Make sure the shared InternVL2 pipeline exists, building it on first use.

	    The pipeline is kept in the module-level ``internvl2_pipeline`` so that
	    later calls return immediately without building it again.

	    Returns:
	        bool: True when a pipeline is available (already present or just
	        created); False when creating it raised an exception. The error is
	        printed rather than propagated.
	    """
	    global internvl2_pipeline

	    if internvl2_pipeline is None:
	        print("Loading InternVL2 model...")
	        try:
	            # MODEL_ID names AWQ weights, so the engine is told the format.
	            engine_cfg = TurbomindEngineConfig(model_format='awq')
	            internvl2_pipeline = pipeline(
	                MODEL_ID,
	                backend_config=engine_cfg,
	                log_level='INFO',
	            )
	            print("InternVL2 model loaded successfully!")
	        except Exception as err:
	            reason = str(err)
	            print(f"Error loading InternVL2 model: {reason}")
	            # Give a more specific hint for the out-of-memory case.
	            if "CUDA out of memory" in reason:
	                print("Not enough GPU memory for the model")
	            return False

	    return True

	def analyze_image(image, prompt):
	    """Run the InternVL2 pipeline on one image and return the answer text.

	    Args:
	        image: Either a NumPy array (converted with ``Image.fromarray``) or
	            an object exposing ``convert('RGB')`` such as a PIL image.
	        prompt: Instruction text sent to the model together with the image.

	    Returns:
	        str: The ``text`` attribute of the pipeline response on success.
	        If the model cannot be loaded, or any ``Exception`` is raised while
	        preparing the image or running inference, a human-readable message
	        is returned instead; no ``Exception`` propagates to the caller.
	    """
	    try:
	        # Lazily build the pipeline; bail out with a message if that fails.
	        if not load_internvl2_model():
	            return "Couldn't load InternVL2 model."

	        # Normalise the input to an RGB PIL image, whichever form it came in.
	        if isinstance(image, np.ndarray):
	            image_pil = Image.fromarray(image).convert('RGB')
	        else:
	            image_pil = image.convert('RGB')

	        # The pipeline is called with a single (prompt, image) pair.
	        response = internvl2_pipeline((prompt, image_pil))
	        return response.text

	    except Exception as e:
	        print(f"Error in image analysis: {str(e)}")
	        # Try to clean up memory in case of error
	        if USE_GPU:
	            torch.cuda.empty_cache()
	        return f"Error in image analysis: {str(e)}"

	def process_image(image, analysis_type="general"):
	    """Choose the prompt for the requested analysis and run it on the image.

	    Args:
	        image: Image supplied by the UI; ``None`` means nothing was uploaded.
	        analysis_type: One of "general", "text", "chart", "people" or
	            "technical". Any other value uses the general prompt.

	    Returns:
	        str: A reminder to upload an image when ``image`` is ``None``;
	        otherwise the text returned by ``analyze_image`` followed by the
	        wall-clock time the call took.
	    """
	    if image is None:
	        return "Please upload an image."

	    # Prompt lookup table; unknown types fall back to the general prompt.
	    fallback_prompt = "Describe this image in detail."
	    prompt_by_type = {
	        "general": fallback_prompt,
	        "text": "What text can you see in this image? Please transcribe it accurately.",
	        "chart": "Analyze any charts, graphs or diagrams in this image in detail, including trends, data points, and conclusions.",
	        "people": "Describe the people in this image - their appearance, actions, and expressions.",
	        "technical": "Provide a technical analysis of this image, including object identification, spatial relationships, and any technical elements present.",
	    }
	    chosen_prompt = prompt_by_type.get(analysis_type, fallback_prompt)

	    # Time the whole analysis call (this includes model loading on first use).
	    began = time.time()
	    analysis = analyze_image(image, chosen_prompt)
	    took = time.time() - began

	    return f"{analysis}\n\nAnalysis completed in {took:.2f} seconds."

	# Define the Gradio interface
	def create_interface():
	    """Assemble the Gradio Blocks UI for InternVL2 image analysis.

	    The layout is a two-column row (image upload, analysis-type radio and
	    submit button on the left; result textbox on the right), followed by a
	    help section and a set of example inputs. The button and the examples
	    both call ``process_image``.

	    NOTE(review): the examples use ``cache_examples=True`` and a hard-coded
	    path; presumably Gradio runs ``process_image`` on them ahead of time -
	    confirm the file exists and the model can load where this is deployed.

	    Returns:
	        The constructed ``gr.Blocks`` app; it is not launched here.
	    """
	    analysis_choices = ["general", "text", "chart", "people", "technical"]
	    example_image = "data_temp/page_2.png"
	    example_rows = [[example_image, kind] for kind in ("general", "text", "chart")]

	    with gr.Blocks(title="Image Analysis with InternVL2") as demo:
	        gr.Markdown("# Image Analysis with InternVL2-40B")
	        gr.Markdown("Upload an image to analyze it using the InternVL2-40B model.")

	        with gr.Row():
	            # Left column: inputs and the trigger button.
	            with gr.Column(scale=1):
	                input_image = gr.Image(type="pil", label="Upload Image")
	                analysis_type = gr.Radio(
	                    analysis_choices,
	                    label="Analysis Type",
	                    value="general",
	                )
	                submit_btn = gr.Button("Analyze Image")

	            # Right column: the textual result.
	            with gr.Column(scale=2):
	                output_text = gr.Textbox(label="Analysis Result", lines=20)

	        ui_inputs = [input_image, analysis_type]
	        submit_btn.click(fn=process_image, inputs=ui_inputs, outputs=output_text)

	        gr.Markdown("""
	## Analysis Types
	- General: General description of the image
	- Text: Focus on identifying and transcribing text in the image
	- Chart: Detailed analysis of charts, graphs, and diagrams
	- People: Description of people, their appearance and actions
	- Technical: Technical analysis identifying objects and spatial relationships
	""")

	        # Clickable sample inputs wired to the same handler as the button.
	        gr.Examples(
	            examples=example_rows,
	            inputs=[input_image, analysis_type],
	            outputs=output_text,
	            fn=process_image,
	            cache_examples=True,
	        )

	    return demo

	# Main function
	if __name__ == "__main__":
	    # Only build and serve the UI when this file is executed as a script.
	    demo = create_interface()

	    # share=False is passed explicitly to launch().
	    demo.launch(share=False)