import gradio as gr
from BobVLM import BobVLMProcessor, load_model, pipeline
import torch
# Load model and processor
model = load_model()
processor = BobVLMProcessor()
# Create pipeline
pipe = pipeline(model, processor)
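# Note: per the demo description below, this Space runs inference on CPU, so the
# model and processor are loaded once at startup and each request can take a while.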
def analyze_image(image):
"""Process the image and return BobVLM's analysis."""
response = pipe(
chat=[
{"role": "system", "content": "You are an image understanding assistant. You can see and interpret images in fine detail. Provide clear, engaging descriptions that highlight the key elements and atmosphere of the image."},
{"role": "user", "content": "Describe the image shortly"},
],
images=image
)
return response[0] if response else "I couldn't analyze this image."
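# A minimal sketch of exercising the pipeline outside the Gradio UI, mirroring the
# chat/images call shape used in analyze_image above. "example.jpg" is a placeholder
# path, not a file shipped with this demo; uncomment to try it locally.
#
# from PIL import Image
# print(analyze_image(Image.open("example.jpg")))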
# Create the Gradio interface
with gr.Blocks(theme=gr.themes.Soft(
primary_hue="blue",
secondary_hue="indigo",
neutral_hue="slate",
)) as demo:
gr.Markdown(
"""
# πŸ€– BobVLM Demo
This demo runs on CPU since I can't afford GPU prices here 🀧, so it is quite slow; please bear with me. Upload an image and let BobVLM describe what it sees.
"""
)
with gr.Row():
with gr.Column(scale=1):
input_image = gr.Image(
label="Upload Image",
type="pil",
height=400,
)
analyze_btn = gr.Button(
"πŸ” Analyze Image",
variant="primary",
size="lg",
)
with gr.Column(scale=1):
output_text = gr.Textbox(
label="BobVLM's Analysis",
placeholder="Analysis will appear here...",
lines=16,
show_copy_button=True,
)
# Add examples
gr.Examples(
examples=[
["https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRmTRHBR1foifAUzxrQ5GOMyKgRX0iE7f9ivw&s"],
["https://i.guim.co.uk/img/media/1e0c3f8bbf09178377309c1f25ea326eaeb5aa0c/0_280_4200_2520/master/4200.jpg?width=1200&quality=85&auto=format&fit=max&s=858bf3e58ee96174b4b3d1499a324bc5"],
],
inputs=input_image,
outputs=output_text,
fn=analyze_image,
cache_examples=True,
)
# Set up the click event
analyze_btn.click(
fn=analyze_image,
inputs=input_image,
outputs=output_text,
)
gr.Markdown(
"""
### About BobVLM
BobVLM is a Vision Language Model that combines CLIP's visual understanding with LLaMA's language capabilities.
It was born out of an experiment to train a small adapter layer and see how much it could learn from supervised fine-tuning (SFT) data. The result is a model that produces detailed, natural
image descriptions.
[View on GitHub](https://github.com/logic-OT/BobVLM) | [Hugging Face Model](https://huggingface.co/selfDotOsman/BobVLM-1.5b)
"""
)
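# The "About" text above describes the general recipe: CLIP for vision, LLaMA for
# language, and a small adapter trained with SFT to bridge them. The class below is
# a purely hypothetical sketch of what such an adapter could look like (an MLP that
# projects CLIP image features into the language model's embedding space). It is NOT
# BobVLM's actual implementation and is not used anywhere in this app.
class _AdapterSketch(torch.nn.Module):
    def __init__(self, clip_dim: int = 768, llm_dim: int = 2048):
        # Dimensions are illustrative placeholders, not BobVLM's real sizes.
        super().__init__()
        self.proj = torch.nn.Sequential(
            torch.nn.Linear(clip_dim, llm_dim),
            torch.nn.GELU(),
            torch.nn.Linear(llm_dim, llm_dim),
        )

    def forward(self, clip_features: torch.Tensor) -> torch.Tensor:
        # Map per-patch CLIP features to pseudo-token embeddings the LLM can attend to.
        return self.proj(clip_features)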
# Launch the app
if __name__ == "__main__":
demo.launch()