Spaces:

chats-bug
/

image-captioning

Runtime error

image-captioning / app.py

chats-bug

Trying to make the git large coco work

4bfc3de about 2 years ago

5.61 kB

	import gradio as gr
	from transformers import AutoProcessor, AutoTokenizer, AutoImageProcessor, AutoModelForCausalLM, BlipForConditionalGeneration, VisionEncoderDecoderModel, BitsAndBytesConfig
	import torch
	import open_clip
	from PIL import Image
	import requests

	from huggingface_hub import hf_hub_download

	# Load the Blip base model
	preprocessor_blip_base = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
	model_blip_base = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

	# Load the Blip large model
	preprocessor_blip_large = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
	model_blip_large = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")

	# Load the GIT coco model
	preprocessor_git_large_coco = AutoProcessor.from_pretrained("microsoft/git-large-coco")
	model_git_large_coco = AutoModelForCausalLM.from_pretrained("microsoft/git-large-coco")

	# Load the CoCa captioning model through OpenCLIP. The middle return value
	# (the training-time transform) is not needed for inference and is discarded.
	model_oc_coca, _, transform_oc_coca = open_clip.create_model_and_transforms(
	    model_name="coca_ViT-L-14",
	    pretrained="mscoco_finetuned_laion2B-s13B-b90k",
	)

	# Run everything on the GPU when one is available, otherwise on the CPU.
	device = "cuda" if torch.cuda.is_available() else "cpu"

	# Transfer the models to the device
	model_blip_base.to(device)
	model_blip_large.to(device)
	model_git_large_coco.to(device)
	model_oc_coca.to(device)

	# OpenCLIP returns its models in train mode by default (see the OpenCLIP
	# README), so switch the CoCa model to inference mode explicitly.
	model_oc_coca.eval()


	def generate_caption(
	    preprocessor,
	    model,
	    image,
	    tokenizer=None,
	    use_float_16=False,
	    max_length=50,
	):
	    """
	    Generate a caption for the given image.

	    The image is preprocessed once, the resulting pixel values are moved to
	    the device the model lives on, and the generated token ids are decoded
	    back to text.

	    -----
	    Parameters
	    preprocessor: AutoProcessor
	        The preprocessor for the model. It is called with `images=image`
	        and its result must expose `pixel_values`.
	    model: BlipForConditionalGeneration
	        The model to use. It must expose `device` and
	        `generate(pixel_values=..., max_length=...)`.
	    image: PIL.Image
	        The image to generate captions for.
	    tokenizer: AutoTokenizer
	        The tokenizer used to decode the generated ids. If None, the
	        preprocessor's own `batch_decode` is used.
	    use_float_16: bool
	        Whether to cast the pixel values to float16 before generation.
	        NOTE(review): the model weights presumably have to be float16 as
	        well for this to work - confirm before enabling.
	    max_length: int
	        The `max_length` forwarded to `model.generate`. Defaults to 50.

	    -----
	    Returns
	    str
	        The generated caption.
	    """
	    # Pass the image by keyword: some processors take `text` as their first
	    # positional parameter, so a positional image would be treated as text.
	    pixel_values = preprocessor(images=image, return_tensors="pt").pixel_values

	    # The tensor handed to `generate` must live on the same device as the model.
	    pixel_values = pixel_values.to(model.device)

	    if use_float_16:
	        pixel_values = pixel_values.to(torch.float16)

	    generated_ids = model.generate(
	        pixel_values=pixel_values,
	        max_length=max_length,
	    )

	    decoder = preprocessor if tokenizer is None else tokenizer
	    return decoder.batch_decode(generated_ids, skip_special_tokens=True)[0]


	def generate_captions_clip(
	    model,
	    transform,
	    image,
	    seq_len=20,
	):
	    """
	    Generate a caption for the given image using an OpenCLIP captioning model.

	    -----
	    Parameters
	    model: torch.nn.Module
	        The OpenCLIP model to use. It must expose `generate(image, seq_len=...)`.
	    transform: Callable
	        The transform to apply to the image before passing it to the model.
	    image: PIL.Image
	        The image to generate captions for.
	    seq_len: int
	        The `seq_len` forwarded to `model.generate`. Defaults to 20.

	    -----
	    Returns
	    str
	        The decoded text up to the first "<end_of_text>" marker, with any
	        "<start_of_text>" marker removed. Surrounding whitespace is kept.
	    """
	    # Add a leading batch dimension and move the tensor to the inference device.
	    im = transform(image).unsqueeze(0).to(device)

	    # `torch.cuda.amp.autocast()` is deprecated in favour of `torch.autocast`.
	    # Mixed precision is enabled only when CUDA is present, which also avoids
	    # the "CUDA is not available" warning on CPU-only hosts.
	    with torch.no_grad(), torch.autocast(device_type="cuda", enabled=torch.cuda.is_available()):
	        generated = model.generate(im, seq_len=seq_len)

	    decoded = open_clip.decode(generated[0].detach())
	    return decoded.split("<end_of_text>")[0].replace("<start_of_text>", "")


	def generate_captions(
	    image,
	    max_length,
	    temperature,
	    use_sample_image,
	):
	    """
	    Generate captions for the given image with every loaded model.

	    Each model runs independently: if one of them fails, the error is
	    printed and that model's caption is left empty so the others can still
	    report a result.

	    -----
	    Parameters
	    image: PIL.Image
	        The image to generate captions for. Ignored when `use_sample_image`
	        is set.
	    max_length: int
	        Accepted but not used by this function.
	    temperature: float
	        Accepted but not used by this function.
	    use_sample_image: bool
	        When true, a fixed sample image is downloaded and used instead of
	        `image`.

	    -----
	    Returns
	    tuple of str
	        The captions from the Blip base, Blip large, GIT large coco and
	        OpenCLIP CoCa models, in that order. A caption is "" when its
	        model failed or when no image was available.

	    -----
	    Raises
	    requests.RequestException
	        If the sample image cannot be downloaded.
	    """
	    if use_sample_image:
	        url = "http://images.cocodataset.org/val2017/000000039769.jpg"
	        # A timeout keeps a stalled download from blocking the request forever,
	        # and the status check surfaces HTTP errors before PIL sees the body.
	        response = requests.get(url, stream=True, timeout=30)
	        response.raise_for_status()
	        # convert() forces the image to be fully decoded while the stream is open.
	        image = Image.open(response.raw).convert("RGB")

	    # Without an image every model would fail; skip the four doomed attempts.
	    if image is None:
	        return "", "", "", ""

	    captioners = (
	        ("Blip base", lambda: generate_caption(preprocessor_blip_base, model_blip_base, image)),
	        ("Blip large", lambda: generate_caption(preprocessor_blip_large, model_blip_large, image)),
	        ("GIT large coco", lambda: generate_caption(preprocessor_git_large_coco, model_git_large_coco, image)),
	        ("CLIP", lambda: generate_captions_clip(model_oc_coca, transform_oc_coca, image)),
	    )

	    captions = []
	    for label, run in captioners:
	        try:
	            caption = run().strip()
	        except Exception as e:
	            # Deliberately broad: one failing model must not hide the others.
	            print(f"{label} captioning failed: {e}")
	            caption = ""
	        captions.append(caption)

	    return tuple(captions)


	# Create the interface.
	# The top-level component classes (gr.Image, gr.Slider, ...) replace the
	# deprecated gr.inputs / gr.outputs namespaces, and `value=` replaces `default=`.
	iface = gr.Interface(
	    fn=generate_captions,
	    # Define the inputs: Image, Slider for Max Length, Slider for Temperature,
	    # Checkbox for the example image. They are passed to `fn` in this order.
	    inputs=[
	        # type="pil" hands the callback a PIL image; the default is a numpy
	        # array, which the OpenCLIP image transform does not accept.
	        gr.Image(type="pil", label="Image"),
	        gr.Slider(minimum=16, maximum=64, step=2, value=32, label="Max Length"),
	        gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Temperature"),
	        gr.Checkbox(value=False, label="Use example image"),
	    ],
	    # Define the outputs: one textbox per model, in the order `fn` returns them
	    outputs=[
	        gr.Textbox(label="Blip base"),
	        gr.Textbox(label="Blip large"),
	        gr.Textbox(label="GIT large coco"),
	        gr.Textbox(label="CLIP"),
	    ],
	    title="Image Captioning",
	    # The description lists only the models this app actually loads.
	    description="Generate captions for images using the Blip base model, the Blip large model, the GIT large coco model, and the CLIP model.",
	)

	# Enable request queuing; this replaces the deprecated `enable_queue=True` argument.
	iface.queue()

	# Launch the interface
	iface.launch(debug=True)