Spaces: Runtime error
chats-bug committed
Commit · 0d08077
1 Parent(s): f4c7af7
Initial model with number of captions control
Browse files
- app.py +54 -0
- model.py +47 -0
- requirements.txt +5 -0
app.py
ADDED
@@ -0,0 +1,54 @@
import gradio as gr
import torch
from PIL import Image

from model import GitBaseCocoModel


def generate_captions(
    image: Image.Image,
    max_len: int = 50,
    num_captions: int = 1,
):
    """
    Generates captions for the given image.

    -----
    Parameters:
    image: PIL.Image
        The image to generate captions for.
    max_len: int
        The maximum length of the caption.
    num_captions: int
        The number of captions to generate.

    -----
    Returns:
    list[str]
        The generated captions.
    """
    # Use the GPU when one is available, otherwise fall back to CPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    checkpoint = "microsoft/git-base-coco"

    # The model is re-loaded from the checkpoint on every request.
    model = GitBaseCocoModel(device, checkpoint)

    # Gradio Number components return floats, so cast to int before generating.
    caption = model.generate(image, int(max_len), int(num_captions))
    return caption


inputs = [
    gr.Image(type="pil", label="Image"),
    gr.Number(value=50, label="Maximum Caption Length"),
    gr.Number(value=1, label="Number of Captions to Generate"),
]
outputs = gr.Textbox()

title = "Git-Base-COCO Image Captioning"
description = "A model for generating captions for images."

gr.Interface(
    fn=generate_captions,
    inputs=inputs,
    outputs=outputs,
    title=title,
    description=description,
).launch()
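Note that generate_captions returns a list of strings while the output component is a single Textbox, so with more than one caption the box shows the string form of a Python list. A minimal formatting sketch (format_captions is a hypothetical helper, not part of this commit) that could wrap the return value before it reaches the Textbox:

# Hypothetical helper (not in the commit): join multiple captions into
# numbered lines so they read cleanly in a single Textbox.
def format_captions(captions):
    # `captions` is the list[str] returned by GitBaseCocoModel.generate().
    return "\n".join(f"{i + 1}. {c}" for i, c in enumerate(captions))

Inside generate_captions this would amount to returning format_captions(caption) instead of the raw list.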
model.py
ADDED
@@ -0,0 +1,47 @@
from transformers import AutoProcessor, AutoModelForCausalLM


class GitBaseCocoModel:
    def __init__(self, device, checkpoint="microsoft/git-base-coco"):
        """
        A wrapper class for the Git-Base-COCO model. It is a pretrained model for image captioning.

        -----
        Parameters:
        device: torch.device
            The device to run the model on.
        checkpoint: str
            The checkpoint to load the model from.

        -----
        Returns:
        None
        """
        self.checkpoint = checkpoint
        self.device = device
        # Load the processor (image preprocessing + tokenizer) and the model
        # from the checkpoint, then move the model to the target device.
        self.processor = AutoProcessor.from_pretrained(self.checkpoint)
        self.model = AutoModelForCausalLM.from_pretrained(self.checkpoint).to(self.device)

    def generate(self, image, max_len=50, num_captions=1):
        """
        Generates captions for the given image.

        -----
        Parameters:
        image: PIL.Image
            The image to generate captions for.
        max_len: int
            The maximum length of the caption.
        num_captions: int
            The number of captions to generate.

        -----
        Returns:
        list[str]
            The generated captions.
        """
        # Preprocess the image into pixel values on the target device.
        pixel_values = self.processor(
            images=image, return_tensors="pt"
        ).pixel_values.to(self.device)
        # Beam search: one beam and one returned sequence per requested caption.
        generated_ids = self.model.generate(
            pixel_values=pixel_values,
            max_length=max_len,
            num_beams=num_captions,
            num_return_sequences=num_captions,
        )
        return self.processor.batch_decode(generated_ids, skip_special_tokens=True)
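For a quick check outside the Gradio app, GitBaseCocoModel can be used directly. A minimal usage sketch, assuming a local image at the hypothetical path example.jpg:

# Standalone usage sketch; "example.jpg" is a placeholder path.
from PIL import Image
from model import GitBaseCocoModel

model = GitBaseCocoModel(device="cpu")  # checkpoint defaults to microsoft/git-base-coco
image = Image.open("example.jpg").convert("RGB")
captions = model.generate(image, max_len=50, num_captions=3)
for caption in captions:
    print(caption)

With num_captions=3 the wrapper runs beam search with three beams and returns three decoded sequences.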
requirements.txt
ADDED
@@ -0,0 +1,5 @@
torch
open_clip_torch
accelerate
bitsandbytes
git+https://github.com/huggingface/transformers.git@main