import re

import gradio as gr
import PIL.Image
import spaces
import torch
from transformers import PaliGemmaForConditionalGeneration, PaliGemmaProcessor
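
# The detection checkpoint is a WaveUI fine-tune; the processor is loaded from
# the base google/paligemma-3b-pt-448 checkpoint it was trained from.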
model_id = "agentsea/paligemma-3b-ft-widgetcap-waveui-448"
processor_id = "google/paligemma-3b-pt-448"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = PaliGemmaForConditionalGeneration.from_pretrained(model_id).eval().to(device)
processor = PaliGemmaProcessor.from_pretrained(processor_id)
###### Transformers Inference
@spaces.GPU
def infer(
    image: PIL.Image.Image,
    text: str,
    max_new_tokens: int,
) -> str:
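    """Runs greedy decoding on a single image/prompt pair and returns the
    generated text with the prompt prefix stripped."""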
    inputs = processor(text=text, images=image, return_tensors="pt").to(device)
    with torch.inference_mode():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
        )
    result = processor.batch_decode(generated_ids, skip_special_tokens=True)
    # The decoded sequence echoes the prompt, so slice it off before returning.
    return result[0][len(text):].lstrip("\n")
def parse_segmentation(input_image, input_text):
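    """Runs detection for the requested entities and formats the result as an
    (image, annotations) pair for gr.AnnotatedImage."""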
    out = infer(input_image, input_text, max_new_tokens=100)
    objs = extract_objs(out.lstrip("\n"), input_image.size[0], input_image.size[1], unique_labels=True)
    annotated_img = (
        input_image,
        [
            (
                # Fall back to the bounding box when no mask was decoded.
                obj['mask'] if obj.get('mask') is not None else obj['xyxy'],
                obj['name'] or '',
            )
            for obj in objs
            if 'mask' in obj or 'xyxy' in obj
        ],
    )
    return annotated_img
######## Demo
INTRO_TEXT = """## PaliGemma WaveUI\n\n
PaliGemma 448 fine-tuned on WaveUI dataset for UI element detection
"""
with gr.Blocks(css="style.css") as demo:
    gr.Markdown(INTRO_TEXT)
    with gr.Tab("Detection"):
        image = gr.Image(type="pil")
        seg_input = gr.Text(label="Entities to Detect")
        seg_btn = gr.Button("Submit")
        annotated_image = gr.AnnotatedImage(label="Output")

        examples = [["./airbnb.jpg", "detect 'Amazing pools' button"]]
        gr.Examples(
            examples=examples,
            inputs=[image, seg_input],
        )

        seg_inputs = [image, seg_input]
        seg_outputs = [annotated_image]
        seg_btn.click(
            fn=parse_segmentation,
            inputs=seg_inputs,
            outputs=seg_outputs,
        )
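
# PaliGemma detection output: for each object, four <locYYYY> tokens encode
# y_min, x_min, y_max, x_max on a 0-1024 grid, optionally followed by sixteen
# <segXXX> mask tokens, then the label; objects are separated by "; ".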
_SEGMENT_DETECT_RE = re.compile(
    r'(.*?)' +
    r'<loc(\d{4})>' * 4 + r'\s*' +
    '(?:%s)?' % (r'<seg(\d{3})>' * 16) +
    r'\s*([^;<>]+)? ?(?:; )?',
)
def extract_objs(text, width, height, unique_labels=False):
"""Returns objs for a string with "<loc>" and "<seg>" tokens."""
objs = []
seen = set()
while text:
m = _SEGMENT_DETECT_RE.match(text)
if not m:
break
print("m", m)
gs = list(m.groups())
before = gs.pop(0)
name = gs.pop()
y1, x1, y2, x2 = [int(x) / 1024 for x in gs[:4]]
y1, x1, y2, x2 = map(round, (y1*height, x1*width, y2*height, x2*width))
mask = None
content = m.group()
if before:
objs.append(dict(content=before))
content = content[len(before):]
while unique_labels and name in seen:
name = (name or '') + "'"
seen.add(name)
objs.append(dict(
content=content, xyxy=(x1, y1, x2, y2), mask=mask, name=name))
text = text[len(before) + len(content):]
if text:
objs.append(dict(content=text))
return objs
#########
if __name__ == "__main__":
    demo.queue(max_size=10).launch(debug=True)