import re

import gradio as gr
import numpy as np
import PIL.Image

from inference import PaliGemmaModel, VAEModel

COLORS = ['#4285f4', '#db4437', '#f4b400', '#0f9d58', '#e48ef1']

# Instantiate the models
pali_gemma_model = PaliGemmaModel()
vae_model = VAEModel('vae-oid.npz')


##### Parse segmentation output tokens into masks.
##### Also returns bounding boxes with their labels.
def parse_segmentation(input_image, input_text, max_new_tokens=100):
    out = pali_gemma_model.infer(
        image=input_image, text=input_text, max_new_tokens=max_new_tokens)
    objs = extract_objs(
        out.lstrip("\n"), input_image.size[0], input_image.size[1],
        unique_labels=True)
    # Each annotation is (mask-or-box, label); masks take precedence when present.
    annotated_img = (
        input_image,
        [
            (
                obj['mask'] if obj.get('mask') is not None else obj['xyxy'],
                obj['name'] or '',
            )
            for obj in objs
            if 'mask' in obj or 'xyxy' in obj
        ],
    )
    return annotated_img


INTRO_TEXT = "🔬🧠 CellVision AI -- Intelligent Cell Imaging Analysis 🤖🧫"
IMAGE_PROMPT = """
Describe the morphological characteristics and visible interactions between different cell types.
Assess the biological context to identify signs of cancer and the presence of antigens.
"""

with gr.Blocks(css="style.css") as demo:
    gr.Markdown(INTRO_TEXT)

    with gr.Tab("Segment/Detect"):
        with gr.Row():
            with gr.Column():
                image = gr.Image(type="pil")
                seg_input = gr.Text(label="Entities to Segment/Detect")
            with gr.Column():
                annotated_image = gr.AnnotatedImage(label="Output")
        seg_btn = gr.Button("Submit")
        examples = [
            ["./examples/cart1.jpg", "segment cells"],
            ["./examples/cart1.jpg", "detect cells"],
            ["./examples/cart2.jpg", "segment cells"],
            ["./examples/cart2.jpg", "detect cells"],
            ["./examples/cart3.jpg", "segment cells"],
            ["./examples/cart3.jpg", "detect cells"],
        ]
        gr.Examples(
            examples=examples,
            inputs=[image, seg_input],
        )
        seg_inputs = [image, seg_input]
        seg_outputs = [annotated_image]
        seg_btn.click(
            fn=parse_segmentation,
            inputs=seg_inputs,
            outputs=seg_outputs,
        )

    with gr.Tab("Text Generation"):
        with gr.Column():
            image = gr.Image(type="pil")
            text_input = gr.Text(label="Input Text")
            text_output = gr.Text(label="Text Output")
            chat_btn = gr.Button()
            tokens = gr.Slider(
                label="Max New Tokens",
                info="Set to larger for longer generation.",
                minimum=10,
                maximum=100,
                value=50,
                step=10,
            )
        chat_inputs = [image, text_input, tokens]
        chat_outputs = [text_output]
        chat_btn.click(
            fn=pali_gemma_model.infer,
            inputs=chat_inputs,
            outputs=chat_outputs,
        )
        examples = [
            ["./examples/cart1.jpg", IMAGE_PROMPT],
            ["./examples/cart2.jpg", IMAGE_PROMPT],
            ["./examples/cart3.jpg", IMAGE_PROMPT],
        ]
        # Example rows only provide (image, prompt), so list just those inputs.
        gr.Examples(
            examples=examples,
            inputs=[image, text_input],
        )


### Postprocessing Utils for Segmentation Tokens
### Segmentation tokens are decoded into 64x64 masks by a separate VAE
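# For context on the format parsed below: PaliGemma encodes each detection as
# four '<locXXXX>' tokens (y1, x1, y2, x2 on a 0-1023 grid, scaled to pixel
# coordinates here), optionally followed by exactly sixteen '<segXXX>' codebook
# tokens for the mask, then a free-text label; multiple objects are separated
# by '; '. The token values below are illustrative, not real model output:
#
#   '<loc0010><loc0020><loc0500><loc0600> <seg010>...<seg101> cell ; ...'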
def extract_objs(text, width, height, unique_labels=False):
    """Returns objs for a string with "<loc>" and "<seg>" tokens."""
    objs = []
    seen = set()
    while text:
        m = _SEGMENT_DETECT_RE.match(text)
        if not m:
            break
        gs = list(m.groups())
        before = gs.pop(0)
        name = gs.pop()
        # Box tokens are on a 0-1023 grid; rescale to pixel coordinates.
        y1, x1, y2, x2 = [int(x) / 1024 for x in gs[:4]]
        y1, x1, y2, x2 = map(round, (y1 * height, x1 * width,
                                     y2 * height, x2 * width))
        seg_indices = gs[4:20]
        if seg_indices[0] is None:
            mask = None
        else:
            # Decode the 16 codebook indices into a 64x64 mask, then paste the
            # resized mask into the bounding box on a full-size canvas.
            seg_indices = np.array([int(x) for x in seg_indices], dtype=np.int32)
            m64, = vae_model.reconstruct_masks(seg_indices[None])[..., 0]
            m64 = np.clip(np.array(m64) * 0.5 + 0.5, 0, 1)
            m64 = PIL.Image.fromarray((m64 * 255).astype('uint8'))
            mask = np.zeros([height, width])
            if y2 > y1 and x2 > x1:
                mask[y1:y2, x1:x2] = np.array(m64.resize([x2 - x1, y2 - y1])) / 255.0

        content = m.group()
        if before:
            objs.append(dict(content=before))
            content = content[len(before):]
        # Disambiguate duplicate labels by appending apostrophes.
        while unique_labels and name in seen:
            name = (name or '') + "'"
        seen.add(name)
        objs.append(dict(
            content=content, xyxy=(x1, y1, x2, y2), mask=mask, name=name))
        text = text[len(before) + len(content):]

    if text:
        objs.append(dict(content=text))
    return objs


_SEGMENT_DETECT_RE = re.compile(
    r'(.*?)' +
    r'<loc(\d{4})>' * 4 + r'\s*' +
    '(?:%s)?' % (r'<seg(\d{3})>' * 16) +
    r'\s*([^;<>]+)? ?(?:; )?',
)


if __name__ == "__main__":
    demo.queue(max_size=10).launch(debug=True)
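# Minimal offline sanity check of the parsing path (hypothetical token values;
# no model or VAE call is needed since the string has no '<seg>' tokens):
#
#   objs = extract_objs('<loc0010><loc0020><loc0500><loc0600> cell', 224, 224)
#   objs[0]['name']  == 'cell'
#   objs[0]['xyxy']  == (4, 2, 131, 109)   # x1, y1, x2, y2 in pixels
#   objs[0]['mask']  is None               # no <seg> tokens in the string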