Spaces:
Runtime error
Runtime error
File size: 3,836 Bytes
5cdc3ab 65df304 eed255c 65df304 7e28ce7 65df304 eed255c 65df304 eed255c 65df304 eed255c 65df304 eed255c 65df304 5cdc3ab 65df304 719ad71 65df304 719ad71 65df304 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 |
#import spaces
import torch
import re
import gradio as gr
from threading import Thread
from transformers import TextIteratorStreamer, AutoTokenizer, AutoModelForCausalLM
from PIL import ImageDraw
from torchvision.transforms.v2 import Resize
from optimum.onnxruntime import ORTModelForImageClassification
import subprocess
#subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
# Run on the GPU when one is available; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "vikhyatk/moondream2"
# Alternative fine-tuned checkpoint that was tried previously:
# model_id = "zesquirrelnator/moondream2-finetuneV2"

# NOTE(review): no revision is pinned, so the newest remote code is fetched on
# every cold start. The handlers in this file call `encode_image` and
# `answer_question(image_embeds=..., question=..., tokenizer=..., streamer=...)`;
# consider pinning `revision=` (e.g. "2024-08-26") once confirmed against the
# model repository.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# The `encode_image` / `answer_question` methods used by this app come from the
# model's own remote code, so the checkpoint has to be loaded through
# AutoModelForCausalLM with trust_remote_code=True. An ONNX Runtime
# image-classification wrapper does not provide them.
moondream = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    # bfloat16 halves memory on GPU; CPU inference stays in float32.
    torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,
).to(device)
# Inference only: switch off dropout and other training-time behaviour.
moondream.eval()
#@spaces.GPU
def answer_question(img, prompt):
    """Stream the model's answer to ``prompt`` about ``img``.

    The image is encoded with ``moondream.encode_image`` and generation is
    started on a separate thread, so that this generator can read from the
    streamer while text is still being produced.

    Args:
        img: Image passed straight to ``moondream.encode_image``.
        prompt: Question text forwarded to the model.

    Yields:
        The whole response accumulated so far, stripped of surrounding
        whitespace, once for every chunk the streamer produces.
    """
    embeddings = moondream.encode_image(img)
    token_stream = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
    generation_args = {
        "image_embeds": embeddings,
        "question": prompt,
        "tokenizer": tokenizer,
        "streamer": token_stream,
    }
    worker = Thread(target=moondream.answer_question, kwargs=generation_args)
    worker.start()

    # Re-emit the full text on each chunk so the UI always shows the complete
    # partial answer rather than only the latest fragment.
    pieces = []
    for chunk in token_stream:
        pieces.append(chunk)
        yield "".join(pieces).strip()
def extract_floats(text):
    """Return the first bracketed list of four decimal numbers found in ``text``.

    Only numbers written with a decimal point (optionally negative) are
    recognised, e.g. ``[0.1, 0.2, 0.3, 0.4]``.

    Args:
        text: String to search.

    Returns:
        A list of four floats, or ``None`` when no such list is present.
    """
    # Four comma-separated decimals inside square brackets, whitespace allowed.
    pattern = r"\[\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*\]"
    found = re.search(pattern, text)
    if found is None:
        return None
    return list(map(float, found.groups()))
def extract_bbox(text):
    """Return the bounding box found in ``text`` as an ``(x1, y1, x2, y2)`` tuple.

    The text is parsed once with ``extract_floats``; the four values it finds
    are returned in the order they appear.

    Args:
        text: String that may contain a bracketed list of four decimals.

    Returns:
        A 4-tuple of floats, or ``None`` when ``extract_floats`` finds nothing.
    """
    coords = extract_floats(text)
    if coords is None:
        return None
    x1, y1, x2, y2 = coords
    return (x1, y1, x2, y2)
def process_answer(img, answer):
    """Build the update for the annotated-image component from the model answer.

    When ``answer`` contains a bounding box, the box is drawn in red on a
    resized version of ``img`` and the component is made visible; otherwise
    the component is hidden and cleared.

    Args:
        img: PIL image from the upload component, or ``None`` when no image
            is currently loaded.
        answer: Response text that may contain a box as four decimals, which
            are scaled by the image width (x) and height (y).

    Returns:
        A ``gr.update`` for the annotated image component.
    """
    bbox = extract_bbox(answer)
    # Nothing can be drawn without both a parsed box and an image to draw on.
    if bbox is None or img is None:
        return gr.update(visible=False, value=None)

    x1, y1, x2, y2 = bbox
    draw_image = Resize(768)(img)
    width, height = draw_image.size

    # Scale the fractional coordinates to pixels and order each axis:
    # ImageDraw.rectangle requires x1 >= x0 and y1 >= y0.
    left, right = sorted((int(x1 * width), int(x2 * width)))
    top, bottom = sorted((int(y1 * height), int(y2 * height)))

    ImageDraw.Draw(draw_image).rectangle(
        (left, top, right, bottom), outline="red", width=3
    )
    return gr.update(visible=True, value=draw_image)
# Page layout: header, a prompt row, then the image upload next to the
# streamed response and the (initially hidden) annotated image.
with gr.Blocks() as demo:
    gr.Markdown(
        """
# 🌔 moondream2
A tiny vision language model. [GitHub](https://github.com/vikhyat/moondream)
"""
    )

    with gr.Row():
        prompt = gr.Textbox(label="Input", value="Describe this image.", scale=4)
        submit = gr.Button("Submit")

    with gr.Row():
        img = gr.Image(type="pil", image_mode="RGB", label="Upload an Image")
        with gr.Column():
            output = gr.Markdown(label="Response")
            ann = gr.Image(visible=False, label="Annotated Image")

    # Clicking the button and pressing Enter in the textbox both run the model.
    submit.click(
        fn=answer_question,
        inputs=[img, prompt],
        outputs=output,
        queue=True,
    )
    prompt.submit(
        fn=answer_question,
        inputs=[img, prompt],
        outputs=output,
        queue=True,
    )
    # Each change to the response text re-evaluates the bounding-box overlay.
    output.change(
        fn=process_answer,
        inputs=[img, output],
        outputs=ann,
        show_progress=False,
    )

demo.queue().launch()
|