import spaces
import torch
import re
import gradio as gr
from threading import Thread
from transformers import TextIteratorStreamer, AutoTokenizer, AutoModelForCausalLM
from PIL import ImageDraw
from torchvision.transforms.v2 import Resize
import subprocess
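# flash-attn is not in the base image, so install it at startup. Setting
# FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE is the usual workaround to skip compiling
# the CUDA extension at install time (no GPU is available during the Space build).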
subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
model_id = "vikhyatk/moondream2"
revision = "2024-05-20"
tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
moondream = AutoModelForCausalLM.from_pretrained(
    model_id, trust_remote_code=True, revision=revision,
    torch_dtype=torch.bfloat16, device_map={"": "cuda"},
    attn_implementation="flash_attention_2"
)
moondream.eval()
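
# On ZeroGPU Spaces, @spaces.GPU attaches a GPU for each call;
# duration=20 budgets roughly 20 seconds per request.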
@spaces.GPU(duration=20)
def answer_question(img, prompt):
    image_embeds = moondream.encode_image(img)
    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
    # Generate in a background thread so the streamer can be consumed here
    thread = Thread(
        target=moondream.answer_question,
        kwargs={
            "image_embeds": image_embeds,
            "question": prompt,
            "tokenizer": tokenizer,
            "streamer": streamer,
        },
    )
    thread.start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text

    # Wait for the thread to finish
    thread.join()
    return buffer.strip()
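
# The helpers below are currently disabled: they parse an [x1, y1, x2, y2] box
# from the model's answer and draw it on a resized copy of the image. The
# matching output.change hook at the bottom is commented out as well.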
# def extract_floats(text):
#     # Regular expression to match an array of four floating point numbers
#     pattern = r"\[\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*\]"
#     match = re.search(pattern, text)
#     if match:
#         # Extract the numbers and convert them to floats
#         return [float(num) for num in match.groups()]
#     return None  # Return None if no match is found


# def extract_bbox(text):
#     bbox = None
#     if extract_floats(text) is not None:
#         x1, y1, x2, y2 = extract_floats(text)
#         bbox = (x1, y1, x2, y2)
#     return bbox


# def process_answer(img, answer):
#     if extract_bbox(answer) is not None:
#         x1, y1, x2, y2 = extract_bbox(answer)
#         draw_image = Resize(768)(img)
#         width, height = draw_image.size
#         x1, x2 = int(x1 * width), int(x2 * width)
#         y1, y2 = int(y1 * height), int(y2 * height)
#         bbox = (x1, y1, x2, y2)
#         ImageDraw.Draw(draw_image).rectangle(bbox, outline="red", width=3)
#         return gr.update(visible=True, value=draw_image)
#     return gr.update(visible=False, value=None)
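
# Gradio UI: prompt box and image upload in, text response out, plus a hidden
# slot for the annotated image used by the disabled bounding-box path above.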
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # 🌔 moondream2
        A tiny vision language model. [GitHub](https://github.com/vikhyat/moondream)
        """
    )
    with gr.Row():
        prompt = gr.Textbox(label="Input", value="Describe this image.", scale=4)
        submit = gr.Button("Submit")
    with gr.Row():
        img = gr.Image(type="pil", label="Upload an Image")
        with gr.Column():
            output = gr.Text(label="Response")
            ann = gr.Image(visible=False, label="Annotated Image")
    submit.click(answer_question, [img, prompt], output)
    prompt.submit(answer_question, [img, prompt], output)
    # output.change(process_answer, [img, output], ann, show_progress=False)

demo.queue().launch(debug=True, show_error=True)