File size: 3,277 Bytes
fda8dae
5781b89
 
 
 
 
07d11bb
 
5781b89
955b202
 
5781b89
 
6d0cb8a
12e7969
5781b89
60e7a28
1322687
 
bac7d5d
5781b89
 
 
69b4d88
5781b89
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6b26249
955b202
 
 
 
 
5781b89
3c205df
 
 
 
 
 
 
 
07d11bb
 
3c205df
 
 
 
 
 
07d11bb
955b202
 
 
 
 
 
 
 
 
 
07d11bb
955b202
5781b89
 
 
 
 
 
 
 
 
1322687
5781b89
 
 
07d11bb
69b4d88
07d11bb
 
5781b89
 
955b202
5781b89
eccdcf9
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import spaces
import torch
import re
import gradio as gr
from threading import Thread
from transformers import TextIteratorStreamer, AutoTokenizer, AutoModelForCausalLM
from PIL import ImageDraw
from torchvision.transforms.v2 import Resize

import subprocess
import os

# Install flash-attn at startup (HF Spaces pattern; the wheel can't be listed
# in requirements.txt because it needs --no-build-isolation).
# BUG FIX: the original passed env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
# which REPLACES the whole environment — dropping PATH, HOME, CUDA vars, etc. —
# and can make the shell unable to locate pip. Merge with os.environ instead.
subprocess.run(
    'pip install flash-attn --no-build-isolation',
    env={**os.environ, 'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
    shell=True,
)

# Pinned model checkout so the demo doesn't break when upstream updates.
model_id = "vikhyatk/moondream2"
revision = "2024-05-20"

tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)

# Load the vision-language model onto the GPU in bfloat16 with
# flash-attention 2 enabled (installed by the subprocess step above).
_load_kwargs = dict(
    trust_remote_code=True,
    revision=revision,
    torch_dtype=torch.bfloat16,
    device_map={"": "cuda"},
    attn_implementation="flash_attention_2",
)
moondream = AutoModelForCausalLM.from_pretrained(model_id, **_load_kwargs)
# Inference only — disable dropout/batch-norm training behavior.
moondream.eval()


@spaces.GPU(duration=20)
def answer_question(img, prompt):
    """Answer *prompt* about PIL image *img* with the moondream model.

    Generation runs on a background thread and is consumed through a
    TextIteratorStreamer; the fully assembled answer is returned as a
    stripped string.
    """
    image_embeds = moondream.encode_image(img)
    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)

    # answer_question() blocks while generating, so run it on a worker
    # thread and drain the streamer from this one.
    worker = Thread(
        target=moondream.answer_question,
        kwargs=dict(
            image_embeds=image_embeds,
            question=prompt,
            tokenizer=tokenizer,
            streamer=streamer,
        ),
    )
    worker.start()

    pieces = [chunk for chunk in streamer]

    # Ensure generation has fully finished before returning.
    worker.join()

    return "".join(pieces).strip()

# def extract_floats(text):
#     # Regular expression to match an array of four floating point numbers
#     pattern = r"\[\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*\]"
#     match = re.search(pattern, text)
#     if match:
#         # Extract the numbers and convert them to floats
#         return [float(num) for num in match.groups()]
#     return None  # Return None if no match is found


# def extract_bbox(text):
#     bbox = None
#     if extract_floats(text) is not None:
#         x1, y1, x2, y2 = extract_floats(text)
#         bbox = (x1, y1, x2, y2)
#     return bbox

# def process_answer(img, answer):
#     if extract_bbox(answer) is not None:
#         x1, y1, x2, y2 = extract_bbox(answer)
#         draw_image = Resize(768)(img)
#         width, height = draw_image.size
#         x1, x2 = int(x1 * width), int(x2 * width)
#         y1, y2 = int(y1 * height), int(y2 * height)
#         bbox = (x1, y1, x2, y2)
#         ImageDraw.Draw(draw_image).rectangle(bbox, outline="red", width=3)
#         return gr.update(visible=True, value=draw_image)

#     return gr.update(visible=False, value=None)

with gr.Blocks() as demo:
    # Header / intro text.
    gr.Markdown(
        """
        # 🌔 moondream2
        A tiny vision language model. [GitHub](https://github.com/vikhyat/moondream)
        """
    )
    # Top row: the question textbox plus a submit button.
    with gr.Row():
        prompt_box = gr.Textbox(label="Input", value="Describe this image.", scale=4)
        submit_btn = gr.Button("Submit")
    # Second row: image upload on the left; the model's answer (and a
    # hidden slot for an annotated image) on the right.
    with gr.Row():
        image_in = gr.Image(type="pil", label="Upload an Image")
        with gr.Column():
            answer_box = gr.Text(label="Response")
            annotated = gr.Image(visible=False, label="Annotated Image")

    # Both clicking the button and pressing Enter in the textbox run inference.
    for trigger in (submit_btn.click, prompt_box.submit):
        trigger(answer_question, [image_in, prompt_box], answer_box)
    # answer_box.change(process_answer, [image_in, answer_box], annotated, show_progress=False)

demo.queue().launch(debug=True, show_error=True)