File size: 6,461 Bytes
ade70cf
1d51385
d69fd19
d7f29ce
ade70cf
1d51385
d7f29ce
d9cf2fe
d7f29ce
ade70cf
 
d7f29ce
ade70cf
1d51385
d7f29ce
39ae23a
 
ade70cf
d502400
45845d0
110cd3e
d502400
 
 
45845d0
afe3c68
d502400
ade70cf
d256f3b
0ac529d
ca16909
d256f3b
 
 
2f7ee0a
 
 
 
 
69958d1
beec895
3d8d1aa
d502400
 
1d51385
 
 
 
69958d1
ade70cf
 
 
 
 
 
 
 
 
 
1d51385
 
ade70cf
 
 
 
 
 
 
 
 
 
 
 
1d51385
 
ade70cf
 
6172e67
ade70cf
1d51385
ade70cf
 
1d51385
 
 
 
 
 
 
 
 
 
 
 
 
 
6172e67
 
 
 
 
 
 
 
 
1d51385
 
 
 
 
 
 
 
 
 
 
 
 
 
45845d0
6172e67
5826c2f
628b60d
d502400
6172e67
 
d502400
6172e67
 
 
84d0e49
 
 
 
 
 
 
 
 
 
 
 
6172e67
 
 
 
 
 
 
 
4d69588
 
628b60d
4d69588
 
 
 
6172e67
 
 
1d51385
6172e67
 
45845d0
4d69588
90d93e3
6172e67
 
 
84d0e49
6172e67
 
d502400
6172e67
84d0e49
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
import gradio as gr
from transformers import AutoProcessor, AutoModelForCausalLM
import spaces

import requests
import copy

from PIL import Image, ImageDraw, ImageFont 
import io
import matplotlib.pyplot as plt
import matplotlib.patches as patches

import random
import numpy as np

import os
import subprocess

# Install flash-attn at startup, before any model is loaded.
# The child environment is the current environment plus
# FLASH_ATTENTION_SKIP_CUDA_BUILD; passing only the extra variable would
# replace the whole environment and drop PATH, HOME, virtualenv settings, etc.
# NOTE(review): the flag presumably tells the flash-attn build to skip
# compiling CUDA kernels — confirm against the flash-attn setup script.
subprocess.run(
    'pip install flash-attn --no-build-isolation',
    env={**os.environ, 'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
    shell=True,
)

# Every checkpoint selectable in the UI is loaded once at import time,
# moved to the GPU and switched to eval mode. Keys are the checkpoint ids.
models = {
    checkpoint: AutoModelForCausalLM.from_pretrained(checkpoint, trust_remote_code=True).to("cuda").eval()
    for checkpoint in (
        'J-LAB/Florence_2_B_FluxiAI_Product_Caption',
        'J-LAB/Florence_2_L_FluxiAI_Product_Caption',
    )
}

# One processor per loaded model, keyed by the same checkpoint ids.
processors = {
    checkpoint: AutoProcessor.from_pretrained(checkpoint, trust_remote_code=True)
    for checkpoint in models
}


# Markdown heading rendered at the top of the page.
DESCRIPTION = "# [Florence-2 Product Describe by Fluxi IA](https://huggingface.co/microsoft/Florence-2-large)"

# Named colors the drawing helpers pick from at random.
colormap = [
    'blue', 'orange', 'green', 'purple', 'brown',
    'pink', 'gray', 'olive', 'cyan', 'red',
    'lime', 'indigo', 'violet', 'aqua', 'magenta',
    'coral', 'gold', 'tan', 'skyblue',
]

def fig_to_pil(fig):
    """Save *fig* as PNG into an in-memory buffer and open it as a PIL image.

    Args:
        fig: Object exposing ``savefig(buffer, format=...)``, e.g. the figure
            returned by ``plot_bbox``.

    Returns:
        The image opened by ``Image.open`` from the in-memory PNG data.
    """
    png_buffer = io.BytesIO()
    fig.savefig(png_buffer, format='png')
    # Rewind so the image is read from the start of the buffer.
    png_buffer.seek(0)
    return Image.open(png_buffer)

@spaces.GPU
def run_example(task_prompt, image, text_input=None, model_id='J-LAB/Florence_2_B_FluxiAI_Product_Caption'):
    """Run one generation pass for *task_prompt* on *image*.

    Args:
        task_prompt: Task token sent to the model (e.g. ``'<PC>'``); also
            passed to the processor's post-processing step as ``task``.
        image: Image object exposing ``width`` and ``height`` (the caller in
            this file passes a PIL image).
        text_input: Optional text appended directly to ``task_prompt``.
        model_id: Key into the module-level ``models`` / ``processors``
            dicts. The default is one of the loaded checkpoints; the previous
            default was not a key of either dict and raised ``KeyError``.

    Returns:
        Whatever ``processor.post_process_generation`` returns. The caller in
        this file indexes it by ``task_prompt``, so it is presumably a dict
        keyed by the task token — confirm against the processor.

    Raises:
        KeyError: If ``model_id`` is not a loaded checkpoint.
    """
    model = models[model_id]
    processor = processors[model_id]

    prompt = task_prompt if text_input is None else task_prompt + text_input

    inputs = processor(text=prompt, images=image, return_tensors="pt").to("cuda")
    # Deterministic decoding: beam search with 3 beams, no sampling.
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        early_stopping=False,
        do_sample=False,
        num_beams=3,
    )
    # Special tokens are kept in the decoded text handed to post-processing.
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    return processor.post_process_generation(
        generated_text,
        task=task_prompt,
        image_size=(image.width, image.height),
    )

def plot_bbox(image, data):
    """Draw labelled bounding boxes over *image* on a new Matplotlib figure.

    Args:
        image: Image accepted by ``Axes.imshow``.
        data: Mapping with ``'bboxes'`` (each unpacked as ``x1, y1, x2, y2``)
            and ``'labels'``; the two are paired positionally.

    Returns:
        The figure that was created; closing it is left to the caller.
    """
    fig, ax = plt.subplots()
    ax.imshow(image)
    for (x1, y1, x2, y2), label in zip(data['bboxes'], data['labels']):
        rect = patches.Rectangle(
            (x1, y1), x2 - x1, y2 - y1,
            linewidth=1, edgecolor='r', facecolor='none',
        )
        ax.add_patch(rect)
        # Write on the axes created above rather than through pyplot's
        # global "current axes", which other code may have changed.
        ax.text(x1, y1, label, color='white', fontsize=8, bbox=dict(facecolor='red', alpha=0.5))
    ax.axis('off')
    return fig

def draw_polygons(image, prediction, fill_mask=False):
    """Draw labelled polygons onto *image* in place and return it.

    Args:
        image: Image accepted by ``ImageDraw.Draw``; it is modified in place.
        prediction: Mapping with ``'polygons'`` (one group of polygons per
            label, each polygon reshaped to x/y pairs) and ``'labels'``.
        fill_mask: When true, each polygon is also filled with a randomly
            chosen color; otherwise only the outline is drawn.

    Returns:
        The same ``image`` object that was passed in.
    """
    scale = 1
    canvas = ImageDraw.Draw(image)
    for polygon_group, caption in zip(prediction['polygons'], prediction['labels']):
        # One outline color (and optionally one fill color) per label.
        outline_color = random.choice(colormap)
        fill_color = random.choice(colormap) if fill_mask else None
        for raw_points in polygon_group:
            vertices = np.array(raw_points).reshape(-1, 2)
            if len(vertices) < 3:
                # Fewer than three vertices cannot form a polygon.
                print('Invalid polygon:', vertices)
                continue
            flat_points = (vertices * scale).reshape(-1).tolist()
            # fill=None draws the outline only, as when fill_mask is false.
            canvas.polygon(flat_points, outline=outline_color, fill=fill_color)
            # Label sits slightly offset from the first vertex.
            canvas.text((flat_points[0] + 8, flat_points[1] + 2), caption, fill=outline_color)
    return image

def convert_to_od_format(data):
    """Re-key *data* into the ``{'bboxes', 'labels'}`` layout used by ``plot_bbox``.

    Args:
        data: Mapping that may contain ``'bboxes'`` and ``'bboxes_labels'``.

    Returns:
        A dict with ``'bboxes'`` taken from ``data['bboxes']`` and ``'labels'``
        taken from ``data['bboxes_labels']``; a missing key yields ``[]``.
    """
    return {
        'bboxes': data.get('bboxes', []),
        'labels': data.get('bboxes_labels', []),
    }

def draw_ocr_bboxes(image, prediction):
    """Outline each OCR box on *image* in place and write its label next to it.

    Args:
        image: Image accepted by ``ImageDraw.Draw``; it is modified in place.
        prediction: Mapping with ``'quad_boxes'`` (flat coordinate sequences
            passed to ``polygon``) and ``'labels'``, paired positionally.

    Returns:
        The same ``image`` object that was passed in.
    """
    scale = 1
    canvas = ImageDraw.Draw(image)
    for quad, text in zip(prediction['quad_boxes'], prediction['labels']):
        stroke = random.choice(colormap)
        points = (np.array(quad) * scale).tolist()
        canvas.polygon(points, width=3, outline=stroke)
        # Label sits slightly offset from the first coordinate pair.
        anchor = (points[0] + 8, points[1] + 2)
        canvas.text(anchor, f"{text}", align="right", fill=stroke)
    return image

# Maps the task names offered in the UI to the prompt tokens sent to the model.
_TASK_PROMPT_TOKENS = {
    'Product Caption': '<PC>',
    'More Detailed Caption': '<MORE_DETAILED_CAPTION>',
}


def process_image(image, task_prompt, text_input=None, model_id='J-LAB/Florence_2_B_FluxiAI_Product_Caption'):
    """Caption *image* for the selected task and return HTML-ready text.

    Args:
        image: NumPy array from the image input, or ``None`` when nothing
            was uploaded.
        task_prompt: Task name as shown in the UI (``'Product Caption'`` or
            ``'More Detailed Caption'``).
        text_input: Accepted because the UI passes it, but currently not
            forwarded to the model.
        model_id: Checkpoint id forwarded to ``run_example``.

    Returns:
        ``(output_text, None)``. ``output_text`` is the generated caption with
        newlines turned into ``<br>``, or ``""`` when there is no image, the
        task is unknown, or the result has no entry for the task token. The
        second element is always ``None`` (no output image is produced).
    """
    task_token = _TASK_PROMPT_TOKENS.get(task_prompt)
    # Nothing to do without an image or for a task we have no token for;
    # checked first so a missing upload does not crash in Image.fromarray.
    if image is None or task_token is None:
        return "", None

    pil_image = Image.fromarray(image)  # Convert NumPy array to PIL Image
    results = run_example(task_token, pil_image, model_id=model_id)

    # Pull the caption out from under its task-token key.
    if results and task_token in results:
        output_text = results[task_token]
    else:
        output_text = ""

    # Convert newline characters to HTML line breaks (one <br> per newline).
    output_text = output_text.replace("\n", "<br>")

    return output_text, None


# Custom stylesheet handed to gr.Blocks: a fixed-height, scrollable, bordered
# box for the element with id "output".
css = """
  #output {
    height: 500px; 
    overflow: auto; 
    border: 1px solid #ccc; 
  }
"""


# Task names offered in the "Task Prompt" dropdown.
single_task_list = ['Product Caption', 'More Detailed Caption']



# Build the UI: inputs in the left column, outputs in the right column.
with gr.Blocks(css=css) as demo:
    gr.Markdown(DESCRIPTION)
    with gr.Tab(label="Florence-2 Image Captioning"):
        with gr.Row():
            with gr.Column():
                input_img = gr.Image(label="Input Picture")
                model_selector = gr.Dropdown(choices=list(models.keys()), label="Model", value='J-LAB/Florence_2_B_FluxiAI_Product_Caption')
                # Not wired to any callback; only displayed.
                task_type = gr.Radio(choices=['Single task', 'Cascased task'], label='Task type selector', value='Single task')
                # The default must be one of the offered choices; otherwise
                # an untouched form submits an unknown task and gets an
                # empty result.
                task_prompt = gr.Dropdown(choices=single_task_list, label="Task Prompt", value=single_task_list[0])
                text_input = gr.Textbox(label="Text Input (optional)")
                submit_btn = gr.Button(value="Submit")
            with gr.Column():
                output_text = gr.HTML(label="Output Text")
                output_img = gr.Image(label="Output Image")

        # process_image returns (html_text, None), matching the two outputs.
        submit_btn.click(process_image, [input_img, task_prompt, text_input, model_selector], [output_text, output_img])

demo.launch(debug=True)