vigneshwar472 committed on
Commit 2ad0a56 · verified · 1 Parent(s): 6efe9e0

Create app.py

Files changed (1): app.py (+188, −0)
app.py ADDED
@@ -0,0 +1,188 @@
+ # import libraries and load the models
+ import time
+ import torch
+ import gradio as gr
+ from PIL import Image, ImageDraw, ImageFont
+ from transformers import AutoProcessor, Owlv2ForObjectDetection, Qwen2VLForConditionalGeneration
+
+ obj_processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble")
+ obj_model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")
+
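+ # Distinct RGB colors, cycled by label index when drawing boxes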
+ colors = [
+     (255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 165, 0), (75, 0, 130),
+     (255, 255, 0), (0, 255, 255), (255, 105, 180), (138, 43, 226), (0, 128, 0),
+     (0, 128, 128), (255, 20, 147), (64, 224, 208), (128, 0, 128), (70, 130, 180),
+     (220, 20, 60), (255, 140, 0), (34, 139, 34), (218, 112, 214), (255, 99, 71),
+     (47, 79, 79), (186, 85, 211), (240, 230, 140), (169, 169, 169), (199, 21, 133)
+ ]
+
+ def detect_objects(image, objects):
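+     # OWLv2 expects one list of text queries per image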
+     texts = [objects]
+     inputs = obj_processor(text=texts, images=image, return_tensors="pt")
+
+     with torch.no_grad():
+         outputs = obj_model(**inputs)
+
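+     # Rescale boxes to the original image size; image.size is (width, height)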
+     target_sizes = torch.Tensor([image.size[::-1]])
+     results = obj_processor.post_process_object_detection(
+         outputs=outputs, threshold=0.2, target_sizes=target_sizes
+     )
+
+     # Single image in the batch, so take the first result
+     boxes, scores, labels = results[0]["boxes"], results[0]["scores"], results[0]["labels"]
+     return image, boxes, scores, labels
+
+ def annotate_image(image, boxes, scores, labels, objects):
+     draw = ImageDraw.Draw(image)
+     font = ImageFont.load_default()
+
+     for box, score, label in zip(boxes, scores, labels):
+         box = [round(coord, 2) for coord in box.tolist()]
+         label = int(label)  # tensor -> plain int for list indexing
+         color = colors[label % len(colors)]
+         draw.rectangle(box, outline=color, width=3)
+         draw.text((box[0], box[1]), f"{objects[label]}: {score:.2f}", font=font, fill=color)
+
+     return image
+
+
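+ # device_map="auto" places the Qwen2-VL chat model on GPU when available, else CPU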
+ cbt_model = Qwen2VLForConditionalGeneration.from_pretrained(
+     "Qwen/Qwen2-VL-2B-Instruct",
+     torch_dtype="auto",
+     device_map="auto",
+ )
+
+ cbt_processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
+
+ # Shared conversation history passed to the chat model on every turn
+ history = [
+     {
+         "role": "system",
+         "content": [
+             {
+                 "type": "image",
+             },
+             {
+                 "type": "text",
+                 "text": "You are a conversational image recognition chatbot. Communicate with humans in natural language. Recognize images, reason about spatial relationships, and answer questions concisely. Generate the best response for each user query; it must be lexically and grammatically correct.",
+             },
+         ],
+     }
+ ]
+
+ with gr.Blocks() as demo:
+
+     with gr.Row():
+
+         with gr.Column(scale=1):
+
+             gr.Markdown("## Upload an Image")
+             image_input = gr.Image(type="pil", label="Upload your image here")
+             objects_input = gr.Textbox(label="Enter the objects to detect (comma-separated)", placeholder="e.g. 'cat, dog, car'")
+             image_output = gr.Image(type="pil", label="Detected Objects")
+
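+             # Detect the requested objects, draw boxes, and tell the chat model what was found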
+             def run_object_detection(image, objects):
+                 object_list = [obj.strip() for obj in objects.split(",")]
+                 image, boxes, scores, labels = detect_objects(image, object_list)
+                 annotated_image = annotate_image(image, boxes, scores, labels, object_list)
+                 # Map label indices back to the query strings so the chatbot sees names, not tensor indices
+                 detected = ", ".join(object_list[int(label)] for label in labels)
+                 history.append({
+                     "role": "system",
+                     "content": [
+                         {
+                             "type": "text",
+                             "text": f"The objects detected in the image are: {detected}"
+                         }
+                     ]
+                 })
+                 return annotated_image
+
+             detect_button = gr.Button("Detect Objects")
+             detect_button.click(fn=run_object_detection, inputs=[image_input, objects_input], outputs=image_output)
+
+         with gr.Column(scale=2):
+
+             chatbot = gr.Chatbot()
+             msg = gr.Textbox()
+             clear = gr.ClearButton([msg, chatbot])
+
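+             # Append the user turn to the visible chat; the bot reply streams in below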
+             def user(message, chat_history):
+                 return "", chat_history + [[message, ""]]
+
+             def chat_function(image, chat_history):
+
+                 message = ""
+
+                 if chat_history[-1][0] is not None:
+                     message = str(chat_history[-1][0])
+
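+                 # Record the user turn in the model-facing history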
+                 history.append({
+                     "role": "user",
+                     "content": [
+                         {
+                             "type": "text",
+                             "text": message
+                         }
+                     ]
+                 })
+
+                 text_prompt = cbt_processor.apply_chat_template(history, add_generation_prompt=True)
+
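+                 # The processor pairs the uploaded image with the image placeholder in the system turn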
+                 inputs = cbt_processor(
+                     text=[text_prompt],
+                     images=[image],
+                     padding=True,
+                     return_tensors="pt"
+                 )
+
+                 # Use the model's device rather than hard-coding "cuda", so CPU-only runs also work
+                 inputs = inputs.to(cbt_model.device)
+
+                 output_ids = cbt_model.generate(**inputs, max_new_tokens=1024)
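+
+                 # Keep only the newly generated tokens, dropping the echoed prompt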
+                 generated_ids = [
+                     out_ids[len(in_ids):]
+                     for in_ids, out_ids in zip(inputs.input_ids, output_ids)
+                 ]
+
+                 bot_output = cbt_processor.batch_decode(
+                     generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
+                 )[0]
+
+                 history.append({
+                     "role": "assistant",
+                     "content": [
+                         {
+                             "type": "text",
+                             "text": bot_output
+                         }
+                     ]
+                 })
+
+                 # batch_decode already returned a plain string above, so stream the
+                 # reply character by character for a typing effect
+                 chat_history[-1][1] = ""
+                 for character in bot_output:
+                     chat_history[-1][1] += character
+                     time.sleep(0.05)
+                     yield chat_history
+
+             msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(chat_function, [image_input, chatbot], [chatbot])
+             clear.click(lambda: None, None, chatbot, queue=False)
+
+
+ demo.launch(debug=True)