artificialguybr committed
Commit e3457ad · 1 Parent(s): 3944bd4

Update app.py

Files changed (1)
  1. app.py +116 -55
app.py CHANGED
@@ -5,64 +5,125 @@ from PIL import Image
  import re
  import requests
  from io import BytesIO
+ import copy
+ import secrets
+ from pathlib import Path

  tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-VL-Chat-Int4", trust_remote_code=True)
  model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-VL-Chat-Int4", device_map="auto", trust_remote_code=True).eval()

- def generate_predictions(image_input, text_input, with_grounding):
-     user_image_path = "/tmp/user_input_test_image.jpg"
-     original_image = Image.fromarray((255 - (image_input * 255).astype('uint8')))
-     original_image.save(user_image_path)
-
-     if with_grounding == "Yes":
-         text_input += " with grounding"
-
-     query = tokenizer.from_list_format([
-         {'image': user_image_path},
-         {'text': text_input},
-     ])
-     inputs = tokenizer(query, return_tensors='pt')
-     inputs = inputs.to(model.device)
-
-     pred = model.generate(**inputs)
-     full_response = tokenizer.decode(pred.cpu()[0], skip_special_tokens=False)
-
-     frontend_response = re.sub(r'Picture \d+:|<.*?>|\/tmp\/.*\.jpg', '', full_response).replace(text_input, '').strip()
-     print("Generated Caption:", frontend_response)  # Debugging line
-
-     image_with_boxes = tokenizer.draw_bbox_on_latest_picture(full_response)
-
-     # Check if the response contains bounding box coordinates
-     if not re.search(r'\(\d+,\d+\),\(\d+,\d+\)', frontend_response):
-         image_with_boxes = original_image
-
-     if image_with_boxes:
-         temp_path = "/tmp/image_with_boxes.jpg"
-         image_with_boxes.save(temp_path)
-         image_with_boxes = Image.open(temp_path)
-
-     return image_with_boxes, frontend_response
-
- iface = gr.Interface(
-     fn=generate_predictions,
-     inputs=[
-         gr.inputs.Image(label="Image Input"),
-         gr.inputs.Textbox(default="Generate a caption for that image:", label="Prompt"),
-         gr.inputs.Radio(["No", "Yes"], label="With Grounding", default="No")
-     ],
-     outputs=[
-         gr.outputs.Image(type='pil', label="Image"),
-         gr.outputs.Textbox(label="Generated")
-     ],
-     title="Qwen-VL Demonstration",
-     description="""
-     ## Qwen-VL: A Multimodal Large Vision Language Model by Alibaba Cloud
-     **Space by [@Artificialguybr](https://twitter.com/artificialguybr)**
-
-     ### Key Features:
-     - **Strong Performance**: Surpasses existing LVLMs on multiple English benchmarks including Zero-shot Captioning and VQA.
-     - **Multi-lingual Support**: Supports English, Chinese, and multi-lingual conversation.
-     - **High Resolution**: Utilizes 448*448 resolution for fine-grained recognition and understanding.
-     """,
- )
- iface.launch()
+ BOX_TAG_PATTERN = r"<box>([\s\S]*?)</box>"
+ PUNCTUATION = "!?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."
+
+ def _parse_text(text):
+     lines = text.split("\n")
+     lines = [line for line in lines if line != ""]
+     count = 0
+     for i, line in enumerate(lines):
+         if "```" in line:
+             count += 1
+             items = line.split("`")
+             if count % 2 == 1:
+                 lines[i] = f'<pre><code class="language-{items[-1]}">'
+             else:
+                 lines[i] = f"<br></code></pre>"
+         else:
+             if i > 0:
+                 if count % 2 == 1:
+                     line = line.replace("`", r"\`")
+                     line = line.replace("<", "&lt;")
+                     line = line.replace(">", "&gt;")
+                     line = line.replace(" ", "&nbsp;")
+                     line = line.replace("*", "&ast;")
+                     line = line.replace("_", "&lowbar;")
+                     line = line.replace("-", "&#45;")
+                     line = line.replace(".", "&#46;")
+                     line = line.replace("!", "&#33;")
+                     line = line.replace("(", "&#40;")
+                     line = line.replace(")", "&#41;")
+                     line = line.replace("$", "&#36;")
+                 lines[i] = "<br>" + line
+     text = "".join(lines)
+     return text
+
+ def predict(_chatbot, task_history):
+     chat_query = _chatbot[-1][0]
+     query = task_history[-1][0]
+     history_cp = copy.deepcopy(task_history)
+     full_response = ""
+
+     history_filter = []
+     pic_idx = 1
+     pre = ""
+     for i, (q, a) in enumerate(history_cp):
+         if isinstance(q, (tuple, list)):
+             q = f'Picture {pic_idx}: <img>{q[0]}</img>'
+             pre += q + '\n'
+             pic_idx += 1
+         else:
+             pre += q
+             history_filter.append((pre, a))
+             pre = ""
+     history, message = history_filter[:-1], history_filter[-1][0]
+     response, history = model.chat(tokenizer, message, history=history)
+     image = tokenizer.draw_bbox_on_latest_picture(response, history)
+     if image is not None:
+         temp_dir = secrets.token_hex(20)
+         temp_dir = Path("/tmp") / temp_dir
+         temp_dir.mkdir(exist_ok=True, parents=True)
+         name = f"tmp{secrets.token_hex(5)}.jpg"
+         filename = temp_dir / name
+         image.save(str(filename))
+         _chatbot[-1] = (_parse_text(chat_query), (str(filename),))
+         chat_response = response.replace("<ref>", "")
+         chat_response = chat_response.replace(r"</ref>", "")
+         chat_response = re.sub(BOX_TAG_PATTERN, "", chat_response)
+         if chat_response != "":
+             _chatbot.append((None, chat_response))
+     else:
+         _chatbot[-1] = (_parse_text(chat_query), response)
+     full_response = _parse_text(response)
+     task_history[-1] = (query, full_response)
+     return _chatbot
+
+ def add_text(history, task_history, text):
+     task_text = text
+     if len(text) >= 2 and text[-1] in PUNCTUATION and text[-2] not in PUNCTUATION:
+         task_text = text[:-1]
+     history = history + [(_parse_text(text), None)]
+     task_history = task_history + [(task_text, None)]
+     return history, task_history, ""
+
+ def add_file(history, task_history, file):
+     history = history + [((file.name,), None)]
+     task_history = task_history + [((file.name,), None)]
+     return history, task_history
+
+ def reset_user_input():
+     return gr.update(value="")
+
+ def reset_state(task_history):
+     task_history.clear()
+     return []
+
+ with gr.Blocks() as demo:
+     gr.Markdown("<center><font size=8>Qwen-VL-Chat Bot</center>")
+     gr.Markdown("<center><font size=3>Qwen-VL: A Multimodal Large Vision Language Model by Alibaba Cloud. **Space by [@Artificialguybr](https://twitter.com/artificialguybr)**</center>")
+     gr.Markdown("### Key Features:\n- **Strong Performance**: Surpasses existing LVLMs on multiple English benchmarks including Zero-shot Captioning and VQA.\n- **Multi-lingual Support**: Supports English, Chinese, and multi-lingual conversation.\n- **High Resolution**: Utilizes 448*448 resolution for fine-grained recognition and understanding.")
+     chatbot = gr.Chatbot(label='Qwen-VL-Chat', elem_classes="control-height", height=750)
+     query = gr.Textbox(lines=2, label='Input')
+     task_history = gr.State([])
+
+     with gr.Row():
+         empty_bin = gr.Button("🧹 Clear History")
+         submit_btn = gr.Button("🚀 Submit")
+         regen_btn = gr.Button("🤔️ Regenerate")
+         addfile_btn = gr.UploadButton("📁 Upload", file_types=["image"])
+
+     submit_btn.click(add_text, [chatbot, task_history, query], [chatbot, task_history]).then(
+         predict, [chatbot, task_history], [chatbot], show_progress=True
+     )
+     submit_btn.click(reset_user_input, [], [query])
+     empty_bin.click(reset_state, [task_history], [chatbot], show_progress=True)
+     # Route uploaded images into both histories so predict() can reference them.
+     addfile_btn.upload(add_file, [chatbot, task_history, addfile_btn], [chatbot, task_history], show_progress=True)
+
+ demo.launch()
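
For reference, the multi-turn path above boils down to two calls that the Qwen-VL-Chat model card documents: `model.chat` for the reply and `tokenizer.draw_bbox_on_latest_picture` for grounded boxes. A minimal sketch of the same calls outside Gradio; `demo.jpeg` is a hypothetical local image path:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-VL-Chat-Int4", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen-VL-Chat-Int4", device_map="auto", trust_remote_code=True
).eval()

# Same "Picture N: <img>path</img>" message format that predict() assembles
# from task_history. "demo.jpeg" is a made-up path for illustration.
message = "Picture 1: <img>demo.jpeg</img>\nWhere is the dog in the picture?"
response, history = model.chat(tokenizer, message, history=None)

# Returns a PIL image with boxes drawn when the response contains
# <box>...</box> spans, otherwise None -- the same branch predict() checks.
image = tokenizer.draw_bbox_on_latest_picture(response, history)
if image is not None:
    image.save("boxes.jpg")
```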
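
`_parse_text` exists because the Chatbot widget renders the HTML it produces: fenced code blocks in a model reply become `<pre><code>` markup and the characters inside are HTML-escaped. A small illustration of its behavior; the reply string is made up:

```python
reply = "Here is an example:\n```python\nprint('hi')\n```"
print(_parse_text(reply))
# Here is an example:<pre><code class="language-python"><br>print&#40;'hi'&#41;<br></code></pre>
```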
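
`add_text` keeps the user's text verbatim for display but trims a single trailing punctuation mark (anything in `PUNCTUATION`) from the query sent to the model; the empty string it returns clears the input box. A hypothetical call with empty histories:

```python
history, task_history, cleared = add_text([], [], "What is in this picture?")
# history      == [("What is in this picture?", None)]  # shown with the "?"
# task_history == [("What is in this picture", None)]   # "?" trimmed for the model
# cleared      == ""                                    # resets the textbox
```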