xzerus committed
Commit f2b2dae · verified · 1 parent: dfdba19

Update app.py

Files changed (1)
  1. app.py  +42 -163
app.py CHANGED
@@ -1,14 +1,15 @@
  import numpy as np
  import torch
  import torchvision.transforms as T
- from decord import VideoReader, cpu
  from PIL import Image
  from torchvision.transforms.functional import InterpolationMode
  from transformers import AutoModel, AutoTokenizer
+ import gradio as gr

  IMAGENET_MEAN = (0.485, 0.456, 0.406)
  IMAGENET_STD = (0.229, 0.224, 0.225)

+ # Build the image transform
  def build_transform(input_size):
      MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
      transform = T.Compose([
@@ -19,195 +20,73 @@ def build_transform(input_size):
      ])
      return transform

- def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
-     best_ratio_diff = float('inf')
-     best_ratio = (1, 1)
-     area = width * height
-     for ratio in target_ratios:
-         target_aspect_ratio = ratio[0] / ratio[1]
-         ratio_diff = abs(aspect_ratio - target_aspect_ratio)
-         if ratio_diff < best_ratio_diff:
-             best_ratio_diff = ratio_diff
-             best_ratio = ratio
-         elif ratio_diff == best_ratio_diff:
-             if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
-                 best_ratio = ratio
-     return best_ratio
-
+ # Dynamic preprocessing
  def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
      orig_width, orig_height = image.size
      aspect_ratio = orig_width / orig_height
-
-     # calculate the existing image aspect ratio
-     target_ratios = set(
-         (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
-         i * j <= max_num and i * j >= min_num)
-     target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
-
-     # find the closest aspect ratio to the target
-     target_aspect_ratio = find_closest_aspect_ratio(
-         aspect_ratio, target_ratios, orig_width, orig_height, image_size)
-
-     # calculate the target width and height
+     target_ratios = sorted(
+         set((i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if i * j <= max_num and i * j >= min_num),
+         key=lambda x: x[0] * x[1]
+     )
+     target_aspect_ratio = target_ratios[0]
      target_width = image_size * target_aspect_ratio[0]
      target_height = image_size * target_aspect_ratio[1]
      blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
-
-     # resize the image
      resized_img = image.resize((target_width, target_height))
-     processed_images = []
-     for i in range(blocks):
-         box = (
+     processed_images = [
+         resized_img.crop((
              (i % (target_width // image_size)) * image_size,
              (i // (target_width // image_size)) * image_size,
              ((i % (target_width // image_size)) + 1) * image_size,
              ((i // (target_width // image_size)) + 1) * image_size
-         )
-         # split the image
-         split_img = resized_img.crop(box)
-         processed_images.append(split_img)
-     assert len(processed_images) == blocks
+         ))
+         for i in range(blocks)
+     ]
      if use_thumbnail and len(processed_images) != 1:
          thumbnail_img = image.resize((image_size, image_size))
          processed_images.append(thumbnail_img)
      return processed_images

- def load_image(image_file, input_size=448, max_num=12):
-     image = Image.open(image_file).convert('RGB')
+ # Load image dynamically from user upload
+ def load_image(image, input_size=448, max_num=12):
      transform = build_transform(input_size=input_size)
      images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
      pixel_values = [transform(image) for image in images]
      pixel_values = torch.stack(pixel_values)
      return pixel_values

- # If you want to load a model using multiple GPUs, please refer to the `Multiple GPUs` section.
+ # Load the model and tokenizer
  path = 'OpenGVLab/InternVL2_5-1B'
  model = AutoModel.from_pretrained(
      path,
      torch_dtype=torch.bfloat16,
      low_cpu_mem_usage=True,
      use_flash_attn=True,
-     trust_remote_code=True).eval().cuda()
- tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)
-
- # set the max number of tiles in `max_num`
- pixel_values = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
- generation_config = dict(max_new_tokens=1024, do_sample=True)
-
- # pure-text conversation
- question = 'Hello, who are you?'
- response, history = model.chat(tokenizer, None, question, generation_config, history=None, return_history=True)
- print(f'User: {question}\nAssistant: {response}')
-
- question = 'Can you tell me a story?'
- response, history = model.chat(tokenizer, None, question, generation_config, history=history, return_history=True)
- print(f'User: {question}\nAssistant: {response}')
-
- # single-image single-round conversation
- question = '<image>\nPlease describe the image shortly.'
- response = model.chat(tokenizer, pixel_values, question, generation_config)
- print(f'User: {question}\nAssistant: {response}')
-
- # single-image multi-round conversation
- question = '<image>\nPlease describe the image in detail.'
- response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
- print(f'User: {question}\nAssistant: {response}')
-
- question = 'Please write a poem according to the image.'
- response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=history, return_history=True)
- print(f'User: {question}\nAssistant: {response}')
-
- # multi-image multi-round conversation, combined images
- pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
- pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
- pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
+     trust_remote_code=True
+ ).eval().cuda()

- question = '<image>\nDescribe the two images in detail.'
- response, history = model.chat(tokenizer, pixel_values, question, generation_config,
-                                history=None, return_history=True)
- print(f'User: {question}\nAssistant: {response}')
-
- question = 'What are the similarities and differences between these two images.'
- response, history = model.chat(tokenizer, pixel_values, question, generation_config,
-                                history=history, return_history=True)
- print(f'User: {question}\nAssistant: {response}')
-
- # multi-image multi-round conversation, separate images
- pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
- pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
- pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
- num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
-
- question = 'Image-1: <image>\nImage-2: <image>\nDescribe the two images in detail.'
- response, history = model.chat(tokenizer, pixel_values, question, generation_config,
-                                num_patches_list=num_patches_list,
-                                history=None, return_history=True)
- print(f'User: {question}\nAssistant: {response}')
-
- question = 'What are the similarities and differences between these two images.'
- response, history = model.chat(tokenizer, pixel_values, question, generation_config,
-                                num_patches_list=num_patches_list,
-                                history=history, return_history=True)
- print(f'User: {question}\nAssistant: {response}')
-
- # batch inference, single image per sample
- pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
- pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
- num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
- pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
-
- questions = ['<image>\nDescribe the image in detail.'] * len(num_patches_list)
- responses = model.batch_chat(tokenizer, pixel_values,
-                              num_patches_list=num_patches_list,
-                              questions=questions,
-                              generation_config=generation_config)
- for question, response in zip(questions, responses):
-     print(f'User: {question}\nAssistant: {response}')
-
- # video multi-round conversation
- def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
-     if bound:
-         start, end = bound[0], bound[1]
-     else:
-         start, end = -100000, 100000
-     start_idx = max(first_idx, round(start * fps))
-     end_idx = min(round(end * fps), max_frame)
-     seg_size = float(end_idx - start_idx) / num_segments
-     frame_indices = np.array([
-         int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
-         for idx in range(num_segments)
-     ])
-     return frame_indices
-
- def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=32):
-     vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
-     max_frame = len(vr) - 1
-     fps = float(vr.get_avg_fps())
-
-     pixel_values_list, num_patches_list = [], []
-     transform = build_transform(input_size=input_size)
-     frame_indices = get_index(bound, fps, max_frame, first_idx=0, num_segments=num_segments)
-     for frame_index in frame_indices:
-         img = Image.fromarray(vr[frame_index].asnumpy()).convert('RGB')
-         img = dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num)
-         pixel_values = [transform(tile) for tile in img]
-         pixel_values = torch.stack(pixel_values)
-         num_patches_list.append(pixel_values.shape[0])
-         pixel_values_list.append(pixel_values)
-     pixel_values = torch.cat(pixel_values_list)
-     return pixel_values, num_patches_list
-
- video_path = './examples/red-panda.mp4'
- pixel_values, num_patches_list = load_video(video_path, num_segments=8, max_num=1)
- pixel_values = pixel_values.to(torch.bfloat16).cuda()
- video_prefix = ''.join([f'Frame{i+1}: <image>\n' for i in range(len(num_patches_list))])
- question = video_prefix + 'What is the red panda doing?'
- # Frame1: <image>\nFrame2: <image>\n...\nFrame8: <image>\n{question}
- response, history = model.chat(tokenizer, pixel_values, question, generation_config,
-                                num_patches_list=num_patches_list, history=None, return_history=True)
- print(f'User: {question}\nAssistant: {response}')
+ tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)

- question = 'Describe this video in detail.'
- response, history = model.chat(tokenizer, pixel_values, question, generation_config,
-                                num_patches_list=num_patches_list, history=history, return_history=True)
- print(f'User: {question}\nAssistant: {response}')
+ # Define the function for Gradio interface
+ def process_image(image):
+     try:
+         pixel_values = load_image(image, max_num=12).to(torch.bfloat16).cuda()
+         generation_config = dict(max_new_tokens=1024, do_sample=True)
+         question = '<image>\nPlease describe the image in detail.'
+         response = model.chat(tokenizer, pixel_values, question, generation_config)
+         return response
+     except Exception as e:
+         return f"Error: {str(e)}"
+
+ # Create Gradio Interface
+ demo = gr.Interface(
+     fn=process_image,
+     inputs=gr.Image(type="pil"),
+     outputs="text",
+     title="Dynamic Image Processing with InternVL",
+     description="Upload an image and get detailed responses using the InternVL model."
+ )
+
+ # Launch the Gradio app
+ if __name__ == "__main__":
+     demo.launch(server_name="0.0.0.0", server_port=7860)
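For reference, a minimal sketch (not part of the commit) of how the updated preprocessing path behaves on its own. It assumes a local test image at 'test.jpg' (a hypothetical path) and that build_transform, dynamic_preprocess, and load_image from the new app.py are in scope:

# Minimal sketch, assuming the functions defined in app.py above are importable
# and that 'test.jpg' (hypothetical path) exists locally.
from PIL import Image

img = Image.open('test.jpg').convert('RGB')   # Gradio passes a PIL image to process_image
pixel_values = load_image(img, input_size=448, max_num=12)
# Since target_aspect_ratio is now fixed to target_ratios[0] == (1, 1), dynamic_preprocess
# yields a single 448x448 tile and skips the thumbnail, so the expected shape is [1, 3, 448, 448].
print(pixel_values.shape)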