supersolar committed on
Commit
8c36ce6
·
verified ·
1 Parent(s): 817293e

Rename kaggle_gpu_1.py to kaggle_florence_gpu_1.py

Browse files
Files changed (2) hide show
  1. kaggle_florence_gpu_1.py +134 -0
  2. kaggle_gpu_1.py +0 -255
kaggle_florence_gpu_1.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #gpu0
2
+ %cd /kaggle/florence-sam
3
+ import os
4
+ from typing import Tuple, Optional
5
+ import shutil
6
+ import os
7
+ import cv2
8
+ import numpy as np
9
+ import spaces
10
+ import supervision as sv
11
+ import torch
12
+ from PIL import Image
13
+ from tqdm import tqdm
14
+ import sys
15
+ import json
16
+ import pickle
17
+ os.chdir("/kaggle/florence-sam")
18
+ sys.path.append('/kaggle/florence-sam')
19
+ from utils.video import generate_unique_name, create_directory, delete_directory
20
+ from utils.florencegpu2 import load_florence_model, run_florence_inference, \
21
+ FLORENCE_DETAILED_CAPTION_TASK, \
22
+ FLORENCE_CAPTION_TO_PHRASE_GROUNDING_TASK, FLORENCE_OPEN_VOCABULARY_DETECTION_TASK
23
+ from utils.modes import IMAGE_INFERENCE_MODES, IMAGE_OPEN_VOCABULARY_DETECTION_MODE, \
24
+ IMAGE_CAPTION_GROUNDING_MASKS_MODE, VIDEO_INFERENCE_MODES
25
+ from utils.sam import load_sam_image_model, run_sam_inference, load_sam_video_model
26
+ DEVICE = torch.device("cuda")
27
+ DEVICE = [torch.device(f'cuda:{i}') for i in range(torch.cuda.device_count())][-1]
28
+ #DEVICE = [torch.device(f'cuda:{i}') for i in range(torch.cuda.device_count())][0]
29
+
30
+ torch.autocast(device_type="cuda", dtype=torch.bfloat16).__enter__()
31
+ if torch.cuda.get_device_properties(0).major >= 8:
32
+ torch.backends.cuda.matmul.allow_tf32 = True
33
+ torch.backends.cudnn.allow_tf32 = True
34
+
35
+ FLORENCE_MODEL, FLORENCE_PROCESSOR = load_florence_model(device=DEVICE)
36
+ SAM_IMAGE_MODEL = load_sam_image_model(device=DEVICE)
37
+ '''
38
+ with open('/kaggle/texts.pkl', 'rb') as file:
39
+ texts = pickle.load(file)
40
+ print(texts)
41
+ '''
42
+ with open('/kaggle/output_video1.pkl', 'rb') as file:
43
+ output_video = pickle.load(file)
44
+ print(output_video)
45
+
46
+ VIDEO_SCALE_FACTOR = 1
47
+ VIDEO_TARGET_DIRECTORY = "/kaggle/"
48
+ create_directory(directory_path=VIDEO_TARGET_DIRECTORY)
49
+
50
+
51
+ video_input= output_video
52
+ #texts = ['the table', 'men','ball']
53
+ #VIDEO_TARGET_DIRECTORY = "/content/"
54
+ if not video_input:
55
+ print("Please upload a video.")
56
+
57
+ frame_generator = sv.get_video_frames_generator(video_input)
58
+ frame = next(frame_generator)
59
+ frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
60
+
61
+
62
+
63
+
64
+ detections_list = []
65
+ width, height = frame.size
66
+ all_ok_bboxes = []
67
+ half_area = width * height * 0.5
68
+
69
+ # 存储所有 the table 的边界框和面积
70
+ table_bboxes = []
71
+ table_areas = []
72
+ given_area =1000
73
+ ok_result =[]
74
+ for text in texts:
75
+ _, result = run_florence_inference(
76
+ model=FLORENCE_MODEL,
77
+ processor=FLORENCE_PROCESSOR,
78
+ device=DEVICE,
79
+ image=frame,
80
+ task=FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
81
+ text=text )
82
+ #print(result)
83
+ for bbox, label in zip(result['<OPEN_VOCABULARY_DETECTION>']['bboxes'], result['<OPEN_VOCABULARY_DETECTION>']['bboxes_labels']):
84
+ print(bbox, label)
85
+ new_result = {'<OPEN_VOCABULARY_DETECTION>': {'bboxes': [bbox], 'bboxes_labels': [label], 'polygons': [], 'polygons_labels': []}}
86
+ print(new_result)
87
+ if label == 'ping pong ball':
88
+ # 计算当前 ping pong ball 的面积
89
+ area = (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
90
+ # 检查面积是否不超过给定边界框的面积
91
+ if area <= given_area:
92
+ all_ok_bboxes.append([[bbox[0], bbox[1]], [bbox[2], bbox[3]]])
93
+ ok_result.append(new_result)
94
+ elif label == 'the table':
95
+ # 计算当前 the table 的面积
96
+ print('the tablethe table!!!!')
97
+ area = (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
98
+ table_bboxes.append([[bbox[0] - 100, bbox[1]], [bbox[2] + 100, bbox[3]]])
99
+ table_areas.append(area)
100
+ elif label == 'table tennis bat':
101
+ all_ok_bboxes.append([[bbox[0], bbox[1]], [bbox[2], bbox[3]]])
102
+ ok_result.append(new_result)
103
+ elif label == 'men':
104
+ print('menmne!!!!')
105
+ all_ok_bboxes.append([[bbox[0], bbox[1]], [bbox[2], bbox[3]]])
106
+ ok_result.append(new_result)
107
+
108
+ # 找到面积最大的 the table
109
+ if table_areas:
110
+ max_area_index = table_areas.index(max(table_areas))
111
+ max_area_bbox = table_bboxes[max_area_index]
112
+
113
+ # 检查面积是否超过50%
114
+ if max(table_areas) < half_area:
115
+ all_ok_bboxes.append(max_area_bbox)
116
+ ok_result.append(new_result)
117
+
118
+ print(ok_result)
119
+ with open('/kaggle/all_ok_bboxes.pkl', 'wb') as file:
120
+ pickle.dump(all_ok_bboxes, file)
121
+
122
+ for xyxy in ok_result:
123
+ print(frame.size,xyxy)
124
+ detections = sv.Detections.from_lmm(
125
+ lmm=sv.LMM.FLORENCE_2,
126
+ result=xyxy,
127
+ resolution_wh=frame.size
128
+ )
129
+ detections = run_sam_inference(SAM_IMAGE_MODEL, frame, detections)
130
+ print(detections)
131
+ detections_list.append(detections)
132
+ with open('/kaggle/detections_list.pkl', 'wb') as file:
133
+ pickle.dump(detections_list, file)
134
+ print(detections_list)
kaggle_gpu_1.py DELETED
@@ -1,255 +0,0 @@
1
- import os
2
- from typing import Tuple, Optional
3
- import shutil
4
- import os
5
- import cv2
6
- import numpy as np
7
- import spaces
8
- import supervision as sv
9
- import torch
10
- from PIL import Image
11
- from tqdm import tqdm
12
- from utils.video import generate_unique_name, create_directory, delete_directory
13
- from utils.florence import load_florence_model, run_florence_inference, \
14
- FLORENCE_DETAILED_CAPTION_TASK, \
15
- FLORENCE_CAPTION_TO_PHRASE_GROUNDING_TASK, FLORENCE_OPEN_VOCABULARY_DETECTION_TASK
16
- from utils.modes import IMAGE_INFERENCE_MODES, IMAGE_OPEN_VOCABULARY_DETECTION_MODE, \
17
- IMAGE_CAPTION_GROUNDING_MASKS_MODE, VIDEO_INFERENCE_MODES
18
- from utils.sam import load_sam_image_model, run_sam_inference, load_sam_video_model
19
- DEVICE = torch.device("cuda")
20
- DEVICE = [torch.device(f'cuda:{i}') for i in range(torch.cuda.device_count())][-1]
21
- DEVICE = [torch.device(f'cuda:{i}') for i in range(torch.cuda.device_count())][0]
22
- # DEVICE = torch.device("cpu")
23
-
24
- torch.autocast(device_type="cuda", dtype=torch.bfloat16).__enter__()
25
- if torch.cuda.get_device_properties(0).major >= 8:
26
- torch.backends.cuda.matmul.allow_tf32 = True
27
- torch.backends.cudnn.allow_tf32 = True
28
-
29
-
30
- FLORENCE_MODEL, FLORENCE_PROCESSOR = load_florence_model(device=DEVICE)
31
- SAM_IMAGE_MODEL = load_sam_image_model(device=DEVICE)
32
- SAM_VIDEO_MODEL = load_sam_video_model(device=DEVICE)
33
-
34
-
35
- # @title #视频帧提取
36
- import supervision as sv
37
- import os
38
- import cv2
39
- import shutil
40
- def extract_video_frames(video_input):
41
- # 目标目录
42
- VIDEO_TARGET_DIRECTORY = '/kaggle/working/frame'
43
- if not os.path.exists(VIDEO_TARGET_DIRECTORY):
44
- os.makedirs(VIDEO_TARGET_DIRECTORY)
45
-
46
- shutil.rmtree(VIDEO_TARGET_DIRECTORY)
47
- # 视频缩放因子
48
- VIDEO_SCALE_FACTOR = 1
49
-
50
- # 获取视频信息
51
- video_info = sv.VideoInfo.from_video_path(video_input)
52
- print(video_info)
53
-
54
- # 生成唯一的名称
55
- # 使用视频文件名作为唯一名称
56
- name = os.path.splitext(os.path.basename(video_input))[0]
57
-
58
- # 构建帧目录路径
59
- frame_directory_path = os.path.join(VIDEO_TARGET_DIRECTORY, name)
60
-
61
- # 创建 ImageSink 对象
62
- frames_sink = sv.ImageSink(
63
- target_dir_path=frame_directory_path,
64
- image_name_pattern="{:05d}.jpeg"
65
- )
66
-
67
- # 获取视频帧生成器
68
- frames_generator = sv.get_video_frames_generator(video_input)
69
-
70
- # 使用 with 语句确保资源正确释放
71
- with frames_sink:
72
- # 遍历每一帧
73
- for i, frame in enumerate(frames_generator):
74
- # 如果需要缩放帧
75
- if VIDEO_SCALE_FACTOR != 1:
76
- frame = cv2.resize(frame, None, fx=VIDEO_SCALE_FACTOR, fy=VIDEO_SCALE_FACTOR)
77
-
78
- # 保存帧
79
- frames_sink.save_image(frame)
80
- return frame_directory_path,video_info
81
-
82
- # 使用示例
83
- video_input_path = '/kaggle/input/pinnpong/VS_010.mp4'# @param {type:"string"}
84
- video_frame_dir,video_info = extract_video_frames(video_input_path)
85
-
86
- texts = ['the table', 'all person','ball']
87
- from PIL import Image
88
- import supervision as sv
89
-
90
- def detect_objects_in_image(image_input_path, texts):
91
- # 加载图像
92
- image_input = Image.open(image_input_path)
93
-
94
- # 初始化检测列表
95
- detections_list = []
96
-
97
- # 对每个文本进行检测
98
- for text in texts:
99
- _, result = run_florence_inference(
100
- model=FLORENCE_MODEL,
101
- processor=FLORENCE_PROCESSOR,
102
- device=DEVICE,
103
- image=image_input,
104
- task=FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
105
- text=text
106
- )
107
-
108
- # 从结果中构建监督检测对象
109
- detections = sv.Detections.from_lmm(
110
- lmm=sv.LMM.FLORENCE_2,
111
- result=result,
112
- resolution_wh=image_input.size
113
- )
114
-
115
- # 运行 SAM 推理
116
- detections = run_sam_inference(SAM_IMAGE_MODEL, image_input, detections)
117
-
118
- # 将检测结果添加到列表中
119
- detections_list.append(detections)
120
-
121
- # 合并所有检测结果
122
- detections = sv.Detections.merge(detections_list)
123
-
124
- # 再次运行 SAM 推理
125
- detections = run_sam_inference(SAM_IMAGE_MODEL, image_input, detections)
126
-
127
- return detections
128
- # @title #合并遮罩加模糊merge_image_with_mask
129
- import numpy as np
130
- import cv2
131
- import os
132
- from PIL import Image, ImageFilter
133
-
134
- def merge_image_with_mask(image_input_path, detections, output_folder):
135
- # 创建输出文件夹
136
- if not os.path.exists(output_folder):
137
- os.makedirs(output_folder)
138
-
139
- # 提取图片文件名
140
- image_name = os.path.basename(image_input_path)
141
- output_path = os.path.join(output_folder, image_name)
142
-
143
- # 创建掩码文件夹
144
- mask_folder = '/kaggle/working/mask'
145
- if not os.path.exists(mask_folder):
146
- os.makedirs(mask_folder)
147
-
148
- # 合并掩码
149
- combined_mask = np.zeros_like(detections.mask[0], dtype=np.uint8)
150
- for mask in detections.mask:
151
- combined_mask += mask
152
- combined_mask = np.clip(combined_mask, 0, 255)
153
- combined_mask = combined_mask.astype(np.uint8)
154
-
155
- # 膨胀掩码
156
- kernel = np.ones((6, 6), np.uint8)
157
- dilated_mask = cv2.dilate(combined_mask, kernel, iterations=1)
158
-
159
- # 保存膨胀后的掩码
160
- #mask_path = os.path.join(mask_folder, 'test1.png')
161
- #cv2.imwrite(mask_path, dilated_mask * 255)
162
-
163
- # 读取原始图像
164
- original_image = cv2.imread(image_input_path)
165
-
166
- # 读取遮罩图片
167
- #mask_image = cv2.imread(mask_path)
168
-
169
- # 确保原始图片和遮罩图片尺寸一致
170
- #assert original_image.shape == mask_image.shape, "The images must have the same dimensions."
171
-
172
- # 使用掩膜从原始图片中提取部分区域
173
- masked_image = cv2.bitwise_and(original_image, original_image, mask=dilated_mask)
174
- # 将掩膜应用于原始图片
175
- blurred_image = cv2.GaussianBlur(original_image, (21, 21), 500) # 使用较大的核大小进行模糊
176
- # 将提取的部分区域叠加到模糊后的图片上
177
- blurred_image = cv2.bitwise_and(blurred_image, blurred_image, mask=~dilated_mask)
178
- # 将提取的部分区域叠加到模糊后的图片上
179
- result = np.where(dilated_mask[:, :, None] > 0, masked_image, blurred_image)
180
-
181
- # 保存合并后的图片
182
- cv2.imwrite(output_path, result)
183
- # @title #进度条批量处理文件夹process_images_in_folder(input_folder)
184
- from tqdm import tqdm
185
- import shutil
186
- def process_images_in_folder(input_folder):
187
- # 确保输出文件夹存在
188
- output_folder = '/kaggle/working/okframe'
189
- if not os.path.exists(output_folder):
190
- os.makedirs(output_folder)
191
- shutil.rmtree('/kaggle/working/okframe')
192
- output_folder = '/kaggle/working/okframe'
193
- if not os.path.exists(output_folder):
194
- os.makedirs(output_folder)
195
-
196
- # 获取文件夹中的所有文件
197
- files = [f for f in os.listdir(input_folder) if f.endswith('.jpg') or f.endswith('.png') or f.endswith('.jpeg')]
198
-
199
- # 使用 tqdm 显示进度条
200
- for filename in tqdm(files, desc="Processing Images"):
201
- image_input_path = os.path.join(input_folder, filename)
202
-
203
- # 检测对象
204
- detections = detect_objects_in_image(
205
- image_input_path=image_input_path,
206
- texts=texts
207
- )
208
-
209
- # 合并图像
210
- merge_image_with_mask(
211
- image_input_path=image_input_path,
212
- detections=detections,
213
- output_folder=output_folder
214
- )
215
-
216
- # 使用示例
217
- video_name = video_input_path.split('/')[-1].split('.')[0]
218
- input_folder = f'/kaggle/working/frame/{video_name}'
219
- process_images_in_folder(input_folder)
220
-
221
- # @title #合并所有帧成新视频frames_to_video(frame_folder, video_output_path, video_info)
222
- import cv2
223
- import os
224
- import natsort
225
- import numpy as np
226
-
227
- def frames_to_video(frame_folder, video_output_path, video_info):
228
- # 获取所有帧文件名,并使用 natsorted 进行自然排序
229
- frame_files = natsort.natsorted([f for f in os.listdir(frame_folder) if f.endswith(('.jpg', '.png', '.jpeg'))])
230
-
231
- # 创建视频写入器
232
- fourcc = cv2.VideoWriter_fourcc(*'mp4v') # 编码器
233
- out = cv2.VideoWriter(video_output_path, fourcc, video_info.fps, (video_info.width, video_info.height))
234
-
235
- # 遍历所有帧文件
236
- for frame_file in frame_files:
237
- frame_path = os.path.join(frame_folder, frame_file)
238
- frame = cv2.imread(frame_path)
239
-
240
- # 如果帧大小不匹配,调整大小
241
- if frame.shape[:2] != (video_info.height, video_info.width):
242
- frame = cv2.resize(frame, (video_info.width, video_info.height))
243
-
244
- # 写入视频
245
- out.write(frame)
246
-
247
- # 释放资源
248
- out.release()
249
-
250
- # 使用示例
251
- video_info = video_info
252
- frame_folder = '/kaggle/working/okframe'
253
- video_output_path = '/kaggle/working/output_video.mp4'
254
-
255
- frames_to_video(frame_folder, video_output_path, video_info)