florence-sam-kaggle

Running

App Files Files Community

supersolar committed on Nov 19, 2024

Commit

8c36ce6

verified ·

1 Parent(s): 817293e

Rename kaggle_gpu_1.py to kaggle_florence_gpu_1.py

Browse files

Files changed (2) hide show

kaggle_florence_gpu_1.py +134 -0
kaggle_gpu_1.py +0 -255

kaggle_florence_gpu_1.py ADDED Viewed

	@@ -0,0 +1,134 @@

+#gpu0
+%cd /kaggle/florence-sam
+import os
+from typing import Tuple, Optional
+import shutil
+import os
+import cv2
+import numpy as np
+import spaces
+import supervision as sv
+import torch
+from PIL import Image
+from tqdm import tqdm
+import sys
+import json
+import pickle
+os.chdir("/kaggle/florence-sam")
+sys.path.append('/kaggle/florence-sam')
+from utils.video import generate_unique_name, create_directory, delete_directory
+from utils.florencegpu2 import load_florence_model, run_florence_inference, \
+    FLORENCE_DETAILED_CAPTION_TASK, \
+    FLORENCE_CAPTION_TO_PHRASE_GROUNDING_TASK, FLORENCE_OPEN_VOCABULARY_DETECTION_TASK
+from utils.modes import IMAGE_INFERENCE_MODES, IMAGE_OPEN_VOCABULARY_DETECTION_MODE, \
+    IMAGE_CAPTION_GROUNDING_MASKS_MODE, VIDEO_INFERENCE_MODES
+from utils.sam import load_sam_image_model, run_sam_inference, load_sam_video_model
+DEVICE = torch.device("cuda")
+DEVICE = [torch.device(f'cuda:{i}') for i in range(torch.cuda.device_count())][-1]
+#DEVICE = [torch.device(f'cuda:{i}') for i in range(torch.cuda.device_count())][0]
+torch.autocast(device_type="cuda", dtype=torch.bfloat16).__enter__()
+if torch.cuda.get_device_properties(0).major >= 8:
+    torch.backends.cuda.matmul.allow_tf32 = True
+    torch.backends.cudnn.allow_tf32 = True
+FLORENCE_MODEL, FLORENCE_PROCESSOR = load_florence_model(device=DEVICE)
+SAM_IMAGE_MODEL = load_sam_image_model(device=DEVICE)
+'''
+with open('/kaggle/texts.pkl', 'rb') as file:
+    texts = pickle.load(file)
+print(texts)
+'''
+with open('/kaggle/output_video1.pkl', 'rb') as file:
+    output_video = pickle.load(file)
+print(output_video)
+VIDEO_SCALE_FACTOR = 1
+VIDEO_TARGET_DIRECTORY = "/kaggle/"
+create_directory(directory_path=VIDEO_TARGET_DIRECTORY)
+video_input= output_video
+#texts = ['the table', 'men','ball']
+#VIDEO_TARGET_DIRECTORY = "/content/"
+if not video_input:
+    print("Please upload a video.")
+frame_generator = sv.get_video_frames_generator(video_input)
+frame = next(frame_generator)
+frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+detections_list = []
+width, height = frame.size
+all_ok_bboxes = []
+half_area = width * height * 0.5
+# 存储所有 the table 的边界框和面积
+table_bboxes = []
+table_areas = []
+given_area =1000
+ok_result =[]
+for text in texts:
+    _, result = run_florence_inference(
+        model=FLORENCE_MODEL,
+        processor=FLORENCE_PROCESSOR,
+        device=DEVICE,
+        image=frame,
+        task=FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
+        text=text    )
+    #print(result)
+    for bbox, label in zip(result['<OPEN_VOCABULARY_DETECTION>']['bboxes'], result['<OPEN_VOCABULARY_DETECTION>']['bboxes_labels']):
+      print(bbox, label)
+      new_result = {'<OPEN_VOCABULARY_DETECTION>': {'bboxes': [bbox], 'bboxes_labels': [label], 'polygons': [], 'polygons_labels': []}}
+      print(new_result)
+      if label == 'ping pong ball':
+          # 计算当前 ping pong ball 的面积
+          area = (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
+          # 检查面积是否不超过给定边界框的面积
+          if area <= given_area:
+              all_ok_bboxes.append([[bbox[0], bbox[1]], [bbox[2], bbox[3]]])
+              ok_result.append(new_result)
+      elif label == 'the table':
+          # 计算当前 the table 的面积
+          print('the tablethe table!!!!')
+          area = (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
+          table_bboxes.append([[bbox[0] - 100, bbox[1]], [bbox[2] + 100, bbox[3]]])
+          table_areas.append(area)
+      elif label == 'table tennis bat':
+          all_ok_bboxes.append([[bbox[0], bbox[1]], [bbox[2], bbox[3]]])
+          ok_result.append(new_result)
+      elif label == 'men':
+          print('menmne!!!!')
+          all_ok_bboxes.append([[bbox[0], bbox[1]], [bbox[2], bbox[3]]])
+          ok_result.append(new_result)
+    # 找到面积最大的 the table
+    if table_areas:
+        max_area_index = table_areas.index(max(table_areas))
+        max_area_bbox = table_bboxes[max_area_index]
+        # 检查面积是否超过50%
+        if max(table_areas) < half_area:
+            all_ok_bboxes.append(max_area_bbox)
+            ok_result.append(new_result)
+print(ok_result)
+with open('/kaggle/all_ok_bboxes.pkl', 'wb') as file:
+    pickle.dump(all_ok_bboxes, file)
+for xyxy in ok_result:
+    print(frame.size,xyxy)
+    detections = sv.Detections.from_lmm(
+        lmm=sv.LMM.FLORENCE_2,
+        result=xyxy,
+        resolution_wh=frame.size
+        )
+    detections = run_sam_inference(SAM_IMAGE_MODEL, frame, detections)
+    print(detections)
+    detections_list.append(detections)
+with open('/kaggle/detections_list.pkl', 'wb') as file:
+    pickle.dump(detections_list, file)
+print(detections_list)

kaggle_gpu_1.py DELETED Viewed

@@ -1,255 +0,0 @@
-import os
-from typing import Tuple, Optional
-import shutil
-import os
-import cv2
-import numpy as np
-import spaces
-import supervision as sv
-import torch
-from PIL import Image
-from tqdm import tqdm
-from utils.video import generate_unique_name, create_directory, delete_directory
-from utils.florence import load_florence_model, run_florence_inference, \
-    FLORENCE_DETAILED_CAPTION_TASK, \
-    FLORENCE_CAPTION_TO_PHRASE_GROUNDING_TASK, FLORENCE_OPEN_VOCABULARY_DETECTION_TASK
-from utils.modes import IMAGE_INFERENCE_MODES, IMAGE_OPEN_VOCABULARY_DETECTION_MODE, \
-    IMAGE_CAPTION_GROUNDING_MASKS_MODE, VIDEO_INFERENCE_MODES
-from utils.sam import load_sam_image_model, run_sam_inference, load_sam_video_model
-DEVICE = torch.device("cuda")
-DEVICE = [torch.device(f'cuda:{i}') for i in range(torch.cuda.device_count())][-1]
-DEVICE = [torch.device(f'cuda:{i}') for i in range(torch.cuda.device_count())][0]
-# DEVICE = torch.device("cpu")
-torch.autocast(device_type="cuda", dtype=torch.bfloat16).__enter__()
-if torch.cuda.get_device_properties(0).major >= 8:
-    torch.backends.cuda.matmul.allow_tf32 = True
-    torch.backends.cudnn.allow_tf32 = True
-FLORENCE_MODEL, FLORENCE_PROCESSOR = load_florence_model(device=DEVICE)
-SAM_IMAGE_MODEL = load_sam_image_model(device=DEVICE)
-SAM_VIDEO_MODEL = load_sam_video_model(device=DEVICE)
-# @title #Video frame extraction
-import supervision as sv
-import os
-import cv2
-import shutil
-def extract_video_frames(video_input):
-    # 目标目录
-    VIDEO_TARGET_DIRECTORY = '/kaggle/working/frame'
-    if not os.path.exists(VIDEO_TARGET_DIRECTORY):
-        os.makedirs(VIDEO_TARGET_DIRECTORY)
-    shutil.rmtree(VIDEO_TARGET_DIRECTORY)
-    # 视频缩放因子
-    VIDEO_SCALE_FACTOR = 1
-    # 获取视频信息
-    video_info = sv.VideoInfo.from_video_path(video_input)
-    print(video_info)
-    # 生成唯一的名称
-    # 使用视频文件名作为唯一名称
-    name = os.path.splitext(os.path.basename(video_input))[0]
-    # 构建帧目录路径
-    frame_directory_path = os.path.join(VIDEO_TARGET_DIRECTORY, name)
-    # 创建 ImageSink 对象
-    frames_sink = sv.ImageSink(
-        target_dir_path=frame_directory_path,
-        image_name_pattern="{:05d}.jpeg"
-    )
-    # 获取视频帧生成器
-    frames_generator = sv.get_video_frames_generator(video_input)
-    # 使用 with 语句确保资源正确释放
-    with frames_sink:
-        # 遍历每一帧
-        for i, frame in enumerate(frames_generator):
-            # 如果需要缩放帧
-            if VIDEO_SCALE_FACTOR != 1:
-                frame = cv2.resize(frame, None, fx=VIDEO_SCALE_FACTOR, fy=VIDEO_SCALE_FACTOR)
-            # 保存帧
-            frames_sink.save_image(frame)
-    return frame_directory_path,video_info
-# Usage example: extract the frames of the input video and define the text prompts.
-video_input_path = '/kaggle/input/pinnpong/VS_010.mp4'# @param {type:"string"}
-video_frame_dir,video_info = extract_video_frames(video_input_path)
-texts = ['the table', 'all person','ball']
-from PIL import Image
-import supervision as sv
-def detect_objects_in_image(image_input_path, texts):
-    # 加载图像
-    image_input = Image.open(image_input_path)
-    # 初始化检测列表
-    detections_list = []
-    # 对每个文本进行检测
-    for text in texts:
-        _, result = run_florence_inference(
-          model=FLORENCE_MODEL,
-          processor=FLORENCE_PROCESSOR,
-          device=DEVICE,
-          image=image_input,
-          task=FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
-          text=text
-        )
-        # 从结果中构建监督检测对象
-        detections = sv.Detections.from_lmm(
-            lmm=sv.LMM.FLORENCE_2,
-            result=result,
-            resolution_wh=image_input.size
-        )
-        # 运行 SAM 推理
-        detections = run_sam_inference(SAM_IMAGE_MODEL, image_input, detections)
-        # 将检测结果添加到列表中
-        detections_list.append(detections)
-    # 合并所有检测结果
-    detections = sv.Detections.merge(detections_list)
-    # 再次运行 SAM 推理
-    detections = run_sam_inference(SAM_IMAGE_MODEL, image_input, detections)
-    return detections
-# @title #Merge masks and blur: merge_image_with_mask
-import numpy as np
-import cv2
-import os
-from PIL import Image, ImageFilter
-def merge_image_with_mask(image_input_path, detections, output_folder):
-    # 创建输出文件夹
-    if not os.path.exists(output_folder):
-        os.makedirs(output_folder)
-    # 提取图片文件名
-    image_name = os.path.basename(image_input_path)
-    output_path = os.path.join(output_folder, image_name)
-    # 创建掩码文件夹
-    mask_folder = '/kaggle/working/mask'
-    if not os.path.exists(mask_folder):
-        os.makedirs(mask_folder)
-    # 合并掩码
-    combined_mask = np.zeros_like(detections.mask[0], dtype=np.uint8)
-    for mask in detections.mask:
-        combined_mask += mask
-    combined_mask = np.clip(combined_mask, 0, 255)
-    combined_mask = combined_mask.astype(np.uint8)
-    # 膨胀掩码
-    kernel = np.ones((6, 6), np.uint8)
-    dilated_mask = cv2.dilate(combined_mask, kernel, iterations=1)
-    # 保存膨胀后的掩码
-    #mask_path = os.path.join(mask_folder, 'test1.png')
-    #cv2.imwrite(mask_path, dilated_mask * 255)
-    # 读取原始图像
-    original_image = cv2.imread(image_input_path)
-    # 读取遮罩图片
-    #mask_image = cv2.imread(mask_path)
-    # 确保原始图片和遮罩图片尺寸一致
-    #assert original_image.shape == mask_image.shape, "The images must have the same dimensions."
-    # 使用掩膜从原始图片中提取部分区域
-    masked_image = cv2.bitwise_and(original_image, original_image, mask=dilated_mask)
-    # 将掩膜应用于原始图片
-    blurred_image = cv2.GaussianBlur(original_image, (21, 21), 500)  # 使用较大的核大小进行模糊
-    # 将提取的部分区域叠加到模糊后的图片上
-    blurred_image = cv2.bitwise_and(blurred_image, blurred_image, mask=~dilated_mask)
-        # 将提取的部分区域叠加到模糊后的图片上
-    result = np.where(dilated_mask[:, :, None] > 0, masked_image, blurred_image)
-    # 保存合并后的图片
-    cv2.imwrite(output_path, result)
-# @title #Batch-process a folder with a progress bar: process_images_in_folder(input_folder)
-from tqdm import tqdm
-import shutil
-def process_images_in_folder(input_folder):
-    # 确保输出文件夹存在
-    output_folder = '/kaggle/working/okframe'
-    if not os.path.exists(output_folder):
-        os.makedirs(output_folder)
-    shutil.rmtree('/kaggle/working/okframe')
-    output_folder = '/kaggle/working/okframe'
-    if not os.path.exists(output_folder):
-        os.makedirs(output_folder)
-    # 获取文件夹中的所有文件
-    files = [f for f in os.listdir(input_folder) if f.endswith('.jpg') or f.endswith('.png') or f.endswith('.jpeg')]
-    # 使用 tqdm 显示进度条
-    for filename in tqdm(files, desc="Processing Images"):
-        image_input_path = os.path.join(input_folder, filename)
-        # 检测对象
-        detections = detect_objects_in_image(
-            image_input_path=image_input_path,
-            texts=texts
-        )
-        # 合并图像
-        merge_image_with_mask(
-            image_input_path=image_input_path,
-            detections=detections,
-            output_folder=output_folder
-        )
-# Usage example: process every extracted frame of the input video.
-video_name = video_input_path.split('/')[-1].split('.')[0]
-input_folder = f'/kaggle/working/frame/{video_name}'
-process_images_in_folder(input_folder)
-# @title #Merge all frames into a new video: frames_to_video(frame_folder, video_output_path, video_info)
-import cv2
-import os
-import natsort
-import numpy as np
-def frames_to_video(frame_folder, video_output_path, video_info):
-    # 获取所有帧文件名，并使用 natsorted 进行自然排序
-    frame_files = natsort.natsorted([f for f in os.listdir(frame_folder) if f.endswith(('.jpg', '.png', '.jpeg'))])
-    # 创建视频写入器
-    fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # 编码器
-    out = cv2.VideoWriter(video_output_path, fourcc, video_info.fps, (video_info.width, video_info.height))
-    # 遍历所有帧文件
-    for frame_file in frame_files:
-        frame_path = os.path.join(frame_folder, frame_file)
-        frame = cv2.imread(frame_path)
-        # 如果帧大小不匹配，调整大小
-        if frame.shape[:2] != (video_info.height, video_info.width):
-            frame = cv2.resize(frame, (video_info.width, video_info.height))
-        # 写入视频
-        out.write(frame)
-    # 释放资源
-    out.release()
-# Usage example: rebuild a video from the processed frames.
-video_info = video_info
-frame_folder = '/kaggle/working/okframe'
-video_output_path = '/kaggle/working/output_video.mp4'
-frames_to_video(frame_folder, video_output_path, video_info)