"""Person segmentation and image-resizing helpers for stereo-image processing.

Pipeline: a SegFormer model segments the person class out of a photo, and the
resize/split helpers prepare person + background layers (including splitting
side-by-side stereo pairs) for compositing.
"""

import torch
import numpy as np
from PIL import Image
import cv2
from transformers import AutoImageProcessor, SegformerForSemanticSegmentation
from imagehash import average_hash

def load_model():
    """Load and return the (processor, model) pair for SegFormer-B0 ADE20K."""
    # Single checkpoint constant instead of the string repeated twice.
    checkpoint = "nvidia/segformer-b0-finetuned-ade-512-512"
    processor = AutoImageProcessor.from_pretrained(checkpoint)
    model = SegformerForSemanticSegmentation.from_pretrained(checkpoint)
    return processor, model

def segment_person(image: Image.Image, processor, model):
    """Return a soft 3-channel person mask for *image*.

    Args:
        image: input PIL image.
        processor: the SegFormer image processor from load_model().
        model: the SegFormer segmentation model from load_model().

    Returns:
        float32 numpy array of shape (H, W, 3) with values in [0, 1], where 1
        marks person pixels. Edges are eroded then Gaussian-blurred so the
        mask feathers softly when used for alpha compositing.
    """
    inputs = processor(images=image, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    # Upsample the low-resolution logits back to the input resolution.
    # PIL .size is (W, H); interpolate expects (H, W), hence the reversal.
    upsampled_logits = torch.nn.functional.interpolate(
        logits,
        size=image.size[::-1],
        mode="bilinear",
        align_corners=False,
    )
    pred_classes = upsampled_logits.argmax(dim=1)[0].cpu().numpy()
    mask = (pred_classes == 12).astype(np.uint8) * 255  # ADE20K class 12 = person

    # Clean the mask: erode to shave halo pixels off the silhouette edge,
    # then blur slightly to feather the boundary.
    kernel = np.ones((7, 7), np.uint8)
    eroded_mask = cv2.erode(mask, kernel, iterations=1)
    blurred_mask = cv2.GaussianBlur(eroded_mask, (3, 3), sigmaX=0, sigmaY=0)
    final_mask = blurred_mask.astype(np.float32) / 255.0
    final_mask_3ch = np.stack([final_mask] * 3, axis=-1)
    return final_mask_3ch

def resize_image(image, size_percent):
    """Scale *image* to size_percent and center it on a black canvas.

    Args:
        image: numpy array (HxWx3 or HxWx4; alpha is dropped).
        size_percent: target scale as a percentage of the original size.

    Returns:
        PIL RGB image with the ORIGINAL dimensions; the scaled content is
        pasted centered on an opaque black background.
    """
    # Convert to RGB up front so RGBA input does not break paste/compositing.
    image = Image.fromarray(image).convert("RGB")
    width, height = image.size
    new_width = int(width * size_percent / 100)
    new_height = int(height * size_percent / 100)
    # Opaque black canvas at the original dimensions (RGB has no alpha).
    canvas = Image.new('RGB', (width, height), (0, 0, 0))
    scaled_content = image.resize((new_width, new_height))
    # Center the scaled content on the canvas.
    x = (width - new_width) // 2
    y = (height - new_height) // 2
    canvas.paste(scaled_content, (x, y))
    return canvas

# Check if two images are similar
def check_image_similarity(image1, image2):
    """Return True when the perceptual (average) hashes of the two numpy
    images differ by fewer than 10 bits — i.e. the images look alike."""
    hash1 = average_hash(Image.fromarray(image1))
    hash2 = average_hash(Image.fromarray(image2))
    return hash1 - hash2 < 10

def split_stereo_image(image):
    """
    Splits an image into left and right halves for stereographic viewing.

    Args:
        image: PIL Image or numpy array

    Returns:
        tuple: (left_half, right_half). When the two halves are perceptually
        similar (a genuine side-by-side stereo pair) both are numpy arrays.
        Otherwise the full image is returned with a 99%-scaled copy as a
        fake second eye.
        NOTE(review): the fallback pair mixes types — (numpy array,
        PIL Image) — while the stereo path returns two numpy arrays;
        callers appear to rely on this, so it is preserved. Confirm.
    """
    # Accept either PIL or numpy input.
    if isinstance(image, Image.Image):
        image = np.array(image)

    # Split at the horizontal midpoint.
    width = image.shape[1]
    split_point = width // 2
    left_half = image[:, :split_point]
    right_half = image[:, split_point:]

    # If the halves match, this is a stereo pair; return the halves.
    if check_image_similarity(left_half, right_half):
        return left_half, right_half
    # Otherwise synthesize a near-identical second view (99% scale).
    return image, resize_image(image, 99)

def resize_image_to_width(person_img, background_img):
    """Resize *person_img* to fit *background_img*, preserving aspect ratio.

    Matches the background's width when the background is landscape,
    otherwise matches its height.

    Args:
        person_img: PIL Image or numpy array of the person layer.
        background_img: numpy array whose shape sets the target size.

    Returns:
        the resized person layer as a PIL Image.
    """
    img_array = np.array(person_img)
    bg_height, bg_width = background_img.shape[:2]
    if bg_width > bg_height:
        # Landscape background: fill the width, derive the height.
        width = bg_width
        height = int(width * img_array.shape[0] / img_array.shape[1])
    else:
        # Portrait/square background: fill the height, derive the width.
        height = bg_height
        width = int(height * img_array.shape[1] / img_array.shape[0])
    # The original round-tripped PIL->numpy->PIL here; the extra copies
    # changed nothing, so resize once and return directly.
    return Image.fromarray(img_array).resize((width, height))

def resize_mask(person_size, mask):
    """Scale a soft mask by person_size percent.

    Args:
        person_size: scale factor as a percentage (100 = unchanged).
        mask: float mask in [0, 1], HxW or HxWx3.
              (Assumes values are already normalized to 0-1 — TODO confirm.)

    Returns:
        float32 mask in [0, 1], always 3-channel (HxWx3).
    """
    scale_factor = person_size / 100.0
    mask_height, mask_width = mask.shape[:2]
    new_height = int(mask_height * scale_factor)
    new_width = int(mask_width * scale_factor)

    # Resize via PIL in 8-bit space, then renormalize to [0, 1].
    mask_image = Image.fromarray((mask * 255).astype(np.uint8))
    resized_mask = mask_image.resize((new_width, new_height))
    mask = np.array(resized_mask).astype(np.float32) / 255.0

    # Guarantee a 3-channel mask for per-channel compositing.
    if len(mask.shape) == 2:
        mask = np.stack([mask] * 3, axis=-1)
    return mask

def resize_images(image, person_size):
    """Scale *image* by person_size percent and return it as a numpy array.

    Args:
        image: PIL Image or numpy array.
        person_size: scale factor as a percentage (100 = unchanged).

    Returns:
        numpy array of the resized image.
    """
    image_np = np.array(image)
    scale_factor = person_size / 100.0
    img_height, img_width = image_np.shape[:2]
    new_height = int(img_height * scale_factor)
    new_width = int(img_width * scale_factor)
    # Resize through PIL; the original's redundant reassignment chain
    # (image = resized; image_np = np.array(image)) is collapsed.
    resized_image = Image.fromarray(image_np).resize((new_width, new_height))
    return np.array(resized_image)