"hi"
Browse files- .gitignore +6 -0
- README.md +1 -1
- README.yaml +0 -99
- app.py +131 -0
- example.ipynb +150 -0
- requirements.txt +7 -0
- sample_images/image_five.jpg +0 -0
- sample_images/image_four.jpg +0 -0
- sample_images/image_six.jpg +0 -0
- yolo/BodyMask.py +248 -0
- yolo/utils.py +291 -0
.gitignore
ADDED
@@ -0,0 +1,6 @@
+
+gradio_cached_examples/
+checkpoint-*
+*/example.ipynb
+
+*.pyc
README.md
CHANGED
@@ -56,7 +56,7 @@ To use this model, you'll need to have the appropriate YOLO framework installed.
 To use the model for inference, you can use the following Python script:
 
 ```python
-from
+from ultralytics import YOLO
 
 # Load the model
 model = YOLO('path/to/your/model.pt')
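For reference, the corrected snippet presumably continues the way Ultralytics inference usually does. A minimal sketch of that continuation (the checkpoint and image paths below are placeholders, not part of this commit):

```python
from ultralytics import YOLO

# Load the fine-tuned segmentation checkpoint (placeholder path)
model = YOLO("path/to/your/model.pt")

# Run segmentation on one image; retina_masks requests full-resolution masks
results = model("path/to/image.jpg", retina_masks=True)

# Print each detected class with its confidence score
for box in results[0].boxes:
    print(results[0].names[int(box.cls)], float(box.conf))
```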
README.yaml
DELETED
@@ -1,99 +0,0 @@
----
-language:
-- "en"
-thumbnail: "https://example.com/path/to/your/thumbnail.jpg"
-tags:
-- yolo
-- object-detection
-- image-segmentation
-- computer-vision
-- human-body-parts
-license: "mit"
-datasets:
-- custom_human_body_parts_dataset
-metrics:
-- mean_average_precision
-- intersection_over_union
-base_model: "ultralytics/yolov5yolov8x-seg"
----
-
-# YOLO Segmentation Model for Human Body Parts and Objects
-
-This model is a fine-tuned version of YOLOv5 for segmenting human body parts and objects. It can detect and segment 11 different classes including various body parts, outfits, and phones.
-
-## Model Details
-
-- **Model Type:** YOLOv8 for Instance Segmentation
-- **Task:** Segmentation
-- **Fine-tuning Dataset:** Custom dataset of human body parts and objects
-- **Number of Classes:** 11
-
-## Classes
-
-The model can detect and segment the following classes:
-
-0. Hair
-1. Face
-2. Neck
-3. Arm
-4. Hand
-5. Back
-6. Leg
-7. Foot
-8. Outfit
-9. Person
-10. Phone
-
-## Usage
-
-This model can be used for various applications, including:
-
-- Human pose estimation
-- Gesture recognition
-- Fashion analysis
-- Person tracking
-- Human-computer interaction
-
-For detailed usage instructions, please refer to the model's README file.
-
-## Training Procedure
-
-The model was fine-tuned on a custom dataset of annotated images containing human body parts and objects. The training process involved transfer learning from the base YOLOv8 model, with adjustments made to the final layers to accommodate the new class structure.
-
-## Evaluation Results
-
-(Note: Replace these placeholder metrics with your actual evaluation results)
-
-lr/pg0:0.000572628
-lr/pg1:0.000572628
-lr/pg2:0.000572628
-metrics/mAP50-95(B):0.53001
-metrics/mAP50-95(M):0.42367
-metrics/mAP50(B):0.69407
-metrics/mAP50(M):0.61714
-metrics/precision(B):0.7047
-metrics/precision(M):0.68041
-metrics/recall(B):0.68802
-metrics/recall(M):0.62248
-model/GFLOPs:344.557
-model/parameters:71,761,441
-model/speed_PyTorch(ms):5.813
-train/box_loss:0.54718
-train/cls_loss:0.52977
-train/dfl_loss:0.95171
-train/seg_loss:1.34628
-val/box_loss:0.80538
-val/cls_loss:0.83434
-val/dfl_loss:1.18352
-val/seg_loss:2.19488
-
-
-## Limitations and Biases
-
-- The model's performance may vary depending on lighting conditions and image quality.
-- It may have difficulty with occluded or partially visible body parts.
-- The model's performance on diverse body types and skin tones should be carefully evaluated to ensure fairness and inclusivity.
-
-## Ethical Considerations
-
-Users of this model should be aware of privacy concerns related to human body detection and ensure they have appropriate consent for its application. The model should not be used for surveillance or any application that could infringe on personal privacy without explicit consent.
app.py
ADDED
@@ -0,0 +1,131 @@
+import gradio as gr
+import os
+from ultralytics import YOLO
+from yolo.BodyMask import BodyMask
+import numpy as np
+import matplotlib.pyplot as plt
+from matplotlib import patches
+from skimage.transform import resize
+from PIL import Image
+import io
+
+model_id = os.path.abspath("yolo-human-parse-epoch-125.pt")
+
+
+def display_image_with_masks(image, results, cols=4):
+    # Convert PIL Image to numpy array
+    image_np = np.array(image)
+
+    # Check image dimensions
+    if image_np.ndim != 3 or image_np.shape[2] != 3:
+        raise ValueError("Image must be a 3-dimensional array with 3 color channels")
+
+    # Number of masks
+    n = len(results)
+    rows = (n + cols - 1) // cols  # Calculate required number of rows
+
+    # Setting up the plot
+    fig, axs = plt.subplots(rows, cols, figsize=(5 * cols, 5 * rows))
+    axs = np.array(axs).reshape(-1)  # Flatten axs array for easy indexing
+
+    for i, result in enumerate(results):
+        mask = result["mask"]
+        label = result["label"]
+        score = float(result["score"])
+
+        # Convert PIL mask to numpy array and resize if necessary
+        mask_np = np.array(mask)
+        if mask_np.shape != image_np.shape[:2]:
+            mask_np = resize(
+                mask_np, image_np.shape[:2], mode="constant", anti_aliasing=False
+            )
+            mask_np = (mask_np > 0.5).astype(
+                np.uint8
+            )  # Threshold back to binary after resize
+
+        # Create an overlay where mask is True
+        overlay = np.zeros_like(image_np)
+        overlay[mask_np > 0] = [0, 0, 255]  # Applying blue color on the mask area
+
+        # Combine the image and the overlay
+        combined = image_np.copy()
+        indices = np.where(mask_np > 0)
+        combined[indices] = combined[indices] * 0.5 + overlay[indices] * 0.5
+
+        # Show the combined image
+        ax = axs[i]
+        ax.imshow(combined)
+        ax.axis("off")
+        ax.set_title(f"Label: {label}, Score: {score:.2f}", fontsize=12)
+        rect = patches.Rectangle(
+            (0, 0),
+            image_np.shape[1],
+            image_np.shape[0],
+            linewidth=1,
+            edgecolor="r",
+            facecolor="none",
+        )
+        ax.add_patch(rect)
+
+    # Hide unused subplots if the total number of masks is not a multiple of cols
+    for idx in range(i + 1, rows * cols):
+        axs[idx].axis("off")
+
+    plt.tight_layout()
+
+    # Save the plot to a bytes buffer
+    buf = io.BytesIO()
+    plt.savefig(buf, format="png")
+    buf.seek(0)
+
+    # Clear the current figure
+    plt.close(fig)
+
+    return buf
+
+
+def perform_segmentation(input_image):
+    bm = BodyMask(input_image, model_id=model_id, resize_to=640)
+    results = bm.results
+    buf = display_image_with_masks(input_image, results)
+
+    # Convert BytesIO to PIL Image
+    img = Image.open(buf)
+    return img
+
+
+# Get example images
+example_images = [
+    os.path.join("sample_images", f)
+    for f in os.listdir("sample_images")
+    if f.endswith((".png", ".jpg", ".jpeg"))
+]
+
+with gr.Blocks() as demo:
+    gr.Markdown("# YOLO Segmentation Demo with BodyMask")
+    gr.Markdown(
+        "Upload an image or select an example to see the YOLO segmentation results."
+    )
+
+    with gr.Row():
+        with gr.Column():
+            input_image = gr.Image(type="pil", label="Input Image", height=512)
+            segment_button = gr.Button("Perform Segmentation")
+
+        output_image = gr.Image(label="Segmentation Result")
+
+    gr.Examples(
+        examples=example_images,
+        inputs=input_image,
+        outputs=output_image,
+        fn=perform_segmentation,
+        cache_examples=True,
+    )
+
+    segment_button.click(
+        fn=perform_segmentation,
+        inputs=input_image,
+        outputs=output_image,
+    )
+
+demo.launch()
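One detail worth calling out in `app.py` is how `display_image_with_masks` hands a matplotlib figure back to `gr.Image(type="pil")`: the figure is written to an in-memory PNG buffer and reopened as a PIL image. A stripped-down sketch of just that round trip (the figure contents here are arbitrary; only the conversion pattern is the point):

```python
import io

import matplotlib.pyplot as plt
import numpy as np
from PIL import Image

# Render any matplotlib figure into an in-memory PNG buffer
fig, ax = plt.subplots()
ax.imshow(np.zeros((64, 64, 3), dtype=np.uint8))
ax.axis("off")

buf = io.BytesIO()
fig.savefig(buf, format="png")
buf.seek(0)
plt.close(fig)

# Reopen the buffer as a PIL image, the type a pil-typed gr.Image output expects
img = Image.open(buf)
print(img.size, img.mode)
```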
example.ipynb
ADDED
@@ -0,0 +1,150 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os \n",
+    "from ultralytics import YOLO\n",
+    "from yolo.BodyMask import BodyMask\n",
+    "\n",
+    "\n",
+    "model_id = os.path.abspath(\"yolo-human-parse-epoch-125.pt\")\n",
+    "\n",
+    "example_images = [\n",
+    "    os.path.join(\"sample_images\", f)\n",
+    "    for f in os.listdir(\"sample_images\")\n",
+    "    if f.endswith((\".png\", \".jpg\", \".jpeg\"))\n",
+    "]\n",
+    "\n",
+    "image = example_images[0]\n",
+    "\n",
+    "bm = BodyMask(image, model_id=model_id)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "bm.display_results()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    "… stored stream output omitted here: the IPython help text produced by `gr.Image?`, i.e. the gradio.components.Image init signature and parameter docstring, including ANSI color escape codes and the local site-packages path …"
+   ],
+   "source": [
+    "import gradio as gr \n",
+    "\n",
+    "gr.Image?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "lemons",
+   "language": "python",
+   "name": "lemons"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.14"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
requirements.txt
ADDED
@@ -0,0 +1,7 @@
+diffusers==0.30.3
+gradio==4.44.0
+matplotlib==3.8.4
+numpy==2.1.1
+Pillow==10.4.0
+scikit-image  # the original pin "skimage==0.0" points at a stub package; scikit-image is the real dependency that provides skimage.transform
+ultralytics==8.2.97
sample_images/image_five.jpg
ADDED
sample_images/image_four.jpg
ADDED
sample_images/image_six.jpg
ADDED
yolo/BodyMask.py
ADDED
@@ -0,0 +1,248 @@
+import os
+from functools import lru_cache
+from typing import List
+
+import cv2
+import numpy as np
+from diffusers.utils import load_image
+from PIL import Image, ImageChops, ImageFilter
+from ultralytics import YOLO
+from .utils import *
+
+
+def dilate_mask(mask, dilate_factor=6, blur_radius=2, erosion_factor=2):
+    if not mask:
+        return None
+    # Convert PIL image to NumPy array if necessary
+    if isinstance(mask, Image.Image):
+        mask = np.array(mask)
+
+    # Ensure mask is in uint8 format
+    mask = mask.astype(np.uint8)
+
+    # Apply dilation
+    kernel = np.ones((dilate_factor, dilate_factor), np.uint8)
+    dilated_mask = cv2.dilate(mask, kernel, iterations=1)
+
+    # Apply erosion for refinement
+    kernel = np.ones((erosion_factor, erosion_factor), np.uint8)
+    eroded_mask = cv2.erode(dilated_mask, kernel, iterations=1)
+
+    # Apply Gaussian blur to smooth the edges
+    blurred_mask = cv2.GaussianBlur(
+        eroded_mask, (2 * blur_radius + 1, 2 * blur_radius + 1), 0
+    )
+
+    # Convert back to PIL image
+    smoothed_mask = Image.fromarray(blurred_mask).convert("L")
+
+    # Optionally, apply an additional blur for extra smoothness using PIL
+    smoothed_mask = smoothed_mask.filter(ImageFilter.GaussianBlur(radius=blur_radius))
+
+    return smoothed_mask
+
+
+@lru_cache(maxsize=1)
+def get_model(model_id):
+    model = YOLO(model=model_id)
+    return model
+
+
+def combine_masks(masks: List[dict], labels: List[str], is_label=True) -> Image.Image:
+    """
+    Combine masks with the specified labels into a single mask, optimized for speed and non-overlapping of excluded masks.
+
+    Parameters:
+    - masks (List[dict]): A list of dictionaries, each containing the mask under a 'mask' key and its label under a 'label' key.
+    - labels (List[str]): A list of labels to include in the combination.
+
+    Returns:
+    - Image.Image: The combined mask as a PIL Image object, or None if no masks are combined.
+    """
+    labels_set = set(labels)  # Convert labels list to a set for O(1) lookups
+
+    # Filter and convert mask images based on the specified labels
+    mask_images = [
+        mask["mask"].convert("L")
+        for mask in masks
+        if (mask["label"] in labels_set) == is_label
+    ]
+
+    # Ensure there is at least one mask to combine
+    if not mask_images:
+        return None  # Or raise an appropriate error, e.g., ValueError("No masks found for the specified labels.")
+
+    # Initialize the combined mask with the first mask
+    combined_mask = mask_images[0]
+
+    # Combine the remaining masks with the existing combined_mask using a bitwise OR operation to ensure non-overlap
+    for mask in mask_images[1:]:
+        combined_mask = ImageChops.lighter(combined_mask, mask)
+
+    return combined_mask
+
+
+body_labels = ["hair", "face", "arm", "hand", "leg", "foot", "outfit"]
+
+
+class BodyMask:
+
+    def __init__(
+        self,
+        image_path,
+        model_id,
+        labels=body_labels,
+        overlay="mask",
+        widen_box=0,
+        elongate_box=0,
+        resize_to=640,
+        dilate_factor=0,
+        is_label=False,
+        resize_to_nearest_eight=False,
+        verbose=True,
+        remove_overlap=True,
+    ):
+        self.image_path = image_path
+        self.image = self.get_image(
+            resize_to=resize_to, resize_to_nearest_eight=resize_to_nearest_eight
+        )
+        self.labels = labels
+        self.is_label = is_label
+        self.model_id = model_id
+        self.model = get_model(self.model_id)
+        self.model_labels = self.model.names
+        self.verbose = verbose
+        self.results = self.get_results()
+        self.dilate_factor = dilate_factor
+        self.body_mask = self.get_body_mask()
+        self.box = get_bounding_box(self.body_mask)
+        self.body_box = self.get_body_box(
+            remove_overlap=remove_overlap, widen=widen_box, elongate=elongate_box
+        )
+        if overlay == "box":
+            self.overlay = overlay_mask(
+                self.image, self.body_box, opacity=0.9, color="red"
+            )
+        else:
+            self.overlay = overlay_mask(
+                self.image, self.body_mask, opacity=0.9, color="red"
+            )
+
+    def get_image(self, resize_to, resize_to_nearest_eight):
+        image = load_image(self.image_path)
+        if resize_to:
+            image = resize_preserve_aspect_ratio(image, resize_to)
+        if resize_to_nearest_eight:
+            image = resize_image_to_nearest_eight(image)
+        else:
+            image = image
+        return image
+
+    def get_body_mask(self):
+        body_mask = combine_masks(self.results, self.labels, self.is_label)
+        return dilate_mask(body_mask, self.dilate_factor)
+
+    def get_results(self):
+        imgsz = max(self.image.size)
+        results = self.model(
+            self.image, retina_masks=True, imgsz=imgsz, verbose=self.verbose
+        )[0]
+        self.masks, self.boxes, self.scores, self.phrases = unload(
+            results, self.model_labels
+        )
+        results = format_results(
+            self.masks,
+            self.boxes,
+            self.scores,
+            self.phrases,
+            self.model_labels,
+            person_masks_only=False,
+        )
+
+        # filter out lower score results
+        masks_to_filter = ["hair"]
+        results = filter_highest_score(results, ["hair", "face", "phone"])
+        return results
+
+    def display_results(self):
+        if len(self.masks) < 4:
+            cols = len(self.masks)
+        else:
+            cols = 4
+        display_image_with_masks(self.image, self.results, cols=cols)
+
+    def get_mask(self, mask_label):
+        assert mask_label in self.phrases, "Mask label not found in results"
+        return [f for f in self.results if f.get("label") == mask_label]
+
+    def combine_masks(self, mask_labels: List, no_labels=None, is_label=True):
+        """
+        Combine the masks included in the labels list or all of the masks not in the list
+        """
+        if not is_label:
+            mask_labels = [
+                phrase for phrase in self.phrases if phrase not in mask_labels
+            ]
+        masks = [
+            row.get("mask") for row in self.results if row.get("label") in mask_labels
+        ]
+        if len(masks) == 0:
+            return None
+        combined_mask = masks[0]
+        for mask in masks[1:]:
+            combined_mask = ImageChops.lighter(combined_mask, mask)
+        return combined_mask
+
+    def get_body_box(self, remove_overlap=True, widen=0, elongate=0):
+        body_box = get_bounding_box_mask(self.body_mask, widen=widen, elongate=elongate)
+        if remove_overlap:
+            body_box = self.remove_overlap(body_box)
+        return body_box
+
+    def remove_overlap(self, body_box):
+        """
+        Remove mask regions that overlap with unwanted labels
+        """
+        # convert mask to numpy array
+        box_array = np.array(body_box)
+
+        # combine the masks for those labels
+        mask = self.combine_masks(mask_labels=self.labels, is_label=True)
+
+        # convert mask to numpy array
+        mask_array = np.array(mask)
+
+        # where the mask array is white set the box array to black
+        box_array[mask_array == 255] = 0
+
+        # convert the box array to an image
+        mask_image = Image.fromarray(box_array)
+        return mask_image
+
+
+if __name__ == "__main__":
+    url = "https://sjc1.vultrobjects.com/photo-storage/images/525d1f68-314c-455b-a8b6-f5dc3fa044e4.jpeg"
+    image_name = url.split("/")[-1]
+    labels = ["face", "hair", "phone", "hand"]
+    # NOTE: the original __main__ block omitted the required model_id argument;
+    # the checkpoint name below mirrors app.py and is assumed to exist locally.
+    model_id = os.path.abspath("yolo-human-parse-epoch-125.pt")
+    image = load_image(url)
+    image_size = image.size
+    # Get the original size of the image
+    original_size = image.size
+
+    # Create body mask
+    body_mask = BodyMask(
+        image,
+        model_id=model_id,
+        overlay="box",
+        labels=labels,
+        widen_box=50,
+        elongate_box=10,
+        dilate_factor=0,
+        resize_to=640,
+        is_label=False,
+        remove_overlap=True,
+        verbose=False,
+    )
+
+    # Resize the image back to the original size
+    image = body_mask.image.resize(original_size)
+    body_mask.body_box.save(image_name)
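As a quick orientation to the class above, a minimal usage sketch. The checkpoint filename mirrors `app.py` and is an assumption (adjust the path to wherever the weights actually live); the sample image is one of the files added in this commit:

```python
import os

from yolo.BodyMask import BodyMask

# Fine-tuned checkpoint, assumed to sit in the repository root as in app.py
model_id = os.path.abspath("yolo-human-parse-epoch-125.pt")

# Run the parser on a local image, resizing the long side to 640 before inference
bm = BodyMask("sample_images/image_four.jpg", model_id=model_id, resize_to=640)

# Per-instance results: list of dicts with "label", "score", "mask", "box", "label_id"
for r in bm.results:
    print(r["label"], round(float(r["score"]), 3))

# Save the combined mask and the red overlay that __init__ renders
bm.body_mask.save("body_mask.png")
bm.overlay.save("overlay.png")
```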
yolo/utils.py
ADDED
@@ -0,0 +1,291 @@
+import matplotlib.patches as patches
+import matplotlib.pyplot as plt
+import numpy as np
+from PIL import Image, ImageDraw
+# NOTE: resize is used in display_image_with_masks below but was not imported in
+# the original file; skimage.transform provides it.
+from skimage.transform import resize
+
+
+def unload_mask(mask):
+    mask = mask.cpu().numpy().squeeze()
+    mask = mask.astype(np.uint8) * 255
+    return Image.fromarray(mask)
+
+
+def unload_box(box):
+    return box.cpu().numpy().tolist()
+
+
+def masks_overlap(mask1, mask2):
+    return np.any(np.logical_and(mask1, mask2))
+
+
+def remove_non_person_masks(person_mask, formatted_results):
+    return [
+        f
+        for f in formatted_results
+        if f.get("label") == "person" or masks_overlap(person_mask, f.get("mask"))
+    ]
+
+
+def format_masks(masks):
+    return [unload_mask(mask) for mask in masks]
+
+
+def format_boxes(boxes):
+    return [unload_box(box) for box in boxes]
+
+
+def format_scores(scores):
+    return scores.cpu().numpy().tolist()
+
+
+def unload(result, labels_dict):
+    masks = format_masks(result.masks.data)
+    boxes = format_boxes(result.boxes.xyxy)
+    scores = format_scores(result.boxes.conf)
+    labels = result.boxes.cls
+    labels = [int(label.item()) for label in labels]
+    phrases = [labels_dict[label] for label in labels]
+    return masks, boxes, scores, phrases
+
+
+def format_results(masks, boxes, scores, labels, labels_dict, person_masks_only=True):
+    if isinstance(list(labels_dict.keys())[0], int):
+        labels_dict = {v: k for k, v in labels_dict.items()}
+
+    # check that the person mask is present
+    if person_masks_only:
+        assert "person" in labels, "Person mask not present in results"
+    results_dict = []
+    for row in zip(labels, scores, boxes, masks):
+        label, score, box, mask = row
+        label_id = labels_dict[label]
+        results_row = dict(
+            label=label, score=score, mask=mask, box=box, label_id=label_id
+        )
+        results_dict.append(results_row)
+    results_dict = sorted(results_dict, key=lambda x: x["label"])
+    if person_masks_only:
+        # Get the person mask
+        person_mask = [f for f in results_dict if f.get("label") == "person"][0]["mask"]
+        assert person_mask is not None, "Person mask not found in results"
+
+        # Remove any results that do not overlap with the person
+        results_dict = remove_non_person_masks(person_mask, results_dict)
+    return results_dict
+
+
+def filter_highest_score(results, labels):
+    """
+    Filter results to remove entries with lower scores for specified labels.
+
+    Args:
+        results (list): List of dictionaries containing 'label', 'score', and other keys.
+        labels (list): List of labels to filter.
+
+    Returns:
+        list: Filtered results with only the highest score for each specified label.
+    """
+    # Dictionary to keep track of the highest score entry for each label
+    label_highest = {}
+
+    # First pass: identify the highest score for each label
+    for result in results:
+        label = result["label"]
+        if label in labels:
+            if (
+                label not in label_highest
+                or result["score"] > label_highest[label]["score"]
+            ):
+                label_highest[label] = result
+
+    # Second pass: construct the filtered list while preserving the order
+    filtered_results = []
+    seen_labels = set()
+
+    for result in results:
+        label = result["label"]
+        if label in labels:
+            if label in seen_labels:
+                continue
+            if result == label_highest[label]:
+                filtered_results.append(result)
+                seen_labels.add(label)
+        else:
+            filtered_results.append(result)
+
+    return filtered_results
+
+
+def display_image_with_masks(image, results, cols=4, return_images=False):
+    # Convert PIL Image to numpy array
+    image_np = np.array(image)
+
+    # Check image dimensions
+    if image_np.ndim != 3 or image_np.shape[2] != 3:
+        raise ValueError("Image must be a 3-dimensional array with 3 color channels")
+
+    # Number of masks
+    n = len(results)
+    rows = (n + cols - 1) // cols  # Calculate required number of rows
+
+    # Setting up the plot
+    fig, axs = plt.subplots(rows, cols, figsize=(5 * cols, 5 * rows))
+    axs = np.array(axs).reshape(-1)  # Flatten axs array for easy indexing
+    for i, result in enumerate(results):
+        mask = result["mask"]
+        label = result["label"]
+        score = float(result["score"])
+
+        # Convert PIL mask to numpy array and resize if necessary
+        mask_np = np.array(mask)
+        if mask_np.shape != image_np.shape[:2]:
+            mask_np = resize(
+                mask_np, image_np.shape[:2], mode="constant", anti_aliasing=False
+            )
+            mask_np = (mask_np > 0.5).astype(
+                np.uint8
+            )  # Threshold back to binary after resize
+
+        # Create an overlay where mask is True
+        overlay = np.zeros_like(image_np)
+        overlay[mask_np > 0] = [0, 0, 255]  # Applying blue color on the mask area
+
+        # Combine the image and the overlay
+        combined = image_np.copy()
+        indices = np.where(mask_np > 0)
+        combined[indices] = combined[indices] * 0.5 + overlay[indices] * 0.5
+
+        # Show the combined image
+        ax = axs[i]
+        ax.imshow(combined)
+        ax.axis("off")
+        ax.set_title(f"Label: {label}, Score: {score:.2f}", fontsize=12)
+        rect = patches.Rectangle(
+            (0, 0),
+            image_np.shape[1],
+            image_np.shape[0],
+            linewidth=1,
+            edgecolor="r",
+            facecolor="none",
+        )
+        ax.add_patch(rect)
+
+    # Hide unused subplots if the total number of masks is not a multiple of cols
+    for idx in range(i + 1, rows * cols):
+        axs[idx].axis("off")
+    plt.tight_layout()
+    plt.show()
+
+
+def get_bounding_box(mask):
+    """
+    Given a segmentation mask, return the bounding box for the mask object.
+    """
+    # Find indices where the mask is non-zero
+    coords = np.argwhere(mask)
+    # Get the minimum and maximum x and y coordinates
+    x_min, y_min = np.min(coords, axis=0)
+    x_max, y_max = np.max(coords, axis=0)
+    # Return the bounding box coordinates
+    return (y_min, x_min, y_max, x_max)
+
+
+def get_bounding_box_mask(segmentation_mask, widen=0, elongate=0):
+    # Convert the PIL segmentation mask to a NumPy array
+    mask_array = np.array(segmentation_mask)
+
+    # Find the coordinates of the non-zero pixels
+    non_zero_y, non_zero_x = np.nonzero(mask_array)
+
+    # Calculate the bounding box coordinates
+    min_x, max_x = np.min(non_zero_x), np.max(non_zero_x)
+    min_y, max_y = np.min(non_zero_y), np.max(non_zero_y)
+
+    if widen > 0:
+        min_x = max(0, min_x - widen)
+        max_x = min(mask_array.shape[1], max_x + widen)
+
+    if elongate > 0:
+        min_y = max(0, min_y - elongate)
+        max_y = min(mask_array.shape[0], max_y + elongate)
+
+    # Create a new blank image for the bounding box mask
+    bounding_box_mask = Image.new("1", segmentation_mask.size)
+
+    # Draw the filled bounding box on the blank image
+    draw = ImageDraw.Draw(bounding_box_mask)
+    draw.rectangle([(min_x, min_y), (max_x, max_y)], fill=1)
+
+    return bounding_box_mask
+
+
+colors = {
+    "blue": (136, 207, 249),
+    "red": (255, 0, 0),
+    "green": (0, 255, 0),
+    "yellow": (255, 255, 0),
+    "purple": (128, 0, 128),
+    "cyan": (0, 255, 255),
+    "magenta": (255, 0, 255),
+    "orange": (255, 165, 0),
+    "lime": (50, 205, 50),
+    "pink": (255, 192, 203),
+    "brown": (139, 69, 19),
+    "gray": (128, 128, 128),
+    "black": (0, 0, 0),
+    "white": (255, 255, 255),
+    "gold": (255, 215, 0),
+    "silver": (192, 192, 192),
+    "beige": (245, 245, 220),
+    "navy": (0, 0, 128),
+    "maroon": (128, 0, 0),
+    "olive": (128, 128, 0),
+}
+
+
+def overlay_mask(image, mask, opacity=0.5, color="blue"):
+    """
+    Takes in a PIL image and a PIL boolean image mask. Overlay the mask on the image
+    and color the mask with a low opacity blue with hex #88CFF9.
+    """
+    # Convert the boolean mask to an image with alpha channel
+    alpha = mask.convert("L").point(lambda x: 255 if x == 255 else 0, mode="1")
+
+    # Choose the color
+    r, g, b = colors[color]
+
+    color_mask = Image.new("RGBA", mask.size, (r, g, b, int(opacity * 255)))
+    mask_rgba = Image.composite(
+        color_mask, Image.new("RGBA", mask.size, (0, 0, 0, 0)), alpha
+    )
+
+    # Create a new RGBA image to overlay the mask on
+    overlay = Image.new("RGBA", image.size, (0, 0, 0, 0))
+
+    # Paste the mask onto the overlay
+    overlay.paste(mask_rgba, (0, 0))
+
+    # Create a new image to return by blending the original image and the overlay
+    result = Image.alpha_composite(image.convert("RGBA"), overlay)
+
+    # Convert the result back to the original mode and return it
+    return result.convert(image.mode)
+
+
+def resize_preserve_aspect_ratio(image, max_side=512):
+    width, height = image.size
+    scale = min(max_side / width, max_side / height)
+    new_width = int(width * scale)
+    new_height = int(height * scale)
+    return image.resize((new_width, new_height))
+
+
+def round_to_nearest_eigth(value):
+    return int((value // 8 * 8))
+
+
+def resize_image_to_nearest_eight(image):
+    width, height = image.size
+    width, height = round_to_nearest_eigth(width), round_to_nearest_eigth(height)
+    image = image.resize((width, height))
+    return image
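To see the two geometry helpers above in isolation, here is a small self-contained sketch on a synthetic mask, so no model or checkpoint is needed. It assumes the repository root is on PYTHONPATH so `yolo.utils` is importable; the file names it writes are arbitrary:

```python
import numpy as np
from PIL import Image

from yolo.utils import get_bounding_box_mask, overlay_mask

# Synthetic 200x200 image with a white rectangular "mask" in the middle
image = Image.new("RGB", (200, 200), (30, 30, 30))
mask_array = np.zeros((200, 200), dtype=np.uint8)
mask_array[60:140, 50:150] = 255
mask = Image.fromarray(mask_array, mode="L")

# Rectangular mask covering the object's bounding box, widened by 10 px on each side
box_mask = get_bounding_box_mask(mask, widen=10)

# Blend the box mask over the image in semi-transparent red and save a preview
preview = overlay_mask(image, box_mask, opacity=0.5, color="red")
preview.save("box_overlay_preview.png")
```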