NandiniLokeshReddy committed on
Commit d8dd7fb · verified · 1 Parent(s): e0b4b52

Upload 38 files

Files changed (39)
  1. .gitattributes +4 -0
  2. EfficientSAM-main/.DS_Store +0 -0
  3. EfficientSAM-main/.gitignore +5 -0
  4. EfficientSAM-main/EfficientSAM_example.py +54 -0
  5. EfficientSAM-main/EfficientSAM_onnx_example.py +77 -0
  6. EfficientSAM-main/LICENSE +201 -0
  7. EfficientSAM-main/README.md +63 -0
  8. EfficientSAM-main/efficient_sam/__init__.py +7 -0
  9. EfficientSAM-main/efficient_sam/build_efficient_sam.py +22 -0
  10. EfficientSAM-main/efficient_sam/efficient_sam.py +305 -0
  11. EfficientSAM-main/efficient_sam/efficient_sam_decoder.py +315 -0
  12. EfficientSAM-main/efficient_sam/efficient_sam_encoder.py +257 -0
  13. EfficientSAM-main/efficient_sam/mlp.py +29 -0
  14. EfficientSAM-main/efficient_sam/two_way_transformer.py +266 -0
  15. EfficientSAM-main/export_to_onnx.py +139 -0
  16. EfficientSAM-main/export_to_torchscript.py +17 -0
  17. EfficientSAM-main/figs/.DS_Store +0 -0
  18. EfficientSAM-main/figs/examples/demo_box.png +3 -0
  19. EfficientSAM-main/figs/examples/demo_everything.png +3 -0
  20. EfficientSAM-main/figs/examples/demo_point.png +3 -0
  21. EfficientSAM-main/figs/examples/demo_saliency.png +3 -0
  22. EfficientSAM-main/figs/examples/dogs.jpg +0 -0
  23. EfficientSAM-main/figs/examples/dogs_efficient_sam_vits_mask.png +0 -0
  24. EfficientSAM-main/figs/examples/dogs_efficient_sam_vitt_mask.png +0 -0
  25. EfficientSAM-main/figs/examples/dogs_efficientsam_s_mask.png +0 -0
  26. EfficientSAM-main/figs/examples/dogs_efficientsam_ti_mask.png +0 -0
  27. EfficientSAM-main/figs/examples/dogs_squeeze_sam_mask.png +0 -0
  28. EfficientSAM-main/linter.sh +32 -0
  29. EfficientSAM-main/notebooks/EfficientSAM_example.ipynb +0 -0
  30. EfficientSAM-main/notebooks/EfficientSAM_segment_everything_example.ipynb +0 -0
  31. EfficientSAM-main/onnx_models.py +166 -0
  32. EfficientSAM-main/setup.cfg +11 -0
  33. EfficientSAM-main/setup.py +18 -0
  34. EfficientSAM-main/torchscripted_model/efficient_sam_vitt_torchscript.pt +3 -0
  35. EfficientSAM-main/weights/efficient_sam_vits.pt.zip +3 -0
  36. EfficientSAM-main/weights/efficient_sam_vitt.onnx +3 -0
  37. EfficientSAM-main/weights/efficient_sam_vitt.pt +3 -0
  38. EfficientSAM-main/weights/efficient_sam_vitt_decoder.onnx +3 -0
  39. EfficientSAM-main/weights/efficient_sam_vitt_encoder.onnx +3 -0
.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ EfficientSAM-main/figs/examples/demo_box.png filter=lfs diff=lfs merge=lfs -text
+ EfficientSAM-main/figs/examples/demo_everything.png filter=lfs diff=lfs merge=lfs -text
+ EfficientSAM-main/figs/examples/demo_point.png filter=lfs diff=lfs merge=lfs -text
+ EfficientSAM-main/figs/examples/demo_saliency.png filter=lfs diff=lfs merge=lfs -text
EfficientSAM-main/.DS_Store ADDED
Binary file (10.2 kB).
 
EfficientSAM-main/.gitignore ADDED
@@ -0,0 +1,5 @@
*.pyc
*.pyo
*.pyd
__py
**/__pycache__/
EfficientSAM-main/EfficientSAM_example.py ADDED
@@ -0,0 +1,54 @@
from efficient_sam.build_efficient_sam import build_efficient_sam_vitt, build_efficient_sam_vits
# from squeeze_sam.build_squeeze_sam import build_squeeze_sam

from PIL import Image
from torchvision import transforms
import torch
import numpy as np
import zipfile


models = {}

# Build the EfficientSAM-Ti model.
models['efficientsam_ti'] = build_efficient_sam_vitt()

# Since the EfficientSAM-S checkpoint file is >100MB, we store it as a zip file.
with zipfile.ZipFile("weights/efficient_sam_vits.pt.zip", 'r') as zip_ref:
    zip_ref.extractall("weights")
# Build the EfficientSAM-S model.
models['efficientsam_s'] = build_efficient_sam_vits()

# Build the SqueezeSAM model.
# models['squeeze_sam'] = build_squeeze_sam()

# Load an image.
sample_image_np = np.array(Image.open("figs/examples/dogs.jpg"))
sample_image_tensor = transforms.ToTensor()(sample_image_np)

# Feed a few (x, y) points in the mask as input.
input_points = torch.tensor([[[[580, 350], [650, 350]]]])
input_labels = torch.tensor([[[1, 1]]])

# Run inference for both the EfficientSAM-Ti and EfficientSAM-S models.
for model_name, model in models.items():
    print('Running inference using ', model_name)
    predicted_logits, predicted_iou = model(
        sample_image_tensor[None, ...],
        input_points,
        input_labels,
    )
    sorted_ids = torch.argsort(predicted_iou, dim=-1, descending=True)
    predicted_iou = torch.take_along_dim(predicted_iou, sorted_ids, dim=2)
    predicted_logits = torch.take_along_dim(
        predicted_logits, sorted_ids[..., None, None], dim=2
    )
    # The masks are already sorted by their predicted IOUs.
    # The first dimension is the batch size (we have a single image, so it is 1).
    # The second dimension is the number of input queries (in this case, it is only 1).
    # The third dimension is the number of candidate masks output by the model.
    # For this demo we use the first mask.
    mask = torch.ge(predicted_logits[0, 0, 0, :, :], 0).cpu().detach().numpy()
    masked_image_np = sample_image_np.copy().astype(np.uint8) * mask[:, :, None]
    Image.fromarray(masked_image_np).save(f"figs/examples/dogs_{model_name}_mask.png")
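The script above uses positive point prompts (label 1). The prompt encoder in efficient_sam_decoder.py also understands box prompts, encoded as two points labeled 2 (top-left corner) and 3 (bottom-right corner). A minimal sketch reusing the model and image loaded above; the box coordinates are illustrative, not from the original demo:

```python
# Hypothetical box prompt: top-left and bottom-right corners of a box around an object.
box_points = torch.tensor([[[[500, 300], [700, 450]]]], dtype=torch.float32)
box_labels = torch.tensor([[[2, 3]]])  # 2 = box top-left corner, 3 = box bottom-right corner

with torch.no_grad():
    logits, iou = models['efficientsam_ti'](
        sample_image_tensor[None, ...], box_points, box_labels
    )
box_mask = torch.ge(logits[0, 0, 0, :, :], 0).cpu().numpy()  # first candidate mask
```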
EfficientSAM-main/EfficientSAM_onnx_example.py ADDED
@@ -0,0 +1,77 @@
# ONNX export code is from the [labelme annotation tool](https://github.com/labelmeai/efficient-sam). Huge thanks to Kentaro Wada.

import numpy as np
import torch

import imgviz
import onnxruntime
import time
from PIL import Image


def predict_onnx(input_image, input_points, input_labels):
    # Flip this to 1 to run the combined ONNX model instead of the separate
    # encoder/decoder sessions benchmarked below.
    if 0:
        inference_session = onnxruntime.InferenceSession(
            "weights/efficient_sam_vitt.onnx"
        )
        (
            predicted_logits,
            predicted_iou,
            predicted_lowres_logits,
        ) = inference_session.run(
            output_names=None,
            input_feed={
                "batched_images": input_image,
                "batched_point_coords": input_points,
                "batched_point_labels": input_labels,
            },
        )
    else:
        inference_session = onnxruntime.InferenceSession(
            "weights/efficient_sam_vitt_encoder.onnx"
        )
        t_start = time.time()
        image_embeddings, = inference_session.run(
            output_names=None,
            input_feed={
                "batched_images": input_image,
            },
        )
        print("encoder time", time.time() - t_start)

        inference_session = onnxruntime.InferenceSession(
            "weights/efficient_sam_vitt_decoder.onnx"
        )
        t_start = time.time()
        (
            predicted_logits,
            predicted_iou,
            predicted_lowres_logits,
        ) = inference_session.run(
            output_names=None,
            input_feed={
                "image_embeddings": image_embeddings,
                "batched_point_coords": input_points,
                "batched_point_labels": input_labels,
                "orig_im_size": np.array(input_image.shape[2:], dtype=np.int64),
            },
        )
        print("decoder time", time.time() - t_start)
    mask = predicted_logits[0, 0, 0, :, :] >= 0
    imgviz.io.imsave("figs/examples/dogs_onnx_mask.png", mask)


def main():
    image = np.array(Image.open("figs/examples/dogs.jpg"))

    input_image = image.transpose(2, 0, 1)[None].astype(np.float32) / 255.0
    # batch_size, num_queries, num_points, 2
    input_points = np.array([[[[580, 350], [650, 350]]]], dtype=np.float32)
    # batch_size, num_queries, num_points
    input_labels = np.array([[[1, 1]]], dtype=np.float32)

    predict_onnx(input_image, input_points, input_labels)


if __name__ == "__main__":
    main()
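imgviz saves the raw boolean mask here; to eyeball the segmentation itself, the mask can be applied to the input image the same way EfficientSAM_example.py does for the PyTorch models. A small helper sketch (the function name is ours, not part of the upload):

```python
import numpy as np
from PIL import Image

def save_masked_overlay(image: np.ndarray, mask: np.ndarray, path: str) -> None:
    """Zero out pixels outside the predicted mask and save the result."""
    masked = (image * mask[:, :, None]).astype(np.uint8)
    Image.fromarray(masked).save(path)

# e.g. save_masked_overlay(image, mask, "figs/examples/dogs_onnx_overlay.png")
# with `image` from main() and `mask` from predict_onnx().
```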
EfficientSAM-main/LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
EfficientSAM-main/README.md ADDED
@@ -0,0 +1,63 @@
# EfficientSAM
EfficientSAM: Leveraged Masked Image Pretraining for Efficient Segment Anything

## News
[Jan.12 2024] The ONNX version of EfficientSAM, including a separate encoder and decoder, is available on the [Hugging Face Space](https://huggingface.co/spaces/yunyangx/EfficientSAM/tree/main) (thanks to @wkentaro Kentaro Wada for implementing the ONNX export).

[Dec.31 2023] EfficientSAM is integrated into the annotation tool [Labelme](https://github.com/labelmeai/labelme) (huge thanks to the Labelme team and @wkentaro Kentaro Wada).

[Dec.11 2023] The EfficientSAM model code with checkpoints is fully available in this repository. The [example](https://github.com/yformer/EfficientSAM/blob/main/EfficientSAM_example.py) script shows how to instantiate the model with a checkpoint and query points on an image.

[Dec.10 2023] The Grounded EfficientSAM demo is available on [Grounded-Efficient-Segment-Anything](https://github.com/IDEA-Research/Grounded-Segment-Anything/tree/main/EfficientSAM) (huge thanks to the IDEA-Research team and @rentainhe for supporting the [grounded-efficient-sam demo](https://github.com/IDEA-Research/Grounded-Segment-Anything/blob/main/EfficientSAM/grounded_efficient_sam.py) under [Grounded-Segment-Anything](https://github.com/IDEA-Research/Grounded-Segment-Anything)).

[Dec.6 2023] The EfficientSAM demo is available on the [Hugging Face Space](https://huggingface.co/spaces/yunyangx/EfficientSAM) (huge thanks to the HF team for their support).

[Dec.5 2023] We release the TorchScript version of EfficientSAM and share a Colab.

## Online Demo & Examples
The online demo and examples can be found on the [project page](https://yformer.github.io/efficient-sam/).

## EfficientSAM Instance Segmentation Examples
| | |
:-------------------------:|:-------------------------:
Point-prompt | ![point-prompt](figs/examples/demo_point.png)
Box-prompt | ![box-prompt](figs/examples/demo_box.png)
Segment everything | ![segment everything](figs/examples/demo_everything.png)
Saliency | ![Saliency](figs/examples/demo_saliency.png)

## Model
EfficientSAM checkpoints are available under the weights folder of this GitHub repository. Example instantiations and runs of the models can be found in [EfficientSAM_example.py](https://github.com/yformer/EfficientSAM/blob/main/EfficientSAM_example.py).

| EfficientSAM-S | EfficientSAM-Ti |
|------------------------------|------------------------------|
| [Download](https://github.com/yformer/EfficientSAM/blob/main/weights/efficient_sam_vits.pt.zip) | [Download](https://github.com/yformer/EfficientSAM/blob/main/weights/efficient_sam_vitt.pt) |

You can directly use EfficientSAM with the checkpoints:
```
from efficient_sam.build_efficient_sam import build_efficient_sam_vitt, build_efficient_sam_vits
efficientsam = build_efficient_sam_vitt()
```

## Jupyter Notebook Example
The notebooks are shared [here](https://github.com/yformer/EfficientSAM/blob/main/notebooks).

## Acknowledgement
+ [SAM](https://github.com/facebookresearch/segment-anything)
+ [MobileSAM](https://github.com/ChaoningZhang/MobileSAM)
+ [FastSAM](https://github.com/CASIA-IVA-Lab/FastSAM)
+ [U-2-Net](https://github.com/xuebinqin/U-2-Net)

If you're using EfficientSAM in your research or applications, please cite it using this BibTeX:
```bibtex
@article{xiong2023efficientsam,
  title={EfficientSAM: Leveraged Masked Image Pretraining for Efficient Segment Anything},
  author={Yunyang Xiong and Bala Varadarajan and Lemeng Wu and Xiaoyu Xiang and Fanyi Xiao and Chenchen Zhu and Xiaoliang Dai and Dilin Wang and Fei Sun and Forrest Iandola and Raghuraman Krishnamoorthi and Vikas Chandra},
  journal={arXiv:2312.00863},
  year={2023}
}
```
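Beyond instantiation, here is a condensed point-prompt example, a sketch distilled from EfficientSAM_example.py in this upload (paths are relative to EfficientSAM-main):

```python
import numpy as np
import torch
from PIL import Image
from torchvision import transforms

from efficient_sam.build_efficient_sam import build_efficient_sam_vitt

model = build_efficient_sam_vitt()  # EfficientSAM-Ti with its bundled checkpoint
image_np = np.array(Image.open("figs/examples/dogs.jpg"))
image = transforms.ToTensor()(image_np)[None, ...]    # [1, 3, H, W]
points = torch.tensor([[[[580, 350], [650, 350]]]])   # [B, num_queries, num_pts, 2]
labels = torch.tensor([[[1, 1]]])                     # 1 = positive click
with torch.no_grad():
    logits, iou = model(image, points, labels)
mask = torch.ge(logits[0, 0, 0], 0).numpy()  # first candidate mask; sort by `iou` to pick the best
```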
EfficientSAM-main/efficient_sam/__init__.py ADDED
@@ -0,0 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

from .build_efficient_sam import (
    build_efficient_sam_vitt,
    build_efficient_sam_vits,
)
EfficientSAM-main/efficient_sam/build_efficient_sam.py ADDED
@@ -0,0 +1,22 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

from .efficient_sam import build_efficient_sam


def build_efficient_sam_vitt():
    return build_efficient_sam(
        encoder_patch_embed_dim=192,
        encoder_num_heads=3,
        checkpoint="weights/efficient_sam_vitt.pt",
    ).eval()


def build_efficient_sam_vits():
    return build_efficient_sam(
        encoder_patch_embed_dim=384,
        encoder_num_heads=6,
        checkpoint="weights/efficient_sam_vits.pt",
    ).eval()
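Both builders load checkpoints by relative path, so they expect to run from the repository root; for EfficientSAM-S the checkpoint ships zipped in this upload (the .pt is >100MB), so it has to be extracted first, as EfficientSAM_example.py does:

```python
import zipfile

from efficient_sam.build_efficient_sam import build_efficient_sam_vits

# Unpack the ViT-S checkpoint next to the ViT-T one before building the model.
with zipfile.ZipFile("weights/efficient_sam_vits.pt.zip", "r") as zip_ref:
    zip_ref.extractall("weights")
model = build_efficient_sam_vits()
```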
EfficientSAM-main/efficient_sam/efficient_sam.py ADDED
@@ -0,0 +1,305 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import math
8
+ from typing import Any, List, Tuple, Type
9
+
10
+ import torch
11
+ import torch.nn.functional as F
12
+
13
+ from torch import nn, Tensor
14
+
15
+ from .efficient_sam_decoder import MaskDecoder, PromptEncoder
16
+ from .efficient_sam_encoder import ImageEncoderViT
17
+ from .two_way_transformer import TwoWayAttentionBlock, TwoWayTransformer
18
+
19
+ class EfficientSam(nn.Module):
20
+ mask_threshold: float = 0.0
21
+ image_format: str = "RGB"
22
+
23
+ def __init__(
24
+ self,
25
+ image_encoder: ImageEncoderViT,
26
+ prompt_encoder: PromptEncoder,
27
+ decoder_max_num_input_points: int,
28
+ mask_decoder: MaskDecoder,
29
+ pixel_mean: List[float] = [0.485, 0.456, 0.406],
30
+ pixel_std: List[float] = [0.229, 0.224, 0.225],
31
+ ) -> None:
32
+ """
33
+ SAM predicts object masks from an image and input prompts.
34
+
35
+ Arguments:
36
+ image_encoder (ImageEncoderViT): The backbone used to encode the
37
+ image into image embeddings that allow for efficient mask prediction.
38
+ prompt_encoder (PromptEncoder): Encodes various types of input prompts.
39
+ mask_decoder (MaskDecoder): Predicts masks from the image embeddings
40
+ and encoded prompts.
41
+ pixel_mean (list(float)): Mean values for normalizing pixels in the input image.
42
+ pixel_std (list(float)): Std values for normalizing pixels in the input image.
43
+ """
44
+ super().__init__()
45
+ self.image_encoder = image_encoder
46
+ self.prompt_encoder = prompt_encoder
47
+ self.decoder_max_num_input_points = decoder_max_num_input_points
48
+ self.mask_decoder = mask_decoder
49
+ self.register_buffer(
50
+ "pixel_mean", torch.Tensor(pixel_mean).view(1, 3, 1, 1), False
51
+ )
52
+ self.register_buffer(
53
+ "pixel_std", torch.Tensor(pixel_std).view(1, 3, 1, 1), False
54
+ )
55
+
56
+ @torch.jit.export
57
+ def predict_masks(
58
+ self,
59
+ image_embeddings: torch.Tensor,
60
+ batched_points: torch.Tensor,
61
+ batched_point_labels: torch.Tensor,
62
+ multimask_output: bool,
63
+ input_h: int,
64
+ input_w: int,
65
+ output_h: int = -1,
66
+ output_w: int = -1,
67
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
68
+ """
69
+ Predicts masks given image embeddings and prompts. This only runs the decoder.
70
+
71
+ Arguments:
72
+ image_embeddings: A tensor of shape [B, C, H, W] or [B*max_num_queries, C, H, W]
73
+ batched_points: A tensor of shape [B, max_num_queries, num_pts, 2]
74
+ batched_point_labels: A tensor of shape [B, max_num_queries, num_pts]
75
+ Returns:
76
+ A tuple of two tensors:
77
+ low_res_mask: A tensor of shape [B, max_num_queries, 256, 256] of predicted masks
78
+ iou_predictions: A tensor of shape [B, max_num_queries] of estimated IOU scores
79
+ """
80
+
81
+ batch_size, max_num_queries, num_pts, _ = batched_points.shape
82
+ num_pts = batched_points.shape[2]
83
+ rescaled_batched_points = self.get_rescaled_pts(batched_points, input_h, input_w)
84
+
85
+ if num_pts > self.decoder_max_num_input_points:
86
+ rescaled_batched_points = rescaled_batched_points[
87
+ :, :, : self.decoder_max_num_input_points, :
88
+ ]
89
+ batched_point_labels = batched_point_labels[
90
+ :, :, : self.decoder_max_num_input_points
91
+ ]
92
+ elif num_pts < self.decoder_max_num_input_points:
93
+ rescaled_batched_points = F.pad(
94
+ rescaled_batched_points,
95
+ (0, 0, 0, self.decoder_max_num_input_points - num_pts),
96
+ value=-1.0,
97
+ )
98
+ batched_point_labels = F.pad(
99
+ batched_point_labels,
100
+ (0, self.decoder_max_num_input_points - num_pts),
101
+ value=-1.0,
102
+ )
103
+
104
+ sparse_embeddings = self.prompt_encoder(
105
+ rescaled_batched_points.reshape(
106
+ batch_size * max_num_queries, self.decoder_max_num_input_points, 2
107
+ ),
108
+ batched_point_labels.reshape(
109
+ batch_size * max_num_queries, self.decoder_max_num_input_points
110
+ ),
111
+ )
112
+ sparse_embeddings = sparse_embeddings.view(
113
+ batch_size,
114
+ max_num_queries,
115
+ sparse_embeddings.shape[1],
116
+ sparse_embeddings.shape[2],
117
+ )
118
+ low_res_masks, iou_predictions = self.mask_decoder(
119
+ image_embeddings,
120
+ self.prompt_encoder.get_dense_pe(),
121
+ sparse_prompt_embeddings=sparse_embeddings,
122
+ multimask_output=multimask_output,
123
+ )
124
+ _, num_predictions, low_res_size, _ = low_res_masks.shape
125
+
126
+ if output_w > 0 and output_h > 0:
127
+ output_masks = F.interpolate(
128
+ low_res_masks, (output_h, output_w), mode="bicubic"
129
+ )
130
+ output_masks = torch.reshape(
131
+ output_masks,
132
+ (batch_size, max_num_queries, num_predictions, output_h, output_w),
133
+ )
134
+ else:
135
+ output_masks = torch.reshape(
136
+ low_res_masks,
137
+ (
138
+ batch_size,
139
+ max_num_queries,
140
+ num_predictions,
141
+ low_res_size,
142
+ low_res_size,
143
+ ),
144
+ )
145
+ iou_predictions = torch.reshape(
146
+ iou_predictions, (batch_size, max_num_queries, num_predictions)
147
+ )
148
+ return output_masks, iou_predictions
149
+
150
+ def get_rescaled_pts(self, batched_points: torch.Tensor, input_h: int, input_w: int):
151
+ return torch.stack(
152
+ [
153
+ torch.where(
154
+ batched_points[..., 0] >= 0,
155
+ batched_points[..., 0] * self.image_encoder.img_size / input_w,
156
+ -1.0,
157
+ ),
158
+ torch.where(
159
+ batched_points[..., 1] >= 0,
160
+ batched_points[..., 1] * self.image_encoder.img_size / input_h,
161
+ -1.0,
162
+ ),
163
+ ],
164
+ dim=-1,
165
+ )
166
+
167
+ @torch.jit.export
168
+ def get_image_embeddings(self, batched_images) -> torch.Tensor:
169
+ """
170
+ Predicts masks end-to-end from provided images and prompts.
171
+ If prompts are not known in advance, using SamPredictor is
172
+ recommended over calling the model directly.
173
+
174
+ Arguments:
175
+ batched_images: A tensor of shape [B, 3, H, W]
176
+ Returns:
177
+ List of image embeddings each of of shape [B, C(i), H(i), W(i)].
178
+ The last embedding corresponds to the final layer.
179
+ """
180
+ batched_images = self.preprocess(batched_images)
181
+ return self.image_encoder(batched_images)
182
+
183
+ def forward(
184
+ self,
185
+ batched_images: torch.Tensor,
186
+ batched_points: torch.Tensor,
187
+ batched_point_labels: torch.Tensor,
188
+ scale_to_original_image_size: bool = True,
189
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
190
+ """
191
+ Predicts masks end-to-end from provided images and prompts.
192
+ If prompts are not known in advance, using SamPredictor is
193
+ recommended over calling the model directly.
194
+
195
+ Arguments:
196
+ batched_images: A tensor of shape [B, 3, H, W]
197
+ batched_points: A tensor of shape [B, num_queries, max_num_pts, 2]
198
+ batched_point_labels: A tensor of shape [B, num_queries, max_num_pts]
199
+
200
+ Returns:
201
+ A list tuples of two tensors where the ith element is by considering the first i+1 points.
202
+ low_res_mask: A tensor of shape [B, 256, 256] of predicted masks
203
+ iou_predictions: A tensor of shape [B, max_num_queries] of estimated IOU scores
204
+ """
205
+ batch_size, _, input_h, input_w = batched_images.shape
206
+ image_embeddings = self.get_image_embeddings(batched_images)
207
+ return self.predict_masks(
208
+ image_embeddings,
209
+ batched_points,
210
+ batched_point_labels,
211
+ multimask_output=True,
212
+ input_h=input_h,
213
+ input_w=input_w,
214
+ output_h=input_h if scale_to_original_image_size else -1,
215
+ output_w=input_w if scale_to_original_image_size else -1,
216
+ )
217
+
218
+ def preprocess(self, x: torch.Tensor) -> torch.Tensor:
219
+ """Normalize pixel values and pad to a square input."""
220
+ if (
221
+ x.shape[2] != self.image_encoder.img_size
222
+ or x.shape[3] != self.image_encoder.img_size
223
+ ):
224
+ x = F.interpolate(
225
+ x,
226
+ (self.image_encoder.img_size, self.image_encoder.img_size),
227
+ mode="bilinear",
228
+ )
229
+ return (x - self.pixel_mean) / self.pixel_std
230
+
231
+
232
+ def build_efficient_sam(encoder_patch_embed_dim, encoder_num_heads, checkpoint=None):
233
+ img_size = 1024
234
+ encoder_patch_size = 16
235
+ encoder_depth = 12
236
+ encoder_mlp_ratio = 4.0
237
+ encoder_neck_dims = [256, 256]
238
+ decoder_max_num_input_points = 6
239
+ decoder_transformer_depth = 2
240
+ decoder_transformer_mlp_dim = 2048
241
+ decoder_num_heads = 8
242
+ decoder_upscaling_layer_dims = [64, 32]
243
+ num_multimask_outputs = 3
244
+ iou_head_depth = 3
245
+ iou_head_hidden_dim = 256
246
+ activation = "gelu"
247
+ normalization_type = "layer_norm"
248
+ normalize_before_activation = False
249
+
250
+ assert activation == "relu" or activation == "gelu"
251
+ if activation == "relu":
252
+ activation_fn = nn.ReLU
253
+ else:
254
+ activation_fn = nn.GELU
255
+
256
+ image_encoder = ImageEncoderViT(
257
+ img_size=img_size,
258
+ patch_size=encoder_patch_size,
259
+ in_chans=3,
260
+ patch_embed_dim=encoder_patch_embed_dim,
261
+ normalization_type=normalization_type,
262
+ depth=encoder_depth,
263
+ num_heads=encoder_num_heads,
264
+ mlp_ratio=encoder_mlp_ratio,
265
+ neck_dims=encoder_neck_dims,
266
+ act_layer=activation_fn,
267
+ )
268
+
269
+ image_embedding_size = image_encoder.image_embedding_size
270
+ encoder_transformer_output_dim = image_encoder.transformer_output_dim
271
+
272
+ sam = EfficientSam(
273
+ image_encoder=image_encoder,
274
+ prompt_encoder=PromptEncoder(
275
+ embed_dim=encoder_transformer_output_dim,
276
+ image_embedding_size=(image_embedding_size, image_embedding_size),
277
+ input_image_size=(img_size, img_size),
278
+ ),
279
+ decoder_max_num_input_points=decoder_max_num_input_points,
280
+ mask_decoder=MaskDecoder(
281
+ transformer_dim=encoder_transformer_output_dim,
282
+ transformer=TwoWayTransformer(
283
+ depth=decoder_transformer_depth,
284
+ embedding_dim=encoder_transformer_output_dim,
285
+ num_heads=decoder_num_heads,
286
+ mlp_dim=decoder_transformer_mlp_dim,
287
+ activation=activation_fn,
288
+ normalize_before_activation=normalize_before_activation,
289
+ ),
290
+ num_multimask_outputs=num_multimask_outputs,
291
+ activation=activation_fn,
292
+ normalization_type=normalization_type,
293
+ normalize_before_activation=normalize_before_activation,
294
+ iou_head_depth=iou_head_depth - 1,
295
+ iou_head_hidden_dim=iou_head_hidden_dim,
296
+ upscaling_layer_dims=decoder_upscaling_layer_dims,
297
+ ),
298
+ pixel_mean=[0.485, 0.456, 0.406],
299
+ pixel_std=[0.229, 0.224, 0.225],
300
+ )
301
+ if checkpoint is not None:
302
+ with open(checkpoint, "rb") as f:
303
+ state_dict = torch.load(f, map_location="cpu")
304
+ sam.load_state_dict(state_dict["model"])
305
+ return sam
EfficientSAM-main/efficient_sam/efficient_sam_decoder.py ADDED
@@ -0,0 +1,315 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from typing import List, Tuple, Type
8
+
9
+ import numpy as np
10
+ import torch
11
+ import torch.nn as nn
12
+ import torch.nn.functional as F
13
+
14
+ from .mlp import MLPBlock
15
+
16
+
17
+ class PromptEncoder(nn.Module):
18
+ def __init__(
19
+ self,
20
+ embed_dim: int,
21
+ image_embedding_size: Tuple[int, int],
22
+ input_image_size: Tuple[int, int],
23
+ ) -> None:
24
+ """
25
+ Encodes prompts for input to SAM's mask decoder.
26
+
27
+ Arguments:
28
+ embed_dim (int): The prompts' embedding dimension
29
+ image_embedding_size (tuple(int, int)): The spatial size of the
30
+ image embedding, as (H, W).
31
+ input_image_size (int): The padded size of the image as input
32
+ to the image encoder, as (H, W).
33
+ """
34
+ super().__init__()
35
+ self.embed_dim = embed_dim
36
+ self.input_image_size = input_image_size
37
+ self.image_embedding_size = image_embedding_size
38
+ self.pe_layer = PositionEmbeddingRandom(embed_dim // 2)
39
+ self.invalid_points = nn.Embedding(1, embed_dim)
40
+ self.point_embeddings = nn.Embedding(1, embed_dim)
41
+ self.bbox_top_left_embeddings = nn.Embedding(1, embed_dim)
42
+ self.bbox_bottom_right_embeddings = nn.Embedding(1, embed_dim)
43
+
44
+ def get_dense_pe(self) -> torch.Tensor:
45
+ """
46
+ Returns the positional encoding used to encode point prompts,
47
+ applied to a dense set of points the shape of the image encoding.
48
+
49
+ Returns:
50
+ torch.Tensor: Positional encoding with shape
51
+ 1x(embed_dim)x(embedding_h)x(embedding_w)
52
+ """
53
+ return self.pe_layer(self.image_embedding_size).unsqueeze(0)
54
+
55
+ def _embed_points(
56
+ self,
57
+ points: torch.Tensor,
58
+ labels: torch.Tensor,
59
+ ) -> torch.Tensor:
60
+ """Embeds point prompts."""
61
+ points = points + 0.5 # Shift to center of pixel
62
+ point_embedding = self.pe_layer.forward_with_coords(
63
+ points, self.input_image_size
64
+ )
65
+ invalid_label_ids = torch.eq(labels, -1)[:,:,None]
66
+ point_label_ids = torch.eq(labels, 1)[:,:,None]
67
+ topleft_label_ids = torch.eq(labels, 2)[:,:,None]
68
+ bottomright_label_ids = torch.eq(labels, 3)[:,:,None]
69
+ point_embedding = point_embedding + self.invalid_points.weight[:,None,:] * invalid_label_ids
70
+ point_embedding = point_embedding + self.point_embeddings.weight[:,None,:] * point_label_ids
71
+ point_embedding = point_embedding + self.bbox_top_left_embeddings.weight[:,None,:] * topleft_label_ids
72
+ point_embedding = point_embedding + self.bbox_bottom_right_embeddings.weight[:,None,:] * bottomright_label_ids
73
+ return point_embedding
74
+
75
+ def forward(
76
+ self,
77
+ coords,
78
+ labels,
79
+ ) -> torch.Tensor:
80
+ """
81
+ Embeds different types of prompts, returning both sparse and dense
82
+ embeddings.
83
+
84
+ Arguments:
85
+ points: A tensor of shape [B, 2]
86
+ labels: An integer tensor of shape [B] where each element is 1,2 or 3.
87
+
88
+ Returns:
89
+ torch.Tensor: sparse embeddings for the points and boxes, with shape
90
+ BxNx(embed_dim), where N is determined by the number of input points
91
+ and boxes.
92
+ """
93
+ return self._embed_points(coords, labels)
94
+
95
+
96
+ class PositionEmbeddingRandom(nn.Module):
97
+ """
98
+ Positional encoding using random spatial frequencies.
99
+ """
100
+
101
+ def __init__(self, num_pos_feats: int) -> None:
102
+ super().__init__()
103
+ self.register_buffer(
104
+ "positional_encoding_gaussian_matrix", torch.randn((2, num_pos_feats))
105
+ )
106
+
107
+ def _pe_encoding(self, coords: torch.Tensor) -> torch.Tensor:
108
+ """Positionally encode points that are normalized to [0,1]."""
109
+ # assuming coords are in [0, 1]^2 square and have d_1 x ... x d_n x 2 shape
110
+ coords = 2 * coords - 1
111
+ coords = coords @ self.positional_encoding_gaussian_matrix
112
+ coords = 2 * np.pi * coords
113
+ # outputs d_1 x ... x d_n x C shape
114
+ return torch.cat([torch.sin(coords), torch.cos(coords)], dim=-1)
115
+
116
+ def forward(self, size: Tuple[int, int]) -> torch.Tensor:
117
+ """Generate positional encoding for a grid of the specified size."""
118
+ h, w = size
119
+ device = self.positional_encoding_gaussian_matrix.device
120
+ grid = torch.ones([h, w], device=device, dtype=torch.float32)
121
+ y_embed = grid.cumsum(dim=0) - 0.5
122
+ x_embed = grid.cumsum(dim=1) - 0.5
123
+ y_embed = y_embed / h
124
+ x_embed = x_embed / w
125
+
126
+ pe = self._pe_encoding(torch.stack([x_embed, y_embed], dim=-1))
127
+ return pe.permute(2, 0, 1) # C x H x W
128
+
129
+ def forward_with_coords(
130
+ self, coords_input: torch.Tensor, image_size: Tuple[int, int]
131
+ ) -> torch.Tensor:
132
+ """Positionally encode points that are not normalized to [0,1]."""
133
+ coords = coords_input.clone()
134
+ coords[:, :, 0] = coords[:, :, 0] / image_size[1]
135
+ coords[:, :, 1] = coords[:, :, 1] / image_size[0]
136
+ return self._pe_encoding(coords.to(torch.float)) # B x N x C
137
+
138
+
139
+ class MaskDecoder(nn.Module):
140
+ def __init__(
141
+ self,
142
+ *,
143
+ transformer_dim: int,
144
+ transformer: nn.Module,
145
+ num_multimask_outputs: int,
146
+ activation: Type[nn.Module],
147
+ normalization_type: str,
148
+ normalize_before_activation: bool,
149
+ iou_head_depth: int,
150
+ iou_head_hidden_dim: int,
151
+ upscaling_layer_dims: List[int],
152
+ ) -> None:
153
+ """
154
+ Predicts masks given an image and prompt embeddings, using a
155
+ transformer architecture.
156
+
157
+ Arguments:
158
+ transformer_dim (int): the channel dimension of the transformer
159
+ transformer (nn.Module): the transformer used to predict masks
160
+ num_multimask_outputs (int): the number of masks to predict
161
+ when disambiguating masks
162
+ activation (nn.Module): the type of activation to use when
163
+ upscaling masks
164
+ iou_head_depth (int): the depth of the MLP used to predict
165
+ mask quality
166
+ iou_head_hidden_dim (int): the hidden dimension of the MLP
167
+ used to predict mask quality
168
+ """
169
+ super().__init__()
170
+ self.transformer_dim = transformer_dim
171
+ self.transformer = transformer
172
+
173
+ self.num_multimask_outputs = num_multimask_outputs
174
+
175
+ self.iou_token = nn.Embedding(1, transformer_dim)
176
+ if num_multimask_outputs > 1:
177
+ self.num_mask_tokens = num_multimask_outputs + 1
178
+ else:
179
+ self.num_mask_tokens = 1
180
+ self.mask_tokens = nn.Embedding(self.num_mask_tokens, transformer_dim)
181
+ output_dim_after_upscaling = transformer_dim
182
+
183
+ self.final_output_upscaling_layers = nn.ModuleList([])
184
+ for idx, layer_dims in enumerate(upscaling_layer_dims):
185
+ self.final_output_upscaling_layers.append(
186
+ nn.Sequential(
187
+ nn.ConvTranspose2d(
188
+ output_dim_after_upscaling,
189
+ layer_dims,
190
+ kernel_size=2,
191
+ stride=2,
192
+ ),
193
+ nn.GroupNorm(1, layer_dims)
194
+ if idx < len(upscaling_layer_dims) - 1
195
+ else nn.Identity(),
196
+ activation(),
197
+ )
198
+ )
199
+ output_dim_after_upscaling = layer_dims
200
+
201
+ self.output_hypernetworks_mlps = nn.ModuleList(
202
+ [
203
+ MLPBlock(
204
+ input_dim=transformer_dim,
205
+ hidden_dim=transformer_dim,
206
+ output_dim=output_dim_after_upscaling,
207
+ num_layers=2,
208
+ act=activation,
209
+ )
210
+ for i in range(self.num_mask_tokens)
211
+ ]
212
+ )
213
+
214
+ self.iou_prediction_head = MLPBlock(
215
+ input_dim=transformer_dim,
216
+ hidden_dim=iou_head_hidden_dim,
217
+ output_dim=self.num_mask_tokens,
218
+ num_layers=iou_head_depth,
219
+ act=activation,
220
+ )
221
+
222
+ def forward(
223
+ self,
224
+ image_embeddings: torch.Tensor,
225
+ image_pe: torch.Tensor,
226
+ sparse_prompt_embeddings: torch.Tensor,
227
+ multimask_output: bool,
228
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
229
+ """
230
+ Predict masks given image and prompt embeddings.
231
+
232
+ Arguments:
233
+ image_embeddings: A tensor of shape [B, C, H, W] or [B*max_num_queries, C, H, W]
234
+ image_pe (torch.Tensor): positional encoding with the shape of image_embeddings (the batch dimension is broadcastable).
235
+ sparse_prompt_embeddings (torch.Tensor): the embeddings of the points and boxes
236
+ multimask_output (bool): Whether to return multiple masks or a single
237
+ mask.
238
+
239
+ Returns:
240
+ torch.Tensor: batched predicted masks
241
+ torch.Tensor: batched predictions of mask quality
242
+ """
243
+
244
+ (
245
+ batch_size,
246
+ max_num_queries,
247
+ sparse_embed_dim_1,
248
+ sparse_embed_dim_2,
249
+ ) = sparse_prompt_embeddings.shape
250
+
251
+ (
252
+ _,
253
+ image_embed_dim_c,
254
+ image_embed_dim_h,
255
+ image_embed_dim_w,
256
+ ) = image_embeddings.shape
257
+
258
+ # Tile the image embedding for all queries.
259
+ image_embeddings_tiled = torch.tile(
260
+ image_embeddings[:, None, :, :, :], [1, max_num_queries, 1, 1, 1]
261
+ ).view(
262
+ batch_size * max_num_queries,
263
+ image_embed_dim_c,
264
+ image_embed_dim_h,
265
+ image_embed_dim_w,
266
+ )
267
+ sparse_prompt_embeddings = sparse_prompt_embeddings.reshape(
268
+ batch_size * max_num_queries, sparse_embed_dim_1, sparse_embed_dim_2
269
+ )
270
+ masks, iou_pred = self.predict_masks(
271
+ image_embeddings=image_embeddings_tiled,
272
+ image_pe=image_pe,
273
+ sparse_prompt_embeddings=sparse_prompt_embeddings,
274
+ )
275
+ if multimask_output and self.num_multimask_outputs > 1:
276
+ return masks[:, 1:, :], iou_pred[:, 1:]
277
+ else:
278
+ return masks[:, :1, :], iou_pred[:, :1]
279
+
280
+ def predict_masks(
281
+ self,
282
+ image_embeddings: torch.Tensor,
283
+ image_pe: torch.Tensor,
284
+ sparse_prompt_embeddings: torch.Tensor,
285
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
286
+ """Predicts masks. See 'forward' for more details."""
287
+ # Concatenate output tokens
288
+ output_tokens = torch.cat(
289
+ [self.iou_token.weight, self.mask_tokens.weight], dim=0
290
+ )
291
+ output_tokens = output_tokens.unsqueeze(0).expand(
292
+ sparse_prompt_embeddings.size(0), -1, -1
293
+ )
294
+ tokens = torch.cat((output_tokens, sparse_prompt_embeddings), dim=1)
295
+ # Expand per-image data in batch direction to be per-mask
296
+ pos_src = torch.repeat_interleave(image_pe, tokens.shape[0], dim=0)
297
+ b, c, h, w = image_embeddings.shape
298
+ hs, src = self.transformer(image_embeddings, pos_src, tokens)
299
+ iou_token_out = hs[:, 0, :]
300
+ mask_tokens_out = hs[:, 1 : (1 + self.num_mask_tokens), :]
301
+
302
+ # Upscale mask embeddings and predict masks using the mask tokens
303
+ upscaled_embedding = src.transpose(1, 2).view(b, c, h, w)
304
+
305
+ for upscaling_layer in self.final_output_upscaling_layers:
306
+ upscaled_embedding = upscaling_layer(upscaled_embedding)
307
+ hyper_in_list: List[torch.Tensor] = []
308
+ for i, output_hypernetworks_mlp in enumerate(self.output_hypernetworks_mlps):
309
+ hyper_in_list.append(output_hypernetworks_mlp(mask_tokens_out[:, i, :]))
310
+ hyper_in = torch.stack(hyper_in_list, dim=1)
311
+ b, c, h, w = upscaled_embedding.shape
312
+ masks = (hyper_in @ upscaled_embedding.view(b, c, h * w)).view(b, -1, h, w)
313
+ # Generate mask quality predictions
314
+ iou_pred = self.iou_prediction_head(iou_token_out)
315
+ return masks, iou_pred
EfficientSAM-main/efficient_sam/efficient_sam_encoder.py ADDED
@@ -0,0 +1,257 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import math
8
+ from typing import List, Optional, Tuple, Type
9
+
10
+ import torch
11
+ import torch.nn as nn
12
+ import torch.nn.functional as F
13
+
14
+
15
+ class LayerNorm2d(nn.Module):
16
+ def __init__(self, num_channels: int, eps: float = 1e-6) -> None:
17
+ super().__init__()
18
+ self.weight = nn.Parameter(torch.ones(num_channels))
19
+ self.bias = nn.Parameter(torch.zeros(num_channels))
20
+ self.eps = eps
21
+
22
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
23
+ u = x.mean(1, keepdim=True)
24
+ s = (x - u).pow(2).mean(1, keepdim=True)
25
+ x = (x - u) / torch.sqrt(s + self.eps)
26
+ x = self.weight[:, None, None] * x + self.bias[:, None, None]
27
+ return x
28
+
29
+
30
+ class PatchEmbed(nn.Module):
31
+ """2D Image to Patch Embedding"""
32
+
33
+ def __init__(
34
+ self,
35
+ img_size,
36
+ patch_size,
37
+ in_chans,
38
+ embed_dim,
39
+ ):
40
+ super().__init__()
41
+ self.proj = nn.Conv2d(
42
+ in_chans,
43
+ embed_dim,
44
+ kernel_size=(patch_size, patch_size),
45
+ stride=(patch_size, patch_size),
46
+ bias=True,
47
+ )
48
+
49
+ def forward(self, x):
50
+ B, C, H, W = x.shape
51
+ x = self.proj(x)
52
+ return x
53
+
54
+
55
+ class Attention(nn.Module):
56
+ def __init__(
57
+ self,
58
+ dim,
59
+ num_heads,
60
+ qkv_bias,
61
+ qk_scale=None,
62
+ ):
63
+ super().__init__()
64
+ self.num_heads = num_heads
65
+ head_dim = dim // num_heads
66
+ self.scale = qk_scale or head_dim**-0.5
67
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
68
+ self.proj = nn.Linear(dim, dim)
69
+
70
+ def forward(self, x):
71
+ B, N, C = x.shape
72
+ qkv = (
73
+ self.qkv(x)
74
+ .reshape(B, N, 3, self.num_heads, C // self.num_heads)
75
+ .permute(2, 0, 3, 1, 4)
76
+ )
77
+ q, k, v = (
78
+ qkv[0],
79
+ qkv[1],
80
+ qkv[2],
81
+ )
82
+ attn = (q @ k.transpose(-2, -1)) * self.scale
83
+ attn = attn.softmax(dim=-1)
84
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
85
+ x = self.proj(x)
86
+ return x
87
+
88
+
89
+ class Mlp(nn.Module):
90
+ def __init__(
91
+ self,
92
+ in_features,
93
+ hidden_features=None,
94
+ out_features=None,
95
+ act_layer=nn.GELU,
96
+ ):
97
+ super().__init__()
98
+ out_features = out_features or in_features
99
+ hidden_features = hidden_features or in_features
100
+ self.fc1 = nn.Linear(in_features, hidden_features)
101
+ self.act = act_layer()
102
+ self.fc2 = nn.Linear(hidden_features, out_features)
103
+
104
+ def forward(self, x):
105
+ x = self.fc1(x)
106
+ x = self.act(x)
107
+ x = self.fc2(x)
108
+ return x
109
+
110
+
111
+ class Block(nn.Module):
112
+ def __init__(
113
+ self,
114
+ dim,
115
+ num_heads,
116
+ mlp_ratio=4.0,
117
+ qkv_bias=False,
118
+ qk_scale=None,
119
+ act_layer=nn.GELU,
120
+ ):
121
+ super().__init__()
122
+ self.norm1 = nn.LayerNorm(dim, eps=1e-6)
123
+ self.attn = Attention(
124
+ dim,
125
+ num_heads=num_heads,
126
+ qkv_bias=qkv_bias,
127
+ qk_scale=qk_scale,
128
+ )
129
+ self.norm2 = nn.LayerNorm(dim, eps=1e-6)
130
+ mlp_hidden_dim = int(dim * mlp_ratio)
131
+ self.mlp = Mlp(
132
+ in_features=dim,
133
+ hidden_features=mlp_hidden_dim,
134
+ act_layer=act_layer,
135
+ )
136
+
137
+ def forward(self, x):
138
+ x = x + self.attn(self.norm1(x))
139
+ x = x + self.mlp(self.norm2(x))
140
+ return x
141
+
142
+
143
+ @torch.jit.export
144
+ def get_abs_pos(
145
+ abs_pos: torch.Tensor, has_cls_token: bool, hw: List[int]
146
+ ) -> torch.Tensor:
147
+ """
148
+ Calculate absolute positional embeddings. If needed, resize embeddings and remove cls_token
149
+ dimension for the original embeddings.
150
+ Args:
151
+ abs_pos (Tensor): absolute positional embeddings with (1, num_position, C).
152
+ has_cls_token (bool): If true, has 1 embedding in abs_pos for cls token.
153
+ hw (Tuple): size of input image tokens.
154
+
155
+ Returns:
156
+ Absolute positional embeddings after processing with shape (1, H, W, C)
157
+ """
158
+ h = hw[0]
159
+ w = hw[1]
160
+ if has_cls_token:
161
+ abs_pos = abs_pos[:, 1:]
162
+ xy_num = abs_pos.shape[1]
163
+ size = int(math.sqrt(xy_num))
164
+ assert size * size == xy_num
165
+
166
+ if size != h or size != w:
167
+ new_abs_pos = F.interpolate(
168
+ abs_pos.reshape(1, size, size, -1).permute(0, 3, 1, 2),
169
+ size=(h, w),
170
+ mode="bicubic",
171
+ align_corners=False,
172
+ )
173
+ return new_abs_pos.permute(0, 2, 3, 1)
174
+ else:
175
+ return abs_pos.reshape(1, h, w, -1)
176
+
177
+
178
+ # Image encoder for efficient SAM.
179
+ class ImageEncoderViT(nn.Module):
180
+ def __init__(
181
+ self,
182
+ img_size: int,
183
+ patch_size: int,
184
+ in_chans: int,
185
+ patch_embed_dim: int,
186
+ normalization_type: str,
187
+ depth: int,
188
+ num_heads: int,
189
+ mlp_ratio: float,
190
+ neck_dims: List[int],
191
+ act_layer: Type[nn.Module],
192
+ ) -> None:
193
+ """
194
+ Args:
195
+ img_size (int): Input image size.
196
+ patch_size (int): Patch size.
197
+ in_chans (int): Number of input image channels.
198
+ patch_embed_dim (int): Patch embedding dimension.
199
+ depth (int): Depth of ViT.
200
+ num_heads (int): Number of attention heads in each ViT block.
201
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
202
+ act_layer (nn.Module): Activation layer.
203
+ """
204
+ super().__init__()
205
+
206
+ self.img_size = img_size
207
+ self.image_embedding_size = img_size // ((patch_size if patch_size > 0 else 1))
208
+ self.transformer_output_dim = ([patch_embed_dim] + neck_dims)[-1]
209
+ self.pretrain_use_cls_token = True
210
+ pretrain_img_size = 224
211
+ self.patch_embed = PatchEmbed(img_size, patch_size, in_chans, patch_embed_dim)
212
+ # Initialize absolute positional embedding with pretrain image size.
213
+ num_patches = (pretrain_img_size // patch_size) * (
214
+ pretrain_img_size // patch_size
215
+ )
216
+ num_positions = num_patches + 1
217
+ self.pos_embed = nn.Parameter(torch.zeros(1, num_positions, patch_embed_dim))
218
+ self.blocks = nn.ModuleList()
219
+ for i in range(depth):
220
+ vit_block = Block(patch_embed_dim, num_heads, mlp_ratio, True)
221
+ self.blocks.append(vit_block)
222
+ self.neck = nn.Sequential(
223
+ nn.Conv2d(
224
+ patch_embed_dim,
225
+ neck_dims[0],
226
+ kernel_size=1,
227
+ bias=False,
228
+ ),
229
+ LayerNorm2d(neck_dims[0]),
230
+ nn.Conv2d(
231
+ neck_dims[0],
232
+ neck_dims[0],
233
+ kernel_size=3,
234
+ padding=1,
235
+ bias=False,
236
+ ),
237
+ LayerNorm2d(neck_dims[0]),
238
+ )
239
+
240
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
241
+ assert (
242
+ x.shape[2] == self.img_size and x.shape[3] == self.img_size
243
+ ), "input image size must match self.img_size"
244
+ x = self.patch_embed(x)
245
+ # B C H W -> B H W C
246
+ x = x.permute(0, 2, 3, 1)
247
+ x = x + get_abs_pos(
248
+ self.pos_embed, self.pretrain_use_cls_token, [x.shape[1], x.shape[2]]
249
+ )
250
+ num_patches = x.shape[1]
251
+ assert x.shape[2] == num_patches
252
+ x = x.reshape(x.shape[0], num_patches * num_patches, x.shape[3])
253
+ for blk in self.blocks:
254
+ x = blk(x)
255
+ x = x.reshape(x.shape[0], num_patches, num_patches, x.shape[2])
256
+ x = self.neck(x.permute(0, 3, 1, 2))
257
+ return x
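The encoder asserts that its input matches img_size (1024 for the released configs); EfficientSam.preprocess handles the resize and normalization, so arbitrary image sizes can go through get_image_embeddings. A rough sketch of running just the encoder path (shapes follow from img_size=1024, patch_size=16, and neck_dims=[256, 256] above):

```python
import torch
from efficient_sam.build_efficient_sam import build_efficient_sam_vitt

model = build_efficient_sam_vitt()
with torch.no_grad():
    # preprocess() inside get_image_embeddings resizes/normalizes to 1024x1024.
    embeddings = model.get_image_embeddings(torch.rand(1, 3, 480, 640))
print(embeddings.shape)  # should be [1, 256, 64, 64]: 256 neck channels, 1024/16 spatial
```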
EfficientSAM-main/efficient_sam/mlp.py ADDED
@@ -0,0 +1,29 @@
+ from typing import Type
+
+ from torch import nn
+
+
+ # Lightly adapted from
+ # https://github.com/facebookresearch/MaskFormer/blob/main/mask_former/modeling/transformer/transformer_predictor.py # noqa
+ class MLPBlock(nn.Module):
+     def __init__(
+         self,
+         input_dim: int,
+         hidden_dim: int,
+         output_dim: int,
+         num_layers: int,
+         act: Type[nn.Module],
+     ) -> None:
+         super().__init__()
+         self.num_layers = num_layers
+         h = [hidden_dim] * (num_layers - 1)
+         self.layers = nn.ModuleList(
+             nn.Sequential(nn.Linear(n, k), act())
+             for n, k in zip([input_dim] + h, [hidden_dim] * num_layers)
+         )
+         self.fc = nn.Linear(hidden_dim, output_dim)
+
+     def forward(self, x):
+         for layer in self.layers:
+             x = layer(x)
+         return self.fc(x)
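For reference, a minimal usage sketch (illustrative, not part of this diff): MLPBlock stacks num_layers Linear+activation pairs of width hidden_dim and finishes with a Linear to output_dim, so the last dimension is all that changes.

# Usage sketch for MLPBlock as defined above; argument values are examples only.
import torch
from torch import nn
from efficient_sam.mlp import MLPBlock

mlp = MLPBlock(input_dim=256, hidden_dim=2048, output_dim=256, num_layers=3, act=nn.GELU)
out = mlp(torch.randn(2, 10, 256))
print(out.shape)  # torch.Size([2, 10, 256])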
EfficientSAM-main/efficient_sam/two_way_transformer.py ADDED
@@ -0,0 +1,266 @@
+ import math
+ from typing import Tuple, Type
+ import torch
+ from torch import nn, Tensor
+ from .mlp import MLPBlock
+
+
+
+
+ class TwoWayTransformer(nn.Module):
+     def __init__(
+         self,
+         depth: int,
+         embedding_dim: int,
+         num_heads: int,
+         mlp_dim: int,
+         activation: Type[nn.Module],
+         normalize_before_activation: bool,
+         attention_downsample_rate: int = 2,
+     ) -> None:
+         """
+         A transformer decoder that attends to an input image using
+         queries whose positional embedding is supplied.
+
+         Args:
+           depth (int): number of layers in the transformer
+           embedding_dim (int): the channel dimension for the input embeddings
+           num_heads (int): the number of heads for multihead attention. Must
+             divide embedding_dim
+           mlp_dim (int): the channel dimension internal to the MLP block
+           activation (nn.Module): the activation to use in the MLP block
+         """
+         super().__init__()
+         self.depth = depth
+         self.embedding_dim = embedding_dim
+         self.num_heads = num_heads
+         self.mlp_dim = mlp_dim
+         self.layers = nn.ModuleList()
+
+         for i in range(depth):
+             curr_layer = TwoWayAttentionBlock(
+                 embedding_dim=embedding_dim,
+                 num_heads=num_heads,
+                 mlp_dim=mlp_dim,
+                 activation=activation,
+                 normalize_before_activation=normalize_before_activation,
+                 attention_downsample_rate=attention_downsample_rate,
+                 skip_first_layer_pe=(i == 0),
+             )
+             self.layers.append(curr_layer)
+
+         self.final_attn_token_to_image = AttentionForTwoWayAttentionBlock(
+             embedding_dim,
+             num_heads,
+             downsample_rate=attention_downsample_rate,
+         )
+         self.norm_final_attn = nn.LayerNorm(embedding_dim)
+
+     def forward(
+         self,
+         image_embedding: Tensor,
+         image_pe: Tensor,
+         point_embedding: Tensor,
+     ) -> Tuple[Tensor, Tensor]:
+         """
+         Args:
+           image_embedding (torch.Tensor): image to attend to. Should be shape
+             B x embedding_dim x h x w for any h and w.
+           image_pe (torch.Tensor): the positional encoding to add to the image. Must
+             have the same shape as image_embedding.
+           point_embedding (torch.Tensor): the embedding to add to the query points.
+             Must have shape B x N_points x embedding_dim for any N_points.
+
+         Returns:
+           torch.Tensor: the processed point_embedding
+           torch.Tensor: the processed image_embedding
+         """
+
+         # BxCxHxW -> BxHWxC == B x N_image_tokens x C
+         bs, c, h, w = image_embedding.shape
+         image_embedding = image_embedding.flatten(2).permute(0, 2, 1)
+         image_pe = image_pe.flatten(2).permute(0, 2, 1)
+
+         # Prepare queries
+         queries = point_embedding
+         keys = image_embedding
+
+         # Apply transformer blocks and final layernorm
+         for idx, layer in enumerate(self.layers):
+             queries, keys = layer(
+                 queries=queries,
+                 keys=keys,
+                 query_pe=point_embedding,
+                 key_pe=image_pe,
+             )
+
+         # Apply the final attention layer from the points to the image
+         q = queries + point_embedding
+         k = keys + image_pe
+         attn_out = self.final_attn_token_to_image(q=q, k=k, v=keys)
+         queries = queries + attn_out
+         queries = self.norm_final_attn(queries)
+         return queries, keys
+
+
+ class TwoWayAttentionBlock(nn.Module):
+     def __init__(
+         self,
+         embedding_dim: int,
+         num_heads: int,
+         mlp_dim: int,
+         activation: Type[nn.Module],
+         normalize_before_activation: bool,
+         attention_downsample_rate: int = 2,
+         skip_first_layer_pe: bool = False,
+     ) -> None:
+         """
+         A transformer block with four layers: (1) self-attention of sparse
+         inputs, (2) cross attention of sparse inputs to dense inputs, (3) mlp
+         block on sparse inputs, and (4) cross attention of dense inputs to sparse
+         inputs.
+
+         Arguments:
+           embedding_dim (int): the channel dimension of the embeddings
+           num_heads (int): the number of heads in the attention layers
+           mlp_dim (int): the hidden dimension of the mlp block
+           activation (nn.Module): the activation of the mlp block
+           skip_first_layer_pe (bool): skip the PE on the first layer
+         """
+         super().__init__()
+         self.self_attn = AttentionForTwoWayAttentionBlock(embedding_dim, num_heads)
+         self.norm1 = nn.LayerNorm(embedding_dim)
+
+         self.cross_attn_token_to_image = AttentionForTwoWayAttentionBlock(
+             embedding_dim,
+             num_heads,
+             downsample_rate=attention_downsample_rate,
+         )
+         self.norm2 = nn.LayerNorm(embedding_dim)
+
+         self.mlp = MLPBlock(
+             embedding_dim,
+             mlp_dim,
+             embedding_dim,
+             1,
+             activation,
+         )
+
+         self.norm3 = nn.LayerNorm(embedding_dim)
+
+         self.norm4 = nn.LayerNorm(embedding_dim)
+         self.cross_attn_image_to_token = AttentionForTwoWayAttentionBlock(
+             embedding_dim,
+             num_heads,
+             downsample_rate=attention_downsample_rate,
+         )
+
+         self.skip_first_layer_pe = skip_first_layer_pe
+
+     def forward(
+         self, queries: Tensor, keys: Tensor, query_pe: Tensor, key_pe: Tensor
+     ) -> Tuple[Tensor, Tensor]:
+         # Self attention block
+         if not self.skip_first_layer_pe:
+             queries = queries + query_pe
+         attn_out = self.self_attn(q=queries, k=queries, v=queries)
+         queries = queries + attn_out
+         queries = self.norm1(queries)
+
+         # Cross attention block, tokens attending to image embedding
+         q = queries + query_pe
+         k = keys + key_pe
+         attn_out = self.cross_attn_token_to_image(q=q, k=k, v=keys)
+         queries = queries + attn_out
+         queries = self.norm2(queries)
+
+         # MLP block
+         mlp_out = self.mlp(queries)
+         queries = queries + mlp_out
+         queries = self.norm3(queries)
+
+         # Cross attention block, image embedding attending to tokens
+         q = queries + query_pe
+         k = keys + key_pe
+         attn_out = self.cross_attn_image_to_token(q=k, k=q, v=queries)
+         keys = keys + attn_out
+         keys = self.norm4(keys)
+
+         return queries, keys
+
+
+ class AttentionForTwoWayAttentionBlock(nn.Module):
+     """
+     An attention layer that allows for downscaling the size of the embedding
+     after projection to queries, keys, and values.
+     """
+
+     def __init__(
+         self,
+         embedding_dim: int,
+         num_heads: int,
+         downsample_rate: int = 1,
+     ) -> None:
+         super().__init__()
+         self.embedding_dim = embedding_dim
+         self.internal_dim = embedding_dim // downsample_rate
+         self.num_heads = num_heads
+         assert (
+             self.internal_dim % num_heads == 0
+         ), "num_heads must divide embedding_dim."
+         self.c_per_head = self.internal_dim / num_heads
+         self.inv_sqrt_c_per_head = 1.0 / math.sqrt(self.c_per_head)
+
+         self.q_proj = nn.Linear(embedding_dim, self.internal_dim)
+         self.k_proj = nn.Linear(embedding_dim, self.internal_dim)
+         self.v_proj = nn.Linear(embedding_dim, self.internal_dim)
+         self.out_proj = nn.Linear(self.internal_dim, embedding_dim)
+         self._reset_parameters()
+
+     def _reset_parameters(self) -> None:
+         # The fan_out is incorrect, but matches pytorch's initialization
+         # for which qkv is a single 3*embedding_dim x embedding_dim matrix
+         fan_in = self.embedding_dim
+         fan_out = 3 * self.internal_dim
+         # Xavier uniform with our custom fan_out
+         bnd = math.sqrt(6 / (fan_in + fan_out))
+         nn.init.uniform_(self.q_proj.weight, -bnd, bnd)
+         nn.init.uniform_(self.k_proj.weight, -bnd, bnd)
+         nn.init.uniform_(self.v_proj.weight, -bnd, bnd)
+         # out_proj.weight is left with default initialization, like pytorch attention
+         nn.init.zeros_(self.q_proj.bias)
+         nn.init.zeros_(self.k_proj.bias)
+         nn.init.zeros_(self.v_proj.bias)
+         nn.init.zeros_(self.out_proj.bias)
+
+     def _separate_heads(self, x: Tensor, num_heads: int) -> Tensor:
+         b, n, c = x.shape
+         x = x.reshape(b, n, num_heads, c // num_heads)
+         return x.transpose(1, 2)  # B x N_heads x N_tokens x C_per_head
+
+     def _recombine_heads(self, x: Tensor) -> Tensor:
+         b, n_heads, n_tokens, c_per_head = x.shape
+         x = x.transpose(1, 2)
+         return x.reshape(b, n_tokens, n_heads * c_per_head)  # B x N_tokens x C
+
+     def forward(self, q: Tensor, k: Tensor, v: Tensor) -> Tensor:
+         # Input projections
+         q = self.q_proj(q)
+         k = self.k_proj(k)
+         v = self.v_proj(v)
+
+         # Separate into heads
+         q = self._separate_heads(q, self.num_heads)
+         k = self._separate_heads(k, self.num_heads)
+         v = self._separate_heads(v, self.num_heads)
+
+         # Attention
+         _, _, _, c_per_head = q.shape
+         attn = q @ k.permute(0, 1, 3, 2)  # B x N_heads x N_tokens x N_tokens
+         attn = attn * self.inv_sqrt_c_per_head
+         attn = torch.softmax(attn, dim=-1)
+         # Get output
+         out = attn @ v
+         out = self._recombine_heads(out)
+         out = self.out_proj(out)
+         return out
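A minimal shape walkthrough for TwoWayTransformer (illustrative, not part of this diff; the constructor values are example settings, not necessarily those used by the packaged builders). The queries come back as B x N_points x C and the keys as the flattened B x HW x C image tokens.

# Illustrative shape check for TwoWayTransformer with example hyperparameters.
import torch
from torch import nn
from efficient_sam.two_way_transformer import TwoWayTransformer

t = TwoWayTransformer(
    depth=2,
    embedding_dim=256,
    num_heads=8,
    mlp_dim=2048,
    activation=nn.GELU,
    normalize_before_activation=False,
)
image_embedding = torch.randn(1, 256, 64, 64)  # B x C x H x W
image_pe = torch.randn(1, 256, 64, 64)         # same shape as image_embedding
point_embedding = torch.randn(1, 7, 256)       # B x N_points x C
queries, keys = t(image_embedding, image_pe, point_embedding)
print(queries.shape, keys.shape)  # (1, 7, 256) and (1, 4096, 256)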
EfficientSAM-main/export_to_onnx.py ADDED
@@ -0,0 +1,139 @@
+ # ONNX export code is from [labelme annotation tool](https://github.com/labelmeai/efficient-sam). Huge thanks to Kentaro Wada.
+
+ import onnxruntime
+ import torch
+
+ from efficient_sam.build_efficient_sam import build_efficient_sam_vits
+ from efficient_sam.build_efficient_sam import build_efficient_sam_vitt
+
+ import onnx_models
+
+
+ def export_onnx(onnx_model, output, dynamic_axes, dummy_inputs, output_names):
+     with open(output, "wb") as f:
+         print(f"Exporting onnx model to {output}...")
+         torch.onnx.export(
+             onnx_model,
+             tuple(dummy_inputs.values()),
+             f,
+             export_params=True,
+             verbose=False,
+             opset_version=17,
+             do_constant_folding=True,
+             input_names=list(dummy_inputs.keys()),
+             output_names=output_names,
+             dynamic_axes=dynamic_axes,
+         )
+
+     inference_session = onnxruntime.InferenceSession(output)
+     output = inference_session.run(
+         output_names=output_names,
+         input_feed={k: v.numpy() for k, v in dummy_inputs.items()},
+     )
+     print(output_names)
+     print([output_i.shape for output_i in output])
+
+
+ def export_onnx_esam(model, output):
+     onnx_model = onnx_models.OnnxEfficientSam(model=model)
+     dynamic_axes = {
+         "batched_images": {0: "batch", 2: "height", 3: "width"},
+         "batched_point_coords": {2: "num_points"},
+         "batched_point_labels": {2: "num_points"},
+     }
+     dummy_inputs = {
+         "batched_images": torch.randn(1, 3, 1080, 1920, dtype=torch.float),
+         "batched_point_coords": torch.randint(
+             low=0, high=1080, size=(1, 1, 5, 2), dtype=torch.float
+         ),
+         "batched_point_labels": torch.randint(
+             low=0, high=4, size=(1, 1, 5), dtype=torch.float
+         ),
+     }
+     output_names = ["output_masks", "iou_predictions"]
+     export_onnx(
+         onnx_model=onnx_model,
+         output=output,
+         dynamic_axes=dynamic_axes,
+         dummy_inputs=dummy_inputs,
+         output_names=output_names,
+     )
+
+
+ def export_onnx_esam_encoder(model, output):
+     onnx_model = onnx_models.OnnxEfficientSamEncoder(model=model)
+     dynamic_axes = {
+         "batched_images": {0: "batch", 2: "height", 3: "width"},
+     }
+     dummy_inputs = {
+         "batched_images": torch.randn(1, 3, 1080, 1920, dtype=torch.float),
+     }
+     output_names = ["image_embeddings"]
+     export_onnx(
+         onnx_model=onnx_model,
+         output=output,
+         dynamic_axes=dynamic_axes,
+         dummy_inputs=dummy_inputs,
+         output_names=output_names,
+     )
+
+
+ def export_onnx_esam_decoder(model, output):
+     onnx_model = onnx_models.OnnxEfficientSamDecoder(model=model)
+     dynamic_axes = {
+         "image_embeddings": {0: "batch"},
+         "batched_point_coords": {2: "num_points"},
+         "batched_point_labels": {2: "num_points"},
+     }
+     dummy_inputs = {
+         "image_embeddings": torch.randn(1, 256, 64, 64, dtype=torch.float),
+         "batched_point_coords": torch.randint(
+             low=0, high=1080, size=(1, 1, 5, 2), dtype=torch.float
+         ),
+         "batched_point_labels": torch.randint(
+             low=0, high=4, size=(1, 1, 5), dtype=torch.float
+         ),
+         "orig_im_size": torch.tensor([1080, 1920], dtype=torch.long),
+     }
+     output_names = ["output_masks", "iou_predictions"]
+     export_onnx(
+         onnx_model=onnx_model,
+         output=output,
+         dynamic_axes=dynamic_axes,
+         dummy_inputs=dummy_inputs,
+         output_names=output_names,
+     )
+
+
+ def main():
+     # faster
+     export_onnx_esam(
+         model=build_efficient_sam_vitt(),
+         output="weights/efficient_sam_vitt.onnx",
+     )
+     export_onnx_esam_encoder(
+         model=build_efficient_sam_vitt(),
+         output="weights/efficient_sam_vitt_encoder.onnx",
+     )
+     export_onnx_esam_decoder(
+         model=build_efficient_sam_vitt(),
+         output="weights/efficient_sam_vitt_decoder.onnx",
+     )
+
+     # more accurate
+     export_onnx_esam(
+         model=build_efficient_sam_vits(),
+         output="weights/efficient_sam_vits.onnx",
+     )
+     export_onnx_esam_encoder(
+         model=build_efficient_sam_vits(),
+         output="weights/efficient_sam_vits_encoder.onnx",
+     )
+     export_onnx_esam_decoder(
+         model=build_efficient_sam_vits(),
+         output="weights/efficient_sam_vits_decoder.onnx",
+     )
+
+
+ if __name__ == "__main__":
+     main()
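After running the script above, the single-file export can be driven with onnxruntime using the same tensor names and dynamic axes declared during export. A minimal sketch (illustrative, not part of this diff; the image tensor and click coordinates are placeholders):

# Inference sketch for the combined ONNX export; input/output names follow export_to_onnx.py.
import numpy as np
import onnxruntime

session = onnxruntime.InferenceSession("weights/efficient_sam_vitt.onnx")
image = np.random.rand(1, 3, 1080, 1920).astype(np.float32)  # stand-in for a real CHW image in [0, 1]
points = np.array([[[[580.0, 350.0]]]], dtype=np.float32)    # 1 batch x 1 query x 1 point x (x, y)
labels = np.array([[[1.0]]], dtype=np.float32)               # 1 = foreground click
masks, iou = session.run(
    ["output_masks", "iou_predictions"],
    {
        "batched_images": image,
        "batched_point_coords": points,
        "batched_point_labels": labels,
    },
)
print(masks.shape, iou.shape)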
EfficientSAM-main/export_to_torchscript.py ADDED
@@ -0,0 +1,17 @@
+ import torch
+ from efficient_sam.build_efficient_sam import build_efficient_sam_vitt, build_efficient_sam_vits
+ # from squeeze_sam.build_squeeze_sam import build_squeeze_sam
+ import zipfile
+ import os
+
+ # Efficient SAM (VIT-tiny)
+ torch.jit.save(torch.jit.script(build_efficient_sam_vitt()), "torchscripted_model/efficient_sam_vitt_torchscript.pt")
+
+ # Efficient SAM (VIT-small)
+ # Since VIT-small is >100MB, we store the zip file.
+ with zipfile.ZipFile("weights/efficient_sam_vits.pt.zip", 'r') as zip_ref:
+     zip_ref.extractall("weights")
+ torch.jit.save(torch.jit.script(build_efficient_sam_vits()), "torchscripted_model/efficient_sam_vits_torchscript.pt")
+
+ # Squeeze SAM (UNET)
+ # torch.jit.save(torch.jit.script(build_squeeze_sam()), "torchscripted_model/squeeze_sam_torchscript.pt")
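The saved TorchScript archive can then be loaded without the Python source tree. A loading sketch (illustrative, not part of this diff; it assumes the scripted model keeps the batched images / point coords / point labels call signature mirrored by the ONNX wrappers in onnx_models.py, and that it returns mask logits plus IoU predictions):

# Loading sketch for the TorchScript export above; shapes and outputs are assumptions.
import torch

model = torch.jit.load("torchscripted_model/efficient_sam_vitt_torchscript.pt")
model.eval()
image = torch.zeros(1, 3, 1080, 1920)
points = torch.tensor([[[[580.0, 350.0], [620.0, 380.0]]]])  # B x num_queries x num_points x 2
labels = torch.tensor([[[1.0, 1.0]]])                        # matching point labels
with torch.no_grad():
    outputs = model(image, points, labels)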
EfficientSAM-main/figs/.DS_Store ADDED
Binary file (6.15 kB). View file
 
EfficientSAM-main/figs/examples/demo_box.png ADDED

Git LFS Details

  • SHA256: 7f8a5afbf5e785fcc14ec2190410ee8d72c983b305eb40d8368a072edb6b4689
  • Pointer size: 132 Bytes
  • Size of remote file: 3.58 MB
EfficientSAM-main/figs/examples/demo_everything.png ADDED

Git LFS Details

  • SHA256: 09a0d74c93b81e1968249302212156c110ebf5d5b68f9483b05c5f678b5981fc
  • Pointer size: 132 Bytes
  • Size of remote file: 3.08 MB
EfficientSAM-main/figs/examples/demo_point.png ADDED

Git LFS Details

  • SHA256: 66b1f7745da8f62eebf89b939acc9fe5c8c21e9a546c05d592a57cf0eeb24f77
  • Pointer size: 132 Bytes
  • Size of remote file: 5.04 MB
EfficientSAM-main/figs/examples/demo_saliency.png ADDED

Git LFS Details

  • SHA256: 9aade441926db1702be187bef4bc270d1dd33f016702e509f6b2613a7546c6a2
  • Pointer size: 132 Bytes
  • Size of remote file: 2.55 MB
EfficientSAM-main/figs/examples/dogs.jpg ADDED
EfficientSAM-main/figs/examples/dogs_efficient_sam_vits_mask.png ADDED
EfficientSAM-main/figs/examples/dogs_efficient_sam_vitt_mask.png ADDED
EfficientSAM-main/figs/examples/dogs_efficientsam_s_mask.png ADDED
EfficientSAM-main/figs/examples/dogs_efficientsam_ti_mask.png ADDED
EfficientSAM-main/figs/examples/dogs_squeeze_sam_mask.png ADDED
EfficientSAM-main/linter.sh ADDED
@@ -0,0 +1,32 @@
+ #!/bin/bash -e
+ # Copyright (c) Facebook, Inc. and its affiliates.
+
+ {
+   black --version | grep -E "23\." > /dev/null
+ } || {
+   echo "Linter requires 'black==23.*' !"
+   exit 1
+ }
+
+ ISORT_VERSION=$(isort --version-number)
+ if [[ "$ISORT_VERSION" != 5.12* ]]; then
+   echo "Linter requires isort==5.12.0 !"
+   exit 1
+ fi
+
+ echo "Running isort ..."
+ isort . --atomic
+
+ echo "Running black ..."
+ black -l 100 .
+
+ echo "Running flake8 ..."
+ if [ -x "$(command -v flake8)" ]; then
+   flake8 .
+ else
+   python3 -m flake8 .
+ fi
+
+ echo "Running mypy..."
+
+ mypy --exclude 'setup.py|notebooks' .
EfficientSAM-main/notebooks/EfficientSAM_example.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
EfficientSAM-main/notebooks/EfficientSAM_segment_everything_example.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
EfficientSAM-main/onnx_models.py ADDED
@@ -0,0 +1,166 @@
+ # ONNX export code is from [labelme annotation tool](https://github.com/labelmeai/efficient-sam). Huge thanks to Kentaro Wada.
+
+ import torch
+ import torch.nn.functional as F
+
+
+ class OnnxEfficientSam(torch.nn.Module):
+     def __init__(self, model):
+         super().__init__()
+         self.model = model
+
+     @property
+     def decoder_max_num_input_points(self):
+         return self.model.decoder_max_num_input_points
+
+     @property
+     def image_encoder(self):
+         return self.model.image_encoder
+
+     @property
+     def get_image_embeddings(self):
+         return self.model.get_image_embeddings
+
+     @property
+     def prompt_encoder(self):
+         return self.model.prompt_encoder
+
+     @property
+     def mask_decoder(self):
+         return self.model.mask_decoder
+
+     def forward(
+         self,
+         batched_images: torch.Tensor,
+         batched_points: torch.Tensor,
+         batched_point_labels: torch.Tensor,
+     ):
+         batch_size, _, input_h, input_w = batched_images.shape
+         image_embeddings = self.get_image_embeddings(batched_images)
+         return self.predict_masks(
+             image_embeddings,
+             batched_points,
+             batched_point_labels,
+             multimask_output=True,
+             input_h=input_h,
+             input_w=input_w,
+             output_h=input_h,
+             output_w=input_w,
+         )
+
+     def get_rescaled_pts(
+         self, batched_points: torch.Tensor, input_h: int, input_w: int
+     ):
+         return torch.stack(
+             [
+                 batched_points[..., 0] * self.image_encoder.img_size / input_w,
+                 batched_points[..., 1] * self.image_encoder.img_size / input_h,
+             ],
+             dim=-1,
+         )
+
+     def predict_masks(
+         self,
+         image_embeddings: torch.Tensor,
+         batched_points: torch.Tensor,
+         batched_point_labels: torch.Tensor,
+         multimask_output: bool,
+         input_h: int,
+         input_w: int,
+         output_h: int = -1,
+         output_w: int = -1,
+     ):
+         batch_size, max_num_queries, num_pts, _ = batched_points.shape
+         num_pts = batched_points.shape[2]
+         rescaled_batched_points = self.get_rescaled_pts(
+             batched_points, input_h, input_w
+         )
+
+         if num_pts > self.decoder_max_num_input_points:
+             rescaled_batched_points = rescaled_batched_points[
+                 :, :, : self.decoder_max_num_input_points, :
+             ]
+             batched_point_labels = batched_point_labels[
+                 :, :, : self.decoder_max_num_input_points
+             ]
+         elif num_pts < self.decoder_max_num_input_points:
+             rescaled_batched_points = F.pad(
+                 rescaled_batched_points,
+                 (0, 0, 0, self.decoder_max_num_input_points - num_pts),
+                 value=-1.0,
+             )
+             batched_point_labels = F.pad(
+                 batched_point_labels,
+                 (0, self.decoder_max_num_input_points - num_pts),
+                 value=-1.0,
+             )
+
+         sparse_embeddings = self.prompt_encoder(
+             rescaled_batched_points.reshape(
+                 batch_size * max_num_queries, self.decoder_max_num_input_points, 2
+             ),
+             batched_point_labels.reshape(
+                 batch_size * max_num_queries, self.decoder_max_num_input_points
+             ),
+         )
+         sparse_embeddings = sparse_embeddings.view(
+             batch_size,
+             max_num_queries,
+             sparse_embeddings.shape[1],
+             sparse_embeddings.shape[2],
+         )
+         low_res_masks, iou_predictions = self.mask_decoder(
+             image_embeddings,
+             self.prompt_encoder.get_dense_pe(),
+             sparse_prompt_embeddings=sparse_embeddings,
+             multimask_output=multimask_output,
+         )
+         _, num_predictions, low_res_size, _ = low_res_masks.shape
+
+         if output_w > 0 and output_h > 0:
+             output_masks = F.interpolate(
+                 low_res_masks,
+                 (output_h, output_w),
+                 # NOTE: "bicubic" is inefficient on onnx
+                 mode="bilinear",
+             )
+             output_masks = torch.reshape(
+                 output_masks,
+                 (batch_size, max_num_queries, num_predictions, output_h, output_w),
+             )
+         else:
+             output_masks = torch.reshape(
+                 low_res_masks,
+                 (
+                     batch_size,
+                     max_num_queries,
+                     num_predictions,
+                     low_res_size,
+                     low_res_size,
+                 ),
+             )
+         iou_predictions = torch.reshape(
+             iou_predictions, (batch_size, max_num_queries, num_predictions)
+         )
+         return output_masks, iou_predictions, low_res_masks
+
+
+ class OnnxEfficientSamEncoder(OnnxEfficientSam):
+     def forward(self, batched_images: torch.Tensor):
+         return self.model.get_image_embeddings(batched_images)
+
+
+ class OnnxEfficientSamDecoder(OnnxEfficientSam):
+     def forward(
+         self, image_embeddings, batched_points, batched_point_labels, orig_im_size
+     ):
+         return self.predict_masks(
+             image_embeddings=image_embeddings,
+             batched_points=batched_points,
+             batched_point_labels=batched_point_labels,
+             multimask_output=True,
+             input_h=orig_im_size[0],
+             input_w=orig_im_size[1],
+             output_h=orig_im_size[0],
+             output_w=orig_im_size[1],
+         )
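The encoder/decoder split above lets an application embed an image once and then re-run only the decoder for each new prompt. A two-stage sketch against the exported ONNX files (illustrative, not part of this diff; tensor names follow export_to_onnx.py and the image tensor is a placeholder):

# Two-stage sketch: compute image embeddings once, then decode per prompt.
import numpy as np
import onnxruntime

encoder = onnxruntime.InferenceSession("weights/efficient_sam_vitt_encoder.onnx")
decoder = onnxruntime.InferenceSession("weights/efficient_sam_vitt_decoder.onnx")

image = np.random.rand(1, 3, 1080, 1920).astype(np.float32)  # placeholder image tensor
(embeddings,) = encoder.run(["image_embeddings"], {"batched_images": image})

masks, iou = decoder.run(
    ["output_masks", "iou_predictions"],
    {
        "image_embeddings": embeddings,
        "batched_point_coords": np.array([[[[580.0, 350.0]]]], dtype=np.float32),
        "batched_point_labels": np.array([[[1.0]]], dtype=np.float32),
        "orig_im_size": np.array([1080, 1920], dtype=np.int64),
    },
)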
EfficientSAM-main/setup.cfg ADDED
@@ -0,0 +1,11 @@
+ [isort]
+ line_length=100
+ multi_line_output=3
+ include_trailing_comma=True
+ known_standard_library=numpy,setuptools
+ skip_glob=*/__init__.py
+ known_myself=efficient_sam
+ known_third_party=matplotlib,torch,torchvision,black,isort
+ no_lines_before=STDLIB,THIRDPARTY
+ sections=FUTURE,STDLIB,THIRDPARTY,MYSELF,FIRSTPARTY,LOCALFOLDER
+ default_section=FIRSTPARTY
EfficientSAM-main/setup.py ADDED
@@ -0,0 +1,18 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ from setuptools import find_packages, setup
+
+ setup(
+     name="efficient_sam",
+     version="1.0",
+     install_requires=[],
+     packages=find_packages(exclude="notebooks"),
+     extras_require={
+         "all": ["matplotlib", "onnx", "onnxruntime"],
+         "dev": ["flake8", "isort", "black", "mypy"],
+     },
+ )
EfficientSAM-main/torchscripted_model/efficient_sam_vitt_torchscript.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1d300fa20a94f40bf307616a3979f4d4b6a4a347edd08ed37d69535c2185188e
+ size 41074768
EfficientSAM-main/weights/efficient_sam_vits.pt.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e1801de05adeea87a6b779b0bedf3ab6751e03c21facb82d2c660867c02813fc
+ size 98304933
EfficientSAM-main/weights/efficient_sam_vitt.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:143c3198a7b2a15f23c21cdb723432fb3fbcdbabbdad3483cf3babd8b95c1397
+ size 41365520
EfficientSAM-main/weights/efficient_sam_vitt.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dff858b19600a46461cbb7de98f796b23a7a888d9f5e34c0b033f7d6eb9e4e6a
+ size 40982470
EfficientSAM-main/weights/efficient_sam_vitt_decoder.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a62f8fa5ea080447c0689418d69e58f1e83e0b7adf9c142e2bd9bcc8045c0b11
+ size 16565728
EfficientSAM-main/weights/efficient_sam_vitt_encoder.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:84ed466ffcc5c1f8d08409bc34a23bb364ab2c15e402cb12d4335a42be0e0951
+ size 24799761