diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..cf2841044ee43a674119c780a43facf2d1180b69 --- /dev/null +++ b/.gitignore @@ -0,0 +1,214 @@ +### Linux ### +*~ + +# temporary files which can be created if a process still has a handle open of a deleted file +.fuse_hidden* + +# KDE directory preferences +.directory + +# Linux trash folder which might appear on any partition or disk +.Trash-* + +# .nfs files are created when an open file is removed but is still being accessed +.nfs* + +### PyCharm ### +# User-specific stuff +.idea + +# CMake +cmake-build-*/ + +# Mongo Explorer plugin +.idea/**/mongoSettings.xml + +# File-based project format +*.iws + +# IntelliJ +out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Cursive Clojure plugin +.idea/replstate.xml + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties + +# Editor-based Rest Client +.idea/httpRequests + +# Android studio 3.1+ serialized cache file +.idea/caches/build_file_checksums.ser + +# JetBrains templates +**___jb_tmp___ + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ +docs/build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don’t work, or not +# install all needed dependencies. +#Pipfile.lock + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +### Vim ### +# Swap +[._]*.s[a-v][a-z] +[._]*.sw[a-p] +[._]s[a-rt-v][a-z] +[._]ss[a-gi-z] +[._]sw[a-p] + +# Session +Session.vim + +# Temporary +.netrwhist +# Auto-generated tag files +tags +# Persistent undo +[._]*.un~ + +# output +docs/api +.code-workspace.code-workspace +*.pkl +*.npy +*.pth +*.onnx +events.out.tfevents* + +# vscode +*.code-workspace +.vscode + +# vim +.vim diff --git a/LICENSE b/LICENSE index 261eeb9e9f8b2b4b0d119366dda99c6fd7d35c64..cc87e8683f8accf92fb441738e981d6ab8ce7536 100644 --- a/LICENSE +++ b/LICENSE @@ -178,7 +178,7 @@ APPENDIX: How to apply the Apache License to your work. 
To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" + boilerplate notice, with the fields enclosed by brackets "{}" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a @@ -186,7 +186,7 @@ same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright [yyyy] [name of copyright owner] + Copyright 2021 Megvii, Base Detection Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/demo/ncnn/yolox.cpp b/demo/ncnn/yolox.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0ce4f4b4fd62f82a64c0d589244d80dd3d1681a7 --- /dev/null +++ b/demo/ncnn/yolox.cpp @@ -0,0 +1,419 @@ +// This file is written based on the following file: +// https://github.com/Tencent/ncnn/blob/master/examples/yolov5.cpp +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. +// ------------------------------------------------------------------------------ +// Copyright (C) 2020-2021, Megvii Inc. All rights reserved. 
+ +#include "layer.h" +#include "net.h" + +#if defined(USE_NCNN_SIMPLEOCV) +#include "simpleocv.h" +#else +#include +#include +#include +#endif +#include +#include +#include + +// YOLOX use the same focus in yolov5 +class YoloV5Focus : public ncnn::Layer +{ +public: + YoloV5Focus() + { + one_blob_only = true; + } + + virtual int forward(const ncnn::Mat& bottom_blob, ncnn::Mat& top_blob, const ncnn::Option& opt) const + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + + int outw = w / 2; + int outh = h / 2; + int outc = channels * 4; + + top_blob.create(outw, outh, outc, 4u, 1, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outc; p++) + { + const float* ptr = bottom_blob.channel(p % channels).row((p / channels) % 2) + ((p / channels) / 2); + float* outptr = top_blob.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + *outptr = *ptr; + + outptr += 1; + ptr += 2; + } + + ptr += w; + } + } + + return 0; + } +}; + +DEFINE_LAYER_CREATOR(YoloV5Focus) + +struct Object +{ + cv::Rect_ rect; + int label; + float prob; +}; + +struct GridAndStride +{ + int grid0; + int grid1; + int stride; +}; + +static inline float intersection_area(const Object& a, const Object& b) +{ + cv::Rect_ inter = a.rect & b.rect; + return inter.area(); +} + +static void qsort_descent_inplace(std::vector& faceobjects, int left, int right) +{ + int i = left; + int j = right; + float p = faceobjects[(left + right) / 2].prob; + + while (i <= j) + { + while (faceobjects[i].prob > p) + i++; + + while (faceobjects[j].prob < p) + j--; + + if (i <= j) + { + // swap + std::swap(faceobjects[i], faceobjects[j]); + + i++; + j--; + } + } + + #pragma omp parallel sections + { + #pragma omp section + { + if (left < j) qsort_descent_inplace(faceobjects, left, j); + } + #pragma omp section + { + if (i < right) qsort_descent_inplace(faceobjects, i, 
right); + } + } +} + +static void qsort_descent_inplace(std::vector& objects) +{ + if (objects.empty()) + return; + + qsort_descent_inplace(objects, 0, objects.size() - 1); +} + +static void nms_sorted_bboxes(const std::vector& faceobjects, std::vector& picked, float nms_threshold) +{ + picked.clear(); + + const int n = faceobjects.size(); + + std::vector areas(n); + for (int i = 0; i < n; i++) + { + areas[i] = faceobjects[i].rect.area(); + } + + for (int i = 0; i < n; i++) + { + const Object& a = faceobjects[i]; + + int keep = 1; + for (int j = 0; j < (int)picked.size(); j++) + { + const Object& b = faceobjects[picked[j]]; + + // intersection over union + float inter_area = intersection_area(a, b); + float union_area = areas[i] + areas[picked[j]] - inter_area; + // float IoU = inter_area / union_area + if (inter_area / union_area > nms_threshold) + keep = 0; + } + + if (keep) + picked.push_back(i); + } +} + +static int generate_grids_and_stride(const int target_size, std::vector& strides, std::vector& grid_strides) +{ + for (auto stride : strides) + { + int num_grid = target_size / stride; + for (int g1 = 0; g1 < num_grid; g1++) + { + for (int g0 = 0; g0 < num_grid; g0++) + { + grid_strides.push_back((GridAndStride){g0, g1, stride}); + } + } + } +} + +static void generate_yolox_proposals(std::vector grid_strides, const ncnn::Mat& feat_blob, float prob_threshold, std::vector& objects) +{ + const int num_grid = feat_blob.h; + fprintf(stderr, "output height: %d, width: %d, channels: %d, dims:%d\n", feat_blob.h, feat_blob.w, feat_blob.c, feat_blob.dims); + + const int num_class = feat_blob.w - 5; + + const int num_anchors = grid_strides.size(); + + const float* feat_ptr = feat_blob.channel(0); + for (int anchor_idx = 0; anchor_idx < num_anchors; anchor_idx++) + { + const int grid0 = grid_strides[anchor_idx].grid0; + const int grid1 = grid_strides[anchor_idx].grid1; + const int stride = grid_strides[anchor_idx].stride; + + // yolox/models/yolo_head.py decode logic + // 
outputs[..., :2] = (outputs[..., :2] + grids) * strides + // outputs[..., 2:4] = torch.exp(outputs[..., 2:4]) * strides + float x_center = (feat_ptr[0] + grid0) * stride; + float y_center = (feat_ptr[1] + grid1) * stride; + float w = exp(feat_ptr[2]) * stride; + float h = exp(feat_ptr[3]) * stride; + float x0 = x_center - w * 0.5f; + float y0 = y_center - h * 0.5f; + + float box_objectness = feat_ptr[4]; + for (int class_idx = 0; class_idx < num_class; class_idx++) + { + float box_cls_score = feat_ptr[5 + class_idx]; + float box_prob = box_objectness * box_cls_score; + if (box_prob > prob_threshold) + { + Object obj; + obj.rect.x = x0; + obj.rect.y = y0; + obj.rect.width = w; + obj.rect.height = h; + obj.label = class_idx; + obj.prob = box_prob; + + objects.push_back(obj); + } + + } // class loop + feat_ptr += feat_blob.w; + + } // point anchor loop +} + +static int detect_yolox(const cv::Mat& bgr, std::vector& objects) +{ + ncnn::Net yolox; + + yolox.opt.use_vulkan_compute = true; + // yolox.opt.use_bf16_storage = true; + + yolox.register_custom_layer("YoloV5Focus", YoloV5Focus_layer_creator); + + // original pretrained model from https://github.com/yolox + // TODO ncnn model https://github.com/nihui/ncnn-assets/tree/master/models + yolox.load_param("yolox.param"); + yolox.load_model("yolox.bin"); + + const int target_size = 416; + const float prob_threshold = 0.3f; + const float nms_threshold = 0.65f; + + int img_w = bgr.cols; + int img_h = bgr.rows; + + int w = img_w; + int h = img_h; + float scale = 1.f; + if (w > h) + { + scale = (float)target_size / w; + w = target_size; + h = h * scale; + } + else + { + scale = (float)target_size / h; + h = target_size; + w = w * scale; + } + ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, w, h); + + // pad to target_size rectangle + int wpad = target_size - w; + int hpad = target_size - h; + ncnn::Mat in_pad; + // different from yolov5, yolox only pad on bottom and right side, 
+ // which means users don't need to extra padding info to decode boxes coordinate. + ncnn::copy_make_border(in, in_pad, 0, hpad, 0, wpad, ncnn::BORDER_CONSTANT, 114.f); + + // python 0-1 input tensor with rgb_means = (0.485, 0.456, 0.406), std = (0.229, 0.224, 0.225) + // so for 0-255 input image, rgb_mean should multiply 255 and norm should div by std. + const float mean_vals[3] = {255.f * 0.485f, 255.f * 0.456, 255.f * 0.406f}; + const float norm_vals[3] = {1 / (255.f * 0.229f), 1 / (255.f * 0.224f), 1 / (255.f * 0.225f)}; + + in_pad.substract_mean_normalize(mean_vals, norm_vals); + + ncnn::Extractor ex = yolox.create_extractor(); + + ex.input("images", in_pad); + + std::vector proposals; + + { + ncnn::Mat out; + ex.extract("output", out); + + std::vector strides = {8, 16, 32}; // might have stride=64 + std::vector grid_strides; + generate_grids_and_stride(target_size, strides, grid_strides); + generate_yolox_proposals(grid_strides, out, prob_threshold, proposals); + } + + // sort all proposals by score from highest to lowest + qsort_descent_inplace(proposals); + + // apply nms with nms_threshold + std::vector picked; + nms_sorted_bboxes(proposals, picked, nms_threshold); + + int count = picked.size(); + + objects.resize(count); + for (int i = 0; i < count; i++) + { + objects[i] = proposals[picked[i]]; + + // adjust offset to original unpadded + float x0 = (objects[i].rect.x) / scale; + float y0 = (objects[i].rect.y) / scale; + float x1 = (objects[i].rect.x + objects[i].rect.width) / scale; + float y1 = (objects[i].rect.y + objects[i].rect.height) / scale; + + // clip + x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f); + y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f); + x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f); + y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f); + + objects[i].rect.x = x0; + objects[i].rect.y = y0; + objects[i].rect.width = x1 - x0; + objects[i].rect.height = y1 - y0; + } + + return 0; +} + +static void 
draw_objects(const cv::Mat& bgr, const std::vector& objects) +{ + static const char* class_names[] = { + "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", + "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", + "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", + "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", + "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", + "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", + "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", + "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", + "hair drier", "toothbrush" + }; + + cv::Mat image = bgr.clone(); + + for (size_t i = 0; i < objects.size(); i++) + { + const Object& obj = objects[i]; + + fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob, + obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height); + + cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0)); + + char text[256]; + sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100); + + int baseLine = 0; + cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine); + + int x = obj.rect.x; + int y = obj.rect.y - label_size.height - baseLine; + if (y < 0) + y = 0; + if (x + label_size.width > image.cols) + x = image.cols - label_size.width; + + cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)), + cv::Scalar(255, 255, 255), -1); + + cv::putText(image, text, cv::Point(x, y + label_size.height), + cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0)); + } + + cv::imshow("image", image); + 
cv::waitKey(0); +} + +int main(int argc, char** argv) +{ + if (argc != 2) + { + fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]); + return -1; + } + + const char* imagepath = argv[1]; + + cv::Mat m = cv::imread(imagepath, 1); + if (m.empty()) + { + fprintf(stderr, "cv::imread %s failed\n", imagepath); + return -1; + } + + std::vector objects; + detect_yolox(m, objects); + + draw_objects(m, objects); + + return 0; +} diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..8d4f717307c1e58a637ee921724d9be478241b37 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,14 @@ +numpy +torch +opencv_python +loguru +scikit_image +tqdm +apex +torchvision +pycocotools +apex +Pillow +skimage +thop +ninja diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000000000000000000000000000000000000..e6352cedd44804d41cacb806067b4e9c2c1296b6 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,18 @@ +[isort] +line_length = 100 +multi_line_output = 3 +balanced_wrapping = True +known_standard_library = setuptools +known_third_party = tqdm,loguru +known_data_processing = cv2,numpy,scipy,PIL,matplotlib,scikit_image +known_datasets = pycocotools +known_deeplearning = torch,torchvision,caffe2,onnx,apex,timm,thop,torch2trt,tensorrt +known_myself = yolox +sections = FUTURE,STDLIB,THIRDPARTY,data_processing,datasets,deeplearning,myself,FIRSTPARTY,LOCALFOLDER +no_lines_before=STDLIB,THIRDPARTY,datasets +default_section = FIRSTPARTY + +[flake8] +max-line-length = 100 +max-complexity = 18 +exclude = __init__.py diff --git a/setup.py b/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..3d243ef66587db9f83b028320e9211f094165205 --- /dev/null +++ b/setup.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python +# Copyright (c) Megvii, Inc. and its affiliates. 
All Rights Reserved + +import re +import setuptools +import glob +from os import path +import torch +from torch.utils.cpp_extension import CppExtension + +torch_ver = [int(x) for x in torch.__version__.split(".")[:2]] +assert torch_ver >= [1, 3], "Requires PyTorch >= 1.3" + + +def get_extensions(): + this_dir = path.dirname(path.abspath(__file__)) + extensions_dir = path.join(this_dir, "yolox", "layers", "csrc") + + main_source = path.join(extensions_dir, "vision.cpp") + sources = glob.glob(path.join(extensions_dir, "**", "*.cpp")) + + sources = [main_source] + sources + extension = CppExtension + + extra_compile_args = {"cxx": ["-O3"]} + define_macros = [] + + include_dirs = [extensions_dir] + + ext_modules = [ + extension( + "yolox._C", + sources, + include_dirs=include_dirs, + define_macros=define_macros, + extra_compile_args=extra_compile_args, + ) + ] + + return ext_modules + + +with open("yolox/__init__.py", "r") as f: + version = re.search( + r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]', + f.read(), re.MULTILINE + ).group(1) + + +with open("README.md", "r") as f: + long_description = f.read() + + +setuptools.setup( + name="yolox", + version=version, + author="basedet team", + python_requires=">=3.6", + long_description=long_description, + ext_modules=get_extensions(), + classifiers=["Programming Language :: Python :: 3", "Operating System :: OS Independent"], + cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, + packages=setuptools.find_packages(), +) diff --git a/tools/demo.py b/tools/demo.py new file mode 100644 index 0000000000000000000000000000000000000000..dc1460a8cd4adf1700656df9f2b792b7dde6df84 --- /dev/null +++ b/tools/demo.py @@ -0,0 +1,278 @@ +import argparse +import os +import time +from loguru import logger + +import cv2 + +import torch +import torch.backends.cudnn as cudnn + +from yolox.data.data_augment import preproc +from yolox.data.datasets import COCO_CLASSES +from yolox.exp import get_exp +from yolox.utils import fuse_model, 
get_model_info, postprocess, setup_logger, vis, xyxy2xywh + +IMAGE_EXT = ['.jpg', '.jpeg', '.webp', '.bmp', '.png'] + + +def make_parser(): + parser = argparse.ArgumentParser("YOLOX Demo!") + parser.add_argument('demo', default='image', help='demo type, eg. image, video and webcam') + parser.add_argument("-expn", "--experiment-name", type=str, default=None) + parser.add_argument("-n", "--name", type=str, default=None, help="model name") + + parser.add_argument('--path', default='./demo', help='path to images or video') + parser.add_argument('--camid', type=int, default=0, help='webcam demo camera id') + parser.add_argument( + '--save_result', action='store_true', + help='whether to save the inference result of image/video' + ) + + # exp file + parser.add_argument( + "-f", + "--exp_file", + default=None, + type=str, + help="pls input your expriment description file", + ) + parser.add_argument("-c", "--ckpt", default=None, type=str, help="ckpt for eval") + parser.add_argument("--conf", default=None, type=float, help="test conf") + parser.add_argument("--nms", default=None, type=float, help="test nms threshold") + parser.add_argument("--tsize", default=None, type=int, help="test img size") + parser.add_argument( + "--fp16", + dest="fp16", + default=False, + action="store_true", + help="Adopting mix precision evaluating.", + ) + parser.add_argument( + "--fuse", + dest="fuse", + default=False, + action="store_true", + help="Fuse conv and bn for testing.", + ) + parser.add_argument( + "--trt", + dest="trt", + default=False, + action="store_true", + help="Using TensorRT model for testing.", + ) + parser.add_argument( + "opts", + help="Modify config options using the command-line", + default=None, + nargs=argparse.REMAINDER, + ) + return parser + + +def get_image_list(path): + image_names = [] + for maindir, subdir, file_name_list in os.walk(path): + for filename in file_name_list: + apath = os.path.join(maindir, filename) + ext = os.path.splitext(apath)[1] + if ext in 
IMAGE_EXT: + image_names.append(apath) + return image_names + + +class Predictor(object): + def __init__(self, model, exp, cls_names=COCO_CLASSES, trt_file=None, decoder=None): + self.model = model + self.cls_names = cls_names + self.decoder = decoder + self.num_classes = exp.num_classes + self.confthre = exp.test_conf + self.nmsthre = exp.nmsthre + self.test_size = exp.test_size + if trt_file is not None: + from torch2trt import TRTModule + model_trt = TRTModule() + model_trt.load_state_dict(torch.load(trt_file)) + + x = torch.ones(1, 3, exp.test_size[0], exp.test_size[1]).cuda() + self.model(x) + self.model = model_trt + self.rgb_means = (0.485, 0.456, 0.406) + self.std = (0.229, 0.224, 0.225) + + def inference(self, img): + img_info = {'id': 0} + if isinstance(img, str): + img_info['file_name'] = os.path.basename(img) + img = cv2.imread(img) + else: + img_info['file_name'] = None + + height, width = img.shape[:2] + img_info['height'] = height + img_info['width'] = width + img_info['raw_img'] = img + + img, ratio = preproc(img, self.test_size, self.rgb_means, self.std) + img_info['ratio'] = ratio + img = torch.from_numpy(img).unsqueeze(0).cuda() + + with torch.no_grad(): + t0 = time.time() + outputs = self.model(img) + if self.decoder is not None: + outputs = self.decoder(outputs, dtype=outputs.type()) + outputs = postprocess( + outputs, self.num_classes, self.confthre, self.nmsthre + ) + logger.info('Infer time: {:.4f}s'.format(time.time()-t0)) + return outputs, img_info + + def visual(self, output, img_info, cls_conf=0.35): + ratio = img_info['ratio'] + img = img_info['raw_img'] + output = output.cpu() + + bboxes = output[:, 0:4] + + # preprocessing: resize + bboxes /= ratio + bboxes = xyxy2xywh(bboxes) + + cls = output[:, 6] + scores = output[:, 4] * output[:, 5] + + vis_res = vis(img, bboxes, scores, cls, cls_conf, self.cls_names) + return vis_res + + +def image_demo(predictor, vis_folder, path, current_time, save_result): + if os.path.isdir(path): + files = 
get_image_list(path) + else: + files = [path] + files.sort() + for image_name in files: + outputs, img_info = predictor.inference(image_name) + result_image = predictor.visual(outputs[0], img_info) + if save_result: + save_folder = os.path.join( + vis_folder, time.strftime("%Y_%m_%d_%H_%M_%S", current_time) + ) + os.makedirs(save_folder, exist_ok=True) + save_file_name = os.path.join(save_folder, os.path.basename(image_name)) + logger.info("Saving detection result in {}".format(save_file_name)) + cv2.imwrite(save_file_name, result_image) + ch = cv2.waitKey(0) + if ch == 27 or ch == ord('q') or ch == ord('Q'): + break + + +def imageflow_demo(predictor, vis_folder, current_time, args): + cap = cv2.VideoCapture(args.path if args.demo == 'video' else args.camid) + width = cap.get(cv2.CAP_PROP_FRAME_WIDTH) # float + height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT) # float + fps = cap.get(cv2.CAP_PROP_FPS) + save_folder = os.path.join(vis_folder, time.strftime("%Y_%m_%d_%H_%M_%S", current_time)) + os.makedirs(save_folder, exist_ok=True) + if args.demo == "video": + save_path = os.path.join(save_folder, args.path.split('/')[-1]) + else: + save_path = os.path.join(save_folder, 'camera.mp4') + logger.info(f'video save_path is {save_path}') + vid_writer = cv2.VideoWriter( + save_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (int(width), int(height)) + ) + while True: + ret_val, frame = cap.read() + if ret_val: + outputs, img_info = predictor.inference(frame) + result_frame = predictor.visualize(outputs[0], img_info) + if args.save_result: + vid_writer.write(result_frame) + ch = cv2.waitKey(1) + if ch == 27 or ch == ord('q') or ch == ord('Q'): + break + else: + break + + +def main(exp, args): + if not args.experiment_name: + args.experiment_name = exp.exp_name + + # set environment variables for distributed training + cudnn.benchmark = True + rank = 0 + + file_name = os.path.join(exp.output_dir, args.experiment_name) + os.makedirs(file_name, exist_ok=True) + + if args.save_result: + 
vis_folder = os.path.join(file_name, 'vis_res') + os.makedirs(vis_folder, exist_ok=True) + + setup_logger( + file_name, distributed_rank=rank, filename="demo_log.txt", mode="a" + ) + logger.info("Args: {}".format(args)) + + if args.conf is not None: + exp.test_conf = args.conf + if args.nms is not None: + exp.nmsthre = args.nms + if args.tsize is not None: + exp.test_size = (args.tsize, args.tsize) + + model = exp.get_model() + logger.info("Model Summary: {}".format(get_model_info(model, exp.test_size))) + + torch.cuda.set_device(rank) + model.cuda(rank) + model.eval() + + if not args.trt: + if args.ckpt is None: + ckpt_file = os.path.join(file_name, "best_ckpt.pth.tar") + else: + ckpt_file = args.ckpt + logger.info("loading checkpoint") + loc = "cuda:{}".format(rank) + ckpt = torch.load(ckpt_file, map_location=loc) + # load the model state dict + model.load_state_dict(ckpt["model"]) + logger.info("loaded checkpoint done.") + + if args.fuse: + logger.info("\tFusing model...") + model = fuse_model(model) + + if args.trt: + assert (not args.fuse),\ + "TensorRT model is not support model fusing!" + trt_file = os.path.join(file_name, "model_trt.pth") + assert os.path.exists(trt_file), ( + "TensorRT model is not found!\n Run python3 yolox/deploy/trt.py first!" 
+        )
+        # the decoder is taken from the head and passed to Predictor separately
+        model.head.decode_in_inference = False
+        decoder = model.head.decode_outputs
+        logger.info("Using TensorRT to inference")
+    else:
+        trt_file = None
+        decoder = None
+
+    predictor = Predictor(model, exp, COCO_CLASSES, trt_file, decoder)
+    current_time = time.localtime()
+    # dispatch on the positional "demo" argument; any other value does nothing
+    if args.demo == 'image':
+        image_demo(predictor, vis_folder, args.path, current_time, args.save_result)
+    elif args.demo == 'video' or args.demo == 'webcam':
+        imageflow_demo(predictor, vis_folder, current_time, args)
+
+
+if __name__ == "__main__":
+    args = make_parser().parse_args()
+    exp = get_exp(args.exp_file, args.name)
+
+    main(exp, args)
diff --git a/tools/eval.py b/tools/eval.py
new file mode 100644
index 0000000000000000000000000000000000000000..cbc0bb6c9bd79fd10a1048e1b827c4b54cbcd233
--- /dev/null
+++ b/tools/eval.py
@@ -0,0 +1,195 @@
+import argparse
+import os
+import random
+import warnings
+from loguru import logger
+
+import torch
+import torch.backends.cudnn as cudnn
+from torch.nn.parallel import DistributedDataParallel as DDP
+
+from yolox.core import launch
+from yolox.exp import get_exp
+from yolox.utils import configure_nccl, fuse_model, get_local_rank, get_model_info, setup_logger
+
+
+def make_parser():
+    """Build the command-line parser of the evaluation script."""
+    parser = argparse.ArgumentParser("YOLOX Eval")
+    parser.add_argument("-expn", "--experiment-name", type=str, default=None)
+    parser.add_argument("-n", "--name", type=str, default=None, help="model name")
+
+    # distributed
+    parser.add_argument(
+        "--dist-backend", default="nccl", type=str, help="distributed backend"
+    )
+    parser.add_argument(
+        "--dist-url", default=None, type=str, help="url used to set up distributed training"
+    )
+    parser.add_argument("-b", "--batch-size", type=int, default=64, help="batch size")
+    parser.add_argument(
+        "-d", "--devices", default=None, type=int, help="device for training"
+    )
+    parser.add_argument(
+        "--local_rank", default=0, type=int, help="local rank for dist training"
+    )
+    parser.add_argument(
+        "--num_machine", default=1, type=int, help="num of node for training"
+    )
+    parser.add_argument(
+        "--machine_rank", default=0, type=int, help="node rank for multi-node training"
+    )
+    parser.add_argument(
+        "-f",
+        "--exp_file",
+        default=None,
+        type=str,
+        help="pls input your expriment description file",
+    )
+    parser.add_argument("-c", "--ckpt", default=None, type=str, help="ckpt for eval")
+    parser.add_argument("--conf", default=None, type=float, help="test conf")
+    parser.add_argument("--nms", default=None, type=float, help="test nms threshold")
+    parser.add_argument("--tsize", default=None, type=int, help="test img size")
+    parser.add_argument("--seed", default=None, type=int, help="eval seed")
+    parser.add_argument(
+        "--fp16",
+        dest="fp16",
+        default=False,
+        action="store_true",
+        help="Adopting mix precision evaluating.",
+    )
+    parser.add_argument(
+        "--fuse",
+        dest="fuse",
+        default=False,
+        action="store_true",
+        help="Fuse conv and bn for testing.",
+    )
+    parser.add_argument(
+        "--trt",
+        dest="trt",
+        default=False,
+        action="store_true",
+        help="Using TensorRT model for testing.",
+    )
+    parser.add_argument(
+        "--test",
+        dest="test",
+        default=False,
+        action="store_true",
+        help="Evaluating on test-dev set.",
+    )
+    parser.add_argument(
+        "--speed", dest="speed", default=False, action="store_true", help="speed test only."
+    )
+    parser.add_argument(
+        "opts",
+        help="Modify config options using the command-line",
+        default=None,
+        nargs=argparse.REMAINDER,
+    )
+    return parser
+
+
+@logger.catch
+def main(exp, num_gpu, args):
+    """Evaluate the model described by ``exp``.
+
+    Args:
+        exp: experiment object; supplies the model, the evaluator and the test settings.
+        num_gpu: number of GPUs in use; more than one enables the distributed path.
+        args: namespace produced by ``make_parser``.
+    """
+    if not args.experiment_name:
+        args.experiment_name = exp.exp_name
+
+    if args.seed is not None:
+        random.seed(args.seed)
+        torch.manual_seed(args.seed)
+        cudnn.deterministic = True
+        warnings.warn(
+            "You have chosen to seed testing. This will turn on the CUDNN deterministic setting, "
+        )
+
+    is_distributed = num_gpu > 1
+
+    # set environment variables for distributed training
+    configure_nccl()
+    # NOTE(review): benchmark mode is enabled even when a seed requested
+    # deterministic cuDNN above - confirm this combination is intended.
+    cudnn.benchmark = True
+
+    # rank = args.local_rank
+    rank = get_local_rank()
+
+    # only rank 0 removes a leftover "<experiment_name>ip_add.txt" file
+    if rank == 0:
+        if os.path.exists("./" + args.experiment_name + "ip_add.txt"):
+            os.remove("./" + args.experiment_name + "ip_add.txt")
+
+    file_name = os.path.join(exp.output_dir, args.experiment_name)
+
+    if rank == 0:
+        os.makedirs(file_name, exist_ok=True)
+
+    setup_logger(
+        file_name, distributed_rank=rank, filename="val_log.txt", mode="a"
+    )
+    logger.info("Args: {}".format(args))
+
+    # command-line overrides of the experiment's test settings
+    if args.conf is not None:
+        exp.test_conf = args.conf
+    if args.nms is not None:
+        exp.nmsthre = args.nms
+    if args.tsize is not None:
+        exp.test_size = (args.tsize, args.tsize)
+
+    model = exp.get_model()
+    logger.info("Model Summary: {}".format(get_model_info(model, exp.test_size)))
+    logger.info("Model Structure:\n{}".format(str(model)))
+
+    evaluator = exp.get_evaluator(args.batch_size, is_distributed, args.test)
+
+    torch.cuda.set_device(rank)
+    model.cuda(rank)
+    model.eval()
+
+    # no checkpoint is loaded for speed-only runs or when a TensorRT model is used
+    if not args.speed and not args.trt:
+        if args.ckpt is None:
+            ckpt_file = os.path.join(file_name, "best_ckpt.pth.tar")
+        else:
+            ckpt_file = args.ckpt
+        logger.info("loading checkpoint")
+        loc = "cuda:{}".format(rank)
+        ckpt = torch.load(ckpt_file, map_location=loc)
+        # load the model state dict
+        model.load_state_dict(ckpt["model"])
+        logger.info("loaded checkpoint done.")
+
+    if is_distributed:
+        model = DDP(model, device_ids=[rank])
+
+    if args.fuse:
+        logger.info("\tFusing model...")
+        model = fuse_model(model)
+
+    if args.trt:
+        assert (not args.fuse and not is_distributed and args.batch_size == 1),\
+            "TensorRT model is not support model fusing and distributed inferencing!"
+        trt_file = os.path.join(file_name, "model_trt.pth")
+        assert os.path.exists(trt_file), "TensorRT model is not found!\n Run tools/trt.py first!"
+ model.head.decode_in_inference = False + decoder = model.head.decode_outputs + else: + trt_file = None + decoder = None + + # start evaluate + *_, summary = evaluator.evaluate( + model, is_distributed, args.fp16, trt_file, decoder, exp.test_size + ) + logger.info("\n" + summary) + + +if __name__ == "__main__": + args = make_parser().parse_args() + exp = get_exp(args.exp_file, args.name) + exp.merge(args.opts) + + num_gpu = torch.cuda.device_count() if args.devices is None else args.devices + assert num_gpu <= torch.cuda.device_count() + + dist_url = "auto" if args.dist_url is None else args.dist_url + launch( + main, num_gpu, args.num_machine, backend=args.dist_backend, + dist_url=dist_url, args=(exp, num_gpu, args) + ) diff --git a/tools/export_onnx.py b/tools/export_onnx.py new file mode 100644 index 0000000000000000000000000000000000000000..1eae933252bbc853acde3798b683439d79b0a455 --- /dev/null +++ b/tools/export_onnx.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- + +import argparse +import os +from loguru import logger + +import torch +from torch import nn + +from yolox.exp import get_exp +from yolox.models.network_blocks import SiLU +from yolox.utils import replace_module + + +def make_parser(): + parser = argparse.ArgumentParser("YOLOX onnx deploy") + parser.add_argument( + "--output-name", type=str, default="yolox.onnx", help="output name of models" + ) + parser.add_argument("--input", default="images", type=str, help="input name of onnx model") + parser.add_argument("--output", default="output", type=str, help="output name of onnx model") + parser.add_argument("-o", "--opset", default=11, type=int, help="onnx opset version") + parser.add_argument("--no-onnxsim", action="store_true", help="use onnxsim or not") + + parser.add_argument( + "-f", + "--exp_file", + default=None, + type=str, + help="expriment description file", + ) + parser.add_argument("-expn", "--experiment-name", type=str, default=None) + parser.add_argument("-n", 
"--name", type=str, default=None, help="model name") + parser.add_argument("-c", "--ckpt", default=None, type=str, help="ckpt path") + parser.add_argument( + "opts", + help="Modify config options using the command-line", + default=None, + nargs=argparse.REMAINDER, + ) + + return parser + + +@logger.catch +def main(): + args = make_parser().parse_args() + logger.info("args value: {}".format(args)) + exp = get_exp(args.exp_file, args.name) + exp.merge(args.opts) + + if not args.experiment_name: + args.experiment_name = exp.exp_name + + model = exp.get_model() + if args.ckpt is None: + file_name = os.path.join(exp.output_dir, args.experiment_name) + ckpt_file = os.path.join(file_name, "best_ckpt.pth.tar") + else: + ckpt_file = args.ckpt + + ckpt = torch.load(ckpt_file, map_location="cpu") + # load the model state dict + + model.eval() + if "model" in ckpt: + ckpt = ckpt["model"] + model.load_state_dict(ckpt) + model = replace_module(model, nn.SiLU, SiLU) + model.head.decode_in_inference = False + + logger.info("loaded checkpoint done.") + dummy_input = torch.randn(1, 3, exp.test_size[0], exp.test_size[1]) + torch.onnx._export( + model, + dummy_input, + args.output_name, + input_names=[args.input], + output_names=[args.output], + opset_version=args.opset, + ) + logger.info("generate onnx named {}".format(args.output_name)) + + if not args.no_onnxsim: + # use onnxsimplify to reduce reduent model. 
+ os.system("python3 -m onnxsim {} {}".format(args.output_name, args.output_name)) + logger.info("generate simplify onnx named {}".format(args.output_name)) + + +if __name__ == "__main__": + main() diff --git a/tools/train.py b/tools/train.py new file mode 100644 index 0000000000000000000000000000000000000000..848086d1d0ae978ebd3d1db856e7a4e1388247f9 --- /dev/null +++ b/tools/train.py @@ -0,0 +1,112 @@ +import argparse +import random +import warnings +from loguru import logger + +import torch +import torch.backends.cudnn as cudnn + +from yolox.core import Trainer, launch +from yolox.exp import get_exp +from yolox.utils import configure_nccl + + +def make_parser(): + parser = argparse.ArgumentParser("YOLOX train parser") + parser.add_argument("-expn", "--experiment-name", type=str, default=None) + parser.add_argument("-n", "--name", type=str, default=None, help="model name") + + # distributed + parser.add_argument( + "--dist-backend", default="nccl", type=str, help="distributed backend" + ) + parser.add_argument( + "--dist-url", default=None, type=str, help="url used to set up distributed training" + ) + parser.add_argument("-b", "--batch-size", type=int, default=64, help="batch size") + parser.add_argument( + "-d", "--devices", default=None, type=int, help="device for training" + ) + parser.add_argument( + "--local_rank", default=0, type=int, help="local rank for dist training" + ) + parser.add_argument( + "-f", + "--exp_file", + default=None, + type=str, + help="plz input your expriment description file", + ) + parser.add_argument( + "--resume", default=False, action="store_true", help="resume training" + ) + parser.add_argument("-c", "--ckpt", default=None, type=str, help="checkpoint file") + parser.add_argument( + "-e", "--start_epoch", default=None, type=int, help="resume training start epoch" + ) + parser.add_argument( + "--num_machine", default=1, type=int, help="num of node for training" + ) + parser.add_argument( + "--machine_rank", default=0, type=int, 
help="node rank for multi-node training" + ) + parser.add_argument( + "--fp16", + dest="fp16", + default=True, + action="store_true", + help="Adopting mix precision training.", + ) + parser.add_argument( + "-o", + "--occumpy", + dest="occumpy", + default=False, + action="store_true", + help="occumpy GPU memory first for training.", + ) + parser.add_argument( + "opts", + help="Modify config options using the command-line", + default=None, + nargs=argparse.REMAINDER, + ) + return parser + + +@logger.catch +def main(exp, args): + if not args.experiment_name: + args.experiment_name = exp.exp_name + + if exp.seed is not None: + random.seed(exp.seed) + torch.manual_seed(exp.seed) + cudnn.deterministic = True + warnings.warn( + "You have chosen to seed training. This will turn on the CUDNN deterministic setting, " + "which can slow down your training considerably! You may see unexpected behavior " + "when restarting from checkpoints." + ) + + # set environment variables for distributed training + configure_nccl() + cudnn.benchmark = True + + trainer = Trainer(exp, args) + trainer.train() + + +if __name__ == "__main__": + args = make_parser().parse_args() + exp = get_exp(args.exp_file, args.name) + exp.merge(args.opts) + + num_gpu = torch.cuda.device_count() if args.devices is None else args.devices + assert num_gpu <= torch.cuda.device_count() + + dist_url = "auto" if args.dist_url is None else args.dist_url + launch( + main, num_gpu, args.num_machine, backend=args.dist_backend, + dist_url=dist_url, args=(exp, args) + ) diff --git a/tools/trt.py b/tools/trt.py new file mode 100644 index 0000000000000000000000000000000000000000..6ccf0a2b13bb7438849d8d1794be8897ffc644ba --- /dev/null +++ b/tools/trt.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- + +import argparse +import os +from loguru import logger + +import tensorrt as trt +import torch +from torch2trt import torch2trt + +from yolox.exp import get_exp + + +def make_parser(): + parser = 
argparse.ArgumentParser("YOLOX ncnn deploy") + parser.add_argument("-expn", "--experiment-name", type=str, default=None) + parser.add_argument("-n", "--name", type=str, default=None, help="model name") + + parser.add_argument( + "-f", + "--exp_file", + default=None, + type=str, + help="pls input your expriment description file", + ) + parser.add_argument("-c", "--ckpt", default=None, type=str, help="ckpt path") + return parser + + +@logger.catch +def main(): + args = make_parser().parse_args() + exp = get_exp(args.exp_file, args.name) + if not args.experiment_name: + args.experiment_name = exp.exp_name + + model = exp.get_model() + file_name = os.path.join(exp.output_dir, args.experiment_name) + os.makedirs(file_name, exist_ok=True) + if args.ckpt is None: + ckpt_file = os.path.join(file_name, "best_ckpt.pth.tar") + else: + ckpt_file = args.ckpt + + ckpt = torch.load(ckpt_file, map_location="cpu") + # load the model state dict + + model.load_state_dict(ckpt["model"]) + logger.info("loaded checkpoint done.") + model.eval() + model.cuda() + model.head.decode_in_inference = False + x = torch.ones(1, 3, exp.test_size[0], exp.test_size[1]).cuda() + model_trt = torch2trt( + model, + [x], + fp16_mode=True, + log_level=trt.Logger.INFO, + max_workspace_size=(1 << 32), + ) + torch.save(model_trt.state_dict(), os.path.join(file_name, 'model_trt.pth')) + logger.info("Converted TensorRT model done.") + + +if __name__ == "__main__": + main() diff --git a/yolox/__init__.py b/yolox/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1cbc411d419c55098e7d4e24ff0f21caaaf10a1f --- /dev/null +++ b/yolox/__init__.py @@ -0,0 +1,8 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- + +from .utils import configure_module + +configure_module() + +__version__ = "0.1.0" diff --git a/yolox/core/__init__.py b/yolox/core/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8484835d0a57aedada22894e085776c9effd3b20 --- /dev/null +++ 
b/yolox/core/__init__.py @@ -0,0 +1,5 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- + +from .launch import launch +from .trainer import Trainer diff --git a/yolox/core/launch.py b/yolox/core/launch.py new file mode 100644 index 0000000000000000000000000000000000000000..94e84f7865c27a63613ae7bbc2c082521abf66b7 --- /dev/null +++ b/yolox/core/launch.py @@ -0,0 +1,105 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Code are based on +# https://github.com/facebookresearch/detectron2/blob/master/detectron2/engine/launch.py +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Megvii, Inc. and its affiliates. + + +from loguru import logger + +import torch +import torch.distributed as dist +import torch.multiprocessing as mp + +import yolox.utils.dist as comm + +__all__ = ["launch"] + + +def _find_free_port(): + """ + Find an available port of current machine / node. + """ + import socket + + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + # Binding to port 0 will cause the OS to find an available port for us + sock.bind(("", 0)) + port = sock.getsockname()[1] + sock.close() + # NOTE: there is still a chance the port could be taken by other processes. + return port + + +def launch( + main_func, num_gpus_per_machine, num_machines=1, machine_rank=0, + backend="nccl", dist_url=None, args=() +): + """ + Args: + main_func: a function that will be called by `main_func(*args)` + num_machines (int): the total number of machines + machine_rank (int): the rank of this machine (one per machine) + dist_url (str): url to connect to for distributed training, including protocol + e.g. "tcp://127.0.0.1:8686". 
+ Can be set to auto to automatically select a free port on localhost + args (tuple): arguments passed to main_func + """ + world_size = num_machines * num_gpus_per_machine + if world_size > 1: + # https://github.com/pytorch/pytorch/pull/14391 + # TODO prctl in spawned processes + + if dist_url == "auto": + assert num_machines == 1, "dist_url=auto cannot work with distributed training." + port = _find_free_port() + dist_url = f"tcp://127.0.0.1:{port}" + + mp.spawn( + _distributed_worker, + nprocs=num_gpus_per_machine, + args=( + main_func, world_size, num_gpus_per_machine, + machine_rank, backend, dist_url, args + ), + daemon=False, + ) + else: + main_func(*args) + + +def _distributed_worker( + local_rank, main_func, world_size, num_gpus_per_machine, + machine_rank, backend, dist_url, args +): + assert torch.cuda.is_available(), "cuda is not available. Please check your installation." + global_rank = machine_rank * num_gpus_per_machine + local_rank + logger.info("Rank {} initialization finished.".format(global_rank)) + try: + dist.init_process_group( + backend=backend, + init_method=dist_url, + world_size=world_size, + rank=global_rank, + ) + except Exception: + logger.error("Process group URL: {}".format(dist_url)) + raise + # synchronize is needed here to prevent a possible timeout after calling init_process_group + # See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172 + comm.synchronize() + + assert num_gpus_per_machine <= torch.cuda.device_count() + torch.cuda.set_device(local_rank) + + # Setup the local process group (which contains ranks within the same machine) + assert comm._LOCAL_PROCESS_GROUP is None + num_machines = world_size // num_gpus_per_machine + for i in range(num_machines): + ranks_on_i = list(range(i * num_gpus_per_machine, (i + 1) * num_gpus_per_machine)) + pg = dist.new_group(ranks_on_i) + if i == machine_rank: + comm._LOCAL_PROCESS_GROUP = pg + + main_func(*args) diff --git a/yolox/core/trainer.py b/yolox/core/trainer.py 
new file mode 100644 index 0000000000000000000000000000000000000000..6f5d24bea1d27a0bd127fdef7883ce1c03044424 --- /dev/null +++ b/yolox/core/trainer.py @@ -0,0 +1,318 @@ +import datetime +import os +import time +from loguru import logger + +import apex +import torch +from apex import amp +from torch.utils.tensorboard import SummaryWriter + +from yolox.data import DataPrefetcher +from yolox.utils import ( + MeterBuffer, + ModelEMA, + all_reduce_norm, + get_local_rank, + get_model_info, + get_rank, + get_world_size, + gpu_mem_usage, + load_ckpt, + occumpy_mem, + save_checkpoint, + setup_logger, + synchronize +) + + +class Trainer: + + def __init__(self, exp, args): + # init function only defines some basic attr, other attrs like model, optimizer are built in + # before_train methods. + self.exp = exp + self.args = args + + # training related attr + self.max_epoch = exp.max_epoch + self.amp_training = args.fp16 + self.is_distributed = get_world_size() > 1 + self.rank = get_rank() + self.local_rank = get_local_rank() + self.device = "cuda:{}".format(self.local_rank) + self.use_model_ema = exp.ema + + # data/dataloader related attr + self.data_type = torch.float16 if args.fp16 else torch.float32 + self.input_size = exp.input_size + self.best_ap = 0 + + # metric record + self.meter = MeterBuffer(window_size=exp.print_interval) + self.file_name = os.path.join(exp.output_dir, args.experiment_name) + + if self.rank == 0 and os.path.exists("./" + args.experiment_name + "ip_add.txt"): + os.remove("./" + args.experiment_name + "ip_add.txt") + + if self.rank == 0: + os.makedirs(self.file_name, exist_ok=True) + + setup_logger(self.file_name, distributed_rank=self.rank, filename="train_log.txt", mode="a") + + def train(self): + self.before_train() + try: + self.train_in_epoch() + except Exception: + raise + finally: + self.after_train() + + def train_in_epoch(self): + for self.epoch in range(self.start_epoch, self.max_epoch): + self.before_epoch() + self.train_in_iter() + 
self.after_epoch() + + def train_in_iter(self): + for self.iter in range(self.max_iter): + self.before_iter() + self.train_one_iter() + self.after_iter() + + def train_one_iter(self): + iter_start_time = time.time() + + inps, targets = self.prefetcher.next() + inps = inps.to(self.data_type) + targets = targets.to(self.data_type) + targets.requires_grad = False + data_end_time = time.time() + + outputs = self.model(inps, targets) + loss = outputs["total_loss"] + + self.optimizer.zero_grad() + if self.amp_training: + with amp.scale_loss(loss, self.optimizer) as scaled_loss: + scaled_loss.backward() + else: + loss.backward() + self.optimizer.step() + + if self.use_model_ema: + self.ema_model.update(self.model) + + lr = self.lr_scheduler.update_lr(self.progress_in_iter + 1) + for param_group in self.optimizer.param_groups: + param_group["lr"] = lr + + iter_end_time = time.time() + self.meter.update( + iter_time=iter_end_time - iter_start_time, + data_time=data_end_time - iter_start_time, + lr=lr, + **outputs, + ) + + def before_train(self): + logger.info("args: {}".format(self.args)) + logger.info("exp value:\n{}".format(self.exp)) + + # model related init + torch.cuda.set_device(self.local_rank) + model = self.exp.get_model() + logger.info("Model Summary: {}".format(get_model_info(model, self.exp.test_size))) + model.to(self.device) + + # solver related init + self.optimizer = self.exp.get_optimizer(self.args.batch_size) + + if self.amp_training: + model, optimizer = amp.initialize(model, self.optimizer, opt_level="O1") + + # value of epoch will be set in `resume_train` + model = self.resume_train(model) + + # data related init + self.no_aug = self.start_epoch >= self.max_epoch - self.exp.no_aug_epochs + self.train_loader = self.exp.get_data_loader( + batch_size=self.args.batch_size, + is_distributed=self.is_distributed, + no_aug=self.no_aug + ) + logger.info("init prefetcher, this might take a while...") + self.prefetcher = DataPrefetcher(self.train_loader) + # 
max_iter means iters per epoch + self.max_iter = len(self.train_loader) + + self.lr_scheduler = self.exp.get_lr_scheduler( + self.exp.basic_lr_per_img * self.args.batch_size, self.max_iter + ) + if self.args.occumpy: + occumpy_mem(self.local_rank) + + if self.is_distributed: + model = apex.parallel.DistributedDataParallel(model) + # from torch.nn.parallel import DistributedDataParallel as DDP + # model = DDP(model, device_ids=[self.local_rank], broadcast_buffers=False) + + if self.use_model_ema: + self.ema_model = ModelEMA(model, 0.9998) + self.ema_model.updates = self.max_iter * self.start_epoch + + self.model = model + self.model.train() + + self.evaluator = self.exp.get_evaluator( + batch_size=self.args.batch_size, is_distributed=self.is_distributed + ) + # Tensorboard logger + if self.rank == 0: + self.tblogger = SummaryWriter(self.file_name) + + logger.info("Training start...") + logger.info("\n{}".format(model)) + + def after_train(self): + logger.info( + "Training of experiment is done and the best AP is {:.2f}".format(self.best_ap * 100) + ) + + def before_epoch(self): + logger.info("---> start train epoch{}".format(self.epoch + 1)) + + if self.epoch + 1 == self.max_epoch - self.exp.no_aug_epochs or self.no_aug: + logger.info("--->No mosaic aug now!") + self.train_loader.close_mosaic() + logger.info("--->Add additional L1 loss now!") + if self.is_distributed: + self.model.module.head.use_l1 = True + else: + self.model.head.use_l1 = True + self.exp.eval_interval = 1 + if not self.no_aug: + self.save_ckpt(ckpt_name="last_mosaic_epoch") + + def after_epoch(self): + if self.use_model_ema: + self.ema_model.update_attr(self.model) + + self.save_ckpt(ckpt_name="latest") + + if (self.epoch + 1) % self.exp.eval_interval == 0: + all_reduce_norm(self.model) + self.evaluate_and_save_model() + + def before_iter(self): + pass + + def after_iter(self): + """ + `after_iter` contains two parts of logic: + * log information + * reset setting of resize + """ + # log needed 
information + if (self.iter + 1) % self.exp.print_interval == 0: + # TODO check ETA logic + left_iters = self.max_iter * self.max_epoch - (self.progress_in_iter + 1) + eta_seconds = self.meter["iter_time"].global_avg * left_iters + eta_str = "ETA: {}".format(datetime.timedelta(seconds=int(eta_seconds))) + + progress_str = "epoch: {}/{}, iter: {}/{}".format( + self.epoch + 1, self.max_epoch, self.iter + 1, self.max_iter + ) + loss_meter = self.meter.get_filtered_meter("loss") + loss_str = ", ".join(["{}: {:.1f}".format(k, v.latest) for k, v in loss_meter.items()]) + + time_meter = self.meter.get_filtered_meter("time") + time_str = ", ".join(["{}: {:.3f}s".format(k, v.avg) for k, v in time_meter.items()]) + + logger.info( + "{}, mem: {:.0f}Mb, {}, {}, lr: {:.3e}".format( + progress_str, + gpu_mem_usage(), + time_str, + loss_str, + self.meter["lr"].latest, + ) + + (", size: {:d}, {}".format(self.input_size[0], eta_str)) + ) + self.meter.clear_meters() + + # random resizing + if self.exp.random_size is not None and (self.progress_in_iter + 1) % 10 == 0: + self.input_size = self.exp.random_resize( + self.train_loader, self.epoch, self.rank, self.is_distributed + ) + + @property + def progress_in_iter(self): + return self.epoch * self.max_iter + self.iter + + def resume_train(self, model): + if self.args.resume: + logger.info("resume training") + if self.args.ckpt is None: + ckpt_file = os.path.join(self.file_name, "latest" + "_ckpt.pth.tar") + else: + ckpt_file = self.args.ckpt + + ckpt = torch.load(ckpt_file, map_location=self.device) + # resume the model/optimizer state dict + model.load_state_dict(ckpt["model"]) + self.optimizer.load_state_dict(ckpt["optimizer"]) + # resume the training states variables + if self.amp_training and "amp" in ckpt: + amp.load_state_dict(ckpt["amp"]) + start_epoch = ( + self.args.start_epoch - 1 + if self.args.start_epoch is not None + else ckpt["start_epoch"] + ) + self.start_epoch = start_epoch + logger.info("loaded checkpoint '{}' 
(epoch {})".format(self.args.resume, self.start_epoch)) # noqa + else: + if self.args.ckpt is not None: + logger.info("loading checkpoint for fine tuning") + ckpt_file = self.args.ckpt + ckpt = torch.load(ckpt_file, map_location=self.device)["model"] + model = load_ckpt(self.model, ckpt) + self.start_epoch = 0 + + return model + + def evaluate_and_save_model(self): + evalmodel = self.ema_model.ema if self.use_model_ema else self.model + ap50_95, ap50, summary = self.exp.eval(evalmodel, self.evaluator, self.is_distributed) + self.model.train() + if self.rank == 0: + self.tblogger.add_scalar("val/COCOAP50", ap50, self.epoch + 1) + self.tblogger.add_scalar("val/COCOAP50_95", ap50_95, self.epoch + 1) + logger.info("\n" + summary) + synchronize() + + self.save_ckpt("last_epoch", ap50_95 > self.best_ap) + self.best_ap = max(self.best_ap, ap50_95) + + def save_ckpt(self, ckpt_name, update_best_ckpt=False): + if self.rank == 0: + save_model = self.ema_model.ema if self.use_model_ema else self.model + logger.info("Save weights to {}".format(self.file_name)) + ckpt_state = { + "start_epoch": self.epoch + 1, + "model": save_model.state_dict(), + "optimizer": self.optimizer.state_dict(), + } + if self.amp_training: + # save amp state according to + # https://nvidia.github.io/apex/amp.html#checkpointing + ckpt_state["amp"] = amp.state_dict() + save_checkpoint( + ckpt_state, + update_best_ckpt, + self.file_name, + ckpt_name, + ) diff --git a/yolox/data/__init__.py b/yolox/data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5b6bb501915f4bd131e3e706168122764a3514cc --- /dev/null +++ b/yolox/data/__init__.py @@ -0,0 +1,8 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- + +from .data_augment import TrainTransform, ValTransform +from .data_prefetcher import DataPrefetcher +from .dataloading import DataLoader, get_yolox_datadir +from .datasets import * +from .samplers import InfiniteSampler, YoloBatchSampler diff --git a/yolox/data/data_augment.py 
b/yolox/data/data_augment.py new file mode 100644 index 0000000000000000000000000000000000000000..5ccdbbe5f285e9e49b67ce4469fcb868c5ae0efc --- /dev/null +++ b/yolox/data/data_augment.py @@ -0,0 +1,389 @@ +""" +Data augmentation functionality. Passed as callable transformations to +Dataset classes. + +The data augmentation procedures were interpreted from @weiliu89's SSD paper +http://arxiv.org/abs/1512.02325 +""" + +import math +import random + +import cv2 +import numpy as np + +import torch + + +def augment_hsv(img, hgain=0.015, sgain=0.7, vgain=0.4): + r = np.random.uniform(-1, 1, 3) * [hgain, sgain, vgain] + 1 # random gains + hue, sat, val = cv2.split(cv2.cvtColor(img, cv2.COLOR_BGR2HSV)) + dtype = img.dtype # uint8 + + x = np.arange(0, 256, dtype=np.int16) + lut_hue = ((x * r[0]) % 180).astype(dtype) + lut_sat = np.clip(x * r[1], 0, 255).astype(dtype) + lut_val = np.clip(x * r[2], 0, 255).astype(dtype) + + img_hsv = cv2.merge( + (cv2.LUT(hue, lut_hue), cv2.LUT(sat, lut_sat), cv2.LUT(val, lut_val)) + ).astype(dtype) + cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR, dst=img) # no return needed + + +def box_candidates(box1, box2, wh_thr=2, ar_thr=20, area_thr=0.2): + # box1(4,n), box2(4,n) + # Compute candidate boxes which include follwing 5 things: + # box1 before augment, box2 after augment, wh_thr (pixels), aspect_ratio_thr, area_ratio + w1, h1 = box1[2] - box1[0], box1[3] - box1[1] + w2, h2 = box2[2] - box2[0], box2[3] - box2[1] + ar = np.maximum(w2 / (h2 + 1e-16), h2 / (w2 + 1e-16)) # aspect ratio + return ( + (w2 > wh_thr) + & (h2 > wh_thr) + & (w2 * h2 / (w1 * h1 + 1e-16) > area_thr) + & (ar < ar_thr) + ) # candidates + + +def random_perspective( + img, targets=(), degrees=10, translate=0.1, scale=0.1, shear=10, perspective=0.0, border=(0, 0), +): + # targets = [cls, xyxy] + height = img.shape[0] + border[0] * 2 # shape(h,w,c) + width = img.shape[1] + border[1] * 2 + + # Center + C = np.eye(3) + C[0, 2] = -img.shape[1] / 2 # x translation (pixels) + C[1, 2] = 
-img.shape[0] / 2 # y translation (pixels) + + # Rotation and Scale + R = np.eye(3) + a = random.uniform(-degrees, degrees) + # a += random.choice([-180, -90, 0, 90]) # add 90deg rotations to small rotations + s = random.uniform(scale[0], scale[1]) + # s = 2 ** random.uniform(-scale, scale) + R[:2] = cv2.getRotationMatrix2D(angle=a, center=(0, 0), scale=s) + + # Shear + S = np.eye(3) + S[0, 1] = math.tan(random.uniform(-shear, shear) * math.pi / 180) # x shear (deg) + S[1, 0] = math.tan(random.uniform(-shear, shear) * math.pi / 180) # y shear (deg) + + # Translation + T = np.eye(3) + T[0, 2] = (random.uniform(0.5 - translate, 0.5 + translate) * width) # x translation (pixels) + T[1, 2] = (random.uniform(0.5 - translate, 0.5 + translate) * height) # y translation (pixels) + + # Combined rotation matrix + M = T @ S @ R @ C # order of operations (right to left) is IMPORTANT + + ########################### + # For Aug out of Mosaic + # s = 1. + # M = np.eye(3) + ########################### + + if (border[0] != 0) or (border[1] != 0) or (M != np.eye(3)).any(): # image changed + if perspective: + img = cv2.warpPerspective(img, M, dsize=(width, height), borderValue=(114, 114, 114)) + else: # affine + img = cv2.warpAffine(img, M[:2], dsize=(width, height), borderValue=(114, 114, 114)) + + # Transform label coordinates + n = len(targets) + if n: + # warp points + xy = np.ones((n * 4, 3)) + xy[:, :2] = targets[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape(n * 4, 2) # x1y1, x2y2, x1y2, x2y1 + xy = xy @ M.T # transform + if perspective: + xy = (xy[:, :2] / xy[:, 2:3]).reshape(n, 8) # rescale + else: # affine + xy = xy[:, :2].reshape(n, 8) + + # create new boxes + x = xy[:, [0, 2, 4, 6]] + y = xy[:, [1, 3, 5, 7]] + xy = np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T + + # clip boxes + xy[:, [0, 2]] = xy[:, [0, 2]].clip(0, width) + xy[:, [1, 3]] = xy[:, [1, 3]].clip(0, height) + + # filter candidates + i = box_candidates(box1=targets[:, :4].T * s, box2=xy.T) + 
targets = targets[i] + targets[:, :4] = xy[i] + + return img, targets + + +def _distort(image): + def _convert(image, alpha=1, beta=0): + tmp = image.astype(float) * alpha + beta + tmp[tmp < 0] = 0 + tmp[tmp > 255] = 255 + image[:] = tmp + + image = image.copy() + + if random.randrange(2): + _convert(image, beta=random.uniform(-32, 32)) + + if random.randrange(2): + _convert(image, alpha=random.uniform(0.5, 1.5)) + + image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV) + + if random.randrange(2): + tmp = image[:, :, 0].astype(int) + random.randint(-18, 18) + tmp %= 180 + image[:, :, 0] = tmp + + if random.randrange(2): + _convert(image[:, :, 1], alpha=random.uniform(0.5, 1.5)) + + image = cv2.cvtColor(image, cv2.COLOR_HSV2BGR) + + return image + + +def _mirror(image, boxes): + _, width, _ = image.shape + if random.randrange(2): + image = image[:, ::-1] + boxes = boxes.copy() + boxes[:, 0::2] = width - boxes[:, 2::-2] + return image, boxes + + +# TODO: reorg: use mosaicDet instead +def _random_affine( + img, + targets=None, + degrees=(-10, 10), + translate=(0.1, 0.1), + scale=(0.9, 1.1), + shear=(-2, 2), + borderValue=(114, 114, 114), +): + # degrees = (0, 0) + # shear = (0, 0) + border = 0 # width of added border (optional) + # height = max(img.shape[0], img.shape[1]) + border * 2 + height, width, _ = img.shape + + # Rotation and Scale + R = np.eye(3) + a = random.random() * (degrees[1] - degrees[0]) + degrees[0] + # a += random.choice([-180, -90, 0, 90]) # 90deg rotations added to small rotations + s = random.random() * (scale[1] - scale[0]) + scale[0] + R[:2] = cv2.getRotationMatrix2D( + angle=a, center=(img.shape[1] / 2, img.shape[0] / 2), scale=s + ) + + # Translation + T = np.eye(3) + # x translation (pixels) + T[0, 2] = (random.random() * 2 - 1) * translate[0] * img.shape[0] + border + # y translation (pixels) + T[1, 2] = (random.random() * 2 - 1) * translate[1] * img.shape[1] + border + + # Shear + S = np.eye(3) + # x shear (deg) + S[0, 1] = 
math.tan((random.random() * (shear[1] - shear[0]) + shear[0]) * math.pi / 180) + # y shear (deg) + S[1, 0] = math.tan((random.random() * (shear[1] - shear[0]) + shear[0]) * math.pi / 180) + + # Combined rotation matrix. NOTE: ORDER IS IMPORTANT HERE!! + M = S @ T @ R + # BGR order borderValue + imw = cv2.warpPerspective( + img, M, dsize=(width, height), flags=cv2.INTER_LINEAR, borderValue=borderValue + ) + + # Return warped points also + if targets is not None: + if len(targets) > 0: + n = targets.shape[0] + points = targets[:, 0:4].copy() + + # warp points + xy = np.ones((n * 4, 3)) + xy[:, :2] = points[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape( + n * 4, 2 + ) # x1y1, x2y2, x1y2, x2y1 + xy = (xy @ M.T)[:, :2].reshape(n, 8) + + # create new boxes + x = xy[:, [0, 2, 4, 6]] + y = xy[:, [1, 3, 5, 7]] + xy = ( + np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T + ) + + # apply angle-based reduction + radians = a * math.pi / 180 + reduction = max(abs(math.sin(radians)), abs(math.cos(radians))) ** 0.5 + x = (xy[:, 2] + xy[:, 0]) / 2 + y = (xy[:, 3] + xy[:, 1]) / 2 + w = (xy[:, 2] - xy[:, 0]) * reduction + h = (xy[:, 3] - xy[:, 1]) * reduction + xy = ( + np.concatenate((x - w / 2, y - h / 2, x + w / 2, y + h / 2)) + .reshape(4, n) + .T + ) + + # reject warped points outside of image + x1 = np.clip(xy[:, 0], 0, width) + y1 = np.clip(xy[:, 1], 0, height) + x2 = np.clip(xy[:, 2], 0, width) + y2 = np.clip(xy[:, 3], 0, height) + boxes = np.concatenate((x1, y1, x2, y2)).reshape(4, n).T + + return imw, boxes, M + else: + return imw + + +def preproc(image, input_size, mean, std, swap=(2, 0, 1)): + if len(image.shape) == 3: + padded_img = np.ones((input_size[0], input_size[1], 3)) * 114.0 + else: + padded_img = np.ones(input_size) * 114.0 + img = np.array(image) + r = min(input_size[0] / img.shape[0], input_size[1] / img.shape[1]) + resized_img = cv2.resize( + img, (int(img.shape[1] * r), int(img.shape[0] * r)), interpolation=cv2.INTER_LINEAR + 
).astype(np.float32) + padded_img[: int(img.shape[0] * r), : int(img.shape[1] * r)] = resized_img + image = padded_img + + image = image.astype(np.float32) + image = image[:, :, ::-1] + image /= 255.0 + if mean is not None: + image -= mean + if std is not None: + image /= std + image = image.transpose(swap) + image = np.ascontiguousarray(image, dtype=np.float32) + return image, r + + +class TrainTransform: + def __init__(self, p=0.5, rgb_means=None, std=None, max_labels=50): + self.means = rgb_means + self.std = std + self.p = p + self.max_labels = max_labels + + def __call__(self, image, targets, input_dim): + boxes = targets[:, :4].copy() + labels = targets[:, 4].copy() + if targets.shape[1] > 5: + mixup = True + ratios = targets[:, -1].copy() + ratios_o = targets[:, -1].copy() + else: + mixup = False + ratios = None + ratios_o = None + lshape = 6 if mixup else 5 + if len(boxes) == 0: + targets = np.zeros((self.max_labels, lshape), dtype=np.float32) + image, r_o = preproc(image, input_dim, self.means, self.std) + image = np.ascontiguousarray(image, dtype=np.float32) + return image, targets + + image_o = image.copy() + targets_o = targets.copy() + height_o, width_o, _ = image_o.shape + boxes_o = targets_o[:, :4] + labels_o = targets_o[:, 4] + # bbox_o: [xyxy] to [c_x,c_y,w,h] + b_x_o = (boxes_o[:, 2] + boxes_o[:, 0]) * 0.5 + b_y_o = (boxes_o[:, 3] + boxes_o[:, 1]) * 0.5 + b_w_o = (boxes_o[:, 2] - boxes_o[:, 0]) * 1.0 + b_h_o = (boxes_o[:, 3] - boxes_o[:, 1]) * 1.0 + boxes_o[:, 0] = b_x_o + boxes_o[:, 1] = b_y_o + boxes_o[:, 2] = b_w_o + boxes_o[:, 3] = b_h_o + + image_t = _distort(image) + image_t, boxes = _mirror(image_t, boxes) + height, width, _ = image_t.shape + image_t, r_ = preproc(image_t, input_dim, self.means, self.std) + boxes = boxes.copy() + # boxes [xyxy] 2 [cx,cy,w,h] + b_x = (boxes[:, 2] + boxes[:, 0]) * 0.5 + b_y = (boxes[:, 3] + boxes[:, 1]) * 0.5 + b_w = (boxes[:, 2] - boxes[:, 0]) * 1.0 + b_h = (boxes[:, 3] - boxes[:, 1]) * 1.0 + boxes[:, 0] = 
b_x + boxes[:, 1] = b_y + boxes[:, 2] = b_w + boxes[:, 3] = b_h + + boxes *= r_ + + mask_b = np.minimum(boxes[:, 2], boxes[:, 3]) > 8 + boxes_t = boxes[mask_b] + labels_t = labels[mask_b].copy() + if mixup: + ratios_t = ratios[mask_b].copy() + + if len(boxes_t) == 0: + image_t, r_o = preproc(image_o, input_dim, self.means, self.std) + boxes_o *= r_o + boxes_t = boxes_o + labels_t = labels_o + ratios_t = ratios_o + + labels_t = np.expand_dims(labels_t, 1) + if mixup: + ratios_t = np.expand_dims(ratios_t, 1) + targets_t = np.hstack((labels_t, boxes_t, ratios_t)) + else: + targets_t = np.hstack((labels_t, boxes_t)) + padded_labels = np.zeros((self.max_labels, lshape)) + padded_labels[range(len(targets_t))[: self.max_labels]] = targets_t[ + : self.max_labels + ] + padded_labels = np.ascontiguousarray(padded_labels, dtype=np.float32) + image_t = np.ascontiguousarray(image_t, dtype=np.float32) + return image_t, padded_labels + + +class ValTransform: + """ + Defines the transformations that should be applied to test PIL image + for input into the network + + dimension -> tensorize -> color adj + + Arguments: + resize (int): input dimension to SSD + rgb_means ((int,int,int)): average RGB of the dataset + (104,117,123) + swap ((int,int,int)): final order of channels + + Returns: + transform (transform) : callable transform to be applied to test/val + data + """ + + def __init__(self, rgb_means=None, std=None, swap=(2, 0, 1)): + self.means = rgb_means + self.swap = swap + self.std = std + + # assume input is cv2 img for now + def __call__(self, img, res, input_size): + img, _ = preproc(img, input_size, self.means, self.std, self.swap) + return torch.from_numpy(img), torch.zeros(1, 5) diff --git a/yolox/data/data_prefetcher.py b/yolox/data/data_prefetcher.py new file mode 100644 index 0000000000000000000000000000000000000000..6ef73c0a70ece9a7bf5692c5cbc58bbc52c864d6 --- /dev/null +++ b/yolox/data/data_prefetcher.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 
-*- + +import random + +import torch +import torch.distributed as dist + +from yolox.utils import synchronize + + +class DataPrefetcher: + """ + DataPrefetcher is inspired by code of following file: + https://github.com/NVIDIA/apex/blob/master/examples/imagenet/main_amp.py + It could speedup your pytorch dataloader. For more information, please check + https://github.com/NVIDIA/apex/issues/304#issuecomment-493562789. + """ + + def __init__(self, loader): + self.loader = iter(loader) + self.stream = torch.cuda.Stream() + self.input_cuda = self._input_cuda_for_image + self.record_stream = DataPrefetcher._record_stream_for_image + self.preload() + + def preload(self): + try: + self.next_input, self.next_target, _, _ = next(self.loader) + except StopIteration: + self.next_input = None + self.next_target = None + return + + with torch.cuda.stream(self.stream): + self.input_cuda() + self.next_target = self.next_target.cuda(non_blocking=True) + + def next(self): + torch.cuda.current_stream().wait_stream(self.stream) + input = self.next_input + target = self.next_target + if input is not None: + self.record_stream(input) + if target is not None: + target.record_stream(torch.cuda.current_stream()) + self.preload() + return input, target + + def _input_cuda_for_image(self): + self.next_input = self.next_input.cuda(non_blocking=True) + + @staticmethod + def _record_stream_for_image(input): + input.record_stream(torch.cuda.current_stream()) + + +def random_resize(data_loader, exp, epoch, rank, is_distributed): + tensor = torch.LongTensor(1).cuda() + if is_distributed: + synchronize() + + if rank == 0: + if epoch > exp.max_epoch - 10: + size = exp.input_size + else: + size = random.randint(*exp.random_size) + size = int(32 * size) + tensor.fill_(size) + + if is_distributed: + synchronize() + dist.broadcast(tensor, 0) + + input_size = data_loader.change_input_dim(multiple=tensor.item(), random_range=None) + return input_size diff --git a/yolox/data/dataloading.py 
b/yolox/data/dataloading.py new file mode 100644 index 0000000000000000000000000000000000000000..d4f9512ddc90f8751d2b1de1ba819c58f5a7a40d --- /dev/null +++ b/yolox/data/dataloading.py @@ -0,0 +1,172 @@ +import os +import random + +import torch +from torch.utils.data.dataloader import DataLoader as torchDataLoader +from torch.utils.data.dataloader import default_collate + +from .samplers import YoloBatchSampler + + +def get_yolox_datadir(): + """ + get dataset dir of YOLOX. If environment variable named `YOLOX_DATADIR` is set, + this function will return value of the environment variable. Otherwise, use data + """ + yolox_datadir = os.getenv("YOLOX_DATADIR", None) + if yolox_datadir is None: + import yolox + yolox_datadir = os.path.join(os.path.dirname(yolox.__file__), "data") + return yolox_datadir + + +class DataLoader(torchDataLoader): + """ + Lightnet dataloader that enables on the fly resizing of the images. + See :class:`torch.utils.data.DataLoader` for more information on the arguments. + Check more on the following website: + https://gitlab.com/EAVISE/lightnet/-/blob/master/lightnet/data/_dataloading.py + + Note: + This dataloader only works with :class:`lightnet.data.Dataset` based datasets. + + Example: + >>> class CustomSet(ln.data.Dataset): + ... def __len__(self): + ... return 4 + ... @ln.data.Dataset.resize_getitem + ... def __getitem__(self, index): + ... # Should return (image, anno) but here we return (input_dim,) + ... return (self.input_dim,) + >>> dl = ln.data.DataLoader( + ... CustomSet((200,200)), + ... batch_size = 2, + ... collate_fn = ln.data.list_collate # We want the data to be grouped as a list + ... ) + >>> dl.dataset.input_dim # Default input_dim + (200, 200) + >>> for d in dl: + ... d + [[(200, 200), (200, 200)]] + [[(200, 200), (200, 200)]] + >>> dl.change_input_dim(320, random_range=None) + (320, 320) + >>> for d in dl: + ... 
d + [[(320, 320), (320, 320)]] + [[(320, 320), (320, 320)]] + >>> dl.change_input_dim((480, 320), random_range=None) + (480, 320) + >>> for d in dl: + ... d + [[(480, 320), (480, 320)]] + [[(480, 320), (480, 320)]] + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.__initialized = False + shuffle = False + batch_sampler = None + if len(args) > 5: + shuffle = args[2] + sampler = args[3] + batch_sampler = args[4] + elif len(args) > 4: + shuffle = args[2] + sampler = args[3] + if "batch_sampler" in kwargs: + batch_sampler = kwargs["batch_sampler"] + elif len(args) > 3: + shuffle = args[2] + if "sampler" in kwargs: + sampler = kwargs["sampler"] + if "batch_sampler" in kwargs: + batch_sampler = kwargs["batch_sampler"] + else: + if "shuffle" in kwargs: + shuffle = kwargs["shuffle"] + if "sampler" in kwargs: + sampler = kwargs["sampler"] + if "batch_sampler" in kwargs: + batch_sampler = kwargs["batch_sampler"] + + # Use custom BatchSampler + if batch_sampler is None: + if sampler is None: + if shuffle: + sampler = torch.utils.data.sampler.RandomSampler(self.dataset) + # sampler = torch.utils.data.DistributedSampler(self.dataset) + else: + sampler = torch.utils.data.sampler.SequentialSampler(self.dataset) + batch_sampler = YoloBatchSampler( + sampler, + self.batch_size, + self.drop_last, + input_dimension=self.dataset.input_dim, + ) + # batch_sampler = IterationBasedBatchSampler(batch_sampler, num_iterations = + + self.batch_sampler = batch_sampler + + self.__initialized = True + + def close_mosaic(self): + self.batch_sampler.mosaic = False + + def change_input_dim(self, multiple=32, random_range=(10, 19)): + """ This function will compute a new size and update it on the next mini_batch. + + Args: + multiple (int or tuple, optional): values to multiply the randomly generated range by. 
+ Default **32** + random_range (tuple, optional): This (min, max) tuple sets the range + for the randomisation; Default **(10, 19)** + + Return: + tuple: width, height tuple with new dimension + + Note: + The new size is generated as follows: |br| + First we compute a random integer inside ``[random_range]``. + We then multiply that number with the ``multiple`` argument, + which gives our final new input size. |br| + If ``multiple`` is an integer we generate a square size. If you give a tuple + of **(width, height)**, the size is computed + as :math:`rng * multiple[0], rng * multiple[1]`. + + Note: + You can set the ``random_range`` argument to **None** to set + an exact size of multiply. |br| + See the example above for how this works. + """ + if random_range is None: + size = 1 + else: + size = random.randint(*random_range) + + if isinstance(multiple, int): + size = (size * multiple, size * multiple) + else: + size = (size * multiple[0], size * multiple[1]) + + self.batch_sampler.new_input_dim = size + + return size + + +def list_collate(batch): + """ + Function that collates lists or tuples together into one list (of lists/tuples). + Use this as the collate function in a Dataloader, if you want to have a list of + items as an output, as opposed to tensors (eg. Brambox.boxes). 
+ """ + items = list(zip(*batch)) + + for i in range(len(items)): + if isinstance(items[i][0], (list, tuple)): + items[i] = list(items[i]) + else: + items[i] = default_collate(items[i]) + + return items diff --git a/yolox/data/datasets/__init__.py b/yolox/data/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..75dc1aaf4ae7438a98bd7fe4bea6c5bc2d878a7a --- /dev/null +++ b/yolox/data/datasets/__init__.py @@ -0,0 +1,7 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- + +from .coco import COCODataset +from .coco_classes import COCO_CLASSES +from .datasets_wrapper import ConcatDataset, Dataset, MixConcatDataset +from .mosaicdetection import MosaicDetection diff --git a/yolox/data/datasets/coco.py b/yolox/data/datasets/coco.py new file mode 100644 index 0000000000000000000000000000000000000000..f32f98817de54f7a574f33fc5a0e78a55ce8399d --- /dev/null +++ b/yolox/data/datasets/coco.py @@ -0,0 +1,119 @@ +import os + +import cv2 +import numpy as np +from pycocotools.coco import COCO + +from ..dataloading import get_yolox_datadir +from .datasets_wrapper import Dataset + + +class COCODataset(Dataset): + """ + COCO dataset class. + """ + + def __init__( + self, + data_dir=None, + json_file="instances_train2017.json", + name="train2017", + img_size=(416, 416), + preproc=None, + ): + """ + COCO dataset initialization. Annotation data are read into memory by COCO API. + Args: + data_dir (str): dataset root directory + json_file (str): COCO json file name + name (str): COCO data name (e.g. 
'train2017' or 'val2017') + img_size (int): target image size after pre-processing + preproc: data augmentation strategy + """ + super().__init__(img_size) + if data_dir is None: + data_dir = os.path.join(get_yolox_datadir(), "COCO") + self.data_dir = data_dir + self.json_file = json_file + + self.coco = COCO(os.path.join(self.data_dir, "annotations", self.json_file)) + self.ids = self.coco.getImgIds() + self.class_ids = sorted(self.coco.getCatIds()) + cats = self.coco.loadCats(self.coco.getCatIds()) + self._classes = tuple([c["name"] for c in cats]) + self.name = name + self.max_labels = 50 + self.img_size = img_size + self.preproc = preproc + + def __len__(self): + return len(self.ids) + + def pull_item(self, index): + id_ = self.ids[index] + + im_ann = self.coco.loadImgs(id_)[0] + width = im_ann["width"] + height = im_ann["height"] + anno_ids = self.coco.getAnnIds(imgIds=[int(id_)], iscrowd=False) + annotations = self.coco.loadAnns(anno_ids) + + # load image and preprocess + img_file = os.path.join( + self.data_dir, self.name, "{:012}".format(id_) + ".jpg" + ) + + img = cv2.imread(img_file) + assert img is not None + + # load labels + valid_objs = [] + for obj in annotations: + x1 = np.max((0, obj["bbox"][0])) + y1 = np.max((0, obj["bbox"][1])) + x2 = np.min((width - 1, x1 + np.max((0, obj["bbox"][2] - 1)))) + y2 = np.min((height - 1, y1 + np.max((0, obj["bbox"][3] - 1)))) + if obj["area"] > 0 and x2 >= x1 and y2 >= y1: + obj["clean_bbox"] = [x1, y1, x2, y2] + valid_objs.append(obj) + objs = valid_objs + num_objs = len(objs) + + res = np.zeros((num_objs, 5)) + + for ix, obj in enumerate(objs): + cls = self.class_ids.index(obj["category_id"]) + res[ix, 0:4] = obj["clean_bbox"] + res[ix, 4] = cls + + img_info = (height, width) + + return img, res, img_info, id_ + + @Dataset.resize_getitem + def __getitem__(self, index): + """ + One image / label pair for the given index is picked up and pre-processed. 
+ + Args: + index (int): data index + + Returns: + img (numpy.ndarray): pre-processed image + padded_labels (torch.Tensor): pre-processed label data. + The shape is :math:`[self.max_labels, 5]`. + each label consists of [class, xc, yc, w, h]: + class (float): class index. + xc, yc (float) : center of bbox whose values range from 0 to 1. + w, h (float) : size of bbox whose values range from 0 to 1. + info_img : tuple of h, w, nh, nw, dx, dy. + h, w (int): original shape of the image + nh, nw (int): shape of the resized image without padding + dx, dy (int): pad size + img_id (int): same as the input index. Used for evaluation. + """ + img, res, img_info, img_id = self.pull_item(index) + + if self.preproc is not None: + img, target = self.preproc(img, res, self.input_dim) + return img, target, img_info, img_id diff --git a/yolox/data/datasets/coco_classes.py b/yolox/data/datasets/coco_classes.py new file mode 100644 index 0000000000000000000000000000000000000000..f3ca22671fd20f0303667761226dc6fd945e65bb --- /dev/null +++ b/yolox/data/datasets/coco_classes.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- + + +COCO_CLASSES = ( + "person", + "bicycle", + "car", + "motorcycle", + "airplane", + "bus", + "train", + "truck", + "boat", + "traffic light", + "fire hydrant", + "stop sign", + "parking meter", + "bench", + "bird", + "cat", + "dog", + "horse", + "sheep", + "cow", + "elephant", + "bear", + "zebra", + "giraffe", + "backpack", + "umbrella", + "handbag", + "tie", + "suitcase", + "frisbee", + "skis", + "snowboard", + "sports ball", + "kite", + "baseball bat", + "baseball glove", + "skateboard", + "surfboard", + "tennis racket", + "bottle", + "wine glass", + "cup", + "fork", + "knife", + "spoon", + "bowl", + "banana", + "apple", + "sandwich", + "orange", + "broccoli", + "carrot", + "hot dog", + "pizza", + "donut", + "cake", + "chair", + "couch", + "potted plant", + "bed", + "dining table", + "toilet", + "tv", + "laptop", + "mouse", + "remote", + 
"keyboard", + "cell phone", + "microwave", + "oven", + "toaster", + "sink", + "refrigerator", + "book", + "clock", + "vase", + "scissors", + "teddy bear", + "hair drier", + "toothbrush", +) diff --git a/yolox/data/datasets/datasets_wrapper.py b/yolox/data/datasets/datasets_wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..056ca85a3c9a575aa1edea64ed54bd7d68de7f89 --- /dev/null +++ b/yolox/data/datasets/datasets_wrapper.py @@ -0,0 +1,126 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +import bisect +from functools import wraps + +from torch.utils.data.dataset import ConcatDataset as torchConcatDataset +from torch.utils.data.dataset import Dataset as torchDataset + + +class ConcatDataset(torchConcatDataset): + def __init__(self, datasets): + super(ConcatDataset, self).__init__(datasets) + if hasattr(self.datasets[0], "input_dim"): + self._input_dim = self.datasets[0].input_dim + self.input_dim = self.datasets[0].input_dim + + def pull_item(self, idx): + if idx < 0: + if -idx > len(self): + raise ValueError( + "absolute value of index should not exceed dataset length" + ) + idx = len(self) + idx + dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx) + if dataset_idx == 0: + sample_idx = idx + else: + sample_idx = idx - self.cumulative_sizes[dataset_idx - 1] + return self.datasets[dataset_idx].pull_item(sample_idx) + + +class MixConcatDataset(torchConcatDataset): + def __init__(self, datasets): + super(MixConcatDataset, self).__init__(datasets) + if hasattr(self.datasets[0], "input_dim"): + self._input_dim = self.datasets[0].input_dim + self.input_dim = self.datasets[0].input_dim + + def __getitem__(self, index): + + if not isinstance(index, int): + idx = index[1] + if idx < 0: + if -idx > len(self): + raise ValueError( + "absolute value of index should not exceed dataset length" + ) + idx = len(self) + idx + dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx) + if dataset_idx == 0: + sample_idx = idx + else: + 
sample_idx = idx - self.cumulative_sizes[dataset_idx - 1] + if not isinstance(index, int): + index = (index[0], sample_idx, index[2]) + + return self.datasets[dataset_idx][index] + + +class Dataset(torchDataset): + """ This class is a subclass of the base :class:`torch.utils.data.Dataset`, + that enables on the fly resizing of the ``input_dim``. + + Args: + input_dimension (tuple): (width,height) tuple with default dimensions of the network + """ + + def __init__(self, input_dimension, mosaic=True): + super().__init__() + self.__input_dim = input_dimension[:2] + self._mosaic = mosaic + + @property + def input_dim(self): + """ + Dimension that can be used by transforms to set the correct image size, etc. + This allows transforms to have a single source of truth + for the input dimension of the network. + + Return: + list: Tuple containing the current width,height + """ + if hasattr(self, "_input_dim"): + return self._input_dim + return self.__input_dim + + @staticmethod + def resize_getitem(getitem_fn): + """ + Decorator method that needs to be used around the ``__getitem__`` method. |br| + This decorator enables the on the fly resizing of + the ``input_dim`` with our :class:`~lightnet.data.DataLoader` class. + + Example: + >>> class CustomSet(ln.data.Dataset): + ... def __len__(self): + ... return 10 + ... @ln.data.Dataset.resize_getitem + ... def __getitem__(self, index): + ... # Should return (image, anno) but here we return input_dim + ... 
return self.input_dim + >>> data = CustomSet((200,200)) + >>> data[0] + (200, 200) + >>> data[(480,320), 0] + (480, 320) + """ + + @wraps(getitem_fn) + def wrapper(self, index): + if not isinstance(index, int): + has_dim = True + self._input_dim = index[0] + self._mosaic = index[2] + index = index[1] + else: + has_dim = False + + ret_val = getitem_fn(self, index) + + if has_dim: + del self._input_dim + + return ret_val + + return wrapper diff --git a/yolox/data/datasets/mosaicdetection.py b/yolox/data/datasets/mosaicdetection.py new file mode 100644 index 0000000000000000000000000000000000000000..778f6483b7cedcacc6c26e87f1e85044849b31f9 --- /dev/null +++ b/yolox/data/datasets/mosaicdetection.py @@ -0,0 +1,195 @@ +import random + +import cv2 +import numpy as np + +from yolox.utils import adjust_box_anns + +from ..data_augment import box_candidates, random_perspective +from .datasets_wrapper import Dataset + + +class MosaicDetection(Dataset): + """Detection dataset wrapper that performs mixup for normal dataset. + + Parameters + ---------- + dataset : Pytorch Dataset + Gluon dataset object. + *args : list + Additional arguments for mixup random sampler. 
+ """ + + def __init__( + self, dataset, img_size, mosaic=True, preproc=None, + degrees=10.0, translate=0.1, scale=(0.5, 1.5), mscale=(0.5, 1.5), + shear=2.0, perspective=0.0, enable_mixup=True, *args + ): + super().__init__(img_size, mosaic=mosaic) + self._dataset = dataset + self.preproc = preproc + self.degrees = degrees + self.translate = translate + self.scale = scale + self.shear = shear + self.perspective = perspective + self.mixup_scale = mscale + self._mosaic = mosaic + self.enable_mixup = enable_mixup + + def __len__(self): + return len(self._dataset) + + @Dataset.resize_getitem + def __getitem__(self, idx): + if self._mosaic: + labels4 = [] + s = self._dataset.input_dim[0] + # yc, xc = s, s # mosaic center x, y + yc = int(random.uniform(0.5 * s, 1.5 * s)) + xc = int(random.uniform(0.5 * s, 1.5 * s)) + + # 3 additional image indices + indices = [idx] + [random.randint(0, len(self._dataset) - 1) for _ in range(3)] + + for i, index in enumerate(indices): + img, _labels, _, _ = self._dataset.pull_item(index) + h0, w0 = img.shape[:2] # orig hw + r = 1.0 * s / max(h0, w0) # resize image to img_size + interp = cv2.INTER_LINEAR + img = cv2.resize(img, (int(w0 * r), int(h0 * r)), interpolation=interp) + (h, w) = img.shape[:2] + + if i == 0: # top left + # base image with 4 tiles + img4 = np.full((s * 2, s * 2, img.shape[2]), 114, dtype=np.uint8) + # xmin, ymin, xmax, ymax (large image) + x1a, y1a, x2a, y2a = (max(xc - w, 0), max(yc - h, 0), xc, yc,) + # xmin, ymin, xmax, ymax (small image) + x1b, y1b, x2b, y2b = (w - (x2a - x1a), h - (y2a - y1a), w, h,) + elif i == 1: # top right + x1a, y1a, x2a, y2a = xc, max(yc - h, 0), min(xc + w, s * 2), yc + x1b, y1b, x2b, y2b = 0, h - (y2a - y1a), min(w, x2a - x1a), h + elif i == 2: # bottom left + x1a, y1a, x2a, y2a = max(xc - w, 0), yc, xc, min(s * 2, yc + h) + x1b, y1b, x2b, y2b = w - (x2a - x1a), 0, w, min(y2a - y1a, h) + elif i == 3: # bottom right + x1a, y1a, x2a, y2a = xc, yc, min(xc + w, s * 2), min(s * 2, yc + h) + 
x1b, y1b, x2b, y2b = 0, 0, min(w, x2a - x1a), min(y2a - y1a, h) + + img4[y1a:y2a, x1a:x2a] = img[y1b:y2b, x1b:x2b] # img4[ymin:ymax, xmin:xmax] + padw = x1a - x1b + padh = y1a - y1b + + labels = _labels.copy() # [[xmin, ymin, xmax, ymax, label_ind], ... ] + if _labels.size > 0: # Normalized xywh to pixel xyxy format + labels[:, 0] = r * _labels[:, 0] + padw + labels[:, 1] = r * _labels[:, 1] + padh + labels[:, 2] = r * _labels[:, 2] + padw + labels[:, 3] = r * _labels[:, 3] + padh + labels4.append(labels) + + if len(labels4): + labels4 = np.concatenate(labels4, 0) + np.clip(labels4[:, :4], 0, 2 * s, out=labels4[:, :4]) # use with random_affine + img4, labels4 = random_perspective( + img4, + labels4, + degrees=self.degrees, + translate=self.translate, + scale=self.scale, + shear=self.shear, + perspective=self.perspective, + border=[-s // 2, -s // 2], + ) # border to remove + + # ----------------------------------------------------------------- + # CopyPaste: https://arxiv.org/abs/2012.07177 + # ----------------------------------------------------------------- + if self.enable_mixup and not len(labels4) == 0: + img4, labels4 = self.mixup(img4, labels4, self.input_dim) + mix_img, padded_labels = self.preproc(img4, labels4, self.input_dim) + img_info = (mix_img.shape[1], mix_img.shape[0]) + + return mix_img, padded_labels, img_info, int(idx) + + else: + self._dataset._input_dim = self.input_dim + img, label, img_info, idx = self._dataset.pull_item(idx) + img, label = self.preproc(img, label, self.input_dim) + return img, label, img_info, int(idx) + + def mixup(self, origin_img, origin_labels, input_dim): + # jit_factor = random.uniform(0.8, 1.2) + jit_factor = random.uniform(*self.mixup_scale) + FLIP = random.uniform(0, 1) > 0.5 + cp_labels = [] + while len(cp_labels) == 0: + cp_index = random.randint(0, self.__len__() - 1) + id_ = self._dataset.ids[cp_index] + anno_ids = self._dataset.coco.getAnnIds(imgIds=[int(id_)], iscrowd=False) + cp_labels = 
self._dataset.coco.loadAnns(anno_ids) + img, cp_labels, _, _ = self._dataset.pull_item(cp_index) + + if len(img.shape) == 3: + cp_img = np.ones((input_dim[0], input_dim[1], 3)) * 114.0 + else: + cp_img = np.ones(input_dim) * 114.0 + cp_scale_ratio = input_dim[0] / max(img.shape[0], img.shape[1]) + resized_img = cv2.resize( + img, + (int(img.shape[1] * cp_scale_ratio), int(img.shape[0] * cp_scale_ratio)), + interpolation=cv2.INTER_LINEAR, + ).astype(np.float32) + cp_img[ + : int(img.shape[0] * cp_scale_ratio), : int(img.shape[1] * cp_scale_ratio) + ] = resized_img + cp_img = cv2.resize( + cp_img, + (int(cp_img.shape[1] * jit_factor), int(cp_img.shape[0] * jit_factor)), + ) + cp_scale_ratio *= jit_factor + if FLIP: + cp_img = cp_img[:, ::-1, :] + + origin_h, origin_w = cp_img.shape[:2] + target_h, target_w = origin_img.shape[:2] + padded_img = np.zeros( + (max(origin_h, target_h), max(origin_w, target_w), 3) + ).astype(np.uint8) + padded_img[:origin_h, :origin_w] = cp_img + + x_offset, y_offset = 0, 0 + if padded_img.shape[0] > target_h: + y_offset = random.randint(0, padded_img.shape[0] - target_h - 1) + if padded_img.shape[1] > target_w: + x_offset = random.randint(0, padded_img.shape[1] - target_w - 1) + padded_cropped_img = padded_img[ + y_offset: y_offset + target_h, x_offset: x_offset + target_w + ] + + cp_bboxes_origin_np = adjust_box_anns( + cp_labels[:, :4], cp_scale_ratio, 0, 0, origin_w, origin_h + ) + if FLIP: + cp_bboxes_origin_np[:, 0::2] = ( + origin_w - cp_bboxes_origin_np[:, 0::2][:, ::-1] + ) + cp_bboxes_transformed_np = cp_bboxes_origin_np.copy() + cp_bboxes_transformed_np[:, 0::2] = np.clip( + cp_bboxes_transformed_np[:, 0::2] - x_offset, 0, target_w + ) + cp_bboxes_transformed_np[:, 1::2] = np.clip( + cp_bboxes_transformed_np[:, 1::2] - y_offset, 0, target_h + ) + keep_list = box_candidates(cp_bboxes_origin_np.T, cp_bboxes_transformed_np.T, 5) + + if keep_list.sum() >= 1.0: + cls_labels = cp_labels[keep_list, 4:5] + box_labels = 
cp_bboxes_transformed_np[keep_list] + labels = np.hstack((box_labels, cls_labels)) + origin_labels = np.vstack((origin_labels, labels)) + origin_img = origin_img.astype(np.float32) + origin_img = 0.5 * origin_img + 0.5 * padded_cropped_img.astype(np.float32) + + return origin_img.astype(np.uint8), origin_labels diff --git a/yolox/data/datasets/voc.py b/yolox/data/datasets/voc.py new file mode 100644 index 0000000000000000000000000000000000000000..8b8edb92701510c46f04382cf1a8a263179d740b --- /dev/null +++ b/yolox/data/datasets/voc.py @@ -0,0 +1,313 @@ +"""VOC Dataset Classes + +Original author: Francisco Massa +https://github.com/fmassa/vision/blob/voc_dataset/torchvision/datasets/voc.py + +Updated by: Ellis Brown, Max deGroot +""" + +import os +import os.path +import pickle +import xml.etree.ElementTree as ET + +import cv2 +import numpy as np + +from yolox.evalutors.voc_eval import voc_eval + +from .datasets_wrapper import Dataset +from .voc_classes import VOC_CLASSES + +# for making bounding boxes pretty +COLORS = ( + (255, 0, 0, 128), + (0, 255, 0, 128), + (0, 0, 255, 128), + (0, 255, 255, 128), + (255, 0, 255, 128), + (255, 255, 0, 128), +) + + +class AnnotationTransform(object): + + """Transforms a VOC annotation into a Tensor of bbox coords and label index + Initilized with a dictionary lookup of classnames to indexes + + Arguments: + class_to_ind (dict, optional): dictionary lookup of classnames -> indexes + (default: alphabetic indexing of VOC's 20 classes) + keep_difficult (bool, optional): keep difficult instances or not + (default: False) + height (int): height + width (int): width + """ + + def __init__(self, class_to_ind=None, keep_difficult=True): + self.class_to_ind = class_to_ind or dict(zip(VOC_CLASSES, range(len(VOC_CLASSES)))) + self.keep_difficult = keep_difficult + + def __call__(self, target): + """ + Arguments: + target (annotation) : the target annotation to be made usable + will be an ET.Element + Returns: + a list containing lists of 
bounding boxes [bbox coords, class name] + """ + res = np.empty((0, 5)) + for obj in target.iter("object"): + difficult = int(obj.find("difficult").text) == 1 + if not self.keep_difficult and difficult: + continue + name = obj.find("name").text.lower().strip() + bbox = obj.find("bndbox") + + pts = ["xmin", "ymin", "xmax", "ymax"] + bndbox = [] + for i, pt in enumerate(pts): + cur_pt = int(bbox.find(pt).text) - 1 + # scale height or width + # cur_pt = cur_pt / width if i % 2 == 0 else cur_pt / height + bndbox.append(cur_pt) + label_idx = self.class_to_ind[name] + bndbox.append(label_idx) + res = np.vstack((res, bndbox)) # [xmin, ymin, xmax, ymax, label_ind] + # img_id = target.find('filename').text[:-4] + + return res # [[xmin, ymin, xmax, ymax, label_ind], ... ] + + +class VOCDetection(Dataset): + + """ + VOC Detection Dataset Object + + input is image, target is annotation + + Args: + root (string): filepath to VOCdevkit folder. + image_set (string): imageset to use (eg. 'train', 'val', 'test') + transform (callable, optional): transformation to perform on the + input image + target_transform (callable, optional): transformation to perform on the + target `annotation` + (eg: take in caption string, return tensor of word indices) + dataset_name (string, optional): which dataset to load + (default: 'VOC2007') + """ + + def __init__( + self, + root, + image_sets, + preproc=None, + target_transform=AnnotationTransform(), + input_dim=(416, 416), + dataset_name="VOC0712", + ): + super().__init__(input_dim) + self.root = root + self.image_set = image_sets + self.preproc = preproc + self.target_transform = target_transform + self.name = dataset_name + self._annopath = os.path.join("%s", "Annotations", "%s.xml") + self._imgpath = os.path.join("%s", "JPEGImages", "%s.jpg") + self._classes = VOC_CLASSES + self.ids = list() + for (year, name) in image_sets: + self._year = year + rootpath = os.path.join(self.root, "VOC" + year) + for line in open( + os.path.join(rootpath, 
"ImageSets", "Main", name + ".txt") + ): + self.ids.append((rootpath, line.strip())) + + @Dataset.resize_getitem + def __getitem__(self, index): + img_id = self.ids[index] + target = ET.parse(self._annopath % img_id).getroot() + img = cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR) + # img = Image.open(self._imgpath % img_id).convert('RGB') + + height, width, _ = img.shape + + if self.target_transform is not None: + target = self.target_transform(target) + + if self.preproc is not None: + img, target = self.preproc(img, target, self.input_dim) + # print(img.size()) + + img_info = (width, height) + + return img, target, img_info, img_id + + def __len__(self): + return len(self.ids) + + def pull_image(self, index): + """Returns the original image object at index in PIL form + + Note: not using self.__getitem__(), as any transformations passed in + could mess up this functionality. + + Argument: + index (int): index of img to show + Return: + PIL img + """ + img_id = self.ids[index] + return cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR) + + def pull_anno(self, index): + """Returns the original annotation of image at index + + Note: not using self.__getitem__(), as any transformations passed in + could mess up this functionality. + + Argument: + index (int): index of img to get annotation of + Return: + list: [img_id, [(label, bbox coords),...]] + eg: ('001718', [('dog', (96, 13, 438, 332))]) + """ + img_id = self.ids[index] + anno = ET.parse(self._annopath % img_id).getroot() + gt = self.target_transform(anno, 1, 1) + return img_id[1], gt + + def pull_item(self, index): + """Returns the original image and target at an index for mixup + + Note: not using self.__getitem__(), as any transformations passed in + could mess up this functionality. 
+ + Argument: + index (int): index of img to show + Return: + img, target + """ + img_id = self.ids[index] + target = ET.parse(self._annopath % img_id).getroot() + img = cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR) + + height, width, _ = img.shape + + img_info = (width, height) + if self.target_transform is not None: + target = self.target_transform(target) + + return img, target, img_info, img_id + + def evaluate_detections(self, all_boxes, output_dir=None): + """ + all_boxes is a list of length number-of-classes. + Each list element is a list of length number-of-images. + Each of those list elements is either an empty list [] + or a numpy array of detection. + + all_boxes[class][image] = [] or np.array of shape #dets x 5 + """ + self._write_voc_results_file(all_boxes) + IouTh = np.linspace(0.5, 0.95, np.round((0.95 - 0.5) / 0.05) + 1, endpoint=True) + mAPs = [] + for iou in IouTh: + mAP = self._do_python_eval(output_dir, iou) + mAPs.append(mAP) + + print("--------------------------------------------------------------") + print("map_5095:", np.mean(mAPs)) + print("map_50:", mAPs[0]) + print("--------------------------------------------------------------") + return np.mean(mAPs), mAPs[0] + + def _get_voc_results_file_template(self): + filename = "comp4_det_test" + "_{:s}.txt" + filedir = os.path.join(self.root, "results", "VOC" + self._year, "Main") + if not os.path.exists(filedir): + os.makedirs(filedir) + path = os.path.join(filedir, filename) + return path + + def _write_voc_results_file(self, all_boxes): + for cls_ind, cls in enumerate(VOC_CLASSES): + cls_ind = cls_ind + if cls == "__background__": + continue + print("Writing {} VOC results file".format(cls)) + filename = self._get_voc_results_file_template().format(cls) + with open(filename, "wt") as f: + for im_ind, index in enumerate(self.ids): + index = index[1] + dets = all_boxes[cls_ind][im_ind] + if dets == []: + continue + for k in range(dets.shape[0]): + f.write( + "{:s} {:.3f} {:.1f} {:.1f} 
{:.1f} {:.1f}\n".format( + index, + dets[k, -1], + dets[k, 0] + 1, + dets[k, 1] + 1, + dets[k, 2] + 1, + dets[k, 3] + 1, + ) + ) + + def _do_python_eval(self, output_dir="output", iou=0.5): + rootpath = os.path.join(self.root, "VOC" + self._year) + name = self.image_set[0][1] + annopath = os.path.join(rootpath, "Annotations", "{:s}.xml") + imagesetfile = os.path.join(rootpath, "ImageSets", "Main", name + ".txt") + cachedir = os.path.join( + self.root, "annotations_cache", "VOC" + self._year, name + ) + if not os.path.exists(cachedir): + os.makedirs(cachedir) + aps = [] + # The PASCAL VOC metric changed in 2010 + use_07_metric = True if int(self._year) < 2010 else False + print("VOC07 metric? " + ("Yes" if use_07_metric else "No")) + if output_dir is not None and not os.path.isdir(output_dir): + os.mkdir(output_dir) + for i, cls in enumerate(VOC_CLASSES): + + if cls == "__background__": + continue + + filename = self._get_voc_results_file_template().format(cls) + rec, prec, ap = voc_eval( + filename, + annopath, + imagesetfile, + cls, + cachedir, + ovthresh=iou, + use_07_metric=use_07_metric, + ) + aps += [ap] + if iou == 0.5: + print("AP for {} = {:.4f}".format(cls, ap)) + if output_dir is not None: + with open(os.path.join(output_dir, cls + "_pr.pkl"), "wb") as f: + pickle.dump({"rec": rec, "prec": prec, "ap": ap}, f) + if iou == 0.5: + print("Mean AP = {:.4f}".format(np.mean(aps))) + print("~~~~~~~~") + print("Results:") + for ap in aps: + print("{:.3f}".format(ap)) + print("{:.3f}".format(np.mean(aps))) + print("~~~~~~~~") + print("") + print("--------------------------------------------------------------") + print("Results computed with the **unofficial** Python eval code.") + print("Results should be very close to the official MATLAB eval code.") + print("Recompute with `./tools/reval.py --matlab ...` for your paper.") + print("-- Thanks, The Management") + print("--------------------------------------------------------------") + + return np.mean(aps) diff 
--git a/yolox/data/datasets/voc_classes.py b/yolox/data/datasets/voc_classes.py new file mode 100644 index 0000000000000000000000000000000000000000..1fe640af60045d4f03397ccb5c645ce656d3d698 --- /dev/null +++ b/yolox/data/datasets/voc_classes.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- + + +# VOC_CLASSES = ( '__background__', # always index 0 +VOC_CLASSES = ( + "aeroplane", + "bicycle", + "bird", + "boat", + "bottle", + "bus", + "car", + "cat", + "chair", + "cow", + "diningtable", + "dog", + "horse", + "motorbike", + "person", + "pottedplant", + "sheep", + "sofa", + "train", + "tvmonitor", +) diff --git a/yolox/data/samplers.py b/yolox/data/samplers.py new file mode 100644 index 0000000000000000000000000000000000000000..6a7d521a09345b0a3645fee6e6d9b0fda73a904a --- /dev/null +++ b/yolox/data/samplers.py @@ -0,0 +1,94 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- + +import itertools +from typing import Optional + +import torch +import torch.distributed as dist +from torch.utils.data.sampler import BatchSampler as torchBatchSampler +from torch.utils.data.sampler import Sampler + + +class YoloBatchSampler(torchBatchSampler): + """ + This batch sampler will generate mini-batches of (dim, index) tuples from another sampler. + It works just like the :class:`torch.utils.data.sampler.BatchSampler`, + but it will prepend a dimension, whilst ensuring it stays the same across one mini-batch. + """ + + def __init__(self, *args, input_dimension=None, mosaic=True, **kwargs): + super().__init__(*args, **kwargs) + self.input_dim = input_dimension + self.new_input_dim = None + self.mosaic = mosaic + + def __iter__(self): + self.__set_input_dim() + for batch in super().__iter__(): + yield [(self.input_dim, idx, self.mosaic) for idx in batch] + self.__set_input_dim() + + def __set_input_dim(self): + """ This function randomly changes the the input dimension of the dataset. 
""" + if self.new_input_dim is not None: + self.input_dim = (self.new_input_dim[0], self.new_input_dim[1]) + self.new_input_dim = None + + +class InfiniteSampler(Sampler): + """ + In training, we only care about the "infinite stream" of training data. + So this sampler produces an infinite stream of indices and + all workers cooperate to correctly shuffle the indices and sample different indices. + The samplers in each worker effectively produces `indices[worker_id::num_workers]` + where `indices` is an infinite stream of indices consisting of + `shuffle(range(size)) + shuffle(range(size)) + ...` (if shuffle is True) + or `range(size) + range(size) + ...` (if shuffle is False) + """ + + def __init__( + self, + size: int, + shuffle: bool = True, + seed: Optional[int] = 0, + rank=0, + world_size=1, + ): + """ + Args: + size (int): the total number of data of the underlying dataset to sample from + shuffle (bool): whether to shuffle the indices or not + seed (int): the initial seed of the shuffle. Must be the same + across all workers. If None, will use a random seed shared + among workers (require synchronization among all workers). 
+ """ + self._size = size + assert size > 0 + self._shuffle = shuffle + self._seed = int(seed) + + if dist.is_available() and dist.is_initialized(): + self._rank = dist.get_rank() + self._world_size = dist.get_world_size() + else: + self._rank = rank + self._world_size = world_size + + def __iter__(self): + start = self._rank + yield from itertools.islice( + self._infinite_indices(), start, None, self._world_size + ) + + def _infinite_indices(self): + g = torch.Generator() + g.manual_seed(self._seed) + while True: + if self._shuffle: + yield from torch.randperm(self._size, generator=g) + else: + yield from torch.arange(self._size) + + def __len__(self): + return self._size // self._world_size diff --git a/yolox/evalutors/__init__.py b/yolox/evalutors/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..adb1a3deff2ebbc57fef2725607485ae2a7d6884 --- /dev/null +++ b/yolox/evalutors/__init__.py @@ -0,0 +1,4 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- + +from .coco_evaluator import COCOEvaluator diff --git a/yolox/evalutors/coco_evaluator.py b/yolox/evalutors/coco_evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..2551bc3a1695c96aaa725c51352715b8561d4415 --- /dev/null +++ b/yolox/evalutors/coco_evaluator.py @@ -0,0 +1,217 @@ +import contextlib +import io +import itertools +import json +import tempfile +import time +from loguru import logger +from tqdm import tqdm + +import torch + +from yolox.utils import ( + gather, + is_main_process, + postprocess, + synchronize, + time_synchronized, + xyxy2xywh +) + + +class COCOEvaluator: + """ + COCO AP Evaluation class. All the data in the val2017 dataset are processed + and evaluated by COCO API. + """ + + def __init__( + self, dataloader, img_size, confthre, nmsthre, num_classes, testdev=False + ): + """ + Args: + dataloader (Dataloader): evaluate dataloader. + img_size (int): image size after preprocess. 
images are resized + to squares whose shape is (img_size, img_size). + confthre (float): confidence threshold ranging from 0 to 1, which + is defined in the config file. + nmsthre (float): IoU threshold of non-max supression ranging from 0 to 1. + """ + self.dataloader = dataloader + self.img_size = img_size + self.confthre = confthre + self.nmsthre = nmsthre + self.num_classes = num_classes + self.testdev = testdev + + def evaluate( + self, + model, + distributed=False, + half=False, + trt_file=None, + decoder=None, + test_size=None, + ): + """ + COCO average precision (AP) Evaluation. Iterate inference on the test dataset + and the results are evaluated by COCO API. + + NOTE: This function will change training mode to False, please save states if needed. + + Args: + model : model to evaluate. + + Returns: + ap50_95 (float) : COCO AP of IoU=50:95 + ap50 (float) : COCO AP of IoU=50 + summary (sr): summary info of evaluation. + """ + # TODO half to amp_test + tensor_type = torch.cuda.HalfTensor if half else torch.cuda.FloatTensor + model = model.eval() + if half: + model = model.half() + ids = [] + data_list = [] + progress_bar = tqdm if is_main_process() else iter + + inference_time = 0 + nms_time = 0 + n_samples = len(self.dataloader) - 1 + + if trt_file is not None: + from torch2trt import TRTModule + + model_trt = TRTModule() + model_trt.load_state_dict(torch.load(trt_file)) + + x = torch.ones(1, 3, test_size[0], test_size[1]).cuda() + dump_out = model(x) + model = model_trt + + for cur_iter, (imgs, _, info_imgs, ids) in enumerate( + progress_bar(self.dataloader) + ): + with torch.no_grad(): + imgs = imgs.type(tensor_type) + + # skip the the last iters since batchsize might be not enough for batch inference + is_time_record = cur_iter < len(self.dataloader) - 1 + if is_time_record: + start = time.time() + + outputs = model(imgs) + if decoder is not None: + outputs = decoder(outputs, dtype=outputs.type()) + + if is_time_record: + infer_end = time_synchronized() + 
inference_time += infer_end - start + + outputs = postprocess( + outputs, self.num_classes, self.confthre, self.nmsthre + ) + if is_time_record: + nms_end = time_synchronized() + nms_time += nms_end - infer_end + + data_list.extend(self.convert_to_coco_format(outputs, info_imgs, ids)) + + statistics = torch.cuda.FloatTensor([inference_time, nms_time, n_samples]) + if distributed: + data_list = gather(data_list, dst=0) + data_list = list(itertools.chain(*data_list)) + torch.distributed.reduce(statistics, dst=0) + + eval_results = self.evaluate_prediction(data_list, statistics) + synchronize() + return eval_results + + def convert_to_coco_format(self, outputs, info_imgs, ids): + data_list = [] + for (output, img_h, img_w, img_id) in zip( + outputs, info_imgs[0], info_imgs[1], ids + ): + if output is None: + continue + output = output.cpu() + + bboxes = output[:, 0:4] + + # preprocessing: resize + scale = min( + self.img_size[0] / float(img_h), self.img_size[1] / float(img_w) + ) + bboxes /= scale + bboxes = xyxy2xywh(bboxes) + + cls = output[:, 6] + scores = output[:, 4] * output[:, 5] + for ind in range(bboxes.shape[0]): + label = self.dataloader.dataset.class_ids[int(cls[ind])] + pred_data = { + "image_id": int(img_id), + "category_id": label, + "bbox": bboxes[ind].numpy().tolist(), + "score": scores[ind].numpy().item(), + "segmentation": [], + } # COCO json format + data_list.append(pred_data) + return data_list + + def evaluate_prediction(self, data_dict, statistics): + if not is_main_process(): + return 0, 0, None + + logger.info("Evaluate in main process...") + + annType = ["segm", "bbox", "keypoints"] + + inference_time = statistics[0].item() + nms_time = statistics[1].item() + n_samples = statistics[2].item() + + a_infer_time = 1000 * inference_time / (n_samples * self.dataloader.batch_size) + a_nms_time = 1000 * nms_time / (n_samples * self.dataloader.batch_size) + + time_info = ", ".join( + [ + "Average {} time: {:.2f} ms".format(k, v) + for k, v in zip( + 
["forward", "NMS", "inference"], + [a_infer_time, a_nms_time, (a_infer_time + a_nms_time)], + ) + ] + ) + + info = time_info + "\n" + + # Evaluate the Dt (detection) json comparing with the ground truth + if len(data_dict) > 0: + cocoGt = self.dataloader.dataset.coco + # TODO: since pycocotools can't process dict in py36, write data to json file. + if self.testdev: + json.dump(data_dict, open("./yolox_testdev_2017.json", "w")) + cocoDt = cocoGt.loadRes("./yolox_testdev_2017.json") + else: + _, tmp = tempfile.mkstemp() + json.dump(data_dict, open(tmp, "w")) + cocoDt = cocoGt.loadRes(tmp) + try: + from yolox.layers import COCOeval_opt as COCOeval + except ImportError: + from .cocoeval_mr import COCOeval + + logger.warning("Use standard COCOeval.") + + cocoEval = COCOeval(cocoGt, cocoDt, annType[1]) + cocoEval.evaluate() + cocoEval.accumulate() + redirect_string = io.StringIO() + with contextlib.redirect_stdout(redirect_string): + cocoEval.summarize() + info += redirect_string.getvalue() + return cocoEval.stats[0], cocoEval.stats[1], info + else: + return 0, 0, info diff --git a/yolox/evalutors/voc_eval.py b/yolox/evalutors/voc_eval.py new file mode 100644 index 0000000000000000000000000000000000000000..0744fd8e97d0819302142517f0bd864eb9862328 --- /dev/null +++ b/yolox/evalutors/voc_eval.py @@ -0,0 +1,184 @@ +# -------------------------------------------------------- +# Fast/er R-CNN +# Licensed under The MIT License [see LICENSE for details] +# Written by Bharath Hariharan +# -------------------------------------------------------- + +import os +import pickle +import xml.etree.ElementTree as ET + +import numpy as np + + +def parse_rec(filename): + """ Parse a PASCAL VOC xml file """ + tree = ET.parse(filename) + objects = [] + for obj in tree.findall("object"): + obj_struct = {} + obj_struct["name"] = obj.find("name").text + obj_struct["pose"] = obj.find("pose").text + obj_struct["truncated"] = int(obj.find("truncated").text) + obj_struct["difficult"] = 
int(obj.find("difficult").text) + bbox = obj.find("bndbox") + obj_struct["bbox"] = [ + int(bbox.find("xmin").text), + int(bbox.find("ymin").text), + int(bbox.find("xmax").text), + int(bbox.find("ymax").text), + ] + objects.append(obj_struct) + + return objects + + +def voc_ap(rec, prec, use_07_metric=False): + """ ap = voc_ap(rec, prec, [use_07_metric]) + Compute VOC AP given precision and recall. + If use_07_metric is true, uses the + VOC 07 11 point method (default:False). + """ + if use_07_metric: + # 11 point metric + ap = 0.0 + for t in np.arange(0.0, 1.1, 0.1): + if np.sum(rec >= t) == 0: + p = 0 + else: + p = np.max(prec[rec >= t]) + ap = ap + p / 11.0 + else: + # correct AP calculation + # first append sentinel values at the end + mrec = np.concatenate(([0.0], rec, [1.0])) + mpre = np.concatenate(([0.0], prec, [0.0])) + + # compute the precision envelope + for i in range(mpre.size - 1, 0, -1): + mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) + + # to calculate area under PR curve, look for points + # where X axis (recall) changes value + i = np.where(mrec[1:] != mrec[:-1])[0] + + # and sum (\Delta recall) * prec + ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) + return ap + + +def voc_eval( + detpath, + annopath, + imagesetfile, + classname, + cachedir, + ovthresh=0.5, + use_07_metric=False, +): + # first load gt + if not os.path.isdir(cachedir): + os.mkdir(cachedir) + cachefile = os.path.join(cachedir, "annots.pkl") + # read list of images + with open(imagesetfile, "r") as f: + lines = f.readlines() + imagenames = [x.strip() for x in lines] + + if not os.path.isfile(cachefile): + # load annots + recs = {} + for i, imagename in enumerate(imagenames): + recs[imagename] = parse_rec(annopath.format(imagename)) + if i % 100 == 0: + print("Reading annotation for {:d}/{:d}".format(i + 1, len(imagenames))) + # save + print("Saving cached annotations to {:s}".format(cachefile)) + with open(cachefile, "wb") as f: + pickle.dump(recs, f) + else: + # load + with 
open(cachefile, "rb") as f: + recs = pickle.load(f) + + # extract gt objects for this class + class_recs = {} + npos = 0 + for imagename in imagenames: + R = [obj for obj in recs[imagename] if obj["name"] == classname] + bbox = np.array([x["bbox"] for x in R]) + difficult = np.array([x["difficult"] for x in R]).astype(np.bool) + det = [False] * len(R) + npos = npos + sum(~difficult) + class_recs[imagename] = {"bbox": bbox, "difficult": difficult, "det": det} + + # read dets + detfile = detpath.format(classname) + with open(detfile, "r") as f: + lines = f.readlines() + + if len(lines) == 0: + return 0, 0, 0 + + splitlines = [x.strip().split(" ") for x in lines] + image_ids = [x[0] for x in splitlines] + confidence = np.array([float(x[1]) for x in splitlines]) + BB = np.array([[float(z) for z in x[2:]] for x in splitlines]) + + # sort by confidence + sorted_ind = np.argsort(-confidence) + BB = BB[sorted_ind, :] + image_ids = [image_ids[x] for x in sorted_ind] + + # go down dets and mark TPs and FPs + nd = len(image_ids) + tp = np.zeros(nd) + fp = np.zeros(nd) + for d in range(nd): + R = class_recs[image_ids[d]] + bb = BB[d, :].astype(float) + ovmax = -np.inf + BBGT = R["bbox"].astype(float) + + if BBGT.size > 0: + # compute overlaps + # intersection + ixmin = np.maximum(BBGT[:, 0], bb[0]) + iymin = np.maximum(BBGT[:, 1], bb[1]) + ixmax = np.minimum(BBGT[:, 2], bb[2]) + iymax = np.minimum(BBGT[:, 3], bb[3]) + iw = np.maximum(ixmax - ixmin + 1.0, 0.0) + ih = np.maximum(iymax - iymin + 1.0, 0.0) + inters = iw * ih + + # union + uni = ( + (bb[2] - bb[0] + 1.0) * (bb[3] - bb[1] + 1.0) + + (BBGT[:, 2] - BBGT[:, 0] + 1.0) * (BBGT[:, 3] - BBGT[:, 1] + 1.0) + - inters + ) + + overlaps = inters / uni + ovmax = np.max(overlaps) + jmax = np.argmax(overlaps) + + if ovmax > ovthresh: + if not R["difficult"][jmax]: + if not R["det"][jmax]: + tp[d] = 1.0 + R["det"][jmax] = 1 + else: + fp[d] = 1.0 + else: + fp[d] = 1.0 + + # compute precision recall + fp = np.cumsum(fp) + tp = 
np.cumsum(tp) + rec = tp / float(npos) + # avoid divide by zero in case the first detection matches a difficult + # ground truth + prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) + ap = voc_ap(rec, prec, use_07_metric) + + return rec, prec, ap diff --git a/yolox/evalutors/voc_evaluator.py b/yolox/evalutors/voc_evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..8761f2232dc28db04e7f494e93769e3b7cc83f63 --- /dev/null +++ b/yolox/evalutors/voc_evaluator.py @@ -0,0 +1,200 @@ +import sys +import tempfile +import time +from tqdm import tqdm + +import torch + +# TODO check VOC +from yolox.data.dataset.vocdataset import ValTransform +from yolox.utils import get_rank, is_main_process, make_pred_vis, make_vis, synchronize + +# TODO refactor this file in the future. + + +def _accumulate_predictions_from_multiple_gpus(predictions_per_gpu): + all_predictions = dist.scatter_gather(predictions_per_gpu) + if not is_main_process(): + return + # merge the list of dicts + predictions = {} + for p in all_predictions: + predictions.update(p) + # convert a dict where the key is the index in a list + image_ids = list(sorted(predictions.keys())) + if len(image_ids) != image_ids[-1] + 1: + print("num_imgs: ", len(image_ids)) + print("last img_id: ", image_ids[-1]) + print( + "Number of images that were gathered from multiple processes is not " + "a contiguous set. Some images might be missing from the evaluation" + ) + + # convert to a list + predictions = [predictions[i] for i in image_ids] + return predictions + + +class VOCEvaluator: + """ + COCO AP Evaluation class. + All the data in the val2017 dataset are processed \ + and evaluated by COCO API. + """ + + def __init__(self, data_dir, img_size, confthre, nmsthre, vis=False): + """ + Args: + data_dir (str): dataset root directory + img_size (int): image size after preprocess. images are resized \ + to squares whose shape is (img_size, img_size). 
+ confthre (float): + confidence threshold ranging from 0 to 1, \ + which is defined in the config file. + nmsthre (float): + IoU threshold of non-max supression ranging from 0 to 1. + """ + test_sets = [("2007", "test")] + self.dataset = VOCDetection( + root=data_dir, + image_sets=test_sets, + input_dim=img_size, + preproc=ValTransform( + rgb_means=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225) + ), + ) + self.num_images = len(self.dataset) + self.dataloader = torch.utils.data.DataLoader( + self.dataset, batch_size=1, shuffle=False, num_workers=0 + ) + self.img_size = img_size + self.confthre = confthre + self.nmsthre = nmsthre + self.vis = vis + + def evaluate(self, model, distributed=False): + """ + COCO average precision (AP) Evaluation. Iterate inference on the test dataset + and the results are evaluated by COCO API. + Args: + model : model object + Returns: + ap50_95 (float) : calculated COCO AP for IoU=50:95 + ap50 (float) : calculated COCO AP for IoU=50 + """ + if isinstance(model, torch.nn.parallel.DistributedDataParallel): + model = model.module + model.eval() + cuda = torch.cuda.is_available() + Tensor = torch.cuda.FloatTensor if cuda else torch.FloatTensor + + ids = [] + data_dict = [] + dataiterator = iter(self.dataloader) + img_num = 0 + indices = list(range(self.num_images)) + dis_indices = indices[get_rank() :: distributed_util.get_world_size()] + progress_bar = tqdm if distributed_util.is_main_process() else iter + num_classes = 20 + predictions = {} + + if is_main_process(): + inference_time = 0 + nms_time = 0 + n_samples = len(dis_indices) + + for i in progress_bar(dis_indices): + img, _, info_img, id_ = self.dataset[i] # load a batch + info_img = [float(info) for info in info_img] + ids.append(id_) + with torch.no_grad(): + img = Variable(img.type(Tensor).unsqueeze(0)) + + if is_main_process() and i > 9: + start = time.time() + + if self.vis: + outputs, fuse_weights, fused_f = model(img) + else: + outputs = model(img) + + if is_main_process() 
and i > 9: + infer_end = time.time() + inference_time += infer_end - start + + outputs = postprocess(outputs, 20, self.confthre, self.nmsthre) + + if is_main_process() and i > 9: + nms_end = time.time() + nms_time += nms_end - infer_end + + if outputs[0] is None: + predictions[i] = (None, None, None) + continue + outputs = outputs[0].cpu().data + + bboxes = outputs[:, 0:4] + bboxes[:, 0::2] *= info_img[0] / self.img_size[0] + bboxes[:, 1::2] *= info_img[1] / self.img_size[1] + cls = outputs[:, 6] + scores = outputs[:, 4] * outputs[:, 5] + predictions[i] = (bboxes, cls, scores) + + if self.vis: + o_img, _, _, _ = self.dataset.pull_item(i) + make_vis("VOC", i, o_img, fuse_weights, fused_f) + class_names = self.dataset._classes + + bbox = bboxes.clone() + bbox[:, 2] = bbox[:, 2] - bbox[:, 0] + bbox[:, 3] = bbox[:, 3] - bbox[:, 1] + + make_pred_vis("VOC", i, o_img, class_names, bbox, cls, scores) + + if is_main_process(): + o_img, _, _, _ = self.dataset.pull_item(i) + class_names = self.dataset._classes + bbox = bboxes.clone() + bbox[:, 2] = bbox[:, 2] - bbox[:, 0] + bbox[:, 3] = bbox[:, 3] - bbox[:, 1] + make_pred_vis("VOC", i, o_img, class_names, bbox, cls, scores) + + synchronize() + predictions = _accumulate_predictions_from_multiple_gpus(predictions) + if not is_main_process(): + return 0, 0 + + print("Main process Evaluating...") + + a_infer_time = 1000 * inference_time / (n_samples - 10) + a_nms_time = 1000 * nms_time / (n_samples - 10) + + print( + "Average forward time: %.2f ms, Average NMS time: %.2f ms, Average inference time: %.2f ms" + % (a_infer_time, a_nms_time, (a_infer_time + a_nms_time)) + ) + + all_boxes = [[[] for _ in range(self.num_images)] for _ in range(num_classes)] + for img_num in range(self.num_images): + bboxes, cls, scores = predictions[img_num] + if bboxes is None: + for j in range(num_classes): + all_boxes[j][img_num] = np.empty([0, 5], dtype=np.float32) + continue + for j in range(num_classes): + mask_c = cls == j + if sum(mask_c) == 0: 
+ all_boxes[j][img_num] = np.empty([0, 5], dtype=np.float32) + continue + + c_dets = torch.cat((bboxes, scores.unsqueeze(1)), dim=1) + all_boxes[j][img_num] = c_dets[mask_c].numpy() + + sys.stdout.write( + "im_eval: {:d}/{:d} \r".format(img_num + 1, self.num_images) + ) + sys.stdout.flush() + + with tempfile.TemporaryDirectory() as tempdir: + mAP50, mAP70 = self.dataset.evaluate_detections(all_boxes, tempdir) + return mAP50, mAP70 diff --git a/yolox/exp/__init__.py b/yolox/exp/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1988be2fd49cde72150eb60778d134ba1e7ef9ae --- /dev/null +++ b/yolox/exp/__init__.py @@ -0,0 +1,7 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- + + +from .base_exp import BaseExp +from .build import get_exp +from .yolox_base import Exp diff --git a/yolox/exp/base_exp.py b/yolox/exp/base_exp.py new file mode 100644 index 0000000000000000000000000000000000000000..501067419e49bdde05febcf329eef3983c920d0e --- /dev/null +++ b/yolox/exp/base_exp.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) 2014-2021 Megvii Inc. All rights reserved. +import ast +import pprint +from abc import ABCMeta, abstractmethod +from typing import Dict +from tabulate import tabulate + +import torch +from torch.nn import Module + +from yolox.utils import LRScheduler + + +class BaseExp(metaclass=ABCMeta): + """Basic class for any experiment. 
+ """ + + def __init__(self): + self.seed = None + self.output_dir = "/data/YOLOX_outputs" + self.print_interval = 100 + self.eval_interval = 10 + + @abstractmethod + def get_model(self) -> Module: + pass + + @abstractmethod + def get_data_loader( + self, batch_size: int, is_distributed: bool + ) -> Dict[str, torch.utils.data.DataLoader]: + pass + + @abstractmethod + def get_optimizer(self, batch_size: int) -> torch.optim.Optimizer: + pass + + @abstractmethod + def get_lr_scheduler( + self, lr: float, iters_per_epoch: int, **kwargs + ) -> LRScheduler: + pass + + @abstractmethod + def get_evaluator(self): + pass + + @abstractmethod + def eval(self, model, evaluator, weights): + pass + + def __repr__(self): + table_header = ["keys", "values"] + exp_table = [ + (str(k), pprint.pformat(v)) for k, v in vars(self).items() if not k.startswith("_") + ] + return tabulate(exp_table, headers=table_header, tablefmt="fancy_grid") + + def merge(self, cfg_list): + assert len(cfg_list) % 2 == 0 + for k, v in zip(cfg_list[0::2], cfg_list[1::2]): + # only update value with same key + if hasattr(self, k): + src_value = getattr(self, k) + src_type = type(src_value) + if src_value is not None and src_type != type(v): + try: + v = src_type(v) + except Exception: + v = ast.literal_eval(v) + setattr(self, k, v) diff --git a/yolox/exp/build.py b/yolox/exp/build.py new file mode 100644 index 0000000000000000000000000000000000000000..db851fa48b051b7a815e29e047317842ead6b900 --- /dev/null +++ b/yolox/exp/build.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- + +import importlib +import os +import sys + + +def get_exp_by_file(exp_file): + try: + sys.path.append(os.path.dirname(exp_file)) + current_exp = importlib.import_module(os.path.basename(exp_file).split(".")[0]) + exp = current_exp.Exp() + except Exception: + raise ImportError("{} doesn't contains class named 'Exp'".format(exp_file)) + return exp + + +def get_exp_by_name(exp_name): + import yolox + yolox_path = 
os.path.dirname(os.path.dirname(yolox.__file__)) + filedict = { + "yolox-s": "yolox_s.py", + "yolox-m": "yolox_l.py", + "yolox-l": "yolox_l.py", + "yolox-x": "yolox_x.py", + "yolox-nano": "nano.py", + "yolov3": "yolov3.py", + } + filename = filedict[exp_name] + exp_path = os.path.join(yolox_path, "exps", "base", filename) + return get_exp_by_file(exp_path) + + +def get_exp(exp_file, exp_name): + """ + get Exp object by file or name. If exp_file and exp_name + are both provided, get Exp by exp_file. + + Args: + exp_file (str): file path of experiment. + exp_name (str): name of experiment. "yolo-s", + """ + assert exp_file is not None or exp_name is not None, "plz provide exp file or exp name." + if exp_file is not None: + return get_exp_by_file(exp_file) + else: + return get_exp_by_name(exp_name) diff --git a/yolox/exp/yolox_base.py b/yolox/exp/yolox_base.py new file mode 100644 index 0000000000000000000000000000000000000000..3cc48466528309edbd5ea1f95380c5d80d0d981f --- /dev/null +++ b/yolox/exp/yolox_base.py @@ -0,0 +1,251 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 
+ +import os +import random + +import torch +import torch.distributed as dist +import torch.nn as nn + +from .base_exp import BaseExp + + +class Exp(BaseExp): + + def __init__(self): + super().__init__() + + # ---------------- model config ---------------- # + self.num_classes = 80 + self.depth = 1.00 + self.width = 1.00 + + # ---------------- dataloader config ---------------- # + self.data_num_workers = 6 + self.input_size = (640, 640) + self.random_size = (14, 26) + self.train_ann = "instances_train2017.json" + self.val_ann = "instances_val2017.json" + + # --------------- transform config ----------------- # + self.degrees = 10.0 + self.translate = 0.1 + self.scale = (0.1, 2) + self.mscale = (0.8, 1.6) + self.shear = 2.0 + self.perspective = 0.0 + self.enable_mixup = True + + # -------------- training config --------------------- # + self.warmup_epochs = 5 + self.max_epoch = 300 + self.warmup_lr = 0 + self.basic_lr_per_img = 0.01 / 64.0 + self.scheduler = "yoloxwarmcos" + self.no_aug_epochs = 15 + self.min_lr_ratio = 0.05 + self.ema = True + + self.weight_decay = 5e-4 + self.momentum = 0.9 + self.print_interval = 10 + self.eval_interval = 10 + self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0] + + # ----------------- testing config ------------------ # + self.test_size = (640, 640) + self.test_conf = 0.01 + self.nmsthre = 0.65 + + def get_model(self): + from yolox.models import YOLOX, YOLOPAFPN, YOLOXHead + + def init_yolo(M): + for m in M.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eps = 1e-3 + m.momentum = 0.03 + + if getattr(self, "model", None) is None: + in_channels = [256, 512, 1024] + backbone = YOLOPAFPN(self.depth, self.width, in_channels=in_channels) + head = YOLOXHead(self.num_classes, self.width, in_channels=in_channels) + self.model = YOLOX(backbone, head) + + self.model.apply(init_yolo) + self.model.head.initialize_biases(1e-2) + return self.model + + def get_data_loader(self, batch_size, is_distributed, 
no_aug=False): + from yolox.data import ( + COCODataset, + TrainTransform, + YoloBatchSampler, + DataLoader, + InfiniteSampler, + MosaicDetection, + ) + + dataset = COCODataset( + data_dir=None, + json_file=self.train_ann, + img_size=self.input_size, + preproc=TrainTransform( + rgb_means=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225), + max_labels=50, + ), + ) + + dataset = MosaicDetection( + dataset, + mosaic=not no_aug, + img_size=self.input_size, + preproc=TrainTransform( + rgb_means=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225), + max_labels=120, + ), + degrees=self.degrees, + translate=self.translate, + scale=self.scale, + shear=self.shear, + perspective=self.perspective, + enable_mixup=self.enable_mixup, + ) + + self.dataset = dataset + + if is_distributed: + batch_size = batch_size // dist.get_world_size() + sampler = InfiniteSampler( + len(self.dataset), seed=self.seed if self.seed else 0 + ) + else: + sampler = torch.utils.data.RandomSampler(self.dataset) + + batch_sampler = YoloBatchSampler( + sampler=sampler, + batch_size=batch_size, + drop_last=False, + input_dimension=self.input_size, + mosaic=not no_aug, + ) + + dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True} + dataloader_kwargs["batch_sampler"] = batch_sampler + train_loader = DataLoader(self.dataset, **dataloader_kwargs) + + return train_loader + + def random_resize(self, data_loader, epoch, rank, is_distributed): + tensor = torch.LongTensor(1).cuda() + + if rank == 0: + if epoch >= self.max_epoch - 1: + size = self.input_size[0] + else: + size = random.randint(*self.random_size) + size = int(32 * size) + tensor.fill_(size) + + if is_distributed: + dist.barrier() + dist.broadcast(tensor, 0) + + input_size = data_loader.change_input_dim( + multiple=tensor.item(), random_range=None + ) + return input_size + + def get_optimizer(self, batch_size): + if "optimizer" not in self.__dict__: + if self.warmup_epochs > 0: + lr = self.warmup_lr + else: + lr = 
self.basic_lr_per_img * batch_size + + pg0, pg1, pg2 = [], [], [] # optimizer parameter groups + + for k, v in self.model.named_modules(): + if hasattr(v, "bias") and isinstance(v.bias, nn.Parameter): + pg2.append(v.bias) # biases + if isinstance(v, nn.BatchNorm2d) or "bn" in k: + pg0.append(v.weight) # no decay + elif hasattr(v, "weight") and isinstance(v.weight, nn.Parameter): + pg1.append(v.weight) # apply decay + + optimizer = torch.optim.SGD( + pg0, lr=lr, momentum=self.momentum, nesterov=True + ) + optimizer.add_param_group( + {"params": pg1, "weight_decay": self.weight_decay} + ) # add pg1 with weight_decay + optimizer.add_param_group({"params": pg2}) + self.optimizer = optimizer + + return self.optimizer + + def get_lr_scheduler(self, lr, iters_per_epoch): + from yolox.utils import LRScheduler + scheduler = LRScheduler( + self.scheduler, + lr, + iters_per_epoch, + self.max_epoch, + warmup_epochs=self.warmup_epochs, + warmup_lr_start=self.warmup_lr, + no_aug_epochs=self.no_aug_epochs, + min_lr_ratio=self.min_lr_ratio, + ) + return scheduler + + def get_eval_loader(self, batch_size, is_distributed, testdev=False): + from yolox.data import COCODataset, ValTransform + + valdataset = COCODataset( + data_dir=None, + json_file=self.val_ann if not testdev else "image_info_test-dev2017.json", + name="val2017" if not testdev else "test2017", + img_size=self.test_size, + preproc=ValTransform( + rgb_means=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225) + ), + ) + + if is_distributed: + batch_size = batch_size // dist.get_world_size() + sampler = torch.utils.data.distributed.DistributedSampler( + valdataset, shuffle=False + ) + else: + sampler = torch.utils.data.SequentialSampler(valdataset) + + dataloader_kwargs = { + "num_workers": self.data_num_workers, + "pin_memory": True, + "sampler": sampler, + } + dataloader_kwargs["batch_size"] = batch_size + val_loader = torch.utils.data.DataLoader(valdataset, **dataloader_kwargs) + + return val_loader + + def 
get_evaluator(self, batch_size, is_distributed, testdev=False): + from yolox.evalutors import COCOEvaluator + + val_loader = self.get_eval_loader(batch_size, is_distributed, testdev=testdev) + evaluator = COCOEvaluator( + dataloader=val_loader, + img_size=self.test_size, + confthre=self.test_conf, + nmsthre=self.nmsthre, + num_classes=self.num_classes, + testdev=testdev, + ) + return evaluator + + def eval(self, model, evaluator, is_distributed, half=False): + return evaluator.evaluate(model, is_distributed, half) diff --git a/yolox/layers/__init__.py b/yolox/layers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e2a51963f43d663af81d74a82fa0108970122083 --- /dev/null +++ b/yolox/layers/__init__.py @@ -0,0 +1,4 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- + +from .fast_coco_eval_api import COCOeval_opt diff --git a/yolox/layers/csrc/cocoeval/cocoeval.cpp b/yolox/layers/csrc/cocoeval/cocoeval.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2e63bc9952918060f55999ec100b283d83616b46 --- /dev/null +++ b/yolox/layers/csrc/cocoeval/cocoeval.cpp @@ -0,0 +1,502 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +#include "cocoeval.h" +#include +#include +#include +#include + +using namespace pybind11::literals; + +namespace COCOeval { + +// Sort detections from highest score to lowest, such that +// detection_instances[detection_sorted_indices[t]] >= +// detection_instances[detection_sorted_indices[t+1]]. 
Use stable_sort to match +// original COCO API +void SortInstancesByDetectionScore( + const std::vector& detection_instances, + std::vector* detection_sorted_indices) { + detection_sorted_indices->resize(detection_instances.size()); + std::iota( + detection_sorted_indices->begin(), detection_sorted_indices->end(), 0); + std::stable_sort( + detection_sorted_indices->begin(), + detection_sorted_indices->end(), + [&detection_instances](size_t j1, size_t j2) { + return detection_instances[j1].score > detection_instances[j2].score; + }); +} + +// Partition the ground truth objects based on whether or not to ignore them +// based on area +void SortInstancesByIgnore( + const std::array& area_range, + const std::vector& ground_truth_instances, + std::vector* ground_truth_sorted_indices, + std::vector* ignores) { + ignores->clear(); + ignores->reserve(ground_truth_instances.size()); + for (auto o : ground_truth_instances) { + ignores->push_back( + o.ignore || o.area < area_range[0] || o.area > area_range[1]); + } + + ground_truth_sorted_indices->resize(ground_truth_instances.size()); + std::iota( + ground_truth_sorted_indices->begin(), + ground_truth_sorted_indices->end(), + 0); + std::stable_sort( + ground_truth_sorted_indices->begin(), + ground_truth_sorted_indices->end(), + [&ignores](size_t j1, size_t j2) { + return (int)(*ignores)[j1] < (int)(*ignores)[j2]; + }); +} + +// For each IOU threshold, greedily match each detected instance to a ground +// truth instance (if possible) and store the results +void MatchDetectionsToGroundTruth( + const std::vector& detection_instances, + const std::vector& detection_sorted_indices, + const std::vector& ground_truth_instances, + const std::vector& ground_truth_sorted_indices, + const std::vector& ignores, + const std::vector>& ious, + const std::vector& iou_thresholds, + const std::array& area_range, + ImageEvaluation* results) { + // Initialize memory to store return data matches and ignore + const int num_iou_thresholds = 
iou_thresholds.size(); + const int num_ground_truth = ground_truth_sorted_indices.size(); + const int num_detections = detection_sorted_indices.size(); + std::vector ground_truth_matches( + num_iou_thresholds * num_ground_truth, 0); + std::vector& detection_matches = results->detection_matches; + std::vector& detection_ignores = results->detection_ignores; + std::vector& ground_truth_ignores = results->ground_truth_ignores; + detection_matches.resize(num_iou_thresholds * num_detections, 0); + detection_ignores.resize(num_iou_thresholds * num_detections, false); + ground_truth_ignores.resize(num_ground_truth); + for (auto g = 0; g < num_ground_truth; ++g) { + ground_truth_ignores[g] = ignores[ground_truth_sorted_indices[g]]; + } + + for (auto t = 0; t < num_iou_thresholds; ++t) { + for (auto d = 0; d < num_detections; ++d) { + // information about best match so far (match=-1 -> unmatched) + double best_iou = std::min(iou_thresholds[t], 1 - 1e-10); + int match = -1; + for (auto g = 0; g < num_ground_truth; ++g) { + // if this ground truth instance is already matched and not a + // crowd, it cannot be matched to another detection + if (ground_truth_matches[t * num_ground_truth + g] > 0 && + !ground_truth_instances[ground_truth_sorted_indices[g]].is_crowd) { + continue; + } + + // if detected instance matched to a regular ground truth + // instance, we can break on the first ground truth instance + // tagged as ignore (because they are sorted by the ignore tag) + if (match >= 0 && !ground_truth_ignores[match] && + ground_truth_ignores[g]) { + break; + } + + // if IOU overlap is the best so far, store the match appropriately + if (ious[d][ground_truth_sorted_indices[g]] >= best_iou) { + best_iou = ious[d][ground_truth_sorted_indices[g]]; + match = g; + } + } + // if match was made, store id of match for both detection and + // ground truth + if (match >= 0) { + detection_ignores[t * num_detections + d] = ground_truth_ignores[match]; + detection_matches[t * 
num_detections + d] = + ground_truth_instances[ground_truth_sorted_indices[match]].id; + ground_truth_matches[t * num_ground_truth + match] = + detection_instances[detection_sorted_indices[d]].id; + } + + // set unmatched detections outside of area range to ignore + const InstanceAnnotation& detection = + detection_instances[detection_sorted_indices[d]]; + detection_ignores[t * num_detections + d] = + detection_ignores[t * num_detections + d] || + (detection_matches[t * num_detections + d] == 0 && + (detection.area < area_range[0] || detection.area > area_range[1])); + } + } + + // store detection score results + results->detection_scores.resize(detection_sorted_indices.size()); + for (size_t d = 0; d < detection_sorted_indices.size(); ++d) { + results->detection_scores[d] = + detection_instances[detection_sorted_indices[d]].score; + } +} + +std::vector EvaluateImages( + const std::vector>& area_ranges, + int max_detections, + const std::vector& iou_thresholds, + const ImageCategoryInstances>& image_category_ious, + const ImageCategoryInstances& + image_category_ground_truth_instances, + const ImageCategoryInstances& + image_category_detection_instances) { + const int num_area_ranges = area_ranges.size(); + const int num_images = image_category_ground_truth_instances.size(); + const int num_categories = + image_category_ious.size() > 0 ? image_category_ious[0].size() : 0; + std::vector detection_sorted_indices; + std::vector ground_truth_sorted_indices; + std::vector ignores; + std::vector results_all( + num_images * num_area_ranges * num_categories); + + // Store results for each image, category, and area range combination. 
Results + // for each IOU threshold are packed into the same ImageEvaluation object + for (auto i = 0; i < num_images; ++i) { + for (auto c = 0; c < num_categories; ++c) { + const std::vector& ground_truth_instances = + image_category_ground_truth_instances[i][c]; + const std::vector& detection_instances = + image_category_detection_instances[i][c]; + + SortInstancesByDetectionScore( + detection_instances, &detection_sorted_indices); + if ((int)detection_sorted_indices.size() > max_detections) { + detection_sorted_indices.resize(max_detections); + } + + for (size_t a = 0; a < area_ranges.size(); ++a) { + SortInstancesByIgnore( + area_ranges[a], + ground_truth_instances, + &ground_truth_sorted_indices, + &ignores); + + MatchDetectionsToGroundTruth( + detection_instances, + detection_sorted_indices, + ground_truth_instances, + ground_truth_sorted_indices, + ignores, + image_category_ious[i][c], + iou_thresholds, + area_ranges[a], + &results_all + [c * num_area_ranges * num_images + a * num_images + i]); + } + } + } + + return results_all; +} + +// Convert a python list to a vector +template +std::vector list_to_vec(const py::list& l) { + std::vector v(py::len(l)); + for (int i = 0; i < (int)py::len(l); ++i) { + v[i] = l[i].cast(); + } + return v; +} + +// Helper function to Accumulate() +// Considers the evaluation results applicable to a particular category, area +// range, and max_detections parameter setting, which begin at +// evaluations[evaluation_index]. Extracts a sorted list of length n of all +// applicable detection instances concatenated across all images in the dataset, +// which are represented by the outputs evaluation_indices, detection_scores, +// image_detection_indices, and detection_sorted_indices--all of which are +// length n. 
evaluation_indices[i] stores the applicable index into +// evaluations[] for instance i, which has detection score detection_score[i], +// and is the image_detection_indices[i]'th of the list of detections +// for the image containing i. detection_sorted_indices[] defines a sorted +// permutation of the 3 other outputs +int BuildSortedDetectionList( + const std::vector& evaluations, + const int64_t evaluation_index, + const int64_t num_images, + const int max_detections, + std::vector* evaluation_indices, + std::vector* detection_scores, + std::vector* detection_sorted_indices, + std::vector* image_detection_indices) { + assert(evaluations.size() >= evaluation_index + num_images); + + // Extract a list of object instances of the applicable category, area + // range, and max detections requirements such that they can be sorted + image_detection_indices->clear(); + evaluation_indices->clear(); + detection_scores->clear(); + image_detection_indices->reserve(num_images * max_detections); + evaluation_indices->reserve(num_images * max_detections); + detection_scores->reserve(num_images * max_detections); + int num_valid_ground_truth = 0; + for (auto i = 0; i < num_images; ++i) { + const ImageEvaluation& evaluation = evaluations[evaluation_index + i]; + + for (int d = 0; + d < (int)evaluation.detection_scores.size() && d < max_detections; + ++d) { // detected instances + evaluation_indices->push_back(evaluation_index + i); + image_detection_indices->push_back(d); + detection_scores->push_back(evaluation.detection_scores[d]); + } + for (auto ground_truth_ignore : evaluation.ground_truth_ignores) { + if (!ground_truth_ignore) { + ++num_valid_ground_truth; + } + } + } + + // Sort detections by decreasing score, using stable sort to match + // python implementation + detection_sorted_indices->resize(detection_scores->size()); + std::iota( + detection_sorted_indices->begin(), detection_sorted_indices->end(), 0); + std::stable_sort( + detection_sorted_indices->begin(), + 
detection_sorted_indices->end(), + [&detection_scores](size_t j1, size_t j2) { + return (*detection_scores)[j1] > (*detection_scores)[j2]; + }); + + return num_valid_ground_truth; +} + +// Helper function to Accumulate() +// Compute a precision recall curve given a sorted list of detected instances +// encoded in evaluations, evaluation_indices, detection_scores, +// detection_sorted_indices, image_detection_indices (see +// BuildSortedDetectionList()). Using vectors precisions and recalls +// and temporary storage, output the results into precisions_out, recalls_out, +// and scores_out, which are large buffers containing many precision/recall curves +// for all possible parameter settings, with precisions_out_index and +// recalls_out_index defining the applicable indices to store results. +void ComputePrecisionRecallCurve( + const int64_t precisions_out_index, + const int64_t precisions_out_stride, + const int64_t recalls_out_index, + const std::vector& recall_thresholds, + const int iou_threshold_index, + const int num_iou_thresholds, + const int num_valid_ground_truth, + const std::vector& evaluations, + const std::vector& evaluation_indices, + const std::vector& detection_scores, + const std::vector& detection_sorted_indices, + const std::vector& image_detection_indices, + std::vector* precisions, + std::vector* recalls, + std::vector* precisions_out, + std::vector* scores_out, + std::vector* recalls_out) { + assert(recalls_out->size() > recalls_out_index); + + // Compute precision/recall for each instance in the sorted list of detections + int64_t true_positives_sum = 0, false_positives_sum = 0; + precisions->clear(); + recalls->clear(); + precisions->reserve(detection_sorted_indices.size()); + recalls->reserve(detection_sorted_indices.size()); + assert(!evaluations.empty() || detection_sorted_indices.empty()); + for (auto detection_sorted_index : detection_sorted_indices) { + const ImageEvaluation& evaluation = +
evaluations[evaluation_indices[detection_sorted_index]]; + const auto num_detections = + evaluation.detection_matches.size() / num_iou_thresholds; + const auto detection_index = iou_threshold_index * num_detections + + image_detection_indices[detection_sorted_index]; + assert(evaluation.detection_matches.size() > detection_index); + assert(evaluation.detection_ignores.size() > detection_index); + const int64_t detection_match = + evaluation.detection_matches[detection_index]; + const bool detection_ignores = + evaluation.detection_ignores[detection_index]; + const auto true_positive = detection_match > 0 && !detection_ignores; + const auto false_positive = detection_match == 0 && !detection_ignores; + if (true_positive) { + ++true_positives_sum; + } + if (false_positive) { + ++false_positives_sum; + } + + const double recall = + static_cast(true_positives_sum) / num_valid_ground_truth; + recalls->push_back(recall); + const int64_t num_valid_detections = + true_positives_sum + false_positives_sum; + const double precision = num_valid_detections > 0 + ? static_cast(true_positives_sum) / num_valid_detections + : 0.0; + precisions->push_back(precision); + } + + (*recalls_out)[recalls_out_index] = !recalls->empty() ? 
recalls->back() : 0; + + for (int64_t i = static_cast(precisions->size()) - 1; i > 0; --i) { + if ((*precisions)[i] > (*precisions)[i - 1]) { + (*precisions)[i - 1] = (*precisions)[i]; + } + } + + // Sample the per instance precision/recall list at each recall threshold + for (size_t r = 0; r < recall_thresholds.size(); ++r) { + // first index in recalls >= recall_thresholds[r] + std::vector::iterator low = std::lower_bound( + recalls->begin(), recalls->end(), recall_thresholds[r]); + size_t precisions_index = low - recalls->begin(); + + const auto results_ind = precisions_out_index + r * precisions_out_stride; + assert(results_ind < precisions_out->size()); + assert(results_ind < scores_out->size()); + if (precisions_index < precisions->size()) { + (*precisions_out)[results_ind] = (*precisions)[precisions_index]; + (*scores_out)[results_ind] = + detection_scores[detection_sorted_indices[precisions_index]]; + } else { + (*precisions_out)[results_ind] = 0; + (*scores_out)[results_ind] = 0; + } + } +} +py::dict Accumulate( + const py::object& params, + const std::vector& evaluations) { + const std::vector recall_thresholds = + list_to_vec(params.attr("recThrs")); + const std::vector max_detections = + list_to_vec(params.attr("maxDets")); + const int num_iou_thresholds = py::len(params.attr("iouThrs")); + const int num_recall_thresholds = py::len(params.attr("recThrs")); + const int num_categories = params.attr("useCats").cast() == 1 + ? 
py::len(params.attr("catIds")) + : 1; + const int num_area_ranges = py::len(params.attr("areaRng")); + const int num_max_detections = py::len(params.attr("maxDets")); + const int num_images = py::len(params.attr("imgIds")); + + std::vector precisions_out( + num_iou_thresholds * num_recall_thresholds * num_categories * + num_area_ranges * num_max_detections, + -1); + std::vector recalls_out( + num_iou_thresholds * num_categories * num_area_ranges * + num_max_detections, + -1); + std::vector scores_out( + num_iou_thresholds * num_recall_thresholds * num_categories * + num_area_ranges * num_max_detections, + -1); + + // Consider the list of all detected instances in the entire dataset in one + // large list. evaluation_indices, detection_scores, + // image_detection_indices, and detection_sorted_indices all have the same + // length as this list, such that each entry corresponds to one detected + // instance + std::vector evaluation_indices; // indices into evaluations[] + std::vector detection_scores; // detection scores of each instance + std::vector detection_sorted_indices; // sorted indices of all + // instances in the dataset + std::vector + image_detection_indices; // indices into the list of detected instances in + // the same image as each instance + std::vector precisions, recalls; + + for (auto c = 0; c < num_categories; ++c) { + for (auto a = 0; a < num_area_ranges; ++a) { + for (auto m = 0; m < num_max_detections; ++m) { + // The COCO PythonAPI assumes evaluations[] (the return value of + // COCOeval::EvaluateImages() is one long list storing results for each + // combination of category, area range, and image id, with categories in + // the outermost loop and images in the innermost loop. 
+ const int64_t evaluations_index = + c * num_area_ranges * num_images + a * num_images; + int num_valid_ground_truth = BuildSortedDetectionList( + evaluations, + evaluations_index, + num_images, + max_detections[m], + &evaluation_indices, + &detection_scores, + &detection_sorted_indices, + &image_detection_indices); + + if (num_valid_ground_truth == 0) { + continue; + } + + for (auto t = 0; t < num_iou_thresholds; ++t) { + // recalls_out is a flattened vectors representing a + // num_iou_thresholds X num_categories X num_area_ranges X + // num_max_detections matrix + const int64_t recalls_out_index = + t * num_categories * num_area_ranges * num_max_detections + + c * num_area_ranges * num_max_detections + + a * num_max_detections + m; + + // precisions_out and scores_out are flattened vectors + // representing a num_iou_thresholds X num_recall_thresholds X + // num_categories X num_area_ranges X num_max_detections matrix + const int64_t precisions_out_stride = + num_categories * num_area_ranges * num_max_detections; + const int64_t precisions_out_index = t * num_recall_thresholds * + num_categories * num_area_ranges * num_max_detections + + c * num_area_ranges * num_max_detections + + a * num_max_detections + m; + + ComputePrecisionRecallCurve( + precisions_out_index, + precisions_out_stride, + recalls_out_index, + recall_thresholds, + t, + num_iou_thresholds, + num_valid_ground_truth, + evaluations, + evaluation_indices, + detection_scores, + detection_sorted_indices, + image_detection_indices, + &precisions, + &recalls, + &precisions_out, + &scores_out, + &recalls_out); + } + } + } + } + + time_t rawtime; + struct tm local_time; + std::array buffer; + time(&rawtime); +#ifdef _WIN32 + localtime_s(&local_time, &rawtime); +#else + localtime_r(&rawtime, &local_time); +#endif + strftime( + buffer.data(), 200, "%Y-%m-%d %H:%M:%S", &local_time); // local time, YYYY-MM-DD HH:MM:SS + return py::dict( + "params"_a = params, + "counts"_a = std::vector({num_iou_thresholds, +
num_recall_thresholds, + num_categories, + num_area_ranges, + num_max_detections}), + "date"_a = buffer, + "precision"_a = precisions_out, + "recall"_a = recalls_out, + "scores"_a = scores_out); +} + +} // namespace COCOeval diff --git a/yolox/layers/csrc/cocoeval/cocoeval.h b/yolox/layers/csrc/cocoeval/cocoeval.h new file mode 100644 index 0000000000000000000000000000000000000000..f9def4151102d1c493dc88186384342565798d05 --- /dev/null +++ b/yolox/layers/csrc/cocoeval/cocoeval.h @@ -0,0 +1,85 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +#pragma once + +#include +#include +#include +#include +#include + +namespace py = pybind11; + +namespace COCOeval { + +// Annotation data for a single object instance in an image +struct InstanceAnnotation { + InstanceAnnotation( + uint64_t id, + double score, + double area, + bool is_crowd, + bool ignore) + : id{id}, score{score}, area{area}, is_crowd{is_crowd}, ignore{ignore} {} + uint64_t id; + double score = 0.; + double area = 0.; + bool is_crowd = false; + bool ignore = false; +}; + +// Stores intermediate results for evaluating detection results for a single +// image that has D detected instances and G ground truth instances. This stores +// matches between detected and ground truth instances +struct ImageEvaluation { + // For each of the D detected instances, the id of the matched ground truth + // instance, or 0 if unmatched + std::vector detection_matches; + + // The detection score of each of the D detected instances + std::vector detection_scores; + + // Marks whether or not each of G instances was ignored from evaluation (e.g., + // because it's outside area_range) + std::vector ground_truth_ignores; + + // Marks whether or not each of D instances was ignored from evaluation (e.g., + // because it's outside aRng) + std::vector detection_ignores; +}; + +template +using ImageCategoryInstances = std::vector>>; + +// C++ implementation of COCO API cocoeval.py::COCOeval.evaluateImg(). 
For each +// combination of image, category, area range settings, and IOU thresholds to +// evaluate, it matches detected instances to ground truth instances and stores +// the results into a vector of ImageEvaluation results, which will be +// interpreted by the COCOeval::Accumulate() function to produce precision-recall +// curves. The parameters of nested vectors have the following semantics: +// image_category_ious[i][c][d][g] is the intersection over union of the d'th +// detected instance and g'th ground truth instance of +// category category_ids[c] in image image_ids[i] +// image_category_ground_truth_instances[i][c] is a vector of ground truth +// instances in image image_ids[i] of category category_ids[c] +// image_category_detection_instances[i][c] is a vector of detected +// instances in image image_ids[i] of category category_ids[c] +std::vector EvaluateImages( + const std::vector>& area_ranges, // vector of 2-tuples + int max_detections, + const std::vector& iou_thresholds, + const ImageCategoryInstances>& image_category_ious, + const ImageCategoryInstances& + image_category_ground_truth_instances, + const ImageCategoryInstances& + image_category_detection_instances); + +// C++ implementation of COCOeval.accumulate(), which generates precision +// recall curves for each set of category, IOU threshold, detection area range, +// and max number of detections parameters.
It is assumed that the parameter +// evaluations is the return value of the function COCOeval::EvaluateImages(), +// which was called with the same parameter settings params +py::dict Accumulate( + const py::object& params, + const std::vector& evalutations); + +} // namespace COCOeval diff --git a/yolox/layers/csrc/vision.cpp b/yolox/layers/csrc/vision.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7663d0faf5c58542624d2f01730618b9aa9d4a25 --- /dev/null +++ b/yolox/layers/csrc/vision.cpp @@ -0,0 +1,13 @@ +#include "cocoeval/cocoeval.h" + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("COCOevalAccumulate", &COCOeval::Accumulate, "COCOeval::Accumulate"); + m.def( + "COCOevalEvaluateImages", + &COCOeval::EvaluateImages, + "COCOeval::EvaluateImages"); + pybind11::class_(m, "InstanceAnnotation") + .def(pybind11::init()); + pybind11::class_(m, "ImageEvaluation") + .def(pybind11::init<>()); +} diff --git a/yolox/layers/fast_coco_eval_api.py b/yolox/layers/fast_coco_eval_api.py new file mode 100644 index 0000000000000000000000000000000000000000..560c6d507107fc9915d7d3effb4fb09319bccbeb --- /dev/null +++ b/yolox/layers/fast_coco_eval_api.py @@ -0,0 +1,147 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# This file comes from +# https://github.com/facebookresearch/detectron2/blob/master/detectron2/evaluation/fast_eval_api.py +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +import copy +import time + +import numpy as np +from pycocotools.cocoeval import COCOeval + +# import torch first to make yolox._C work without ImportError of libc10.so +# in YOLOX, env is already set in __init__.py.
+from yolox import _C + + +class COCOeval_opt(COCOeval): + """ + This is a slightly modified version of the original COCO API, where the functions evaluateImg() + and accumulate() are implemented in C++ to speedup evaluation + """ + + def evaluate(self): + """ + Run per image evaluation on given images and store results in self.evalImgs_cpp, a + datastructure that isn't readable from Python but is used by a c++ implementation of + accumulate(). Unlike the original COCO PythonAPI, we don't populate the datastructure + self.evalImgs because this datastructure is a computational bottleneck. + :return: None + """ + tic = time.time() + + print("Running per image evaluation...") + p = self.params + # add backward compatibility if useSegm is specified in params + if p.useSegm is not None: + p.iouType = "segm" if p.useSegm == 1 else "bbox" + print( + "useSegm (deprecated) is not None. Running {} evaluation".format(p.iouType) + ) + print("Evaluate annotation type *{}*".format(p.iouType)) + p.imgIds = list(np.unique(p.imgIds)) + if p.useCats: + p.catIds = list(np.unique(p.catIds)) + p.maxDets = sorted(p.maxDets) + self.params = p + + self._prepare() + + # loop through images, area range, max detection number + catIds = p.catIds if p.useCats else [-1] + + if p.iouType == "segm" or p.iouType == "bbox": + computeIoU = self.computeIoU + elif p.iouType == "keypoints": + computeIoU = self.computeOks + self.ious = { + (imgId, catId): computeIoU(imgId, catId) + for imgId in p.imgIds + for catId in catIds + } + + maxDet = p.maxDets[-1] + + # <<<< Beginning of code differences with original COCO API + def convert_instances_to_cpp(instances, is_det=False): + # Convert annotations for a list of instances in an image to a format that's fast + # to access in C++ + instances_cpp = [] + for instance in instances: + instance_cpp = _C.InstanceAnnotation( + int(instance["id"]), + instance["score"] if is_det else instance.get("score", 0.0), + instance["area"], + bool(instance.get("iscrowd", 
0)), + bool(instance.get("ignore", 0)), + ) + instances_cpp.append(instance_cpp) + return instances_cpp + + # Convert GT annotations, detections, and IOUs to a format that's fast to access in C++ + ground_truth_instances = [ + [convert_instances_to_cpp(self._gts[imgId, catId]) for catId in p.catIds] + for imgId in p.imgIds + ] + detected_instances = [ + [ + convert_instances_to_cpp(self._dts[imgId, catId], is_det=True) + for catId in p.catIds + ] + for imgId in p.imgIds + ] + ious = [[self.ious[imgId, catId] for catId in catIds] for imgId in p.imgIds] + + if not p.useCats: + # For each image, flatten per-category lists into a single list + ground_truth_instances = [ + [[o for c in i for o in c]] for i in ground_truth_instances + ] + detected_instances = [ + [[o for c in i for o in c]] for i in detected_instances + ] + + # Call C++ implementation of self.evaluateImgs() + self._evalImgs_cpp = _C.COCOevalEvaluateImages( + p.areaRng, + maxDet, + p.iouThrs, + ious, + ground_truth_instances, + detected_instances, + ) + self._evalImgs = None + + self._paramsEval = copy.deepcopy(self.params) + toc = time.time() + print("COCOeval_opt.evaluate() finished in {:0.2f} seconds.".format(toc - tic)) + # >>>> End of code differences with original COCO API + + def accumulate(self): + """ + Accumulate per image evaluation results and store the result in self.eval. 
Does not + support changing parameter settings from those used by self.evaluate() + """ + print("Accumulating evaluation results...") + tic = time.time() + if not hasattr(self, "_evalImgs_cpp"): + print("Please run evaluate() first") + + self.eval = _C.COCOevalAccumulate(self._paramsEval, self._evalImgs_cpp) + + # recall is num_iou_thresholds X num_categories X num_area_ranges X num_max_detections + self.eval["recall"] = np.array(self.eval["recall"]).reshape( + self.eval["counts"][:1] + self.eval["counts"][2:] + ) + + # precision and scores are num_iou_thresholds X num_recall_thresholds X num_categories X + # num_area_ranges X num_max_detections + self.eval["precision"] = np.array(self.eval["precision"]).reshape( + self.eval["counts"] + ) + self.eval["scores"] = np.array(self.eval["scores"]).reshape(self.eval["counts"]) + toc = time.time() + print( + "COCOeval_opt.accumulate() finished in {:0.2f} seconds.".format(toc - tic) + ) diff --git a/yolox/models/__init__.py b/yolox/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..34703602a80216db0dd0c5d154ede7704b7ac16b --- /dev/null +++ b/yolox/models/__init__.py @@ -0,0 +1,9 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- + +from .darknet import CSPDarknet, Darknet +from .losses import IOUloss +from .yolo_fpn import YOLOFPN +from .yolo_head import YOLOXHead +from .yolo_pafpn import YOLOPAFPN +from .yolox import YOLOX diff --git a/yolox/models/darknet.py b/yolox/models/darknet.py new file mode 100644 index 0000000000000000000000000000000000000000..10d054d6603fd6afd63e0b83a87f63703849088b --- /dev/null +++ b/yolox/models/darknet.py @@ -0,0 +1,140 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- + +from torch import nn + +from .network_blocks import BaseConv, CSPLayer, DWConv, Focus, ResLayer, SPPBottleneck + + +class Darknet(nn.Module): + # number of blocks from dark2 to dark5. 
+ depth2blocks = {21: [1, 2, 2, 1], 53: [2, 8, 8, 4]} + + def __init__( + self, depth, in_channels=3, stem_out_channels=32, out_features=("dark3", "dark4", "dark5"), + ): + """ + Args: + depth (int): depth of darknet used in model, usually use [21, 53] for this param. + in_channels (int): number of input channels, for example, use 3 for RGB image. + stem_out_channels (int): number of output channels of darknet stem. + It decides channels of darknet layer2 to layer5. + out_features (Tuple[str]): desired output layer name. + """ + super().__init__() + assert out_features, "please provide output features of Darknet" + self.out_features = out_features + self.stem = nn.Sequential( + BaseConv(in_channels, stem_out_channels, ksize=3, stride=1, act="lrelu"), + *self.make_group_layer(stem_out_channels, num_blocks=1, stride=2), + ) + in_channels = stem_out_channels * 2 # 64 + + num_blocks = Darknet.depth2blocks[depth] + # create darknet with `stem_out_channels` and `num_blocks` layers. + # to make model structure more clear, we don't use `for` statement in python.
+ self.dark2 = nn.Sequential(*self.make_group_layer(in_channels, num_blocks[0], stride=2)) + in_channels *= 2 # 128 + self.dark3 = nn.Sequential(*self.make_group_layer(in_channels, num_blocks[1], stride=2)) + in_channels *= 2 # 256 + self.dark4 = nn.Sequential(*self.make_group_layer(in_channels, num_blocks[2], stride=2)) + in_channels *= 2 # 512 + + self.dark5 = nn.Sequential( + *self.make_group_layer(in_channels, num_blocks[3], stride=2), + *self.make_spp_block([in_channels, in_channels * 2], in_channels * 2), + ) + + def make_group_layer(self, in_channels: int, num_blocks: int, stride: int = 1): + "starts with conv layer then has `num_blocks` `ResLayer`" + return [ + BaseConv(in_channels, in_channels * 2, ksize=3, stride=stride, act="lrelu"), + *[(ResLayer(in_channels * 2)) for _ in range(num_blocks)] + ] + + def make_spp_block(self, filters_list, in_filters): + m = nn.Sequential( + *[ + BaseConv(in_filters, filters_list[0], 1, stride=1, act="lrelu"), + BaseConv(filters_list[0], filters_list[1], 3, stride=1, act="lrelu"), + SPPBottleneck( + in_channels=filters_list[1], + out_channels=filters_list[0], + activation="lrelu" + ), + BaseConv(filters_list[0], filters_list[1], 3, stride=1, act="lrelu"), + BaseConv(filters_list[1], filters_list[0], 1, stride=1, act="lrelu"), + ] + ) + return m + + def forward(self, x): + outputs = {} + x = self.stem(x) + outputs["stem"] = x + x = self.dark2(x) + outputs["dark2"] = x + x = self.dark3(x) + outputs["dark3"] = x + x = self.dark4(x) + outputs["dark4"] = x + x = self.dark5(x) + outputs["dark5"] = x + return {k: v for k, v in outputs.items() if k in self.out_features} + + +class CSPDarknet(nn.Module): + + def __init__(self, dep_mul, wid_mul, out_features=("dark3", "dark4", "dark5"), depthwise=False): + super().__init__() + assert out_features, "please provide output features of Darknet" + self.out_features = out_features + Conv = DWConv if depthwise else BaseConv + + base_channels = int(wid_mul * 64) # 64 + base_depth = 
max(round(dep_mul * 3), 1) # 3 + + # stem + self.stem = Focus(3, base_channels, ksize=3) + + # dark2 + self.dark2 = nn.Sequential( + Conv(base_channels, base_channels * 2, 3, 2), + CSPLayer(base_channels * 2, base_channels * 2, n=base_depth, depthwise=depthwise), + ) + + # dark3 + self.dark3 = nn.Sequential( + Conv(base_channels * 2, base_channels * 4, 3, 2), + CSPLayer(base_channels * 4, base_channels * 4, n=base_depth * 3, depthwise=depthwise), + ) + + # dark4 + self.dark4 = nn.Sequential( + Conv(base_channels * 4, base_channels * 8, 3, 2), + CSPLayer(base_channels * 8, base_channels * 8, n=base_depth * 3, depthwise=depthwise), + ) + + # dark5 + self.dark5 = nn.Sequential( + Conv(base_channels * 8, base_channels * 16, 3, 2), + SPPBottleneck(base_channels * 16, base_channels * 16), + CSPLayer( + base_channels * 16, base_channels * 16, n=base_depth, + shortcut=False, depthwise=depthwise, + ), + ) + + def forward(self, x): + outputs = {} + x = self.stem(x) + outputs["stem"] = x + x = self.dark2(x) + outputs["dark2"] = x + x = self.dark3(x) + outputs["dark3"] = x + x = self.dark4(x) + outputs["dark4"] = x + x = self.dark5(x) + outputs["dark5"] = x + return {k: v for k, v in outputs.items() if k in self.out_features} diff --git a/yolox/models/losses.py b/yolox/models/losses.py new file mode 100644 index 0000000000000000000000000000000000000000..b1cfc56deea4e5fe496b93af84dab6224c0fd968 --- /dev/null +++ b/yolox/models/losses.py @@ -0,0 +1,48 @@ +import torch +import torch.nn as nn + + +class IOUloss(nn.Module): + def __init__(self, reduction="none", loss_type="iou"): + super(IOUloss, self).__init__() + self.reduction = reduction + self.loss_type = loss_type + + def forward(self, pred, target): + assert pred.shape[0] == target.shape[0] + + pred = pred.view(-1, 4) + target = target.view(-1, 4) + tl = torch.max( + (pred[:, :2] - pred[:, 2:] / 2), (target[:, :2] - target[:, 2:] / 2) + ) + br = torch.min( + (pred[:, :2] + pred[:, 2:] / 2), (target[:, :2] + target[:, 2:] / 
2) + ) + + area_p = torch.prod(pred[:, 2:], 1) + area_g = torch.prod(target[:, 2:], 1) + + en = (tl < br).type(tl.type()).prod(dim=1) + area_i = torch.prod(br - tl, 1) * en + iou = (area_i) / (area_p + area_g - area_i + 1e-16) + + if self.loss_type == "iou": + loss = 1 - iou ** 2 + elif self.loss_type == "giou": + c_tl = torch.min( + (pred[:, :2] - pred[:, 2:] / 2), (target[:, :2] - target[:, 2:] / 2) + ) + c_br = torch.max( + (pred[:, :2] + pred[:, 2:] / 2), (target[:, :2] + target[:, 2:] / 2) + ) + area_c = torch.prod(c_br - c_tl, 1) + giou = iou - (area_c - area_i) / area_c.clamp(1e-16) + loss = 1 - giou.clamp(min=-1.0, max=1.0) + + if self.reduction == "mean": + loss = loss.mean() + elif self.reduction == "sum": + loss = loss.sum() + + return loss diff --git a/yolox/models/network_blocks.py b/yolox/models/network_blocks.py new file mode 100644 index 0000000000000000000000000000000000000000..61c415dafc1594500d900525309c23398d3f6853 --- /dev/null +++ b/yolox/models/network_blocks.py @@ -0,0 +1,166 @@ +import torch +import torch.nn as nn + + +class SiLU(nn.Module): + # export-friendly version of nn.SiLU() + + @staticmethod + def forward(x): + return x * torch.sigmoid(x) + + +def get_activation(name="silu", inplace=True): + if name == "silu": + module = nn.SiLU(inplace=inplace) + elif name == "relu": + module = nn.ReLU(inplace=inplace) + elif name == "lrelu": + module = nn.LeakyReLU(0.1, inplace=inplace) + else: + raise AttributeError("Unsupported act type: {}".format(name)) + return module + + +class BaseConv(nn.Module): + """ + A Conv2d -> Batchnorm -> silu/leaky relu block + """ + + def __init__(self, in_channels, out_channels, ksize, stride, groups=1, bias=False, act="silu"): + super().__init__() + # same padding + pad = (ksize - 1) // 2 + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size=ksize, + stride=stride, + padding=pad, + groups=groups, + bias=bias, + ) + self.bn = nn.BatchNorm2d(out_channels) + self.act = get_activation(act, 
inplace=True) + + def forward(self, x): + return self.act(self.bn(self.conv(x))) + + def fuseforward(self, x): + return self.act(self.conv(x)) + + +class DWConv(nn.Module): + """Depthwise Conv + Conv""" + def __init__(self, in_channels, out_channels, ksize, stride=1, act="silu"): + super().__init__() + self.dconv = BaseConv( + in_channels, in_channels, ksize=ksize, + stride=stride, groups=in_channels, act=act + ) + self.pconv = BaseConv( + in_channels, out_channels, ksize=1, + stride=1, groups=1, act=act + ) + + def forward(self, x): + x = self.dconv(x) + return self.pconv(x) + + +class Bottleneck(nn.Module): + # Standard bottleneck + def __init__(self, in_channels, out_channels, shortcut=True, expansion=0.5, depthwise=False): + super().__init__() + hidden_channels = int(out_channels * expansion) + Conv = DWConv if depthwise else BaseConv + self.conv1 = BaseConv(in_channels, hidden_channels, 1, stride=1) + self.conv2 = Conv(hidden_channels, out_channels, 3, stride=1) + self.use_add = shortcut and in_channels == out_channels + + def forward(self, x): + y = self.conv2(self.conv1(x)) + if self.use_add: + y = y + x + return y + + +class ResLayer(nn.Module): + "Residual layer with `in_channels` inputs." 
+ def __init__(self, in_channels: int): + super().__init__() + mid_channels = in_channels // 2 + self.layer1 = BaseConv(in_channels, mid_channels, ksize=1, stride=1, act="lrelu") + self.layer2 = BaseConv(mid_channels, in_channels, ksize=3, stride=1, act="lrelu") + + def forward(self, x): + out = self.layer2(self.layer1(x)) + return x + out + + +class SPPBottleneck(nn.Module): + # Spatial pyramid pooling layer used in YOLOv3-SPP + def __init__(self, in_channels, out_channels, kernel_sizes=(5, 9, 13), activation="silu"): + super().__init__() + hidden_channels = in_channels // 2 + self.conv1 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=activation) + self.m = nn.ModuleList( + [nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2) for ks in kernel_sizes] + ) + conv2_channels = hidden_channels * (len(kernel_sizes) + 1) + self.conv2 = BaseConv(conv2_channels, out_channels, 1, stride=1, act=activation) + + def forward(self, x): + x = self.conv1(x) + x = torch.cat([x] + [m(x) for m in self.m], dim=1) + x = self.conv2(x) + return x + + +class CSPLayer(nn.Module): + """C3 in yolov5, CSP Bottleneck with 3 convolutions""" + + def __init__( + self, in_channels, out_channels, n=1, + shortcut=True, expansion=0.5, depthwise=False + ): + """ + Args: + n (int): number of Bottlenecks. Default value: 1. 
+ """ + # ch_in, ch_out, number, shortcut, groups, expansion + super().__init__() + hidden_channels = int(out_channels * expansion) # hidden channels + self.conv1 = BaseConv(in_channels, hidden_channels, 1, stride=1) + self.conv2 = BaseConv(in_channels, hidden_channels, 1, stride=1) + self.conv3 = BaseConv(2 * hidden_channels, out_channels, 1, stride=1) # act=FReLU(c2) + module_list = [ + Bottleneck(hidden_channels, hidden_channels, shortcut, 1.0, depthwise) + for _ in range(n) + ] + self.m = nn.Sequential(*module_list) + + def forward(self, x): + x_1 = self.conv1(x) + x_2 = self.conv2(x) + x_1 = self.m(x_1) + x = torch.cat((x_1, x_2), dim=1) + return self.conv3(x) + + +class Focus(nn.Module): + """Focus width and height information into channel space.""" + def __init__(self, in_channels, out_channels, ksize=1, stride=1): + super().__init__() + self.conv = BaseConv(in_channels * 4, out_channels, ksize, stride) + + def forward(self, x): + # shape of x (b,c,w,h) -> y(b,4c,w/2,h/2) + patch_top_left = x[..., ::2, ::2] + patch_top_right = x[..., ::2, 1::2] + patch_bot_left = x[..., 1::2, ::2] + patch_bot_right = x[..., 1::2, 1::2] + x = torch.cat( + (patch_top_left, patch_bot_left, patch_top_right, patch_bot_right,), dim=1, + ) + return self.conv(x) diff --git a/yolox/models/yolo_fpn.py b/yolox/models/yolo_fpn.py new file mode 100644 index 0000000000000000000000000000000000000000..b0db2539d365f25b65c115939e205ebae36fcd03 --- /dev/null +++ b/yolox/models/yolo_fpn.py @@ -0,0 +1,80 @@ +import torch +import torch.nn as nn + +from .darknet import Darknet +from .network_blocks import BaseConv + + +class YOLOFPN(nn.Module): + """ + YOLOFPN module. Darknet 53 is the default backbone of this model. 
+ """ + + def __init__( + self, depth=53, in_features=["dark3", "dark4", "dark5"], + ): + super().__init__() + + self.backbone = Darknet(depth) + self.in_features = in_features + + # out 1 + self.out1_cbl = self._make_cbl(512, 256, 1) + self.out1 = self._make_embedding([256, 512], 512 + 256) + + # out 2 + self.out2_cbl = self._make_cbl(256, 128, 1) + self.out2 = self._make_embedding([128, 256], 256 + 128) + + # upsample + self.upsample = nn.Upsample(scale_factor=2, mode="nearest") + + def _make_cbl(self, _in, _out, ks): + return BaseConv(_in, _out, ks, stride=1, act="lrelu") + + def _make_embedding(self, filters_list, in_filters): + m = nn.Sequential( + *[ + self._make_cbl(in_filters, filters_list[0], 1), + self._make_cbl(filters_list[0], filters_list[1], 3), + + self._make_cbl(filters_list[1], filters_list[0], 1), + + self._make_cbl(filters_list[0], filters_list[1], 3), + self._make_cbl(filters_list[1], filters_list[0], 1), + ] + ) + return m + + def load_pretrained_model(self, filename="./weights/darknet53.mix.pth"): + with open(filename, "rb") as f: + state_dict = torch.load(f, map_location="cpu") + print("loading pretrained weights...") + self.backbone.load_state_dict(state_dict) + + def forward(self, inputs): + """ + Args: + inputs (Tensor): input image. + + Returns: + Tuple[Tensor]: FPN output features.. 
+ """ + # backbone + out_features = self.backbone(inputs) + x2, x1, x0 = [out_features[f] for f in self.in_features] + + # yolo branch 1 + x1_in = self.out1_cbl(x0) + x1_in = self.upsample(x1_in) + x1_in = torch.cat([x1_in, x1], 1) + out_dark4 = self.out1(x1_in) + + # yolo branch 2 + x2_in = self.out2_cbl(out_dark4) + x2_in = self.upsample(x2_in) + x2_in = torch.cat([x2_in, x2], 1) + out_dark3 = self.out2(x2_in) + + outputs = (out_dark3, out_dark4, x0) + return outputs diff --git a/yolox/models/yolo_head.py b/yolox/models/yolo_head.py new file mode 100644 index 0000000000000000000000000000000000000000..f162172a89d7fe6f35dc076e183541660e5e094e --- /dev/null +++ b/yolox/models/yolo_head.py @@ -0,0 +1,525 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- + +import math +from loguru import logger + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from yolox.utils import bboxes_iou + +from .losses import IOUloss +from .network_blocks import BaseConv, DWConv + + +class YOLOXHead(nn.Module): + def __init__( + self, num_classes, width=1.0, strides=[8, 16, 32], + in_channels=[256, 512, 1024], act="silu", depthwise=False + ): + """ + Args: + act (str): activation type of conv. Defalut value: "silu". + depthwise (bool): wheather apply depthwise conv in conv branch. Defalut value: False. 
+ """ + super().__init__() + + self.n_anchors = 1 + self.num_classes = num_classes + self.decode_in_inference = True # for deploy, set to False + + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + self.cls_preds = nn.ModuleList() + self.reg_preds = nn.ModuleList() + self.obj_preds = nn.ModuleList() + self.stems = nn.ModuleList() + Conv = DWConv if depthwise else BaseConv + + for i in range(len(in_channels)): + self.stems.append( + BaseConv( + in_channels=int(in_channels[i] * width), + out_channels=int(256 * width), + ksize=1, + stride=1, + act=act, + ) + ) + self.cls_convs.append( + nn.Sequential( + *[ + Conv( + in_channels=int(256 * width), + out_channels=int(256 * width), + ksize=3, + stride=1, + act=act, + ), + Conv( + in_channels=int(256 * width), + out_channels=int(256 * width), + ksize=3, + stride=1, + act=act, + ), + ] + ) + ) + self.reg_convs.append( + nn.Sequential( + *[ + Conv( + in_channels=int(256 * width), + out_channels=int(256 * width), + ksize=3, + stride=1, + act=act, + ), + Conv( + in_channels=int(256 * width), + out_channels=int(256 * width), + ksize=3, + stride=1, + act=act, + ), + ] + ) + ) + self.cls_preds.append( + nn.Conv2d( + in_channels=int(256 * width), + out_channels=self.n_anchors * self.num_classes, + kernel_size=1, + stride=1, + padding=0, + ) + ) + self.reg_preds.append( + nn.Conv2d( + in_channels=int(256 * width), + out_channels=4, + kernel_size=1, + stride=1, + padding=0, + ) + ) + self.obj_preds.append( + nn.Conv2d( + in_channels=int(256 * width), + out_channels=self.n_anchors * 1, + kernel_size=1, + stride=1, + padding=0, + ) + ) + + self.use_l1 = False + self.l1_loss = nn.L1Loss(reduction="none") + self.bcewithlog_loss = nn.BCEWithLogitsLoss(reduction="none") + self.iou_loss = IOUloss(reduction="none") + self.strides = strides + self.grids = [torch.zeros(1)] * len(in_channels) + self.expanded_strides = [None] * len(in_channels) + + def initialize_biases(self, prior_prob): + for conv in self.cls_preds: + b = 
conv.bias.view(self.n_anchors, -1) + b.data.fill_(-math.log((1 - prior_prob) / prior_prob)) + conv.bias = torch.nn.Parameter(b.view(-1), requires_grad=True) + + for conv in self.obj_preds: + b = conv.bias.view(self.n_anchors, -1) + b.data.fill_(-math.log((1 - prior_prob) / prior_prob)) + conv.bias = torch.nn.Parameter(b.view(-1), requires_grad=True) + + def forward(self, xin, labels=None, imgs=None): + outputs = [] + origin_preds = [] + x_shifts = [] + y_shifts = [] + expanded_strides = [] + + for k, (cls_conv, reg_conv, stride_this_level, x) in enumerate( + zip(self.cls_convs, self.reg_convs, self.strides, xin) + ): + x = self.stems[k](x) + cls_x = x + reg_x = x + + cls_feat = cls_conv(cls_x) + cls_output = self.cls_preds[k](cls_feat) + + reg_feat = reg_conv(reg_x) + reg_output = self.reg_preds[k](reg_feat) + obj_output = self.obj_preds[k](reg_feat) + + if self.training: + output = torch.cat([reg_output, obj_output, cls_output], 1) + output, grid = self.get_output_and_grid(output, k, stride_this_level, xin[0].type()) + x_shifts.append(grid[:, :, 0]) + y_shifts.append(grid[:, :, 1]) + expanded_strides.append( + torch.zeros(1, grid.shape[1]).fill_(stride_this_level).type_as(xin[0]) + ) + if self.use_l1: + origin_preds.append(reg_output.clone()) + + else: + output = torch.cat([reg_output, obj_output.sigmoid(), cls_output.sigmoid()], 1) + + outputs.append(output) + + if self.training: + return self.get_losses( + imgs, x_shifts, y_shifts, expanded_strides, labels, + torch.cat(outputs, 1), origin_preds, dtype=xin[0].dtype + ) + else: + self.hw = [x.shape[-2:] for x in outputs] + # [batch, n_anchors_all, 85] + outputs = torch.cat([x.flatten(start_dim=2) for x in outputs], dim=2).permute(0, 2, 1) + if self.decode_in_inference: + return self.decode_outputs(outputs, dtype=xin[0].type()) + else: + return outputs + + def get_output_and_grid(self, output, k, stride, dtype): + grid = self.grids[k] + + batch_size = output.shape[0] + n_ch = 5 + self.num_classes + hsize, wsize = 
output.shape[-2:] + if grid.shape[2:3] != output.shape[2:3]: + yv, xv = torch.meshgrid([torch.arange(hsize), torch.arange(wsize)]) + grid = torch.stack((xv, yv), 2).view(1, 1, hsize, wsize, 2).type(dtype) + self.grids[k] = grid + + output = output.view(batch_size, self.n_anchors, n_ch, hsize, wsize) + output = ( + output.permute(0, 1, 3, 4, 2) + .reshape(batch_size, self.n_anchors * hsize * wsize, -1) + ) + grid = grid.view(1, -1, 2) + output[..., :2] = (output[..., :2] + grid) * stride + output[..., 2:4] = torch.exp(output[..., 2:4]) * stride + return output, grid + + def decode_outputs(self, outputs, dtype): + grids = [] + strides = [] + for (hsize, wsize), stride in zip(self.hw, self.strides): + yv, xv = torch.meshgrid([torch.arange(hsize), torch.arange(wsize)]) + grid = torch.stack((xv, yv), 2).view(1, -1, 2) + grids.append(grid) + shape = grid.shape[:2] + strides.append(torch.full((*shape, 1), stride)) + + grids = torch.cat(grids, dim=1).type(dtype) + strides = torch.cat(strides, dim=1).type(dtype) + + outputs[..., :2] = (outputs[..., :2] + grids) * strides + outputs[..., 2:4] = torch.exp(outputs[..., 2:4]) * strides + return outputs + + def get_losses( + self, imgs, x_shifts, y_shifts, expanded_strides, labels, outputs, origin_preds, dtype, + ): + bbox_preds = outputs[:, :, :4] # [batch, n_anchors_all, 4] + obj_preds = outputs[:, :, 4].unsqueeze(-1) # [batch, n_anchors_all, 1] + cls_preds = outputs[:, :, 5:] # [batch, n_anchors_all, n_cls] + + # calculate targets + mixup = labels.shape[2] > 5 + if mixup: + label_cut = labels[..., :5] + else: + label_cut = labels + nlabel = (label_cut.sum(dim=2) > 0).sum(dim=1) # number of objects + + total_num_anchors = outputs.shape[1] + x_shifts = torch.cat(x_shifts, 1) # [1, n_anchors_all] + y_shifts = torch.cat(y_shifts, 1) # [1, n_anchors_all] + expanded_strides = torch.cat(expanded_strides, 1) + if self.use_l1: + origin_preds = torch.cat(origin_preds, 1) + + cls_targets = [] + reg_targets = [] + l1_targets = [] + 
obj_targets = [] + fg_masks = [] + + num_fg = 0.0 + num_gts = 0.0 + + for batch_idx in range(outputs.shape[0]): + num_gt = int(nlabel[batch_idx]) + num_gts += num_gt + if num_gt == 0: + cls_target = outputs.new_zeros((0, self.num_classes)) + reg_target = outputs.new_zeros((0, 4)) + l1_target = outputs.new_zeros((0, 4)) + obj_target = outputs.new_zeros((total_num_anchors, 1)) + fg_mask = outputs.new_zeros(total_num_anchors).bool() + else: + gt_bboxes_per_image = labels[batch_idx, :num_gt, 1:5] + gt_classes = labels[batch_idx, :num_gt, 0] + bboxes_preds_per_image = bbox_preds[batch_idx] + + try: + gt_matched_classes, fg_mask, pred_ious_this_matching, matched_gt_inds, num_fg_img = self.get_assignments( # noqa + batch_idx, num_gt, total_num_anchors, gt_bboxes_per_image, gt_classes, + bboxes_preds_per_image, expanded_strides, x_shifts, y_shifts, + cls_preds, bbox_preds, obj_preds, labels, imgs, + ) + except RuntimeError: + logger.error( + "OOM RuntimeError is raised due to the huge memory cost during label assignment. \ + CPU mode is applied in this batch. If you want to avoid this issue, \ + try to reduce the batch size or image size." 
+ ) + torch.cuda.empty_cache() + gt_matched_classes, fg_mask, pred_ious_this_matching, matched_gt_inds, num_fg_img = self.get_assignments( # noqa + batch_idx, num_gt, total_num_anchors, gt_bboxes_per_image, gt_classes, + bboxes_preds_per_image, expanded_strides, x_shifts, y_shifts, + cls_preds, bbox_preds, obj_preds, labels, imgs, "cpu", + ) + + torch.cuda.empty_cache() + num_fg += num_fg_img + + cls_target = F.one_hot( + gt_matched_classes.to(torch.int64), self.num_classes + ) * pred_ious_this_matching.unsqueeze(-1) + obj_target = fg_mask.unsqueeze(-1) + reg_target = gt_bboxes_per_image[matched_gt_inds] + if self.use_l1: + l1_target = self.get_l1_target( + outputs.new_zeros((num_fg_img, 4)), + gt_bboxes_per_image[matched_gt_inds], + expanded_strides[0][fg_mask], + x_shifts=x_shifts[0][fg_mask], + y_shifts=y_shifts[0][fg_mask], + ) + + cls_targets.append(cls_target) + reg_targets.append(reg_target) + obj_targets.append(obj_target.to(dtype)) + fg_masks.append(fg_mask) + if self.use_l1: + l1_targets.append(l1_target) + + cls_targets = torch.cat(cls_targets, 0) + reg_targets = torch.cat(reg_targets, 0) + obj_targets = torch.cat(obj_targets, 0) + fg_masks = torch.cat(fg_masks, 0) + if self.use_l1: + l1_targets = torch.cat(l1_targets, 0) + + num_fg = max(num_fg, 1) + loss_iou = (self.iou_loss(bbox_preds.view(-1, 4)[fg_masks], reg_targets)).sum() / num_fg + loss_obj = (self.bcewithlog_loss(obj_preds.view(-1, 1), obj_targets)).sum() / num_fg + loss_cls = ( + self.bcewithlog_loss(cls_preds.view(-1, self.num_classes)[fg_masks], cls_targets) + ).sum() / num_fg + if self.use_l1: + loss_l1 = (self.l1_loss(origin_preds.view(-1, 4)[fg_masks], l1_targets)).sum() / num_fg + else: + loss_l1 = 0.0 + + reg_weight = 5.0 + loss = reg_weight * loss_iou + loss_obj + loss_cls + loss_l1 + + return loss, reg_weight * loss_iou, loss_obj, loss_cls, loss_l1, num_fg / max(num_gts, 1) + + def get_l1_target(self, l1_target, gt, stride, x_shifts, y_shifts, eps=1e-8): + l1_target[:, 0] = gt[:, 0] / 
stride - x_shifts + l1_target[:, 1] = gt[:, 1] / stride - y_shifts + l1_target[:, 2] = torch.log(gt[:, 2] / stride + eps) + l1_target[:, 3] = torch.log(gt[:, 3] / stride + eps) + return l1_target + + @torch.no_grad() + def get_assignments( + self, batch_idx, num_gt, total_num_anchors, gt_bboxes_per_image, gt_classes, + bboxes_preds_per_image, expanded_strides, x_shifts, y_shifts, + cls_preds, bbox_preds, obj_preds, labels, imgs, mode="gpu", + ): + + if mode == "cpu": + print("------------CPU Mode for This Batch-------------") + gt_bboxes_per_image = gt_bboxes_per_image.cpu().float() + bboxes_preds_per_image = bboxes_preds_per_image.cpu().float() + gt_classes = gt_classes.cpu().float() + expanded_strides = expanded_strides.cpu().float() + x_shifts = x_shifts.cpu() + y_shifts = y_shifts.cpu() + + fg_mask, is_in_boxes_and_center = self.get_in_boxes_info( + gt_bboxes_per_image, expanded_strides, x_shifts, y_shifts, total_num_anchors, num_gt, + ) + + bboxes_preds_per_image = bboxes_preds_per_image[fg_mask] + cls_preds_ = cls_preds[batch_idx][fg_mask] + obj_preds_ = obj_preds[batch_idx][fg_mask] + num_in_boxes_anchor = bboxes_preds_per_image.shape[0] + + if mode == "cpu": + gt_bboxes_per_image = gt_bboxes_per_image.cpu() + bboxes_preds_per_image = bboxes_preds_per_image.cpu() + + pair_wise_ious = bboxes_iou( + gt_bboxes_per_image, bboxes_preds_per_image, False + ) + + gt_cls_per_image = ( + F.one_hot(gt_classes.to(torch.int64), self.num_classes).float() + .unsqueeze(1).repeat(1, num_in_boxes_anchor, 1) + ) + pair_wise_ious_loss = -torch.log(pair_wise_ious + 1e-8) + + if mode == "cpu": + cls_preds_, obj_preds_ = cls_preds_.cpu(), obj_preds_.cpu() + + cls_preds_ = ( + cls_preds_.float().unsqueeze(0).repeat(num_gt, 1, 1).sigmoid_() + * obj_preds_.unsqueeze(0).repeat(num_gt, 1, 1).sigmoid_() + ) + pair_wise_cls_loss = F.binary_cross_entropy( + cls_preds_.sqrt_(), gt_cls_per_image, reduction="none" + ).sum(-1) + del cls_preds_ + + cost = ( + pair_wise_cls_loss + + 3.0 * 
pair_wise_ious_loss + + 100000.0 * (~is_in_boxes_and_center) + ) + + ( + num_fg, gt_matched_classes, pred_ious_this_matching, matched_gt_inds + ) = self.dynamic_k_matching(cost, pair_wise_ious, gt_classes, num_gt, fg_mask) + del pair_wise_cls_loss, cost, pair_wise_ious, pair_wise_ious_loss + + if mode == "cpu": + gt_matched_classes = gt_matched_classes.cuda() + fg_mask = fg_mask.cuda() + pred_ious_this_matching = pred_ious_this_matching.cuda() + matched_gt_inds = matched_gt_inds.cuda() + + return gt_matched_classes, fg_mask, pred_ious_this_matching, matched_gt_inds, num_fg + + def get_in_boxes_info( + self, gt_bboxes_per_image, expanded_strides, x_shifts, y_shifts, total_num_anchors, num_gt, + ): + expanded_strides_per_image = expanded_strides[0] + x_shifts_per_image = x_shifts[0] * expanded_strides_per_image + y_shifts_per_image = y_shifts[0] * expanded_strides_per_image + x_centers_per_image = ( + (x_shifts_per_image + 0.5 * expanded_strides_per_image) + .unsqueeze(0) + .repeat(num_gt, 1) + ) # [n_anchor] -> [n_gt, n_anchor] + y_centers_per_image = ( + (y_shifts_per_image + 0.5 * expanded_strides_per_image) + .unsqueeze(0) + .repeat(num_gt, 1) + ) + + gt_bboxes_per_image_l = ( + (gt_bboxes_per_image[:, 0] - 0.5 * gt_bboxes_per_image[:, 2]) + .unsqueeze(1) + .repeat(1, total_num_anchors) + ) + gt_bboxes_per_image_r = ( + (gt_bboxes_per_image[:, 0] + 0.5 * gt_bboxes_per_image[:, 2]) + .unsqueeze(1) + .repeat(1, total_num_anchors) + ) + gt_bboxes_per_image_t = ( + (gt_bboxes_per_image[:, 1] - 0.5 * gt_bboxes_per_image[:, 3]) + .unsqueeze(1) + .repeat(1, total_num_anchors) + ) + gt_bboxes_per_image_b = ( + (gt_bboxes_per_image[:, 1] + 0.5 * gt_bboxes_per_image[:, 3]) + .unsqueeze(1) + .repeat(1, total_num_anchors) + ) + + b_l = x_centers_per_image - gt_bboxes_per_image_l + b_r = gt_bboxes_per_image_r - x_centers_per_image + b_t = y_centers_per_image - gt_bboxes_per_image_t + b_b = gt_bboxes_per_image_b - y_centers_per_image + bbox_deltas = torch.stack([b_l, b_t, b_r, 
b_b], 2) + + is_in_boxes = bbox_deltas.min(dim=-1).values > 0.0 + is_in_boxes_all = is_in_boxes.sum(dim=0) > 0 + # in fixed center + + center_radius = 2.5 + + gt_bboxes_per_image_l = (gt_bboxes_per_image[:, 0]).unsqueeze(1).repeat( + 1, total_num_anchors + ) - center_radius * expanded_strides_per_image.unsqueeze(0) + gt_bboxes_per_image_r = (gt_bboxes_per_image[:, 0]).unsqueeze(1).repeat( + 1, total_num_anchors + ) + center_radius * expanded_strides_per_image.unsqueeze(0) + gt_bboxes_per_image_t = (gt_bboxes_per_image[:, 1]).unsqueeze(1).repeat( + 1, total_num_anchors + ) - center_radius * expanded_strides_per_image.unsqueeze(0) + gt_bboxes_per_image_b = (gt_bboxes_per_image[:, 1]).unsqueeze(1).repeat( + 1, total_num_anchors + ) + center_radius * expanded_strides_per_image.unsqueeze(0) + + c_l = x_centers_per_image - gt_bboxes_per_image_l + c_r = gt_bboxes_per_image_r - x_centers_per_image + c_t = y_centers_per_image - gt_bboxes_per_image_t + c_b = gt_bboxes_per_image_b - y_centers_per_image + center_deltas = torch.stack([c_l, c_t, c_r, c_b], 2) + is_in_centers = center_deltas.min(dim=-1).values > 0.0 + is_in_centers_all = is_in_centers.sum(dim=0) > 0 + + # in boxes and in centers + is_in_boxes_anchor = is_in_boxes_all | is_in_centers_all + + is_in_boxes_and_center = ( + is_in_boxes[:, is_in_boxes_anchor] & is_in_centers[:, is_in_boxes_anchor] + ) + return is_in_boxes_anchor, is_in_boxes_and_center + + def dynamic_k_matching(self, cost, pair_wise_ious, gt_classes, num_gt, fg_mask): + # Dynamic K + # --------------------------------------------------------------- + matching_matrix = torch.zeros_like(cost) + + ious_in_boxes_matrix = pair_wise_ious + n_candidate_k = 10 + topk_ious, _ = torch.topk(ious_in_boxes_matrix, n_candidate_k, dim=1) + dynamic_ks = torch.clamp(topk_ious.sum(1).int(), min=1) + for gt_idx in range(num_gt): + _, pos_idx = torch.topk( + cost[gt_idx], k=dynamic_ks[gt_idx].item(), largest=False + ) + matching_matrix[gt_idx][pos_idx] = 1.0 + + del 
topk_ious, dynamic_ks, pos_idx + + anchor_matching_gt = matching_matrix.sum(0) + if (anchor_matching_gt > 1).sum() > 0: + cost_min, cost_argmin = torch.min(cost[:, anchor_matching_gt > 1], dim=0) + matching_matrix[:, anchor_matching_gt > 1] *= 0.0 + matching_matrix[cost_argmin, anchor_matching_gt > 1] = 1.0 + fg_mask_inboxes = matching_matrix.sum(0) > 0.0 + num_fg = fg_mask_inboxes.sum().item() + + fg_mask[fg_mask.clone()] = fg_mask_inboxes + + matched_gt_inds = matching_matrix[:, fg_mask_inboxes].argmax(0) + gt_matched_classes = gt_classes[matched_gt_inds] + + pred_ious_this_matching = (matching_matrix * pair_wise_ious).sum(0)[fg_mask_inboxes] + return num_fg, gt_matched_classes, pred_ious_this_matching, matched_gt_inds diff --git a/yolox/models/yolo_pafpn.py b/yolox/models/yolo_pafpn.py new file mode 100644 index 0000000000000000000000000000000000000000..99a35e49a607c65f6ced4d81e9ad8032aaa1825a --- /dev/null +++ b/yolox/models/yolo_pafpn.py @@ -0,0 +1,99 @@ +import torch +import torch.nn as nn + +from .darknet import CSPDarknet +from .network_blocks import BaseConv, CSPLayer, DWConv + + +class YOLOPAFPN(nn.Module): + """ + YOLOv3 model. Darknet 53 is the default backbone of this model. 
+ """ + + def __init__( + self, depth=1.0, width=1.0, in_features=("dark3", "dark4", "dark5"), + in_channels=[256, 512, 1024], depthwise=False, + ): + super().__init__() + self.backbone = CSPDarknet(depth, width, depthwise=depthwise) + self.in_features = in_features + self.in_channels = in_channels + Conv = DWConv if depthwise else BaseConv + + self.upsample = nn.Upsample(scale_factor=2, mode="nearest") + self.lateral_conv0 = BaseConv( + int(in_channels[2] * width), int(in_channels[1] * width), 1, 1 + ) + self.C3_p4 = CSPLayer( + int(2 * in_channels[1] * width), + int(in_channels[1] * width), + round(3 * depth), + False, + depthwise=depthwise, + ) # cat + + self.reduce_conv1 = BaseConv( + int(in_channels[1] * width), int(in_channels[0] * width), 1, 1 + ) + self.C3_p3 = CSPLayer( + int(2 * in_channels[0] * width), + int(in_channels[0] * width), + round(3 * depth), + False, + depthwise=depthwise, + ) + + # bottom-up conv + self.bu_conv2 = Conv(int(in_channels[0] * width), int(in_channels[0] * width), 3, 2) + self.C3_n3 = CSPLayer( + int(2 * in_channels[0] * width), + int(in_channels[1] * width), + round(3 * depth), + False, + depthwise=depthwise, + ) + + # bottom-up conv + self.bu_conv1 = Conv(int(in_channels[1] * width), int(in_channels[1] * width), 3, 2) + self.C3_n4 = CSPLayer( + int(2 * in_channels[1] * width), + int(in_channels[2] * width), + round(3 * depth), + False, + depthwise=depthwise, + ) + + def forward(self, input): + """ + Args: + inputs: input images. + + Returns: + Tuple[Tensor]: FPN feature. 
+ """ + + # backbone + out_features = self.backbone(input) + features = [out_features[f] for f in self.in_features] + [x2, x1, x0] = features + + fpn_out0 = self.lateral_conv0(x0) # 1024->512/32 + f_out0 = self.upsample(fpn_out0) # 512/16 + f_out0 = torch.cat([f_out0, x1], 1) # 512->1024/16 + f_out0 = self.C3_p4(f_out0) # 1024->512/16 + + fpn_out1 = self.reduce_conv1(f_out0) # 512->256/16 + f_out1 = self.upsample(fpn_out1) # 256/8 + f_out1 = torch.cat([f_out1, x2], 1) # 256->512/8 + pan_out2 = self.C3_p3(f_out1) # 512->256/8 + + p_out1 = self.bu_conv2(pan_out2) # 256->256/16 + p_out1 = torch.cat([p_out1, fpn_out1], 1) # 256->512/16 + pan_out1 = self.C3_n3(p_out1) # 512->512/16 + + p_out0 = self.bu_conv1(pan_out1) # 512->512/32 + p_out0 = torch.cat([p_out0, fpn_out0], 1) # 512->1024/32 + pan_out0 = self.C3_n4(p_out0) # 1024->1024/32 + + outputs = (pan_out2, pan_out1, pan_out0) + return outputs diff --git a/yolox/models/yolox.py b/yolox/models/yolox.py new file mode 100644 index 0000000000000000000000000000000000000000..9d4c789c640fbcf860bfbc09fb54e058b2af11ac --- /dev/null +++ b/yolox/models/yolox.py @@ -0,0 +1,45 @@ + +import torch.nn as nn + +from .yolo_head import YOLOXHead +from .yolo_pafpn import YOLOPAFPN + + +class YOLOX(nn.Module): + """ + YOLOX model module. The module list is defined by create_yolov3_modules function. + The network returns loss values from three YOLO layers during training + and detection results during test. 
+ """ + + def __init__(self, backbone=None, head=None): + super().__init__() + if backbone is None: + backbone = YOLOPAFPN() + if head is None: + head = YOLOXHead(80) + + self.backbone = backbone + self.head = head + + def forward(self, x, targets=None): + # fpn output content features of [dark3, dark4, dark5] + fpn_outs = self.backbone(x) + + if self.training: + assert targets is not None + loss, iou_loss, conf_loss, cls_loss, l1_loss, num_fg = self.head( + fpn_outs, targets, x + ) + outputs = { + "total_loss": loss, + "iou_loss": iou_loss, + "l1_loss": l1_loss, + "conf_loss": conf_loss, + "cls_loss": cls_loss, + "num_fg": num_fg, + } + else: + outputs = self.head(fpn_outs) + + return outputs diff --git a/yolox/utils/__init__.py b/yolox/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b212f3888cfae4b83abd9fdef0b3121547058151 --- /dev/null +++ b/yolox/utils/__init__.py @@ -0,0 +1,14 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- + +from .allreduce_norm import * +from .boxes import * +from .checkpoint import load_ckpt, save_checkpoint +from .dist import * +from .ema import ModelEMA +from .logger import setup_logger +from .lr_scheduler import LRScheduler +from .metric import * +from .model_utils import * +from .setup_env import * +from .visualize import * diff --git a/yolox/utils/allreduce_norm.py b/yolox/utils/allreduce_norm.py new file mode 100644 index 0000000000000000000000000000000000000000..2092fe65e185dd3f8462937cc9b21f841ca6823c --- /dev/null +++ b/yolox/utils/allreduce_norm.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- + +import pickle +from collections import OrderedDict + +import torch +from torch import distributed as dist +from torch import nn + +from .dist import _get_global_gloo_group, get_world_size + +ASYNC_NORM = ( + nn.BatchNorm1d, + nn.BatchNorm2d, + nn.BatchNorm3d, + nn.InstanceNorm1d, + nn.InstanceNorm2d, + nn.InstanceNorm3d, +) + +__all__ = [ + "get_async_norm_states", 
"pyobj2tensor", "tensor2pyobj", "all_reduce", "all_reduce_norm" +] + + +def get_async_norm_states(module): + async_norm_states = OrderedDict() + for name, child in module.named_modules(): + if isinstance(child, ASYNC_NORM): + for k, v in child.state_dict().items(): + async_norm_states[".".join([name, k])] = v + return async_norm_states + + +def pyobj2tensor(pyobj, device="cuda"): + """serialize picklable python object to tensor""" + storage = torch.ByteStorage.from_buffer(pickle.dumps(pyobj)) + return torch.ByteTensor(storage).to(device=device) + + +def tensor2pyobj(tensor): + """deserialize tensor to picklable python object""" + return pickle.loads(tensor.cpu().numpy().tobytes()) + + +def _get_reduce_op(op_name): + return { + "sum": dist.ReduceOp.SUM, + "mean": dist.ReduceOp.SUM, + }[op_name.lower()] + + +def all_reduce(py_dict, op="sum", group=None): + """ + Apply all reduce function for python dict object. + NOTE: make sure that every py_dict has the same keys and values are in the same shape. + + Args: + py_dict (dict): dict to apply all reduce op. + op (str): operator, could be "sum" or "mean". + """ + world_size = get_world_size() + if world_size == 1: + return py_dict + if group is None: + group = _get_global_gloo_group() + if dist.get_world_size(group) == 1: + return py_dict + + # all reduce logic across different devices. 
+ py_key = list(py_dict.keys()) + py_key_tensor = pyobj2tensor(py_key) + dist.broadcast(py_key_tensor, src=0) + py_key = tensor2pyobj(py_key_tensor) + + tensor_shapes = [py_dict[k].shape for k in py_key] + tensor_numels = [py_dict[k].numel() for k in py_key] + + flatten_tensor = torch.cat([py_dict[k].flatten() for k in py_key]) + dist.all_reduce(flatten_tensor, op=_get_reduce_op(op)) + if op == "mean": + flatten_tensor /= world_size + + split_tensors = [ + x.reshape(shape) for x, shape in zip( + torch.split(flatten_tensor, tensor_numels), tensor_shapes + ) + ] + return OrderedDict({k: v for k, v in zip(py_key, split_tensors)}) + + +def all_reduce_norm(module): + """ + All reduce norm statistics in different devices. + """ + states = get_async_norm_states(module) + states = all_reduce(states, op="mean") + module.load_state_dict(states, strict=False) diff --git a/yolox/utils/boxes.py b/yolox/utils/boxes.py new file mode 100644 index 0000000000000000000000000000000000000000..a590567288a469348ca99bb000450d2da3f59c3e --- /dev/null +++ b/yolox/utils/boxes.py @@ -0,0 +1,111 @@ +import numpy as np + +import torch +import torchvision + +__all__ = [ + "filter_box", "postprocess", "bboxes_iou", "matrix_iou", + "adjust_box_anns", "xyxy2xywh", +] + + +def filter_box(output, scale_range): + """ + output: (N, 5+class) shape + """ + min_scale, max_scale = scale_range + w = output[:, 2] - output[:, 0] + h = output[:, 3] - output[:, 1] + keep = (w * h > min_scale * min_scale) & (w * h < max_scale * max_scale) + return output[keep] + + +def postprocess(prediction, num_classes, conf_thre=0.7, nms_thre=0.45): + box_corner = prediction.new(prediction.shape) + box_corner[:, :, 0] = prediction[:, :, 0] - prediction[:, :, 2] / 2 + box_corner[:, :, 1] = prediction[:, :, 1] - prediction[:, :, 3] / 2 + box_corner[:, :, 2] = prediction[:, :, 0] + prediction[:, :, 2] / 2 + box_corner[:, :, 3] = prediction[:, :, 1] + prediction[:, :, 3] / 2 + prediction[:, :, :4] = box_corner[:, :, :4] + + 
output = [None for _ in range(len(prediction))] + for i, image_pred in enumerate(prediction): + + # If none are remaining => process next image + if not image_pred.size(0): + continue + # Get score and class with highest confidence + class_conf, class_pred = torch.max(image_pred[:, 5: 5 + num_classes], 1, keepdim=True) + + conf_mask = (image_pred[:, 4] * class_conf.squeeze() >= conf_thre).squeeze() + # _, conf_mask = torch.topk((image_pred[:, 4] * class_conf.squeeze()), 1000) + # Detections ordered as (x1, y1, x2, y2, obj_conf, class_conf, class_pred) + detections = torch.cat((image_pred[:, :5], class_conf, class_pred.float()), 1) + detections = detections[conf_mask] + if not detections.size(0): + continue + + nms_out_index = torchvision.ops.batched_nms( + detections[:, :4], + detections[:, 4] * detections[:, 5], + detections[:, 6], + nms_thre, + ) + detections = detections[nms_out_index] + if output[i] is None: + output[i] = detections + else: + output[i] = torch.cat((output[i], detections)) + + return output + + +def bboxes_iou(bboxes_a, bboxes_b, xyxy=True): + if bboxes_a.shape[1] != 4 or bboxes_b.shape[1] != 4: + raise IndexError + + if xyxy: + tl = torch.max(bboxes_a[:, None, :2], bboxes_b[:, :2]) + br = torch.min(bboxes_a[:, None, 2:], bboxes_b[:, 2:]) + area_a = torch.prod(bboxes_a[:, 2:] - bboxes_a[:, :2], 1) + area_b = torch.prod(bboxes_b[:, 2:] - bboxes_b[:, :2], 1) + else: + tl = torch.max( + (bboxes_a[:, None, :2] - bboxes_a[:, None, 2:] / 2), + (bboxes_b[:, :2] - bboxes_b[:, 2:] / 2), + ) + br = torch.min( + (bboxes_a[:, None, :2] + bboxes_a[:, None, 2:] / 2), + (bboxes_b[:, :2] + bboxes_b[:, 2:] / 2), + ) + + area_a = torch.prod(bboxes_a[:, 2:], 1) + area_b = torch.prod(bboxes_b[:, 2:], 1) + en = (tl < br).type(tl.type()).prod(dim=2) + area_i = torch.prod(br - tl, 2) * en # * ((tl < br).all()) + return area_i / (area_a[:, None] + area_b - area_i) + + +def matrix_iou(a, b): + """ + return iou of a and b, numpy version for data augenmentation + """ + lt 
= np.maximum(a[:, np.newaxis, :2], b[:, :2]) + rb = np.minimum(a[:, np.newaxis, 2:], b[:, 2:]) + + area_i = np.prod(rb - lt, axis=2) * (lt < rb).all(axis=2) + area_a = np.prod(a[:, 2:] - a[:, :2], axis=1) + area_b = np.prod(b[:, 2:] - b[:, :2], axis=1) + return area_i / (area_a[:, np.newaxis] + area_b - area_i + 1e-12) + + +def adjust_box_anns(bbox, scale_ratio, padw, padh, w_max, h_max): + bbox[:, 0::2] = np.clip(bbox[:, 0::2] * scale_ratio + padw, 0, w_max) + bbox[:, 1::2] = np.clip(bbox[:, 1::2] * scale_ratio + padh, 0, h_max) + return bbox + + +def xyxy2xywh(bboxes): + bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 0] + bboxes[:, 3] = bboxes[:, 3] - bboxes[:, 1] + return bboxes diff --git a/yolox/utils/checkpoint.py b/yolox/utils/checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..b5760472db287a5066850bfcde2ddd2d484e16dd --- /dev/null +++ b/yolox/utils/checkpoint.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +import os +import shutil +from loguru import logger + +import torch + + +def load_ckpt(model, ckpt): + model_state_dict = model.state_dict() + load_dict = {} + for key_model, v in model_state_dict.items(): + if key_model not in ckpt: + logger.warning( + "{} is not in the ckpt. 
Please double check and see if this is desired.".format( + key_model + ) + ) + continue + v_ckpt = ckpt[key_model] + if v.shape != v_ckpt.shape: + logger.warning( + "Shape of {} in checkpoint is {}, while shape of {} in model is {}.".format( + key_model, v_ckpt.shape, key_model, v.shape + ) + ) + continue + load_dict[key_model] = v_ckpt + + model.load_state_dict(load_dict, strict=False) + return model + + +def save_checkpoint(state, is_best, save_dir, model_name=""): + if not os.path.exists(save_dir): + os.makedirs(save_dir) + filename = os.path.join(save_dir, model_name + "_ckpt.pth.tar") + torch.save(state, filename) + if is_best: + best_filename = os.path.join(save_dir, "best_ckpt.pth.tar") + shutil.copyfile(filename, best_filename) diff --git a/yolox/utils/dist.py b/yolox/utils/dist.py new file mode 100644 index 0000000000000000000000000000000000000000..ed6419b36b3aa07515efb81925cb6fb6b4a9f030 --- /dev/null +++ b/yolox/utils/dist.py @@ -0,0 +1,254 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# This file comes from +# https://github.com/facebookresearch/detectron2/blob/master/detectron2/utils/comm.py +# Copyright (c) Facebook, Inc. and its affiliates. +""" +This file contains primitives for multi-gpu communication. +This is useful when doing distributed training. 
+""" + +import functools +import logging +import pickle +import time + +import numpy as np + +import torch +from torch import distributed as dist + +__all__ = [ + "is_main_process", + "synchronize", + "get_world_size", + "get_rank", + "get_local_rank", + "get_local_size", + "time_synchronized", + "gather", + "all_gather", +] + +_LOCAL_PROCESS_GROUP = None + + +def synchronize(): + """ + Helper function to synchronize (barrier) among all processes when using distributed training + """ + if not dist.is_available(): + return + if not dist.is_initialized(): + return + world_size = dist.get_world_size() + if world_size == 1: + return + dist.barrier() + + +def get_world_size() -> int: + if not dist.is_available(): + return 1 + if not dist.is_initialized(): + return 1 + return dist.get_world_size() + + +def get_rank() -> int: + if not dist.is_available(): + return 0 + if not dist.is_initialized(): + return 0 + return dist.get_rank() + + +def get_local_rank() -> int: + """ + Returns: + The rank of the current process within the local (per-machine) process group. + """ + if not dist.is_available(): + return 0 + if not dist.is_initialized(): + return 0 + assert _LOCAL_PROCESS_GROUP is not None + return dist.get_rank(group=_LOCAL_PROCESS_GROUP) + + +def get_local_size() -> int: + """ + Returns: + The size of the per-machine process group, i.e. the number of processes per machine. + """ + if not dist.is_available(): + return 1 + if not dist.is_initialized(): + return 1 + return dist.get_world_size(group=_LOCAL_PROCESS_GROUP) + + +def is_main_process() -> bool: + return get_rank() == 0 + + +@functools.lru_cache() +def _get_global_gloo_group(): + """ + Return a process group based on gloo backend, containing all the ranks + The result is cached. 
+ """ + if dist.get_backend() == "nccl": + return dist.new_group(backend="gloo") + else: + return dist.group.WORLD + + +def _serialize_to_tensor(data, group): + backend = dist.get_backend(group) + assert backend in ["gloo", "nccl"] + device = torch.device("cpu" if backend == "gloo" else "cuda") + + buffer = pickle.dumps(data) + if len(buffer) > 1024 ** 3: + logger = logging.getLogger(__name__) + logger.warning( + "Rank {} trying to all-gather {:.2f} GB of data on device {}".format( + get_rank(), len(buffer) / (1024 ** 3), device + ) + ) + storage = torch.ByteStorage.from_buffer(buffer) + tensor = torch.ByteTensor(storage).to(device=device) + return tensor + + +def _pad_to_largest_tensor(tensor, group): + """ + Returns: + list[int]: size of the tensor, on each rank + Tensor: padded tensor that has the max size + """ + world_size = dist.get_world_size(group=group) + assert ( + world_size >= 1 + ), "comm.gather/all_gather must be called from ranks within the given group!" + local_size = torch.tensor([tensor.numel()], dtype=torch.int64, device=tensor.device) + size_list = [ + torch.zeros([1], dtype=torch.int64, device=tensor.device) + for _ in range(world_size) + ] + dist.all_gather(size_list, local_size, group=group) + size_list = [int(size.item()) for size in size_list] + + max_size = max(size_list) + + # we pad the tensor because torch all_gather does not support + # gathering tensors of different shapes + if local_size != max_size: + padding = torch.zeros( + (max_size - local_size,), dtype=torch.uint8, device=tensor.device + ) + tensor = torch.cat((tensor, padding), dim=0) + return size_list, tensor + + +def all_gather(data, group=None): + """ + Run all_gather on arbitrary picklable data (not necessarily tensors). + + Args: + data: any picklable object + group: a torch process group. By default, will use a group which + contains all ranks on gloo backend. 
+ Returns: + list[data]: list of data gathered from each rank + """ + if get_world_size() == 1: + return [data] + if group is None: + group = _get_global_gloo_group() + if dist.get_world_size(group) == 1: + return [data] + + tensor = _serialize_to_tensor(data, group) + + size_list, tensor = _pad_to_largest_tensor(tensor, group) + max_size = max(size_list) + + # receiving Tensor from all ranks + tensor_list = [ + torch.empty((max_size,), dtype=torch.uint8, device=tensor.device) + for _ in size_list + ] + dist.all_gather(tensor_list, tensor, group=group) + + data_list = [] + for size, tensor in zip(size_list, tensor_list): + buffer = tensor.cpu().numpy().tobytes()[:size] + data_list.append(pickle.loads(buffer)) + + return data_list + + +def gather(data, dst=0, group=None): + """ + Run gather on arbitrary picklable data (not necessarily tensors). + + Args: + data: any picklable object + dst (int): destination rank + group: a torch process group. By default, will use a group which + contains all ranks on gloo backend. + + Returns: + list[data]: on dst, a list of data gathered from each rank. Otherwise, + an empty list. 
+ """ + if get_world_size() == 1: + return [data] + if group is None: + group = _get_global_gloo_group() + if dist.get_world_size(group=group) == 1: + return [data] + rank = dist.get_rank(group=group) + + tensor = _serialize_to_tensor(data, group) + size_list, tensor = _pad_to_largest_tensor(tensor, group) + + # receiving Tensor from all ranks + if rank == dst: + max_size = max(size_list) + tensor_list = [ + torch.empty((max_size,), dtype=torch.uint8, device=tensor.device) + for _ in size_list + ] + dist.gather(tensor, tensor_list, dst=dst, group=group) + + data_list = [] + for size, tensor in zip(size_list, tensor_list): + buffer = tensor.cpu().numpy().tobytes()[:size] + data_list.append(pickle.loads(buffer)) + return data_list + else: + dist.gather(tensor, [], dst=dst, group=group) + return [] + + +def shared_random_seed(): + """ + Returns: + int: a random number that is the same across all workers. + If workers need a shared RNG, they can use this shared seed to + create one. + All workers must call this function, otherwise it will deadlock. + """ + ints = np.random.randint(2 ** 31) + all_ints = all_gather(ints) + return all_ints[0] + + +def time_synchronized(): + # pytorch-accurate time + if torch.cuda.is_available(): + torch.cuda.synchronize() + return time.time() diff --git a/yolox/utils/ema.py b/yolox/utils/ema.py new file mode 100644 index 0000000000000000000000000000000000000000..16b78c2c9b2d85119bfd34ae3f69676830bb3f4a --- /dev/null +++ b/yolox/utils/ema.py @@ -0,0 +1,69 @@ +import math +from copy import deepcopy + +import apex +import torch +import torch.nn as nn + + +def is_parallel(model): + """check if model is in parallel mode.""" + parallel_type = ( + nn.parallel.DataParallel, + nn.parallel.DistributedDataParallel, + apex.parallel.distributed.DistributedDataParallel, + ) + return isinstance(model, parallel_type) + + +def copy_attr(a, b, include=(), exclude=()): + # Copy attributes from b to a, options to only include [...] and to exclude [...] 
+ for k, v in b.__dict__.items(): + if (len(include) and k not in include) or k.startswith("_") or k in exclude: + continue + else: + setattr(a, k, v) + + +class ModelEMA: + """ + Model Exponential Moving Average from https://github.com/rwightman/pytorch-image-models + Keep a moving average of everything in the model state_dict (parameters and buffers). + This is intended to allow functionality like + https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage + A smoothed version of the weights is necessary for some training schemes to perform well. + This class is sensitive where it is initialized in the sequence of model init, + GPU assignment and distributed training wrappers. + """ + def __init__(self, model, decay=0.9999, updates=0): + """ + Args: + model (nn.Module): model to apply EMA. + decay (float): ema decay reate. + updates (int): counter of EMA updates. + """ + # Create EMA(FP32) + self.ema = deepcopy(model.module if is_parallel(model) else model).eval() + self.updates = updates + # decay exponential ramp (to help early epochs) + self.decay = lambda x: decay * (1 - math.exp(-x / 2000)) + for p in self.ema.parameters(): + p.requires_grad_(False) + + def update(self, model): + # Update EMA parameters + with torch.no_grad(): + self.updates += 1 + d = self.decay(self.updates) + + msd = ( + model.module.state_dict() if is_parallel(model) else model.state_dict() + ) # model state_dict + for k, v in self.ema.state_dict().items(): + if v.dtype.is_floating_point: + v *= d + v += (1.0 - d) * msd[k].detach() + + def update_attr(self, model, include=(), exclude=("process_group", "reducer")): + # Update EMA attributes + copy_attr(self.ema, model, include, exclude) diff --git a/yolox/utils/logger.py b/yolox/utils/logger.py new file mode 100644 index 0000000000000000000000000000000000000000..8f329a2fd7f3ea4911281df9bd601d24c01f8a1e --- /dev/null +++ b/yolox/utils/logger.py @@ -0,0 +1,90 @@ +import inspect +import os +import sys +from loguru import 
logger + + +def get_caller_name(depth=0): + """ + Args: + depth (int): Depth of caller conext, use 0 for caller depth. Default value: 0. + + Returns: + str: module name of the caller + """ + # the following logic is a little bit faster than inspect.stack() logic + frame = inspect.currentframe().f_back + for _ in range(depth): + frame = frame.f_back + + return frame.f_globals["__name__"] + + +class StreamToLoguru: + """ + stream object that redirects writes to a logger instance. + """ + def __init__(self, level="INFO", caller_names=("apex", "pycocotools")): + """ + Args: + level(str): log level string of loguru. Default value: "INFO". + caller_names(tuple): caller names of redirected module. + Default value: (apex, pycocotools). + """ + self.level = level + self.linebuf = "" + self.caller_names = caller_names + + def write(self, buf): + full_name = get_caller_name(depth=1) + module_name = full_name.rsplit(".", maxsplit=-1)[0] + if module_name in self.caller_names: + for line in buf.rstrip().splitlines(): + # use caller level log + logger.opt(depth=2).log(self.level, line.rstrip()) + else: + sys.__stdout__.write(buf) + + def flush(self): + pass + + +def redirect_sys_output(log_level="INFO"): + redirect_logger = StreamToLoguru(log_level) + sys.stderr = redirect_logger + sys.stdout = redirect_logger + + +def setup_logger(save_dir, distributed_rank=0, filename="log.txt", mode="a"): + """setup logger for training and testing. + Args: + save_dir(str): location to save log file + distributed_rank(int): device rank when multi-gpu environment + filename (string): log save name. + mode(str): log file write mode, `append` or `override`. default is `a`. + + Return: + logger instance. 
+ """ + loguru_format = ( + "{time:YYYY-MM-DD HH:mm:ss} | " + "{level: <8} | " + "{name}:{line} - {message}" + ) + + logger.remove() + save_file = os.path.join(save_dir, filename) + if mode == "o" and os.path.exists(save_file): + os.remove(save_file) + # only keep logger in rank0 process + if distributed_rank == 0: + logger.add( + sys.stderr, + format=loguru_format, + level="INFO", + enqueue=True, + ) + logger.add(save_file) + + # redirect stdout/stderr to loguru + redirect_sys_output("INFO") diff --git a/yolox/utils/lr_scheduler.py b/yolox/utils/lr_scheduler.py new file mode 100644 index 0000000000000000000000000000000000000000..09bd56b664ae6bb30ad19336187cb02270e55008 --- /dev/null +++ b/yolox/utils/lr_scheduler.py @@ -0,0 +1,197 @@ +import math +from functools import partial + + +class LRScheduler: + def __init__(self, name, lr, iters_per_epoch, total_epochs, **kwargs): + """ + Supported lr schedulers: [cos, warmcos, multistep] + + Extra keyword arguments: + - cos: None + - warmcos: [warmup_epochs, warmup_lr_start (default 1e-6)] + - multistep: [milestones (epochs), gamma (default 0.1)] + """ + + self.lr = lr + self.iters_per_epoch = iters_per_epoch + self.total_epochs = total_epochs + self.total_iters = iters_per_epoch * total_epochs + + self.__dict__.update(kwargs) + + self.lr_func = self._get_lr_func(name) + + def update_lr(self, iters): + return self.lr_func(iters) + + def _get_lr_func(self, name): + if name == "cos": # cosine lr schedule + lr_func = partial(cos_lr, self.lr, self.total_iters) + elif name == "warmcos": + warmup_total_iters = self.iters_per_epoch * self.warmup_epochs + warmup_lr_start = getattr(self, "warmup_lr_start", 1e-6) + lr_func = partial( + warm_cos_lr, + self.lr, + self.total_iters, + warmup_total_iters, + warmup_lr_start, + ) + elif name == "yoloxwarmcos": + warmup_total_iters = self.iters_per_epoch * self.warmup_epochs + no_aug_iters = self.iters_per_epoch * self.no_aug_epochs + warmup_lr_start = getattr(self, "warmup_lr_start", 0) + 
min_lr_ratio = getattr(self, "min_lr_ratio", 0.2) + lr_func = partial( + yolox_warm_cos_lr, + self.lr, + min_lr_ratio, + self.total_iters, + warmup_total_iters, + warmup_lr_start, + no_aug_iters, + ) + elif name == "yoloxsemiwarmcos": + warmup_lr_start = getattr(self, "warmup_lr_start", 0) + min_lr_ratio = getattr(self, "min_lr_ratio", 0.2) + warmup_total_iters = self.iters_per_epoch * self.warmup_epochs + no_aug_iters = self.iters_per_epoch * self.no_aug_epochs + normal_iters = self.iters_per_epoch * self.semi_epoch + semi_iters = self.iters_per_epoch_semi * ( + self.total_epochs - self.semi_epoch - self.no_aug_epochs + ) + lr_func = partial( + yolox_semi_warm_cos_lr, + self.lr, + min_lr_ratio, + warmup_lr_start, + self.total_iters, + normal_iters, + no_aug_iters, + warmup_total_iters, + semi_iters, + self.iters_per_epoch, + self.iters_per_epoch_semi, + ) + elif name == "multistep": # stepwise lr schedule + milestones = [ + int(self.total_iters * milestone / self.total_epochs) + for milestone in self.milestones + ] + gamma = getattr(self, "gamma", 0.1) + lr_func = partial(multistep_lr, self.lr, milestones, gamma) + else: + raise ValueError("Scheduler version {} not supported.".format(name)) + return lr_func + + +def cos_lr(lr, total_iters, iters): + """Cosine learning rate""" + lr *= 0.5 * (1.0 + math.cos(math.pi * iters / total_iters)) + return lr + + +def warm_cos_lr(lr, total_iters, warmup_total_iters, warmup_lr_start, iters): + """Cosine learning rate with warm up.""" + if iters <= warmup_total_iters: + lr = (lr - warmup_lr_start) * iters / float( + warmup_total_iters + ) + warmup_lr_start + else: + lr *= 0.5 * ( + 1.0 + + math.cos( + math.pi + * (iters - warmup_total_iters) + / (total_iters - warmup_total_iters) + ) + ) + return lr + + +def yolox_warm_cos_lr( + lr, + min_lr_ratio, + total_iters, + warmup_total_iters, + warmup_lr_start, + no_aug_iter, + iters, +): + """Cosine learning rate with warm up.""" + min_lr = lr * min_lr_ratio + if iters <= 
warmup_total_iters: + # lr = (lr - warmup_lr_start) * iters / float(warmup_total_iters) + warmup_lr_start + lr = (lr - warmup_lr_start) * pow( + iters / float(warmup_total_iters), 2 + ) + warmup_lr_start + elif iters >= total_iters - no_aug_iter: + lr = min_lr + else: + lr = min_lr + 0.5 * (lr - min_lr) * ( + 1.0 + + math.cos( + math.pi + * (iters - warmup_total_iters) + / (total_iters - warmup_total_iters - no_aug_iter) + ) + ) + return lr + + +def yolox_semi_warm_cos_lr( + lr, + min_lr_ratio, + warmup_lr_start, + total_iters, + normal_iters, + no_aug_iters, + warmup_total_iters, + semi_iters, + iters_per_epoch, + iters_per_epoch_semi, + iters, +): + """Cosine learning rate with warm up.""" + min_lr = lr * min_lr_ratio + if iters <= warmup_total_iters: + # lr = (lr - warmup_lr_start) * iters / float(warmup_total_iters) + warmup_lr_start + lr = (lr - warmup_lr_start) * pow( + iters / float(warmup_total_iters), 2 + ) + warmup_lr_start + elif iters >= normal_iters + semi_iters: + lr = min_lr + elif iters <= normal_iters: + lr = min_lr + 0.5 * (lr - min_lr) * ( + 1.0 + + math.cos( + math.pi + * (iters - warmup_total_iters) + / (total_iters - warmup_total_iters - no_aug_iters) + ) + ) + else: + lr = min_lr + 0.5 * (lr - min_lr) * ( + 1.0 + + math.cos( + math.pi + * ( + normal_iters + - warmup_total_iters + + (iters - normal_iters) + * iters_per_epoch + * 1.0 + / iters_per_epoch_semi + ) + / (total_iters - warmup_total_iters - no_aug_iters) + ) + ) + return lr + + +def multistep_lr(lr, milestones, gamma, iters): + """MultiStep learning rate""" + for milestone in milestones: + lr *= gamma if iters >= milestone else 1.0 + return lr diff --git a/yolox/utils/metric.py b/yolox/utils/metric.py new file mode 100644 index 0000000000000000000000000000000000000000..e34e9dad12706efecfbced69ed01b0245001b68f --- /dev/null +++ b/yolox/utils/metric.py @@ -0,0 +1,121 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 
+import functools +import os +import time +from collections import defaultdict, deque + +import numpy as np + +import torch + +__all__ = [ + "AverageMeter", + "MeterBuffer", + "get_total_and_free_memory_in_Mb", + "occumpy_mem", + "gpu_mem_usage", +] + + +def get_total_and_free_memory_in_Mb(cuda_device): + devices_info_str = os.popen( + "nvidia-smi --query-gpu=memory.total,memory.used --format=csv,nounits,noheader" + ) + devices_info = devices_info_str.read().strip().split("\n") + total, used = devices_info[int(cuda_device)].split(",") + return int(total), int(used) + + +def occumpy_mem(cuda_device, mem_ratio=0.9): + """ + pre-allocate gpu memory for training to avoid memory Fragmentation. + """ + total, used = get_total_and_free_memory_in_Mb(cuda_device) + max_mem = int(total * mem_ratio) + block_mem = max_mem - used + x = torch.cuda.FloatTensor(256, 1024, block_mem) + del x + time.sleep(5) + + +def gpu_mem_usage(): + """ + Compute the GPU memory usage for the current device (MB). + """ + mem_usage_bytes = torch.cuda.max_memory_allocated() + return mem_usage_bytes / (1024 * 1024) + + +class AverageMeter: + """Track a series of values and provide access to smoothed values over a + window or the global series average. + """ + + def __init__(self, window_size=50): + self._deque = deque(maxlen=window_size) + self._total = 0.0 + self._count = 0 + + def update(self, value): + self._deque.append(value) + self._count += 1 + self._total += value + + @property + def median(self): + d = np.array(list(self._deque)) + return np.median(d) + + @property + def avg(self): + # if deque is empty, nan will be returned. 
+ d = np.array(list(self._deque)) + return d.mean() + + @property + def global_avg(self): + return self._total / max(self._count, 1e-5) + + @property + def latest(self): + return self._deque[-1] if len(self._deque) > 0 else None + + @property + def total(self): + return self._total + + def reset(self): + self._deque.clear() + self._total = 0.0 + self._count = 0 + + def clear(self): + self._deque.clear() + + +class MeterBuffer(defaultdict): + """Computes and stores the average and current value""" + + def __init__(self, window_size=20): + factory = functools.partial(AverageMeter, window_size=window_size) + super().__init__(factory) + + def reset(self): + for v in self.values(): + v.reset() + + def get_filtered_meter(self, filter_key="time"): + return {k: v for k, v in self.items() if filter_key in k} + + def update(self, values=None, **kwargs): + if values is None: + values = {} + values.update(kwargs) + for k, v in values.items(): + self[k].update(v) + + def clear_meters(self): + for v in self.values(): + v.clear() diff --git a/yolox/utils/model_utils.py b/yolox/utils/model_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..704afbfd4f3c6ea0a088f2bc8d5f9763a31ea549 --- /dev/null +++ b/yolox/utils/model_utils.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +from copy import deepcopy + +import torch +import torch.nn as nn +from thop import profile + +__all__ = [ + "fuse_conv_and_bn", "fuse_model", "get_model_info", "replace_module", +] + + +def get_model_info(model, tsize): + + stride = 64 + img = torch.zeros((1, 3, stride, stride), device=next(model.parameters()).device) + flops, params = profile(deepcopy(model), inputs=(img,), verbose=False) + params /= 1e6 + flops /= 1e9 + flops *= tsize[0] * tsize[1] / stride / stride * 2 # Gflops + info = "Params: {:.2f}M, Gflops: {:.2f}".format(params, flops) + return info + + +def fuse_conv_and_bn(conv, bn): + # Fuse convolution and batchnorm layers 
https://tehnokv.com/posts/fusing-batchnorm-and-conv/ + fusedconv = ( + nn.Conv2d( + conv.in_channels, + conv.out_channels, + kernel_size=conv.kernel_size, + stride=conv.stride, + padding=conv.padding, + groups=conv.groups, + bias=True, + ) + .requires_grad_(False) + .to(conv.weight.device) + ) + + # prepare filters + w_conv = conv.weight.clone().view(conv.out_channels, -1) + w_bn = torch.diag(bn.weight.div(torch.sqrt(bn.eps + bn.running_var))) + fusedconv.weight.copy_(torch.mm(w_bn, w_conv).view(fusedconv.weight.shape)) + + # prepare spatial bias + b_conv = ( + torch.zeros(conv.weight.size(0), device=conv.weight.device) + if conv.bias is None + else conv.bias + ) + b_bn = bn.bias - bn.weight.mul(bn.running_mean).div( + torch.sqrt(bn.running_var + bn.eps) + ) + fusedconv.bias.copy_(torch.mm(w_bn, b_conv.reshape(-1, 1)).reshape(-1) + b_bn) + + return fusedconv + + +def fuse_model(model): + from yolox.models.network_blocks import BaseConv + + for m in model.modules(): + if type(m) is BaseConv and hasattr(m, "bn"): + m.conv = fuse_conv_and_bn(m.conv, m.bn) # update conv + delattr(m, "bn") # remove batchnorm + m.forward = m.fuseforward # update forward + return model + + +def replace_module(module, replaced_module_type, new_module_type, replace_func=None): + """ + Replace given type in module to a new type. mostly used in deploy. + + Args: + module (nn.Module): model to apply replace operation. + replaced_module_type (Type): module type to be replaced. + new_module_type (Type) + replace_func (function): python function to describe replace logic. Defalut value None. + + Returns: + model (nn.Module): module that already been replaced. 
+ """ + def default_replace_func(replaced_module_type, new_module_type): + return new_module_type() + + if replace_func is None: + replace_func = default_replace_func + + model = module + if isinstance(module, replaced_module_type): + model = replace_func(replaced_module_type, new_module_type) + else: # recurrsively replace + for name, child in module.named_children(): + new_child = replace_module(child, replaced_module_type, new_module_type) + if new_child is not child: # child is already replaced + model.add_module(name, new_child) + + return model diff --git a/yolox/utils/setup_env.py b/yolox/utils/setup_env.py new file mode 100644 index 0000000000000000000000000000000000000000..c9af2604c2d3b49497d66fa21e53871f771d4962 --- /dev/null +++ b/yolox/utils/setup_env.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + +import os +import resource +import subprocess + +import cv2 + +__all__ = ["configure_nccl", "configure_module"] + + +def configure_nccl(): + """Configure multi-machine environment variables of NCCL.""" + os.environ["NCCL_SOCKET_IFNAME"] = "ib0" + os.environ["GLOO_SOCKET_IFNAME"] = "ib0" + os.environ["NCCL_IB_DISABLE"] = "1" + + os.environ["NCCL_LAUNCH_MODE"] = "PARALLEL" + os.environ["NCCL_IB_HCA"] = subprocess.getoutput( + "cd /sys/class/infiniband/ > /dev/null; for i in mlx5_*; " + "do cat $i/ports/1/gid_attrs/types/* 2>/dev/null " + "| grep v >/dev/null && echo $i ; done; > /dev/null" + ) + os.environ["NCCL_IB_GID_INDEX"] = "3" + os.environ["NCCL_IB_TC"] = "106" + + +def configure_module(ulimit_value=8192): + """ + Configure pytorch module environment. setting of ulimit and cv2 will be set. + + Args: + ulimit_value(int): default open file number on linux. Default value: 4096. 
+ """ + # system setting + rlimit = resource.getrlimit(resource.RLIMIT_NOFILE) + resource.setrlimit(resource.RLIMIT_NOFILE, (ulimit_value, rlimit[1])) + # cv2 + # multiprocess might be harmful on performance of torch dataloader + os.environ["OPENCV_OPENCL_RUNTIME"] = "disabled" + cv2.setNumThreads(0) + cv2.ocl.setUseOpenCL(False) diff --git a/yolox/utils/visualize.py b/yolox/utils/visualize.py new file mode 100644 index 0000000000000000000000000000000000000000..44ea54509f536fcfc132e0a08b79b9a34ddca20d --- /dev/null +++ b/yolox/utils/visualize.py @@ -0,0 +1,124 @@ +# -*- coding: utf-8 -*- + +import cv2 +import numpy as np + + +def vis(img, boxes, scores, cls_ids, conf=0.5, class_names=None): + + for i in range(len(boxes)): + box = boxes[i] + cls_id = int(cls_ids[i]) + score = scores[i] + if score < conf: + continue + x0 = int(box[0]) + y0 = int(box[1]) + x1 = int(box[0] + box[2]) + y1 = int(box[1] + box[3]) + + color = (_COLORS[cls_id] * 255).astype(np.uint8).tolist() + text = '{}:{:.1f}%'.format(class_names[cls_id], score * 100) + txt_color = (0, 0, 0) if np.mean(_COLORS[cls_id]) > 0.5 else (255, 255, 255) + font = cv2.FONT_HERSHEY_COMPLEX + + txt_size = cv2.getTextSize(text, font, 0.4, 1)[0] + cv2.rectangle(img, (x0, y0), (x1, y1), color, 2) + + txt_bk_color = (_COLORS[cls_id] * 255 * 0.7).astype(np.uint8).tolist() + cv2.rectangle( + img, + (x0, y0 + 1), + (x0 + txt_size[0] + 1, y0 + int(1.5*txt_size[1])), + txt_bk_color, + -1 + ) + cv2.putText(img, text, (x0, y0 + txt_size[1]), font, 0.4, txt_color, thickness=1) + + return img + + +_COLORS = np.array( + [ + 0.000, 0.447, 0.741, + 0.850, 0.325, 0.098, + 0.929, 0.694, 0.125, + 0.494, 0.184, 0.556, + 0.466, 0.674, 0.188, + 0.301, 0.745, 0.933, + 0.635, 0.078, 0.184, + 0.300, 0.300, 0.300, + 0.600, 0.600, 0.600, + 1.000, 0.000, 0.000, + 1.000, 0.500, 0.000, + 0.749, 0.749, 0.000, + 0.000, 1.000, 0.000, + 0.000, 0.000, 1.000, + 0.667, 0.000, 1.000, + 0.333, 0.333, 0.000, + 0.333, 0.667, 0.000, + 0.333, 1.000, 0.000, + 
0.667, 0.333, 0.000, + 0.667, 0.667, 0.000, + 0.667, 1.000, 0.000, + 1.000, 0.333, 0.000, + 1.000, 0.667, 0.000, + 1.000, 1.000, 0.000, + 0.000, 0.333, 0.500, + 0.000, 0.667, 0.500, + 0.000, 1.000, 0.500, + 0.333, 0.000, 0.500, + 0.333, 0.333, 0.500, + 0.333, 0.667, 0.500, + 0.333, 1.000, 0.500, + 0.667, 0.000, 0.500, + 0.667, 0.333, 0.500, + 0.667, 0.667, 0.500, + 0.667, 1.000, 0.500, + 1.000, 0.000, 0.500, + 1.000, 0.333, 0.500, + 1.000, 0.667, 0.500, + 1.000, 1.000, 0.500, + 0.000, 0.333, 1.000, + 0.000, 0.667, 1.000, + 0.000, 1.000, 1.000, + 0.333, 0.000, 1.000, + 0.333, 0.333, 1.000, + 0.333, 0.667, 1.000, + 0.333, 1.000, 1.000, + 0.667, 0.000, 1.000, + 0.667, 0.333, 1.000, + 0.667, 0.667, 1.000, + 0.667, 1.000, 1.000, + 1.000, 0.000, 1.000, + 1.000, 0.333, 1.000, + 1.000, 0.667, 1.000, + 0.333, 0.000, 0.000, + 0.500, 0.000, 0.000, + 0.667, 0.000, 0.000, + 0.833, 0.000, 0.000, + 1.000, 0.000, 0.000, + 0.000, 0.167, 0.000, + 0.000, 0.333, 0.000, + 0.000, 0.500, 0.000, + 0.000, 0.667, 0.000, + 0.000, 0.833, 0.000, + 0.000, 1.000, 0.000, + 0.000, 0.000, 0.167, + 0.000, 0.000, 0.333, + 0.000, 0.000, 0.500, + 0.000, 0.000, 0.667, + 0.000, 0.000, 0.833, + 0.000, 0.000, 1.000, + 0.000, 0.000, 0.000, + 0.143, 0.143, 0.143, + 0.286, 0.286, 0.286, + 0.429, 0.429, 0.429, + 0.571, 0.571, 0.571, + 0.714, 0.714, 0.714, + 0.857, 0.857, 0.857, + 0.000, 0.447, 0.741, + 0.314, 0.717, 0.741, + 0.50, 0.5, 0 + ] +).astype(np.float32).reshape(-1, 3)