diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..cf2841044ee43a674119c780a43facf2d1180b69
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,214 @@
+### Linux ###
+*~
+
+# temporary files which can be created if a process still has a handle open of a deleted file
+.fuse_hidden*
+
+# KDE directory preferences
+.directory
+
+# Linux trash folder which might appear on any partition or disk
+.Trash-*
+
+# .nfs files are created when an open file is removed but is still being accessed
+.nfs*
+
+### PyCharm ###
+# User-specific stuff
+.idea
+
+# CMake
+cmake-build-*/
+
+# Mongo Explorer plugin
+.idea/**/mongoSettings.xml
+
+# File-based project format
+*.iws
+
+# IntelliJ
+out/
+
+# mpeltonen/sbt-idea plugin
+.idea_modules/
+
+# JIRA plugin
+atlassian-ide-plugin.xml
+
+# Cursive Clojure plugin
+.idea/replstate.xml
+
+# Crashlytics plugin (for Android Studio and IntelliJ)
+com_crashlytics_export_strings.xml
+crashlytics.properties
+crashlytics-build.properties
+fabric.properties
+
+# Editor-based Rest Client
+.idea/httpRequests
+
+# Android studio 3.1+ serialized cache file
+.idea/caches/build_file_checksums.ser
+
+# JetBrains templates
+**___jb_tmp___
+
+### Python ###
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+docs/build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don’t work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+### Vim ###
+# Swap
+[._]*.s[a-v][a-z]
+[._]*.sw[a-p]
+[._]s[a-rt-v][a-z]
+[._]ss[a-gi-z]
+[._]sw[a-p]
+
+# Session
+Session.vim
+
+# Temporary
+.netrwhist
+# Auto-generated tag files
+tags
+# Persistent undo
+[._]*.un~
+
+# output
+docs/api
+.code-workspace.code-workspace
+*.pkl
+*.npy
+*.pth
+*.onnx
+events.out.tfevents*
+
+# vscode
+*.code-workspace
+.vscode
+
+# vim
+.vim
diff --git a/LICENSE b/LICENSE
index 261eeb9e9f8b2b4b0d119366dda99c6fd7d35c64..cc87e8683f8accf92fb441738e981d6ab8ce7536 100644
--- a/LICENSE
+++ b/LICENSE
@@ -178,7 +178,7 @@
    APPENDIX: How to apply the Apache License to your work.
 
       To apply the Apache License to your work, attach the following
-      boilerplate notice, with the fields enclosed by brackets "[]"
+      boilerplate notice, with the fields enclosed by brackets "{}"
       replaced with your own identifying information. (Don't include
       the brackets!)  The text should be enclosed in the appropriate
       comment syntax for the file format. We also recommend that a
@@ -186,7 +186,7 @@
       same "printed page" as the copyright notice for easier
       identification within third-party archives.
 
-   Copyright [yyyy] [name of copyright owner]
+   Copyright 2021 Megvii, Base Detection
 
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
diff --git a/demo/ncnn/yolox.cpp b/demo/ncnn/yolox.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0ce4f4b4fd62f82a64c0d589244d80dd3d1681a7
--- /dev/null
+++ b/demo/ncnn/yolox.cpp
@@ -0,0 +1,419 @@
+// This file is written based on the following file:
+// https://github.com/Tencent/ncnn/blob/master/examples/yolov5.cpp
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+// ------------------------------------------------------------------------------
+// Copyright (C) 2020-2021, Megvii Inc. All rights reserved.
+
+#include "layer.h"
+#include "net.h"
+
+#if defined(USE_NCNN_SIMPLEOCV)
+#include "simpleocv.h"
+#else
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#endif
+#include <float.h>
+#include <stdio.h>
+#include <vector>
+
+// YOLOX uses the same Focus layer as YOLOv5
+class YoloV5Focus : public ncnn::Layer
+{
+public:
+    YoloV5Focus()
+    {
+        one_blob_only = true; // this layer implements the single-input, single-output forward()
+    }
+    // Space-to-depth copy: output is (w/2, h/2, 4*channels), built from every second input pixel.
+    virtual int forward(const ncnn::Mat& bottom_blob, ncnn::Mat& top_blob, const ncnn::Option& opt) const
+    {
+        int w = bottom_blob.w;
+        int h = bottom_blob.h;
+        int channels = bottom_blob.c;
+
+        int outw = w / 2;
+        int outh = h / 2;
+        int outc = channels * 4;
+
+        top_blob.create(outw, outh, outc, 4u, 1, opt.blob_allocator);
+        if (top_blob.empty())
+            return -100; // output blob could not be created
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int p = 0; p < outc; p++)
+        {
+            // output channel p reads input channel p % channels, starting at row (p / channels) % 2
+            // and column (p / channels) / 2, i.e. one of the four positions of each 2x2 cell
+            const float* ptr = bottom_blob.channel(p % channels).row((p / channels) % 2) + ((p / channels) / 2);
+            float* outptr = top_blob.channel(p);
+            for (int i = 0; i < outh; i++)
+            {
+                for (int j = 0; j < outw; j++)
+                {
+                    *outptr = *ptr;
+                    outptr += 1;
+                    ptr += 2;
+                }
+                // the inner loop moved ptr by 2 * outw; adding w skips the next input row (w assumed even)
+                ptr += w;
+            }
+        }
+
+        return 0;
+    }
+};
+
+DEFINE_LAYER_CREATOR(YoloV5Focus)
+
+struct Object // one detected box
+{
+    cv::Rect_<float> rect; // box stored as x, y, width, height
+    int label; // class index
+    float prob; // objectness multiplied by the class score
+};
+
+struct GridAndStride // one grid cell of one stride level
+{
+    int grid0; // cell index added to the predicted x offset
+    int grid1; // cell index added to the predicted y offset
+    int stride; // stride of the level this cell belongs to
+};
+
+// Area of the region shared by the boxes of a and b (rectangle intersection via operator&).
+static inline float intersection_area(const Object& a, const Object& b)
+{
+    return (a.rect & b.rect).area();
+}
+
+static void qsort_descent_inplace(std::vector<Object>& faceobjects, int left, int right)
+{
+    // Quicksort of faceobjects[left..right] (both inclusive) into descending prob order.
+    int i = left;
+    int j = right;
+    float p = faceobjects[(left + right) / 2].prob; // pivot: score of the middle element
+    // partition: scores above the pivot move to the left side, scores below it to the right
+    while (i <= j)
+    {
+        while (faceobjects[i].prob > p)
+            i++;
+
+        while (faceobjects[j].prob < p)
+            j--;
+
+        if (i <= j)
+        {
+            // swap
+            std::swap(faceobjects[i], faceobjects[j]);
+            i++;
+            j--;
+        }
+    }
+    // the two remaining sub-ranges are independent; each is sorted in its own OpenMP section
+    #pragma omp parallel sections
+    {
+        #pragma omp section
+        {
+            if (left < j) qsort_descent_inplace(faceobjects, left, j);
+        }
+        #pragma omp section
+        {
+            if (i < right) qsort_descent_inplace(faceobjects, i, right);
+        }
+    }
+}
+
+// Sorts all objects by descending prob; an empty vector is left untouched.
+static void qsort_descent_inplace(std::vector<Object>& objects)
+{
+    const bool has_items = !objects.empty();
+    if (has_items)
+        qsort_descent_inplace(objects, 0, objects.size() - 1);
+}
+
+// Greedy NMS over boxes that the caller has sorted by descending score: a box is kept unless
+// its IoU with an already kept box exceeds nms_threshold. Kept indices are written to picked.
+static void nms_sorted_bboxes(const std::vector<Object>& faceobjects, std::vector<int>& picked, float nms_threshold)
+{
+    picked.clear();
+
+    const int n = faceobjects.size();
+    std::vector<float> areas(n);
+    for (int i = 0; i < n; i++)
+        areas[i] = faceobjects[i].rect.area();
+
+    for (int i = 0; i < n; i++)
+    {
+        const Object& a = faceobjects[i];
+
+        bool keep = true;
+        for (int j = 0; j < (int)picked.size(); j++)
+        {
+            const int k = picked[j];
+            // intersection over union
+            const float inter_area = intersection_area(a, faceobjects[k]);
+            const float union_area = areas[i] + areas[k] - inter_area;
+            if (inter_area / union_area > nms_threshold)
+            {
+                keep = false;
+                break; // already suppressed; the remaining kept boxes cannot change the outcome
+            }
+        }
+
+        if (keep)
+            picked.push_back(i);
+    }
+}
+
+// Appends one {grid0, grid1, stride} entry per cell of every stride level (g0 varies fastest).
+static int generate_grids_and_stride(const int target_size, std::vector<int>& strides, std::vector<GridAndStride>& grid_strides)
+{
+    for (const int stride : strides)
+    {
+        const int num_grid = target_size / stride; // cells per side of the square input
+        for (int g1 = 0; g1 < num_grid; g1++)
+        {
+            for (int g0 = 0; g0 < num_grid; g0++)
+                grid_strides.push_back(GridAndStride{g0, g1, stride});
+        }
+    }
+    return 0; // always 0: a non-void function must not run off its end without returning
+}
+
+// Decodes the raw head output into scored boxes. feat_blob holds one row per grid_strides entry,
+// laid out as [x, y, w, h, objectness, class scores...]; boxes above prob_threshold are appended.
+static void generate_yolox_proposals(const std::vector<GridAndStride>& grid_strides, const ncnn::Mat& feat_blob, float prob_threshold, std::vector<Object>& objects)
+{
+    fprintf(stderr, "output height: %d, width: %d, channels: %d, dims:%d\n", feat_blob.h, feat_blob.w, feat_blob.c, feat_blob.dims);
+
+    const int num_class = feat_blob.w - 5;
+    // never read more rows than the blob really has, even if the grid list is longer
+    const int num_rows = feat_blob.h;
+    const int num_anchors = (int)grid_strides.size() < num_rows ? (int)grid_strides.size() : num_rows;
+
+    const float* feat_ptr = feat_blob.channel(0);
+    for (int anchor_idx = 0; anchor_idx < num_anchors; anchor_idx++)
+    {
+        const int grid0 = grid_strides[anchor_idx].grid0;
+        const int grid1 = grid_strides[anchor_idx].grid1;
+        const int stride = grid_strides[anchor_idx].stride;
+
+        // yolox/models/yolo_head.py decode logic
+        //  outputs[..., :2] = (outputs[..., :2] + grids) * strides
+        //  outputs[..., 2:4] = torch.exp(outputs[..., 2:4]) * strides
+        float x_center = (feat_ptr[0] + grid0) * stride;
+        float y_center = (feat_ptr[1] + grid1) * stride;
+        float w = exp(feat_ptr[2]) * stride;
+        float h = exp(feat_ptr[3]) * stride;
+        float x0 = x_center - w * 0.5f;
+        float y0 = y_center - h * 0.5f;
+
+        float box_objectness = feat_ptr[4];
+        for (int class_idx = 0; class_idx < num_class; class_idx++)
+        {
+            float box_cls_score = feat_ptr[5 + class_idx];
+            float box_prob = box_objectness * box_cls_score;
+            if (box_prob > prob_threshold)
+            {
+                Object obj;
+                obj.rect.x = x0;
+                obj.rect.y = y0;
+                obj.rect.width = w;
+                obj.rect.height = h;
+                obj.label = class_idx;
+                obj.prob = box_prob;
+
+                objects.push_back(obj);
+            }
+        } // class loop
+        feat_ptr += feat_blob.w; // advance to the next row of the output
+    } // point anchor loop
+}
+ 
+// Runs YOLOX on a BGR image: resize (keeping the aspect ratio) and pad to 416x416, infer, decode,
+// sort, NMS, then map boxes back to the input image. Returns 0 on success, -1 on model failure.
+static int detect_yolox(const cv::Mat& bgr, std::vector<Object>& objects)
+{
+    ncnn::Net yolox;
+    yolox.opt.use_vulkan_compute = true;
+    // yolox.opt.use_bf16_storage = true;
+    yolox.register_custom_layer("YoloV5Focus", YoloV5Focus_layer_creator);
+    // original pretrained model from https://github.com/yolox
+    // TODO ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
+    if (yolox.load_param("yolox.param") != 0 || yolox.load_model("yolox.bin") != 0)
+    {
+        fprintf(stderr, "failed to load yolox.param / yolox.bin\n");
+        return -1;
+    }
+
+    const int target_size = 416;
+    const float prob_threshold = 0.3f;
+    const float nms_threshold = 0.65f;
+
+    int img_w = bgr.cols;
+    int img_h = bgr.rows;
+    int w = img_w;
+    int h = img_h;
+    float scale = 1.f;
+    if (w > h)
+    {
+        scale = (float)target_size / w;
+        w = target_size;
+        h = h * scale;
+    }
+    else
+    {
+        scale = (float)target_size / h;
+        h = target_size;
+        w = w * scale;
+    }
+    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, w, h);
+
+    // pad to target_size rectangle
+    int wpad = target_size - w;
+    int hpad = target_size - h;
+    ncnn::Mat in_pad;
+    // unlike yolov5, yolox pads only on the bottom and right sides,
+    // so no extra padding offset is needed when decoding box coordinates.
+    ncnn::copy_make_border(in, in_pad, 0, hpad, 0, wpad, ncnn::BORDER_CONSTANT, 114.f);
+
+    // python 0-1 input tensor with rgb_means = (0.485, 0.456, 0.406), std = (0.229, 0.224, 0.225)
+    // so for 0-255 input image, rgb_mean should multiply 255 and norm should div by std.
+    const float mean_vals[3] = {255.f * 0.485f, 255.f * 0.456, 255.f * 0.406f};
+    const float norm_vals[3] = {1 / (255.f * 0.229f), 1 / (255.f * 0.224f), 1 / (255.f * 0.225f)};
+    in_pad.substract_mean_normalize(mean_vals, norm_vals);
+
+    ncnn::Extractor ex = yolox.create_extractor();
+    ex.input("images", in_pad);
+    std::vector<Object> proposals;
+    {
+        ncnn::Mat out;
+        if (ex.extract("output", out) != 0)
+        {
+            fprintf(stderr, "failed to extract the \"output\" blob\n");
+            return -1;
+        }
+
+        std::vector<int> strides = {8, 16, 32}; // might have stride=64
+        std::vector<GridAndStride> grid_strides;
+        generate_grids_and_stride(target_size, strides, grid_strides);
+        generate_yolox_proposals(grid_strides, out, prob_threshold, proposals);
+    }
+
+    // sort all proposals by score from highest to lowest
+    qsort_descent_inplace(proposals);
+
+    // apply nms with nms_threshold
+    std::vector<int> picked;
+    nms_sorted_bboxes(proposals, picked, nms_threshold);
+
+    int count = picked.size();
+    objects.resize(count);
+    for (int i = 0; i < count; i++)
+    {
+        objects[i] = proposals[picked[i]];
+
+        // adjust offset to original unpadded
+        float x0 = (objects[i].rect.x) / scale;
+        float y0 = (objects[i].rect.y) / scale;
+        float x1 = (objects[i].rect.x + objects[i].rect.width) / scale;
+        float y1 = (objects[i].rect.y + objects[i].rect.height) / scale;
+
+        // clip
+        x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f);
+        y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f);
+        x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f);
+        y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f);
+
+        objects[i].rect.x = x0;
+        objects[i].rect.y = y0;
+        objects[i].rect.width = x1 - x0;
+        objects[i].rect.height = y1 - y0;
+    }
+
+    return 0;
+}
+
+// Logs every object to stderr and draws its box and "<class> <score>%" label on a copy of bgr,
+// then shows the result in a window and waits for a key press.
+static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
+{
+    static const char* class_names[] = {
+        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
+        "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
+        "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
+        "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
+        "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
+        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
+        "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
+        "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
+        "hair drier", "toothbrush"
+    };
+    const int num_names = sizeof(class_names) / sizeof(class_names[0]);
+    cv::Mat image = bgr.clone();
+    for (size_t i = 0; i < objects.size(); i++)
+    {
+        const Object& obj = objects[i];
+        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
+                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);
+        cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0));
+
+        // a model with more classes than the table must not index past its end
+        const char* name = (obj.label >= 0 && obj.label < num_names) ? class_names[obj.label] : "unknown";
+        char text[256];
+        snprintf(text, sizeof(text), "%s %.1f%%", name, obj.prob * 100);
+
+        int baseLine = 0;
+        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
+
+        int x = obj.rect.x;
+        int y = obj.rect.y - label_size.height - baseLine;
+        if (y < 0)
+            y = 0;
+        if (x + label_size.width > image.cols)
+            x = image.cols - label_size.width;
+
+        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
+                      cv::Scalar(255, 255, 255), -1);
+        cv::putText(image, text, cv::Point(x, y + label_size.height),
+                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
+    }
+
+    cv::imshow("image", image);
+    cv::waitKey(0);
+}
+
+int main(int argc, char** argv)
+{
+    if (argc != 2) // exactly one argument is expected: the image path
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+        return -1;
+    }
+
+    const char* imagepath = argv[1];
+
+    cv::Mat m = cv::imread(imagepath, 1); // flag 1 = cv::IMREAD_COLOR (3-channel BGR)
+    if (m.empty())
+    {
+        fprintf(stderr, "cv::imread %s failed\n", imagepath);
+        return -1;
+    }
+
+    std::vector<Object> objects;
+    detect_yolox(m, objects); // fills objects; the returned status is not checked here
+
+    draw_objects(m, objects); // shows the result and waits for a key press
+
+    return 0;
+}
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8d4f717307c1e58a637ee921724d9be478241b37
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,14 @@
+numpy
+torch
+opencv_python
+loguru
+scikit_image
+tqdm
+apex
+torchvision
+pycocotools
+# apex is already listed above; the duplicate entry was dropped
+Pillow
+# the `skimage` module comes from scikit_image (listed above); `skimage` is not its PyPI name
+thop
+ninja
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..e6352cedd44804d41cacb806067b4e9c2c1296b6
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,18 @@
+[isort]
+line_length = 100
+multi_line_output = 3
+balanced_wrapping = True
+known_standard_library = setuptools
+known_third_party = tqdm,loguru
+known_data_processing = cv2,numpy,scipy,PIL,matplotlib,scikit_image
+known_datasets = pycocotools
+known_deeplearning = torch,torchvision,caffe2,onnx,apex,timm,thop,torch2trt,tensorrt
+known_myself = yolox
+sections = FUTURE,STDLIB,THIRDPARTY,data_processing,datasets,deeplearning,myself,FIRSTPARTY,LOCALFOLDER
+no_lines_before=STDLIB,THIRDPARTY,datasets
+default_section = FIRSTPARTY
+
+[flake8]
+max-line-length = 100
+max-complexity = 18
+exclude = __init__.py
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d243ef66587db9f83b028320e9211f094165205
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,64 @@
+#!/usr/bin/env python
+# Copyright (c) Megvii, Inc. and its affiliates. All Rights Reserved
+
+import re
+import setuptools
+import glob
+from os import path
+import torch
+from torch.utils.cpp_extension import CppExtension
+
+torch_ver = [int(x) for x in torch.__version__.split(".")[:2]]
+assert torch_ver >= [1, 3], "Requires PyTorch >= 1.3"
+
+
+def get_extensions():
+    """Return the list holding the single C++ extension module, `yolox._C`."""
+    this_dir = path.dirname(path.abspath(__file__))
+    extensions_dir = path.join(this_dir, "yolox", "layers", "csrc")
+    # NOTE(review): glob is called without recursive=True, so "**" behaves like "*" -- confirm
+    main_source = path.join(extensions_dir, "vision.cpp")
+    sources = glob.glob(path.join(extensions_dir, "**", "*.cpp"))
+    sources = [main_source] + sources
+    extension = CppExtension
+
+    extra_compile_args = {"cxx": ["-O3"]}
+    define_macros = []
+    # the csrc directory itself is the only include path
+    include_dirs = [extensions_dir]
+
+    ext_modules = [
+        extension(
+            "yolox._C",
+            sources,
+            include_dirs=include_dirs,
+            define_macros=define_macros,
+            extra_compile_args=extra_compile_args,
+        )
+    ]
+
+    return ext_modules
+
+
+with open("yolox/__init__.py", "r") as f:  # read __version__ without importing yolox
+    version = re.search(
+        r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]',
+        f.read(), re.MULTILINE
+    ).group(1)
+
+# README.md is used verbatim as the package's long description
+with open("README.md", "r") as f:
+    long_description = f.read()
+
+# build_ext is delegated to torch's BuildExtension; ext_modules comes from get_extensions()
+setuptools.setup(
+    name="yolox",
+    version=version,
+    author="basedet team",
+    python_requires=">=3.6",
+    long_description=long_description,
+    ext_modules=get_extensions(),
+    classifiers=["Programming Language :: Python :: 3", "Operating System :: OS Independent"],
+    cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
+    packages=setuptools.find_packages(),
+)
diff --git a/tools/demo.py b/tools/demo.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc1460a8cd4adf1700656df9f2b792b7dde6df84
--- /dev/null
+++ b/tools/demo.py
@@ -0,0 +1,278 @@
+import argparse
+import os
+import time
+from loguru import logger
+
+import cv2
+
+import torch
+import torch.backends.cudnn as cudnn
+
+from yolox.data.data_augment import preproc
+from yolox.data.datasets import COCO_CLASSES
+from yolox.exp import get_exp
+from yolox.utils import fuse_model, get_model_info, postprocess, setup_logger, vis, xyxy2xywh
+
+IMAGE_EXT = ['.jpg', '.jpeg', '.webp', '.bmp', '.png']
+
+
+def make_parser():
+    """Build the argparse parser for the demo command line."""
+    parser = argparse.ArgumentParser("YOLOX Demo!")
+    # NOTE(review): positional without nargs='?', so the default below looks unused -- confirm
+    parser.add_argument('demo', default='image', help='demo type, eg. image, video and webcam')
+    parser.add_argument("-expn", "--experiment-name", type=str, default=None)
+    parser.add_argument("-n", "--name", type=str, default=None, help="model name")
+    parser.add_argument('--path', default='./demo', help='path to images or video')
+    parser.add_argument('--camid', type=int, default=0, help='webcam demo camera id')
+    parser.add_argument(
+        '--save_result', action='store_true',
+        help='whether to save the inference result of image/video'
+    )
+    # exp file
+    parser.add_argument(
+        "-f",
+        "--exp_file",
+        default=None,
+        type=str,
+        help="pls input your expriment description file",
+    )
+    parser.add_argument("-c", "--ckpt", default=None, type=str, help="ckpt for eval")
+    parser.add_argument("--conf", default=None, type=float, help="test conf")
+    parser.add_argument("--nms", default=None, type=float, help="test nms threshold")
+    parser.add_argument("--tsize", default=None, type=int, help="test img size")
+    parser.add_argument(
+        "--fp16",
+        dest="fp16",
+        default=False,
+        action="store_true",
+        help="Adopting mix precision evaluating.",
+    )
+    parser.add_argument(
+        "--fuse",
+        dest="fuse",
+        default=False,
+        action="store_true",
+        help="Fuse conv and bn for testing.",
+    )
+    parser.add_argument(
+        "--trt",
+        dest="trt",
+        default=False,
+        action="store_true",
+        help="Using TensorRT model for testing.",
+    )
+    parser.add_argument(
+        "opts",
+        help="Modify config options using the command-line",
+        default=None,
+        nargs=argparse.REMAINDER,
+    )
+    return parser
+
+
+def get_image_list(path):
+    image_names = []
+    for maindir, subdir, file_name_list in os.walk(path):
+        for filename in file_name_list:
+            apath = os.path.join(maindir, filename)
+            ext = os.path.splitext(apath)[1]
+            if ext in IMAGE_EXT:
+                image_names.append(apath)
+    return image_names
+
+
+class Predictor(object):
+    def __init__(self, model, exp, cls_names=COCO_CLASSES, trt_file=None, decoder=None):
+        self.model = model
+        self.cls_names = cls_names
+        self.decoder = decoder
+        self.num_classes = exp.num_classes
+        self.confthre = exp.test_conf
+        self.nmsthre = exp.nmsthre
+        self.test_size = exp.test_size
+        if trt_file is not None:
+            from torch2trt import TRTModule
+            model_trt = TRTModule()
+            model_trt.load_state_dict(torch.load(trt_file))
+
+            x = torch.ones(1, 3, exp.test_size[0], exp.test_size[1]).cuda()
+            self.model(x)
+            self.model = model_trt
+        self.rgb_means = (0.485, 0.456, 0.406)
+        self.std = (0.229, 0.224, 0.225)
+
+    def inference(self, img):
+        img_info = {'id': 0}
+        if isinstance(img, str):
+            img_info['file_name'] = os.path.basename(img)
+            img = cv2.imread(img)
+        else:
+            img_info['file_name'] = None
+
+        height, width = img.shape[:2]
+        img_info['height'] = height
+        img_info['width'] = width
+        img_info['raw_img'] = img
+
+        img, ratio = preproc(img, self.test_size, self.rgb_means, self.std)
+        img_info['ratio'] = ratio
+        img = torch.from_numpy(img).unsqueeze(0).cuda()
+
+        with torch.no_grad():
+            t0 = time.time()
+            outputs = self.model(img)
+            if self.decoder is not None:
+                outputs = self.decoder(outputs, dtype=outputs.type())
+            outputs = postprocess(
+                        outputs, self.num_classes, self.confthre, self.nmsthre
+                    )
+            logger.info('Infer time: {:.4f}s'.format(time.time()-t0))
+        return outputs, img_info
+
+    def visual(self, output, img_info, cls_conf=0.35):
+        ratio = img_info['ratio']
+        img = img_info['raw_img']
+        output = output.cpu()
+
+        bboxes = output[:, 0:4]
+
+        # preprocessing: resize
+        bboxes /= ratio
+        bboxes = xyxy2xywh(bboxes)
+
+        cls = output[:, 6]
+        scores = output[:, 4] * output[:, 5]
+
+        vis_res = vis(img, bboxes, scores, cls, cls_conf, self.cls_names)
+        return vis_res
+
+
+def image_demo(predictor, vis_folder, path, current_time, save_result):
+    if os.path.isdir(path):
+        files = get_image_list(path)
+    else:
+        files = [path]
+    files.sort()
+    for image_name in files:
+        outputs, img_info = predictor.inference(image_name)
+        result_image = predictor.visual(outputs[0], img_info)
+        if save_result:
+            save_folder = os.path.join(
+                vis_folder, time.strftime("%Y_%m_%d_%H_%M_%S", current_time)
+            )
+            os.makedirs(save_folder, exist_ok=True)
+            save_file_name = os.path.join(save_folder, os.path.basename(image_name))
+            logger.info("Saving detection result in {}".format(save_file_name))
+            cv2.imwrite(save_file_name, result_image)
+        ch = cv2.waitKey(0)
+        if ch == 27 or ch == ord('q') or ch == ord('Q'):
+            break
+
+
+def imageflow_demo(predictor, vis_folder, current_time, args):
+    cap = cv2.VideoCapture(args.path if args.demo == 'video' else args.camid)
+    width = cap.get(cv2.CAP_PROP_FRAME_WIDTH)  # float
+    height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT)  # float
+    fps = cap.get(cv2.CAP_PROP_FPS)
+    save_folder = os.path.join(vis_folder, time.strftime("%Y_%m_%d_%H_%M_%S", current_time))
+    os.makedirs(save_folder, exist_ok=True)
+    if args.demo == "video":
+        save_path = os.path.join(save_folder, args.path.split('/')[-1])
+    else:
+        save_path = os.path.join(save_folder, 'camera.mp4')
+    logger.info(f'video save_path is {save_path}')
+    vid_writer = cv2.VideoWriter(
+        save_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (int(width), int(height))
+    )
+    while True:
+        ret_val, frame = cap.read()
+        if ret_val:
+            outputs, img_info = predictor.inference(frame)
+            result_frame = predictor.visualize(outputs[0], img_info)
+            if args.save_result:
+                vid_writer.write(result_frame)
+            ch = cv2.waitKey(1)
+            if ch == 27 or ch == ord('q') or ch == ord('Q'):
+                break
+        else:
+            break
+
+
+def main(exp, args):
+    if not args.experiment_name:
+        args.experiment_name = exp.exp_name
+
+    # set environment variables for distributed training
+    cudnn.benchmark = True
+    rank = 0
+
+    file_name = os.path.join(exp.output_dir, args.experiment_name)
+    os.makedirs(file_name, exist_ok=True)
+
+    if args.save_result:
+        vis_folder = os.path.join(file_name, 'vis_res')
+        os.makedirs(vis_folder, exist_ok=True)
+
+    setup_logger(
+        file_name, distributed_rank=rank, filename="demo_log.txt", mode="a"
+    )
+    logger.info("Args: {}".format(args))
+
+    if args.conf is not None:
+        exp.test_conf = args.conf
+    if args.nms is not None:
+        exp.nmsthre = args.nms
+    if args.tsize is not None:
+        exp.test_size = (args.tsize, args.tsize)
+
+    model = exp.get_model()
+    logger.info("Model Summary: {}".format(get_model_info(model, exp.test_size)))
+
+    torch.cuda.set_device(rank)
+    model.cuda(rank)
+    model.eval()
+
+    if not args.trt:
+        if args.ckpt is None:
+            ckpt_file = os.path.join(file_name, "best_ckpt.pth.tar")
+        else:
+            ckpt_file = args.ckpt
+        logger.info("loading checkpoint")
+        loc = "cuda:{}".format(rank)
+        ckpt = torch.load(ckpt_file, map_location=loc)
+        # load the model state dict
+        model.load_state_dict(ckpt["model"])
+        logger.info("loaded checkpoint done.")
+
+    if args.fuse:
+        logger.info("\tFusing model...")
+        model = fuse_model(model)
+
+    if args.trt:
+        assert (not args.fuse),\
+            "TensorRT model is not support model fusing!"
+        trt_file = os.path.join(file_name, "model_trt.pth")
+        assert os.path.exists(trt_file), (
+            "TensorRT model is not found!\n Run python3 yolox/deploy/trt.py first!"
+        )
+        model.head.decode_in_inference = False
+        decoder = model.head.decode_outputs
+        logger.info("Using TensorRT to inference")
+    else:
+        trt_file = None
+        decoder = None
+
+    predictor = Predictor(model, exp, COCO_CLASSES, trt_file, decoder)
+    current_time = time.localtime()
+    if args.demo == 'image':
+        image_demo(predictor, vis_folder, args.path, current_time, args.save_result)
+    elif args.demo == 'video' or args.demo == 'webcam':
+        imageflow_demo(predictor, vis_folder, current_time, args)
+
+
+if __name__ == "__main__":
+    args = make_parser().parse_args()
+    exp = get_exp(args.exp_file, args.name)  # experiment from -f/--exp_file and -n/--name
+
+    main(exp, args)
diff --git a/tools/eval.py b/tools/eval.py
new file mode 100644
index 0000000000000000000000000000000000000000..cbc0bb6c9bd79fd10a1048e1b827c4b54cbcd233
--- /dev/null
+++ b/tools/eval.py
@@ -0,0 +1,195 @@
+import argparse
+import os
+import random
+import warnings
+from loguru import logger
+
+import torch
+import torch.backends.cudnn as cudnn
+from torch.nn.parallel import DistributedDataParallel as DDP
+
+from yolox.core import launch
+from yolox.exp import get_exp
+from yolox.utils import configure_nccl, fuse_model, get_local_rank, get_model_info, setup_logger
+
+
+def make_parser():
+    """Build the argparse parser for the YOLOX evaluation command line (tools/eval.py)."""
+    parser = argparse.ArgumentParser("YOLOX Eval")
+    parser.add_argument("-expn", "--experiment-name", type=str, default=None)
+    parser.add_argument("-n", "--name", type=str, default=None, help="model name")
+    # distributed
+    parser.add_argument(
+        "--dist-backend", default="nccl", type=str, help="distributed backend"
+    )
+    parser.add_argument(
+        "--dist-url", default=None, type=str, help="url used to set up distributed training"
+    )
+    parser.add_argument("-b", "--batch-size", type=int, default=64, help="batch size")
+    parser.add_argument(
+        "-d", "--devices", default=None, type=int, help="device for training"
+    )
+    parser.add_argument(
+        "--local_rank", default=0, type=int, help="local rank for dist training"
+    )
+    parser.add_argument(
+        "--num_machine", default=1, type=int, help="num of node for training"
+    )
+    parser.add_argument(
+        "--machine_rank", default=0, type=int, help="node rank for multi-node training"
+    )
+    parser.add_argument(
+        "-f",
+        "--exp_file",
+        default=None,
+        type=str,
+        help="pls input your expriment description file",
+    )
+    parser.add_argument("-c", "--ckpt", default=None, type=str, help="ckpt for eval")
+    parser.add_argument("--conf", default=None, type=float, help="test conf")
+    parser.add_argument("--nms", default=None, type=float, help="test nms threshold")
+    parser.add_argument("--tsize", default=None, type=int, help="test img size")
+    parser.add_argument("--seed", default=None, type=int, help="eval seed")
+    parser.add_argument(
+        "--fp16",
+        dest="fp16",
+        default=False,
+        action="store_true",
+        help="Adopting mix precision evaluating.",
+    )
+    parser.add_argument(
+        "--fuse",
+        dest="fuse",
+        default=False,
+        action="store_true",
+        help="Fuse conv and bn for testing.",
+    )
+    parser.add_argument(
+        "--trt",
+        dest="trt",
+        default=False,
+        action="store_true",
+        help="Using TensorRT model for testing.",
+    )
+    parser.add_argument(
+        "--test",
+        dest="test",
+        default=False,
+        action="store_true",
+        help="Evaluating on test-dev set.",
+    )
+    parser.add_argument(
+        "--speed", dest="speed", default=False, action="store_true", help="speed test only."
+    )
+    parser.add_argument(
+        "opts",
+        help="Modify config options using the command-line",
+        default=None,
+        nargs=argparse.REMAINDER,
+    )
+    return parser
+
+
+@logger.catch
+def main(exp, num_gpu, args):
+    """Evaluate the experiment's model on the current rank and log the evaluator's summary."""
+    if not args.experiment_name:
+        args.experiment_name = exp.exp_name
+    if args.seed is not None:
+        random.seed(args.seed)
+        torch.manual_seed(args.seed)
+        cudnn.deterministic = True
+        warnings.warn(
+            "You have chosen to seed testing. This will turn on the CUDNN deterministic setting, "
+        )
+
+    is_distributed = num_gpu > 1
+
+    # set environment variables for distributed training
+    configure_nccl()
+    cudnn.benchmark = True
+
+    # the local rank comes from get_local_rank(), not from the --local_rank argument
+    rank = get_local_rank()
+
+    if rank == 0:
+        if os.path.exists("./" + args.experiment_name + "ip_add.txt"):
+            os.remove("./" + args.experiment_name + "ip_add.txt")
+
+    file_name = os.path.join(exp.output_dir, args.experiment_name)
+
+    if rank == 0:
+        os.makedirs(file_name, exist_ok=True)
+
+    setup_logger(
+        file_name, distributed_rank=rank, filename="val_log.txt", mode="a"
+    )
+    logger.info("Args: {}".format(args))
+
+    if args.conf is not None:
+        exp.test_conf = args.conf
+    if args.nms is not None:
+        exp.nmsthre = args.nms
+    if args.tsize is not None:
+        exp.test_size = (args.tsize, args.tsize)
+
+    model = exp.get_model()
+    logger.info("Model Summary: {}".format(get_model_info(model, exp.test_size)))
+    logger.info("Model Structure:\n{}".format(str(model)))
+
+    evaluator = exp.get_evaluator(args.batch_size, is_distributed, args.test)
+
+    torch.cuda.set_device(rank)
+    model.cuda(rank)
+    model.eval()
+    # --speed and --trt runs do not load checkpoint weights into the PyTorch model
+    if not args.speed and not args.trt:
+        if args.ckpt is None:
+            ckpt_file = os.path.join(file_name, "best_ckpt.pth.tar")
+        else:
+            ckpt_file = args.ckpt
+        logger.info("loading checkpoint")
+        loc = "cuda:{}".format(rank)
+        ckpt = torch.load(ckpt_file, map_location=loc)
+        # load the model state dict
+        model.load_state_dict(ckpt["model"])
+        logger.info("loaded checkpoint done.")
+
+    if is_distributed:
+        model = DDP(model, device_ids=[rank])
+
+    if args.fuse:
+        logger.info("\tFusing model...")
+        model = fuse_model(model)
+
+    if args.trt:
+        assert (not args.fuse and not is_distributed and args.batch_size == 1),\
+            "TensorRT eval requires -b 1 and supports neither --fuse nor distributed inference!"
+        trt_file = os.path.join(file_name, "model_trt.pth")
+        assert os.path.exists(trt_file), "TensorRT model is not found!\n Run tools/trt.py first!"
+        model.head.decode_in_inference = False
+        decoder = model.head.decode_outputs
+    else:
+        trt_file = None
+        decoder = None
+
+    # start evaluate
+    *_, summary = evaluator.evaluate(
+        model, is_distributed, args.fp16, trt_file, decoder, exp.test_size
+    )
+    logger.info("\n" + summary)
+
+
+if __name__ == "__main__":
+    args = make_parser().parse_args()
+    exp = get_exp(args.exp_file, args.name)
+    exp.merge(args.opts)
+
+    num_gpu = torch.cuda.device_count() if args.devices is None else args.devices
+    assert num_gpu <= torch.cuda.device_count()
+    # pass --machine_rank through: launch() defaults it to 0, i.e. every node would claim rank 0
+    dist_url = "auto" if args.dist_url is None else args.dist_url
+    launch(
+        main, num_gpu, args.num_machine, args.machine_rank,
+        backend=args.dist_backend, dist_url=dist_url, args=(exp, num_gpu, args)
+    )
diff --git a/tools/export_onnx.py b/tools/export_onnx.py
new file mode 100644
index 0000000000000000000000000000000000000000..1eae933252bbc853acde3798b683439d79b0a455
--- /dev/null
+++ b/tools/export_onnx.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+
+import argparse
+import os
+from loguru import logger
+
+import torch
+from torch import nn
+
+from yolox.exp import get_exp
+from yolox.models.network_blocks import SiLU
+from yolox.utils import replace_module
+
+
+def make_parser():
+    """Build the argparse parser for the ONNX export command line (tools/export_onnx.py)."""
+    parser = argparse.ArgumentParser("YOLOX onnx deploy")
+    parser.add_argument(
+        "--output-name", type=str, default="yolox.onnx", help="output name of models"
+    )
+    parser.add_argument("--input", default="images", type=str, help="input name of onnx model")
+    parser.add_argument("--output", default="output", type=str, help="output name of onnx model")
+    parser.add_argument("-o", "--opset", default=11, type=int, help="onnx opset version")
+    parser.add_argument("--no-onnxsim", action="store_true", help="use onnxsim or not")
+    parser.add_argument(
+        "-f",
+        "--exp_file",
+        default=None,
+        type=str,
+        help="expriment description file",
+    )
+    parser.add_argument("-expn", "--experiment-name", type=str, default=None)
+    parser.add_argument("-n", "--name", type=str, default=None, help="model name")
+    parser.add_argument("-c", "--ckpt", default=None, type=str, help="ckpt path")
+    parser.add_argument(
+        "opts",
+        help="Modify config options using the command-line",
+        default=None,
+        nargs=argparse.REMAINDER,
+    )
+
+    return parser
+
+
+@logger.catch
+def main():
+    """Export the experiment's model to ONNX and, unless --no-onnxsim is given, simplify it."""
+    args = make_parser().parse_args()
+    logger.info("args value: {}".format(args))
+    exp = get_exp(args.exp_file, args.name)
+    exp.merge(args.opts)
+    if not args.experiment_name:
+        args.experiment_name = exp.exp_name
+
+    model = exp.get_model()
+    if args.ckpt is None:
+        file_name = os.path.join(exp.output_dir, args.experiment_name)
+        ckpt_file = os.path.join(file_name, "best_ckpt.pth.tar")
+    else:
+        ckpt_file = args.ckpt
+
+    ckpt = torch.load(ckpt_file, map_location="cpu")
+    model.eval()
+    # load the model state dict (either nested under "model" or the loaded object itself)
+    if "model" in ckpt:
+        ckpt = ckpt["model"]
+    model.load_state_dict(ckpt)
+    model = replace_module(model, nn.SiLU, SiLU)
+    model.head.decode_in_inference = False
+
+    logger.info("loaded checkpoint done.")
+    dummy_input = torch.randn(1, 3, exp.test_size[0], exp.test_size[1])
+    torch.onnx._export(
+        model,
+        dummy_input,
+        args.output_name,
+        input_names=[args.input],
+        output_names=[args.output],
+        opset_version=args.opset,
+    )
+    logger.info("generate onnx named {}".format(args.output_name))
+
+    if not args.no_onnxsim:
+        import subprocess
+        # simplify in place; no shell is involved, and check=True raises if onnxsim fails
+        subprocess.run(["python3", "-m", "onnxsim", args.output_name, args.output_name], check=True)
+        logger.info("generate simplify onnx named {}".format(args.output_name))
+
+
+if __name__ == "__main__":
+    main()  # main() parses the command line itself via make_parser()
diff --git a/tools/train.py b/tools/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..848086d1d0ae978ebd3d1db856e7a4e1388247f9
--- /dev/null
+++ b/tools/train.py
@@ -0,0 +1,112 @@
+import argparse
+import random
+import warnings
+from loguru import logger
+
+import torch
+import torch.backends.cudnn as cudnn
+
+from yolox.core import Trainer, launch
+from yolox.exp import get_exp
+from yolox.utils import configure_nccl
+
+
+def make_parser():
+    """Build the argparse parser for the YOLOX training command line (tools/train.py)."""
+    parser = argparse.ArgumentParser("YOLOX train parser")
+    parser.add_argument("-expn", "--experiment-name", type=str, default=None)
+    parser.add_argument("-n", "--name", type=str, default=None, help="model name")
+    # distributed
+    parser.add_argument(
+        "--dist-backend", default="nccl", type=str, help="distributed backend"
+    )
+    parser.add_argument(
+        "--dist-url", default=None, type=str, help="url used to set up distributed training"
+    )
+    parser.add_argument("-b", "--batch-size", type=int, default=64, help="batch size")
+    parser.add_argument(
+        "-d", "--devices", default=None, type=int, help="device for training"
+    )
+    parser.add_argument(
+        "--local_rank", default=0, type=int, help="local rank for dist training"
+    )
+    parser.add_argument(
+        "-f",
+        "--exp_file",
+        default=None,
+        type=str,
+        help="plz input your expriment description file",
+    )
+    parser.add_argument(
+        "--resume", default=False, action="store_true", help="resume training"
+    )
+    parser.add_argument("-c", "--ckpt", default=None, type=str, help="checkpoint file")
+    parser.add_argument(
+        "-e", "--start_epoch", default=None, type=int, help="resume training start epoch"
+    )
+    parser.add_argument(
+        "--num_machine", default=1, type=int, help="num of node for training"
+    )
+    parser.add_argument(
+        "--machine_rank", default=0, type=int, help="node rank for multi-node training"
+    )
+    parser.add_argument(
+        "--fp16",
+        dest="fp16",
+        default=True,  # NOTE(review): with store_true this makes --fp16 a no-op (always True)
+        action="store_true",
+        help="Adopting mix precision training.",
+    )
+    parser.add_argument(
+        "-o",
+        "--occumpy",
+        dest="occumpy",
+        default=False,
+        action="store_true",
+        help="occumpy GPU memory first for training.",
+    )
+    parser.add_argument(
+        "opts",
+        help="Modify config options using the command-line",
+        default=None,
+        nargs=argparse.REMAINDER,
+    )
+    return parser
+
+
+@logger.catch
+def main(exp, args):
+    """Seed the RNGs when the experiment defines a seed, then build a Trainer and run it."""
+    args.experiment_name = args.experiment_name or exp.exp_name
+
+    seed = exp.seed
+    if seed is not None:
+        random.seed(seed)
+        torch.manual_seed(seed)
+        cudnn.deterministic = True
+        seed_warning = (
+            "You have chosen to seed training. This will turn on the CUDNN deterministic setting, "
+            "which can slow down your training considerably! You may see unexpected behavior "
+            "when restarting from checkpoints."
+        )
+        warnings.warn(seed_warning)
+
+    # prepare the environment variables used for distributed training
+    configure_nccl()
+    cudnn.benchmark = True
+    Trainer(exp, args).train()
+
+
+if __name__ == "__main__":
+    args = make_parser().parse_args()
+    exp = get_exp(args.exp_file, args.name)
+    exp.merge(args.opts)
+
+    num_gpu = torch.cuda.device_count() if args.devices is None else args.devices
+    assert num_gpu <= torch.cuda.device_count()
+    # pass --machine_rank through: launch() defaults it to 0, i.e. every node would claim rank 0
+    dist_url = "auto" if args.dist_url is None else args.dist_url
+    launch(
+        main, num_gpu, args.num_machine, args.machine_rank,
+        backend=args.dist_backend, dist_url=dist_url, args=(exp, args)
+    )
diff --git a/tools/trt.py b/tools/trt.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ccf0a2b13bb7438849d8d1794be8897ffc644ba
--- /dev/null
+++ b/tools/trt.py
@@ -0,0 +1,67 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+
+import argparse
+import os
+from loguru import logger
+
+import tensorrt as trt
+import torch
+from torch2trt import torch2trt
+
+from yolox.exp import get_exp
+
+
+def make_parser():
+    """Build the argparse parser for the TensorRT conversion command line (tools/trt.py)."""
+    parser = argparse.ArgumentParser("YOLOX TensorRT deploy")
+    parser.add_argument("-expn", "--experiment-name", type=str, default=None)
+    parser.add_argument("-n", "--name", type=str, default=None, help="model name")
+    parser.add_argument(
+        "-f",
+        "--exp_file",
+        default=None,
+        type=str,
+        help="pls input your expriment description file",
+    )
+    parser.add_argument("-c", "--ckpt", default=None, type=str, help="ckpt path")
+    return parser
+
+
+@logger.catch
+def main():
+    """Convert a YOLOX checkpoint with torch2trt and save the result as `model_trt.pth`."""
+    args = make_parser().parse_args()
+    exp = get_exp(args.exp_file, args.name)
+    args.experiment_name = args.experiment_name or exp.exp_name
+
+    model = exp.get_model()
+    out_dir = os.path.join(exp.output_dir, args.experiment_name)
+    os.makedirs(out_dir, exist_ok=True)
+    # without -c/--ckpt, use best_ckpt.pth.tar from the experiment's output directory
+    ckpt_file = args.ckpt if args.ckpt is not None else os.path.join(out_dir, "best_ckpt.pth.tar")
+
+    # the checkpoint is mapped to CPU; the model moves to the GPU after the weights are loaded
+    state = torch.load(ckpt_file, map_location="cpu")
+    model.load_state_dict(state["model"])
+    logger.info("loaded checkpoint done.")
+
+    model.eval()
+    model.cuda()
+    model.head.decode_in_inference = False
+
+    # example input for the conversion: one all-ones image at the experiment's test size
+    example = torch.ones(1, 3, exp.test_size[0], exp.test_size[1]).cuda()
+    trt_options = dict(
+        fp16_mode=True,
+        log_level=trt.Logger.INFO,
+        max_workspace_size=(1 << 32),
+    )
+    model_trt = torch2trt(model, [example], **trt_options)
+
+    torch.save(model_trt.state_dict(), os.path.join(out_dir, 'model_trt.pth'))
+    logger.info("Converted TensorRT model done.")
+
+
+if __name__ == "__main__":
+    main()  # main() parses the command line itself via make_parser()
diff --git a/yolox/__init__.py b/yolox/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1cbc411d419c55098e7d4e24ff0f21caaaf10a1f
--- /dev/null
+++ b/yolox/__init__.py
@@ -0,0 +1,8 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+
+from .utils import configure_module
+
+configure_module()  # executed as a side effect of `import yolox`
+
+__version__ = "0.1.0"
diff --git a/yolox/core/__init__.py b/yolox/core/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8484835d0a57aedada22894e085776c9effd3b20
--- /dev/null
+++ b/yolox/core/__init__.py
@@ -0,0 +1,5 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+
+from .launch import launch
+from .trainer import Trainer
diff --git a/yolox/core/launch.py b/yolox/core/launch.py
new file mode 100644
index 0000000000000000000000000000000000000000..94e84f7865c27a63613ae7bbc2c082521abf66b7
--- /dev/null
+++ b/yolox/core/launch.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Code are based on
+# https://github.com/facebookresearch/detectron2/blob/master/detectron2/engine/launch.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+# Copyright (c) Megvii, Inc. and its affiliates.
+
+
+from loguru import logger
+
+import torch
+import torch.distributed as dist
+import torch.multiprocessing as mp
+
+import yolox.utils.dist as comm
+
+__all__ = ["launch"]
+
+
+def _find_free_port():
+    """
+    Find an available TCP port on the current machine / node.
+    Returns the port number (int) that the OS assigned to a throwaway socket.
+    """
+    import socket
+
+    # Binding to port 0 lets the OS pick a free port; `with` closes the socket even if bind fails
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
+        sock.bind(("", 0))
+        port = sock.getsockname()[1]
+    # NOTE: there is still a chance the port could be taken by other processes.
+    return port
+
+
+def launch(
+    main_func, num_gpus_per_machine, num_machines=1, machine_rank=0,
+    backend="nccl", dist_url=None, args=()
+):
+    """
+    Run `main_func(*args)` directly, or in one spawned process per local GPU when world size > 1.
+
+    Args:
+        main_func: a function that will be called by `main_func(*args)`
+        num_gpus_per_machine (int): number of worker processes spawned on this machine
+        num_machines (int): the total number of machines
+        machine_rank (int): the rank of this machine (one per machine)
+        backend (str): backend name handed to `torch.distributed.init_process_group`
+        dist_url (str): url to connect to for distributed training, including protocol
+                       e.g. "tcp://127.0.0.1:8686".
+                       Can be set to auto to automatically select a free port on localhost
+        args (tuple): arguments passed to main_func
+    """
+    world_size = num_machines * num_gpus_per_machine
+    if world_size <= 1:
+        # single process: nothing is spawned, the function runs right here
+        main_func(*args)
+        return
+
+    # https://github.com/pytorch/pytorch/pull/14391
+    # TODO prctl in spawned processes
+    if dist_url == "auto":
+        assert num_machines == 1, "dist_url=auto cannot work with distributed training."
+        dist_url = f"tcp://127.0.0.1:{_find_free_port()}"
+
+    worker_args = (
+        main_func, world_size, num_gpus_per_machine,
+        machine_rank, backend, dist_url, args
+    )
+    mp.spawn(_distributed_worker, nprocs=num_gpus_per_machine, args=worker_args, daemon=False)
+
+
+def _distributed_worker(
+    local_rank, main_func, world_size, num_gpus_per_machine,
+    machine_rank, backend, dist_url, args
+):
+    """Spawned-process entry point: join the process group, then call `main_func(*args)`."""
+    assert torch.cuda.is_available(), "cuda is not available. Please check your installation."
+    global_rank = machine_rank * num_gpus_per_machine + local_rank
+    # NOTE(review): despite its wording, this is logged before init_process_group is called
+    logger.info("Rank {} initialization finished.".format(global_rank))
+    try:
+        dist.init_process_group(
+            backend=backend,
+            init_method=dist_url,
+            world_size=world_size,
+            rank=global_rank,
+        )
+    except Exception:
+        logger.error("Process group URL: {}".format(dist_url))
+        raise
+    # synchronize is needed here to prevent a possible timeout after calling init_process_group
+    # See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172
+    comm.synchronize()
+    assert num_gpus_per_machine <= torch.cuda.device_count()
+    torch.cuda.set_device(local_rank)
+    # Setup the local process group (which contains ranks within the same machine)
+    assert comm._LOCAL_PROCESS_GROUP is None
+    num_machines = world_size // num_gpus_per_machine
+    for i in range(num_machines):
+        ranks_on_i = list(range(i * num_gpus_per_machine, (i + 1) * num_gpus_per_machine))
+        pg = dist.new_group(ranks_on_i)  # each worker creates every group, keeps only its own
+        if i == machine_rank:
+            comm._LOCAL_PROCESS_GROUP = pg
+
+    main_func(*args)
diff --git a/yolox/core/trainer.py b/yolox/core/trainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f5d24bea1d27a0bd127fdef7883ce1c03044424
--- /dev/null
+++ b/yolox/core/trainer.py
@@ -0,0 +1,318 @@
+import datetime
+import os
+import time
+from loguru import logger
+
+import apex
+import torch
+from apex import amp
+from torch.utils.tensorboard import SummaryWriter
+
+from yolox.data import DataPrefetcher
+from yolox.utils import (
+    MeterBuffer,
+    ModelEMA,
+    all_reduce_norm,
+    get_local_rank,
+    get_model_info,
+    get_rank,
+    get_world_size,
+    gpu_mem_usage,
+    load_ckpt,
+    occumpy_mem,
+    save_checkpoint,
+    setup_logger,
+    synchronize
+)
+
+
+class Trainer:
+    """Runs a YOLOX training job: setup, epoch/iteration loops, evaluation and checkpointing."""
+    def __init__(self, exp, args):
+        # init function only defines some basic attr, other attrs like model, optimizer are built in
+        # before_train methods.
+        self.exp = exp
+        self.args = args
+
+        # training related attr
+        self.max_epoch = exp.max_epoch
+        self.amp_training = args.fp16
+        self.is_distributed = get_world_size() > 1
+        self.rank = get_rank()
+        self.local_rank = get_local_rank()
+        self.device = "cuda:{}".format(self.local_rank)
+        self.use_model_ema = exp.ema
+
+        # data/dataloader related attr
+        self.data_type = torch.float16 if args.fp16 else torch.float32
+        self.input_size = exp.input_size
+        self.best_ap = 0
+
+        # metric record
+        self.meter = MeterBuffer(window_size=exp.print_interval)
+        self.file_name = os.path.join(exp.output_dir, args.experiment_name)
+
+        if self.rank == 0 and os.path.exists("./" + args.experiment_name + "ip_add.txt"):
+            os.remove("./" + args.experiment_name + "ip_add.txt")
+
+        if self.rank == 0:
+            os.makedirs(self.file_name, exist_ok=True)
+
+        setup_logger(self.file_name, distributed_rank=self.rank, filename="train_log.txt", mode="a")
+
+    def train(self):
+        """Run the whole job; `after_train` is called even when training raises."""
+        self.before_train()
+        try:
+            self.train_in_epoch()
+        finally:
+            # exceptions still propagate to the caller; the closing log line is emitted either way
+            self.after_train()
+
+    def train_in_epoch(self):
+        for self.epoch in range(self.start_epoch, self.max_epoch):  # loop variable lives on self
+            self.before_epoch()
+            self.train_in_iter()
+            self.after_epoch()
+
+    def train_in_iter(self):
+        for self.iter in range(self.max_iter):  # max_iter is the number of iterations per epoch
+            self.before_iter()
+            self.train_one_iter()
+            self.after_iter()
+
+    def train_one_iter(self):
+        """Train on one batch: forward/backward, optimizer step, then EMA, lr and meter updates."""
+        iter_start_time = time.time()
+        inps, targets = self.prefetcher.next()
+        inps = inps.to(self.data_type)
+        targets = targets.to(self.data_type)
+        targets.requires_grad = False
+        data_end_time = time.time()
+
+        outputs = self.model(inps, targets)
+        loss = outputs["total_loss"]
+
+        self.optimizer.zero_grad()
+        if self.amp_training:
+            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
+                scaled_loss.backward()
+        else:
+            loss.backward()
+        self.optimizer.step()
+
+        if self.use_model_ema:
+            self.ema_model.update(self.model)
+        # written after optimizer.step(), so this lr takes effect from the next step on
+        lr = self.lr_scheduler.update_lr(self.progress_in_iter + 1)
+        for param_group in self.optimizer.param_groups:
+            param_group["lr"] = lr
+
+        iter_end_time = time.time()
+        self.meter.update(
+            iter_time=iter_end_time - iter_start_time,
+            data_time=data_end_time - iter_start_time,
+            lr=lr,
+            **outputs,
+        )
+
+    def before_train(self):
+        """Build model, optimizer, data loader, lr scheduler, EMA and evaluator before training."""
+        logger.info("args: {}".format(self.args))
+        logger.info("exp value:\n{}".format(self.exp))
+        # model related init
+        torch.cuda.set_device(self.local_rank)
+        model = self.exp.get_model()
+        logger.info("Model Summary: {}".format(get_model_info(model, self.exp.test_size)))
+        model.to(self.device)
+
+        # solver related init; keep both the model and the optimizer returned by amp.initialize
+        self.optimizer = self.exp.get_optimizer(self.args.batch_size)
+        if self.amp_training:
+            model, self.optimizer = amp.initialize(model, self.optimizer, opt_level="O1")
+
+        # value of epoch will be set in `resume_train`
+        model = self.resume_train(model)
+
+        # data related init
+        self.no_aug = self.start_epoch >= self.max_epoch - self.exp.no_aug_epochs
+        self.train_loader = self.exp.get_data_loader(
+            batch_size=self.args.batch_size,
+            is_distributed=self.is_distributed,
+            no_aug=self.no_aug
+        )
+        logger.info("init prefetcher, this might take a while...")
+        self.prefetcher = DataPrefetcher(self.train_loader)
+        # max_iter means iters per epoch
+        self.max_iter = len(self.train_loader)
+
+        self.lr_scheduler = self.exp.get_lr_scheduler(
+            self.exp.basic_lr_per_img * self.args.batch_size, self.max_iter
+        )
+        if self.args.occumpy:
+            occumpy_mem(self.local_rank)
+        if self.is_distributed:
+            model = apex.parallel.DistributedDataParallel(model)
+            # from torch.nn.parallel import DistributedDataParallel as DDP
+            # model = DDP(model, device_ids=[self.local_rank], broadcast_buffers=False)
+
+        if self.use_model_ema:
+            self.ema_model = ModelEMA(model, 0.9998)
+            self.ema_model.updates = self.max_iter * self.start_epoch
+
+        self.model = model
+        self.model.train()
+
+        self.evaluator = self.exp.get_evaluator(
+            batch_size=self.args.batch_size, is_distributed=self.is_distributed
+        )
+        # Tensorboard logger
+        if self.rank == 0:
+            self.tblogger = SummaryWriter(self.file_name)
+
+        logger.info("Training start...")
+        logger.info("\n{}".format(model))
+
+    def after_train(self):
+        """Log the best AP (as a percentage) recorded by `evaluate_and_save_model`."""
+        logger.info(
+            "Training of experiment is done and the best AP is {:.2f}".format(self.best_ap * 100)
+        )
+
+    def before_epoch(self):
+        """Log the epoch start and switch to the no-mosaic / L1-loss phase when it is due."""
+        logger.info("---> start train epoch{}".format(self.epoch + 1))
+        if self.epoch + 1 == self.max_epoch - self.exp.no_aug_epochs or self.no_aug:
+            logger.info("--->No mosaic aug now!")
+            self.train_loader.close_mosaic()
+            logger.info("--->Add additional L1 loss now!")
+            if self.is_distributed:
+                self.model.module.head.use_l1 = True
+            else:
+                self.model.head.use_l1 = True
+            self.exp.eval_interval = 1
+            if not self.no_aug:
+                self.save_ckpt(ckpt_name="last_mosaic_epoch")
+
+    def after_epoch(self):
+        """Save the `latest` checkpoint and, every `eval_interval` epochs, evaluate the model."""
+        if self.use_model_ema:
+            self.ema_model.update_attr(self.model)
+        self.save_ckpt(ckpt_name="latest")
+
+        if (self.epoch + 1) % self.exp.eval_interval == 0:
+            all_reduce_norm(self.model)
+            self.evaluate_and_save_model()
+
+    def before_iter(self):
+        """Hook called before every iteration; there is nothing to do yet."""
+
+    def after_iter(self):
+        """
+        `after_iter` contains two parts of logic:
+            * log progress, timing, loss and lr every `print_interval` iterations
+            * pick a new random input size every 10 iterations (if `exp.random_size` is set)
+        """
+        # log needed information
+        if (self.iter + 1) % self.exp.print_interval == 0:
+            # TODO check ETA logic
+            left_iters = self.max_iter * self.max_epoch - (self.progress_in_iter + 1)
+            eta_seconds = self.meter["iter_time"].global_avg * left_iters
+            eta_str = "ETA: {}".format(datetime.timedelta(seconds=int(eta_seconds)))
+
+            progress_str = "epoch: {}/{}, iter: {}/{}".format(
+                self.epoch + 1, self.max_epoch, self.iter + 1, self.max_iter
+            )
+            loss_meter = self.meter.get_filtered_meter("loss")
+            loss_str = ", ".join(["{}: {:.1f}".format(k, v.latest) for k, v in loss_meter.items()])
+
+            time_meter = self.meter.get_filtered_meter("time")
+            time_str = ", ".join(["{}: {:.3f}s".format(k, v.avg) for k, v in time_meter.items()])
+
+            logger.info(
+                "{}, mem: {:.0f}Mb, {}, {}, lr: {:.3e}".format(
+                    progress_str,
+                    gpu_mem_usage(),
+                    time_str,
+                    loss_str,
+                    self.meter["lr"].latest,
+                )
+                + (", size: {:d}, {}".format(self.input_size[0], eta_str))
+            )
+            self.meter.clear_meters()
+
+        # random resizing
+        if self.exp.random_size is not None and (self.progress_in_iter + 1) % 10 == 0:
+            self.input_size = self.exp.random_resize(
+                self.train_loader, self.epoch, self.rank, self.is_distributed
+            )
+
+    @property
+    def progress_in_iter(self):
+        return self.epoch * self.max_iter + self.iter  # 0-based iteration index across all epochs
+
+    def resume_train(self, model):
+        """Load checkpoint weights into `model` (resume or fine-tune) and set `self.start_epoch`."""
+        if self.args.resume:
+            logger.info("resume training")
+            if self.args.ckpt is None:
+                ckpt_file = os.path.join(self.file_name, "latest" + "_ckpt.pth.tar")
+            else:
+                ckpt_file = self.args.ckpt
+            ckpt = torch.load(ckpt_file, map_location=self.device)
+            # resume the model/optimizer state dict
+            model.load_state_dict(ckpt["model"])
+            self.optimizer.load_state_dict(ckpt["optimizer"])
+            # resume the training states variables
+            if self.amp_training and "amp" in ckpt:
+                amp.load_state_dict(ckpt["amp"])
+            # --start_epoch counts from 1; the checkpoint stores the 0-based epoch to run next
+            self.start_epoch = (
+                self.args.start_epoch - 1
+                if self.args.start_epoch is not None
+                else ckpt["start_epoch"]
+            )
+            logger.info("loaded checkpoint '{}' (epoch {})".format(ckpt_file, self.start_epoch))
+        else:
+            if self.args.ckpt is not None:
+                logger.info("loading checkpoint for fine tuning")
+                ckpt_file = self.args.ckpt
+                ckpt = torch.load(ckpt_file, map_location=self.device)["model"]
+                # self.model is not assigned yet at this point; load into the model passed in
+                model = load_ckpt(model, ckpt)
+            self.start_epoch = 0
+        return model
+
+    def evaluate_and_save_model(self):
+        """Evaluate the (EMA) model, log the AP on rank 0 and save the `last_epoch` checkpoint."""
+        evalmodel = self.ema_model.ema if self.use_model_ema else self.model
+        ap50_95, ap50, summary = self.exp.eval(evalmodel, self.evaluator, self.is_distributed)
+        self.model.train()
+        if self.rank == 0:
+            self.tblogger.add_scalar("val/COCOAP50", ap50, self.epoch + 1)
+            self.tblogger.add_scalar("val/COCOAP50_95", ap50_95, self.epoch + 1)
+            logger.info("\n" + summary)
+        synchronize()
+        self.save_ckpt("last_epoch", ap50_95 > self.best_ap)
+        self.best_ap = max(self.best_ap, ap50_95)
+
+    def save_ckpt(self, ckpt_name, update_best_ckpt=False):
+        """On rank 0, save the model (EMA copy if enabled), optimizer and amp state."""
+        if self.rank == 0:
+            save_model = self.ema_model.ema if self.use_model_ema else self.model
+            logger.info("Save weights to {}".format(self.file_name))
+            ckpt_state = {
+                "start_epoch": self.epoch + 1,
+                "model": save_model.state_dict(),
+                "optimizer": self.optimizer.state_dict(),
+            }
+            if self.amp_training:
+                # save amp state according to
+                # https://nvidia.github.io/apex/amp.html#checkpointing
+                ckpt_state["amp"] = amp.state_dict()
+            save_checkpoint(
+                ckpt_state,
+                update_best_ckpt,
+                self.file_name,
+                ckpt_name,
+            )
diff --git a/yolox/data/__init__.py b/yolox/data/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5b6bb501915f4bd131e3e706168122764a3514cc
--- /dev/null
+++ b/yolox/data/__init__.py
@@ -0,0 +1,8 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+
+from .data_augment import TrainTransform, ValTransform
+from .data_prefetcher import DataPrefetcher
+from .dataloading import DataLoader, get_yolox_datadir
+from .datasets import *
+from .samplers import InfiniteSampler, YoloBatchSampler
diff --git a/yolox/data/data_augment.py b/yolox/data/data_augment.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ccdbbe5f285e9e49b67ce4469fcb868c5ae0efc
--- /dev/null
+++ b/yolox/data/data_augment.py
@@ -0,0 +1,389 @@
+"""
+Data augmentation functionality. Passed as callable transformations to
+Dataset classes.
+
+The data augmentation procedures were interpreted from @weiliu89's SSD paper
+http://arxiv.org/abs/1512.02325
+"""
+
+import math
+import random
+
+import cv2
+import numpy as np
+
+import torch
+
+
+def augment_hsv(img, hgain=0.015, sgain=0.7, vgain=0.4):
+    """Randomly scale hue/saturation/value of a BGR image, writing the result back into `img`."""
+    gain_h, gain_s, gain_v = np.random.uniform(-1, 1, 3) * [hgain, sgain, vgain] + 1
+    hsv_planes = cv2.split(cv2.cvtColor(img, cv2.COLOR_BGR2HSV))
+    out_dtype = img.dtype
+    levels = np.arange(0, 256, dtype=np.int16)
+    # One 256-entry lookup table per plane: hue wraps at 180, the other two saturate at 255.
+    tables = (
+        ((levels * gain_h) % 180).astype(out_dtype),
+        np.clip(levels * gain_s, 0, 255).astype(out_dtype),
+        np.clip(levels * gain_v, 0, 255).astype(out_dtype),
+    )
+    adjusted = cv2.merge([cv2.LUT(p, t) for p, t in zip(hsv_planes, tables)]).astype(out_dtype)
+    cv2.cvtColor(adjusted, cv2.COLOR_HSV2BGR, dst=img)  # in-place output, nothing returned
+
+
+def box_candidates(box1, box2, wh_thr=2, ar_thr=20, area_thr=0.2):
+    """
+    Boolean mask of boxes worth keeping after augmentation. `box1` (before) and `box2`
+    (after) are (4, n) xyxy arrays; a box passes when its new width and height exceed
+    `wh_thr` pixels, its area ratio exceeds `area_thr` and its aspect ratio is below `ar_thr`.
+    """
+    eps = 1e-16  # guards the divisions against zero-sized boxes
+    old_w, old_h = box1[2] - box1[0], box1[3] - box1[1]
+    new_w, new_h = box2[2] - box2[0], box2[3] - box2[1]
+    aspect = np.maximum(new_w / (new_h + eps), new_h / (new_w + eps))
+    big_enough = (new_w > wh_thr) & (new_h > wh_thr)
+    kept_area = new_w * new_h / (old_w * old_h + eps) > area_thr
+    return big_enough & kept_area & (aspect < ar_thr)
+
+
+def random_perspective(
+    img, targets=(), degrees=10, translate=0.1, scale=0.1, shear=10, perspective=0.0, border=(0, 0),
+):
+    """
+    Randomly rotate, scale, shear and translate `img`, warping its boxes alongside.
+
+    Columns 0-3 of `targets` are read as x1, y1, x2, y2. `scale` is a `(low, high)`
+    range for the zoom factor; a single number `d` is treated as `(1 - d, 1 + d)`
+    (a scalar, including the default, used to raise TypeError when it was indexed).
+    Returns the warped image and the targets that pass `box_candidates`.
+    """
+    height = img.shape[0] + border[0] * 2  # shape(h,w,c)
+    width = img.shape[1] + border[1] * 2
+
+    # Center
+    C = np.eye(3)
+    C[0, 2] = -img.shape[1] / 2  # x translation (pixels)
+    C[1, 2] = -img.shape[0] / 2  # y translation (pixels)
+
+    # Rotation and Scale
+    R = np.eye(3)
+    a = random.uniform(-degrees, degrees)
+    scale_lo, scale_hi = (1 - scale, 1 + scale) if np.ndim(scale) == 0 else (scale[0], scale[1])
+    s = random.uniform(scale_lo, scale_hi)
+    R[:2] = cv2.getRotationMatrix2D(angle=a, center=(0, 0), scale=s)
+
+    # Shear
+    S = np.eye(3)
+    S[0, 1] = math.tan(random.uniform(-shear, shear) * math.pi / 180)  # x shear (deg)
+    S[1, 0] = math.tan(random.uniform(-shear, shear) * math.pi / 180)  # y shear (deg)
+
+    # Translation
+    T = np.eye(3)
+    T[0, 2] = (random.uniform(0.5 - translate, 0.5 + translate) * width)  # x translation (pixels)
+    T[1, 2] = (random.uniform(0.5 - translate, 0.5 + translate) * height)  # y translation (pixels)
+
+    # Combined rotation matrix
+    M = T @ S @ R @ C  # order of operations (right to left) is IMPORTANT
+
+    if (border[0] != 0) or (border[1] != 0) or (M != np.eye(3)).any():  # image changed
+        if perspective:
+            img = cv2.warpPerspective(img, M, dsize=(width, height), borderValue=(114, 114, 114))
+        else:  # affine
+            img = cv2.warpAffine(img, M[:2], dsize=(width, height), borderValue=(114, 114, 114))
+
+    # Transform label coordinates
+    n = len(targets)
+    if n:
+        # warp points
+        xy = np.ones((n * 4, 3))
+        xy[:, :2] = targets[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape(n * 4, 2)  # x1y1, x2y2, x1y2, x2y1
+        xy = xy @ M.T  # transform
+        if perspective:
+            xy = (xy[:, :2] / xy[:, 2:3]).reshape(n, 8)  # rescale
+        else:  # affine
+            xy = xy[:, :2].reshape(n, 8)
+
+        # create new boxes
+        x = xy[:, [0, 2, 4, 6]]
+        y = xy[:, [1, 3, 5, 7]]
+        xy = np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T
+
+        # clip boxes
+        xy[:, [0, 2]] = xy[:, [0, 2]].clip(0, width)
+        xy[:, [1, 3]] = xy[:, [1, 3]].clip(0, height)
+
+        # filter candidates
+        i = box_candidates(box1=targets[:, :4].T * s, box2=xy.T)
+        targets = targets[i]
+        targets[:, :4] = xy[i]
+
+    return img, targets
+
+
+def _distort(image):
+    """
+    Return a randomly colour-jittered copy of a BGR image. Four steps are each applied with
+    probability 1/2: an additive offset, a multiplicative gain, a hue shift (in HSV space)
+    and a saturation gain.
+    """
+
+    def _scale_shift(pixels, alpha=1, beta=0):
+        # pixels <- pixels * alpha + beta, limited to [0, 255] and written back in place
+        pixels[:] = np.clip(pixels.astype(float) * alpha + beta, 0, 255)
+
+    out = image.copy()
+
+    if random.randrange(2):  # additive offset
+        _scale_shift(out, beta=random.uniform(-32, 32))
+
+    if random.randrange(2):  # multiplicative gain
+        _scale_shift(out, alpha=random.uniform(0.5, 1.5))
+
+    out = cv2.cvtColor(out, cv2.COLOR_BGR2HSV)
+    if random.randrange(2):  # hue shift, wrapped into [0, 180)
+        shifted_hue = out[:, :, 0].astype(int) + random.randint(-18, 18)
+        out[:, :, 0] = shifted_hue % 180
+
+    if random.randrange(2):  # saturation gain
+        _scale_shift(out[:, :, 1], alpha=random.uniform(0.5, 1.5))
+
+    return cv2.cvtColor(out, cv2.COLOR_HSV2BGR)
+
+
+def _mirror(image, boxes):
+    """Flip the image and its xyxy boxes left-right with probability 1/2."""
+    _, img_w, _ = image.shape
+    if random.randrange(2):
+        image, boxes = image[:, ::-1], boxes.copy()
+        boxes[:, 0::2] = img_w - boxes[:, 2::-2]
+    return image, boxes
+
+
+# TODO: reorg: use mosaicDet instead
+def _random_affine(
+    img,
+    targets=None,
+    degrees=(-10, 10),
+    translate=(0.1, 0.1),
+    scale=(0.9, 1.1),
+    shear=(-2, 2),
+    borderValue=(114, 114, 114),
+):
+    """
+    Warp `img` with a random rotation, scale, translation and shear.
+
+    Args:
+        targets: optional array whose columns 0-3 are read as x1, y1, x2, y2.
+        degrees, scale, shear: `(low, high)` ranges the random values are drawn from.
+        translate: maximum x and y offsets as fractions of the image width and height.
+        borderValue: fill colour passed to `cv2.warpPerspective`.
+
+    Returns:
+        `imw` alone when `targets` is None, otherwise `(imw, boxes, M)`: the warped image,
+        the warped, shrunk and clipped boxes (an empty `(0, 4)` array when `targets` is
+        empty) and the 3x3 transform matrix.
+    """
+    border = 0  # width of added border (optional)
+    height, width, _ = img.shape
+
+    # Rotation and Scale
+    R = np.eye(3)
+    a = random.random() * (degrees[1] - degrees[0]) + degrees[0]
+    s = random.random() * (scale[1] - scale[0]) + scale[0]
+    R[:2] = cv2.getRotationMatrix2D(angle=a, center=(width / 2, height / 2), scale=s)
+
+    # Translation. The x offset is scaled by the width and the y offset by the height; the
+    # two axes used to be swapped, which made no difference on square images only.
+    T = np.eye(3)
+    T[0, 2] = (random.random() * 2 - 1) * translate[0] * width + border
+    T[1, 2] = (random.random() * 2 - 1) * translate[1] * height + border
+
+    # Shear
+    S = np.eye(3)
+    # x shear (deg)
+    S[0, 1] = math.tan((random.random() * (shear[1] - shear[0]) + shear[0]) * math.pi / 180)
+    # y shear (deg)
+    S[1, 0] = math.tan((random.random() * (shear[1] - shear[0]) + shear[0]) * math.pi / 180)
+
+    # Combined rotation matrix. NOTE: ORDER IS IMPORTANT HERE!!
+    M = S @ T @ R
+    # BGR order borderValue
+    imw = cv2.warpPerspective(
+        img, M, dsize=(width, height), flags=cv2.INTER_LINEAR, borderValue=borderValue
+    )
+
+    if targets is None:
+        return imw
+
+    # Default to no boxes so an empty `targets` no longer leaves `boxes` unbound at the return.
+    boxes = np.zeros((0, 4))
+    if len(targets) > 0:
+        n = targets.shape[0]
+        points = targets[:, 0:4].copy()
+
+        # warp points
+        xy = np.ones((n * 4, 3))
+        xy[:, :2] = points[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape(n * 4, 2)  # x1y1, x2y2, x1y2, x2y1
+        xy = (xy @ M.T)[:, :2].reshape(n, 8)
+
+        # create new boxes
+        x = xy[:, [0, 2, 4, 6]]
+        y = xy[:, [1, 3, 5, 7]]
+        xy = np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T
+
+        # apply angle-based reduction
+        radians = a * math.pi / 180
+        reduction = max(abs(math.sin(radians)), abs(math.cos(radians))) ** 0.5
+        x = (xy[:, 2] + xy[:, 0]) / 2
+        y = (xy[:, 3] + xy[:, 1]) / 2
+        w = (xy[:, 2] - xy[:, 0]) * reduction
+        h = (xy[:, 3] - xy[:, 1]) * reduction
+        xy = np.concatenate((x - w / 2, y - h / 2, x + w / 2, y + h / 2)).reshape(4, n).T
+
+        # reject warped points outside of image
+        x1 = np.clip(xy[:, 0], 0, width)
+        y1 = np.clip(xy[:, 1], 0, height)
+        x2 = np.clip(xy[:, 2], 0, width)
+        y2 = np.clip(xy[:, 3], 0, height)
+        boxes = np.concatenate((x1, y1, x2, y2)).reshape(4, n).T
+
+    return imw, boxes, M
+
+
+def preproc(image, input_size, mean, std, swap=(2, 0, 1)):
+    """
+    Resize `image` with its aspect ratio kept, paste it into the top-left corner of a
+    114-filled canvas of `input_size`, reverse the channel axis, divide by 255, optionally
+    normalise with `mean` / `std` and transpose the axes by `swap`.
+
+    Returns the contiguous float32 array and the resize ratio that was used.
+    """
+    canvas_shape = (input_size[0], input_size[1], 3) if len(image.shape) == 3 else input_size
+    canvas = np.ones(canvas_shape) * 114.0
+    src = np.array(image)
+    ratio = min(input_size[0] / src.shape[0], input_size[1] / src.shape[1])
+    new_h, new_w = int(src.shape[0] * ratio), int(src.shape[1] * ratio)
+    resized = cv2.resize(src, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
+    canvas[:new_h, :new_w] = resized.astype(np.float32)
+
+    out = canvas.astype(np.float32)[:, :, ::-1]
+    out /= 255.0
+    if mean is not None:
+        out -= mean
+    if std is not None:
+        out /= std
+    return np.ascontiguousarray(out.transpose(swap), dtype=np.float32), ratio
+
+
+class TrainTransform:
+    def __init__(self, p=0.5, rgb_means=None, std=None, max_labels=50):
+        self.means = rgb_means
+        self.std = std
+        self.p = p  # stored only; not read anywhere in this class
+        self.max_labels = max_labels  # number of rows in the padded label array
+
+    def __call__(self, image, targets, input_dim):
+        boxes = targets[:, :4].copy()  # columns 0-3 are read as x1, y1, x2, y2
+        labels = targets[:, 4].copy()  # column 4 is read as the label
+        if targets.shape[1] > 5:  # an extra trailing column switches on the mixup layout
+            mixup = True
+            ratios = targets[:, -1].copy()
+            ratios_o = targets[:, -1].copy()
+        else:
+            mixup = False
+            ratios = None
+            ratios_o = None
+        lshape = 6 if mixup else 5  # width of one output label row
+        if len(boxes) == 0:  # no annotations: only pre-process the image, labels stay all-zero
+            targets = np.zeros((self.max_labels, lshape), dtype=np.float32)
+            image, r_o = preproc(image, input_dim, self.means, self.std)
+            image = np.ascontiguousarray(image, dtype=np.float32)
+            return image, targets
+
+        image_o = image.copy()  # un-augmented copies, used as a fallback further down
+        targets_o = targets.copy()
+        height_o, width_o, _ = image_o.shape
+        boxes_o = targets_o[:, :4]  # a view: the writes below modify targets_o
+        labels_o = targets_o[:, 4]
+        # boxes_o: convert [x1, y1, x2, y2] to [c_x, c_y, w, h]
+        b_x_o = (boxes_o[:, 2] + boxes_o[:, 0]) * 0.5
+        b_y_o = (boxes_o[:, 3] + boxes_o[:, 1]) * 0.5
+        b_w_o = (boxes_o[:, 2] - boxes_o[:, 0]) * 1.0
+        b_h_o = (boxes_o[:, 3] - boxes_o[:, 1]) * 1.0
+        boxes_o[:, 0] = b_x_o
+        boxes_o[:, 1] = b_y_o
+        boxes_o[:, 2] = b_w_o
+        boxes_o[:, 3] = b_h_o
+
+        image_t = _distort(image)  # colour jitter, returns a new array
+        image_t, boxes = _mirror(image_t, boxes)  # random left-right flip of image and boxes
+        height, width, _ = image_t.shape
+        image_t, r_ = preproc(image_t, input_dim, self.means, self.std)  # r_ is the resize ratio
+        boxes = boxes.copy()
+        # boxes: convert [x1, y1, x2, y2] to [c_x, c_y, w, h]
+        b_x = (boxes[:, 2] + boxes[:, 0]) * 0.5
+        b_y = (boxes[:, 3] + boxes[:, 1]) * 0.5
+        b_w = (boxes[:, 2] - boxes[:, 0]) * 1.0
+        b_h = (boxes[:, 3] - boxes[:, 1]) * 1.0
+        boxes[:, 0] = b_x
+        boxes[:, 1] = b_y
+        boxes[:, 2] = b_w
+        boxes[:, 3] = b_h
+
+        boxes *= r_  # bring the boxes to the scale of the resized image
+
+        mask_b = np.minimum(boxes[:, 2], boxes[:, 3]) > 8  # keep boxes whose shorter side exceeds 8
+        boxes_t = boxes[mask_b]
+        labels_t = labels[mask_b].copy()
+        if mixup:
+            ratios_t = ratios[mask_b].copy()
+
+        if len(boxes_t) == 0:  # every box was filtered out: fall back to the un-augmented sample
+            image_t, r_o = preproc(image_o, input_dim, self.means, self.std)
+            boxes_o *= r_o
+            boxes_t = boxes_o
+            labels_t = labels_o
+            ratios_t = ratios_o
+
+        labels_t = np.expand_dims(labels_t, 1)
+        if mixup:
+            ratios_t = np.expand_dims(ratios_t, 1)
+            targets_t = np.hstack((labels_t, boxes_t, ratios_t))  # rows: [label, cx, cy, w, h, ratio]
+        else:
+            targets_t = np.hstack((labels_t, boxes_t))  # rows: [label, cx, cy, w, h]
+        padded_labels = np.zeros((self.max_labels, lshape))
+        padded_labels[range(len(targets_t))[: self.max_labels]] = targets_t[
+            : self.max_labels
+        ]  # rows beyond max_labels are dropped, unused rows stay zero
+        padded_labels = np.ascontiguousarray(padded_labels, dtype=np.float32)
+        image_t = np.ascontiguousarray(image_t, dtype=np.float32)
+        return image_t, padded_labels
+
+
+class ValTransform:
+    """
+    Defines the transformation applied to a test/val image before it is fed to
+    the network: a thin wrapper around `preproc`.
+
+    resize + pad -> channel reversal and scaling -> optional mean/std -> transpose
+
+    Arguments:
+        rgb_means: forwarded to `preproc` as `mean`; skipped when None
+        std: forwarded to `preproc` as `std`; skipped when None
+        swap ((int,int,int)): axis order `preproc` uses for its final transpose
+
+    Returns:
+        transform (transform) : callable `(img, res, input_size)` returning the
+        pre-processed image as a tensor and a `torch.zeros(1, 5)` target;
+        `res` is accepted but not used
+    """
+
+    def __init__(self, rgb_means=None, std=None, swap=(2, 0, 1)):
+        self.means = rgb_means
+        self.swap = swap
+        self.std = std
+
+    # assume input is cv2 img for now
+    def __call__(self, img, res, input_size):
+        img, _ = preproc(img, input_size, self.means, self.std, self.swap)  # resize ratio is discarded
+        return torch.from_numpy(img), torch.zeros(1, 5)
diff --git a/yolox/data/data_prefetcher.py b/yolox/data/data_prefetcher.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ef73c0a70ece9a7bf5692c5cbc58bbc52c864d6
--- /dev/null
+++ b/yolox/data/data_prefetcher.py
@@ -0,0 +1,76 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+
+import random
+
+import torch
+import torch.distributed as dist
+
+from yolox.utils import synchronize
+
+
+class DataPrefetcher:
+    """
+    DataPrefetcher is inspired by the code of the following file:
+    https://github.com/NVIDIA/apex/blob/master/examples/imagenet/main_amp.py
+    It can speed up your pytorch dataloader. For more information, please check
+    https://github.com/NVIDIA/apex/issues/304#issuecomment-493562789.
+    """
+
+    def __init__(self, loader):
+        self.loader = iter(loader)
+        self.stream = torch.cuda.Stream()  # side stream on which the next batch is copied
+        self.input_cuda = self._input_cuda_for_image
+        self.record_stream = DataPrefetcher._record_stream_for_image
+        self.preload()  # fetch the first batch right away
+
+    def preload(self):  # pull the next batch and start its copy to the GPU
+        try:
+            self.next_input, self.next_target, _, _ = next(self.loader)  # 4-tuples; last two items dropped
+        except StopIteration:  # loader exhausted: next() will hand out (None, None)
+            self.next_input = None
+            self.next_target = None
+            return
+
+        with torch.cuda.stream(self.stream):  # issue the copies on the side stream
+            self.input_cuda()
+            self.next_target = self.next_target.cuda(non_blocking=True)
+
+    def next(self):  # return the prefetched (input, target) and start fetching the following one
+        torch.cuda.current_stream().wait_stream(self.stream)  # wait for the pending copies
+        input = self.next_input
+        target = self.next_target
+        if input is not None:
+            self.record_stream(input)  # mark the tensor as used on the current stream
+        if target is not None:
+            target.record_stream(torch.cuda.current_stream())
+        self.preload()
+        return input, target
+
+    def _input_cuda_for_image(self):
+        self.next_input = self.next_input.cuda(non_blocking=True)
+
+    @staticmethod
+    def _record_stream_for_image(input):
+        input.record_stream(torch.cuda.current_stream())
+
+
+def random_resize(data_loader, exp, epoch, rank, is_distributed):  # choose a size on rank 0, apply it to the loader
+    tensor = torch.LongTensor(1).cuda()  # one-element buffer that carries the size between ranks
+    if is_distributed:
+        synchronize()
+
+    if rank == 0:
+        if epoch > exp.max_epoch - 10:  # late epochs use the fixed exp.input_size
+            size = exp.input_size  # NOTE(review): fill_ below needs a scalar -- confirm this is not a tuple
+        else:
+            size = random.randint(*exp.random_size)
+            size = int(32 * size)  # the random value is a multiplier of 32 pixels
+        tensor.fill_(size)
+
+    if is_distributed:
+        synchronize()
+        dist.broadcast(tensor, 0)  # every rank receives the value written by rank 0
+
+    input_size = data_loader.change_input_dim(multiple=tensor.item(), random_range=None)
+    return input_size
diff --git a/yolox/data/dataloading.py b/yolox/data/dataloading.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4f9512ddc90f8751d2b1de1ba819c58f5a7a40d
--- /dev/null
+++ b/yolox/data/dataloading.py
@@ -0,0 +1,172 @@
+import os
+import random
+
+import torch
+from torch.utils.data.dataloader import DataLoader as torchDataLoader
+from torch.utils.data.dataloader import default_collate
+
+from .samplers import YoloBatchSampler
+
+
+def get_yolox_datadir():
+    """
+    Return the YOLOX dataset directory: the value of the `YOLOX_DATADIR` environment
+    variable when it is set, otherwise the `data` directory inside the yolox package.
+    """
+    env_dir = os.environ.get("YOLOX_DATADIR")
+    if env_dir is not None:
+        return env_dir
+    import yolox
+    return os.path.join(os.path.dirname(yolox.__file__), "data")
+
+
+class DataLoader(torchDataLoader):
+    """
+    Lightnet dataloader that enables on the fly resizing of the images.
+    See :class:`torch.utils.data.DataLoader` for more information on the arguments.
+    Check more on the following website:
+    https://gitlab.com/EAVISE/lightnet/-/blob/master/lightnet/data/_dataloading.py
+
+    Note:
+        This dataloader only works with :class:`lightnet.data.Dataset` based datasets.
+
+    Example:
+        >>> class CustomSet(ln.data.Dataset):
+        ...     def __len__(self):
+        ...         return 4
+        ...     @ln.data.Dataset.resize_getitem
+        ...     def __getitem__(self, index):
+        ...         # Should return (image, anno) but here we return (input_dim,)
+        ...         return (self.input_dim,)
+        >>> dl = ln.data.DataLoader(
+        ...     CustomSet((200,200)),
+        ...     batch_size = 2,
+        ...     collate_fn = ln.data.list_collate   # We want the data to be grouped as a list
+        ... )
+        >>> dl.dataset.input_dim    # Default input_dim
+        (200, 200)
+        >>> for d in dl:
+        ...     d
+        [[(200, 200), (200, 200)]]
+        [[(200, 200), (200, 200)]]
+        >>> dl.change_input_dim(320, random_range=None)
+        (320, 320)
+        >>> for d in dl:
+        ...     d
+        [[(320, 320), (320, 320)]]
+        [[(320, 320), (320, 320)]]
+        >>> dl.change_input_dim((480, 320), random_range=None)
+        (480, 320)
+        >>> for d in dl:
+        ...     d
+        [[(480, 320), (480, 320)]]
+        [[(480, 320), (480, 320)]]
+    """
+
+    def __init__(self, *args, **kwargs):
+        """
+        Build the torch DataLoader, then install a `YoloBatchSampler` unless the caller
+        supplied a `batch_sampler`.
+
+        Args:
+            *args, **kwargs: forwarded unchanged to the torch `DataLoader` constructor.
+
+        `shuffle`, `sampler` and `batch_sampler` are read from their positional slots of
+        the torch signature (`args[2]`, `args[3]`, `args[4]`) when given positionally and
+        from `kwargs` otherwise; they default to False, None and None.
+        """
+        super().__init__(*args, **kwargs)
+        # NOTE(review): presumably this lowers the base class's name-mangled guard flag so
+        # that `batch_sampler` can be re-assigned below; confirm against the torch source.
+        self.__initialized = False
+
+        # `sampler` used to stay unbound when it was not supplied, and the old `len(args)`
+        # thresholds were one too high, so the last positional argument was ignored. Each
+        # value now starts from its keyword (or default) and is overridden positionally.
+        shuffle = kwargs.get("shuffle", False)
+        sampler = kwargs.get("sampler", None)
+        batch_sampler = kwargs.get("batch_sampler", None)
+        if len(args) > 2:
+            shuffle = args[2]
+        if len(args) > 3:
+            sampler = args[3]
+        if len(args) > 4:
+            batch_sampler = args[4]
+
+        # Use custom BatchSampler
+        if batch_sampler is None:
+            if sampler is None:
+                if shuffle:
+                    sampler = torch.utils.data.sampler.RandomSampler(self.dataset)
+                else:
+                    sampler = torch.utils.data.sampler.SequentialSampler(self.dataset)
+            batch_sampler = YoloBatchSampler(
+                sampler,
+                self.batch_size,
+                self.drop_last,
+                input_dimension=self.dataset.input_dim,
+            )
+
+        self.batch_sampler = batch_sampler
+
+        self.__initialized = True
+
+    def close_mosaic(self):  # clear the `mosaic` flag on the active batch sampler
+        self.batch_sampler.mosaic = False
+
+    def change_input_dim(self, multiple=32, random_range=(10, 19)):
+        """ Compute a new input size and hand it to the batch sampler, which is expected
+        to apply it on the next mini_batch.
+
+        Args:
+            multiple (int or tuple, optional): factor(s) the random integer is
+                multiplied by; Default **32**
+            random_range (tuple, optional): inclusive (min, max) bounds of the random
+                integer; Default **(10, 19)**
+
+        Return:
+            tuple: the new size, in the same order as ``multiple`` when that is a
+            tuple; it is also stored in ``batch_sampler.new_input_dim``
+
+        Note:
+            A random integer ``rng`` is drawn from ``random_range``. An integer
+            ``multiple`` gives the square size ``(rng * multiple, rng * multiple)``;
+            a tuple gives ``(rng * multiple[0], rng * multiple[1])``.
+
+        Note:
+            Passing ``random_range=None`` fixes ``rng`` to 1, so the size is exactly
+            ``multiple``. See the example in the class docstring.
+        """
+        if random_range is None:
+            rng = 1
+        else:
+            rng = random.randint(*random_range)
+
+        if isinstance(multiple, int):
+            factor_0 = factor_1 = multiple
+        else:
+            factor_0, factor_1 = multiple[0], multiple[1]
+        size = (rng * factor_0, rng * factor_1)
+
+        # only recorded here; the batch sampler is the one that picks it up
+        self.batch_sampler.new_input_dim = size
+
+        return size
+
+
+def list_collate(batch):
+    """
+    Collate a batch field by field into one list. A field whose first entry is a list or
+    tuple is kept as a plain list of the per-sample values (e.g. Brambox boxes); every
+    other field is handed to torch's `default_collate`.
+    """
+    collated = []
+    # zip(*batch) regroups the samples: one tuple per field instead of one per sample
+    for field in zip(*batch):
+        if isinstance(field[0], (list, tuple)):
+            collated.append(list(field))
+        else:
+            collated.append(default_collate(field))
+
+    return collated
diff --git a/yolox/data/datasets/__init__.py b/yolox/data/datasets/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..75dc1aaf4ae7438a98bd7fe4bea6c5bc2d878a7a
--- /dev/null
+++ b/yolox/data/datasets/__init__.py
@@ -0,0 +1,7 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+
+from .coco import COCODataset
+from .coco_classes import COCO_CLASSES
+from .datasets_wrapper import ConcatDataset, Dataset, MixConcatDataset
+from .mosaicdetection import MosaicDetection
diff --git a/yolox/data/datasets/coco.py b/yolox/data/datasets/coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..f32f98817de54f7a574f33fc5a0e78a55ce8399d
--- /dev/null
+++ b/yolox/data/datasets/coco.py
@@ -0,0 +1,119 @@
+import os
+
+import cv2
+import numpy as np
+from pycocotools.coco import COCO
+
+from ..dataloading import get_yolox_datadir
+from .datasets_wrapper import Dataset
+
+
+class COCODataset(Dataset):
+    """
+    COCO dataset class.
+    """
+
+    def __init__(
+        self,
+        data_dir=None,
+        json_file="instances_train2017.json",
+        name="train2017",
+        img_size=(416, 416),
+        preproc=None,
+    ):
+        """
+        COCO dataset initialization. Annotation data are read into memory by COCO API.
+        Args:
+            data_dir (str): dataset root directory; defaults to `<yolox datadir>/COCO`
+            json_file (str): COCO json file name, looked up under `<data_dir>/annotations`
+            name (str): COCO data name (e.g. 'train2017' or 'val2017')
+            img_size (tuple): target image size after pre-processing
+            preproc: data augmentation strategy
+        """
+        super().__init__(img_size)
+        if data_dir is None:
+            data_dir = os.path.join(get_yolox_datadir(), "COCO")
+        self.data_dir = data_dir
+        self.json_file = json_file
+
+        self.coco = COCO(os.path.join(self.data_dir, "annotations", self.json_file))
+        self.ids = self.coco.getImgIds()
+        self.class_ids = sorted(self.coco.getCatIds())
+        cats = self.coco.loadCats(self.coco.getCatIds())
+        self._classes = tuple([c["name"] for c in cats])
+        self.name = name
+        self.max_labels = 50
+        self.img_size = img_size
+        self.preproc = preproc
+
+    def __len__(self):
+        return len(self.ids)
+
+    def pull_item(self, index):
+        """Load the raw image and its annotations for `index`, without pre-processing."""
+        id_ = self.ids[index]
+
+        im_ann = self.coco.loadImgs(id_)[0]
+        width = im_ann["width"]
+        height = im_ann["height"]
+        anno_ids = self.coco.getAnnIds(imgIds=[int(id_)], iscrowd=False)
+        annotations = self.coco.loadAnns(anno_ids)
+
+        # load image
+        img_file = os.path.join(self.data_dir, self.name, "{:012}".format(id_) + ".jpg")
+
+        img = cv2.imread(img_file)
+        assert img is not None, "failed to read image: {}".format(img_file)
+
+        # load labels
+        # each [x, y, w, h] box is clamped to the image; degenerate boxes are dropped
+        valid_objs = []
+        for obj in annotations:
+            x1 = np.max((0, obj["bbox"][0]))
+            y1 = np.max((0, obj["bbox"][1]))
+            x2 = np.min((width - 1, x1 + np.max((0, obj["bbox"][2] - 1))))
+            y2 = np.min((height - 1, y1 + np.max((0, obj["bbox"][3] - 1))))
+            if obj["area"] > 0 and x2 >= x1 and y2 >= y1:
+                obj["clean_bbox"] = [x1, y1, x2, y2]
+                valid_objs.append(obj)
+        objs = valid_objs
+        num_objs = len(objs)
+
+        res = np.zeros((num_objs, 5))
+
+        for ix, obj in enumerate(objs):
+            cls = self.class_ids.index(obj["category_id"])
+            res[ix, 0:4] = obj["clean_bbox"]
+            res[ix, 4] = cls
+
+        img_info = (height, width)
+
+        return img, res, img_info, id_
+
+    @Dataset.resize_getitem
+    def __getitem__(self, index):
+        """
+        One image / label pair for the given index is picked up and pre-processed.
+
+        Args:
+            index (int): data index
+
+        Returns:
+            img (numpy.ndarray): the image, pre-processed when `self.preproc` is set.
+            target: second value returned by `self.preproc(img, res, self.input_dim)`.
+                Without a `preproc` this is the raw `res` array from `pull_item`,
+                shape [num_objs, 5], each row [x1, y1, x2, y2, class index]
+                (this case used to raise UnboundLocalError).
+            img_info : tuple of (height, width) of the image as recorded in the
+                annotation file.
+            img_id (int): COCO image id of the sample, i.e. `self.ids[index]`.
+                Used for evaluation.
+
+        NOTE(review): `Dataset.resize_getitem` is defined elsewhere and may change
+        what is passed in as `index`; confirm against that decorator.
+        """
+        img, target, img_info, img_id = self.pull_item(index)
+
+        if self.preproc is not None:
+            img, target = self.preproc(img, target, self.input_dim)
+        return img, target, img_info, img_id
diff --git a/yolox/data/datasets/coco_classes.py b/yolox/data/datasets/coco_classes.py
new file mode 100644
index 0000000000000000000000000000000000000000..f3ca22671fd20f0303667761226dc6fd945e65bb
--- /dev/null
+++ b/yolox/data/datasets/coco_classes.py
@@ -0,0 +1,86 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+
+
+COCO_CLASSES = (
+    "person",
+    "bicycle",
+    "car",
+    "motorcycle",
+    "airplane",
+    "bus",
+    "train",
+    "truck",
+    "boat",
+    "traffic light",
+    "fire hydrant",
+    "stop sign",
+    "parking meter",
+    "bench",
+    "bird",
+    "cat",
+    "dog",
+    "horse",
+    "sheep",
+    "cow",
+    "elephant",
+    "bear",
+    "zebra",
+    "giraffe",
+    "backpack",
+    "umbrella",
+    "handbag",
+    "tie",
+    "suitcase",
+    "frisbee",
+    "skis",
+    "snowboard",
+    "sports ball",
+    "kite",
+    "baseball bat",
+    "baseball glove",
+    "skateboard",
+    "surfboard",
+    "tennis racket",
+    "bottle",
+    "wine glass",
+    "cup",
+    "fork",
+    "knife",
+    "spoon",
+    "bowl",
+    "banana",
+    "apple",
+    "sandwich",
+    "orange",
+    "broccoli",
+    "carrot",
+    "hot dog",
+    "pizza",
+    "donut",
+    "cake",
+    "chair",
+    "couch",
+    "potted plant",
+    "bed",
+    "dining table",
+    "toilet",
+    "tv",
+    "laptop",
+    "mouse",
+    "remote",
+    "keyboard",
+    "cell phone",
+    "microwave",
+    "oven",
+    "toaster",
+    "sink",
+    "refrigerator",
+    "book",
+    "clock",
+    "vase",
+    "scissors",
+    "teddy bear",
+    "hair drier",
+    "toothbrush",
+)
diff --git a/yolox/data/datasets/datasets_wrapper.py b/yolox/data/datasets/datasets_wrapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..056ca85a3c9a575aa1edea64ed54bd7d68de7f89
--- /dev/null
+++ b/yolox/data/datasets/datasets_wrapper.py
@@ -0,0 +1,126 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+import bisect
+from functools import wraps
+
+from torch.utils.data.dataset import ConcatDataset as torchConcatDataset
+from torch.utils.data.dataset import Dataset as torchDataset
+
+
+class ConcatDataset(torchConcatDataset):
+    """torch ``ConcatDataset`` that additionally offers ``pull_item`` across its members."""
+
+    def __init__(self, datasets):
+        super().__init__(datasets)
+        head = self.datasets[0]
+        if hasattr(head, "input_dim"):
+            # Mirror the first member's input size on the wrapper itself.
+            self._input_dim = self.input_dim = head.input_dim
+
+    def pull_item(self, idx):
+        """Forward ``pull_item`` to the member dataset that owns global index ``idx``."""
+        total = len(self)
+        if idx < 0:
+            if -idx > total:
+                raise ValueError("absolute value of index should not exceed dataset length")
+            idx = total + idx
+        which = bisect.bisect_right(self.cumulative_sizes, idx)
+        offset = self.cumulative_sizes[which - 1] if which > 0 else 0
+        return self.datasets[which].pull_item(idx - offset)
+
+
+class MixConcatDataset(torchConcatDataset):
+    """Concatenation of datasets whose ``__getitem__`` accepts sampler tuples.
+
+    ``index`` is either a plain integer or an ``(input_dim, idx, mosaic)``
+    sequence; only the ``idx`` part is translated to a member-local index.
+    """
+
+    def __init__(self, datasets):
+        super(MixConcatDataset, self).__init__(datasets)
+        if hasattr(self.datasets[0], "input_dim"):
+            self._input_dim = self.datasets[0].input_dim
+            self.input_dim = self.datasets[0].input_dim
+
+    def __getitem__(self, index):
+        plain = isinstance(index, int)
+        # Fix: ``idx`` used to be unbound (UnboundLocalError) for plain int indices.
+        idx = index if plain else index[1]
+        if idx < 0:
+            if -idx > len(self):
+                raise ValueError("absolute value of index should not exceed dataset length")
+            idx = len(self) + idx
+        dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx)
+        sample_idx = idx - (self.cumulative_sizes[dataset_idx - 1] if dataset_idx else 0)
+        # Hand the member dataset its own local index, in the same form we received.
+        local = sample_idx if plain else (index[0], sample_idx, index[2])
+        return self.datasets[dataset_idx][local]
+
+
+class Dataset(torchDataset):
+    """ This class is a subclass of the base :class:`torch.utils.data.Dataset`,
+    that enables on the fly resizing of the ``input_dim``.
+
+    Args:
+        input_dimension (tuple): (width,height) tuple with default dimensions of the network
+        mosaic (bool): initial value of the ``_mosaic`` flag; indexed lookups may overwrite it.
+    """
+
+    def __init__(self, input_dimension, mosaic=True):
+        super().__init__()
+        self.__input_dim = input_dimension[:2]
+        self._mosaic = mosaic
+
+    @property
+    def input_dim(self):
+        """
+        Dimension that can be used by transforms to set the correct image size, etc.
+        This allows transforms to have a single source of truth
+        for the input dimension of the network.
+
+        Return:
+            The temporary per-lookup override when one is set, else the constructor default.
+        """
+        if hasattr(self, "_input_dim"):
+            return self._input_dim
+        return self.__input_dim
+
+    @staticmethod
+    def resize_getitem(getitem_fn):
+        """
+        Decorator method that needs to be used around the ``__getitem__`` method.
+        It lets a batch sampler pass ``(input_dim, index, mosaic)`` instead of a bare
+        integer: ``input_dim`` overrides :attr:`input_dim` for that single lookup and
+        ``mosaic`` (optional) replaces ``_mosaic``; a bare ``int`` is passed straight through.
+
+        Example:
+            >>> class CustomSet(Dataset):
+            ...     def __len__(self):
+            ...         return 10
+            ...     @Dataset.resize_getitem
+            ...     def __getitem__(self, index):
+            ...         # Should return (image, anno) but here we return input_dim
+            ...         return self.input_dim
+            >>> data = CustomSet((200,200))
+            >>> data[0]
+            (200, 200)
+            >>> data[(480,320), 0]
+            (480, 320)
+        """
+
+        @wraps(getitem_fn)
+        def wrapper(self, index):
+            # Plain integer: nothing to override.
+            if isinstance(index, int):
+                return getitem_fn(self, index)
+            self._input_dim = index[0]
+            if len(index) > 2:  # the mosaic flag is optional (see the 2-tuple example)
+                self._mosaic = index[2]
+            try:
+                return getitem_fn(self, index[1])
+            finally:
+                # Drop the override even when the lookup raises, so a failed
+                # sample cannot leave a stale size on the dataset.
+                del self._input_dim
+
+        return wrapper
diff --git a/yolox/data/datasets/mosaicdetection.py b/yolox/data/datasets/mosaicdetection.py
new file mode 100644
index 0000000000000000000000000000000000000000..778f6483b7cedcacc6c26e87f1e85044849b31f9
--- /dev/null
+++ b/yolox/data/datasets/mosaicdetection.py
@@ -0,0 +1,195 @@
+import random
+
+import cv2
+import numpy as np
+
+from yolox.utils import adjust_box_anns
+
+from ..data_augment import box_candidates, random_perspective
+from .datasets_wrapper import Dataset
+
+
+class MosaicDetection(Dataset):
+    """Detection dataset wrapper that performs mixup for normal dataset.
+
+    Parameters
+    ----------
+    dataset : Pytorch Dataset
+        Gluon dataset object.
+    *args : list
+        Additional arguments for mixup random sampler.
+    """
+
+    def __init__(
+        self, dataset, img_size, mosaic=True, preproc=None,
+        degrees=10.0, translate=0.1, scale=(0.5, 1.5), mscale=(0.5, 1.5),
+        shear=2.0, perspective=0.0, enable_mixup=True, *args
+    ):
+        """Wrap ``dataset`` and remember the augmentation settings (extra ``*args`` are unused)."""
+        super().__init__(img_size, mosaic=mosaic)
+        # Source dataset (queried through ``pull_item``) and the final preprocessing callable.
+        self._dataset, self.preproc = dataset, preproc
+        # Arguments handed to ``random_perspective`` in ``__getitem__``.
+        self.degrees, self.translate, self.scale = degrees, translate, scale
+        self.shear, self.perspective = shear, perspective
+        # Range from which ``mixup`` draws its random rescale factor.
+        self.mixup_scale = mscale
+        # Switches for the mosaic branch and for the additional mixup step.
+        self._mosaic, self.enable_mixup = mosaic, enable_mixup
+
+    def __len__(self):
+        return len(self._dataset)  # same length as the wrapped dataset
+
+    @Dataset.resize_getitem
+    def __getitem__(self, idx):
+        """Return ``(img, labels, img_info, idx)`` for sample ``idx``.
+
+        With ``_mosaic`` set, four images are tiled on a (2s, 2s) canvas, warped by
+        ``random_perspective`` and optionally blended by ``mixup``; otherwise the
+        item is pulled from the wrapped dataset and only ``preproc`` is applied.
+        """
+        if self._mosaic:
+            labels4 = []
+            s = self._dataset.input_dim[0]
+            # mosaic center (y, x), drawn at random between 0.5 * s and 1.5 * s
+            yc = int(random.uniform(0.5 * s, 1.5 * s))
+            xc = int(random.uniform(0.5 * s, 1.5 * s))
+            # the requested image plus 3 randomly chosen additional image indices
+            indices = [idx] + [random.randint(0, len(self._dataset) - 1) for _ in range(3)]
+            for i, index in enumerate(indices):
+                img, _labels, _, _ = self._dataset.pull_item(index)
+                h0, w0 = img.shape[:2]  # orig hw
+                r = 1.0 * s / max(h0, w0)  # scale so the longer side becomes s
+                interp = cv2.INTER_LINEAR
+                img = cv2.resize(img, (int(w0 * r), int(h0 * r)), interpolation=interp)
+                (h, w) = img.shape[:2]
+
+                if i == 0:  # top left
+                    # base image with 4 tiles, pre-filled with the value 114
+                    img4 = np.full((s * 2, s * 2, img.shape[2]), 114, dtype=np.uint8)
+                    # xmin, ymin, xmax, ymax (large image)
+                    x1a, y1a, x2a, y2a = (max(xc - w, 0), max(yc - h, 0), xc, yc,)
+                    # xmin, ymin, xmax, ymax (small image)
+                    x1b, y1b, x2b, y2b = (w - (x2a - x1a), h - (y2a - y1a), w, h,)
+                elif i == 1:  # top right
+                    x1a, y1a, x2a, y2a = xc, max(yc - h, 0), min(xc + w, s * 2), yc
+                    x1b, y1b, x2b, y2b = 0, h - (y2a - y1a), min(w, x2a - x1a), h
+                elif i == 2:  # bottom left
+                    x1a, y1a, x2a, y2a = max(xc - w, 0), yc, xc, min(s * 2, yc + h)
+                    x1b, y1b, x2b, y2b = w - (x2a - x1a), 0, w, min(y2a - y1a, h)
+                elif i == 3:  # bottom right
+                    x1a, y1a, x2a, y2a = xc, yc, min(xc + w, s * 2), min(s * 2, yc + h)
+                    x1b, y1b, x2b, y2b = 0, 0, min(w, x2a - x1a), min(y2a - y1a, h)
+
+                img4[y1a:y2a, x1a:x2a] = img[y1b:y2b, x1b:x2b]  # img4[ymin:ymax, xmin:xmax]
+                padw = x1a - x1b
+                padh = y1a - y1b
+
+                labels = _labels.copy()  # [[xmin, ymin, xmax, ymax, label_ind], ... ]
+                if _labels.size > 0:  # rescale boxes by r and shift them into canvas coordinates
+                    labels[:, 0] = r * _labels[:, 0] + padw
+                    labels[:, 1] = r * _labels[:, 1] + padh
+                    labels[:, 2] = r * _labels[:, 2] + padw
+                    labels[:, 3] = r * _labels[:, 3] + padh
+                labels4.append(labels)
+
+            if len(labels4):
+                labels4 = np.concatenate(labels4, 0)
+                np.clip(labels4[:, :4], 0, 2 * s, out=labels4[:, :4])  # keep boxes on the canvas
+            img4, labels4 = random_perspective(
+                img4,
+                labels4,
+                degrees=self.degrees,
+                translate=self.translate,
+                scale=self.scale,
+                shear=self.shear,
+                perspective=self.perspective,
+                border=[-s // 2, -s // 2],
+            )  # border to remove
+
+            # Optional mixup step; the original note here cited https://arxiv.org/abs/2012.07177
+            if self.enable_mixup and not len(labels4) == 0:
+                img4, labels4 = self.mixup(img4, labels4, self.input_dim)
+            mix_img, padded_labels = self.preproc(img4, labels4, self.input_dim)
+            img_info = (mix_img.shape[1], mix_img.shape[0])
+            return mix_img, padded_labels, img_info, int(idx)
+        else:
+            self._dataset._input_dim = self.input_dim
+            img, label, img_info, idx = self._dataset.pull_item(idx)
+            img, label = self.preproc(img, label, self.input_dim)
+            return img, label, img_info, int(idx)
+
+    def mixup(self, origin_img, origin_labels, input_dim):
+        """Mix a randomly picked annotated image into ``origin_img`` with equal weights.
+
+        The wrapped dataset must expose ``ids`` and ``coco``. Blending and the extra labels
+        apply only if a pasted box passes ``box_candidates``; the result is ``uint8``.
+        """
+        jit_factor = random.uniform(*self.mixup_scale)
+        FLIP = random.uniform(0, 1) > 0.5
+        cp_labels = []
+        while len(cp_labels) == 0:  # redraw until getAnnIds(iscrowd=False) finds annotations
+            cp_index = random.randint(0, self.__len__() - 1)
+            id_ = self._dataset.ids[cp_index]
+            anno_ids = self._dataset.coco.getAnnIds(imgIds=[int(id_)], iscrowd=False)
+            cp_labels = self._dataset.coco.loadAnns(anno_ids)
+        img, cp_labels, _, _ = self._dataset.pull_item(cp_index)
+        if len(img.shape) == 3:
+            cp_img = np.ones((input_dim[0], input_dim[1], 3)) * 114.0
+        else:
+            cp_img = np.ones(input_dim) * 114.0
+        cp_scale_ratio = input_dim[0] / max(img.shape[0], img.shape[1])
+        resized_img = cv2.resize(
+            img,
+            (int(img.shape[1] * cp_scale_ratio), int(img.shape[0] * cp_scale_ratio)),
+            interpolation=cv2.INTER_LINEAR,
+        ).astype(np.float32)
+        cp_img[
+            : int(img.shape[0] * cp_scale_ratio), : int(img.shape[1] * cp_scale_ratio)
+        ] = resized_img
+        cp_img = cv2.resize(
+            cp_img,
+            (int(cp_img.shape[1] * jit_factor), int(cp_img.shape[0] * jit_factor)),
+        )
+        cp_scale_ratio *= jit_factor
+        if FLIP:
+            cp_img = cp_img[:, ::-1, :]
+
+        origin_h, origin_w = cp_img.shape[:2]
+        target_h, target_w = origin_img.shape[:2]
+        padded_img = np.zeros(
+            (max(origin_h, target_h), max(origin_w, target_w), 3)
+        ).astype(np.uint8)
+        padded_img[:origin_h, :origin_w] = cp_img
+        x_offset, y_offset = 0, 0
+        if padded_img.shape[0] > target_h:
+            y_offset = random.randint(0, padded_img.shape[0] - target_h - 1)
+        if padded_img.shape[1] > target_w:
+            x_offset = random.randint(0, padded_img.shape[1] - target_w - 1)
+        padded_cropped_img = padded_img[
+            y_offset: y_offset + target_h, x_offset: x_offset + target_w
+        ]
+
+        cp_bboxes_origin_np = adjust_box_anns(
+            cp_labels[:, :4], cp_scale_ratio, 0, 0, origin_w, origin_h
+        )
+        if FLIP:
+            cp_bboxes_origin_np[:, 0::2] = (
+                origin_w - cp_bboxes_origin_np[:, 0::2][:, ::-1]
+            )
+        cp_bboxes_transformed_np = cp_bboxes_origin_np.copy()
+        cp_bboxes_transformed_np[:, 0::2] = np.clip(
+            cp_bboxes_transformed_np[:, 0::2] - x_offset, 0, target_w
+        )
+        cp_bboxes_transformed_np[:, 1::2] = np.clip(
+            cp_bboxes_transformed_np[:, 1::2] - y_offset, 0, target_h
+        )
+        keep_list = box_candidates(cp_bboxes_origin_np.T, cp_bboxes_transformed_np.T, 5)
+        if keep_list.sum() >= 1.0:
+            cls_labels = cp_labels[keep_list, 4:5]
+            box_labels = cp_bboxes_transformed_np[keep_list]
+            labels = np.hstack((box_labels, cls_labels))
+            origin_labels = np.vstack((origin_labels, labels))
+            origin_img = origin_img.astype(np.float32)
+            origin_img = 0.5 * origin_img + 0.5 * padded_cropped_img.astype(np.float32)
+        return origin_img.astype(np.uint8), origin_labels
diff --git a/yolox/data/datasets/voc.py b/yolox/data/datasets/voc.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b8edb92701510c46f04382cf1a8a263179d740b
--- /dev/null
+++ b/yolox/data/datasets/voc.py
@@ -0,0 +1,313 @@
+"""VOC Dataset Classes
+
+Original author: Francisco Massa
+https://github.com/fmassa/vision/blob/voc_dataset/torchvision/datasets/voc.py
+
+Updated by: Ellis Brown, Max deGroot
+"""
+
+import os
+import os.path
+import pickle
+import xml.etree.ElementTree as ET
+
+import cv2
+import numpy as np
+
+from yolox.evalutors.voc_eval import voc_eval
+
+from .datasets_wrapper import Dataset
+from .voc_classes import VOC_CLASSES
+
+# Four-component color tuples for drawing bounding boxes (channel order not established here).
+COLORS = (
+    (255, 0, 0, 128),
+    (0, 255, 0, 128),
+    (0, 0, 255, 128),
+    (0, 255, 255, 128),
+    (255, 0, 255, 128),
+    (255, 255, 0, 128),
+)
+
+
+class AnnotationTransform(object):
+
+    """Transforms a VOC annotation into an array of bbox coords and label index
+    Initialized with a dictionary lookup of classnames to indexes
+
+    Arguments:
+        class_to_ind (dict, optional): dictionary lookup of classnames -> indexes
+            (default: VOC_CLASSES mapped to their position in that tuple)
+        keep_difficult (bool, optional): keep difficult instances or not
+            (default: True)
+    """
+
+    def __init__(self, class_to_ind=None, keep_difficult=True):
+        self.class_to_ind = class_to_ind or dict(zip(VOC_CLASSES, range(len(VOC_CLASSES))))
+        self.keep_difficult = keep_difficult
+
+    def __call__(self, target):
+        """
+        Arguments:
+            target (annotation) : the target annotation to be made usable
+                will be an ET.Element
+        Returns:
+            float ndarray of shape (num_objects, 5) whose rows are
+            [xmin, ymin, xmax, ymax, label_ind]; shape (0, 5) without objects.
+        Raises:
+            KeyError: if an object name is missing from ``class_to_ind``.
+        """
+        rows = []
+        for obj in target.iter("object"):
+            # A missing or empty <difficult> tag counts as "not difficult" instead of crashing.
+            flag = obj.findtext("difficult")
+            difficult = bool(flag and flag.strip()) and int(flag) == 1
+            if difficult and not self.keep_difficult:
+                continue
+            name = obj.find("name").text.lower().strip()
+            bbox = obj.find("bndbox")
+            # Each coordinate is shifted by -1; going through float() also accepts "12.0".
+            row = [
+                int(float(bbox.find(pt).text)) - 1
+                for pt in ("xmin", "ymin", "xmax", "ymax")
+            ]
+            row.append(self.class_to_ind[name])
+            rows.append(row)
+
+        # Build the array once instead of re-stacking it for every object.
+        res = np.array(rows, dtype=np.float64).reshape(-1, 5)
+        return res  # [[xmin, ymin, xmax, ymax, label_ind], ... ]
+
+
+class VOCDetection(Dataset):
+
+    """
+    VOC Detection Dataset Object
+
+    input is image, target is annotation
+
+    Args:
+        root (string): filepath to VOCdevkit folder.
+        image_set (string): imageset to use (eg. 'train', 'val', 'test')
+        transform (callable, optional): transformation to perform on the
+            input image
+        target_transform (callable, optional): transformation to perform on the
+            target `annotation`
+            (eg: take in caption string, return tensor of word indices)
+        dataset_name (string, optional): which dataset to load
+            (default: 'VOC2007')
+    """
+
+    def __init__(
+        self,
+        root,
+        image_sets,
+        preproc=None,
+        target_transform=AnnotationTransform(),
+        input_dim=(416, 416),
+        dataset_name="VOC0712",
+    ):
+        """Collect the ``(rootpath, image_id)`` pairs of every ``(year, name)`` image set."""
+        super().__init__(input_dim)
+        self.root = root
+        self.image_set = image_sets
+        self.preproc = preproc
+        self.target_transform = target_transform
+        self.name = dataset_name
+        self._annopath = os.path.join("%s", "Annotations", "%s.xml")
+        self._imgpath = os.path.join("%s", "JPEGImages", "%s.jpg")
+        self._classes = VOC_CLASSES
+        self.ids = list()
+        for (year, name) in image_sets:
+            self._year = year  # the last listed year is what the result paths use
+            rootpath = os.path.join(self.root, "VOC" + year)
+            # Context manager closes the list file (it used to be left open).
+            with open(os.path.join(rootpath, "ImageSets", "Main", name + ".txt")) as f:
+                self.ids.extend((rootpath, line.strip()) for line in f)
+
+    @Dataset.resize_getitem
+    def __getitem__(self, index):
+        """Load image ``index`` and its annotation, applying the optional transforms.
+
+        Returns ``(img, target, (width, height), img_id)``; the size is that of the
+        image as read from disk, before ``preproc`` runs.
+        """
+        img_id = self.ids[index]
+        target = ET.parse(self._annopath % img_id).getroot()
+        img = cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR)
+        height, width, _ = img.shape
+        img_info = (width, height)  # measured before preproc touches the image
+
+        if self.target_transform is not None:
+            target = self.target_transform(target)
+        if self.preproc is not None:
+            img, target = self.preproc(img, target, self.input_dim)
+
+        return img, target, img_info, img_id
+
+    def __len__(self):
+        return len(self.ids)  # one entry per (rootpath, image_id) pair
+
+    def pull_image(self, index):
+        """Returns the original image at index as read by ``cv2.imread``
+
+        Note: not using self.__getitem__(), as any transformations passed in
+        could mess up this functionality.
+
+        Argument:
+            index (int): index of img to show
+        Return:
+            whatever ``cv2.imread(path, cv2.IMREAD_COLOR)`` yields for the image path
+        """
+        img_id = self.ids[index]
+        return cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR)
+
+    def pull_anno(self, index):
+        """Returns the image name and the transformed annotation of image at index
+
+        Note: not using self.__getitem__(), as any transformations passed in
+        could mess up this functionality.
+
+        Argument:
+            index (int): index of img to get annotation of
+        Return:
+            tuple: (image_id, target) where target is what ``target_transform``
+                returns for the parsed annotation root
+        """
+        img_id = self.ids[index]
+        anno = ET.parse(self._annopath % img_id).getroot()
+        gt = self.target_transform(anno)  # single argument, as in __getitem__/pull_item
+        return img_id[1], gt
+
+    def pull_item(self, index):
+        """Returns the raw image and its (transformed) target at an index for mixup
+
+        Note: unlike ``__getitem__``, ``preproc`` is never applied here.
+
+        Argument:
+            index (int): index of img to show
+        Return:
+            img, target, (width, height), img_id
+        """
+        img_id = self.ids[index]
+        anno_root = ET.parse(self._annopath % img_id).getroot()
+        img = cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR)
+
+        height, width, _ = img.shape
+        img_info = (width, height)
+
+        transform = self.target_transform
+        # Without a transform the bare XML root element is handed back.
+        target = anno_root if transform is None else transform(anno_root)
+
+        return img, target, img_info, img_id
+
+    def evaluate_detections(self, all_boxes, output_dir=None):
+        """
+        Write the detections to VOC result files and score them at IoU 0.50:0.05:0.95.
+
+        all_boxes is a list of length number-of-classes.
+        Each list element is a list of length number-of-images.
+        all_boxes[class][image] = [] or np.array of shape #dets x 5
+
+        Returns (mean AP over all thresholds, AP at IoU 0.5).
+        """
+        self._write_voc_results_file(all_boxes)
+        # np.linspace needs an integer sample count; np.round() yields a float.
+        num_thresholds = int(np.round((0.95 - 0.5) / 0.05)) + 1
+        IouTh = np.linspace(0.5, 0.95, num_thresholds, endpoint=True)
+        mAPs = [self._do_python_eval(output_dir, iou) for iou in IouTh]
+
+        print("--------------------------------------------------------------")
+        print("map_5095:", np.mean(mAPs))
+        print("map_50:", mAPs[0])
+        print("--------------------------------------------------------------")
+        return np.mean(mAPs), mAPs[0]
+
+    def _get_voc_results_file_template(self):
+        """Return ``<root>/results/VOC<year>/Main/comp4_det_test_{:s}.txt``, creating the folder."""
+        filedir = os.path.join(self.root, "results", "VOC" + self._year, "Main")
+        # exist_ok avoids the check-then-create race when several processes evaluate at once.
+        os.makedirs(filedir, exist_ok=True)
+        filename = "comp4_det_test" + "_{:s}.txt"
+        return os.path.join(filedir, filename)
+
+    def _write_voc_results_file(self, all_boxes):
+        """Write one result file per class.
+        Each line is ``image_id score xmin ymin xmax ymax``; boxes are written with +1
+        on every coordinate (the annotation reader in this module subtracts 1).
+        """
+        for cls_ind, cls in enumerate(VOC_CLASSES):
+            if cls == "__background__":
+                continue
+            print("Writing {} VOC results file".format(cls))
+            filename = self._get_voc_results_file_template().format(cls)
+            with open(filename, "wt") as f:
+                for im_ind, index in enumerate(self.ids):
+                    index = index[1]
+                    dets = all_boxes[cls_ind][im_ind]
+                    # len() covers both [] and ndarrays; `dets == []` is unreliable on arrays.
+                    if len(dets) == 0:
+                        continue
+                    for k in range(dets.shape[0]):
+                        f.write(
+                            "{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n".format(
+                                index, dets[k, -1],
+                                dets[k, 0] + 1, dets[k, 1] + 1, dets[k, 2] + 1, dets[k, 3] + 1,
+                            )
+                        )
+
+    def _do_python_eval(self, output_dir="output", iou=0.5):
+        """Run ``voc_eval`` for each class at threshold ``iou`` and return the mean AP.
+        Per-class PR data is pickled into ``output_dir`` unless that is ``None``.
+        """
+        rootpath = os.path.join(self.root, "VOC" + self._year)
+        name = self.image_set[0][1]  # only the first image set's name is used
+        annopath = os.path.join(rootpath, "Annotations", "{:s}.xml")
+        imagesetfile = os.path.join(rootpath, "ImageSets", "Main", name + ".txt")
+        cachedir = os.path.join(
+            self.root, "annotations_cache", "VOC" + self._year, name
+        )
+        if not os.path.exists(cachedir):
+            os.makedirs(cachedir)
+        aps = []
+        # The PASCAL VOC metric changed in 2010
+        use_07_metric = True if int(self._year) < 2010 else False
+        print("VOC07 metric? " + ("Yes" if use_07_metric else "No"))
+        if output_dir is not None and not os.path.isdir(output_dir):
+            os.mkdir(output_dir)
+        for i, cls in enumerate(VOC_CLASSES):
+            if cls == "__background__":
+                continue
+            filename = self._get_voc_results_file_template().format(cls)
+            rec, prec, ap = voc_eval(
+                filename,
+                annopath,
+                imagesetfile,
+                cls,
+                cachedir,
+                ovthresh=iou,
+                use_07_metric=use_07_metric,
+            )
+            aps += [ap]
+            if iou == 0.5:  # per-class output is printed only for the 0.5 threshold
+                print("AP for {} = {:.4f}".format(cls, ap))
+            if output_dir is not None:
+                with open(os.path.join(output_dir, cls + "_pr.pkl"), "wb") as f:
+                    pickle.dump({"rec": rec, "prec": prec, "ap": ap}, f)
+        if iou == 0.5:
+            print("Mean AP = {:.4f}".format(np.mean(aps)))
+            print("~~~~~~~~")
+            print("Results:")
+            for ap in aps:
+                print("{:.3f}".format(ap))
+            print("{:.3f}".format(np.mean(aps)))
+            print("~~~~~~~~")
+            print("")
+            print("--------------------------------------------------------------")
+            print("Results computed with the **unofficial** Python eval code.")
+            print("Results should be very close to the official MATLAB eval code.")
+            print("Recompute with `./tools/reval.py --matlab ...` for your paper.")
+            print("-- Thanks, The Management")
+            print("--------------------------------------------------------------")
+        return np.mean(aps)
diff --git a/yolox/data/datasets/voc_classes.py b/yolox/data/datasets/voc_classes.py
new file mode 100644
index 0000000000000000000000000000000000000000..1fe640af60045d4f03397ccb5c645ce656d3d698
--- /dev/null
+++ b/yolox/data/datasets/voc_classes.py
@@ -0,0 +1,27 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+
+
+# The 20 VOC class names; there is no "__background__" entry in this tuple.
+VOC_CLASSES = (
+    "aeroplane",
+    "bicycle",
+    "bird",
+    "boat",
+    "bottle",
+    "bus",
+    "car",
+    "cat",
+    "chair",
+    "cow",
+    "diningtable",
+    "dog",
+    "horse",
+    "motorbike",
+    "person",
+    "pottedplant",
+    "sheep",
+    "sofa",
+    "train",
+    "tvmonitor",
+)
diff --git a/yolox/data/samplers.py b/yolox/data/samplers.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a7d521a09345b0a3645fee6e6d9b0fda73a904a
--- /dev/null
+++ b/yolox/data/samplers.py
@@ -0,0 +1,94 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+
+import itertools
+from typing import Optional
+
+import torch
+import torch.distributed as dist
+from torch.utils.data.sampler import BatchSampler as torchBatchSampler
+from torch.utils.data.sampler import Sampler
+
+
+class YoloBatchSampler(torchBatchSampler):
+    """
+    Batch sampler yielding lists of ``(input_dim, index, mosaic)`` tuples built from another
+    sampler. Batching itself is delegated to :class:`torch.utils.data.sampler.BatchSampler`;
+    every entry of one mini-batch carries the same ``input_dim`` and ``mosaic`` values.
+    """
+
+    def __init__(self, *args, input_dimension=None, mosaic=True, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.input_dim = input_dimension
+        # Pending size; it is picked up before the next batch is produced.
+        self.new_input_dim, self.mosaic = None, mosaic
+
+    def __iter__(self):
+        self.__set_input_dim()
+        for indices in super().__iter__():
+            yield [(self.input_dim, sample, self.mosaic) for sample in indices]
+            self.__set_input_dim()
+
+    def __set_input_dim(self):
+        """ Promote a pending ``new_input_dim`` (if any) to ``input_dim`` and clear it. """
+        pending = self.new_input_dim
+        if pending is not None:
+            self.input_dim, self.new_input_dim = (pending[0], pending[1]), None
+
+
+class InfiniteSampler(Sampler):
+    """
+    Endless index stream for training.
+
+    The full stream is ``shuffle(range(size)) + shuffle(range(size)) + ...`` when
+    ``shuffle`` is True and ``range(size) + range(size) + ...`` otherwise; it is generated
+    from the same seed everywhere, and this sampler keeps every ``world_size``-th entry
+    starting at position ``rank``, i.e. ``indices[rank::world_size]``.
+    """
+
+    def __init__(
+        self,
+        size: int,
+        shuffle: bool = True,
+        seed: Optional[int] = 0,
+        rank=0,
+        world_size=1,
+    ):
+        """
+        Args:
+            size (int): the total number of data of the underlying dataset to sample from
+            shuffle (bool): whether to shuffle the indices or not
+            seed (int): the initial seed of the shuffle. Must be the same
+                across all workers. NOTE(review): ``None`` is not handled here,
+                ``int(None)`` raises ``TypeError``.
+            rank, world_size: used only when ``torch.distributed`` is not initialized;
+                otherwise the values reported by ``torch.distributed`` win.
+        """
+        self._size = size
+        assert size > 0
+        self._shuffle, self._seed = shuffle, int(seed)
+
+        distributed = dist.is_available() and dist.is_initialized()
+        self._rank = dist.get_rank() if distributed else rank
+        self._world_size = dist.get_world_size() if distributed else world_size
+
+    def __iter__(self):
+        """Yield this rank's share of the infinite stream."""
+        stream = self._infinite_indices()
+        yield from itertools.islice(stream, self._rank, None, self._world_size)
+
+    def _infinite_indices(self):
+        """Generate the endless, seed-determined index stream (before rank slicing)."""
+        gen = torch.Generator()
+        gen.manual_seed(self._seed)
+        while True:
+            # One pass over the data per loop iteration.
+            if self._shuffle:
+                epoch = torch.randperm(self._size, generator=gen)
+            else:
+                epoch = torch.arange(self._size)
+            yield from epoch
+
+    def __len__(self):
+        """Number of indices per rank in one pass over the data (floor division)."""
+        return self._size // self._world_size
diff --git a/yolox/evalutors/__init__.py b/yolox/evalutors/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..adb1a3deff2ebbc57fef2725607485ae2a7d6884
--- /dev/null
+++ b/yolox/evalutors/__init__.py
@@ -0,0 +1,4 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+
+from .coco_evaluator import COCOEvaluator
diff --git a/yolox/evalutors/coco_evaluator.py b/yolox/evalutors/coco_evaluator.py
new file mode 100644
index 0000000000000000000000000000000000000000..2551bc3a1695c96aaa725c51352715b8561d4415
--- /dev/null
+++ b/yolox/evalutors/coco_evaluator.py
@@ -0,0 +1,217 @@
+import contextlib
+import io
+import itertools
+import json
+import tempfile
+import time
+from loguru import logger
+from tqdm import tqdm
+
+import torch
+
+from yolox.utils import (
+    gather,
+    is_main_process,
+    postprocess,
+    synchronize,
+    time_synchronized,
+    xyxy2xywh
+)
+
+
+class COCOEvaluator:
+    """
+    COCO AP Evaluation class.  All the data in the val2017 dataset are processed
+    and evaluated by COCO API.
+    """
+
+    def __init__(
+        self, dataloader, img_size, confthre, nmsthre, num_classes, testdev=False
+    ):
+        """
+        Args:
+            dataloader (Dataloader): evaluate dataloader.
+            img_size (tuple): (height, width) of the network input after
+                preprocess; used to map boxes back to the original image.
+            confthre (float): confidence threshold ranging from 0 to 1, which
+                is defined in the config file.
+            nmsthre (float): IoU threshold of non-max suppression ranging from 0 to 1.
+            num_classes (int): number of classes, forwarded to postprocess.
+            testdev (bool): if True, dump detections to ./yolox_testdev_2017.json.
+        """
+        self.dataloader = dataloader
+        self.img_size = img_size
+        self.confthre = confthre
+        self.nmsthre = nmsthre
+        self.num_classes = num_classes
+        self.testdev = testdev
+
+    def evaluate(
+        self,
+        model,
+        distributed=False,
+        half=False,
+        trt_file=None,
+        decoder=None,
+        test_size=None,
+    ):
+        """
+        COCO average precision (AP) Evaluation. Iterate inference on the test dataset
+        and the results are evaluated by COCO API.
+        NOTE: This function will change training mode to False, please save states if needed.
+
+        Args:
+            model : model to evaluate.
+            distributed (bool): gather the results of all ranks on rank 0.
+            half (bool): run the model and its inputs in fp16.
+            trt_file (str): torch2trt state dict to run instead of ``model``.
+            decoder : optional callable applied to the raw model outputs.
+            test_size (tuple): spatial size of the dummy input built when trt_file is given.
+
+        Returns:
+            ap50_95 (float) : COCO AP of IoU=50:95
+            ap50 (float) : COCO AP of IoU=50
+            summary (str): summary info of evaluation.
+        """
+        # TODO half to amp_test
+        tensor_type = torch.cuda.HalfTensor if half else torch.cuda.FloatTensor
+        model = model.eval()
+        if half:
+            model = model.half()
+        data_list = []
+        progress_bar = tqdm if is_main_process() else iter
+
+        inference_time = 0
+        nms_time = 0
+        n_samples = max(len(self.dataloader) - 1, 1)  # >= 1: a 1-batch loader must not divide by 0
+
+        if trt_file is not None:
+            from torch2trt import TRTModule
+
+            model_trt = TRTModule()
+            model_trt.load_state_dict(torch.load(trt_file))
+
+            x = torch.ones(1, 3, test_size[0], test_size[1]).cuda()
+            model(x)  # result unused; presumably a warm-up pass -- TODO confirm
+            model = model_trt
+
+        for cur_iter, (imgs, _, info_imgs, ids) in enumerate(
+            progress_bar(self.dataloader)
+        ):
+            with torch.no_grad():
+                imgs = imgs.type(tensor_type)
+
+                # skip the last iter since batchsize might be not enough for batch inference
+                is_time_record = cur_iter < len(self.dataloader) - 1
+                if is_time_record:
+                    start = time.time()
+
+                outputs = model(imgs)
+                if decoder is not None:
+                    outputs = decoder(outputs, dtype=outputs.type())
+
+                if is_time_record:
+                    infer_end = time_synchronized()
+                    inference_time += infer_end - start
+
+                outputs = postprocess(
+                    outputs, self.num_classes, self.confthre, self.nmsthre
+                )
+                if is_time_record:
+                    nms_end = time_synchronized()
+                    nms_time += nms_end - infer_end
+
+            data_list.extend(self.convert_to_coco_format(outputs, info_imgs, ids))
+
+        statistics = torch.cuda.FloatTensor([inference_time, nms_time, n_samples])
+        if distributed:
+            data_list = gather(data_list, dst=0)
+            data_list = list(itertools.chain(*data_list))
+            torch.distributed.reduce(statistics, dst=0)
+
+        eval_results = self.evaluate_prediction(data_list, statistics)
+        synchronize()
+        return eval_results
+
+    def convert_to_coco_format(self, outputs, info_imgs, ids):
+        """Turn one batch of postprocessed outputs into a list of COCO result dicts."""
+        data_list = []
+        for (output, img_h, img_w, img_id) in zip(
+            outputs, info_imgs[0], info_imgs[1], ids
+        ):
+            if output is None:
+                continue
+            output = output.cpu()
+
+            bboxes = output[:, 0:4]
+
+            # undo the resize of the preprocessing step
+            scale = min(self.img_size[0] / float(img_h), self.img_size[1] / float(img_w))
+            bboxes /= scale
+            bboxes = xyxy2xywh(bboxes)
+
+            cls = output[:, 6]
+            scores = output[:, 4] * output[:, 5]
+            for ind in range(bboxes.shape[0]):
+                label = self.dataloader.dataset.class_ids[int(cls[ind])]
+                pred_data = {
+                    "image_id": int(img_id),
+                    "category_id": label,
+                    "bbox": bboxes[ind].numpy().tolist(),
+                    "score": scores[ind].numpy().item(),
+                    "segmentation": [],
+                }  # COCO json format
+                data_list.append(pred_data)
+        return data_list
+
+    def evaluate_prediction(self, data_dict, statistics):
+        """Score the detections with the COCO API; other ranks return (0, 0, None)."""
+        if not is_main_process():
+            return 0, 0, None
+
+        logger.info("Evaluate in main process...")
+
+        annType = ["segm", "bbox", "keypoints"]
+
+        inference_time = statistics[0].item()
+        nms_time = statistics[1].item()
+        n_samples = statistics[2].item()
+
+        a_infer_time = 1000 * inference_time / (n_samples * self.dataloader.batch_size)
+        a_nms_time = 1000 * nms_time / (n_samples * self.dataloader.batch_size)
+
+        stages = ["forward", "NMS", "inference"]
+        averages = [a_infer_time, a_nms_time, (a_infer_time + a_nms_time)]
+        time_info = ", ".join(
+            "Average {} time: {:.2f} ms".format(k, v) for k, v in zip(stages, averages)
+        )
+        info = time_info + "\n"
+
+        # Evaluate the Dt (detection) json comparing with the ground truth
+        if len(data_dict) > 0:
+            cocoGt = self.dataloader.dataset.coco
+            # TODO: since pycocotools can't process dict in py36, write data to json file.
+            if self.testdev:
+                with open("./yolox_testdev_2017.json", "w") as f:  # closed before loadRes
+                    json.dump(data_dict, f)
+                cocoDt = cocoGt.loadRes("./yolox_testdev_2017.json")
+            else:
+                with tempfile.NamedTemporaryFile("w", delete=False) as f:  # no leaked fd
+                    json.dump(data_dict, f)
+                cocoDt = cocoGt.loadRes(f.name)
+            try:
+                from yolox.layers import COCOeval_opt as COCOeval
+            except ImportError:
+                from .cocoeval_mr import COCOeval
+                logger.warning("Use standard COCOeval.")
+
+            cocoEval = COCOeval(cocoGt, cocoDt, annType[1])
+            cocoEval.evaluate()
+            cocoEval.accumulate()
+            redirect_string = io.StringIO()
+            with contextlib.redirect_stdout(redirect_string):
+                cocoEval.summarize()
+            info += redirect_string.getvalue()
+            return cocoEval.stats[0], cocoEval.stats[1], info
+        else:
+            return 0, 0, info
diff --git a/yolox/evalutors/voc_eval.py b/yolox/evalutors/voc_eval.py
new file mode 100644
index 0000000000000000000000000000000000000000..0744fd8e97d0819302142517f0bd864eb9862328
--- /dev/null
+++ b/yolox/evalutors/voc_eval.py
@@ -0,0 +1,184 @@
+# --------------------------------------------------------
+# Fast/er R-CNN
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Bharath Hariharan
+# --------------------------------------------------------
+
+import os
+import pickle
+import xml.etree.ElementTree as ET
+
+import numpy as np
+
+
+def parse_rec(filename):
+    """Read one PASCAL VOC annotation file into a list of per-object dicts.
+
+    Each dict has "name", "pose", "truncated", "difficult" and "bbox"
+    ([xmin, ymin, xmax, ymax] as ints), in the order the objects appear.
+    """
+    corners = ("xmin", "ymin", "xmax", "ymax")
+    parsed = []
+    for node in ET.parse(filename).findall("object"):
+        box = node.find("bndbox")
+        parsed.append(
+            {
+                "name": node.find("name").text,
+                "pose": node.find("pose").text,
+                "truncated": int(node.find("truncated").text),
+                "difficult": int(node.find("difficult").text),
+                "bbox": [int(box.find(tag).text) for tag in corners],
+            }
+        )
+    return parsed
+
+
+def voc_ap(rec, prec, use_07_metric=False):
+    """Compute the VOC average precision from recall and precision arrays.
+
+    Args:
+        rec: numpy array of recall values.
+        prec: numpy array of precision values, same length as rec.
+        use_07_metric: use the VOC07 11-point method instead of the area
+            under the precision envelope (default: False).
+
+    Returns:
+        The average precision.
+    """
+    if use_07_metric:
+        # 11 point metric: average the best precision at recall >= 0.0, 0.1, ..., 1.0
+        ap = 0.0
+        for thresh in np.arange(0.0, 1.1, 0.1):
+            reached = rec >= thresh
+            best = np.max(prec[reached]) if np.sum(reached) != 0 else 0
+            ap = ap + best / 11.0
+        return ap
+
+    # append sentinel values at both ends of the curves
+    mrec = np.concatenate(([0.0], rec, [1.0]))
+    mpre = np.concatenate(([0.0], prec, [0.0]))
+
+    # precision envelope: running maximum taken from the right
+    mpre = np.maximum.accumulate(mpre[::-1])[::-1]
+
+    # area under the PR curve: sum (\Delta recall) * prec over the points
+    # where the recall changes value
+    steps = np.where(mrec[1:] != mrec[:-1])[0]
+    return np.sum((mrec[steps + 1] - mrec[steps]) * mpre[steps + 1])
+
+
+def voc_eval(
+    detpath, annopath, imagesetfile, classname, cachedir, ovthresh=0.5, use_07_metric=False
+):
+    """Evaluate the detections of one class in the PASCAL VOC style.
+
+    Args:
+        detpath: template; detpath.format(classname) is the detections file
+            with one "image_id confidence xmin ymin xmax ymax" row per line.
+        annopath: template; annopath.format(imagename) is an annotation xml.
+        imagesetfile: text file listing one image name per line.
+        classname: category to evaluate.
+        cachedir: directory of the pickled annotation cache (annots.pkl).
+        ovthresh: IoU a detection must exceed to match a ground truth.
+        use_07_metric: use the VOC07 11-point AP (default: False).
+
+    Returns: (rec, prec, ap), or (0, 0, 0) if the detections file is empty.
+    """
+    # first load gt
+    os.makedirs(cachedir, exist_ok=True)  # also creates parents; no isdir/mkdir race
+    cachefile = os.path.join(cachedir, "annots.pkl")
+    # read list of images
+    with open(imagesetfile, "r") as f:
+        lines = f.readlines()
+    imagenames = [x.strip() for x in lines]
+
+    if not os.path.isfile(cachefile):
+        recs = {}
+        for i, imagename in enumerate(imagenames):
+            recs[imagename] = parse_rec(annopath.format(imagename))
+            if i % 100 == 0:
+                print("Reading annotation for {:d}/{:d}".format(i + 1, len(imagenames)))
+        print("Saving cached annotations to {:s}".format(cachefile))
+        with open(cachefile, "wb") as f:
+            pickle.dump(recs, f)
+    else:
+        # NOTE: pickle.load can execute arbitrary code; cachedir must be trusted
+        with open(cachefile, "rb") as f:
+            recs = pickle.load(f)
+
+    # extract gt objects for this class
+    class_recs = {}
+    npos = 0
+    for imagename in imagenames:
+        R = [obj for obj in recs[imagename] if obj["name"] == classname]
+        bbox = np.array([x["bbox"] for x in R])
+        # builtin bool: the np.bool alias was removed in NumPy 1.24
+        difficult = np.array([x["difficult"] for x in R]).astype(bool)
+        det = [False] * len(R)
+        npos = npos + sum(~difficult)
+        class_recs[imagename] = {"bbox": bbox, "difficult": difficult, "det": det}
+
+    # read dets
+    detfile = detpath.format(classname)
+    with open(detfile, "r") as f:
+        lines = f.readlines()
+
+    if len(lines) == 0:
+        return 0, 0, 0
+
+    splitlines = [x.strip().split(" ") for x in lines]
+    image_ids = [x[0] for x in splitlines]
+    confidence = np.array([float(x[1]) for x in splitlines])
+    BB = np.array([[float(z) for z in x[2:]] for x in splitlines])
+
+    # sort by confidence
+    sorted_ind = np.argsort(-confidence)
+    BB = BB[sorted_ind, :]
+    image_ids = [image_ids[x] for x in sorted_ind]
+
+    # go down dets and mark TPs and FPs
+    nd = len(image_ids)
+    tp = np.zeros(nd)
+    fp = np.zeros(nd)
+    for d in range(nd):
+        R = class_recs[image_ids[d]]
+        bb = BB[d, :].astype(float)
+        ovmax = -np.inf
+        BBGT = R["bbox"].astype(float)
+
+        if BBGT.size > 0:
+            # intersection (the + 1.0 treats coordinates as inclusive pixel indices)
+            ixmin = np.maximum(BBGT[:, 0], bb[0])
+            iymin = np.maximum(BBGT[:, 1], bb[1])
+            ixmax = np.minimum(BBGT[:, 2], bb[2])
+            iymax = np.minimum(BBGT[:, 3], bb[3])
+            iw = np.maximum(ixmax - ixmin + 1.0, 0.0)
+            ih = np.maximum(iymax - iymin + 1.0, 0.0)
+            inters = iw * ih
+            # union
+            area_det = (bb[2] - bb[0] + 1.0) * (bb[3] - bb[1] + 1.0)
+            area_gt = (BBGT[:, 2] - BBGT[:, 0] + 1.0) * (BBGT[:, 3] - BBGT[:, 1] + 1.0)
+            uni = area_det + area_gt - inters
+            overlaps = inters / uni
+            ovmax = np.max(overlaps)
+            jmax = np.argmax(overlaps)
+
+        if ovmax > ovthresh:
+            if not R["difficult"][jmax]:
+                if not R["det"][jmax]:
+                    tp[d] = 1.0
+                    R["det"][jmax] = 1
+                else:
+                    fp[d] = 1.0
+        else:
+            fp[d] = 1.0
+
+    # compute precision recall
+    fp = np.cumsum(fp)
+    tp = np.cumsum(tp)
+    rec = tp / float(npos)
+    # avoid divide by zero in case the first detection matches a difficult ground truth
+    prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
+    ap = voc_ap(rec, prec, use_07_metric)
+
+    return rec, prec, ap
diff --git a/yolox/evalutors/voc_evaluator.py b/yolox/evalutors/voc_evaluator.py
new file mode 100644
index 0000000000000000000000000000000000000000..8761f2232dc28db04e7f494e93769e3b7cc83f63
--- /dev/null
+++ b/yolox/evalutors/voc_evaluator.py
@@ -0,0 +1,200 @@
+import sys
+import tempfile
+import time
+from tqdm import tqdm
+
+import torch
+
+# TODO check VOC
+from yolox.data.dataset.vocdataset import ValTransform
+from yolox.utils import get_rank, is_main_process, make_pred_vis, make_vis, synchronize
+
+# TODO refactor this file in the future.
+
+
+def _accumulate_predictions_from_multiple_gpus(predictions_per_gpu):
+    """Merge per-process {image index: prediction} dicts into one list ordered by
+    image index. Only the main process gets the list; the others return None."""
+    # NOTE(review): `dist` is never imported in this file -- confirm the intended helper.
+    all_predictions = dist.scatter_gather(predictions_per_gpu)
+    if not is_main_process():
+        return
+    predictions = {}
+    for p in all_predictions:
+        predictions.update(p)
+    image_ids = sorted(predictions)
+    # an empty gather has no last id to compare against, so guard the [-1] lookup
+    if image_ids and len(image_ids) != image_ids[-1] + 1:
+        print("num_imgs: ", len(image_ids))
+        print("last img_id: ", image_ids[-1])
+        print(
+            "Number of images that were gathered from multiple processes is not "
+            "a contiguous set. Some images might be missing from the evaluation"
+        )
+
+    return [predictions[i] for i in image_ids]
+
+
+class VOCEvaluator:
+    """
+    PASCAL VOC evaluation class.
+    The images of the VOC2007 test set are run through the model and
+    scored by the dataset's evaluate_detections method.
+    """
+
+    def __init__(self, data_dir, img_size, confthre, nmsthre, vis=False):
+        """
+        Args:
+            data_dir (str): dataset root directory
+            img_size: network input size; indexed as img_size[0] and
+                img_size[1] when the boxes are rescaled in evaluate().
+            confthre (float):
+                confidence threshold ranging from 0 to 1.
+            nmsthre (float):
+                IoU threshold of non-max suppression ranging from 0 to 1.
+            vis (bool): also unpack fusion outputs from the model and draw them.
+        """
+        test_sets = [("2007", "test")]
+        self.dataset = VOCDetection(  # NOTE(review): VOCDetection is never imported in this file
+            root=data_dir,
+            image_sets=test_sets,
+            input_dim=img_size,
+            preproc=ValTransform(
+                rgb_means=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)
+            ),
+        )
+        self.num_images = len(self.dataset)
+        self.dataloader = torch.utils.data.DataLoader(
+            self.dataset, batch_size=1, shuffle=False, num_workers=0
+        )
+        self.img_size = img_size
+        self.confthre = confthre
+        self.nmsthre = nmsthre
+        self.vis = vis
+
+    def evaluate(self, model, distributed=False):
+        """
+        Run inference on this process's share of the test images, merge the
+        predictions of all processes and score them with evaluate_detections.
+        Args:
+            model : model object
+        Returns:
+            mAP50, mAP70: the pair returned by dataset.evaluate_detections on
+            the main process; every other process returns (0, 0).
+        """
+        if isinstance(model, torch.nn.parallel.DistributedDataParallel):
+            model = model.module
+        model.eval()
+        cuda = torch.cuda.is_available()
+        Tensor = torch.cuda.FloatTensor if cuda else torch.FloatTensor
+        # NOTE(review): data_dict and dataiterator are never read; ids is only appended to
+        ids = []
+        data_dict = []
+        dataiterator = iter(self.dataloader)
+        img_num = 0
+        indices = list(range(self.num_images))
+        # NOTE(review): distributed_util is never imported in this file (next two lines)
+        dis_indices = indices[get_rank() :: distributed_util.get_world_size()]
+        progress_bar = tqdm if distributed_util.is_main_process() else iter
+        num_classes = 20
+        predictions = {}
+        # timing is only accumulated on the main process
+        if is_main_process():
+            inference_time = 0
+            nms_time = 0
+            n_samples = len(dis_indices)
+
+        for i in progress_bar(dis_indices):
+            img, _, info_img, id_ = self.dataset[i]  # load a batch
+            info_img = [float(info) for info in info_img]
+            ids.append(id_)
+            with torch.no_grad():
+                img = Variable(img.type(Tensor).unsqueeze(0))  # NOTE(review): Variable is not imported
+                # images with dataset index <= 9 are excluded from the timing
+                if is_main_process() and i > 9:
+                    start = time.time()
+
+                if self.vis:
+                    outputs, fuse_weights, fused_f = model(img)
+                else:
+                    outputs = model(img)
+
+                if is_main_process() and i > 9:
+                    infer_end = time.time()
+                    inference_time += infer_end - start
+                # NOTE(review): postprocess is not imported in this file
+                outputs = postprocess(outputs, 20, self.confthre, self.nmsthre)
+
+                if is_main_process() and i > 9:
+                    nms_end = time.time()
+                    nms_time += nms_end - infer_end
+                # no detection for this image: store a placeholder and move on
+                if outputs[0] is None:
+                    predictions[i] = (None, None, None)
+                    continue
+                outputs = outputs[0].cpu().data
+            # rescale x (columns 0, 2) and y (columns 1, 3) by info_img / img_size
+            bboxes = outputs[:, 0:4]
+            bboxes[:, 0::2] *= info_img[0] / self.img_size[0]
+            bboxes[:, 1::2] *= info_img[1] / self.img_size[1]
+            cls = outputs[:, 6]
+            scores = outputs[:, 4] * outputs[:, 5]
+            predictions[i] = (bboxes, cls, scores)
+
+            if self.vis:
+                o_img, _, _, _ = self.dataset.pull_item(i)
+                make_vis("VOC", i, o_img, fuse_weights, fused_f)
+                class_names = self.dataset._classes
+                # columns 2 and 3 become width and height for the drawing helper
+                bbox = bboxes.clone()
+                bbox[:, 2] = bbox[:, 2] - bbox[:, 0]
+                bbox[:, 3] = bbox[:, 3] - bbox[:, 1]
+
+                make_pred_vis("VOC", i, o_img, class_names, bbox, cls, scores)
+            # NOTE(review): the main process draws predictions even when self.vis is False
+            if is_main_process():
+                o_img, _, _, _ = self.dataset.pull_item(i)
+                class_names = self.dataset._classes
+                bbox = bboxes.clone()
+                bbox[:, 2] = bbox[:, 2] - bbox[:, 0]
+                bbox[:, 3] = bbox[:, 3] - bbox[:, 1]
+                make_pred_vis("VOC", i, o_img, class_names, bbox, cls, scores)
+
+        synchronize()
+        predictions = _accumulate_predictions_from_multiple_gpus(predictions)
+        if not is_main_process():
+            return 0, 0
+
+        print("Main process Evaluating...")
+        # NOTE(review): divides by zero when n_samples == 10; the "- 10" assumes ten untimed images
+        a_infer_time = 1000 * inference_time / (n_samples - 10)
+        a_nms_time = 1000 * nms_time / (n_samples - 10)
+
+        print(
+            "Average forward time: %.2f ms, Average NMS time: %.2f ms, Average inference time: %.2f ms"
+            % (a_infer_time, a_nms_time, (a_infer_time + a_nms_time))
+        )
+        # all_boxes[class][image] is filled with an (N, 5) array: 4 box columns + score
+        all_boxes = [[[] for _ in range(self.num_images)] for _ in range(num_classes)]
+        for img_num in range(self.num_images):
+            bboxes, cls, scores = predictions[img_num]
+            if bboxes is None:
+                for j in range(num_classes):
+                    all_boxes[j][img_num] = np.empty([0, 5], dtype=np.float32)  # NOTE(review): np is not imported
+                continue
+            for j in range(num_classes):
+                mask_c = cls == j
+                if sum(mask_c) == 0:
+                    all_boxes[j][img_num] = np.empty([0, 5], dtype=np.float32)
+                    continue
+
+                c_dets = torch.cat((bboxes, scores.unsqueeze(1)), dim=1)
+                all_boxes[j][img_num] = c_dets[mask_c].numpy()
+
+            sys.stdout.write(
+                "im_eval: {:d}/{:d} \r".format(img_num + 1, self.num_images)
+            )
+            sys.stdout.flush()
+        # the temporary directory handed to evaluate_detections is removed afterwards
+        with tempfile.TemporaryDirectory() as tempdir:
+            mAP50, mAP70 = self.dataset.evaluate_detections(all_boxes, tempdir)
+            return mAP50, mAP70
diff --git a/yolox/exp/__init__.py b/yolox/exp/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1988be2fd49cde72150eb60778d134ba1e7ef9ae
--- /dev/null
+++ b/yolox/exp/__init__.py
@@ -0,0 +1,7 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+
+
+from .base_exp import BaseExp
+from .build import get_exp
+from .yolox_base import Exp
diff --git a/yolox/exp/base_exp.py b/yolox/exp/base_exp.py
new file mode 100644
index 0000000000000000000000000000000000000000..501067419e49bdde05febcf329eef3983c920d0e
--- /dev/null
+++ b/yolox/exp/base_exp.py
@@ -0,0 +1,73 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
+import ast
+import pprint
+from abc import ABCMeta, abstractmethod
+from typing import Dict
+from tabulate import tabulate
+
+import torch
+from torch.nn import Module
+
+from yolox.utils import LRScheduler
+
+
+class BaseExp(metaclass=ABCMeta):
+    """Basic class for any experiment."""
+
+    def __init__(self):
+        self.seed = None
+        self.output_dir = "/data/YOLOX_outputs"
+        self.print_interval = 100
+        self.eval_interval = 10
+
+    @abstractmethod
+    def get_model(self) -> Module:
+        """Build and return the model of this experiment."""
+
+    @abstractmethod
+    def get_data_loader(
+        self, batch_size: int, is_distributed: bool
+    ) -> Dict[str, torch.utils.data.DataLoader]:
+        """Build and return the data loader(s) for the given batch size."""
+
+    @abstractmethod
+    def get_optimizer(self, batch_size: int) -> torch.optim.Optimizer:
+        """Build and return the optimizer."""
+
+    @abstractmethod
+    def get_lr_scheduler(self, lr: float, iters_per_epoch: int, **kwargs) -> LRScheduler:
+        """Build and return the learning-rate scheduler."""
+
+    @abstractmethod
+    def get_evaluator(self):
+        """Build and return the evaluator."""
+
+    @abstractmethod
+    def eval(self, model, evaluator, weights):
+        """Evaluate ``model`` with ``evaluator``."""
+
+    def __repr__(self):
+        """Render every attribute not starting with "_" as a keys/values table."""
+        exp_table = [
+            (str(k), pprint.pformat(v)) for k, v in vars(self).items() if not k.startswith("_")
+        ]
+        return tabulate(exp_table, headers=["keys", "values"], tablefmt="fancy_grid")
+
+    def merge(self, cfg_list):
+        """Override existing attributes from a flat [key, value, key, value, ...] list."""
+        assert len(cfg_list) % 2 == 0
+        for k, v in zip(cfg_list[0::2], cfg_list[1::2]):
+            # only update value with same key
+            if hasattr(self, k):
+                src_value = getattr(self, k)
+                src_type = type(src_value)
+                if src_value is not None and src_type != type(v):
+                    # bool("False") is True, tuple("(1, 2)") splits chars: parse such text first
+                    parse = isinstance(v, str) and src_type in (bool, list, tuple)
+                    try:
+                        v = src_type(ast.literal_eval(v) if parse else v)
+                    except Exception:
+                        v = ast.literal_eval(v)
+                setattr(self, k, v)
diff --git a/yolox/exp/build.py b/yolox/exp/build.py
new file mode 100644
index 0000000000000000000000000000000000000000..db851fa48b051b7a815e29e047317842ead6b900
--- /dev/null
+++ b/yolox/exp/build.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+
+import importlib
+import os
+import sys
+
+
+def get_exp_by_file(exp_file):
+    """Import the module at ``exp_file`` and return an instance of its ``Exp`` class."""
+    try:
+        exp_dir, module_file = os.path.split(exp_file)
+        sys.path.append(exp_dir)  # lets import_module find the file by module name
+        return importlib.import_module(module_file.split(".")[0]).Exp()
+    except Exception:
+        raise ImportError("{} doesn't contains class named 'Exp'".format(exp_file))
+
+
+def get_exp_by_name(exp_name):
+    """Return the Exp of a built-in model name (KeyError if the name is unknown)."""
+    import yolox
+    yolox_path = os.path.dirname(os.path.dirname(yolox.__file__))
+    filedict = {
+        "yolox-s": "yolox_s.py",
+        "yolox-m": "yolox_m.py",  # each model size maps to its own experiment file
+        "yolox-l": "yolox_l.py",
+        "yolox-x": "yolox_x.py",
+        "yolox-nano": "nano.py",
+        "yolov3": "yolov3.py",
+    }
+    exp_path = os.path.join(yolox_path, "exps", "base", filedict[exp_name])
+    return get_exp_by_file(exp_path)
+
+
+def get_exp(exp_file, exp_name):
+    """
+    Build an Exp object from an experiment file or a built-in name. When both
+    exp_file and exp_name are given, exp_file wins.
+
+    Args:
+        exp_file (str): file path of experiment.
+        exp_name (str): name of experiment, e.g. "yolox-s".
+    """
+    assert exp_file is not None or exp_name is not None, "plz provide exp file or exp name."
+    if exp_file is None:
+        return get_exp_by_name(exp_name)
+    # an explicit file always takes precedence over the name
+    return get_exp_by_file(exp_file)
diff --git a/yolox/exp/yolox_base.py b/yolox/exp/yolox_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..3cc48466528309edbd5ea1f95380c5d80d0d981f
--- /dev/null
+++ b/yolox/exp/yolox_base.py
@@ -0,0 +1,251 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
+
+import os
+import random
+
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+
+from .base_exp import BaseExp
+
+
+class Exp(BaseExp):
+    """Default YOLOX experiment: COCO data, YOLOPAFPN + YOLOXHead model, SGD training."""
+    def __init__(self):
+        super().__init__()
+
+        # ---------------- model config ---------------- #
+        self.num_classes = 80
+        self.depth = 1.00  # passed to YOLOPAFPN in get_model
+        self.width = 1.00  # passed to YOLOPAFPN and YOLOXHead in get_model
+
+        # ---------------- dataloader config ---------------- #
+        self.data_num_workers = 6
+        self.input_size = (640, 640)
+        self.random_size = (14, 26)  # range of the random multiplier of 32 in random_resize
+        self.train_ann = "instances_train2017.json"
+        self.val_ann = "instances_val2017.json"
+
+        # --------------- transform config ----------------- #
+        self.degrees = 10.0
+        self.translate = 0.1
+        self.scale = (0.1, 2)
+        self.mscale = (0.8, 1.6)
+        self.shear = 2.0
+        self.perspective = 0.0
+        self.enable_mixup = True
+
+        # --------------  training config --------------------- #
+        self.warmup_epochs = 5
+        self.max_epoch = 300
+        self.warmup_lr = 0
+        self.basic_lr_per_img = 0.01 / 64.0  # multiplied by batch_size in get_optimizer
+        self.scheduler = "yoloxwarmcos"
+        self.no_aug_epochs = 15
+        self.min_lr_ratio = 0.05
+        self.ema = True
+
+        self.weight_decay = 5e-4
+        self.momentum = 0.9
+        self.print_interval = 10
+        self.eval_interval = 10
+        self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0]
+
+        # -----------------  testing config ------------------ #
+        self.test_size = (640, 640)
+        self.test_conf = 0.01
+        self.nmsthre = 0.65
+
+    def get_model(self):  # builds self.model on first use and returns it
+        from yolox.models import YOLOX, YOLOPAFPN, YOLOXHead
+        # BatchNorm settings written to every BatchNorm2d module of the model
+        def init_yolo(M):
+            for m in M.modules():
+                if isinstance(m, nn.BatchNorm2d):
+                    m.eps = 1e-3
+                    m.momentum = 0.03
+
+        if getattr(self, "model", None) is None:
+            in_channels = [256, 512, 1024]
+            backbone = YOLOPAFPN(self.depth, self.width, in_channels=in_channels)
+            head = YOLOXHead(self.num_classes, self.width, in_channels=in_channels)
+            self.model = YOLOX(backbone, head)
+        # NOTE: the two calls below run on every call, also when self.model already existed
+        self.model.apply(init_yolo)
+        self.model.head.initialize_biases(1e-2)
+        return self.model
+
+    def get_data_loader(self, batch_size, is_distributed, no_aug=False):
+        from yolox.data import (
+            COCODataset,
+            TrainTransform,
+            YoloBatchSampler,
+            DataLoader,
+            InfiniteSampler,
+            MosaicDetection,
+        )
+        # the COCO training annotations named by self.train_ann
+        dataset = COCODataset(
+            data_dir=None,
+            json_file=self.train_ann,
+            img_size=self.input_size,
+            preproc=TrainTransform(
+                rgb_means=(0.485, 0.456, 0.406),
+                std=(0.229, 0.224, 0.225),
+                max_labels=50,
+            ),
+        )
+        # wrap it in MosaicDetection; mosaic is switched off when no_aug is True
+        dataset = MosaicDetection(
+            dataset,
+            mosaic=not no_aug,
+            img_size=self.input_size,
+            preproc=TrainTransform(
+                rgb_means=(0.485, 0.456, 0.406),
+                std=(0.229, 0.224, 0.225),
+                max_labels=120,
+            ),
+            degrees=self.degrees,
+            translate=self.translate,
+            scale=self.scale,
+            shear=self.shear,
+            perspective=self.perspective,
+            enable_mixup=self.enable_mixup,
+        )
+
+        self.dataset = dataset
+        # distributed: batch_size is the global batch, divided over the processes
+        if is_distributed:
+            batch_size = batch_size // dist.get_world_size()
+            sampler = InfiniteSampler(
+                len(self.dataset), seed=self.seed if self.seed else 0
+            )
+        else:
+            sampler = torch.utils.data.RandomSampler(self.dataset)
+
+        batch_sampler = YoloBatchSampler(
+            sampler=sampler,
+            batch_size=batch_size,
+            drop_last=False,
+            input_dimension=self.input_size,
+            mosaic=not no_aug,
+        )
+
+        dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True}
+        dataloader_kwargs["batch_sampler"] = batch_sampler
+        train_loader = DataLoader(self.dataset, **dataloader_kwargs)
+
+        return train_loader
+
+    def random_resize(self, data_loader, epoch, rank, is_distributed):
+        tensor = torch.LongTensor(1).cuda()
+        # rank 0 picks the size: input_size[0] in the last epoch, else 32 * a random integer
+        if rank == 0:
+            if epoch >= self.max_epoch - 1:
+                size = self.input_size[0]
+            else:
+                size = random.randint(*self.random_size)
+                size = int(32 * size)
+            tensor.fill_(size)
+        # the other ranks receive rank 0's choice
+        if is_distributed:
+            dist.barrier()
+            dist.broadcast(tensor, 0)
+
+        input_size = data_loader.change_input_dim(
+            multiple=tensor.item(), random_range=None
+        )
+        return input_size
+
+    def get_optimizer(self, batch_size):
+        if "optimizer" not in self.__dict__:  # built once, then cached on self.optimizer
+            if self.warmup_epochs > 0:
+                lr = self.warmup_lr
+            else:
+                lr = self.basic_lr_per_img * batch_size
+
+            pg0, pg1, pg2 = [], [], []  # optimizer parameter groups
+            # requires self.model, i.e. get_model() must have been called before
+            for k, v in self.model.named_modules():
+                if hasattr(v, "bias") and isinstance(v.bias, nn.Parameter):
+                    pg2.append(v.bias)  # biases
+                if isinstance(v, nn.BatchNorm2d) or "bn" in k:
+                    pg0.append(v.weight)  # no decay
+                elif hasattr(v, "weight") and isinstance(v.weight, nn.Parameter):
+                    pg1.append(v.weight)  # apply decay
+
+            optimizer = torch.optim.SGD(
+                pg0, lr=lr, momentum=self.momentum, nesterov=True
+            )
+            optimizer.add_param_group(
+                {"params": pg1, "weight_decay": self.weight_decay}
+            )  # add pg1 with weight_decay
+            optimizer.add_param_group({"params": pg2})
+            self.optimizer = optimizer
+
+        return self.optimizer
+
+    def get_lr_scheduler(self, lr, iters_per_epoch):
+        from yolox.utils import LRScheduler
+        scheduler = LRScheduler(
+            self.scheduler,
+            lr,
+            iters_per_epoch,
+            self.max_epoch,
+            warmup_epochs=self.warmup_epochs,
+            warmup_lr_start=self.warmup_lr,
+            no_aug_epochs=self.no_aug_epochs,
+            min_lr_ratio=self.min_lr_ratio,
+        )
+        return scheduler
+
+    def get_eval_loader(self, batch_size, is_distributed, testdev=False):
+        from yolox.data import COCODataset, ValTransform
+        # testdev switches both the annotation file and the image folder name
+        valdataset = COCODataset(
+            data_dir=None,
+            json_file=self.val_ann if not testdev else "image_info_test-dev2017.json",
+            name="val2017" if not testdev else "test2017",
+            img_size=self.test_size,
+            preproc=ValTransform(
+                rgb_means=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)
+            ),
+        )
+        # distributed: batch_size is the global batch, divided over the processes
+        if is_distributed:
+            batch_size = batch_size // dist.get_world_size()
+            sampler = torch.utils.data.distributed.DistributedSampler(
+                valdataset, shuffle=False
+            )
+        else:
+            sampler = torch.utils.data.SequentialSampler(valdataset)
+
+        dataloader_kwargs = {
+            "num_workers": self.data_num_workers,
+            "pin_memory": True,
+            "sampler": sampler,
+        }
+        dataloader_kwargs["batch_size"] = batch_size
+        val_loader = torch.utils.data.DataLoader(valdataset, **dataloader_kwargs)
+
+        return val_loader
+
+    def get_evaluator(self, batch_size, is_distributed, testdev=False):
+        from yolox.evalutors import COCOEvaluator
+        # the evaluator owns the validation loader built by get_eval_loader
+        val_loader = self.get_eval_loader(batch_size, is_distributed, testdev=testdev)
+        evaluator = COCOEvaluator(
+            dataloader=val_loader,
+            img_size=self.test_size,
+            confthre=self.test_conf,
+            nmsthre=self.nmsthre,
+            num_classes=self.num_classes,
+            testdev=testdev,
+        )
+        return evaluator
+
+    def eval(self, model, evaluator, is_distributed, half=False):
+        return evaluator.evaluate(model, is_distributed, half)  # forwards to the evaluator
diff --git a/yolox/layers/__init__.py b/yolox/layers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2a51963f43d663af81d74a82fa0108970122083
--- /dev/null
+++ b/yolox/layers/__init__.py
@@ -0,0 +1,4 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+
+from .fast_coco_eval_api import COCOeval_opt
diff --git a/yolox/layers/csrc/cocoeval/cocoeval.cpp b/yolox/layers/csrc/cocoeval/cocoeval.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2e63bc9952918060f55999ec100b283d83616b46
--- /dev/null
+++ b/yolox/layers/csrc/cocoeval/cocoeval.cpp
@@ -0,0 +1,502 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+#include "cocoeval.h"
+#include <time.h>
+#include <algorithm>
+#include <cstdint>
+#include <numeric>
+
+using namespace pybind11::literals;
+
+namespace COCOeval {
+
+// Fill detection_sorted_indices with the permutation of [0, N) that lists
+// detection_instances from highest score to lowest, so the score at
+// detection_sorted_indices[t] is >= the score at detection_sorted_indices[t+1].
+// Use stable_sort to match original COCO API
+void SortInstancesByDetectionScore(
+    const std::vector<InstanceAnnotation>& detection_instances,
+    std::vector<uint64_t>* detection_sorted_indices) {
+  std::vector<uint64_t>& order = *detection_sorted_indices;
+  order.resize(detection_instances.size());
+  std::iota(order.begin(), order.end(), 0);
+
+  // Strict '>' on scores: stable_sort leaves equal scores in input order.
+  const auto by_score_desc = [&detection_instances](size_t lhs, size_t rhs) {
+    return detection_instances[lhs].score > detection_instances[rhs].score;
+  };
+  std::stable_sort(order.begin(), order.end(), by_score_desc);
+}
+
+// Partition the ground truth objects based on whether or not to ignore them
+// based on area: (*ignores)[g] is set when instance g is flagged ignore or its
+// area is outside [area_range[0], area_range[1]], and the sorted indices list
+// every kept instance before every ignored one, in input order within each.
+void SortInstancesByIgnore(
+    const std::array<double, 2>& area_range,
+    const std::vector<InstanceAnnotation>& ground_truth_instances,
+    std::vector<uint64_t>* ground_truth_sorted_indices,
+    std::vector<bool>* ignores) {
+  ignores->clear();
+  ignores->reserve(ground_truth_instances.size());
+  // Bind by const reference: each annotation is only read, never modified.
+  for (const auto& o : ground_truth_instances) {
+    ignores->push_back(
+        o.ignore || o.area < area_range[0] || o.area > area_range[1]);
+  }
+
+  std::vector<uint64_t>& order = *ground_truth_sorted_indices;
+  order.resize(ground_truth_instances.size());
+  std::iota(order.begin(), order.end(), 0);
+  // Kept (false) sorts before ignored (true); stable_sort preserves ties.
+  const auto kept_first = [ignores](size_t j1, size_t j2) {
+    return !(*ignores)[j1] && (*ignores)[j2];
+  };
+  std::stable_sort(order.begin(), order.end(), kept_first);
+}
+
+// For each IOU threshold, greedily match each detected instance (visited in
+// detection_sorted_indices order) to a ground truth instance and fill *results
+void MatchDetectionsToGroundTruth(
+    const std::vector<InstanceAnnotation>& detection_instances,
+    const std::vector<uint64_t>& detection_sorted_indices,
+    const std::vector<InstanceAnnotation>& ground_truth_instances,
+    const std::vector<uint64_t>& ground_truth_sorted_indices,
+    const std::vector<bool>& ignores,
+    const std::vector<std::vector<double>>& ious,
+    const std::vector<double>& iou_thresholds,
+    const std::array<double, 2>& area_range,
+    ImageEvaluation* results) {
+  // Tables are flattened per threshold: entry (t, x) lives at t * num_x + x
+  const int num_iou_thresholds = iou_thresholds.size();
+  const int num_ground_truth = ground_truth_sorted_indices.size();
+  const int num_detections = detection_sorted_indices.size();
+  std::vector<uint64_t> ground_truth_matches(
+      num_iou_thresholds * num_ground_truth, 0);
+  std::vector<uint64_t>& detection_matches = results->detection_matches;
+  std::vector<bool>& detection_ignores = results->detection_ignores;
+  std::vector<bool>& ground_truth_ignores = results->ground_truth_ignores;
+  detection_matches.resize(num_iou_thresholds * num_detections, 0);
+  detection_ignores.resize(num_iou_thresholds * num_detections, false);
+  ground_truth_ignores.resize(num_ground_truth);
+  for (auto g = 0; g < num_ground_truth; ++g) {
+    ground_truth_ignores[g] = ignores[ground_truth_sorted_indices[g]];
+  }
+  // ious is indexed [sorted detection position][original ground truth index]
+  for (auto t = 0; t < num_iou_thresholds; ++t) {
+    for (auto d = 0; d < num_detections; ++d) {
+      // information about best match so far (match=-1 -> unmatched)
+      double best_iou = std::min(iou_thresholds[t], 1 - 1e-10);
+      int match = -1;
+      for (auto g = 0; g < num_ground_truth; ++g) {
+        // if this ground truth instance is already matched and not a
+        // crowd, it cannot be matched to another detection
+        if (ground_truth_matches[t * num_ground_truth + g] > 0 &&
+            !ground_truth_instances[ground_truth_sorted_indices[g]].is_crowd) {
+          continue;
+        }
+
+        // if detected instance matched to a regular ground truth
+        // instance, we can break on the first ground truth instance
+        // tagged as ignore (because they are sorted by the ignore tag)
+        if (match >= 0 && !ground_truth_ignores[match] &&
+            ground_truth_ignores[g]) {
+          break;
+        }
+
+        // if IOU overlap is the best so far, store the match appropriately
+        if (ious[d][ground_truth_sorted_indices[g]] >= best_iou) {
+          best_iou = ious[d][ground_truth_sorted_indices[g]];
+          match = g;
+        }
+      }
+      // on a match, record the partner's id on both the detection and GT side
+      // NOTE(review): 0 means "unmatched" in both tables, so ids must be nonzero
+      if (match >= 0) {
+        detection_ignores[t * num_detections + d] = ground_truth_ignores[match];
+        detection_matches[t * num_detections + d] =
+            ground_truth_instances[ground_truth_sorted_indices[match]].id;
+        ground_truth_matches[t * num_ground_truth + match] =
+            detection_instances[detection_sorted_indices[d]].id;
+      }
+
+      // set unmatched detections outside of area range to ignore
+      const InstanceAnnotation& detection =
+          detection_instances[detection_sorted_indices[d]];
+      detection_ignores[t * num_detections + d] =
+          detection_ignores[t * num_detections + d] ||
+          (detection_matches[t * num_detections + d] == 0 &&
+           (detection.area < area_range[0] || detection.area > area_range[1]));
+    }
+  }
+
+  // store detection scores, in the same sorted order as the match tables
+  results->detection_scores.resize(detection_sorted_indices.size());
+  for (size_t d = 0; d < detection_sorted_indices.size(); ++d) {
+    results->detection_scores[d] =
+        detection_instances[detection_sorted_indices[d]].score;
+  }
+}
+
+std::vector<ImageEvaluation> EvaluateImages(
+    const std::vector<std::array<double, 2>>& area_ranges,
+    int max_detections,
+    const std::vector<double>& iou_thresholds,
+    const ImageCategoryInstances<std::vector<double>>& image_category_ious,
+    const ImageCategoryInstances<InstanceAnnotation>&
+        image_category_ground_truth_instances,
+    const ImageCategoryInstances<InstanceAnnotation>&
+        image_category_detection_instances) {
+  // Image count comes from the ground truth table; category count from the
+  // first image's IOU table (zero when the IOU table is empty).
+  const int num_images = image_category_ground_truth_instances.size();
+  const int num_area_ranges = area_ranges.size();
+  const int num_categories =
+      image_category_ious.empty() ? 0 : image_category_ious[0].size();
+
+  // Scratch buffers reused across loop iterations.
+  std::vector<uint64_t> detection_order;
+  std::vector<uint64_t> ground_truth_order;
+  std::vector<bool> ignore_flags;
+
+  // One ImageEvaluation per (category, area range, image) triple, stored with
+  // categories outermost and images innermost.  Results for each IOU
+  // threshold are packed into the same ImageEvaluation object.
+  std::vector<ImageEvaluation> evaluations(
+      num_images * num_area_ranges * num_categories);
+
+  for (int image = 0; image < num_images; ++image) {
+    for (int category = 0; category < num_categories; ++category) {
+      const auto& ground_truths =
+          image_category_ground_truth_instances[image][category];
+      const auto& detections =
+          image_category_detection_instances[image][category];
+      // Rank detections by score and keep at most max_detections of them.
+      SortInstancesByDetectionScore(detections, &detection_order);
+      if (static_cast<int>(detection_order.size()) > max_detections) {
+        detection_order.resize(max_detections);
+      }
+
+      for (int range = 0; range < num_area_ranges; ++range) {
+        // Ignore flags depend on the area range, so rebuild them each time.
+        SortInstancesByIgnore(
+            area_ranges[range],
+            ground_truths,
+            &ground_truth_order,
+            &ignore_flags);
+        const int slot = category * num_area_ranges * num_images +
+            range * num_images + image;
+        MatchDetectionsToGroundTruth(
+            detections, detection_order, ground_truths, ground_truth_order,
+            ignore_flags, image_category_ious[image][category],
+            iou_thresholds, area_ranges[range], &evaluations[slot]);
+      }
+    }
+  }
+
+  return evaluations;
+}
+
+// Convert a python list to a vector by casting every element to T
+template <typename T>
+std::vector<T> list_to_vec(const py::list& l) {
+  std::vector<T> v;
+  v.reserve(py::len(l));
+  for (py::handle item : l) {
+    v.push_back(item.cast<T>());
+  }
+  return v;
+}
+
+// Helper function to Accumulate()
+// Gathers, for one (category, area range, max_detections) setting, every
+// applicable detection across the num_images consecutive entries starting at
+// evaluations[evaluation_index].  The three parallel outputs describe each
+// gathered detection k:
+//   evaluation_indices[k]      index into evaluations[] of its image entry
+//   image_detection_indices[k] its position within that entry's detections
+//   detection_scores[k]        its score
+// detection_sorted_indices is a permutation of [0, n) ordering the gathered
+// detections by decreasing score (stable for ties).  At most max_detections
+// detections are taken per image.  Returns the number of ground truth
+// instances, summed over those images, that are not marked as ignored.
+int BuildSortedDetectionList(
+    const std::vector<ImageEvaluation>& evaluations,
+    const int64_t evaluation_index,
+    const int64_t num_images,
+    const int max_detections,
+    std::vector<uint64_t>* evaluation_indices,
+    std::vector<double>* detection_scores,
+    std::vector<uint64_t>* detection_sorted_indices,
+    std::vector<uint64_t>* image_detection_indices) {
+  assert(evaluations.size() >= evaluation_index + num_images);
+
+  // Reset the outputs and reserve room for the worst case of max_detections
+  // detections in every image
+  const int64_t max_entries = num_images * max_detections;
+  image_detection_indices->clear();
+  evaluation_indices->clear();
+  detection_scores->clear();
+  image_detection_indices->reserve(max_entries);
+  evaluation_indices->reserve(max_entries);
+  detection_scores->reserve(max_entries);
+
+  int num_valid_ground_truth = 0;
+  const int64_t end_index = evaluation_index + num_images;
+  for (int64_t e = evaluation_index; e < end_index; ++e) {
+    const ImageEvaluation& evaluation = evaluations[e];
+    // Take the first min(#detections, max_detections) detections
+    const int num_taken = std::min(
+        static_cast<int>(evaluation.detection_scores.size()), max_detections);
+    for (int d = 0; d < num_taken; ++d) {
+      evaluation_indices->push_back(e);
+      image_detection_indices->push_back(d);
+      detection_scores->push_back(evaluation.detection_scores[d]);
+    }
+    // Count ground truth instances that are not ignored
+    for (const bool ignored : evaluation.ground_truth_ignores) {
+      num_valid_ground_truth += ignored ? 0 : 1;
+    }
+  }
+
+  // Sort detections by decreasing score, using stable sort to match
+  // python implementation
+  std::vector<uint64_t>& order = *detection_sorted_indices;
+  order.resize(detection_scores->size());
+  std::iota(order.begin(), order.end(), 0);
+  std::stable_sort(
+      order.begin(), order.end(), [detection_scores](size_t lhs, size_t rhs) {
+        return (*detection_scores)[lhs] > (*detection_scores)[rhs];
+      });
+
+  return num_valid_ground_truth;
+}
+
+// Helper function to Accumulate()
+// Compute a precision recall curve given a sorted list of detected instances
+// encoded in evaluations, evaluation_indices, detection_scores,
+// detection_sorted_indices, image_detection_indices (see
+// BuildSortedDetectionList()), using precisions and recalls as temporary
+// storage.  The final recall is written to recalls_out[recalls_out_index];
+// for each recall threshold r, the sampled precision and score are written to
+// precisions_out and scores_out at index
+// precisions_out_index + r * precisions_out_stride.
+void ComputePrecisionRecallCurve(
+    const int64_t precisions_out_index,
+    const int64_t precisions_out_stride,
+    const int64_t recalls_out_index,
+    const std::vector<double>& recall_thresholds,
+    const int iou_threshold_index,
+    const int num_iou_thresholds,
+    const int num_valid_ground_truth,
+    const std::vector<ImageEvaluation>& evaluations,
+    const std::vector<uint64_t>& evaluation_indices,
+    const std::vector<double>& detection_scores,
+    const std::vector<uint64_t>& detection_sorted_indices,
+    const std::vector<uint64_t>& image_detection_indices,
+    std::vector<double>* precisions,
+    std::vector<double>* recalls,
+    std::vector<double>* precisions_out,
+    std::vector<double>* scores_out,
+    std::vector<double>* recalls_out) {
+  assert(recalls_out->size() > recalls_out_index);
+
+  // Compute precision/recall for each instance in the sorted list of detections
+  int64_t true_positives_sum = 0, false_positives_sum = 0;
+  precisions->clear();
+  recalls->clear();
+  precisions->reserve(detection_sorted_indices.size());
+  recalls->reserve(detection_sorted_indices.size());
+  assert(!evaluations.empty() || detection_sorted_indices.empty());
+  for (auto detection_sorted_index : detection_sorted_indices) {
+    const ImageEvaluation& evaluation =
+        evaluations[evaluation_indices[detection_sorted_index]];
+    const auto num_detections =
+        evaluation.detection_matches.size() / num_iou_thresholds;
+    const auto detection_index = iou_threshold_index * num_detections +
+        image_detection_indices[detection_sorted_index];
+    assert(evaluation.detection_matches.size() > detection_index);
+    assert(evaluation.detection_ignores.size() > detection_index);
+    const int64_t detection_match =
+        evaluation.detection_matches[detection_index];
+    const bool detection_ignores =
+        evaluation.detection_ignores[detection_index];
+    const auto true_positive = detection_match > 0 && !detection_ignores;
+    const auto false_positive = detection_match == 0 && !detection_ignores;
+    if (true_positive) {
+      ++true_positives_sum;
+    }
+    if (false_positive) {
+      ++false_positives_sum;
+    }
+    // num_valid_ground_truth is the recall denominator; Accumulate() skips 0
+    const double recall =
+        static_cast<double>(true_positives_sum) / num_valid_ground_truth;
+    recalls->push_back(recall);
+    const int64_t num_valid_detections =
+        true_positives_sum + false_positives_sum;
+    const double precision = num_valid_detections > 0
+        ? static_cast<double>(true_positives_sum) / num_valid_detections
+        : 0.0;
+    precisions->push_back(precision);
+  }
+  // Overall recall for this setting is the recall after the last detection
+  (*recalls_out)[recalls_out_index] = !recalls->empty() ? recalls->back() : 0;
+  // Right-to-left pass: raise each precision to the max of those after it
+  for (int64_t i = static_cast<int64_t>(precisions->size()) - 1; i > 0; --i) {
+    if ((*precisions)[i] > (*precisions)[i - 1]) {
+      (*precisions)[i - 1] = (*precisions)[i];
+    }
+  }
+
+  // Sample the per instance precision/recall list at each recall threshold
+  for (size_t r = 0; r < recall_thresholds.size(); ++r) {
+    // first index in recalls >= recall_thresholds[r]
+    std::vector<double>::iterator low = std::lower_bound(
+        recalls->begin(), recalls->end(), recall_thresholds[r]);
+    size_t precisions_index = low - recalls->begin();
+
+    const auto results_ind = precisions_out_index + r * precisions_out_stride;
+    assert(results_ind < precisions_out->size());
+    assert(results_ind < scores_out->size());
+    if (precisions_index < precisions->size()) {
+      (*precisions_out)[results_ind] = (*precisions)[precisions_index];
+      (*scores_out)[results_ind] =
+          detection_scores[detection_sorted_indices[precisions_index]];
+    } else { // no detection reaches this recall threshold
+      (*precisions_out)[results_ind] = 0;
+      (*scores_out)[results_ind] = 0;
+    }
+  }
+}
+// Builds flattened precision, recall and score tables for every combination of
+// IOU threshold, recall threshold, category, area range and max-detections
+// setting in params; settings with no valid ground truth keep the initial -1.
+py::dict Accumulate(
+    const py::object& params,
+    const std::vector<ImageEvaluation>& evaluations) {
+  const std::vector<double> recall_thresholds =
+      list_to_vec<double>(params.attr("recThrs"));
+  const std::vector<int> max_detections =
+      list_to_vec<int>(params.attr("maxDets"));
+  const int num_iou_thresholds = py::len(params.attr("iouThrs"));
+  const int num_recall_thresholds = py::len(params.attr("recThrs"));
+  const int num_categories = params.attr("useCats").cast<int>() == 1
+      ? py::len(params.attr("catIds"))
+      : 1;
+  const int num_area_ranges = py::len(params.attr("areaRng"));
+  const int num_max_detections = py::len(params.attr("maxDets"));
+  const int num_images = py::len(params.attr("imgIds"));
+
+  std::vector<double> precisions_out(
+      num_iou_thresholds * num_recall_thresholds * num_categories *
+          num_area_ranges * num_max_detections,
+      -1);
+  std::vector<double> recalls_out(
+      num_iou_thresholds * num_categories * num_area_ranges *
+          num_max_detections,
+      -1);
+  std::vector<double> scores_out(
+      num_iou_thresholds * num_recall_thresholds * num_categories *
+          num_area_ranges * num_max_detections,
+      -1);
+
+  // Consider the list of all detected instances in the entire dataset in one
+  // large list.  evaluation_indices, detection_scores,
+  // image_detection_indices, and detection_sorted_indices all have the same
+  // length as this list, such that each entry corresponds to one detected
+  // instance
+  std::vector<uint64_t> evaluation_indices; // indices into evaluations[]
+  std::vector<double> detection_scores; // detection scores of each instance
+  std::vector<uint64_t> detection_sorted_indices; // score-sorted permutation
+  std::vector<uint64_t> image_detection_indices; // index within own image
+  std::vector<double> precisions, recalls;
+
+  for (auto c = 0; c < num_categories; ++c) {
+    for (auto a = 0; a < num_area_ranges; ++a) {
+      for (auto m = 0; m < num_max_detections; ++m) {
+        // The COCO PythonAPI assumes evaluations[] (the return value of
+        // COCOeval::EvaluateImages() is one long list storing results for each
+        // combination of category, area range, and image id, with categories in
+        // the outermost loop and images in the innermost loop.
+        const int64_t evaluations_index =
+            c * num_area_ranges * num_images + a * num_images;
+        int num_valid_ground_truth = BuildSortedDetectionList(
+            evaluations,
+            evaluations_index,
+            num_images,
+            max_detections[m],
+            &evaluation_indices,
+            &detection_scores,
+            &detection_sorted_indices,
+            &image_detection_indices);
+
+        if (num_valid_ground_truth == 0) {
+          continue;
+        }
+
+        for (auto t = 0; t < num_iou_thresholds; ++t) {
+          // recalls_out is a flattened vectors representing a
+          // num_iou_thresholds X num_categories X num_area_ranges X
+          // num_max_detections matrix
+          const int64_t recalls_out_index =
+              t * num_categories * num_area_ranges * num_max_detections +
+              c * num_area_ranges * num_max_detections +
+              a * num_max_detections + m;
+
+          // precisions_out and scores_out are flattened vectors
+          // representing a num_iou_thresholds X num_recall_thresholds X
+          // num_categories X num_area_ranges X num_max_detections matrix
+          const int64_t precisions_out_stride =
+              num_categories * num_area_ranges * num_max_detections;
+          const int64_t precisions_out_index = t * num_recall_thresholds *
+                  num_categories * num_area_ranges * num_max_detections +
+              c * num_area_ranges * num_max_detections +
+              a * num_max_detections + m;
+
+          ComputePrecisionRecallCurve(
+              precisions_out_index,
+              precisions_out_stride,
+              recalls_out_index,
+              recall_thresholds,
+              t,
+              num_iou_thresholds,
+              num_valid_ground_truth,
+              evaluations,
+              evaluation_indices,
+              detection_scores,
+              detection_sorted_indices,
+              image_detection_indices,
+              &precisions,
+              &recalls,
+              &precisions_out,
+              &scores_out,
+              &recalls_out);
+        }
+      }
+    }
+  }
+
+  time_t rawtime;
+  struct tm local_time;
+  std::array<char, 200> buffer{}; // zero-filled so the unused tail is defined
+  time(&rawtime);
+#ifdef _WIN32
+  localtime_s(&local_time, &rawtime);
+#else
+  localtime_r(&rawtime, &local_time);
+#endif
+  // Minutes are %M; the format previously read "%H:%num_max_detections:%S"
+  strftime(buffer.data(), buffer.size(), "%Y-%m-%d %H:%M:%S", &local_time);
+  return py::dict(
+      "params"_a = params,
+      "counts"_a = std::vector<int64_t>({num_iou_thresholds,
+                                         num_recall_thresholds,
+                                         num_categories,
+                                         num_area_ranges,
+                                         num_max_detections}),
+      "date"_a = buffer, // NOTE(review): exported as the raw char array
+      "precision"_a = precisions_out,
+      "recall"_a = recalls_out,
+      "scores"_a = scores_out);
+}
+
+} // namespace COCOeval
diff --git a/yolox/layers/csrc/cocoeval/cocoeval.h b/yolox/layers/csrc/cocoeval/cocoeval.h
new file mode 100644
index 0000000000000000000000000000000000000000..f9def4151102d1c493dc88186384342565798d05
--- /dev/null
+++ b/yolox/layers/csrc/cocoeval/cocoeval.h
@@ -0,0 +1,85 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+#pragma once
+
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <pybind11/stl_bind.h>
+#include <vector>
+
+namespace py = pybind11;
+
+namespace COCOeval {
+
+// One annotated object instance in an image (a detection or a ground truth)
+struct InstanceAnnotation {
+  uint64_t id; // no default: always set by the constructor
+  double score = 0.;
+  double area = 0.;
+  bool is_crowd = false;
+  bool ignore = false;
+  InstanceAnnotation(
+      uint64_t id, double score, double area, bool is_crowd, bool ignore)
+      : id(id),
+        score(score),
+        area(area),
+        is_crowd(is_crowd),
+        ignore(ignore) {}
+};
+
+// Stores intermediate results for evaluating detection results for a single
+// image with D detected instances and G ground truth instances: the matches
+// between them for each of T IOU thresholds, plus scores and ignore flags
+struct ImageEvaluation {
+  // T x D, flattened as [t * D + d]: the id of the ground truth instance
+  // matched to detection d at threshold t, or 0 if unmatched
+  std::vector<uint64_t> detection_matches;
+
+  // The detection score of each of the D detected instances
+  std::vector<double> detection_scores;
+
+  // Marks whether or not each of G instances was ignored from evaluation (e.g.,
+  // because it's outside area_range)
+  std::vector<bool> ground_truth_ignores;
+
+  // T x D, flattened as [t * D + d]: whether detection d is excluded from
+  // evaluation at threshold t (e.g., because it's outside area_range)
+  std::vector<bool> detection_ignores;
+};
+
+template <class T>
+using ImageCategoryInstances = std::vector<std::vector<std::vector<T>>>;
+
+// C++ implementation of COCO API cocoeval.py::COCOeval.evaluateImg().  For each
+// combination of image, category, area range settings, and IOU thresholds to
+// evaluate, it matches detected instances to ground truth instances and stores
+// the results into a vector of ImageEvaluation results, which will be
+// interpreted by the COCOeval::Accumulate() function to produce
+// precision-recall curves.  The nested vector parameters mean the following:
+//   image_category_ious[i][c][d][g] is the intersection over union of the d'th
+//     detected instance and g'th ground truth instance of
+//     category category_ids[c] in image image_ids[i]
+//   image_category_ground_truth_instances[i][c] is a vector of ground truth
+//     instances in image image_ids[i] of category category_ids[c]
+//   image_category_detection_instances[i][c] is a vector of detected
+//     instances in image image_ids[i] of category category_ids[c]
+std::vector<ImageEvaluation> EvaluateImages(
+    const std::vector<std::array<double, 2>>& area_ranges, // vector of 2-tuples
+    int max_detections, // cap on detections kept per image and category
+    const std::vector<double>& iou_thresholds,
+    const ImageCategoryInstances<std::vector<double>>& image_category_ious,
+    const ImageCategoryInstances<InstanceAnnotation>&
+        image_category_ground_truth_instances,
+    const ImageCategoryInstances<InstanceAnnotation>&
+        image_category_detection_instances);
+
+// C++ implementation of COCOeval.accumulate(), which generates precision
+// recall curves for each set of category, IOU threshold, detection area range,
+// and max number of detections parameters.  It is assumed that the parameter
+// evaluations is the return value of the function COCOeval::EvaluateImages(),
+// which was called with the same parameter settings params
+py::dict Accumulate(
+    const py::object& params,
+    const std::vector<ImageEvaluation>& evaluations);
+
+} // namespace COCOeval
diff --git a/yolox/layers/csrc/vision.cpp b/yolox/layers/csrc/vision.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7663d0faf5c58542624d2f01730618b9aa9d4a25
--- /dev/null
+++ b/yolox/layers/csrc/vision.cpp
@@ -0,0 +1,13 @@
+#include "cocoeval/cocoeval.h"
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+    // Functions exposed to Python
+    m.def("COCOevalAccumulate", &COCOeval::Accumulate, "COCOeval::Accumulate");
+    m.def(
+        "COCOevalEvaluateImages", &COCOeval::EvaluateImages,
+        "COCOeval::EvaluateImages");
+    // Types exposed to Python; only their constructors are bound
+    py::class_<COCOeval::InstanceAnnotation>(m, "InstanceAnnotation")
+        .def(py::init<uint64_t, double, double, bool, bool>());
+    py::class_<COCOeval::ImageEvaluation>(m, "ImageEvaluation").def(py::init<>());
+}
diff --git a/yolox/layers/fast_coco_eval_api.py b/yolox/layers/fast_coco_eval_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..560c6d507107fc9915d7d3effb4fb09319bccbeb
--- /dev/null
+++ b/yolox/layers/fast_coco_eval_api.py
@@ -0,0 +1,147 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# This file comes from
+# https://github.com/facebookresearch/detectron2/blob/master/detectron2/evaluation/fast_eval_api.py
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import copy
+import time
+
+import numpy as np
+from pycocotools.cocoeval import COCOeval
+
+# import torch first to make yolox._C work without ImportError of libc10.so
+# in YOLOX, env is already set in __init__.py.
+from yolox import _C
+
+
+class COCOeval_opt(COCOeval):
+    """
+    This is a slightly modified version of the original COCO API, where the functions evaluateImg()
+    and accumulate() are implemented in C++ to speedup evaluation
+    """
+
+    def evaluate(self):
+        """
+        Run per image evaluation on given images and store results in self.evalImgs_cpp, a
+        datastructure that isn't readable from Python but is used by a c++ implementation of
+        accumulate().  Unlike the original COCO PythonAPI, we don't populate the datastructure
+        self.evalImgs because this datastructure is a computational bottleneck.
+        :return: None
+        """
+        tic = time.time()
+
+        print("Running per image evaluation...")
+        p = self.params
+        # add backward compatibility if useSegm is specified in params
+        if p.useSegm is not None:
+            p.iouType = "segm" if p.useSegm == 1 else "bbox"
+            print(
+                "useSegm (deprecated) is not None. Running {} evaluation".format(p.iouType)
+            )
+        print("Evaluate annotation type *{}*".format(p.iouType))
+        p.imgIds = list(np.unique(p.imgIds))
+        if p.useCats:
+            p.catIds = list(np.unique(p.catIds))
+        p.maxDets = sorted(p.maxDets)
+        self.params = p
+
+        self._prepare()
+
+        # loop through images, area range, max detection number
+        catIds = p.catIds if p.useCats else [-1]
+
+        if p.iouType == "segm" or p.iouType == "bbox":
+            computeIoU = self.computeIoU
+        elif p.iouType == "keypoints":
+            computeIoU = self.computeOks
+        self.ious = {
+            (imgId, catId): computeIoU(imgId, catId)
+            for imgId in p.imgIds
+            for catId in catIds
+        }
+
+        maxDet = p.maxDets[-1]
+
+        # <<<< Beginning of code differences with original COCO API
+        def convert_instances_to_cpp(instances, is_det=False):
+            # Convert annotations for a list of instances in an image to a format that's fast
+            # to access in C++
+            instances_cpp = []
+            for instance in instances:
+                instance_cpp = _C.InstanceAnnotation(
+                    int(instance["id"]),
+                    instance["score"] if is_det else instance.get("score", 0.0),
+                    instance["area"],
+                    bool(instance.get("iscrowd", 0)),
+                    bool(instance.get("ignore", 0)),
+                )
+                instances_cpp.append(instance_cpp)
+            return instances_cpp
+
+        # Convert GT annotations, detections, and IOUs to a format that's fast to access in C++
+        ground_truth_instances = [
+            [convert_instances_to_cpp(self._gts[imgId, catId]) for catId in p.catIds]
+            for imgId in p.imgIds
+        ]
+        detected_instances = [
+            [
+                convert_instances_to_cpp(self._dts[imgId, catId], is_det=True)
+                for catId in p.catIds
+            ]
+            for imgId in p.imgIds
+        ]
+        ious = [[self.ious[imgId, catId] for catId in catIds] for imgId in p.imgIds]
+
+        if not p.useCats:
+            # For each image, flatten per-category lists into a single list
+            ground_truth_instances = [
+                [[o for c in i for o in c]] for i in ground_truth_instances
+            ]
+            detected_instances = [
+                [[o for c in i for o in c]] for i in detected_instances
+            ]
+
+        # Call C++ implementation of self.evaluateImgs()
+        self._evalImgs_cpp = _C.COCOevalEvaluateImages(
+            p.areaRng,
+            maxDet,
+            p.iouThrs,
+            ious,
+            ground_truth_instances,
+            detected_instances,
+        )
+        self._evalImgs = None
+
+        self._paramsEval = copy.deepcopy(self.params)
+        toc = time.time()
+        print("COCOeval_opt.evaluate() finished in {:0.2f} seconds.".format(toc - tic))
+        # >>>> End of code differences with original COCO API
+
+    def accumulate(self):
+        """
+        Accumulate per image evaluation results and store the result in self.eval.  Does not
+        support changing parameter settings from those used by self.evaluate()
+        """
+        print("Accumulating evaluation results...")
+        tic = time.time()
+        if not hasattr(self, "_evalImgs_cpp"):
+            print("Please run evaluate() first")
+
+        self.eval = _C.COCOevalAccumulate(self._paramsEval, self._evalImgs_cpp)
+
+        # recall is num_iou_thresholds X num_categories X num_area_ranges X num_max_detections
+        self.eval["recall"] = np.array(self.eval["recall"]).reshape(
+            self.eval["counts"][:1] + self.eval["counts"][2:]
+        )
+
+        # precision and scores are num_iou_thresholds X num_recall_thresholds X num_categories X
+        # num_area_ranges X num_max_detections
+        self.eval["precision"] = np.array(self.eval["precision"]).reshape(
+            self.eval["counts"]
+        )
+        self.eval["scores"] = np.array(self.eval["scores"]).reshape(self.eval["counts"])
+        toc = time.time()
+        print(
+            "COCOeval_opt.accumulate() finished in {:0.2f} seconds.".format(toc - tic)
+        )
diff --git a/yolox/models/__init__.py b/yolox/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..34703602a80216db0dd0c5d154ede7704b7ac16b
--- /dev/null
+++ b/yolox/models/__init__.py
@@ -0,0 +1,9 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+
+from .darknet import CSPDarknet, Darknet
+from .losses import IOUloss
+from .yolo_fpn import YOLOFPN
+from .yolo_head import YOLOXHead
+from .yolo_pafpn import YOLOPAFPN
+from .yolox import YOLOX
diff --git a/yolox/models/darknet.py b/yolox/models/darknet.py
new file mode 100644
index 0000000000000000000000000000000000000000..10d054d6603fd6afd63e0b83a87f63703849088b
--- /dev/null
+++ b/yolox/models/darknet.py
@@ -0,0 +1,140 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+from torch import nn
+
+from .network_blocks import BaseConv, CSPLayer, DWConv, Focus, ResLayer, SPPBottleneck
+
+
+class Darknet(nn.Module):
+    """Darknet backbone: a stem plus stages dark2..dark5; forward returns the requested stages."""
+    depth2blocks = {21: [1, 2, 2, 1], 53: [2, 8, 8, 4]}  # depth -> ResLayers in dark2..dark5
+
+    def __init__(
+        self, depth, in_channels=3, stem_out_channels=32, out_features=("dark3", "dark4", "dark5"),
+    ):
+        """
+        Args:
+            depth (int): depth of darknet used in model; must be a key of `depth2blocks` (21, 53).
+            in_channels (int): number of input channels, for example, use 3 for RGB image.
+            stem_out_channels (int): number of output channels of darknet stem.
+                It decides channels of darknet layer2 to layer5.
+            out_features (Tuple[str]): names of the stages that `forward` should return.
+        """
+        super().__init__()
+        assert out_features, "please provide output features of Darknet"
+        self.out_features = out_features
+        self.stem = nn.Sequential(
+            BaseConv(in_channels, stem_out_channels, ksize=3, stride=1, act="lrelu"),
+            *self.make_group_layer(stem_out_channels, num_blocks=1, stride=2),
+        )
+        in_channels = stem_out_channels * 2  # 64 (with the default stem_out_channels=32)
+
+        num_blocks = Darknet.depth2blocks[depth]  # KeyError for an unsupported depth
+        # create darknet with `stem_out_channels` and `num_blocks` layers.
+        # to make model structure more clear, we don't use `for` statement in python.
+        self.dark2 = nn.Sequential(*self.make_group_layer(in_channels, num_blocks[0], stride=2))
+        in_channels *= 2  # 128
+        self.dark3 = nn.Sequential(*self.make_group_layer(in_channels, num_blocks[1], stride=2))
+        in_channels *= 2  # 256
+        self.dark4 = nn.Sequential(*self.make_group_layer(in_channels, num_blocks[2], stride=2))
+        in_channels *= 2  # 512
+
+        self.dark5 = nn.Sequential(
+            *self.make_group_layer(in_channels, num_blocks[3], stride=2),
+            *self.make_spp_block([in_channels, in_channels * 2], in_channels * 2),
+        )
+
+    def make_group_layer(self, in_channels: int, num_blocks: int, stride: int = 1):
+        "Return a list: one conv doubling the channels, then `num_blocks` `ResLayer` modules."
+        return [
+            BaseConv(in_channels, in_channels * 2, ksize=3, stride=stride, act="lrelu"),
+            *[(ResLayer(in_channels * 2)) for _ in range(num_blocks)]
+        ]
+
+    def make_spp_block(self, filters_list, in_filters):  # conv, conv, SPP, conv, conv
+        m = nn.Sequential(
+            *[
+                BaseConv(in_filters, filters_list[0], 1, stride=1, act="lrelu"),
+                BaseConv(filters_list[0], filters_list[1], 3, stride=1, act="lrelu"),
+                SPPBottleneck(
+                    in_channels=filters_list[1],
+                    out_channels=filters_list[0],
+                    activation="lrelu"
+                ),
+                BaseConv(filters_list[0], filters_list[1], 3, stride=1, act="lrelu"),
+                BaseConv(filters_list[1], filters_list[0], 1, stride=1, act="lrelu"),
+            ]
+        )
+        return m  # the block ends with filters_list[0] output channels
+
+    def forward(self, x):
+        outputs = {}  # every stage output is recorded, then filtered on return
+        x = self.stem(x)
+        outputs["stem"] = x
+        x = self.dark2(x)
+        outputs["dark2"] = x
+        x = self.dark3(x)
+        outputs["dark3"] = x
+        x = self.dark4(x)
+        outputs["dark4"] = x
+        x = self.dark5(x)
+        outputs["dark5"] = x
+        return {k: v for k, v in outputs.items() if k in self.out_features}
+
+
+class CSPDarknet(nn.Module):
+    """Backbone made of a Focus stem and four conv + CSPLayer stages (dark2..dark5)."""
+    def __init__(self, dep_mul, wid_mul, out_features=("dark3", "dark4", "dark5"), depthwise=False):
+        super().__init__()
+        assert out_features, "please provide output features of Darknet"
+        self.out_features = out_features
+        Conv = DWConv if depthwise else BaseConv
+
+        base_channels = int(wid_mul * 64)  # 64 when wid_mul == 1
+        base_depth = max(round(dep_mul * 3), 1)  # 3 when dep_mul == 1, never below 1
+
+        # stem (always takes 3 input channels)
+        self.stem = Focus(3, base_channels, ksize=3)
+
+        # dark2
+        self.dark2 = nn.Sequential(
+            Conv(base_channels, base_channels * 2, 3, 2),
+            CSPLayer(base_channels * 2, base_channels * 2, n=base_depth, depthwise=depthwise),
+        )
+
+        # dark3
+        self.dark3 = nn.Sequential(
+            Conv(base_channels * 2, base_channels * 4, 3, 2),
+            CSPLayer(base_channels * 4, base_channels * 4, n=base_depth * 3, depthwise=depthwise),
+        )
+
+        # dark4
+        self.dark4 = nn.Sequential(
+            Conv(base_channels * 4, base_channels * 8, 3, 2),
+            CSPLayer(base_channels * 8, base_channels * 8, n=base_depth * 3, depthwise=depthwise),
+        )
+
+        # dark5 (adds an SPPBottleneck; its CSPLayer is built with shortcut=False)
+        self.dark5 = nn.Sequential(
+            Conv(base_channels * 8, base_channels * 16, 3, 2),
+            SPPBottleneck(base_channels * 16, base_channels * 16),
+            CSPLayer(
+                base_channels * 16, base_channels * 16, n=base_depth,
+                shortcut=False, depthwise=depthwise,
+            ),
+        )
+
+    def forward(self, x):
+        outputs = {}  # every stage output is recorded, then filtered on return
+        x = self.stem(x)
+        outputs["stem"] = x
+        x = self.dark2(x)
+        outputs["dark2"] = x
+        x = self.dark3(x)
+        outputs["dark3"] = x
+        x = self.dark4(x)
+        outputs["dark4"] = x
+        x = self.dark5(x)
+        outputs["dark5"] = x
+        return {k: v for k, v in outputs.items() if k in self.out_features}
diff --git a/yolox/models/losses.py b/yolox/models/losses.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1cfc56deea4e5fe496b93af84dab6224c0fd968
--- /dev/null
+++ b/yolox/models/losses.py
@@ -0,0 +1,48 @@
+import torch
+import torch.nn as nn
+
+
+class IOUloss(nn.Module):
+    """IoU / GIoU loss for boxes given as (center_x, center_y, width, height)."""
+
+    def __init__(self, reduction="none", loss_type="iou"):
+        super(IOUloss, self).__init__()
+        self.reduction = reduction
+        self.loss_type = loss_type
+
+    def forward(self, pred, target):
+        """Return the loss per box (reduced per `self.reduction`); ValueError on bad loss_type."""
+        assert pred.shape[0] == target.shape[0]
+
+        pred = pred.view(-1, 4)
+        target = target.view(-1, 4)
+        # box corners from center -/+ half size, shared by the IoU and GIoU paths
+        half_p, half_g = pred[:, 2:] / 2, target[:, 2:] / 2
+        pred_tl, pred_br = pred[:, :2] - half_p, pred[:, :2] + half_p
+        gt_tl, gt_br = target[:, :2] - half_g, target[:, :2] + half_g
+        tl = torch.max(pred_tl, gt_tl)
+        br = torch.min(pred_br, gt_br)
+
+        area_p = torch.prod(pred[:, 2:], 1)
+        area_g = torch.prod(target[:, 2:], 1)
+
+        en = (tl < br).type(tl.type()).prod(dim=1)  # 1 where the boxes overlap, else 0
+        area_i = torch.prod(br - tl, 1) * en
+        iou = (area_i) / (area_p + area_g - area_i + 1e-16)
+
+        if self.loss_type == "iou":
+            loss = 1 - iou ** 2
+        elif self.loss_type == "giou":
+            # area of the smallest box enclosing both prediction and target
+            area_c = torch.prod(torch.max(pred_br, gt_br) - torch.min(pred_tl, gt_tl), 1)
+            giou = iou - (area_c - area_i) / area_c.clamp(1e-16)
+            loss = 1 - giou.clamp(min=-1.0, max=1.0)
+        else:
+            raise ValueError("Unsupported loss type: {}".format(self.loss_type))
+
+        if self.reduction == "mean":
+            loss = loss.mean()
+        elif self.reduction == "sum":
+            loss = loss.sum()
+
+        return loss
diff --git a/yolox/models/network_blocks.py b/yolox/models/network_blocks.py
new file mode 100644
index 0000000000000000000000000000000000000000..61c415dafc1594500d900525309c23398d3f6853
--- /dev/null
+++ b/yolox/models/network_blocks.py
@@ -0,0 +1,166 @@
+import torch
+import torch.nn as nn
+
+
+class SiLU(nn.Module):
+    """Export-friendly stand-in for nn.SiLU(): computes x * sigmoid(x)."""
+    @staticmethod
+    def forward(x):
+        gate = torch.sigmoid(x)
+        return x * gate
+
+
+def get_activation(name="silu", inplace=True):
+    """Build the activation module called `name` ("silu", "relu" or "lrelu")."""
+    if name == "lrelu":
+        return nn.LeakyReLU(0.1, inplace=inplace)
+    if name == "relu":
+        return nn.ReLU(inplace=inplace)
+    if name == "silu":
+        return nn.SiLU(inplace=inplace)
+    # any other name is rejected
+    raise AttributeError("Unsupported act type: {}".format(name))
+
+
+class BaseConv(nn.Module):
+    """
+    Conv2d followed by BatchNorm2d and an activation (silu / relu / leaky relu).
+    """
+
+    def __init__(self, in_channels, out_channels, ksize, stride, groups=1, bias=False, act="silu"):
+        super().__init__()
+        # padding of (ksize - 1) // 2 on each side ("same" padding for odd kernels)
+        self.conv = nn.Conv2d(
+            in_channels, out_channels,
+            kernel_size=ksize, stride=stride, padding=(ksize - 1) // 2,
+            groups=groups, bias=bias,
+        )
+        self.bn = nn.BatchNorm2d(out_channels)
+        self.act = get_activation(act, inplace=True)
+
+    def forward(self, x):
+        """conv -> batchnorm -> activation."""
+        x = self.conv(x)
+        x = self.bn(x)
+        return self.act(x)
+
+    def fuseforward(self, x):
+        """conv -> activation, skipping `self.bn`."""
+        x = self.conv(x)
+        return self.act(x)
+
+
+class DWConv(nn.Module):
+    """Depthwise Conv + Conv: a per-channel `ksize` conv followed by a 1x1 conv."""
+
+    def __init__(self, in_channels, out_channels, ksize, stride=1, act="silu"):
+        super().__init__()
+        # depthwise part: groups == in_channels, channel count unchanged
+        self.dconv = BaseConv(
+            in_channels, in_channels, ksize=ksize, stride=stride, groups=in_channels, act=act
+        )
+        # pointwise part: 1x1 conv mapping to out_channels
+        self.pconv = BaseConv(in_channels, out_channels, ksize=1, stride=1, groups=1, act=act)
+
+    def forward(self, x):
+        """Apply the depthwise conv, then the pointwise conv."""
+        depthwise_out = self.dconv(x)
+        return self.pconv(depthwise_out)
+
+
+class Bottleneck(nn.Module):
+    """Standard bottleneck: 1x1 conv then 3x3 conv, with an optional residual add."""
+
+    def __init__(self, in_channels, out_channels, shortcut=True, expansion=0.5, depthwise=False):
+        super().__init__()
+        hidden_channels = int(out_channels * expansion)
+        self.conv1 = BaseConv(in_channels, hidden_channels, 1, stride=1)
+        conv_cls = DWConv if depthwise else BaseConv
+        self.conv2 = conv_cls(hidden_channels, out_channels, 3, stride=1)
+        # the skip connection is used only when requested and the channel counts agree
+        self.use_add = shortcut and in_channels == out_channels
+
+    def forward(self, x):
+        y = self.conv2(self.conv1(x))
+        return y + x if self.use_add else y
+
+
+class ResLayer(nn.Module):
+    "Residual block: 1x1 conv to `in_channels // 2`, 3x3 conv back to `in_channels`, plus skip."
+    def __init__(self, in_channels: int):
+        super().__init__()
+        half_channels = in_channels // 2
+        self.layer1 = BaseConv(in_channels, half_channels, ksize=1, stride=1, act="lrelu")
+        self.layer2 = BaseConv(half_channels, in_channels, ksize=3, stride=1, act="lrelu")
+
+    def forward(self, x):
+        squeezed = self.layer1(x)
+        return x + self.layer2(squeezed)
+
+
+class SPPBottleneck(nn.Module):
+    """Spatial pyramid pooling layer used in YOLOv3-SPP"""
+
+    def __init__(self, in_channels, out_channels, kernel_sizes=(5, 9, 13), activation="silu"):
+        super().__init__()
+        hidden_channels = in_channels // 2
+        self.conv1 = BaseConv(in_channels, hidden_channels, 1, stride=1, act=activation)
+        # one stride-1 max pool per kernel size, padded by ks // 2
+        pools = [nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2) for ks in kernel_sizes]
+        self.m = nn.ModuleList(pools)
+        # conv2 receives the conv1 output plus one pooled copy per kernel size
+        conv2_channels = hidden_channels * (len(kernel_sizes) + 1)
+        self.conv2 = BaseConv(conv2_channels, out_channels, 1, stride=1, act=activation)
+
+    def forward(self, x):
+        x = self.conv1(x)
+        return self.conv2(torch.cat([x, *(pool(x) for pool in self.m)], dim=1))
+
+
+class CSPLayer(nn.Module):
+    """CSP bottleneck with three 1x1 convolutions (called C3 in yolov5)."""
+
+    def __init__(
+        self, in_channels, out_channels, n=1,
+        shortcut=True, expansion=0.5, depthwise=False
+    ):
+        """
+        Args:
+            in_channels (int): channels of the input tensor.
+            out_channels (int): channels produced by the final conv.
+            n (int): number of Bottlenecks. Default value: 1.
+            shortcut (bool): forwarded to every Bottleneck.
+            expansion (float): hidden width as a fraction of `out_channels`.
+            depthwise (bool): forwarded to every Bottleneck.
+        """
+        super().__init__()
+        width = int(out_channels * expansion)
+        self.conv1 = BaseConv(in_channels, width, 1, stride=1)
+        self.conv2 = BaseConv(in_channels, width, 1, stride=1)
+        self.conv3 = BaseConv(2 * width, out_channels, 1, stride=1)
+        # n bottlenecks that keep the hidden width (their own expansion is 1.0)
+        blocks = [Bottleneck(width, width, shortcut, 1.0, depthwise) for _ in range(n)]
+        self.m = nn.Sequential(*blocks)
+
+    def forward(self, x):
+        main = self.conv1(x)
+        skip = self.conv2(x)
+        return self.conv3(torch.cat((self.m(main), skip), dim=1))
+
+
+class Focus(nn.Module):
+    """Focus width and height information into channel space."""
+
+    def __init__(self, in_channels, out_channels, ksize=1, stride=1):
+        super().__init__()
+        # the conv sees four stacked sub-samplings of the input, hence 4x channels
+        self.conv = BaseConv(in_channels * 4, out_channels, ksize, stride)
+
+    def forward(self, x):
+        """Stack the four stride-2 sub-samplings of `x` on dim 1, then apply `self.conv`."""
+        # (row, col) start offsets inside each 2x2 cell, in the order
+        # top-left, bottom-left, top-right, bottom-right; with even-sized last
+        # two dims, (b, c, s1, s2) becomes (b, 4c, s1/2, s2/2) before the conv.
+        offsets = ((0, 0), (1, 0), (0, 1), (1, 1))
+        patches = tuple(x[..., row::2, col::2] for row, col in offsets)
+        return self.conv(torch.cat(patches, dim=1))
diff --git a/yolox/models/yolo_fpn.py b/yolox/models/yolo_fpn.py
new file mode 100644
index 0000000000000000000000000000000000000000..b0db2539d365f25b65c115939e205ebae36fcd03
--- /dev/null
+++ b/yolox/models/yolo_fpn.py
@@ -0,0 +1,80 @@
+import torch
+import torch.nn as nn
+
+from .darknet import Darknet
+from .network_blocks import BaseConv
+
+
+class YOLOFPN(nn.Module):
+    """
+    YOLOFPN module. Darknet 53 is the default backbone of this model.
+    """
+
+    def __init__(
+        self, depth=53, in_features=["dark3", "dark4", "dark5"],
+    ):
+        super().__init__()
+        # NOTE: the channel sizes below are hard-coded for Darknet's default stem_out_channels.
+        self.backbone = Darknet(depth)
+        self.in_features = in_features  # forward() expects exactly three names, shallow to deep
+
+        # out 1
+        self.out1_cbl = self._make_cbl(512, 256, 1)
+        self.out1 = self._make_embedding([256, 512], 512 + 256)
+
+        # out 2
+        self.out2_cbl = self._make_cbl(256, 128, 1)
+        self.out2 = self._make_embedding([128, 256], 256 + 128)
+
+        # upsample
+        self.upsample = nn.Upsample(scale_factor=2, mode="nearest")
+
+    def _make_cbl(self, _in, _out, ks):  # one BaseConv with leaky relu and stride 1
+        return BaseConv(_in, _out, ks, stride=1, act="lrelu")
+
+    def _make_embedding(self, filters_list, in_filters):  # five alternating 1x1 / 3x3 convs
+        m = nn.Sequential(
+            *[
+                self._make_cbl(in_filters, filters_list[0], 1),
+                self._make_cbl(filters_list[0], filters_list[1], 3),
+
+                self._make_cbl(filters_list[1], filters_list[0], 1),
+
+                self._make_cbl(filters_list[0], filters_list[1], 3),
+                self._make_cbl(filters_list[1], filters_list[0], 1),
+            ]
+        )
+        return m
+
+    def load_pretrained_model(self, filename="./weights/darknet53.mix.pth"):
+        with open(filename, "rb") as f:
+            state_dict = torch.load(f, map_location="cpu")
+        print("loading pretrained weights...")
+        self.backbone.load_state_dict(state_dict)  # only the backbone weights are loaded
+
+    def forward(self, inputs):
+        """
+        Args:
+            inputs (Tensor): input image.
+
+        Returns:
+            Tuple[Tensor]: FPN output features, ordered (out_dark3, out_dark4, x0).
+        """
+        #  backbone
+        out_features = self.backbone(inputs)
+        x2, x1, x0 = [out_features[f] for f in self.in_features]
+
+        #  yolo branch 1
+        x1_in = self.out1_cbl(x0)
+        x1_in = self.upsample(x1_in)
+        x1_in = torch.cat([x1_in, x1], 1)
+        out_dark4 = self.out1(x1_in)
+
+        #  yolo branch 2
+        x2_in = self.out2_cbl(out_dark4)
+        x2_in = self.upsample(x2_in)
+        x2_in = torch.cat([x2_in, x2], 1)
+        out_dark3 = self.out2(x2_in)
+
+        outputs = (out_dark3, out_dark4, x0)  # x0 is the backbone feature, passed through as is
+        return outputs
diff --git a/yolox/models/yolo_head.py b/yolox/models/yolo_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..f162172a89d7fe6f35dc076e183541660e5e094e
--- /dev/null
+++ b/yolox/models/yolo_head.py
@@ -0,0 +1,525 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+
+import math
+from loguru import logger
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from yolox.utils import bboxes_iou
+
+from .losses import IOUloss
+from .network_blocks import BaseConv, DWConv
+
+
+class YOLOXHead(nn.Module):
+    def __init__(
+        self, num_classes, width=1.0, strides=[8, 16, 32],
+        in_channels=[256, 512, 1024], act="silu", depthwise=False
+    ):
+        """
+        Args:
+            num_classes (int): number of classes; sets the channels of every cls_preds conv.
+            width (float): multiplier applied to `in_channels` and to the 256 hidden channels.
+            act (str): activation type of conv. Default value: "silu".
+            depthwise (bool): whether to apply depthwise conv in conv branch. Default value: False.
+        """
+        super().__init__()
+        self.n_anchors = 1
+        self.num_classes = num_classes
+        self.decode_in_inference = True  # for deploy, set to False
+        self.cls_convs = nn.ModuleList()
+        self.reg_convs = nn.ModuleList()
+        self.cls_preds = nn.ModuleList()
+        self.reg_preds = nn.ModuleList()
+        self.obj_preds = nn.ModuleList()
+        self.stems = nn.ModuleList()
+        Conv = DWConv if depthwise else BaseConv
+
+        for i in range(len(in_channels)):  # one stem / branch pair / prediction set per level
+            self.stems.append(
+                BaseConv(
+                    in_channels=int(in_channels[i] * width),
+                    out_channels=int(256 * width),
+                    ksize=1,
+                    stride=1,
+                    act=act,
+                )
+            )
+            self.cls_convs.append(
+                nn.Sequential(
+                    *[
+                        Conv(
+                            in_channels=int(256 * width),
+                            out_channels=int(256 * width),
+                            ksize=3,
+                            stride=1,
+                            act=act,
+                        ),
+                        Conv(
+                            in_channels=int(256 * width),
+                            out_channels=int(256 * width),
+                            ksize=3,
+                            stride=1,
+                            act=act,
+                        ),
+                    ]
+                )
+            )
+            self.reg_convs.append(
+                nn.Sequential(
+                    *[
+                        Conv(
+                            in_channels=int(256 * width),
+                            out_channels=int(256 * width),
+                            ksize=3,
+                            stride=1,
+                            act=act,
+                        ),
+                        Conv(
+                            in_channels=int(256 * width),
+                            out_channels=int(256 * width),
+                            ksize=3,
+                            stride=1,
+                            act=act,
+                        ),
+                    ]
+                )
+            )
+            self.cls_preds.append(
+                nn.Conv2d(
+                    in_channels=int(256 * width),
+                    out_channels=self.n_anchors * self.num_classes,
+                    kernel_size=1,
+                    stride=1,
+                    padding=0,
+                )
+            )
+            self.reg_preds.append(
+                nn.Conv2d(
+                    in_channels=int(256 * width),
+                    out_channels=4,
+                    kernel_size=1,
+                    stride=1,
+                    padding=0,
+                )
+            )
+            self.obj_preds.append(
+                nn.Conv2d(
+                    in_channels=int(256 * width),
+                    out_channels=self.n_anchors * 1,
+                    kernel_size=1,
+                    stride=1,
+                    padding=0,
+                )
+            )
+
+        self.use_l1 = False  # when True, forward/get_losses also compute an L1 box loss
+        self.l1_loss = nn.L1Loss(reduction="none")
+        self.bcewithlog_loss = nn.BCEWithLogitsLoss(reduction="none")
+        self.iou_loss = IOUloss(reduction="none")
+        self.strides = strides  # one stride per level, zipped with the inputs in forward
+        self.grids = [torch.zeros(1)] * len(in_channels)  # per-level cache for get_output_and_grid
+        self.expanded_strides = [None] * len(in_channels)
+
+    def initialize_biases(self, prior_prob):
+        """Reset every cls/obj prediction bias to the logit of `prior_prob`."""
+        # logit(p) == -log((1 - p) / p), so sigmoid(bias) starts at prior_prob
+        bias_value = -math.log((1 - prior_prob) / prior_prob)
+        for pred_convs in (self.cls_preds, self.obj_preds):
+            for conv in pred_convs:
+                b = conv.bias.view(self.n_anchors, -1)
+                b.data.fill_(bias_value)
+                # re-wrap the (now filled) bias as a fresh trainable Parameter
+                conv.bias = torch.nn.Parameter(b.view(-1), requires_grad=True)
+
+    def forward(self, xin, labels=None, imgs=None):
+        """Run the head on the feature levels `xin`: losses when training, predictions otherwise."""
+        outputs = []
+        origin_preds = []
+        x_shifts = []
+        y_shifts = []
+        expanded_strides = []
+
+        for k, (cls_conv, reg_conv, stride_this_level, x) in enumerate(
+            zip(self.cls_convs, self.reg_convs, self.strides, xin)
+        ):
+            x = self.stems[k](x)
+            cls_x = x
+            reg_x = x
+
+            cls_feat = cls_conv(cls_x)
+            cls_output = self.cls_preds[k](cls_feat)
+
+            reg_feat = reg_conv(reg_x)
+            reg_output = self.reg_preds[k](reg_feat)
+            obj_output = self.obj_preds[k](reg_feat)  # objectness shares the regression features
+
+            if self.training:
+                output = torch.cat([reg_output, obj_output, cls_output], 1)  # raw logits
+                output, grid = self.get_output_and_grid(output, k, stride_this_level, xin[0].type())
+                x_shifts.append(grid[:, :, 0])
+                y_shifts.append(grid[:, :, 1])
+                expanded_strides.append(
+                    torch.zeros(1, grid.shape[1]).fill_(stride_this_level).type_as(xin[0])
+                )
+                if self.use_l1:
+                    origin_preds.append(reg_output.clone())  # undecoded boxes for the L1 loss
+            else:
+                output = torch.cat([reg_output, obj_output.sigmoid(), cls_output.sigmoid()], 1)
+
+            outputs.append(output)
+
+        if self.training:
+            return self.get_losses(
+                imgs, x_shifts, y_shifts, expanded_strides, labels,
+                torch.cat(outputs, 1), origin_preds, dtype=xin[0].dtype
+            )
+        else:
+            self.hw = [x.shape[-2:] for x in outputs]  # per-level (h, w), read by decode_outputs
+            # [batch, n_anchors_all, 5 + num_classes]
+            outputs = torch.cat([x.flatten(start_dim=2) for x in outputs], dim=2).permute(0, 2, 1)
+            if self.decode_in_inference:
+                return self.decode_outputs(outputs, dtype=xin[0].type())
+            else:
+                return outputs
+
+    def get_output_and_grid(self, output, k, stride, dtype):
+        """Flatten level-`k` raw output to [batch, anchors, ch] and decode xy/wh to pixels."""
+        grid = self.grids[k]
+
+        batch_size = output.shape[0]
+        n_ch = 5 + self.num_classes
+        hsize, wsize = output.shape[-2:]
+        # Rebuild the cached grid when height OR width changed (checking only the
+        # height would reuse a stale grid for inputs that differ in width alone).
+        if grid.shape[2:4] != output.shape[2:4]:
+            yv, xv = torch.meshgrid([torch.arange(hsize), torch.arange(wsize)])
+            grid = torch.stack((xv, yv), 2).view(1, 1, hsize, wsize, 2).type(dtype)
+            self.grids[k] = grid
+
+        output = output.view(batch_size, self.n_anchors, n_ch, hsize, wsize)
+        output = output.permute(0, 1, 3, 4, 2).reshape(batch_size, self.n_anchors * hsize * wsize, -1)
+        grid = grid.view(1, -1, 2)
+        output[..., :2] = (output[..., :2] + grid) * stride
+        output[..., 2:4] = torch.exp(output[..., 2:4]) * stride
+        return output, grid
+
+    def decode_outputs(self, outputs, dtype):
+        """Decode raw xy/wh of `outputs` in place using the grids of `self.hw`; returns it."""
+        level_grids, level_strides = [], []
+        for (hsize, wsize), stride in zip(self.hw, self.strides):
+            yv, xv = torch.meshgrid([torch.arange(hsize), torch.arange(wsize)])
+            cell_xy = torch.stack((xv, yv), 2).view(1, -1, 2)
+            level_grids.append(cell_xy)
+            # one stride entry per grid cell: shape (1, hsize * wsize, 1)
+            level_strides.append(torch.full((*cell_xy.shape[:2], 1), stride))
+        all_grids = torch.cat(level_grids, dim=1).type(dtype)
+        all_strides = torch.cat(level_strides, dim=1).type(dtype)
+
+        xy = (outputs[..., :2] + all_grids) * all_strides
+        outputs[..., :2] = xy
+        outputs[..., 2:4] = torch.exp(outputs[..., 2:4]) * all_strides
+        return outputs
+
+    def get_losses(
+        self, imgs, x_shifts, y_shifts, expanded_strides, labels, outputs, origin_preds, dtype,
+    ):
+        """Assign targets per image and return (loss, weighted iou, obj, cls, l1, fg ratio)."""
+        bbox_preds = outputs[:, :, :4]  # [batch, n_anchors_all, 4]
+        obj_preds = outputs[:, :, 4].unsqueeze(-1)  # [batch, n_anchors_all, 1]
+        cls_preds = outputs[:, :, 5:]  # [batch, n_anchors_all, n_cls]
+        # calculate targets
+        mixup = labels.shape[2] > 5  # more than 5 label columns; only the first 5 are summed
+        if mixup:
+            label_cut = labels[..., :5]
+        else:
+            label_cut = labels
+        nlabel = (label_cut.sum(dim=2) > 0).sum(dim=1)  # number of objects
+
+        total_num_anchors = outputs.shape[1]
+        x_shifts = torch.cat(x_shifts, 1)  # [1, n_anchors_all]
+        y_shifts = torch.cat(y_shifts, 1)  # [1, n_anchors_all]
+        expanded_strides = torch.cat(expanded_strides, 1)
+        if self.use_l1:
+            origin_preds = torch.cat(origin_preds, 1)
+
+        cls_targets = []
+        reg_targets = []
+        l1_targets = []
+        obj_targets = []
+        fg_masks = []
+
+        num_fg = 0.0
+        num_gts = 0.0
+
+        for batch_idx in range(outputs.shape[0]):
+            num_gt = int(nlabel[batch_idx])
+            num_gts += num_gt
+            if num_gt == 0:
+                # no objects in this image: empty targets and an all-False foreground mask
+                cls_target = outputs.new_zeros((0, self.num_classes))
+                reg_target = outputs.new_zeros((0, 4))
+                l1_target = outputs.new_zeros((0, 4))
+                obj_target = outputs.new_zeros((total_num_anchors, 1))
+                fg_mask = outputs.new_zeros(total_num_anchors).bool()
+            else:
+                gt_bboxes_per_image = labels[batch_idx, :num_gt, 1:5]  # columns 1..4: box
+                gt_classes = labels[batch_idx, :num_gt, 0]  # column 0: class id
+                bboxes_preds_per_image = bbox_preds[batch_idx]
+
+                try:
+                    gt_matched_classes, fg_mask, pred_ious_this_matching, matched_gt_inds, num_fg_img = self.get_assignments(  # noqa
+                        batch_idx, num_gt, total_num_anchors, gt_bboxes_per_image, gt_classes,
+                        bboxes_preds_per_image, expanded_strides, x_shifts, y_shifts,
+                        cls_preds, bbox_preds, obj_preds, labels, imgs,
+                    )
+                except RuntimeError:  # NOTE: any RuntimeError is treated as OOM and retried on CPU
+                    logger.error(
+                        "OOM RuntimeError is raised due to the huge memory cost during label assignment. \
+                           CPU mode is applied in this batch. If you want to avoid this issue, \
+                           try to reduce the batch size or image size."
+                    )
+                    torch.cuda.empty_cache()
+                    gt_matched_classes, fg_mask, pred_ious_this_matching, matched_gt_inds, num_fg_img = self.get_assignments(  # noqa
+                        batch_idx, num_gt, total_num_anchors, gt_bboxes_per_image, gt_classes,
+                        bboxes_preds_per_image, expanded_strides, x_shifts, y_shifts,
+                        cls_preds, bbox_preds, obj_preds, labels, imgs, "cpu",
+                    )
+
+                torch.cuda.empty_cache()
+                num_fg += num_fg_img
+
+                # class target: one-hot of the matched class scaled by the matched IoU
+                cls_target = F.one_hot(
+                    gt_matched_classes.to(torch.int64), self.num_classes
+                ) * pred_ious_this_matching.unsqueeze(-1)
+                obj_target = fg_mask.unsqueeze(-1)
+                reg_target = gt_bboxes_per_image[matched_gt_inds]
+                if self.use_l1:
+                    l1_target = self.get_l1_target(
+                        outputs.new_zeros((num_fg_img, 4)),
+                        gt_bboxes_per_image[matched_gt_inds],
+                        expanded_strides[0][fg_mask],
+                        x_shifts=x_shifts[0][fg_mask],
+                        y_shifts=y_shifts[0][fg_mask],
+                    )
+
+            cls_targets.append(cls_target)
+            reg_targets.append(reg_target)
+            obj_targets.append(obj_target.to(dtype))
+            fg_masks.append(fg_mask)
+            if self.use_l1:
+                l1_targets.append(l1_target)
+
+        cls_targets = torch.cat(cls_targets, 0)
+        reg_targets = torch.cat(reg_targets, 0)
+        obj_targets = torch.cat(obj_targets, 0)
+        fg_masks = torch.cat(fg_masks, 0)
+        if self.use_l1:
+            l1_targets = torch.cat(l1_targets, 0)
+
+        num_fg = max(num_fg, 1)  # avoid dividing by zero when nothing was matched
+        loss_iou = (self.iou_loss(bbox_preds.view(-1, 4)[fg_masks], reg_targets)).sum() / num_fg
+        loss_obj = (self.bcewithlog_loss(obj_preds.view(-1, 1), obj_targets)).sum() / num_fg
+        loss_cls = (
+            self.bcewithlog_loss(cls_preds.view(-1, self.num_classes)[fg_masks], cls_targets)
+        ).sum() / num_fg
+        if self.use_l1:
+            loss_l1 = (self.l1_loss(origin_preds.view(-1, 4)[fg_masks], l1_targets)).sum() / num_fg
+        else:
+            loss_l1 = 0.0
+
+        reg_weight = 5.0  # weight of the IoU term in the total loss
+        loss = reg_weight * loss_iou + loss_obj + loss_cls + loss_l1
+
+        return loss, reg_weight * loss_iou, loss_obj, loss_cls, loss_l1, num_fg / max(num_gts, 1)
+
+    def get_l1_target(self, l1_target, gt, stride, x_shifts, y_shifts, eps=1e-8):
+        """Fill `l1_target` in place with grid-relative xy offsets and log-scaled wh of `gt`."""
+        for col, shift in enumerate((x_shifts, y_shifts)):
+            l1_target[:, col] = gt[:, col] / stride - shift
+            l1_target[:, col + 2] = torch.log(gt[:, col + 2] / stride + eps)
+        return l1_target
+
+    @torch.no_grad()
+    def get_assignments(
+        self, batch_idx, num_gt, total_num_anchors, gt_bboxes_per_image, gt_classes,
+        bboxes_preds_per_image, expanded_strides, x_shifts, y_shifts,
+        cls_preds, bbox_preds, obj_preds, labels, imgs, mode="gpu",
+    ):
+        """Pick the foreground anchors for image `batch_idx`; mode="cpu" does the work on CPU."""
+        if mode == "cpu":
+            print("------------CPU Mode for This Batch-------------")
+            gt_bboxes_per_image = gt_bboxes_per_image.cpu().float()
+            bboxes_preds_per_image = bboxes_preds_per_image.cpu().float()
+            gt_classes = gt_classes.cpu().float()
+            expanded_strides = expanded_strides.cpu().float()
+            x_shifts = x_shifts.cpu()
+            y_shifts = y_shifts.cpu()
+
+        fg_mask, is_in_boxes_and_center = self.get_in_boxes_info(
+            gt_bboxes_per_image, expanded_strides, x_shifts, y_shifts, total_num_anchors, num_gt,
+        )
+
+        # from here on only the candidate anchors selected by fg_mask are considered
+        bboxes_preds_per_image = bboxes_preds_per_image[fg_mask]
+        cls_preds_ = cls_preds[batch_idx][fg_mask]
+        obj_preds_ = obj_preds[batch_idx][fg_mask]
+        num_in_boxes_anchor = bboxes_preds_per_image.shape[0]
+
+        if mode == "cpu":  # presumably already on CPU after the block above
+            gt_bboxes_per_image = gt_bboxes_per_image.cpu()
+            bboxes_preds_per_image = bboxes_preds_per_image.cpu()
+
+        pair_wise_ious = bboxes_iou(
+            gt_bboxes_per_image, bboxes_preds_per_image, False
+        )
+
+        gt_cls_per_image = (
+            F.one_hot(gt_classes.to(torch.int64), self.num_classes).float()
+            .unsqueeze(1).repeat(1, num_in_boxes_anchor, 1)
+        )
+        pair_wise_ious_loss = -torch.log(pair_wise_ious + 1e-8)
+
+        if mode == "cpu":
+            cls_preds_, obj_preds_ = cls_preds_.cpu(), obj_preds_.cpu()
+
+        # score = sigmoid(cls) * sigmoid(obj), repeated once per ground-truth box
+        cls_preds_ = (
+            cls_preds_.float().unsqueeze(0).repeat(num_gt, 1, 1).sigmoid_()
+            * obj_preds_.unsqueeze(0).repeat(num_gt, 1, 1).sigmoid_()
+        )
+        pair_wise_cls_loss = F.binary_cross_entropy(
+            cls_preds_.sqrt_(), gt_cls_per_image, reduction="none"
+        ).sum(-1)
+        del cls_preds_
+        # cost = cls loss + 3 * IoU loss, plus a large penalty where is_in_boxes_and_center is False
+        cost = (
+            pair_wise_cls_loss
+            + 3.0 * pair_wise_ious_loss
+            + 100000.0 * (~is_in_boxes_and_center)
+        )
+
+        (
+            num_fg, gt_matched_classes, pred_ious_this_matching, matched_gt_inds
+        ) = self.dynamic_k_matching(cost, pair_wise_ious, gt_classes, num_gt, fg_mask)
+        del pair_wise_cls_loss, cost, pair_wise_ious, pair_wise_ious_loss
+
+        if mode == "cpu":  # move the matching results back to the GPU
+            gt_matched_classes = gt_matched_classes.cuda()
+            fg_mask = fg_mask.cuda()
+            pred_ious_this_matching = pred_ious_this_matching.cuda()
+            matched_gt_inds = matched_gt_inds.cuda()
+
+        return gt_matched_classes, fg_mask, pred_ious_this_matching, matched_gt_inds, num_fg
+
+    def get_in_boxes_info(
+        self, gt_bboxes_per_image, expanded_strides, x_shifts, y_shifts, total_num_anchors, num_gt,
+    ):  # returns (candidate-anchor mask, per-gt in-box-and-in-center mask over those candidates)
+        expanded_strides_per_image = expanded_strides[0]
+        x_shifts_per_image = x_shifts[0] * expanded_strides_per_image
+        y_shifts_per_image = y_shifts[0] * expanded_strides_per_image
+        x_centers_per_image = (
+            (x_shifts_per_image + 0.5 * expanded_strides_per_image)
+            .unsqueeze(0)
+            .repeat(num_gt, 1)
+        )  # [n_anchor] -> [n_gt, n_anchor]
+        y_centers_per_image = (
+            (y_shifts_per_image + 0.5 * expanded_strides_per_image)
+            .unsqueeze(0)
+            .repeat(num_gt, 1)
+        )
+        # gt box edges (left/right/top/bottom) from columns 0..3 read as (cx, cy, w, h), one copy per anchor
+        gt_bboxes_per_image_l = (
+            (gt_bboxes_per_image[:, 0] - 0.5 * gt_bboxes_per_image[:, 2])
+            .unsqueeze(1)
+            .repeat(1, total_num_anchors)
+        )
+        gt_bboxes_per_image_r = (
+            (gt_bboxes_per_image[:, 0] + 0.5 * gt_bboxes_per_image[:, 2])
+            .unsqueeze(1)
+            .repeat(1, total_num_anchors)
+        )
+        gt_bboxes_per_image_t = (
+            (gt_bboxes_per_image[:, 1] - 0.5 * gt_bboxes_per_image[:, 3])
+            .unsqueeze(1)
+            .repeat(1, total_num_anchors)
+        )
+        gt_bboxes_per_image_b = (
+            (gt_bboxes_per_image[:, 1] + 0.5 * gt_bboxes_per_image[:, 3])
+            .unsqueeze(1)
+            .repeat(1, total_num_anchors)
+        )
+        # signed distances from each anchor center to the four box edges; all > 0 means inside the box
+        b_l = x_centers_per_image - gt_bboxes_per_image_l
+        b_r = gt_bboxes_per_image_r - x_centers_per_image
+        b_t = y_centers_per_image - gt_bboxes_per_image_t
+        b_b = gt_bboxes_per_image_b - y_centers_per_image
+        bbox_deltas = torch.stack([b_l, b_t, b_r, b_b], 2)
+
+        is_in_boxes = bbox_deltas.min(dim=-1).values > 0.0
+        is_in_boxes_all = is_in_boxes.sum(dim=0) > 0
+        # in fixed center
+
+        center_radius = 2.5
+        # same test against a square of half-size center_radius * stride around the gt center
+        gt_bboxes_per_image_l = (gt_bboxes_per_image[:, 0]).unsqueeze(1).repeat(
+            1, total_num_anchors
+        ) - center_radius * expanded_strides_per_image.unsqueeze(0)
+        gt_bboxes_per_image_r = (gt_bboxes_per_image[:, 0]).unsqueeze(1).repeat(
+            1, total_num_anchors
+        ) + center_radius * expanded_strides_per_image.unsqueeze(0)
+        gt_bboxes_per_image_t = (gt_bboxes_per_image[:, 1]).unsqueeze(1).repeat(
+            1, total_num_anchors
+        ) - center_radius * expanded_strides_per_image.unsqueeze(0)
+        gt_bboxes_per_image_b = (gt_bboxes_per_image[:, 1]).unsqueeze(1).repeat(
+            1, total_num_anchors
+        ) + center_radius * expanded_strides_per_image.unsqueeze(0)
+
+        c_l = x_centers_per_image - gt_bboxes_per_image_l
+        c_r = gt_bboxes_per_image_r - x_centers_per_image
+        c_t = y_centers_per_image - gt_bboxes_per_image_t
+        c_b = gt_bboxes_per_image_b - y_centers_per_image
+        center_deltas = torch.stack([c_l, c_t, c_r, c_b], 2)
+        is_in_centers = center_deltas.min(dim=-1).values > 0.0
+        is_in_centers_all = is_in_centers.sum(dim=0) > 0
+
+        # candidate anchors: inside any gt box OR inside any gt center region
+        is_in_boxes_anchor = is_in_boxes_all | is_in_centers_all
+
+        is_in_boxes_and_center = (
+            is_in_boxes[:, is_in_boxes_anchor] & is_in_centers[:, is_in_boxes_anchor]
+        )
+        return is_in_boxes_anchor, is_in_boxes_and_center
+
+    def dynamic_k_matching(self, cost, pair_wise_ious, gt_classes, num_gt, fg_mask):
+        """Pick a dynamic number of lowest-cost anchors per gt; update `fg_mask` in place.
+
+        Returns (num_fg, gt_matched_classes, pred_ious_this_matching, matched_gt_inds).
+        """
+        matching_matrix = torch.zeros_like(cost)
+
+        # torch.topk raises when k exceeds the dimension size, so never ask for
+        # more candidates than there are anchors in `pair_wise_ious`.
+        n_candidate_k = min(10, pair_wise_ious.size(1))
+        topk_ious, _ = torch.topk(pair_wise_ious, n_candidate_k, dim=1)
+        # k per gt = sum of its top IoUs truncated to int, but at least 1
+        dynamic_ks = torch.clamp(topk_ious.sum(1).int(), min=1)
+        for gt_idx in range(num_gt):
+            _, pos_idx = torch.topk(
+                cost[gt_idx], k=dynamic_ks[gt_idx].item(), largest=False
+            )
+            matching_matrix[gt_idx][pos_idx] = 1.0
+        del topk_ious, dynamic_ks  # `pos_idx` is unbound when num_gt == 0, so it is not deleted here
+        # an anchor claimed by several gts is kept only for its lowest-cost gt
+        anchor_matching_gt = matching_matrix.sum(0)
+        if (anchor_matching_gt > 1).sum() > 0:
+            _, cost_argmin = torch.min(cost[:, anchor_matching_gt > 1], dim=0)
+            matching_matrix[:, anchor_matching_gt > 1] *= 0.0
+            matching_matrix[cost_argmin, anchor_matching_gt > 1] = 1.0
+        fg_mask_inboxes = matching_matrix.sum(0) > 0.0
+        num_fg = fg_mask_inboxes.sum().item()
+        fg_mask[fg_mask.clone()] = fg_mask_inboxes  # narrow the caller's mask to the matched anchors
+        matched_gt_inds = matching_matrix[:, fg_mask_inboxes].argmax(0)
+        gt_matched_classes = gt_classes[matched_gt_inds]
+        pred_ious_this_matching = (matching_matrix * pair_wise_ious).sum(0)[fg_mask_inboxes]
+        return num_fg, gt_matched_classes, pred_ious_this_matching, matched_gt_inds
diff --git a/yolox/models/yolo_pafpn.py b/yolox/models/yolo_pafpn.py
new file mode 100644
index 0000000000000000000000000000000000000000..99a35e49a607c65f6ced4d81e9ad8032aaa1825a
--- /dev/null
+++ b/yolox/models/yolo_pafpn.py
@@ -0,0 +1,99 @@
+import torch
+import torch.nn as nn
+
+from .darknet import CSPDarknet
+from .network_blocks import BaseConv, CSPLayer, DWConv
+
+
+class YOLOPAFPN(nn.Module):
+    """
+    Feature-pyramid neck on a CSPDarknet backbone: a top-down (upsample) path, then a bottom-up path.
+    """
+
+    def __init__(
+        self, depth=1.0, width=1.0, in_features=("dark3", "dark4", "dark5"),
+        in_channels=[256, 512, 1024], depthwise=False,
+    ):
+        super().__init__()
+        self.backbone = CSPDarknet(depth, width, depthwise=depthwise)
+        self.in_features = in_features
+        self.in_channels = in_channels
+        Conv = DWConv if depthwise else BaseConv
+
+        self.upsample = nn.Upsample(scale_factor=2, mode="nearest")
+        self.lateral_conv0 = BaseConv(
+            int(in_channels[2] * width), int(in_channels[1] * width), 1, 1
+        )
+        self.C3_p4 = CSPLayer(
+            int(2 * in_channels[1] * width),
+            int(in_channels[1] * width),
+            round(3 * depth),
+            False,
+            depthwise=depthwise,
+        )  # cat
+
+        self.reduce_conv1 = BaseConv(
+            int(in_channels[1] * width), int(in_channels[0] * width), 1, 1
+        )
+        self.C3_p3 = CSPLayer(
+            int(2 * in_channels[0] * width),
+            int(in_channels[0] * width),
+            round(3 * depth),
+            False,
+            depthwise=depthwise,
+        )
+
+        # bottom-up conv
+        self.bu_conv2 = Conv(int(in_channels[0] * width), int(in_channels[0] * width), 3, 2)
+        self.C3_n3 = CSPLayer(
+            int(2 * in_channels[0] * width),
+            int(in_channels[1] * width),
+            round(3 * depth),
+            False,
+            depthwise=depthwise,
+        )
+
+        # bottom-up conv
+        self.bu_conv1 = Conv(int(in_channels[1] * width), int(in_channels[1] * width), 3, 2)
+        self.C3_n4 = CSPLayer(
+            int(2 * in_channels[1] * width),
+            int(in_channels[2] * width),
+            round(3 * depth),
+            False,
+            depthwise=depthwise,
+        )
+
+    def forward(self, input):
+        """
+        Args:
+            input: input images.
+
+        Returns:
+            Tuple[Tensor]: the three feature maps (pan_out2, pan_out1, pan_out0).
+        """
+
+        #  backbone
+        out_features = self.backbone(input)
+        features = [out_features[f] for f in self.in_features]
+        [x2, x1, x0] = features
+
+        fpn_out0 = self.lateral_conv0(x0)  # 1024->512/32
+        f_out0 = self.upsample(fpn_out0)  # 512/16
+        f_out0 = torch.cat([f_out0, x1], 1)  # 512->1024/16
+        f_out0 = self.C3_p4(f_out0)  # 1024->512/16
+
+        fpn_out1 = self.reduce_conv1(f_out0)  # 512->256/16
+        f_out1 = self.upsample(fpn_out1)  # 256/8
+        f_out1 = torch.cat([f_out1, x2], 1)  # 256->512/8
+        pan_out2 = self.C3_p3(f_out1)  # 512->256/8
+
+        p_out1 = self.bu_conv2(pan_out2)  # 256->256/16
+        p_out1 = torch.cat([p_out1, fpn_out1], 1)  # 256->512/16
+        pan_out1 = self.C3_n3(p_out1)  # 512->512/16
+
+        p_out0 = self.bu_conv1(pan_out1)  # 512->512/32
+        p_out0 = torch.cat([p_out0, fpn_out0], 1)  # 512->1024/32
+        pan_out0 = self.C3_n4(p_out0)  # 1024->1024/32
+
+        outputs = (pan_out2, pan_out1, pan_out0)
+        return outputs
diff --git a/yolox/models/yolox.py b/yolox/models/yolox.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d4c789c640fbcf860bfbc09fb54e058b2af11ac
--- /dev/null
+++ b/yolox/models/yolox.py
@@ -0,0 +1,45 @@
+
+import torch.nn as nn
+
+from .yolo_head import YOLOXHead
+from .yolo_pafpn import YOLOPAFPN
+
+
+class YOLOX(nn.Module):
+    """
+    YOLOX model module: a backbone (default YOLOPAFPN()) followed by a head (default YOLOXHead(80)).
+    In training mode forward() returns a dict of loss terms computed by the head;
+    otherwise it returns the head's outputs.
+    """
+
+    def __init__(self, backbone=None, head=None):
+        super().__init__()
+        if backbone is None:
+            backbone = YOLOPAFPN()
+        if head is None:
+            head = YOLOXHead(80)
+
+        self.backbone = backbone
+        self.head = head
+
+    def forward(self, x, targets=None):
+        # fpn output content features of [dark3, dark4, dark5]
+        fpn_outs = self.backbone(x)
+
+        if self.training:
+            assert targets is not None  # targets are mandatory in training mode
+            loss, iou_loss, conf_loss, cls_loss, l1_loss, num_fg = self.head(
+                fpn_outs, targets, x
+            )
+            outputs = {
+                "total_loss": loss,
+                "iou_loss": iou_loss,
+                "l1_loss": l1_loss,
+                "conf_loss": conf_loss,
+                "cls_loss": cls_loss,
+                "num_fg": num_fg,
+            }
+        else:
+            outputs = self.head(fpn_outs)
+
+        return outputs
diff --git a/yolox/utils/__init__.py b/yolox/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b212f3888cfae4b83abd9fdef0b3121547058151
--- /dev/null
+++ b/yolox/utils/__init__.py
@@ -0,0 +1,14 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+
+from .allreduce_norm import *
+from .boxes import *
+from .checkpoint import load_ckpt, save_checkpoint
+from .dist import *
+from .ema import ModelEMA
+from .logger import setup_logger
+from .lr_scheduler import LRScheduler
+from .metric import *
+from .model_utils import *
+from .setup_env import *
+from .visualize import *
diff --git a/yolox/utils/allreduce_norm.py b/yolox/utils/allreduce_norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..2092fe65e185dd3f8462937cc9b21f841ca6823c
--- /dev/null
+++ b/yolox/utils/allreduce_norm.py
@@ -0,0 +1,99 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+
+import pickle
+from collections import OrderedDict
+
+import torch
+from torch import distributed as dist
+from torch import nn
+
+from .dist import _get_global_gloo_group, get_world_size
+
+ASYNC_NORM = (
+    nn.BatchNorm1d,
+    nn.BatchNorm2d,
+    nn.BatchNorm3d,
+    nn.InstanceNorm1d,
+    nn.InstanceNorm2d,
+    nn.InstanceNorm3d,
+)
+
+__all__ = [
+    "get_async_norm_states", "pyobj2tensor", "tensor2pyobj", "all_reduce", "all_reduce_norm"
+]
+
+
+def get_async_norm_states(module):
+    """Collect the state_dict entries of all ASYNC_NORM submodules, keyed by "<name>.<key>"."""
+    async_norm_states = OrderedDict()
+    norm_layers = [(n, m) for n, m in module.named_modules() if isinstance(m, ASYNC_NORM)]
+    for prefix, layer in norm_layers:
+        async_norm_states.update((prefix + "." + k, v) for k, v in layer.state_dict().items())
+    return async_norm_states
+
+
+def pyobj2tensor(pyobj, device="cuda"):
+    """Pickle `pyobj` and return the bytes as a ByteTensor moved to `device` (default "cuda")."""
+    storage = torch.ByteStorage.from_buffer(pickle.dumps(pyobj))
+    return torch.ByteTensor(storage).to(device=device)
+
+
+def tensor2pyobj(tensor):
+    """Unpickle the python object stored in a byte tensor; uses pickle.loads, so pass trusted data only."""
+    return pickle.loads(tensor.cpu().numpy().tobytes())
+
+
+def _get_reduce_op(op_name):  # case-insensitive lookup; an unknown name raises KeyError
+    return {
+        "sum": dist.ReduceOp.SUM,
+        "mean": dist.ReduceOp.SUM,  # "mean" also maps to SUM at this level
+    }[op_name.lower()]
+
+
+def all_reduce(py_dict, op="sum", group=None):
+    """
+    Apply all reduce function for python dict object.
+    NOTE: make sure that every py_dict has the same keys and values are in the same shape.
+
+    Args:
+        py_dict (dict): dict to apply all reduce op.
+        op (str): operator, could be "sum" or "mean" (case-insensitive).
+        group: process group used only to detect the single-process case;
+            defaults to the global gloo group.
+
+    Returns:
+        `py_dict` itself when only one process takes part, otherwise an OrderedDict
+        with the reduced tensors.
+    """
+    world_size = get_world_size()
+    if world_size == 1:
+        return py_dict
+    if group is None:
+        group = _get_global_gloo_group()
+    if dist.get_world_size(group) == 1:
+        return py_dict
+
+    # rank 0 broadcasts its key order so every rank flattens its tensors identically
+    py_key = list(py_dict.keys())
+    py_key_tensor = pyobj2tensor(py_key)
+    dist.broadcast(py_key_tensor, src=0)
+    py_key = tensor2pyobj(py_key_tensor)
+
+    tensor_shapes = [py_dict[k].shape for k in py_key]
+    tensor_numels = [py_dict[k].numel() for k in py_key]
+    flatten_tensor = torch.cat([py_dict[k].flatten() for k in py_key])
+    dist.all_reduce(flatten_tensor, op=_get_reduce_op(op))
+    if op.lower() == "mean":  # match _get_reduce_op, which lowercases `op`
+        flatten_tensor /= world_size
+    split_tensors = torch.split(flatten_tensor, tensor_numels)
+    return OrderedDict((k, t.reshape(s)) for k, t, s in zip(py_key, split_tensors, tensor_shapes))
+
+
+def all_reduce_norm(module):
+    """
+    Average the state of the ASYNC_NORM layers of `module` across processes and load it back.
+    """
+    states = get_async_norm_states(module)
+    states = all_reduce(states, op="mean")
+    module.load_state_dict(states, strict=False)  # strict=False: `states` holds only the norm-layer entries
diff --git a/yolox/utils/boxes.py b/yolox/utils/boxes.py
new file mode 100644
index 0000000000000000000000000000000000000000..a590567288a469348ca99bb000450d2da3f59c3e
--- /dev/null
+++ b/yolox/utils/boxes.py
@@ -0,0 +1,111 @@
+import numpy as np
+
+import torch
+import torchvision
+
+__all__ = [
+    "filter_box", "postprocess", "bboxes_iou", "matrix_iou",
+    "adjust_box_anns", "xyxy2xywh",
+]
+
+
+def filter_box(output, scale_range):
+    """
+    Keep the rows of output, an (N, 5+class) array of x1, y1, x2, y2, ... boxes, whose area
+    lies strictly between min_scale ** 2 and max_scale ** 2.
+    """
+    min_scale, max_scale = scale_range
+    area = (output[:, 2] - output[:, 0]) * (output[:, 3] - output[:, 1])
+    keep = (area > min_scale * min_scale) & (area < max_scale * max_scale)
+    return output[keep]
+
+
+def postprocess(prediction, num_classes, conf_thre=0.7, nms_thre=0.45):
+    """Convert boxes to corners in place, then confidence-filter and class-wise NMS each image.
+
+    Returns a list with one entry per image: a tensor with columns
+    (x1, y1, x2, y2, obj_conf, class_conf, class_pred), or None when nothing is kept.
+    """
+    box_corner = prediction.new(prediction.shape)
+    box_corner[:, :, 0] = prediction[:, :, 0] - prediction[:, :, 2] / 2
+    box_corner[:, :, 1] = prediction[:, :, 1] - prediction[:, :, 3] / 2
+    box_corner[:, :, 2] = prediction[:, :, 0] + prediction[:, :, 2] / 2
+    box_corner[:, :, 3] = prediction[:, :, 1] + prediction[:, :, 3] / 2
+    prediction[:, :, :4] = box_corner[:, :, :4]  # NOTE: overwrites the caller's tensor
+
+    output = [None for _ in range(len(prediction))]
+    for i, image_pred in enumerate(prediction):
+        # If none are remaining => process next image
+        if not image_pred.size(0):
+            continue
+        # Get score and class with highest confidence
+        class_conf, class_pred = torch.max(image_pred[:, 5: 5 + num_classes], 1, keepdim=True)
+
+        # squeeze only dim 1: a bare squeeze() collapses a single-row image to a 0-d mask,
+        # which then adds a dimension when used as an index below
+        conf_mask = image_pred[:, 4] * class_conf.squeeze(1) >= conf_thre
+        # Detections ordered as (x1, y1, x2, y2, obj_conf, class_conf, class_pred)
+        detections = torch.cat((image_pred[:, :5], class_conf, class_pred.float()), 1)
+        detections = detections[conf_mask]
+        if not detections.size(0):
+            continue
+
+        nms_out_index = torchvision.ops.batched_nms(
+            detections[:, :4],
+            detections[:, 4] * detections[:, 5],
+            detections[:, 6],
+            nms_thre,
+        )
+        output[i] = detections[nms_out_index]  # slot i is written at most once, so no concat is needed
+    return output
+
+
+def bboxes_iou(bboxes_a, bboxes_b, xyxy=True):  # pairwise IoU, result indexed [row of a, row of b]
+    if bboxes_a.shape[1] != 4 or bboxes_b.shape[1] != 4:
+        raise IndexError
+    # xyxy=True: rows are (x1, y1, x2, y2); otherwise rows are (center x, center y, width, height)
+    if xyxy:
+        tl = torch.max(bboxes_a[:, None, :2], bboxes_b[:, :2])
+        br = torch.min(bboxes_a[:, None, 2:], bboxes_b[:, 2:])
+        area_a = torch.prod(bboxes_a[:, 2:] - bboxes_a[:, :2], 1)
+        area_b = torch.prod(bboxes_b[:, 2:] - bboxes_b[:, :2], 1)
+    else:
+        tl = torch.max(
+            (bboxes_a[:, None, :2] - bboxes_a[:, None, 2:] / 2),
+            (bboxes_b[:, :2] - bboxes_b[:, 2:] / 2),
+        )
+        br = torch.min(
+            (bboxes_a[:, None, :2] + bboxes_a[:, None, 2:] / 2),
+            (bboxes_b[:, :2] + bboxes_b[:, 2:] / 2),
+        )
+
+        area_a = torch.prod(bboxes_a[:, 2:], 1)
+        area_b = torch.prod(bboxes_b[:, 2:], 1)
+    en = (tl < br).type(tl.type()).prod(dim=2)  # 1 where the boxes overlap on both axes, else 0
+    area_i = torch.prod(br - tl, 2) * en  # * ((tl < br).all())
+    return area_i / (area_a[:, None] + area_b - area_i)
+
+
+def matrix_iou(a, b):
+    """
+    return iou of a and b, numpy version for data augmentation
+    """
+    lt = np.maximum(a[:, np.newaxis, :2], b[:, :2])
+    rb = np.minimum(a[:, np.newaxis, 2:], b[:, 2:])
+
+    area_i = np.prod(rb - lt, axis=2) * (lt < rb).all(axis=2)  # zero where the boxes do not overlap
+    area_a = np.prod(a[:, 2:] - a[:, :2], axis=1)
+    area_b = np.prod(b[:, 2:] - b[:, :2], axis=1)
+    return area_i / (area_a[:, np.newaxis] + area_b - area_i + 1e-12)  # 1e-12 guards a zero union
+
+
+def adjust_box_anns(bbox, scale_ratio, padw, padh, w_max, h_max):  # modifies and returns `bbox` itself
+    bbox[:, 0::2] = np.clip(bbox[:, 0::2] * scale_ratio + padw, 0, w_max)  # even columns: scale, shift by padw, clip
+    bbox[:, 1::2] = np.clip(bbox[:, 1::2] * scale_ratio + padh, 0, h_max)  # odd columns: scale, shift by padh, clip
+    return bbox
+
+
+def xyxy2xywh(bboxes):  # in place: columns 2 and 3 become width and height, columns 0 and 1 are kept
+    bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 0]
+    bboxes[:, 3] = bboxes[:, 3] - bboxes[:, 1]
+    return bboxes
diff --git a/yolox/utils/checkpoint.py b/yolox/utils/checkpoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5760472db287a5066850bfcde2ddd2d484e16dd
--- /dev/null
+++ b/yolox/utils/checkpoint.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+import os
+import shutil
+from loguru import logger
+
+import torch
+
+
+def load_ckpt(model, ckpt):
+    """Load into `model` the entries of `ckpt` whose key and shape match; warn about the rest."""
+    load_dict = {}
+    for key_model, v in model.state_dict().items():
+        if key_model not in ckpt:
+            logger.warning(
+                "{} is not in the ckpt. Please double check and see if this is desired.".format(
+                    key_model
+                )
+            )
+            continue
+        v_ckpt = ckpt[key_model]
+        if v.shape != v_ckpt.shape:
+            logger.warning(
+                "Shape of {} in checkpoint is {}, while shape of {} in model is {}.".format(
+                    key_model, v_ckpt.shape, key_model, v.shape
+                )
+            )
+            continue
+        load_dict[key_model] = v_ckpt
+
+    model.load_state_dict(load_dict, strict=False)
+    return model
+
+
+def save_checkpoint(state, is_best, save_dir, model_name=""):
+    """Save `state` to <save_dir>/<model_name>_ckpt.pth.tar; if is_best, also copy it to best_ckpt.pth.tar."""
+    os.makedirs(save_dir, exist_ok=True)  # no exists() pre-check, so concurrent creation cannot raise
+    filename = os.path.join(save_dir, model_name + "_ckpt.pth.tar")
+    torch.save(state, filename)
+    if is_best:
+        best_filename = os.path.join(save_dir, "best_ckpt.pth.tar")
+        shutil.copyfile(filename, best_filename)
diff --git a/yolox/utils/dist.py b/yolox/utils/dist.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed6419b36b3aa07515efb81925cb6fb6b4a9f030
--- /dev/null
+++ b/yolox/utils/dist.py
@@ -0,0 +1,254 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# This file comes from
+# https://github.com/facebookresearch/detectron2/blob/master/detectron2/utils/comm.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+"""
+This file contains primitives for multi-gpu communication.
+This is useful when doing distributed training.
+"""
+
+import functools
+import logging
+import pickle
+import time
+
+import numpy as np
+
+import torch
+from torch import distributed as dist
+
+__all__ = [
+    "is_main_process",
+    "synchronize",
+    "get_world_size",
+    "get_rank",
+    "get_local_rank",
+    "get_local_size",
+    "time_synchronized",
+    "gather",
+    "all_gather",
+]
+
+_LOCAL_PROCESS_GROUP = None
+
+
+def synchronize():
+    """
+    Barrier across all processes of the default group.
+
+    Does nothing when torch.distributed is unavailable or uninitialized,
+    or when there is only a single process.
+    """
+    if not (dist.is_available() and dist.is_initialized()):
+        return
+    if dist.get_world_size() == 1:
+        return
+    dist.barrier()
+
+
+def get_world_size() -> int:
+    """Number of processes in the default group; 1 when torch.distributed is not in use."""
+    distributed = dist.is_available() and dist.is_initialized()
+    if not distributed:
+        return 1
+    return dist.get_world_size()
+
+
+def get_rank() -> int:
+    """Rank of this process in the default group; 0 when torch.distributed is not in use."""
+    distributed = dist.is_available() and dist.is_initialized()
+    if not distributed:
+        return 0
+    return dist.get_rank()
+
+
+def get_local_rank() -> int:
+    """
+    Returns:
+        The rank of the current process within the local (per-machine) process group,
+        or 0 when torch.distributed is unavailable or not initialized.
+    """
+    distributed = dist.is_available() and dist.is_initialized()
+    if not distributed:
+        return 0
+    assert _LOCAL_PROCESS_GROUP is not None
+    return dist.get_rank(group=_LOCAL_PROCESS_GROUP)
+
+
+def get_local_size() -> int:
+    """
+    Returns:
+        The size of the per-machine process group, i.e. the number of processes per machine,
+        or 1 when torch.distributed is unavailable or not initialized.
+    """
+    distributed = dist.is_available() and dist.is_initialized()
+    if not distributed:
+        return 1
+    return dist.get_world_size(group=_LOCAL_PROCESS_GROUP)
+
+
+def is_main_process() -> bool:
+    return get_rank() == 0  # the main process is the one whose get_rank() is 0
+
+
+@functools.lru_cache()
+def _get_global_gloo_group():
+    """
+    Return a process group based on gloo backend, containing all the ranks:
+    a new gloo group when the default backend is "nccl", otherwise the WORLD group.
+    The result is cached.
+    """
+    # new_group() is only needed when the default backend is nccl
+    default_is_nccl = dist.get_backend() == "nccl"
+    return dist.new_group(backend="gloo") if default_is_nccl else dist.group.WORLD
+
+
+def _serialize_to_tensor(data, group):  # pickles `data` into a byte tensor on the backend's device
+    backend = dist.get_backend(group)
+    assert backend in ["gloo", "nccl"]
+    device = torch.device("cpu" if backend == "gloo" else "cuda")  # gloo -> cpu, nccl -> cuda
+
+    buffer = pickle.dumps(data)
+    if len(buffer) > 1024 ** 3:  # warn (only) when the payload exceeds 1 GiB
+        logger = logging.getLogger(__name__)
+        logger.warning(
+            "Rank {} trying to all-gather {:.2f} GB of data on device {}".format(
+                get_rank(), len(buffer) / (1024 ** 3), device
+            )
+        )
+    storage = torch.ByteStorage.from_buffer(buffer)
+    tensor = torch.ByteTensor(storage).to(device=device)
+    return tensor
+
+
+def _pad_to_largest_tensor(tensor, group):
+    """
+    Returns:
+        list[int]: size of the tensor, on each rank
+        Tensor: padded tensor that has the max size
+    """
+    world_size = dist.get_world_size(group=group)
+    assert (
+        world_size >= 1
+    ), "comm.gather/all_gather must be called from ranks within the given group!"
+    local_size = torch.tensor([tensor.numel()], dtype=torch.int64, device=tensor.device)
+    size_list = [
+        torch.zeros([1], dtype=torch.int64, device=tensor.device)
+        for _ in range(world_size)
+    ]
+    dist.all_gather(size_list, local_size, group=group)  # exchange every rank's element count
+    size_list = [int(size.item()) for size in size_list]
+
+    max_size = max(size_list)
+
+    # we pad the tensor because torch all_gather does not support
+    # gathering tensors of different shapes
+    if local_size != max_size:
+        padding = torch.zeros(
+            (max_size - local_size,), dtype=torch.uint8, device=tensor.device
+        )
+        tensor = torch.cat((tensor, padding), dim=0)  # zero bytes appended at the end
+    return size_list, tensor
+
+
+def all_gather(data, group=None):
+    """
+    Run all_gather on arbitrary picklable data (not necessarily tensors).
+
+    Args:
+        data: any picklable object
+        group: a torch process group. By default, will use a group which
+            contains all ranks on gloo backend.
+    Returns:
+        list[data]: list of data gathered from each rank
+    """
+    if get_world_size() == 1:
+        return [data]
+    if group is None:
+        group = _get_global_gloo_group()
+    if dist.get_world_size(group) == 1:
+        return [data]
+
+    tensor = _serialize_to_tensor(data, group)
+
+    size_list, tensor = _pad_to_largest_tensor(tensor, group)
+    max_size = max(size_list)
+
+    # receiving Tensor from all ranks
+    tensor_list = [
+        torch.empty((max_size,), dtype=torch.uint8, device=tensor.device)
+        for _ in size_list
+    ]
+    dist.all_gather(tensor_list, tensor, group=group)
+
+    data_list = []
+    for size, tensor in zip(size_list, tensor_list):
+        buffer = tensor.cpu().numpy().tobytes()[:size]  # cut off the padding before unpickling
+        data_list.append(pickle.loads(buffer))
+
+    return data_list
+
+
+def gather(data, dst=0, group=None):
+    """
+    Run gather on arbitrary picklable data (not necessarily tensors).
+
+    Args:
+        data: any picklable object
+        dst (int): destination rank
+        group: a torch process group. By default, will use a group which
+            contains all ranks on gloo backend.
+
+    Returns:
+        list[data]: on dst, a list of data gathered from each rank. Otherwise,
+            an empty list.
+    """
+    if get_world_size() == 1:
+        return [data]
+    if group is None:
+        group = _get_global_gloo_group()
+    if dist.get_world_size(group=group) == 1:
+        return [data]
+    rank = dist.get_rank(group=group)
+
+    tensor = _serialize_to_tensor(data, group)
+    size_list, tensor = _pad_to_largest_tensor(tensor, group)
+
+    # receiving Tensor from all ranks
+    if rank == dst:
+        max_size = max(size_list)
+        tensor_list = [
+            torch.empty((max_size,), dtype=torch.uint8, device=tensor.device)
+            for _ in size_list
+        ]
+        dist.gather(tensor, tensor_list, dst=dst, group=group)
+
+        data_list = []
+        for size, tensor in zip(size_list, tensor_list):
+            buffer = tensor.cpu().numpy().tobytes()[:size]  # cut off the padding before unpickling
+            data_list.append(pickle.loads(buffer))
+        return data_list
+    else:
+        dist.gather(tensor, [], dst=dst, group=group)  # non-destination ranks only send
+        return []
+
+
+def shared_random_seed():
+    """
+    Returns:
+        int: a random number that is the same across all workers.
+            If workers need a shared RNG, they can use this shared seed to
+            create one.
+    All workers must call this function, otherwise it will deadlock.
+    """
+    ints = np.random.randint(2 ** 31)  # each worker draws its own candidate value
+    all_ints = all_gather(ints)
+    return all_ints[0]  # every worker adopts the first gathered entry
+
+
+def time_synchronized():
+    # call torch.cuda.synchronize() first (when CUDA is available), then read the wall clock
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+    return time.time()
diff --git a/yolox/utils/ema.py b/yolox/utils/ema.py
new file mode 100644
index 0000000000000000000000000000000000000000..16b78c2c9b2d85119bfd34ae3f69676830bb3f4a
--- /dev/null
+++ b/yolox/utils/ema.py
@@ -0,0 +1,69 @@
+import math
+from copy import deepcopy
+
+import apex
+import torch
+import torch.nn as nn
+
+
+def is_parallel(model):
+    """Return True if model is a DataParallel or a torch/apex DistributedDataParallel wrapper."""
+    parallel_type = (
+        nn.parallel.DataParallel,
+        nn.parallel.DistributedDataParallel,
+        apex.parallel.distributed.DistributedDataParallel,
+    )
+    return isinstance(model, parallel_type)
+
+
+def copy_attr(a, b, include=(), exclude=()):
+    """Copy public attributes from b to a, limited to `include` (when non-empty) minus `exclude`."""
+    for attr_name, attr_value in b.__dict__.items():
+        wanted = not len(include) or attr_name in include
+        skipped = attr_name.startswith("_") or attr_name in exclude
+        if wanted and not skipped:
+            setattr(a, attr_name, attr_value)
+
+
+class ModelEMA:
+    """
+    Model Exponential Moving Average from https://github.com/rwightman/pytorch-image-models
+    Keep a moving average of everything in the model state_dict (parameters and buffers).
+    This is intended to allow functionality like
+    https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage
+    A smoothed version of the weights is necessary for some training schemes to perform well.
+    This class is sensitive where it is initialized in the sequence of model init,
+    GPU assignment and distributed training wrappers.
+    """
+    def __init__(self, model, decay=0.9999, updates=0):
+        """
+        Args:
+            model (nn.Module): model to apply EMA.
+            decay (float): ema decay rate.
+            updates (int): counter of EMA updates.
+        """
+        # Create EMA(FP32)
+        self.ema = deepcopy(model.module if is_parallel(model) else model).eval()
+        self.updates = updates
+        # decay exponential ramp (to help early epochs)
+        self.decay = lambda x: decay * (1 - math.exp(-x / 2000))
+        for p in self.ema.parameters():
+            p.requires_grad_(False)
+
+    def update(self, model):
+        # Update EMA parameters
+        with torch.no_grad():
+            self.updates += 1
+            d = self.decay(self.updates)
+
+            msd = (
+                model.module.state_dict() if is_parallel(model) else model.state_dict()
+            )  # model state_dict
+            for k, v in self.ema.state_dict().items():
+                if v.dtype.is_floating_point:  # non-floating-point entries are left as they are
+                    v *= d
+                    v += (1.0 - d) * msd[k].detach()  # ema = d * ema + (1 - d) * model
+
+    def update_attr(self, model, include=(), exclude=("process_group", "reducer")):
+        # Update EMA attributes
+        copy_attr(self.ema, model, include, exclude)
diff --git a/yolox/utils/logger.py b/yolox/utils/logger.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f329a2fd7f3ea4911281df9bd601d24c01f8a1e
--- /dev/null
+++ b/yolox/utils/logger.py
@@ -0,0 +1,90 @@
+import inspect
+import os
+import sys
+from loguru import logger
+
+
+def get_caller_name(depth=0):
+    """Return the ``__name__`` of the module that called this function.
+
+    Args:
+        depth (int): frames to climb above the direct caller. Default value: 0.
+
+    Returns:
+        str: module name read from the selected frame's globals.
+    """
+    # following f_back by hand is a little faster than the inspect.stack() logic
+    caller = inspect.currentframe().f_back
+    for _ in range(depth):
+        caller = caller.f_back
+    return caller.f_globals["__name__"]
+
+
+class StreamToLoguru:
+    """File-like object that forwards writes from selected modules to loguru."""
+
+    def __init__(self, level="INFO", caller_names=("apex", "pycocotools")):
+        """
+        Args:
+            level(str): loguru level used for redirected lines. Default value: "INFO".
+            caller_names(tuple): top-level module names whose writes are
+                redirected. Default value: (apex, pycocotools).
+        """
+        self.caller_names = caller_names
+        self.level = level
+        self.linebuf = ""
+
+    def write(self, buf):
+        """Log ``buf`` line by line for listed callers, else write it to real stdout."""
+        # both depth values count frames from this method, so keep the calls inline
+        top_level = get_caller_name(depth=1).partition(".")[0]
+        if top_level not in self.caller_names:
+            sys.__stdout__.write(buf)
+            return
+        for text in buf.rstrip().splitlines():
+            # depth=2 makes loguru take the record context from above write()
+            logger.opt(depth=2).log(self.level, text.rstrip())
+
+    def flush(self):
+        """No-op: write() forwards everything immediately."""
+
+
+def redirect_sys_output(log_level="INFO"):
+    """Point both ``sys.stderr`` and ``sys.stdout`` at one shared StreamToLoguru."""
+    shared_stream = StreamToLoguru(log_level)
+    sys.stderr = sys.stdout = shared_stream
+
+
+def setup_logger(save_dir, distributed_rank=0, filename="log.txt", mode="a"):
+    """Configure loguru sinks for training/testing and redirect stdout/stderr.
+
+    Args:
+        save_dir(str): location to save log file
+        distributed_rank(int): device rank; sinks are only added for rank 0.
+        filename (string): log save name.
+        mode(str): ``"o"`` removes an existing log file first; any other
+            value leaves it in place. default is `a`.
+
+    Returns:
+        None. The shared loguru ``logger`` is configured in place.
+    """
+    # start from a logger without any previously registered sink
+    logger.remove()
+    log_path = os.path.join(save_dir, filename)
+    overwrite = mode == "o"
+    if overwrite and os.path.exists(log_path):
+        os.remove(log_path)
+
+    # only keep logger in rank0 process
+    if distributed_rank == 0:
+        console_format = (
+            "<green>{time:YYYY-MM-DD HH:mm:ss}</green> | "
+            "<level>{level: <8}</level> | "
+            "<cyan>{name}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>"
+        )
+        logger.add(sys.stderr, format=console_format, level="INFO", enqueue=True)
+        # the file sink keeps loguru's default format and level
+        logger.add(log_path)
+
+    # redirect stdout/stderr to loguru
+    redirect_sys_output("INFO")
diff --git a/yolox/utils/lr_scheduler.py b/yolox/utils/lr_scheduler.py
new file mode 100644
index 0000000000000000000000000000000000000000..09bd56b664ae6bb30ad19336187cb02270e55008
--- /dev/null
+++ b/yolox/utils/lr_scheduler.py
@@ -0,0 +1,197 @@
+import math
+from functools import partial
+
+
+class LRScheduler:
+    def __init__(self, name, lr, iters_per_epoch, total_epochs, **kwargs):
+        """
+        Supported lr schedulers:
+            [cos, warmcos, yoloxwarmcos, yoloxsemiwarmcos, multistep]
+
+        Extra keyword arguments (each one is stored as an instance attribute):
+          - cos: None
+          - warmcos: [warmup_epochs, warmup_lr_start (default 1e-6)]
+          - yoloxwarmcos: [warmup_epochs, no_aug_epochs,
+            warmup_lr_start (default 0), min_lr_ratio (default 0.2)]
+          - yoloxsemiwarmcos: same as yoloxwarmcos, plus
+            [semi_epoch, iters_per_epoch_semi]
+          - multistep: [milestones (epochs), gamma (default 0.1)]
+        """
+        self.lr = lr
+        self.iters_per_epoch = iters_per_epoch
+        self.total_epochs = total_epochs
+        self.total_iters = iters_per_epoch * total_epochs
+        # scheduler-specific options are read back as attributes in _get_lr_func
+        self.__dict__.update(kwargs)
+        self.lr_func = self._get_lr_func(name)
+
+    def update_lr(self, iters):
+        """Return the learning rate for iteration ``iters``."""
+        return self.lr_func(iters)
+
+    def _get_lr_func(self, name):
+        """Bind the schedule function chosen by ``name`` to this instance's settings."""
+        if name == "cos":  # cosine lr schedule
+            return partial(cos_lr, self.lr, self.total_iters)
+        if name == "warmcos":
+            warmup_iters = self.iters_per_epoch * self.warmup_epochs
+            start_lr = getattr(self, "warmup_lr_start", 1e-6)
+            return partial(
+                warm_cos_lr, self.lr, self.total_iters, warmup_iters, start_lr
+            )
+        if name == "yoloxwarmcos":
+            warmup_iters = self.iters_per_epoch * self.warmup_epochs
+            no_aug_iters = self.iters_per_epoch * self.no_aug_epochs
+            start_lr = getattr(self, "warmup_lr_start", 0)
+            floor_ratio = getattr(self, "min_lr_ratio", 0.2)
+            return partial(
+                yolox_warm_cos_lr,
+                self.lr,
+                floor_ratio,
+                self.total_iters,
+                warmup_iters,
+                start_lr,
+                no_aug_iters,
+            )
+        if name == "yoloxsemiwarmcos":
+            start_lr = getattr(self, "warmup_lr_start", 0)
+            floor_ratio = getattr(self, "min_lr_ratio", 0.2)
+            warmup_iters = self.iters_per_epoch * self.warmup_epochs
+            no_aug_iters = self.iters_per_epoch * self.no_aug_epochs
+            normal_iters = self.iters_per_epoch * self.semi_epoch
+            semi_iters = self.iters_per_epoch_semi * (
+                self.total_epochs - self.semi_epoch - self.no_aug_epochs
+            )
+            return partial(
+                yolox_semi_warm_cos_lr,
+                self.lr,
+                floor_ratio,
+                start_lr,
+                self.total_iters,
+                normal_iters,
+                no_aug_iters,
+                warmup_iters,
+                semi_iters,
+                self.iters_per_epoch,
+                self.iters_per_epoch_semi,
+            )
+        if name == "multistep":  # stepwise lr schedule
+            # milestones are given in epochs; convert each one to iterations
+            milestone_iters = [
+                int(self.total_iters * epoch / self.total_epochs)
+                for epoch in self.milestones
+            ]
+            gamma = getattr(self, "gamma", 0.1)
+            return partial(multistep_lr, self.lr, milestone_iters, gamma)
+        raise ValueError("Scheduler version {} not supported.".format(name))
+
+
+def cos_lr(lr, total_iters, iters):
+    """Half-cosine decay of ``lr`` over ``total_iters``, evaluated at ``iters``."""
+    cosine_factor = 0.5 * (1.0 + math.cos(math.pi * iters / total_iters))
+    return lr * cosine_factor
+
+
+def warm_cos_lr(lr, total_iters, warmup_total_iters, warmup_lr_start, iters):
+    """Cosine learning rate with a linear warm up.
+
+    Ramps linearly from ``warmup_lr_start`` to ``lr`` over the first
+    ``warmup_total_iters`` iterations, then follows a half cosine that reaches
+    0 at ``total_iters``. With ``warmup_total_iters == 0`` the ramp is skipped,
+    which avoids a division by zero at ``iters == 0``.
+    """
+    if warmup_total_iters > 0 and iters <= warmup_total_iters:
+        ramp = (lr - warmup_lr_start) * iters / float(warmup_total_iters)
+        return ramp + warmup_lr_start
+    # cosine phase, measured from the end of the warm-up
+    phase = (
+        math.pi * (iters - warmup_total_iters) / (total_iters - warmup_total_iters)
+    )
+    return lr * (0.5 * (1.0 + math.cos(phase)))
+
+
+def yolox_warm_cos_lr(
+    lr,
+    min_lr_ratio,
+    total_iters,
+    warmup_total_iters,
+    warmup_lr_start,
+    no_aug_iter,
+    iters,
+):
+    """Cosine learning rate with a quadratic warm up and a constant tail.
+
+    Phases: a quadratic ramp from ``warmup_lr_start`` to ``lr`` over the first
+    ``warmup_total_iters`` iterations, a half cosine from ``lr`` down to
+    ``lr * min_lr_ratio``, and that minimum held for the final ``no_aug_iter``
+    iterations. With ``warmup_total_iters == 0`` the ramp is skipped, which
+    avoids a division by zero at ``iters == 0``.
+    """
+    min_lr = lr * min_lr_ratio
+    if warmup_total_iters > 0 and iters <= warmup_total_iters:
+        ramp = pow(iters / float(warmup_total_iters), 2)
+        return (lr - warmup_lr_start) * ramp + warmup_lr_start
+    if iters >= total_iters - no_aug_iter:
+        return min_lr
+    # cosine phase spans the iterations between warm-up and the constant tail
+    phase = math.pi * (iters - warmup_total_iters) / (
+        total_iters - warmup_total_iters - no_aug_iter
+    )
+    return min_lr + 0.5 * (lr - min_lr) * (1.0 + math.cos(phase))
+
+
+def yolox_semi_warm_cos_lr(
+    lr,
+    min_lr_ratio,
+    warmup_lr_start,
+    total_iters,
+    normal_iters,
+    no_aug_iters,
+    warmup_total_iters,
+    semi_iters,
+    iters_per_epoch,
+    iters_per_epoch_semi,
+    iters,
+):
+    """Cosine learning rate with warm up, split at ``normal_iters``.
+
+    Phases, selected by ``iters``:
+      1. quadratic warm up from ``warmup_lr_start`` to ``lr`` over the first
+         ``warmup_total_iters`` iterations;
+      2. half cosine towards ``lr * min_lr_ratio`` while
+         ``iters <= normal_iters``;
+      3. the same cosine curve, with iterations past ``normal_iters`` rescaled
+         by ``iters_per_epoch / iters_per_epoch_semi``;
+      4. the constant ``lr * min_lr_ratio`` once
+         ``iters >= normal_iters + semi_iters`` (checked before 2 and 3).
+
+    With ``warmup_total_iters == 0`` the warm up is skipped, which avoids a
+    division by zero at ``iters == 0``.
+    """
+    min_lr = lr * min_lr_ratio
+    if warmup_total_iters > 0 and iters <= warmup_total_iters:
+        ramp = pow(iters / float(warmup_total_iters), 2)
+        return (lr - warmup_lr_start) * ramp + warmup_lr_start
+    if iters >= normal_iters + semi_iters:
+        return min_lr
+
+    # phases 2 and 3 share one cosine; only the elapsed-iteration count differs
+    if iters <= normal_iters:
+        elapsed = iters - warmup_total_iters
+    else:
+        # rescale iterations past normal_iters by the per-epoch iteration ratio
+        elapsed = (
+            normal_iters
+            - warmup_total_iters
+            + (iters - normal_iters) * iters_per_epoch * 1.0 / iters_per_epoch_semi
+        )
+
+    phase = math.pi * elapsed / (total_iters - warmup_total_iters - no_aug_iters)
+    return min_lr + 0.5 * (lr - min_lr) * (1.0 + math.cos(phase))
+
+
+def multistep_lr(lr, milestones, gamma, iters):
+    """Scale ``lr`` by ``gamma`` once for every milestone that ``iters`` has reached."""
+    for boundary in milestones:
+        lr = lr * (gamma if iters >= boundary else 1.0)
+    return lr
diff --git a/yolox/utils/metric.py b/yolox/utils/metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..e34e9dad12706efecfbced69ed01b0245001b68f
--- /dev/null
+++ b/yolox/utils/metric.py
@@ -0,0 +1,121 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
+import functools
+import os
+import time
+from collections import defaultdict, deque
+
+import numpy as np
+
+import torch
+
+__all__ = [
+    "AverageMeter",
+    "MeterBuffer",
+    "get_total_and_free_memory_in_Mb",
+    "occumpy_mem",
+    "gpu_mem_usage",
+]
+
+
+def get_total_and_free_memory_in_Mb(cuda_device):
+    """Return ``(total, used)`` memory of GPU ``cuda_device`` as reported by nvidia-smi."""
+    query = "nvidia-smi --query-gpu=memory.total,memory.used --format=csv,nounits,noheader"
+    with os.popen(query) as pipe:  # close the pipe once its output is read
+        devices_info = pipe.read().strip().split("\n")
+    total, used = devices_info[int(cuda_device)].split(",")
+    return int(total), int(used)  # despite the name, the second value is *used* memory
+
+
+def occumpy_mem(cuda_device, mem_ratio=0.9):
+    """Pre-allocate gpu memory for training to avoid memory fragmentation."""
+    total, used = get_total_and_free_memory_in_Mb(cuda_device)
+    target_mem = int(total * mem_ratio)
+    block_mem = target_mem - used
+    # 256 * 1024 float32 values take 1 MiB, so the last dimension of the
+    # placeholder is the amount still to be claimed, counted in MiB
+    placeholder = torch.cuda.FloatTensor(256, 1024, block_mem)
+    del placeholder
+    time.sleep(5)
+
+
+def gpu_mem_usage():
+    """
+    Report the peak GPU memory allocated by torch, converted from bytes to MB.
+    """
+    bytes_per_mb = 1024 * 1024
+    return torch.cuda.max_memory_allocated() / bytes_per_mb
+
+
+class AverageMeter:
+    """Track a series of values: windowed median/mean over the most recent
+    ones, plus the total and the average over the whole series."""
+
+    def __init__(self, window_size=50):
+        self._deque = deque(maxlen=window_size)  # holds only the latest values
+        self._total = 0.0
+        self._count = 0
+
+    def update(self, value):
+        """Record ``value`` in the window and in the global counters."""
+        self._deque.append(value)
+        self._count += 1
+        self._total += value
+
+    @property
+    def median(self):
+        return np.median(np.array(list(self._deque)))
+
+    @property
+    def avg(self):
+        """Mean of the window; nan will be returned if it is empty."""
+        return np.array(list(self._deque)).mean()
+
+    @property
+    def global_avg(self):
+        # the 1e-5 floor avoids dividing by zero before the first update
+        return self._total / max(self._count, 1e-5)
+
+    @property
+    def latest(self):
+        return self._deque[-1] if self._deque else None
+
+    @property
+    def total(self):
+        return self._total
+
+    def reset(self):
+        """Forget the window as well as the total and the count."""
+        self._deque.clear()
+        self._total, self._count = 0.0, 0
+
+    def clear(self):
+        """Empty the window only; the total and the count are kept."""
+        self._deque.clear()
+
+
+class MeterBuffer(defaultdict):
+    """Dictionary of named AverageMeter objects, created on first access."""
+
+    def __init__(self, window_size=20):
+        factory = functools.partial(AverageMeter, window_size=window_size)
+        super().__init__(factory)
+
+    def reset(self):
+        for meter in self.values():
+            meter.reset()  # calls each stored meter's own reset()
+
+    def get_filtered_meter(self, filter_key="time"):
+        """Return the meters whose name contains ``filter_key``."""
+        return {k: v for k, v in self.items() if filter_key in k}
+
+    def update(self, values=None, **kwargs):
+        # merge into a fresh dict so the caller's ``values`` is never mutated
+        merged = {**(values or {}), **kwargs}
+        for name, value in merged.items():
+            self[name].update(value)
+
+    def clear_meters(self):
+        for meter in self.values():
+            meter.clear()  # calls each stored meter's own clear()
diff --git a/yolox/utils/model_utils.py b/yolox/utils/model_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..704afbfd4f3c6ea0a088f2bc8d5f9763a31ea549
--- /dev/null
+++ b/yolox/utils/model_utils.py
@@ -0,0 +1,100 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+from copy import deepcopy
+
+import torch
+import torch.nn as nn
+from thop import profile
+
+__all__ = [
+    "fuse_conv_and_bn", "fuse_model", "get_model_info", "replace_module",
+]
+
+
+def get_model_info(model, tsize):
+    """Return a "Params: ..M, Gflops: .." summary string for ``model`` at size ``tsize``."""
+    stride = 64
+    device = next(model.parameters()).device
+    probe = torch.zeros((1, 3, stride, stride), device=device)
+    # profile a copy on a small stride x stride probe, then scale up to tsize
+    flops, params = profile(deepcopy(model), inputs=(probe,), verbose=False)
+    mega_params = params / 1e6
+    giga_flops = flops / 1e9 * (tsize[0] * tsize[1] / stride / stride * 2)
+    return "Params: {:.2f}M, Gflops: {:.2f}".format(mega_params, giga_flops)
+
+
+def fuse_conv_and_bn(conv, bn):
+    """Fold ``bn`` (BatchNorm2d, running stats) into ``conv`` (Conv2d).
+
+    Returns a new biased nn.Conv2d with gradients disabled, on the device of
+    ``conv.weight``, built so that it matches ``bn(conv(x))`` with ``bn`` in
+    eval mode. See https://tehnokv.com/posts/fusing-batchnorm-and-conv/
+    """
+    fusedconv = (
+        nn.Conv2d(
+            conv.in_channels,
+            conv.out_channels,
+            kernel_size=conv.kernel_size,
+            stride=conv.stride,
+            padding=conv.padding,
+            dilation=conv.dilation,  # keep dilated convolutions equivalent
+            groups=conv.groups,
+            bias=True,
+        )
+        .requires_grad_(False)
+        .to(conv.weight.device)
+    )
+    # no_grad keeps the in-place copies below out of autograd
+    with torch.no_grad():
+        bn_std = torch.sqrt(bn.running_var + bn.eps)
+        w_bn = torch.diag(bn.weight.div(bn_std))  # per-channel BN scale as a matrix
+        w_conv = conv.weight.clone().view(conv.out_channels, -1)
+        fusedconv.weight.copy_(torch.mm(w_bn, w_conv).view(fusedconv.weight.shape))
+        b_conv = conv.bias  # a conv without bias contributes zeros
+        if b_conv is None:
+            b_conv = torch.zeros(conv.weight.size(0), device=conv.weight.device)
+        b_bn = bn.bias - bn.weight.mul(bn.running_mean).div(bn_std)
+        fusedconv.bias.copy_(torch.mm(w_bn, b_conv.reshape(-1, 1)).reshape(-1) + b_bn)
+    return fusedconv
+
+
+def fuse_model(model):
+    """Fuse conv and bn inside every exact ``BaseConv`` of ``model``, in place."""
+    from yolox.models.network_blocks import BaseConv
+    for layer in model.modules():
+        if type(layer) is BaseConv and hasattr(layer, "bn"):
+            layer.conv = fuse_conv_and_bn(layer.conv, layer.bn)  # update conv
+            del layer.bn  # remove batchnorm
+            layer.forward = layer.fuseforward  # update forward
+    return model
+
+
+def replace_module(module, replaced_module_type, new_module_type, replace_func=None):
+    """
+    Replace given type in module to a new type. mostly used in deploy.
+
+    Args:
+        module (nn.Module): model to apply replace operation.
+        replaced_module_type (Type): module type to be replaced.
+        new_module_type (Type): module type to construct instead.
+        replace_func (function): called as ``replace_func(replaced_module_type,
+            new_module_type)`` to build each replacement. Default value None,
+            which calls ``new_module_type()`` without arguments.
+
+    Returns:
+        model (nn.Module): ``module`` with matching descendants swapped in
+            place, or the replacement when ``module`` itself matches.
+    """
+    def default_replace_func(replaced_module_type, new_module_type):
+        return new_module_type()
+
+    if replace_func is None:
+        replace_func = default_replace_func
+    if isinstance(module, replaced_module_type):
+        return replace_func(replaced_module_type, new_module_type)
+    for name, child in module.named_children():
+        # hand replace_func down so nested matches use the custom logic too
+        new_child = replace_module(child, replaced_module_type, new_module_type, replace_func)
+        if new_child is not child:  # child is already replaced
+            module.add_module(name, new_child)
+    return module
diff --git a/yolox/utils/setup_env.py b/yolox/utils/setup_env.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9af2604c2d3b49497d66fa21e53871f771d4962
--- /dev/null
+++ b/yolox/utils/setup_env.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
+
+import os
+import resource
+import subprocess
+
+import cv2
+
+__all__ = ["configure_nccl", "configure_module"]
+
+
+def configure_nccl():
+    """Configure multi-machine environment variables of NCCL."""
+    os.environ.update(NCCL_SOCKET_IFNAME="ib0", GLOO_SOCKET_IFNAME="ib0")
+    os.environ.update(NCCL_IB_DISABLE="1", NCCL_LAUNCH_MODE="PARALLEL")
+
+    # shell loop: echo each /sys/class/infiniband/mlx5_* device whose port-1
+    # gid_attrs/types entries contain the letter "v"
+    list_hca_cmd = (
+        "cd /sys/class/infiniband/ > /dev/null; for i in mlx5_*; "
+        "do cat $i/ports/1/gid_attrs/types/* 2>/dev/null "
+        "| grep v >/dev/null && echo $i ; done; > /dev/null"
+    )
+    os.environ["NCCL_IB_HCA"] = subprocess.getoutput(list_hca_cmd)
+    os.environ.update(NCCL_IB_GID_INDEX="3", NCCL_IB_TC="106")
+
+
+def configure_module(ulimit_value=8192):
+    """Configure pytorch module environment: the open-file ulimit and cv2.
+
+    Args:
+        ulimit_value(int): soft limit on open files to request; clamped to the
+            hard limit when that is finite, because setrlimit rejects a soft
+            limit above the hard one. Default value: 8192.
+    """
+    _, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
+    soft = ulimit_value if hard == resource.RLIM_INFINITY else min(ulimit_value, hard)
+    resource.setrlimit(resource.RLIMIT_NOFILE, (soft, hard))
+    # cv2: multiprocess might be harmful on performance of torch dataloader
+    os.environ["OPENCV_OPENCL_RUNTIME"] = "disabled"
+    cv2.setNumThreads(0)
+    cv2.ocl.setUseOpenCL(False)
diff --git a/yolox/utils/visualize.py b/yolox/utils/visualize.py
new file mode 100644
index 0000000000000000000000000000000000000000..44ea54509f536fcfc132e0a08b79b9a34ddca20d
--- /dev/null
+++ b/yolox/utils/visualize.py
@@ -0,0 +1,124 @@
+# -*- coding: utf-8 -*-
+
+import cv2
+import numpy as np
+
+
+def vis(img, boxes, scores, cls_ids, conf=0.5, class_names=None):
+    """Draw every detection scoring at least ``conf`` onto ``img`` and return it.
+
+    Boxes are read as (x, y, w, h); with ``class_names=None`` labels show the class id.
+    """
+    for i in range(len(boxes)):
+        box = boxes[i]
+        cls_id = int(cls_ids[i])
+        score = scores[i]
+        if score < conf:
+            continue
+        x0, y0 = int(box[0]), int(box[1])
+        x1, y1 = int(box[0] + box[2]), int(box[1] + box[3])
+        # fall back to the numeric id when no class names were supplied
+        label = cls_id if class_names is None else class_names[cls_id]
+        text = '{}:{:.1f}%'.format(label, score * 100)
+        color = (_COLORS[cls_id] * 255).astype(np.uint8).tolist()
+        txt_color = (0, 0, 0) if np.mean(_COLORS[cls_id]) > 0.5 else (255, 255, 255)
+        txt_bk_color = (_COLORS[cls_id] * 255 * 0.7).astype(np.uint8).tolist()
+        font = cv2.FONT_HERSHEY_COMPLEX
+        txt_size = cv2.getTextSize(text, font, 0.4, 1)[0]
+        cv2.rectangle(img, (x0, y0), (x1, y1), color, 2)
+        # filled background behind the label, then the label itself
+        cv2.rectangle(
+            img,
+            (x0, y0 + 1),
+            (x0 + txt_size[0] + 1, y0 + int(1.5*txt_size[1])),
+            txt_bk_color,
+            -1
+        )
+        cv2.putText(img, text, (x0, y0 + txt_size[1]), font, 0.4, txt_color, thickness=1)
+    return img
+
+
+_COLORS = np.array(  # flat list of color components in [0, 1]; vis() indexes rows by class id
+    [
+        0.000, 0.447, 0.741,
+        0.850, 0.325, 0.098,
+        0.929, 0.694, 0.125,
+        0.494, 0.184, 0.556,
+        0.466, 0.674, 0.188,
+        0.301, 0.745, 0.933,
+        0.635, 0.078, 0.184,
+        0.300, 0.300, 0.300,
+        0.600, 0.600, 0.600,
+        1.000, 0.000, 0.000,
+        1.000, 0.500, 0.000,
+        0.749, 0.749, 0.000,
+        0.000, 1.000, 0.000,
+        0.000, 0.000, 1.000,
+        0.667, 0.000, 1.000,
+        0.333, 0.333, 0.000,
+        0.333, 0.667, 0.000,
+        0.333, 1.000, 0.000,
+        0.667, 0.333, 0.000,
+        0.667, 0.667, 0.000,
+        0.667, 1.000, 0.000,
+        1.000, 0.333, 0.000,
+        1.000, 0.667, 0.000,
+        1.000, 1.000, 0.000,
+        0.000, 0.333, 0.500,
+        0.000, 0.667, 0.500,
+        0.000, 1.000, 0.500,
+        0.333, 0.000, 0.500,
+        0.333, 0.333, 0.500,
+        0.333, 0.667, 0.500,
+        0.333, 1.000, 0.500,
+        0.667, 0.000, 0.500,
+        0.667, 0.333, 0.500,
+        0.667, 0.667, 0.500,
+        0.667, 1.000, 0.500,
+        1.000, 0.000, 0.500,
+        1.000, 0.333, 0.500,
+        1.000, 0.667, 0.500,
+        1.000, 1.000, 0.500,
+        0.000, 0.333, 1.000,
+        0.000, 0.667, 1.000,
+        0.000, 1.000, 1.000,
+        0.333, 0.000, 1.000,
+        0.333, 0.333, 1.000,
+        0.333, 0.667, 1.000,
+        0.333, 1.000, 1.000,
+        0.667, 0.000, 1.000,
+        0.667, 0.333, 1.000,
+        0.667, 0.667, 1.000,
+        0.667, 1.000, 1.000,
+        1.000, 0.000, 1.000,
+        1.000, 0.333, 1.000,
+        1.000, 0.667, 1.000,
+        0.333, 0.000, 0.000,
+        0.500, 0.000, 0.000,
+        0.667, 0.000, 0.000,
+        0.833, 0.000, 0.000,
+        1.000, 0.000, 0.000,
+        0.000, 0.167, 0.000,
+        0.000, 0.333, 0.000,
+        0.000, 0.500, 0.000,
+        0.000, 0.667, 0.000,
+        0.000, 0.833, 0.000,
+        0.000, 1.000, 0.000,
+        0.000, 0.000, 0.167,
+        0.000, 0.000, 0.333,
+        0.000, 0.000, 0.500,
+        0.000, 0.000, 0.667,
+        0.000, 0.000, 0.833,
+        0.000, 0.000, 1.000,
+        0.000, 0.000, 0.000,
+        0.143, 0.143, 0.143,
+        0.286, 0.286, 0.286,
+        0.429, 0.429, 0.429,
+        0.571, 0.571, 0.571,
+        0.714, 0.714, 0.714,
+        0.857, 0.857, 0.857,
+        0.000, 0.447, 0.741,
+        0.314, 0.717, 0.741,
+        0.50, 0.5, 0
+    ]
+).astype(np.float32).reshape(-1, 3)  # 80 rows of 3 components each