// Copyright (c) Facebook, Inc. and its affiliates.
// @lint-ignore-every CLANGTIDY
// This is example code that demonstrates how to run inference
// with a TorchScript-format Mask R-CNN model exported by ./export_model.py
// using export_method=tracing, caffe2_tracing, or scripting.
#include <opencv2/opencv.hpp>
#include <iostream>
#include <string>
#include <c10/cuda/CUDAStream.h>
#include <torch/csrc/autograd/grad_mode.h>
#include <torch/csrc/jit/runtime/graph_executor.h>
#include <torch/script.h>
// only needed for export_method=tracing
#include <torchvision/vision.h> // @oss-only
// @fb-only: #include <torchvision/csrc/vision.h>
using namespace std;
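// Create the (image, im_info) input pair expected by a caffe2_tracing-exported
// model: an NCHW float tensor plus a 1x3 tensor holding (height, width, scale).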
c10::IValue get_caffe2_tracing_inputs(cv::Mat& img, c10::Device device) {
const int height = img.rows;
const int width = img.cols;
// FPN models require divisibility of 32.
// Tracing mode does padding inside the graph, but caffe2_tracing does not.
assert(height % 32 == 0 && width % 32 == 0);
const int channels = 3;
auto input =
torch::from_blob(img.data, {1, height, width, channels}, torch::kUInt8);
// NHWC to NCHW
input = input.to(device, torch::kFloat).permute({0, 3, 1, 2}).contiguous();
std::array<float, 3> im_info_data{height * 1.0f, width * 1.0f, 1.0f};
auto im_info =
torch::from_blob(im_info_data.data(), {1, 3}).clone().to(device);
return std::make_tuple(input, im_info);
}
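// Create the single CHW float tensor expected by a tracing-exported model.
// No size assertion is needed here: padding happens inside the traced graph.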
c10::IValue get_tracing_inputs(cv::Mat& img, c10::Device device) {
const int height = img.rows;
const int width = img.cols;
const int channels = 3;
auto input =
torch::from_blob(img.data, {height, width, channels}, torch::kUInt8);
// HWC to CHW
input = input.to(device, torch::kFloat).permute({2, 0, 1}).contiguous();
return input;
}
// Create a Tuple[Dict[str, Tensor]], which is the input type of the
// scripted model.
c10::IValue get_scripting_inputs(cv::Mat& img, c10::Device device) {
const int height = img.rows;
const int width = img.cols;
const int channels = 3;
auto img_tensor =
torch::from_blob(img.data, {height, width, channels}, torch::kUInt8);
// HWC to CHW
img_tensor =
img_tensor.to(device, torch::kFloat).permute({2, 0, 1}).contiguous();
auto dic = c10::Dict<std::string, torch::Tensor>();
dic.insert("image", img_tensor);
return std::make_tuple(dic);
}
c10::IValue
get_inputs(std::string export_method, cv::Mat& img, c10::Device device) {
// Given an image, create inputs in the format required by the model.
if (export_method == "tracing")
return get_tracing_inputs(img, device);
if (export_method == "caffe2_tracing")
return get_caffe2_tracing_inputs(img, device);
if (export_method == "scripting")
return get_scripting_inputs(img, device);
abort();
}
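// Common container for the four output tensors, so the printing code below
// works the same way regardless of export method.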
struct MaskRCNNOutputs {
at::Tensor pred_boxes, pred_classes, pred_masks, scores;
int num_instances() const {
return pred_boxes.sizes()[0];
}
};
MaskRCNNOutputs get_outputs(std::string export_method, c10::IValue outputs) {
  // Given the outputs of the model, extract the tensors into a
  // common MaskRCNNOutputs struct.
if (export_method == "tracing") {
auto out_tuple = outputs.toTuple()->elements();
// They are ordered alphabetically by their field name in Instances
return MaskRCNNOutputs{
out_tuple[0].toTensor(),
out_tuple[1].toTensor(),
out_tuple[2].toTensor(),
out_tuple[3].toTensor()};
}
if (export_method == "caffe2_tracing") {
auto out_tuple = outputs.toTuple()->elements();
// A legacy order used by caffe2 models
return MaskRCNNOutputs{
out_tuple[0].toTensor(),
out_tuple[2].toTensor(),
out_tuple[3].toTensor(),
out_tuple[1].toTensor()};
}
if (export_method == "scripting") {
// With the ScriptableAdapter defined in export_model.py, the output is
// List[Dict[str, Any]].
auto out_dict = outputs.toList().get(0).toGenericDict();
return MaskRCNNOutputs{
out_dict.at("pred_boxes").toTensor(),
out_dict.at("pred_classes").toTensor(),
out_dict.at("pred_masks").toTensor(),
out_dict.at("scores").toTensor()};
}
abort();
}
int main(int argc, const char* argv[]) {
if (argc != 4) {
cerr << R"xx(
Usage:
./torchscript_mask_rcnn model.ts input.jpg EXPORT_METHOD
EXPORT_METHOD can be "tracing", "caffe2_tracing" or "scripting".
)xx";
return 1;
}
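  // Example invocation (paths are illustrative):
  //   ./torchscript_mask_rcnn output/model.ts input.jpg tracing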
std::string image_file = argv[2];
std::string export_method = argv[3];
assert(
export_method == "caffe2_tracing" || export_method == "tracing" ||
export_method == "scripting");
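  // Limit the JIT to one dynamic-shape fusion specialization so repeated
  // runs reuse the compiled graph instead of recompiling.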
torch::jit::FusionStrategy strat = {{torch::jit::FusionBehavior::DYNAMIC, 1}};
torch::jit::setFusionStrategy(strat);
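  // Inference only: disable autograd so no gradient state is tracked.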
torch::autograd::AutoGradMode guard(false);
auto module = torch::jit::load(argv[1]);
assert(module.buffers().size() > 0);
// Assume that the entire model is on the same device.
// We just put input to this device.
auto device = (*begin(module.buffers())).device();
  cv::Mat input_img = cv::imread(image_file, cv::IMREAD_COLOR);
  if (input_img.empty()) {
    cerr << "Failed to read image: " << image_file << endl;
    return 1;
  }
auto inputs = get_inputs(export_method, input_img, device);
// Run the network
auto output = module.forward({inputs});
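  // forward() may return before queued CUDA work finishes; synchronize so
  // the warmup/benchmark timing below starts from an idle device.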
if (device.is_cuda())
c10::cuda::getCurrentCUDAStream().synchronize();
  // Run 1 warmup iteration, then 3 timed iterations to benchmark.
  int N_benchmark = 3, N_warmup = 1;
auto start_time = chrono::high_resolution_clock::now();
for (int i = 0; i < N_benchmark + N_warmup; ++i) {
if (i == N_warmup)
start_time = chrono::high_resolution_clock::now();
output = module.forward({inputs});
if (device.is_cuda())
c10::cuda::getCurrentCUDAStream().synchronize();
}
auto end_time = chrono::high_resolution_clock::now();
  auto micros =
      chrono::duration_cast<chrono::microseconds>(end_time - start_time)
          .count();
  cout << "Latency (should vary with different inputs): "
       << micros * 1.0 / 1e6 / N_benchmark << " seconds" << endl;
// Parse Mask R-CNN outputs
auto rcnn_outputs = get_outputs(export_method, output);
cout << "Number of detected objects: " << rcnn_outputs.num_instances()
<< endl;
cout << "pred_boxes: " << rcnn_outputs.pred_boxes.toString() << " "
<< rcnn_outputs.pred_boxes.sizes() << endl;
cout << "scores: " << rcnn_outputs.scores.toString() << " "
<< rcnn_outputs.scores.sizes() << endl;
cout << "pred_classes: " << rcnn_outputs.pred_classes.toString() << " "
<< rcnn_outputs.pred_classes.sizes() << endl;
cout << "pred_masks: " << rcnn_outputs.pred_masks.toString() << " "
<< rcnn_outputs.pred_masks.sizes() << endl;
cout << rcnn_outputs.pred_boxes << endl;
return 0;
}