// File size: 19,168 Bytes | 9375c9a
// The contents of this file are in the public domain. See LICENSE_FOR_EXAMPLE_PROGRAMS.txt
/*
This is an example illustrating the use of the deep learning tools from the
dlib C++ Library. I'm assuming you have already read the dnn_introduction_ex.cpp
example. So in this example program I'm going to go over a number of more
advanced parts of the API, including:
- Using multiple GPUs
- Training on large datasets that don't fit in memory
- Defining large networks
- Accessing and configuring layers in a network
*/
#include <dlib/dnn.h>
#include <iostream>
#include <dlib/data_io.h>
using namespace std;
using namespace dlib;
// ----------------------------------------------------------------------------------------
// Let's start by showing how you can conveniently define large and complex
// networks. The most important tools for doing this are C++'s alias templates.
// These let us define new layer types that are combinations of a bunch of other
// layers. These will form the building blocks for more complex networks.
// So let's begin by defining the building block of a residual network (see
// Figure 2 in Deep Residual Learning for Image Recognition by He, Zhang, Ren,
// and Sun). We are going to decompose the residual block into a few alias
// statements. First, we define the core block.
// Here we have parameterized the "block" layer on a BN layer (nominally some
// kind of batch normalization), the number of filter outputs N, and the stride
// the block operates at.
template <
    int N,                         // number of filters produced by each convolution
    template <typename> class BN,  // layer applied to the output of each convolution
    int stride,                    // stride of the convolution that consumes SUBNET
    typename SUBNET
    >
using block =
    // Reading from the bottom up: a 3x3 convolution at the requested stride,
    // BN, relu, then a second 3x3 convolution at stride 1 followed by BN.
    BN<con<N, 3, 3, 1, 1,
    relu<
    BN<con<N, 3, 3, stride, stride,
    SUBNET>>>>>;
// Next, we need to define the skip layer mechanism used in the residual network
// paper. They create their blocks by adding the input tensor to the output of
// each block. So we define an alias statement that takes a block and wraps it
// with this skip/add structure.
// Note the tag layer. This layer doesn't do any computation. It exists solely
// so other layers can refer to it. In this case, the add_prev1 layer looks for
// the tag1 layer and will take the tag1 output and add it to the input of the
// add_prev1 layer. This combination allows us to implement skip and residual
// style networks. We have also set the block stride to 1 in this statement.
// The significance of that is explained next.
template <
    template <int, template <typename> class, int, typename> class BLOCK,  // block template to wrap
    int N,                                                                 // forwarded to BLOCK
    template <typename> class BN,                                          // forwarded to BLOCK
    typename SUBNET
    >
using residual =
    // tag1 marks the output of SUBNET; BLOCK runs on it with a stride of 1,
    // and add_prev1 sums BLOCK's output with the tensor marked by tag1.
    add_prev1<
    BLOCK<N, BN, 1,
    tag1<SUBNET>>>;
// Some residual blocks do downsampling. They do this by using a stride of 2
// instead of 1. However, when downsampling we need to also take care to
// downsample the part of the network that adds the original input to the output
// or the sizes won't make sense (the network will still run, but the results
// aren't as good). So here we define a downsampling version of residual. In
// it, we make use of the skip1 layer. This layer simply outputs whatever is
// output by the tag1 layer. Therefore, the skip1 layer (there are also skip2,
// skip3, etc. in dlib) allows you to create branching network structures.
// residual_down creates a network structure like this:
/*
         input from SUBNET
             /     \
            /       \
         block     downsample(using avg_pool)
            \       /
             \     /
   add tensors (using add_prev2 which adds the output of tag2 with avg_pool's output)
              |
            output
*/
template <
    template <int, template <typename> class, int, typename> class BLOCK,  // block template to wrap
    int N,                                                                 // forwarded to BLOCK
    template <typename> class BN,                                          // forwarded to BLOCK
    typename SUBNET
    >
using residual_down =
    add_prev2<                // add the tag2 tensor (BLOCK's output) to the pooled input
    avg_pool<2, 2, 2, 2,      // downsample the skipped input
    skip1<                    // re-emit the tensor marked by tag1
    tag2<                     // mark BLOCK's output
    BLOCK<N, BN, 2,           // the block itself, run with a stride of 2
    tag1<SUBNET>>>>>>;        // mark the original input
// Now we can define 4 different residual blocks we will use in this example.
// The first two are non-downsampling residual blocks while the last two
// downsample. Also, res and res_down use batch normalization while ares and
// ares_down have had the batch normalization replaced with simple affine
// layers. We will use the affine version of the layers when testing our
// networks.
// Stride-1 residual unit with 8 filters, using bn_con, followed by relu.
template <typename SUBNET>
using res = relu<residual<block, 8, bn_con, SUBNET>>;

// Same as res but with affine in place of bn_con.
template <typename SUBNET>
using ares = relu<residual<block, 8, affine, SUBNET>>;

// Downsampling residual unit with 8 filters, using bn_con, followed by relu.
template <typename SUBNET>
using res_down = relu<residual_down<block, 8, bn_con, SUBNET>>;

// Same as res_down but with affine in place of bn_con.
template <typename SUBNET>
using ares_down = relu<residual_down<block, 8, affine, SUBNET>>;
// Now that we have these convenient aliases, we can define a residual network
// without a lot of typing. Note the use of a repeat layer. This special layer
// type allows us to type repeat<9,res,SUBNET> instead of
// res<res<res<res<res<res<res<res<res<SUBNET>>>>>>>>>. It will also prevent
// the compiler from complaining about super deep template nesting when creating
// large networks.
// Number of outputs produced by the final fc layer of the networks below.
const unsigned long number_of_classes = 10;

// The training network, listed from the loss layer down to the input layer.
using net_type = loss_multiclass_log<
                     fc<number_of_classes,
                     avg_pool_everything<
                     res<res<res<res_down<
                     repeat<9,res,            // repeat this layer 9 times
                     res_down<
                     res<
                     input<matrix<unsigned char>>
                     >>>>>>>>>>;
// And finally, let's define a residual network building block that uses
// parametric ReLU units instead of regular ReLU.
template <typename SUBNET>
using pres =
    // Same layer sequence as a stride-1 bn_con residual unit with 8 filters,
    // written out by hand so that both activations can be prelu.
    prelu<add_prev1<
    bn_con<con<8, 3, 3, 1, 1,
    prelu<
    bn_con<con<8, 3, 3, 1, 1,
    tag1<SUBNET>>>>>>>>;
// ----------------------------------------------------------------------------------------
// Program entry point.  Expects exactly one command line argument: the folder
// holding the decompressed MNIST files.  Returns 1 if that argument is missing
// or if an exception is thrown while running, and 0 otherwise.
int main(int argc, char** argv) try
{
    if (argc != 2)
    {
        cout << "This example needs the MNIST dataset to run!" << endl;
        cout << "You can get MNIST from http://yann.lecun.com/exdb/mnist/" << endl;
        cout << "Download the 4 files that comprise the dataset, decompress them, and" << endl;
        cout << "put them in a folder.  Then give that folder as input to this program." << endl;
        return 1;
    }

    std::vector<matrix<unsigned char>> training_images;
    std::vector<unsigned long> training_labels;
    std::vector<matrix<unsigned char>> testing_images;
    std::vector<unsigned long> testing_labels;
    load_mnist_dataset(argv[1], training_images, training_labels, testing_images, testing_labels);

    // dlib uses cuDNN under the covers.  One of the features of cuDNN is the
    // option to use slower methods that use less RAM or faster methods that use
    // a lot of RAM.  If you find that you run out of RAM on your graphics card
    // then you can call this function and we will request the slower but more
    // RAM frugal cuDNN algorithms.
    set_dnn_prefer_smallest_algorithms();

    // Create a network as defined above.  This network will produce 10 outputs
    // because that's how we defined net_type.  However, fc layers can have the
    // number of outputs they produce changed at runtime.
    net_type net;
    // So if you wanted to use the same network but override the number of
    // outputs at runtime you can do so like this:
    net_type net2(num_fc_outputs(15));

    // Now, let's imagine we wanted to replace some of the relu layers with
    // prelu layers.  We might do it like this:
    using net_type2 = loss_multiclass_log<fc<number_of_classes,
                                avg_pool_everything<
                                pres<res<res<res_down< // 2 prelu layers here
                                tag4<repeat<9,pres,    // 9 groups, each containing 2 prelu layers
                                res_down<
                                res<
                                input<matrix<unsigned char>>
                                >>>>>>>>>>>;

    // prelu layers have a floating point parameter.  If you want to set it to
    // something other than its default value you can do so like this:
    net_type2 pnet(prelu_(0.2),
                   prelu_(0.25),
                   repeat_group(prelu_(0.3),prelu_(0.4)) // Initialize all the prelu instances in the repeat
                                                         // layer.  repeat_group() is needed to group the
                                                         // things that are part of repeat's block.
                   );
    // As you can see, a network will greedily assign things given to its
    // constructor to the layers inside itself.  The assignment is done in the
    // order the layers are defined, but it will skip layers where the
    // assignment doesn't make sense.

    // Now let's print the details of the pnet to the screen and inspect it.
    cout << "The pnet has " << pnet.num_layers << " layers in it." << endl;
    cout << pnet << endl;
    // These print statements will output this (I've truncated it since it's
    // long, but you get the idea):
    /*
        The pnet has 131 layers in it.
        layer<0>    loss_multiclass_log
        layer<1>    fc       (num_outputs=10) learning_rate_mult=1 weight_decay_mult=1 bias_learning_rate_mult=1 bias_weight_decay_mult=0
        layer<2>    avg_pool (nr=0, nc=0, stride_y=1, stride_x=1, padding_y=0, padding_x=0)
        layer<3>    prelu    (initial_param_value=0.2)
        layer<4>    add_prev1
        layer<5>    bn_con   eps=1e-05 learning_rate_mult=1 weight_decay_mult=0 bias_learning_rate_mult=1 bias_weight_decay_mult=1
        layer<6>    con      (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1) learning_rate_mult=1 weight_decay_mult=1 bias_learning_rate_mult=1 bias_weight_decay_mult=0
        layer<7>    prelu    (initial_param_value=0.25)
        layer<8>    bn_con   eps=1e-05 learning_rate_mult=1 weight_decay_mult=0 bias_learning_rate_mult=1 bias_weight_decay_mult=1
        layer<9>    con      (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1) learning_rate_mult=1 weight_decay_mult=1 bias_learning_rate_mult=1 bias_weight_decay_mult=0
        layer<10>   tag1
        ...
        layer<34>   relu
        layer<35>   bn_con   eps=1e-05 learning_rate_mult=1 weight_decay_mult=0 bias_learning_rate_mult=1 bias_weight_decay_mult=1
        layer<36>   con      (num_filters=8, nr=3, nc=3, stride_y=2, stride_x=2, padding_y=0, padding_x=0) learning_rate_mult=1 weight_decay_mult=1 bias_learning_rate_mult=1 bias_weight_decay_mult=0
        layer<37>   tag1
        layer<38>   tag4
        layer<39>   prelu    (initial_param_value=0.3)
        layer<40>   add_prev1
        layer<41>   bn_con   eps=1e-05 learning_rate_mult=1 weight_decay_mult=0 bias_learning_rate_mult=1 bias_weight_decay_mult=1
        ...
        layer<118>  relu
        layer<119>  bn_con   eps=1e-05 learning_rate_mult=1 weight_decay_mult=0 bias_learning_rate_mult=1 bias_weight_decay_mult=1
        layer<120>  con      (num_filters=8, nr=3, nc=3, stride_y=2, stride_x=2, padding_y=0, padding_x=0) learning_rate_mult=1 weight_decay_mult=1 bias_learning_rate_mult=1 bias_weight_decay_mult=0
        layer<121>  tag1
        layer<122>  relu
        layer<123>  add_prev1
        layer<124>  bn_con   eps=1e-05 learning_rate_mult=1 weight_decay_mult=0 bias_learning_rate_mult=1 bias_weight_decay_mult=1
        layer<125>  con      (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1) learning_rate_mult=1 weight_decay_mult=1 bias_learning_rate_mult=1 bias_weight_decay_mult=0
        layer<126>  relu
        layer<127>  bn_con   eps=1e-05 learning_rate_mult=1 weight_decay_mult=0 bias_learning_rate_mult=1 bias_weight_decay_mult=1
        layer<128>  con      (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1) learning_rate_mult=1 weight_decay_mult=1 bias_learning_rate_mult=1 bias_weight_decay_mult=0
        layer<129>  tag1
        layer<130>  input<matrix>
    */

    // Now that we know the index numbers for each layer, we can access them
    // individually using layer<index>(pnet).  For example, to access the output
    // tensor for the first prelu layer we can say:
    layer<3>(pnet).get_output();
    // Or to print the prelu parameter for layer 7 we can say:
    cout << "prelu param: "<< layer<7>(pnet).layer_details().get_initial_param_value() << endl;

    // We can also access layers by their type.  This next statement finds the
    // first tag1 layer in pnet, and is therefore equivalent to calling
    // layer<10>(pnet):
    layer<tag1>(pnet);
    // The tag layers don't do anything at all and exist simply so you can tag
    // parts of your network and access them by layer<tag>().  You can also
    // index relative to a tag.  So for example, to access the layer immediately
    // after tag4 you can say:
    layer<tag4,1>(pnet); // Equivalent to layer<38+1>(pnet).

    // Or to access the layer 2 layers after tag4:
    layer<tag4,2>(pnet);
    // Tagging is a very useful tool for making complex network structures.  For
    // example, the add_prev1 layer is implemented internally by using a call to
    // layer<tag1>().

    // Ok, that's enough talk about defining and inspecting networks.  Let's
    // talk about training networks!

    // The dnn_trainer will use SGD by default, but you can tell it to use
    // different solvers like adam with a weight decay of 0.0005 and the given
    // momentum parameters.
    dnn_trainer<net_type,adam> trainer(net,adam(0.0005, 0.9, 0.999));
    // Also, if you have multiple graphics cards you can tell the trainer to use
    // them together to make the training faster.  For example, replacing the
    // above constructor call with this one would cause it to use GPU cards 0
    // and 1.
    //dnn_trainer<net_type,adam> trainer(net,adam(0.0005, 0.9, 0.999), {0,1});

    trainer.be_verbose();
    // While the trainer is running it keeps an eye on the training error.  If
    // it looks like the error hasn't decreased for the last 2000 iterations it
    // will automatically shrink the learning rate by multiplying it by 0.1.
    // You can change these default parameters to some other values by calling
    // these functions.  Or disable the automatic shrinking entirely by setting
    // the shrink factor to 1.
    trainer.set_iterations_without_progress_threshold(2000);
    trainer.set_learning_rate_shrink_factor(0.1);
    // The learning rate will start at 1e-3.
    trainer.set_learning_rate(1e-3);
    trainer.set_synchronization_file("mnist_resnet_sync", std::chrono::seconds(100));

    // Now, what if your training dataset is so big it doesn't fit in RAM?  You
    // make mini-batches yourself, any way you like, and you send them to the
    // trainer by repeatedly calling trainer.train_one_step().
    //
    // For example, the loop below streams MNIST data to our trainer.
    std::vector<matrix<unsigned char>> mini_batch_samples;
    std::vector<unsigned long> mini_batch_labels;
    dlib::rand rnd(time(0));
    // Loop until the trainer's automatic shrinking has shrunk the learning rate to 1e-6.
    // Given our settings, this means it will stop training after it has shrunk the
    // learning rate 3 times.
    while(trainer.get_learning_rate() >= 1e-6)
    {
        mini_batch_samples.clear();
        mini_batch_labels.clear();

        // make a 128 image mini-batch
        while(mini_batch_samples.size() < 128)
        {
            auto idx = rnd.get_random_32bit_number()%training_images.size();
            mini_batch_samples.push_back(training_images[idx]);
            mini_batch_labels.push_back(training_labels[idx]);
        }

        // Tell the trainer to update the network given this mini-batch
        trainer.train_one_step(mini_batch_samples, mini_batch_labels);

        // You can also feed validation data into the trainer by periodically
        // calling trainer.test_one_step(samples,labels).  Unlike train_one_step(),
        // test_one_step() doesn't modify the network, it only computes the testing
        // error which it records internally.  This testing error will then be printed
        // in the verbose logging and will also determine when the trainer's
        // automatic learning rate shrinking happens.  Therefore, test_one_step()
        // can be used to perform automatic early stopping based on held out data.
    }

    // When you call train_one_step(), the trainer will do its processing in a
    // separate thread.  This allows the main thread to work on loading data
    // while the trainer is busy executing the mini-batches in parallel.
    // However, this also means we need to wait for any mini-batches that are
    // still executing to stop before we mess with the net object.  Calling
    // get_net() performs the necessary synchronization.
    trainer.get_net();

    net.clean();
    serialize("mnist_res_network.dat") << net;

    // Now we have a trained network.  However, it has batch normalization
    // layers in it.  As is customary, we should replace these with simple
    // affine layers before we use the network.  This can be accomplished by
    // making a network type which is identical to net_type but with the batch
    // normalization layers replaced with affine.  For example:
    using test_net_type = loss_multiclass_log<fc<number_of_classes,
                                avg_pool_everything<
                                ares<ares<ares<ares_down<
                                repeat<9,ares,
                                ares_down<
                                ares<
                                input<matrix<unsigned char>>
                                >>>>>>>>>>;
    // Then we can simply assign our trained net to our testing net.
    test_net_type tnet = net;
    // Or if you only had a file with your trained network you could deserialize
    // it directly into your testing network.
    deserialize("mnist_res_network.dat") >> tnet;

    // And finally, we can run the testing network over our data.
    std::vector<unsigned long> predicted_labels = tnet(training_images);
    int num_right = 0;
    int num_wrong = 0;
    for (size_t i = 0; i < training_images.size(); ++i)
    {
        if (predicted_labels[i] == training_labels[i])
            ++num_right;
        else
            ++num_wrong;
    }
    cout << "training num_right: " << num_right << endl;
    cout << "training num_wrong: " << num_wrong << endl;
    cout << "training accuracy:  " << num_right/static_cast<double>(num_right+num_wrong) << endl;

    predicted_labels = tnet(testing_images);
    num_right = 0;
    num_wrong = 0;
    for (size_t i = 0; i < testing_images.size(); ++i)
    {
        if (predicted_labels[i] == testing_labels[i])
            ++num_right;
        else
            ++num_wrong;
    }
    cout << "testing num_right: " << num_right << endl;
    cout << "testing num_wrong: " << num_wrong << endl;
    cout << "testing accuracy:  " << num_right/static_cast<double>(num_right+num_wrong) << endl;
}
catch(const std::exception& e)
{
    // Flowing off the end of this handler would make main() return 0, so the
    // failure is reported through the exit status as well as the message.
    cout << e.what() << endl;
    return 1;
}
|