|
<html><head><title>dlib C++ Library - dnn_introduction_ex.cpp</title></head><body bgcolor='white'><pre> |
|
<font color='#009900'>// The contents of this file are in the public domain. See LICENSE_FOR_EXAMPLE_PROGRAMS.txt |
|
</font><font color='#009900'>/* |
|
This is an example illustrating the use of the deep learning tools from the |
|
dlib C++ Library. In it, we will train the venerable LeNet convolutional |
|
neural network to recognize hand written digits. The network will take as |
|
input a small image and classify it as one of the 10 numeric digits between |
|
0 and 9. |
|
|
|
The specific network we will run is from the paper |
|
LeCun, Yann, et al. "Gradient-based learning applied to document recognition." |
|
Proceedings of the IEEE 86.11 (1998): 2278-2324. |
|
except that we replace the sigmoid non-linearities with rectified linear units. |
|
|
|
These tools will use CUDA and cuDNN to drastically accelerate network |
|
training and testing. CMake should automatically find them if they are |
|
installed and configure things appropriately. If not, the program will |
|
still run but will be much slower to execute. |
|
*/</font> |
|
|
|
|
|
<font color='#0000FF'>#include</font> <font color='#5555FF'><</font>dlib<font color='#5555FF'>/</font>dnn.h<font color='#5555FF'>></font> |
|
<font color='#0000FF'>#include</font> <font color='#5555FF'><</font>iostream<font color='#5555FF'>></font> |
|
<font color='#0000FF'>#include</font> <font color='#5555FF'><</font>dlib<font color='#5555FF'>/</font>data_io.h<font color='#5555FF'>></font> |
|
|
|
<font color='#0000FF'>using</font> <font color='#0000FF'>namespace</font> std; |
|
<font color='#0000FF'>using</font> <font color='#0000FF'>namespace</font> dlib; |
|
|
|
<font color='#0000FF'><u>int</u></font> <b><a name='main'></a>main</b><font face='Lucida Console'>(</font><font color='#0000FF'><u>int</u></font> argc, <font color='#0000FF'><u>char</u></font><font color='#5555FF'>*</font><font color='#5555FF'>*</font> argv<font face='Lucida Console'>)</font> <font color='#0000FF'>try</font> |
|
<b>{</b> |
|
<font color='#009900'>// This example is going to run on the MNIST dataset. |
|
</font> <font color='#0000FF'>if</font> <font face='Lucida Console'>(</font>argc <font color='#5555FF'>!</font><font color='#5555FF'>=</font> <font color='#979000'>2</font><font face='Lucida Console'>)</font> |
|
<b>{</b> |
|
cout <font color='#5555FF'><</font><font color='#5555FF'><</font> "<font color='#CC0000'>This example needs the MNIST dataset to run!</font>" <font color='#5555FF'><</font><font color='#5555FF'><</font> endl; |
|
cout <font color='#5555FF'><</font><font color='#5555FF'><</font> "<font color='#CC0000'>You can get MNIST from http://yann.lecun.com/exdb/mnist/</font>" <font color='#5555FF'><</font><font color='#5555FF'><</font> endl; |
|
cout <font color='#5555FF'><</font><font color='#5555FF'><</font> "<font color='#CC0000'>Download the 4 files that comprise the dataset, decompress them, and</font>" <font color='#5555FF'><</font><font color='#5555FF'><</font> endl; |
|
cout <font color='#5555FF'><</font><font color='#5555FF'><</font> "<font color='#CC0000'>put them in a folder. Then give that folder as input to this program.</font>" <font color='#5555FF'><</font><font color='#5555FF'><</font> endl; |
|
<font color='#0000FF'>return</font> <font color='#979000'>1</font>; |
|
<b>}</b> |
|
|
|
|
|
<font color='#009900'>// MNIST is broken into two parts, a training set of 60000 images and a test set of |
|
</font> <font color='#009900'>// 10000 images. Each image is labeled so that we know what hand written digit is |
|
</font> <font color='#009900'>// depicted. These next statements load the dataset into memory. |
|
</font> std::vector<font color='#5555FF'><</font>matrix<font color='#5555FF'><</font><font color='#0000FF'><u>unsigned</u></font> <font color='#0000FF'><u>char</u></font><font color='#5555FF'>></font><font color='#5555FF'>></font> training_images; |
|
std::vector<font color='#5555FF'><</font><font color='#0000FF'><u>unsigned</u></font> <font color='#0000FF'><u>long</u></font><font color='#5555FF'>></font> training_labels; |
|
std::vector<font color='#5555FF'><</font>matrix<font color='#5555FF'><</font><font color='#0000FF'><u>unsigned</u></font> <font color='#0000FF'><u>char</u></font><font color='#5555FF'>></font><font color='#5555FF'>></font> testing_images; |
|
std::vector<font color='#5555FF'><</font><font color='#0000FF'><u>unsigned</u></font> <font color='#0000FF'><u>long</u></font><font color='#5555FF'>></font> testing_labels; |
|
<font color='#BB00BB'>load_mnist_dataset</font><font face='Lucida Console'>(</font>argv[<font color='#979000'>1</font>], training_images, training_labels, testing_images, testing_labels<font face='Lucida Console'>)</font>; |
|
|
|
|
|
<font color='#009900'>// Now let's define the LeNet. Broadly speaking, there are 3 parts to a network |
|
</font> <font color='#009900'>// definition: the loss layer, a bunch of computational layers, and then an input |
|
</font> <font color='#009900'>// layer. You can see these components in the network definition below. |
|
</font> <font color='#009900'>// |
|
</font> <font color='#009900'>// The input layer here says the network expects to be given matrix&lt;unsigned char&gt; |
|
</font> <font color='#009900'>// objects as input. In general, you can use any dlib image or matrix type here, or |
|
</font> <font color='#009900'>// even define your own types by creating custom input layers. |
|
</font> <font color='#009900'>// |
|
</font> <font color='#009900'>// Then the middle layers define the computation the network will do to transform the |
|
</font> <font color='#009900'>// input into whatever we want. Here we run the image through multiple convolutions, |
|
</font> <font color='#009900'>// ReLU units, max pooling operations, and then finally a fully connected layer that |
|
</font> <font color='#009900'>// converts the whole thing into just 10 numbers. |
|
</font> <font color='#009900'>// |
|
</font> <font color='#009900'>// Finally, the loss layer defines the relationship between the network outputs, our 10 |
|
</font> <font color='#009900'>// numbers, and the labels in our dataset. Since we selected loss_multiclass_log it |
|
</font> <font color='#009900'>// means we want to do multiclass classification with our network. Moreover, the |
|
</font> <font color='#009900'>// number of network outputs (i.e. 10) is the number of possible labels. Whichever |
|
</font> <font color='#009900'>// network output is largest is the predicted label. So for example, if the first |
|
</font> <font color='#009900'>// network output is largest then the predicted digit is 0, if the last network output |
|
</font> <font color='#009900'>// is largest then the predicted digit is 9. |
|
</font> <font color='#0000FF'>using</font> net_type <font color='#5555FF'>=</font> loss_multiclass_log<font color='#5555FF'><</font> |
|
fc<font color='#5555FF'><</font><font color='#979000'>10</font>, |
|
relu<font color='#5555FF'><</font>fc<font color='#5555FF'><</font><font color='#979000'>84</font>, |
|
relu<font color='#5555FF'><</font>fc<font color='#5555FF'><</font><font color='#979000'>120</font>, |
|
max_pool<font color='#5555FF'><</font><font color='#979000'>2</font>,<font color='#979000'>2</font>,<font color='#979000'>2</font>,<font color='#979000'>2</font>,relu<font color='#5555FF'><</font>con<font color='#5555FF'><</font><font color='#979000'>16</font>,<font color='#979000'>5</font>,<font color='#979000'>5</font>,<font color='#979000'>1</font>,<font color='#979000'>1</font>, |
|
max_pool<font color='#5555FF'><</font><font color='#979000'>2</font>,<font color='#979000'>2</font>,<font color='#979000'>2</font>,<font color='#979000'>2</font>,relu<font color='#5555FF'><</font>con<font color='#5555FF'><</font><font color='#979000'>6</font>,<font color='#979000'>5</font>,<font color='#979000'>5</font>,<font color='#979000'>1</font>,<font color='#979000'>1</font>, |
|
input<font color='#5555FF'><</font>matrix<font color='#5555FF'><</font><font color='#0000FF'><u>unsigned</u></font> <font color='#0000FF'><u>char</u></font><font color='#5555FF'>></font><font color='#5555FF'>></font> |
|
<font color='#5555FF'>></font><font color='#5555FF'>></font><font color='#5555FF'>></font><font color='#5555FF'>></font><font color='#5555FF'>></font><font color='#5555FF'>></font><font color='#5555FF'>></font><font color='#5555FF'>></font><font color='#5555FF'>></font><font color='#5555FF'>></font><font color='#5555FF'>></font><font color='#5555FF'>></font>; |
|
<font color='#009900'>// This net_type defines the entire network architecture. For example, the block |
|
</font> <font color='#009900'>// relu&lt;fc&lt;84,SUBNET&gt;&gt; means we take the output from the subnetwork, pass it through a |
|
</font> <font color='#009900'>// fully connected layer with 84 outputs, then apply ReLU. Similarly, a block of |
|
</font> <font color='#009900'>// max_pool&lt;2,2,2,2,relu&lt;con&lt;16,5,5,1,1,SUBNET&gt;&gt;&gt; means we apply 16 convolutions with a |
|
</font> <font color='#009900'>// 5x5 filter size and 1x1 stride to the output of a subnetwork, then apply ReLU, then |
|
</font> <font color='#009900'>// perform max pooling with a 2x2 window and 2x2 stride. |
|
</font> |
|
|
|
|
|
<font color='#009900'>// So with that out of the way, we can make a network instance. |
|
</font> net_type net; |
|
<font color='#009900'>// And then train it using the MNIST data. The code below uses mini-batch stochastic |
|
</font> <font color='#009900'>// gradient descent with an initial learning rate of 0.01 to accomplish this. |
|
</font> dnn_trainer<font color='#5555FF'><</font>net_type<font color='#5555FF'>></font> <font color='#BB00BB'>trainer</font><font face='Lucida Console'>(</font>net<font face='Lucida Console'>)</font>; |
|
trainer.<font color='#BB00BB'>set_learning_rate</font><font face='Lucida Console'>(</font><font color='#979000'>0.01</font><font face='Lucida Console'>)</font>; |
|
trainer.<font color='#BB00BB'>set_min_learning_rate</font><font face='Lucida Console'>(</font><font color='#979000'>0.00001</font><font face='Lucida Console'>)</font>; |
|
trainer.<font color='#BB00BB'>set_mini_batch_size</font><font face='Lucida Console'>(</font><font color='#979000'>128</font><font face='Lucida Console'>)</font>; |
|
trainer.<font color='#BB00BB'>be_verbose</font><font face='Lucida Console'>(</font><font face='Lucida Console'>)</font>; |
|
<font color='#009900'>// Since DNN training can take a long time, we can ask the trainer to save its state to |
|
</font> <font color='#009900'>// a file named "mnist_sync" every 20 seconds. This way, if we kill this program and |
|
</font> <font color='#009900'>// start it again it will begin where it left off rather than restarting the training |
|
</font> <font color='#009900'>// from scratch. This is because, when the program restarts, this call to |
|
</font> <font color='#009900'>// set_synchronization_file() will automatically reload the settings from mnist_sync if |
|
</font> <font color='#009900'>// the file exists. |
|
</font> trainer.<font color='#BB00BB'>set_synchronization_file</font><font face='Lucida Console'>(</font>"<font color='#CC0000'>mnist_sync</font>", std::chrono::<font color='#BB00BB'>seconds</font><font face='Lucida Console'>(</font><font color='#979000'>20</font><font face='Lucida Console'>)</font><font face='Lucida Console'>)</font>; |
|
<font color='#009900'>// Finally, this line begins training. By default, it runs SGD with our specified |
|
</font> <font color='#009900'>// learning rate until the loss stops decreasing. Then it reduces the learning rate by |
|
</font> <font color='#009900'>// a factor of 10 and continues running until the loss stops decreasing again. It will |
|
</font> <font color='#009900'>// keep doing this until the learning rate has dropped below the min learning rate |
|
</font> <font color='#009900'>// defined above or the maximum number of epochs has been executed (defaulted to 10000). |
|
</font> trainer.<font color='#BB00BB'>train</font><font face='Lucida Console'>(</font>training_images, training_labels<font face='Lucida Console'>)</font>; |
|
|
|
<font color='#009900'>// At this point our net object should have learned how to classify MNIST images. But |
|
</font> <font color='#009900'>// before we try it out let's save it to disk. Note that, since the trainer has been |
|
</font> <font color='#009900'>// running images through the network, net will have a bunch of state in it related to |
|
</font> <font color='#009900'>// the last batch of images it processed (e.g. outputs from each layer). Since we |
|
</font> <font color='#009900'>// don't care about saving that kind of stuff to disk we can tell the network to forget |
|
</font> <font color='#009900'>// about that kind of transient data so that our file will be smaller. We do this by |
|
</font> <font color='#009900'>// "cleaning" the network before saving it. |
|
</font> net.<font color='#BB00BB'>clean</font><font face='Lucida Console'>(</font><font face='Lucida Console'>)</font>; |
|
<font color='#BB00BB'>serialize</font><font face='Lucida Console'>(</font>"<font color='#CC0000'>mnist_network.dat</font>"<font face='Lucida Console'>)</font> <font color='#5555FF'><</font><font color='#5555FF'><</font> net; |
|
<font color='#009900'>// Now if we later wanted to recall the network from disk we can simply say: |
|
</font> <font color='#009900'>// deserialize("mnist_network.dat") >> net; |
|
</font> |
|
|
|
<font color='#009900'>// Now let's run the training images through the network. This statement runs all the |
|
</font> <font color='#009900'>// images through it and asks the loss layer to convert the network's raw output into |
|
</font> <font color='#009900'>// labels. In our case, these labels are the numbers between 0 and 9. |
|
</font> std::vector<font color='#5555FF'><</font><font color='#0000FF'><u>unsigned</u></font> <font color='#0000FF'><u>long</u></font><font color='#5555FF'>></font> predicted_labels <font color='#5555FF'>=</font> <font color='#BB00BB'>net</font><font face='Lucida Console'>(</font>training_images<font face='Lucida Console'>)</font>; |
|
<font color='#0000FF'><u>int</u></font> num_right <font color='#5555FF'>=</font> <font color='#979000'>0</font>; |
|
<font color='#0000FF'><u>int</u></font> num_wrong <font color='#5555FF'>=</font> <font color='#979000'>0</font>; |
|
<font color='#009900'>// And then let's see if it classified them correctly. |
|
</font> <font color='#0000FF'>for</font> <font face='Lucida Console'>(</font><font color='#0000FF'><u>size_t</u></font> i <font color='#5555FF'>=</font> <font color='#979000'>0</font>; i <font color='#5555FF'><</font> training_images.<font color='#BB00BB'>size</font><font face='Lucida Console'>(</font><font face='Lucida Console'>)</font>; <font color='#5555FF'>+</font><font color='#5555FF'>+</font>i<font face='Lucida Console'>)</font> |
|
<b>{</b> |
|
<font color='#0000FF'>if</font> <font face='Lucida Console'>(</font>predicted_labels[i] <font color='#5555FF'>=</font><font color='#5555FF'>=</font> training_labels[i]<font face='Lucida Console'>)</font> |
|
<font color='#5555FF'>+</font><font color='#5555FF'>+</font>num_right; |
|
<font color='#0000FF'>else</font> |
|
<font color='#5555FF'>+</font><font color='#5555FF'>+</font>num_wrong; |
|
|
|
<b>}</b> |
|
cout <font color='#5555FF'><</font><font color='#5555FF'><</font> "<font color='#CC0000'>training num_right: </font>" <font color='#5555FF'><</font><font color='#5555FF'><</font> num_right <font color='#5555FF'><</font><font color='#5555FF'><</font> endl; |
|
cout <font color='#5555FF'><</font><font color='#5555FF'><</font> "<font color='#CC0000'>training num_wrong: </font>" <font color='#5555FF'><</font><font color='#5555FF'><</font> num_wrong <font color='#5555FF'><</font><font color='#5555FF'><</font> endl; |
|
cout <font color='#5555FF'><</font><font color='#5555FF'><</font> "<font color='#CC0000'>training accuracy: </font>" <font color='#5555FF'><</font><font color='#5555FF'><</font> num_right <font color='#5555FF'>/</font> <font color='#0000FF'>static_cast</font><font color='#5555FF'><</font><font color='#0000FF'><u>double</u></font><font color='#5555FF'>></font><font face='Lucida Console'>(</font>num_right <font color='#5555FF'>+</font> num_wrong<font face='Lucida Console'>)</font> <font color='#5555FF'><</font><font color='#5555FF'><</font> endl; |
|
|
|
<font color='#009900'>// Let's also see if the network can correctly classify the testing images. Since |
|
</font> <font color='#009900'>// MNIST is an easy dataset, we should see at least 99% accuracy. |
|
</font> predicted_labels <font color='#5555FF'>=</font> <font color='#BB00BB'>net</font><font face='Lucida Console'>(</font>testing_images<font face='Lucida Console'>)</font>; |
|
num_right <font color='#5555FF'>=</font> <font color='#979000'>0</font>; |
|
num_wrong <font color='#5555FF'>=</font> <font color='#979000'>0</font>; |
|
<font color='#0000FF'>for</font> <font face='Lucida Console'>(</font><font color='#0000FF'><u>size_t</u></font> i <font color='#5555FF'>=</font> <font color='#979000'>0</font>; i <font color='#5555FF'><</font> testing_images.<font color='#BB00BB'>size</font><font face='Lucida Console'>(</font><font face='Lucida Console'>)</font>; <font color='#5555FF'>+</font><font color='#5555FF'>+</font>i<font face='Lucida Console'>)</font> |
|
<b>{</b> |
|
<font color='#0000FF'>if</font> <font face='Lucida Console'>(</font>predicted_labels[i] <font color='#5555FF'>=</font><font color='#5555FF'>=</font> testing_labels[i]<font face='Lucida Console'>)</font> |
|
<font color='#5555FF'>+</font><font color='#5555FF'>+</font>num_right; |
|
<font color='#0000FF'>else</font> |
|
<font color='#5555FF'>+</font><font color='#5555FF'>+</font>num_wrong; |
|
|
|
<b>}</b> |
|
cout <font color='#5555FF'><</font><font color='#5555FF'><</font> "<font color='#CC0000'>testing num_right: </font>" <font color='#5555FF'><</font><font color='#5555FF'><</font> num_right <font color='#5555FF'><</font><font color='#5555FF'><</font> endl; |
|
cout <font color='#5555FF'><</font><font color='#5555FF'><</font> "<font color='#CC0000'>testing num_wrong: </font>" <font color='#5555FF'><</font><font color='#5555FF'><</font> num_wrong <font color='#5555FF'><</font><font color='#5555FF'><</font> endl; |
|
cout <font color='#5555FF'><</font><font color='#5555FF'><</font> "<font color='#CC0000'>testing accuracy: </font>" <font color='#5555FF'><</font><font color='#5555FF'><</font> num_right <font color='#5555FF'>/</font> <font color='#0000FF'>static_cast</font><font color='#5555FF'><</font><font color='#0000FF'><u>double</u></font><font color='#5555FF'>></font><font face='Lucida Console'>(</font>num_right <font color='#5555FF'>+</font> num_wrong<font face='Lucida Console'>)</font> <font color='#5555FF'><</font><font color='#5555FF'><</font> endl; |
|
|
|
|
|
<font color='#009900'>// Finally, you can also save network parameters to XML files if you want to do |
|
</font> <font color='#009900'>// something with the network in another tool. For example, you could use dlib's |
|
</font> <font color='#009900'>// tools/convert_dlib_nets_to_caffe to convert the network to a caffe model. |
|
</font> <font color='#BB00BB'>net_to_xml</font><font face='Lucida Console'>(</font>net, "<font color='#CC0000'>lenet.xml</font>"<font face='Lucida Console'>)</font>; |
|
<b>}</b> |
|
<font color='#0000FF'>catch</font><font face='Lucida Console'>(</font>std::exception<font color='#5555FF'>&</font> e<font face='Lucida Console'>)</font> |
|
<b>{</b> |
|
cout <font color='#5555FF'><</font><font color='#5555FF'><</font> e.<font color='#BB00BB'>what</font><font face='Lucida Console'>(</font><font face='Lucida Console'>)</font> <font color='#5555FF'><</font><font color='#5555FF'><</font> endl; |
|
<b>}</b> |
|
|
|
</pre></body></html> |