Aging_MouthReplace / dlibs /docs /dnn_introduction_ex.cpp.html
AshanGimhana's picture
Upload folder using huggingface_hub
9375c9a verified
raw
history blame
20.6 kB
<html><!-- Created using the cpp_pretty_printer from the dlib C++ library. See http://dlib.net for updates. --><head><title>dlib C++ Library - dnn_introduction_ex.cpp</title></head><body bgcolor='white'><pre>
<font color='#009900'>// The contents of this file are in the public domain. See LICENSE_FOR_EXAMPLE_PROGRAMS.txt
</font><font color='#009900'>/*
This is an example illustrating the use of the deep learning tools from the
dlib C++ Library. In it, we will train the venerable LeNet convolutional
neural network to recognize handwritten digits. The network will take as
input a small image and classify it as one of the 10 numeric digits between
0 and 9.
The specific network we will run is from the paper
LeCun, Yann, et al. "Gradient-based learning applied to document recognition."
Proceedings of the IEEE 86.11 (1998): 2278-2324.
except that we replace the sigmoid non-linearities with rectified linear units.
These tools will use CUDA and cuDNN to drastically accelerate network
training and testing. CMake should automatically find them if they are
installed and configure things appropriately. If not, the program will
still run but will be much slower to execute.
*/</font>
<font color='#0000FF'>#include</font> <font color='#5555FF'>&lt;</font>dlib<font color='#5555FF'>/</font>dnn.h<font color='#5555FF'>&gt;</font>
<font color='#0000FF'>#include</font> <font color='#5555FF'>&lt;</font>iostream<font color='#5555FF'>&gt;</font>
<font color='#0000FF'>#include</font> <font color='#5555FF'>&lt;</font>dlib<font color='#5555FF'>/</font>data_io.h<font color='#5555FF'>&gt;</font>
<font color='#0000FF'>using</font> <font color='#0000FF'>namespace</font> std;
<font color='#0000FF'>using</font> <font color='#0000FF'>namespace</font> dlib;
<font color='#0000FF'><u>int</u></font> <b><a name='main'></a>main</b><font face='Lucida Console'>(</font><font color='#0000FF'><u>int</u></font> argc, <font color='#0000FF'><u>char</u></font><font color='#5555FF'>*</font><font color='#5555FF'>*</font> argv<font face='Lucida Console'>)</font> <font color='#0000FF'>try</font>
<b>{</b>
<font color='#009900'>// This example is going to run on the MNIST dataset.
</font> <font color='#0000FF'>if</font> <font face='Lucida Console'>(</font>argc <font color='#5555FF'>!</font><font color='#5555FF'>=</font> <font color='#979000'>2</font><font face='Lucida Console'>)</font>
<b>{</b>
cout <font color='#5555FF'>&lt;</font><font color='#5555FF'>&lt;</font> "<font color='#CC0000'>This example needs the MNIST dataset to run!</font>" <font color='#5555FF'>&lt;</font><font color='#5555FF'>&lt;</font> endl;
cout <font color='#5555FF'>&lt;</font><font color='#5555FF'>&lt;</font> "<font color='#CC0000'>You can get MNIST from http://yann.lecun.com/exdb/mnist/</font>" <font color='#5555FF'>&lt;</font><font color='#5555FF'>&lt;</font> endl;
cout <font color='#5555FF'>&lt;</font><font color='#5555FF'>&lt;</font> "<font color='#CC0000'>Download the 4 files that comprise the dataset, decompress them, and</font>" <font color='#5555FF'>&lt;</font><font color='#5555FF'>&lt;</font> endl;
cout <font color='#5555FF'>&lt;</font><font color='#5555FF'>&lt;</font> "<font color='#CC0000'>put them in a folder. Then give that folder as input to this program.</font>" <font color='#5555FF'>&lt;</font><font color='#5555FF'>&lt;</font> endl;
<font color='#0000FF'>return</font> <font color='#979000'>1</font>;
<b>}</b>
<font color='#009900'>// MNIST is broken into two parts, a training set of 60000 images and a test set of
</font> <font color='#009900'>// 10000 images. Each image is labeled so that we know what handwritten digit is
</font> <font color='#009900'>// depicted. These next statements load the dataset into memory.
</font> std::vector<font color='#5555FF'>&lt;</font>matrix<font color='#5555FF'>&lt;</font><font color='#0000FF'><u>unsigned</u></font> <font color='#0000FF'><u>char</u></font><font color='#5555FF'>&gt;</font><font color='#5555FF'>&gt;</font> training_images;
std::vector<font color='#5555FF'>&lt;</font><font color='#0000FF'><u>unsigned</u></font> <font color='#0000FF'><u>long</u></font><font color='#5555FF'>&gt;</font> training_labels;
std::vector<font color='#5555FF'>&lt;</font>matrix<font color='#5555FF'>&lt;</font><font color='#0000FF'><u>unsigned</u></font> <font color='#0000FF'><u>char</u></font><font color='#5555FF'>&gt;</font><font color='#5555FF'>&gt;</font> testing_images;
std::vector<font color='#5555FF'>&lt;</font><font color='#0000FF'><u>unsigned</u></font> <font color='#0000FF'><u>long</u></font><font color='#5555FF'>&gt;</font> testing_labels;
<font color='#BB00BB'>load_mnist_dataset</font><font face='Lucida Console'>(</font>argv[<font color='#979000'>1</font>], training_images, training_labels, testing_images, testing_labels<font face='Lucida Console'>)</font>;
<font color='#009900'>// Now let's define the LeNet. Broadly speaking, there are 3 parts to a network
</font> <font color='#009900'>// definition. The loss layer, a bunch of computational layers, and then an input
</font> <font color='#009900'>// layer. You can see these components in the network definition below.
</font> <font color='#009900'>//
</font> <font color='#009900'>// The input layer here says the network expects to be given matrix&lt;unsigned char&gt;
</font> <font color='#009900'>// objects as input. In general, you can use any dlib image or matrix type here, or
</font> <font color='#009900'>// even define your own types by creating custom input layers.
</font> <font color='#009900'>//
</font> <font color='#009900'>// Then the middle layers define the computation the network will do to transform the
</font> <font color='#009900'>// input into whatever we want. Here we run the image through multiple convolutions,
</font> <font color='#009900'>// ReLU units, max pooling operations, and then finally a fully connected layer that
</font> <font color='#009900'>// converts the whole thing into just 10 numbers.
</font> <font color='#009900'>//
</font> <font color='#009900'>// Finally, the loss layer defines the relationship between the network outputs, our 10
</font> <font color='#009900'>// numbers, and the labels in our dataset. Since we selected loss_multiclass_log it
</font> <font color='#009900'>// means we want to do multiclass classification with our network. Moreover, the
</font> <font color='#009900'>// number of network outputs (i.e. 10) is the number of possible labels. Whichever
</font> <font color='#009900'>// network output is largest is the predicted label. So for example, if the first
</font> <font color='#009900'>// network output is largest then the predicted digit is 0, if the last network output
</font> <font color='#009900'>// is largest then the predicted digit is 9.
</font> <font color='#0000FF'>using</font> net_type <font color='#5555FF'>=</font> loss_multiclass_log<font color='#5555FF'>&lt;</font>
fc<font color='#5555FF'>&lt;</font><font color='#979000'>10</font>,
relu<font color='#5555FF'>&lt;</font>fc<font color='#5555FF'>&lt;</font><font color='#979000'>84</font>,
relu<font color='#5555FF'>&lt;</font>fc<font color='#5555FF'>&lt;</font><font color='#979000'>120</font>,
max_pool<font color='#5555FF'>&lt;</font><font color='#979000'>2</font>,<font color='#979000'>2</font>,<font color='#979000'>2</font>,<font color='#979000'>2</font>,relu<font color='#5555FF'>&lt;</font>con<font color='#5555FF'>&lt;</font><font color='#979000'>16</font>,<font color='#979000'>5</font>,<font color='#979000'>5</font>,<font color='#979000'>1</font>,<font color='#979000'>1</font>,
max_pool<font color='#5555FF'>&lt;</font><font color='#979000'>2</font>,<font color='#979000'>2</font>,<font color='#979000'>2</font>,<font color='#979000'>2</font>,relu<font color='#5555FF'>&lt;</font>con<font color='#5555FF'>&lt;</font><font color='#979000'>6</font>,<font color='#979000'>5</font>,<font color='#979000'>5</font>,<font color='#979000'>1</font>,<font color='#979000'>1</font>,
input<font color='#5555FF'>&lt;</font>matrix<font color='#5555FF'>&lt;</font><font color='#0000FF'><u>unsigned</u></font> <font color='#0000FF'><u>char</u></font><font color='#5555FF'>&gt;</font><font color='#5555FF'>&gt;</font>
<font color='#5555FF'>&gt;</font><font color='#5555FF'>&gt;</font><font color='#5555FF'>&gt;</font><font color='#5555FF'>&gt;</font><font color='#5555FF'>&gt;</font><font color='#5555FF'>&gt;</font><font color='#5555FF'>&gt;</font><font color='#5555FF'>&gt;</font><font color='#5555FF'>&gt;</font><font color='#5555FF'>&gt;</font><font color='#5555FF'>&gt;</font><font color='#5555FF'>&gt;</font>;
<font color='#009900'>// This net_type defines the entire network architecture. For example, the block
</font> <font color='#009900'>// relu&lt;fc&lt;84,SUBNET&gt;&gt; means we take the output from the subnetwork, pass it through a
</font> <font color='#009900'>// fully connected layer with 84 outputs, then apply ReLU. Similarly, a block of
</font> <font color='#009900'>// max_pool&lt;2,2,2,2,relu&lt;con&lt;16,5,5,1,1,SUBNET&gt;&gt;&gt; means we apply 16 convolutions with a
</font> <font color='#009900'>// 5x5 filter size and 1x1 stride to the output of a subnetwork, then apply ReLU, then
</font> <font color='#009900'>// perform max pooling with a 2x2 window and 2x2 stride.
</font>
<font color='#009900'>// So with that out of the way, we can make a network instance.
</font> net_type net;
<font color='#009900'>// And then train it using the MNIST data. The code below uses mini-batch stochastic
</font> <font color='#009900'>// gradient descent with an initial learning rate of 0.01 to accomplish this.
</font> dnn_trainer<font color='#5555FF'>&lt;</font>net_type<font color='#5555FF'>&gt;</font> <font color='#BB00BB'>trainer</font><font face='Lucida Console'>(</font>net<font face='Lucida Console'>)</font>;
trainer.<font color='#BB00BB'>set_learning_rate</font><font face='Lucida Console'>(</font><font color='#979000'>0.01</font><font face='Lucida Console'>)</font>;
trainer.<font color='#BB00BB'>set_min_learning_rate</font><font face='Lucida Console'>(</font><font color='#979000'>0.00001</font><font face='Lucida Console'>)</font>;
trainer.<font color='#BB00BB'>set_mini_batch_size</font><font face='Lucida Console'>(</font><font color='#979000'>128</font><font face='Lucida Console'>)</font>;
trainer.<font color='#BB00BB'>be_verbose</font><font face='Lucida Console'>(</font><font face='Lucida Console'>)</font>;
<font color='#009900'>// Since DNN training can take a long time, we can ask the trainer to save its state to
</font> <font color='#009900'>// a file named "mnist_sync" every 20 seconds. This way, if we kill this program and
</font> <font color='#009900'>// start it again it will begin where it left off rather than restarting the training
</font> <font color='#009900'>// from scratch. This is because, when the program restarts, this call to
</font> <font color='#009900'>// set_synchronization_file() will automatically reload the settings from mnist_sync if
</font> <font color='#009900'>// the file exists.
</font> trainer.<font color='#BB00BB'>set_synchronization_file</font><font face='Lucida Console'>(</font>"<font color='#CC0000'>mnist_sync</font>", std::chrono::<font color='#BB00BB'>seconds</font><font face='Lucida Console'>(</font><font color='#979000'>20</font><font face='Lucida Console'>)</font><font face='Lucida Console'>)</font>;
<font color='#009900'>// Finally, this line begins training. By default, it runs SGD with our specified
</font> <font color='#009900'>// learning rate until the loss stops decreasing. Then it reduces the learning rate by
</font> <font color='#009900'>// a factor of 10 and continues running until the loss stops decreasing again. It will
</font> <font color='#009900'>// keep doing this until the learning rate has dropped below the min learning rate
</font> <font color='#009900'>// defined above or the maximum number of epochs has been executed (defaulted to 10000).
</font> trainer.<font color='#BB00BB'>train</font><font face='Lucida Console'>(</font>training_images, training_labels<font face='Lucida Console'>)</font>;
<font color='#009900'>// At this point our net object should have learned how to classify MNIST images. But
</font> <font color='#009900'>// before we try it out let's save it to disk. Note that, since the trainer has been
</font> <font color='#009900'>// running images through the network, net will have a bunch of state in it related to
</font> <font color='#009900'>// the last batch of images it processed (e.g. outputs from each layer). Since we
</font> <font color='#009900'>// don't care about saving that kind of stuff to disk we can tell the network to forget
</font> <font color='#009900'>// about that kind of transient data so that our file will be smaller. We do this by
</font> <font color='#009900'>// "cleaning" the network before saving it.
</font> net.<font color='#BB00BB'>clean</font><font face='Lucida Console'>(</font><font face='Lucida Console'>)</font>;
<font color='#BB00BB'>serialize</font><font face='Lucida Console'>(</font>"<font color='#CC0000'>mnist_network.dat</font>"<font face='Lucida Console'>)</font> <font color='#5555FF'>&lt;</font><font color='#5555FF'>&lt;</font> net;
<font color='#009900'>// Now if we later wanted to recall the network from disk we can simply say:
</font> <font color='#009900'>// deserialize("mnist_network.dat") &gt;&gt; net;
</font>
<font color='#009900'>// Now let's run the training images through the network. This statement runs all the
</font> <font color='#009900'>// images through it and asks the loss layer to convert the network's raw output into
</font> <font color='#009900'>// labels. In our case, these labels are the numbers between 0 and 9.
</font> std::vector<font color='#5555FF'>&lt;</font><font color='#0000FF'><u>unsigned</u></font> <font color='#0000FF'><u>long</u></font><font color='#5555FF'>&gt;</font> predicted_labels <font color='#5555FF'>=</font> <font color='#BB00BB'>net</font><font face='Lucida Console'>(</font>training_images<font face='Lucida Console'>)</font>;
<font color='#0000FF'><u>int</u></font> num_right <font color='#5555FF'>=</font> <font color='#979000'>0</font>;
<font color='#0000FF'><u>int</u></font> num_wrong <font color='#5555FF'>=</font> <font color='#979000'>0</font>;
<font color='#009900'>// And then let's see if it classified them correctly.
</font> <font color='#0000FF'>for</font> <font face='Lucida Console'>(</font><font color='#0000FF'><u>size_t</u></font> i <font color='#5555FF'>=</font> <font color='#979000'>0</font>; i <font color='#5555FF'>&lt;</font> training_images.<font color='#BB00BB'>size</font><font face='Lucida Console'>(</font><font face='Lucida Console'>)</font>; <font color='#5555FF'>+</font><font color='#5555FF'>+</font>i<font face='Lucida Console'>)</font>
<b>{</b>
<font color='#0000FF'>if</font> <font face='Lucida Console'>(</font>predicted_labels[i] <font color='#5555FF'>=</font><font color='#5555FF'>=</font> training_labels[i]<font face='Lucida Console'>)</font>
<font color='#5555FF'>+</font><font color='#5555FF'>+</font>num_right;
<font color='#0000FF'>else</font>
<font color='#5555FF'>+</font><font color='#5555FF'>+</font>num_wrong;
<b>}</b>
cout <font color='#5555FF'>&lt;</font><font color='#5555FF'>&lt;</font> "<font color='#CC0000'>training num_right: </font>" <font color='#5555FF'>&lt;</font><font color='#5555FF'>&lt;</font> num_right <font color='#5555FF'>&lt;</font><font color='#5555FF'>&lt;</font> endl;
cout <font color='#5555FF'>&lt;</font><font color='#5555FF'>&lt;</font> "<font color='#CC0000'>training num_wrong: </font>" <font color='#5555FF'>&lt;</font><font color='#5555FF'>&lt;</font> num_wrong <font color='#5555FF'>&lt;</font><font color='#5555FF'>&lt;</font> endl;
cout <font color='#5555FF'>&lt;</font><font color='#5555FF'>&lt;</font> "<font color='#CC0000'>training accuracy: </font>" <font color='#5555FF'>&lt;</font><font color='#5555FF'>&lt;</font> num_right <font color='#5555FF'>/</font> <font color='#0000FF'>static_cast</font><font color='#5555FF'>&lt;</font><font color='#0000FF'><u>double</u></font><font color='#5555FF'>&gt;</font><font face='Lucida Console'>(</font>num_right <font color='#5555FF'>+</font> num_wrong<font face='Lucida Console'>)</font> <font color='#5555FF'>&lt;</font><font color='#5555FF'>&lt;</font> endl;
<font color='#009900'>// Let's also see if the network can correctly classify the testing images. Since
</font> <font color='#009900'>// MNIST is an easy dataset, we should see at least 99% accuracy.
</font> predicted_labels <font color='#5555FF'>=</font> <font color='#BB00BB'>net</font><font face='Lucida Console'>(</font>testing_images<font face='Lucida Console'>)</font>;
num_right <font color='#5555FF'>=</font> <font color='#979000'>0</font>;
num_wrong <font color='#5555FF'>=</font> <font color='#979000'>0</font>;
<font color='#0000FF'>for</font> <font face='Lucida Console'>(</font><font color='#0000FF'><u>size_t</u></font> i <font color='#5555FF'>=</font> <font color='#979000'>0</font>; i <font color='#5555FF'>&lt;</font> testing_images.<font color='#BB00BB'>size</font><font face='Lucida Console'>(</font><font face='Lucida Console'>)</font>; <font color='#5555FF'>+</font><font color='#5555FF'>+</font>i<font face='Lucida Console'>)</font>
<b>{</b>
<font color='#0000FF'>if</font> <font face='Lucida Console'>(</font>predicted_labels[i] <font color='#5555FF'>=</font><font color='#5555FF'>=</font> testing_labels[i]<font face='Lucida Console'>)</font>
<font color='#5555FF'>+</font><font color='#5555FF'>+</font>num_right;
<font color='#0000FF'>else</font>
<font color='#5555FF'>+</font><font color='#5555FF'>+</font>num_wrong;
<b>}</b>
cout <font color='#5555FF'>&lt;</font><font color='#5555FF'>&lt;</font> "<font color='#CC0000'>testing num_right: </font>" <font color='#5555FF'>&lt;</font><font color='#5555FF'>&lt;</font> num_right <font color='#5555FF'>&lt;</font><font color='#5555FF'>&lt;</font> endl;
cout <font color='#5555FF'>&lt;</font><font color='#5555FF'>&lt;</font> "<font color='#CC0000'>testing num_wrong: </font>" <font color='#5555FF'>&lt;</font><font color='#5555FF'>&lt;</font> num_wrong <font color='#5555FF'>&lt;</font><font color='#5555FF'>&lt;</font> endl;
cout <font color='#5555FF'>&lt;</font><font color='#5555FF'>&lt;</font> "<font color='#CC0000'>testing accuracy: </font>" <font color='#5555FF'>&lt;</font><font color='#5555FF'>&lt;</font> num_right <font color='#5555FF'>/</font> <font color='#0000FF'>static_cast</font><font color='#5555FF'>&lt;</font><font color='#0000FF'><u>double</u></font><font color='#5555FF'>&gt;</font><font face='Lucida Console'>(</font>num_right <font color='#5555FF'>+</font> num_wrong<font face='Lucida Console'>)</font> <font color='#5555FF'>&lt;</font><font color='#5555FF'>&lt;</font> endl;
<font color='#009900'>// Finally, you can also save network parameters to XML files if you want to do
</font> <font color='#009900'>// something with the network in another tool. For example, you could use dlib's
</font> <font color='#009900'>// tools/convert_dlib_nets_to_caffe to convert the network to a caffe model.
</font> <font color='#BB00BB'>net_to_xml</font><font face='Lucida Console'>(</font>net, "<font color='#CC0000'>lenet.xml</font>"<font face='Lucida Console'>)</font>;
<b>}</b>
<font color='#0000FF'>catch</font><font face='Lucida Console'>(</font>std::exception<font color='#5555FF'>&amp;</font> e<font face='Lucida Console'>)</font>
<b>{</b>
cout <font color='#5555FF'>&lt;</font><font color='#5555FF'>&lt;</font> e.<font color='#BB00BB'>what</font><font face='Lucida Console'>(</font><font face='Lucida Console'>)</font> <font color='#5555FF'>&lt;</font><font color='#5555FF'>&lt;</font> endl;
<b>}</b>
</pre></body></html>