// The contents of this file are in the public domain. See LICENSE_FOR_EXAMPLE_PROGRAMS.txt
/*
This is an example illustrating the use of the deep learning tools from the dlib C++
Library.  In it, we will show how to do face recognition.  This example uses the
pretrained dlib_face_recognition_resnet_model_v1 model, which is freely available from
the dlib web site.  This model has a 99.38% accuracy on the standard LFW face
recognition benchmark, which is comparable to other state-of-the-art methods for face
recognition as of February 2017.
In this example, we will use dlib to do face clustering.  Included in the examples
folder is an image, bald_guys.jpg, which contains a bunch of photos of action movie
stars Vin Diesel, The Rock, Jason Statham, and Bruce Willis.  We will use dlib to
automatically find their faces in the image and then to automatically determine how
many people there are (4 in this case) as well as which faces belong to each person.
Finally, this example uses a network with the loss_metric loss.  Therefore, if you want
to learn how to train your own models, or to get a general introduction to this loss
layer, you should read the dnn_metric_learning_ex.cpp and
dnn_metric_learning_on_images_ex.cpp examples.
*/
using namespace dlib; | |
using namespace std; | |
// ---------------------------------------------------------------------------------------- | |
// The next bit of code defines a ResNet network.  It's basically copied
// and pasted from the dnn_imagenet_ex.cpp example, except we replaced the loss
// layer with loss_metric and made the network somewhat smaller.  Go read the introductory
// dlib DNN examples to learn what all this stuff means.
//
// Also, the dnn_metric_learning_on_images_ex.cpp example shows how to train this network.
// The dlib_face_recognition_resnet_model_v1 model used by this example was trained using
// essentially the code shown in dnn_metric_learning_on_images_ex.cpp, except that the
// mini-batches were made larger (35x15 instead of 5x5), the iterations-without-progress
// threshold was set to 10000, and the training dataset consisted of about 3 million images
// instead of 55.  Also, the input layer was locked to images of size 150.
template <template <int,template<typename>class,int,typename> class block, int N, template<typename>class BN, typename SUBNET> | |
using residual = add_prev1<block<N,BN,1,tag1<SUBNET>>>; | |
template <template <int,template<typename>class,int,typename> class block, int N, template<typename>class BN, typename SUBNET> | |
using residual_down = add_prev2<avg_pool<2,2,2,2,skip1<tag2<block<N,BN,2,tag1<SUBNET>>>>>>; | |
template <int N, template <typename> class BN, int stride, typename SUBNET> | |
using block = BN<con<N,3,3,1,1,relu<BN<con<N,3,3,stride,stride,SUBNET>>>>>; | |
template <int N, typename SUBNET> using ares = relu<residual<block,N,affine,SUBNET>>; | |
template <int N, typename SUBNET> using ares_down = relu<residual_down<block,N,affine,SUBNET>>; | |
template <typename SUBNET> using alevel0 = ares_down<256,SUBNET>; | |
template <typename SUBNET> using alevel1 = ares<256,ares<256,ares_down<256,SUBNET>>>; | |
template <typename SUBNET> using alevel2 = ares<128,ares<128,ares_down<128,SUBNET>>>; | |
template <typename SUBNET> using alevel3 = ares<64,ares<64,ares<64,ares_down<64,SUBNET>>>>; | |
template <typename SUBNET> using alevel4 = ares<32,ares<32,ares<32,SUBNET>>>; | |
using anet_type = loss_metric<fc_no_bias<128,avg_pool_everything< | |
alevel0< | |
alevel1< | |
alevel2< | |
alevel3< | |
alevel4< | |
max_pool<3,3,2,2,relu<affine<con<32,7,7,2,2, | |
input_rgb_image_sized<150> | |
>>>>>>>>>>>>; | |
// ---------------------------------------------------------------------------------------- | |
std::vector<matrix<rgb_pixel>> jitter_image( | |
const matrix<rgb_pixel>& img | |
); | |
// ---------------------------------------------------------------------------------------- | |
int main(int argc, char** argv) try | |
{ | |
if (argc != 2) | |
{ | |
cout << "Run this example by invoking it like this: " << endl; | |
cout << " ./dnn_face_recognition_ex faces/bald_guys.jpg" << endl; | |
cout << endl; | |
cout << "You will also need to get the face landmarking model file as well as " << endl; | |
cout << "the face recognition model file. Download and then decompress these files from: " << endl; | |
cout << "http://dlib.net/files/shape_predictor_5_face_landmarks.dat.bz2" << endl; | |
cout << "http://dlib.net/files/dlib_face_recognition_resnet_model_v1.dat.bz2" << endl; | |
cout << endl; | |
return 1; | |
} | |
// The first thing we are going to do is load all our models. First, since we need to | |
// find faces in the image we will need a face detector: | |
frontal_face_detector detector = get_frontal_face_detector(); | |
// We will also use a face landmarking model to align faces to a standard pose: (see face_landmark_detection_ex.cpp for an introduction) | |
shape_predictor sp; | |
deserialize("shape_predictor_5_face_landmarks.dat") >> sp; | |
// And finally we load the DNN responsible for face recognition. | |
anet_type net; | |
deserialize("dlib_face_recognition_resnet_model_v1.dat") >> net; | |
matrix<rgb_pixel> img; | |
load_image(img, argv[1]); | |
// Display the raw image on the screen | |
image_window win(img); | |
// Run the face detector on the image of our action heroes, and for each face extract a | |
// copy that has been normalized to 150x150 pixels in size and appropriately rotated | |
// and centered. | |
std::vector<matrix<rgb_pixel>> faces; | |
for (auto face : detector(img)) | |
{ | |
auto shape = sp(img, face); | |
matrix<rgb_pixel> face_chip; | |
extract_image_chip(img, get_face_chip_details(shape,150,0.25), face_chip); | |
faces.push_back(move(face_chip)); | |
// Also put some boxes on the faces so we can see that the detector is finding | |
// them. | |
win.add_overlay(face); | |
} | |
if (faces.size() == 0) | |
{ | |
cout << "No faces found in image!" << endl; | |
return 1; | |
} | |
// This call asks the DNN to convert each face image in faces into a 128D vector. | |
// In this 128D vector space, images from the same person will be close to each other | |
// but vectors from different people will be far apart. So we can use these vectors to | |
// identify if a pair of images are from the same person or from different people. | |
std::vector<matrix<float,0,1>> face_descriptors = net(faces); | |
// In particular, one simple thing we can do is face clustering. This next bit of code | |
// creates a graph of connected faces and then uses the Chinese whispers graph clustering | |
// algorithm to identify how many people there are and which faces belong to whom. | |
std::vector<sample_pair> edges; | |
for (size_t i = 0; i < face_descriptors.size(); ++i) | |
{ | |
for (size_t j = i; j < face_descriptors.size(); ++j) | |
{ | |
// Faces are connected in the graph if they are close enough. Here we check if | |
// the distance between two face descriptors is less than 0.6, which is the | |
// decision threshold the network was trained to use. Although you can | |
// certainly use any other threshold you find useful. | |
if (length(face_descriptors[i]-face_descriptors[j]) < 0.6) | |
edges.push_back(sample_pair(i,j)); | |
} | |
} | |
std::vector<unsigned long> labels; | |
const auto num_clusters = chinese_whispers(edges, labels); | |
// This will correctly indicate that there are 4 people in the image. | |
cout << "number of people found in the image: "<< num_clusters << endl; | |
// Now let's display the face clustering results on the screen. You will see that it | |
// correctly grouped all the faces. | |
std::vector<image_window> win_clusters(num_clusters); | |
for (size_t cluster_id = 0; cluster_id < num_clusters; ++cluster_id) | |
{ | |
std::vector<matrix<rgb_pixel>> temp; | |
for (size_t j = 0; j < labels.size(); ++j) | |
{ | |
if (cluster_id == labels[j]) | |
temp.push_back(faces[j]); | |
} | |
win_clusters[cluster_id].set_title("face cluster " + cast_to_string(cluster_id)); | |
win_clusters[cluster_id].set_image(tile_images(temp)); | |
} | |
// Finally, let's print one of the face descriptors to the screen. | |
cout << "face descriptor for one face: " << trans(face_descriptors[0]) << endl; | |
// It should also be noted that face recognition accuracy can be improved if jittering | |
// is used when creating face descriptors. In particular, to get 99.38% on the LFW | |
// benchmark you need to use the jitter_image() routine to compute the descriptors, | |
// like so: | |
matrix<float,0,1> face_descriptor = mean(mat(net(jitter_image(faces[0])))); | |
cout << "jittered face descriptor for one face: " << trans(face_descriptor) << endl; | |
// If you use the model without jittering, as we did when clustering the bald guys, it | |
// gets an accuracy of 99.13% on the LFW benchmark. So jittering makes the whole | |
// procedure a little more accurate but makes face descriptor calculation slower. | |
cout << "hit enter to terminate" << endl; | |
cin.get(); | |
} | |
catch (std::exception& e) | |
{ | |
cout << e.what() << endl; | |
} | |
// ---------------------------------------------------------------------------------------- | |
// Returns 100 jittered variants of img, each produced by a separate call to the
// two-argument jitter_image(img, rnd) overload.
//
//   img: the image to jitter; it is not modified.
std::vector<matrix<rgb_pixel>> jitter_image(
    const matrix<rgb_pixel>& img
)
{
    // All this function does is make 100 copies of img, all slightly jittered by being
    // zoomed, rotated, and translated a little bit differently.  They are also randomly
    // mirrored left to right.
    constexpr size_t num_crops = 100;

    // thread_local: each thread gets its own generator, and it persists across calls.
    thread_local dlib::rand rnd;

    std::vector<matrix<rgb_pixel>> crops;
    // The final size is known, so allocate once instead of regrowing the vector (and
    // relocating the images already stored) as it fills.
    crops.reserve(num_crops);
    for (size_t i = 0; i < num_crops; ++i)
        crops.push_back(jitter_image(img,rnd));
    return crops;
}
// ---------------------------------------------------------------------------------------- | |