|
|
|
|
|
#ifndef DLIB_MODULARITY_ClUSTERING__H__ |
|
#define DLIB_MODULARITY_ClUSTERING__H__ |
|
|
|
#include "modularity_clustering_abstract.h" |
|
#include "../sparse_vector.h" |
|
#include "../graph_utils/edge_list_graphs.h" |
|
#include "../matrix.h" |
|
#include "../rand.h" |
|
|
|
namespace dlib |
|
{ |
|
|
|
|
|
|
|
namespace impl |
|
{ |
|
inline double newman_cluster_split (
    dlib::rand& rnd,
    const std::vector<ordered_sample_pair>& edges,
    const matrix<double,0,1>& node_degrees,
    const matrix<double,0,1>& Bdiag,
    const double& edge_sum,
    matrix<double,0,1>& labels,
    const double eps,
    const unsigned long max_iterations
)
/*!
    requires
        - edges is grouped by index1() in increasing order.  The refinement pass
          below walks the edge list once per sweep and depends on this (the public
          newman_cluster() entry point asserts is_ordered_by_index(edges)).
        - NOTE(review): node_degrees.size() > 0 and edge_sum != 0 are assumed; both
          are used as divisors here and are not checked.
    ensures
        - Works with the matrix B = A - d*trans(d)/(2*edge_sum) without ever forming
          it, where A is the sparse matrix described by edges and d == node_degrees.
          Bdiag(i) is read as the i-th diagonal element of B.
        - Runs a power iteration from a random unit vector until the largest
          per-element change is <= eps*sqrt(1/node_degrees.size()) or max_iterations
          multiplications have been done.  If the resulting eigenvalue estimate is
          negative the iteration is repeated on B shifted by that estimate.
        - #labels(i) == +1 or -1 for every node, obtained from the sign of the
          vector found above and then refined by flipping single nodes for as long
          as some flip yields a positive gain.
        - returns dot(B*#labels, #labels)/(4*edge_sum), i.e. the score of the
          two way split described by #labels.
!*/
{
    // Convergence tolerance for the power iteration.
    const double power_iter_eps = eps * std::sqrt(1.0/node_degrees.size());

    // Begin from a random direction of unit length.
    labels.set_size(node_degrees.size());
    for (long r = 0; r < labels.size(); ++r)
        labels(r) = rnd.get_random_gaussian();
    labels /= length(labels);

    matrix<double,0,1> Bv, normalized;

    // The first pass runs with no shift.  Each time the eigenvalue estimate comes
    // out negative, run again with the matrix shifted by that estimate.
    double shift = 0;
    double eig = 0;
    do
    {
        // Anything larger than power_iter_eps, so the loop below is entered.
        double delta = power_iter_eps*2+1;
        unsigned long iter = 0;
        while (iter < max_iterations && delta > power_iter_eps)
        {
            // Bv = B*labels, computed as A*labels minus the rank one degree term.
            sparse_matrix_vector_multiply(edges, labels, Bv);
            Bv -= dot(node_degrees, labels)/(2*edge_sum) * node_degrees;

            if (shift != 0)
                Bv -= shift*labels;

            const double norm = length(Bv);
            if (norm == 0)
            {
                // The product vanished, so restart from a new random vector.
                for (long r = 0; r < labels.size(); ++r)
                    labels(r) = rnd.get_random_gaussian();
            }
            else
            {
                normalized = Bv/norm;
                delta = max(abs(labels-normalized));
                labels.swap(normalized);
            }

            ++iter;
        }

        eig = dot(Bv,labels);
        shift = eig;
    } while (eig < 0);

    // Keep only the sign of each element.
    for (long r = 0; r < labels.size(); ++r)
        labels(r) = (labels(r) > 0) ? 1 : -1;

    // Recompute Bv = B*labels for the +/-1 vector (no shift this time).
    sparse_matrix_vector_multiply(edges, labels, Bv);
    Bv -= dot(node_degrees, labels)/(2*edge_sum) * node_degrees;

    // Greedy refinement: sweep over the nodes, flipping any node whose flip has a
    // positive gain, and keep sweeping until a whole sweep makes no change.  Bv is
    // updated incrementally so it always equals B*labels.
    bool any_flip = false;
    do
    {
        any_flip = false;
        unsigned long edge_pos = 0;
        for (long node = 0; node < labels.size(); ++node)
        {
            // Amount by which labels(node) changes if it is flipped.
            const double step = -2*labels(node);
            const double gain = 4*Bdiag(node) + 2*step*Bv(node);
            const bool flip = gain > 0;

            if (flip)
                labels(node) *= -1;

            // Consume this node's run of edges.  When flipping, each edge carries
            // the change in labels(node) over to the neighbour's entry of Bv.
            for (; edge_pos < edges.size() &&
                   edges[edge_pos].index1() == static_cast<unsigned long>(node); ++edge_pos)
            {
                if (flip)
                {
                    const long nbr = edges[edge_pos].index2();
                    Bv(nbr) += step*edges[edge_pos].distance();
                }
            }

            if (flip)
            {
                // Apply the same change through the rank one degree term of B.
                Bv -= (step*node_degrees(node)/(2*edge_sum))*node_degrees;
                any_flip = true;
            }
        }
    } while (any_flip);

    return dot(Bv, labels)/(4*edge_sum);
}
|
|
|
|
|
|
|
inline unsigned long newman_cluster_helper (
    dlib::rand& rnd,
    const std::vector<ordered_sample_pair>& edges,
    const matrix<double,0,1>& node_degrees,
    const matrix<double,0,1>& Bdiag,
    const double& edge_sum,
    std::vector<unsigned long>& labels,
    double modularity_threshold,
    const double eps,
    const unsigned long max_iterations
)
/*!
    ensures
        - Calls newman_cluster_split() on the given graph.  If the returned score is
          not greater than modularity_threshold, or one side of the split is empty,
          then every node gets label 0 and this function returns 1.
        - Otherwise the two sides are extracted as separate graphs (only edges with
          both ends on the same side are kept, node_degrees and Bdiag entries are
          copied over unchanged, and edge_sum stays that of the original graph) and
          each one is clustered recursively, left side first.
        - #labels.size() == node_degrees.size()
        - #labels[i] is the cluster id of node i.  Ids of the right side are offset
          by the number of clusters found on the left side.
        - returns the total number of clusters.
!*/
{
    matrix<double,0,1> side;
    const double modularity = newman_cluster_split(rnd,edges,node_degrees,Bdiag,edge_sum,side,eps,max_iterations);

    // Position of each node inside the sub-graph of the side it landed on.  Only
    // the entry belonging to the node's own side is meaningful.
    std::vector<unsigned long> left_pos(node_degrees.size());
    std::vector<unsigned long> right_pos(node_degrees.size());

    unsigned long left_count = 0;
    unsigned long right_count = 0;
    for (long v = 0; v < side.size(); ++v)
    {
        if (side(v) > 0)
            left_pos[v] = left_count++;
        else
            right_pos[v] = right_count++;
    }

    const bool accept_split = modularity > modularity_threshold &&
                              left_count > 0 && right_count > 0;
    if (!accept_split)
    {
        // Splitting does not pay off, so everything stays in one cluster.
        labels.assign(node_degrees.size(),0);
        return 1;
    }

    // Per-side copies of the degree vector and of the diagonal of B.
    matrix<double,0,1> left_degrees(left_count);
    matrix<double,0,1> right_degrees(right_count);
    matrix<double,0,1> left_Bdiag(left_count);
    matrix<double,0,1> right_Bdiag(right_count);
    for (long v = 0; v < side.size(); ++v)
    {
        if (side(v) > 0)
        {
            const unsigned long p = left_pos[v];
            left_degrees(p) = node_degrees(v);
            left_Bdiag(p) = Bdiag(v);
        }
        else
        {
            const unsigned long p = right_pos[v];
            right_degrees(p) = node_degrees(v);
            right_Bdiag(p) = Bdiag(v);
        }
    }

    std::vector<ordered_sample_pair> sub_edges;

    // ----- left side -----
    // The threshold handed to the recursive call is
    //    (sum of kept edge weights - sum(degrees)^2/(2*edge_sum)) / (4*edge_sum).
    // NOTE(review): presumably the score of leaving this side whole, which a
    // further split has to beat.
    double sub_threshold = 0;
    for (unsigned long k = 0; k < edges.size(); ++k)
    {
        const ordered_sample_pair& e = edges[k];
        const unsigned long a = e.index1();
        const unsigned long b = e.index2();
        const double d = e.distance();
        if (side(a) > 0 && side(b) > 0)
        {
            sub_edges.push_back(ordered_sample_pair(left_pos[a], left_pos[b], d));
            sub_threshold += d;
        }
    }
    sub_threshold -= sum(left_degrees*sum(left_degrees))/(2*edge_sum);
    sub_threshold /= 4*edge_sum;

    std::vector<unsigned long> left_labels;
    const unsigned long num_left_clusters =
        newman_cluster_helper(rnd,sub_edges,left_degrees,left_Bdiag,
                              edge_sum,left_labels,sub_threshold,
                              eps, max_iterations);

    // ----- right side -----
    sub_edges.clear();
    sub_threshold = 0;
    for (unsigned long k = 0; k < edges.size(); ++k)
    {
        const ordered_sample_pair& e = edges[k];
        const unsigned long a = e.index1();
        const unsigned long b = e.index2();
        const double d = e.distance();
        if (side(a) < 0 && side(b) < 0)
        {
            sub_edges.push_back(ordered_sample_pair(right_pos[a], right_pos[b], d));
            sub_threshold += d;
        }
    }
    sub_threshold -= sum(right_degrees*sum(right_degrees))/(2*edge_sum);
    sub_threshold /= 4*edge_sum;

    std::vector<unsigned long> right_labels;
    const unsigned long num_right_clusters =
        newman_cluster_helper(rnd,sub_edges,right_degrees,right_Bdiag,
                              edge_sum,right_labels,sub_threshold,
                              eps, max_iterations);

    // Merge the two labelings.  Right side ids are shifted past the left side ids
    // so that the two ranges do not collide.
    labels.resize(node_degrees.size());
    for (unsigned long v = 0; v < labels.size(); ++v)
    {
        if (side(v) > 0)
            labels[v] = left_labels[left_pos[v]];
        else
            labels[v] = right_labels[right_pos[v]] + num_left_clusters;
    }

    return num_left_clusters + num_right_clusters;
}
|
} |
|
|
|
|
|
|
|
inline unsigned long newman_cluster ( |
|
const std::vector<ordered_sample_pair>& edges, |
|
std::vector<unsigned long>& labels, |
|
const double eps = 1e-4, |
|
const unsigned long max_iterations = 2000 |
|
) |
|
{ |
|
|
|
DLIB_ASSERT(is_ordered_by_index(edges), |
|
"\t unsigned long newman_cluster()" |
|
<< "\n\t Invalid inputs were given to this function" |
|
); |
|
|
|
labels.clear(); |
|
if (edges.size() == 0) |
|
return 0; |
|
|
|
const unsigned long num_nodes = max_index_plus_one(edges); |
|
|
|
|
|
matrix<double,0,1> node_degrees(num_nodes); |
|
matrix<double,0,1> Bdiag(num_nodes); |
|
Bdiag = 0; |
|
double edge_sum = 0; |
|
node_degrees = 0; |
|
for (unsigned long i = 0; i < edges.size(); ++i) |
|
{ |
|
node_degrees(edges[i].index1()) += edges[i].distance(); |
|
edge_sum += edges[i].distance(); |
|
if (edges[i].index1() == edges[i].index2()) |
|
Bdiag(edges[i].index1()) += edges[i].distance(); |
|
} |
|
edge_sum /= 2; |
|
Bdiag -= squared(node_degrees)/(2*edge_sum); |
|
|
|
|
|
dlib::rand rnd; |
|
return impl::newman_cluster_helper(rnd,edges,node_degrees,Bdiag,edge_sum,labels,0,eps,max_iterations); |
|
} |
|
|
|
|
|
|
|
inline unsigned long newman_cluster ( |
|
const std::vector<sample_pair>& edges, |
|
std::vector<unsigned long>& labels, |
|
const double eps = 1e-4, |
|
const unsigned long max_iterations = 2000 |
|
) |
|
{ |
|
std::vector<ordered_sample_pair> oedges; |
|
convert_unordered_to_ordered(edges, oedges); |
|
std::sort(oedges.begin(), oedges.end(), &order_by_index<ordered_sample_pair>); |
|
|
|
return newman_cluster(oedges, labels, eps, max_iterations); |
|
} |
|
|
|
|
|
|
|
namespace impl |
|
{ |
|
inline std::vector<unsigned long> remap_labels (
    const std::vector<unsigned long>& labels,
    unsigned long& num_labels
)
/*!
    ensures
        - returns a vector R such that:
            - R.size() == labels.size()
            - every element of R is in the range [0, #num_labels)
            - R[i] == R[j] if and only if labels[i] == labels[j]
            - ids are handed out in order of first appearance, so the first
              distinct value in labels maps to 0, the next new value to 1, etc.
        - #num_labels == the number of distinct values in labels
          (0 when labels is empty)
!*/
{
    typedef std::map<unsigned long, unsigned long> id_map;

    id_map dense_ids;
    std::vector<unsigned long> result;
    result.reserve(labels.size());

    for (std::vector<unsigned long>::size_type i = 0; i < labels.size(); ++i)
    {
        // The id a label would receive if it has not been seen before.
        const unsigned long next_id = dense_ids.size();

        // insert() leaves the map untouched when the label is already present and
        // returns an iterator to the existing entry.  One lookup per element
        // therefore both assigns new ids and translates the label.
        const id_map::iterator entry =
            dense_ids.insert(id_map::value_type(labels[i], next_id)).first;
        result.push_back(entry->second);
    }

    num_labels = dense_ids.size();
    return result;
}
|
} |
|
|
|
|
|
|
|
inline double modularity ( |
|
const std::vector<sample_pair>& edges, |
|
const std::vector<unsigned long>& labels |
|
) |
|
{ |
|
const unsigned long num_nodes = max_index_plus_one(edges); |
|
|
|
DLIB_ASSERT(labels.size() == num_nodes, |
|
"\t double modularity()" |
|
<< "\n\t Invalid inputs were given to this function" |
|
); |
|
|
|
unsigned long num_labels; |
|
const std::vector<unsigned long>& labels_ = dlib::impl::remap_labels(labels,num_labels); |
|
|
|
std::vector<double> cluster_sums(num_labels,0); |
|
std::vector<double> k(num_nodes,0); |
|
|
|
double Q = 0; |
|
double m = 0; |
|
for (unsigned long i = 0; i < edges.size(); ++i) |
|
{ |
|
const unsigned long n1 = edges[i].index1(); |
|
const unsigned long n2 = edges[i].index2(); |
|
k[n1] += edges[i].distance(); |
|
if (n1 != n2) |
|
k[n2] += edges[i].distance(); |
|
|
|
if (n1 != n2) |
|
m += edges[i].distance(); |
|
else |
|
m += edges[i].distance()/2; |
|
|
|
if (labels_[n1] == labels_[n2]) |
|
{ |
|
if (n1 != n2) |
|
Q += 2*edges[i].distance(); |
|
else |
|
Q += edges[i].distance(); |
|
} |
|
} |
|
|
|
if (m == 0) |
|
return 0; |
|
|
|
for (unsigned long i = 0; i < labels_.size(); ++i) |
|
{ |
|
cluster_sums[labels_[i]] += k[i]; |
|
} |
|
|
|
for (unsigned long i = 0; i < labels_.size(); ++i) |
|
{ |
|
Q -= k[i]*cluster_sums[labels_[i]]/(2*m); |
|
} |
|
|
|
return 1.0/(2*m)*Q; |
|
} |
|
|
|
|
|
|
|
inline double modularity ( |
|
const std::vector<ordered_sample_pair>& edges, |
|
const std::vector<unsigned long>& labels |
|
) |
|
{ |
|
const unsigned long num_nodes = max_index_plus_one(edges); |
|
|
|
DLIB_ASSERT(labels.size() == num_nodes, |
|
"\t double modularity()" |
|
<< "\n\t Invalid inputs were given to this function" |
|
); |
|
|
|
|
|
unsigned long num_labels; |
|
const std::vector<unsigned long>& labels_ = dlib::impl::remap_labels(labels,num_labels); |
|
|
|
std::vector<double> cluster_sums(num_labels,0); |
|
std::vector<double> k(num_nodes,0); |
|
|
|
double Q = 0; |
|
double m = 0; |
|
for (unsigned long i = 0; i < edges.size(); ++i) |
|
{ |
|
const unsigned long n1 = edges[i].index1(); |
|
const unsigned long n2 = edges[i].index2(); |
|
k[n1] += edges[i].distance(); |
|
m += edges[i].distance(); |
|
if (labels_[n1] == labels_[n2]) |
|
{ |
|
Q += edges[i].distance(); |
|
} |
|
} |
|
|
|
if (m == 0) |
|
return 0; |
|
|
|
for (unsigned long i = 0; i < labels_.size(); ++i) |
|
{ |
|
cluster_sums[labels_[i]] += k[i]; |
|
} |
|
|
|
for (unsigned long i = 0; i < labels_.size(); ++i) |
|
{ |
|
Q -= k[i]*cluster_sums[labels_[i]]/m; |
|
} |
|
|
|
return 1.0/m*Q; |
|
} |
|
|
|
|
|
|
|
} |
|
|
|
#endif |
|
|
|
|