AshanGimhana's picture
Upload folder using huggingface_hub
9375c9a verified
raw
history blame
8.93 kB
// Copyright (C) 2015  Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_BOTTOM_uP_CLUSTER_Hh_
#define DLIB_BOTTOM_uP_CLUSTER_Hh_
#include <queue>
#include <map>
#include "bottom_up_cluster_abstract.h"
#include "../algs.h"
#include "../matrix.h"
#include "../disjoint_subsets.h"
#include "../graph_utils.h"
namespace dlib
{
// ----------------------------------------------------------------------------------------
namespace buc_impl
{
// Updates the distance matrix after cluster src has been folded into cluster dest.
// For every index i, the (i,dest) entry becomes the larger of the current (i,dest) and
// (i,src) entries, and that same value is mirrored into the (dest,i) entry.  Only
// row/column dest is rewritten; no other entries are assigned by this function.
inline void merge_sets (
    matrix<double>& dists,
    unsigned long dest,
    unsigned long src
)
{
    const long num_rows = dists.nr();
    for (long i = 0; i < num_rows; ++i)
    {
        // Take the max first, then write it to both mirrored positions.
        const double merged = std::max(dists(i,dest), dists(i,src));
        dists(i,dest) = merged;
        dists(dest,i) = merged;
    }
}
// Ordering functor for sample_pair objects: a is ordered before b when a's distance is
// the larger of the two.  bottom_up_cluster() passes this to std::priority_queue, so
// the queue's top() is the pair with the smallest distance.
struct compare_dist
{
    bool operator() (
        const sample_pair& a,
        const sample_pair& b
    ) const
    {
        return b.distance() < a.distance();
    }
};
}
// ----------------------------------------------------------------------------------------
template <
typename EXP
>
unsigned long bottom_up_cluster (
const matrix_exp<EXP>& dists_,
std::vector<unsigned long>& labels,
unsigned long min_num_clusters,
double max_dist = std::numeric_limits<double>::infinity()
)
{
matrix<double> dists = matrix_cast<double>(dists_);
// make sure requires clause is not broken
DLIB_CASSERT(dists.nr() == dists.nc() && min_num_clusters > 0,
"\t unsigned long bottom_up_cluster()"
<< "\n\t Invalid inputs were given to this function."
<< "\n\t dists.nr(): " << dists.nr()
<< "\n\t dists.nc(): " << dists.nc()
<< "\n\t min_num_clusters: " << min_num_clusters
);
using namespace buc_impl;
labels.resize(dists.nr());
disjoint_subsets sets;
sets.set_size(dists.nr());
if (labels.size() == 0)
return 0;
// push all the edges in the graph into a priority queue so the best edges to merge
// come first.
std::priority_queue<sample_pair, std::vector<sample_pair>, compare_dist> que;
for (long r = 0; r < dists.nr(); ++r)
for (long c = r+1; c < dists.nc(); ++c)
que.push(sample_pair(r,c,dists(r,c)));
// Now start merging nodes.
for (unsigned long iter = min_num_clusters; iter < sets.size(); ++iter)
{
// find the next best thing to merge.
double best_dist = que.top().distance();
unsigned long a = sets.find_set(que.top().index1());
unsigned long b = sets.find_set(que.top().index2());
que.pop();
// we have been merging and modifying the distances, so make sure this distance
// is still valid and these guys haven't been merged already.
while(a == b || best_dist < dists(a,b))
{
// Haven't merged it yet, so put it back in with updated distance for
// reconsideration later.
if (a != b)
que.push(sample_pair(a, b, dists(a, b)));
best_dist = que.top().distance();
a = sets.find_set(que.top().index1());
b = sets.find_set(que.top().index2());
que.pop();
}
// now merge these sets if the best distance is small enough
if (best_dist > max_dist)
break;
unsigned long news = sets.merge_sets(a,b);
unsigned long olds = (news==a)?b:a;
merge_sets(dists, news, olds);
}
// figure out which cluster each element is in. Also make sure the labels are
// contiguous.
std::map<unsigned long, unsigned long> relabel;
for (unsigned long r = 0; r < labels.size(); ++r)
{
unsigned long l = sets.find_set(r);
// relabel to make contiguous
if (relabel.count(l) == 0)
{
unsigned long next = relabel.size();
relabel[l] = next;
}
labels[r] = relabel[l];
}
return relabel.size();
}
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
// An interval on the number line described by its two end points.  This is the element
// type produced by segment_number_line().
struct snl_range
{
    // End points of the interval.  A default constructed range has both set to 0.
    double lower = 0;
    double upper = 0;

    snl_range() = default;

    // Zero width range located at val.  Intentionally usable as an implicit conversion
    // so that a plain double can be stored directly into a container of snl_range.
    snl_range(double val) : lower(val), upper(val) {}

    // Range spanning from l to u.  DLIB_ASSERT checks that l <= u.
    snl_range(double l, double u) : lower(l), upper(u) { DLIB_ASSERT(lower <= upper); }

    // Length of the interval.
    double width() const { return upper - lower; }

    // Ranges are ordered by their lower end point only; upper is not considered.
    bool operator<(const snl_range& item) const { return item.lower > lower; }
};
// Returns the range that runs from the smaller of the two lower end points to the
// larger of the two upper end points, i.e. a range covering both a and b.
inline snl_range merge(const snl_range& a, const snl_range& b)
{
    const double lo = std::min(a.lower, b.lower);
    const double hi = std::max(a.upper, b.upper);
    return snl_range(lo, hi);
}
// Returns the larger of the two lower end points minus the smaller of the two upper end
// points.  For ranges with lower <= upper this is the size of the gap between a and b
// when they are disjoint, and a value <= 0 when they touch or overlap.
inline double distance (const snl_range& a, const snl_range& b)
{
    const double rightmost_lower = std::max(a.lower, b.lower);
    const double leftmost_upper = std::min(a.upper, b.upper);
    return rightmost_lower - leftmost_upper;
}
// Streams item as "[lower,upper]" and returns the stream so calls can be chained.
inline std::ostream& operator<< (std::ostream& out, const snl_range& item )
{
    return out << "[" << item.lower << "," << item.upper << "]";
}
// ----------------------------------------------------------------------------------------
inline std::vector<snl_range> segment_number_line (
const std::vector<double>& x,
const double max_range_width
)
{
DLIB_CASSERT(max_range_width >= 0);
// create initial ranges, one for each value in x. So initially, all the ranges have
// width of 0.
std::vector<snl_range> ranges;
for (auto v : x)
ranges.push_back(v);
std::sort(ranges.begin(), ranges.end());
std::vector<snl_range> greedy_final_ranges;
if (ranges.size() == 0)
return greedy_final_ranges;
// We will try two different clustering strategies. One that does a simple greedy left
// to right sweep and another that does a bottom up agglomerative clustering. This
// first loop runs the greedy left to right sweep. Then at the end of this routine we
// will return the results that produced the tightest clustering.
greedy_final_ranges.push_back(ranges[0]);
for (size_t i = 1; i < ranges.size(); ++i)
{
auto m = merge(greedy_final_ranges.back(), ranges[i]);
if (m.width() <= max_range_width)
greedy_final_ranges.back() = m;
else
greedy_final_ranges.push_back(ranges[i]);
}
// Here we do the bottom up clustering. So compute the edges connecting our ranges.
// We will simply say there are edges between ranges if and only if they are
// immediately adjacent on the number line.
std::vector<sample_pair> edges;
for (size_t i = 1; i < ranges.size(); ++i)
edges.push_back(sample_pair(i-1,i, distance(ranges[i-1],ranges[i])));
std::sort(edges.begin(), edges.end(), order_by_distance<sample_pair>);
disjoint_subsets sets;
sets.set_size(ranges.size());
// Now start merging nodes.
for (auto edge : edges)
{
// find the next best thing to merge.
unsigned long a = sets.find_set(edge.index1());
unsigned long b = sets.find_set(edge.index2());
// merge it if it doesn't result in an interval that's too big.
auto m = merge(ranges[a], ranges[b]);
if (m.width() <= max_range_width)
{
unsigned long news = sets.merge_sets(a,b);
ranges[news] = m;
}
}
// Now create a list of the final ranges. We will do this by keeping track of which
// range we already added to final_ranges.
std::vector<snl_range> final_ranges;
std::vector<bool> already_output(ranges.size(), false);
for (unsigned long i = 0; i < sets.size(); ++i)
{
auto s = sets.find_set(i);
if (!already_output[s])
{
final_ranges.push_back(ranges[s]);
already_output[s] = true;
}
}
// only use the greedy clusters if they found a clustering with fewer clusters.
// Otherwise, the bottom up clustering probably produced a more sensible clustering.
if (final_ranges.size() <= greedy_final_ranges.size())
return final_ranges;
else
return greedy_final_ranges;
}
// ----------------------------------------------------------------------------------------
}
#endif // DLIB_BOTTOM_uP_CLUSTER_Hh_