Spaces:

AshanGimhana
/

Aging_MouthReplace

Paused

App Files Files Community

Aging_MouthReplace / dlibs /dlib /clustering /bottom_up_cluster.h

AshanGimhana

Upload folder using huggingface_hub

9375c9a verified 3 months ago

raw

history blame

8.93 kB

	// Copyright (C) 2015 Davis E. King (davis@dlib.net)
	// License: Boost Software License See LICENSE.txt for the full license.
	#ifndef DLIB_BOTTOM_uP_CLUSTER_Hh_
	#define DLIB_BOTTOM_uP_CLUSTER_Hh_

	#include <queue>
	#include <map>

	#include "bottom_up_cluster_abstract.h"
	#include "../algs.h"
	#include "../matrix.h"
	#include "../disjoint_subsets.h"
	#include "../graph_utils.h"


	namespace dlib
	{

	// ----------------------------------------------------------------------------------------

	namespace buc_impl
	{
	inline void merge_sets (
	matrix<double>& dists,
	unsigned long dest,
	unsigned long src
	)
	{
	for (long r = 0; r < dists.nr(); ++r)
	dists(dest,r) = dists(r,dest) = std::max(dists(r,dest), dists(r,src));
	}

	struct compare_dist
	{
	bool operator() (
	const sample_pair& a,
	const sample_pair& b
	) const
	{
	return a.distance() > b.distance();
	}
	};
	}

	// ----------------------------------------------------------------------------------------

	template <
	typename EXP
	>
	unsigned long bottom_up_cluster (
	const matrix_exp<EXP>& dists_,
	std::vector<unsigned long>& labels,
	unsigned long min_num_clusters,
	double max_dist = std::numeric_limits<double>::infinity()
	)
	{
	matrix<double> dists = matrix_cast<double>(dists_);
	// make sure requires clause is not broken
	DLIB_CASSERT(dists.nr() == dists.nc() && min_num_clusters > 0,
	"\t unsigned long bottom_up_cluster()"
	<< "\n\t Invalid inputs were given to this function."
	<< "\n\t dists.nr(): " << dists.nr()
	<< "\n\t dists.nc(): " << dists.nc()
	<< "\n\t min_num_clusters: " << min_num_clusters
	);

	using namespace buc_impl;

	labels.resize(dists.nr());
	disjoint_subsets sets;
	sets.set_size(dists.nr());
	if (labels.size() == 0)
	return 0;

	// push all the edges in the graph into a priority queue so the best edges to merge
	// come first.
	std::priority_queue<sample_pair, std::vector<sample_pair>, compare_dist> que;
	for (long r = 0; r < dists.nr(); ++r)
	for (long c = r+1; c < dists.nc(); ++c)
	que.push(sample_pair(r,c,dists(r,c)));

	// Now start merging nodes.
	for (unsigned long iter = min_num_clusters; iter < sets.size(); ++iter)
	{
	// find the next best thing to merge.
	double best_dist = que.top().distance();
	unsigned long a = sets.find_set(que.top().index1());
	unsigned long b = sets.find_set(que.top().index2());
	que.pop();
	// we have been merging and modifying the distances, so make sure this distance
	// is still valid and these guys haven't been merged already.
	while(a == b \|\| best_dist < dists(a,b))
	{
	// Haven't merged it yet, so put it back in with updated distance for
	// reconsideration later.
	if (a != b)
	que.push(sample_pair(a, b, dists(a, b)));

	best_dist = que.top().distance();
	a = sets.find_set(que.top().index1());
	b = sets.find_set(que.top().index2());
	que.pop();
	}


	// now merge these sets if the best distance is small enough
	if (best_dist > max_dist)
	break;
	unsigned long news = sets.merge_sets(a,b);
	unsigned long olds = (news==a)?b:a;
	merge_sets(dists, news, olds);
	}

	// figure out which cluster each element is in. Also make sure the labels are
	// contiguous.
	std::map<unsigned long, unsigned long> relabel;
	for (unsigned long r = 0; r < labels.size(); ++r)
	{
	unsigned long l = sets.find_set(r);
	// relabel to make contiguous
	if (relabel.count(l) == 0)
	{
	unsigned long next = relabel.size();
	relabel[l] = next;
	}
	labels[r] = relabel[l];
	}


	return relabel.size();
	}

	// ----------------------------------------------------------------------------------------
	// ----------------------------------------------------------------------------------------

	struct snl_range
	{
	snl_range() = default;
	snl_range(double val) : lower(val), upper(val) {}
	snl_range(double l, double u) : lower(l), upper(u) { DLIB_ASSERT(lower <= upper)}

	double lower = 0;
	double upper = 0;

	double width() const { return upper-lower; }
	bool operator<(const snl_range& item) const { return lower < item.lower; }
	};

	inline snl_range merge(const snl_range& a, const snl_range& b)
	{
	return snl_range(std::min(a.lower, b.lower), std::max(a.upper, b.upper));
	}

	inline double distance (const snl_range& a, const snl_range& b)
	{
	return std::max(a.lower,b.lower) - std::min(a.upper,b.upper);
	}

	inline std::ostream& operator<< (std::ostream& out, const snl_range& item )
	{
	out << "["<<item.lower<<","<<item.upper<<"]";
	return out;
	}

	// ----------------------------------------------------------------------------------------

	inline std::vector<snl_range> segment_number_line (
	const std::vector<double>& x,
	const double max_range_width
	)
	{
	DLIB_CASSERT(max_range_width >= 0);

	// create initial ranges, one for each value in x. So initially, all the ranges have
	// width of 0.
	std::vector<snl_range> ranges;
	for (auto v : x)
	ranges.push_back(v);
	std::sort(ranges.begin(), ranges.end());

	std::vector<snl_range> greedy_final_ranges;
	if (ranges.size() == 0)
	return greedy_final_ranges;
	// We will try two different clustering strategies. One that does a simple greedy left
	// to right sweep and another that does a bottom up agglomerative clustering. This
	// first loop runs the greedy left to right sweep. Then at the end of this routine we
	// will return the results that produced the tightest clustering.
	greedy_final_ranges.push_back(ranges[0]);
	for (size_t i = 1; i < ranges.size(); ++i)
	{
	auto m = merge(greedy_final_ranges.back(), ranges[i]);
	if (m.width() <= max_range_width)
	greedy_final_ranges.back() = m;
	else
	greedy_final_ranges.push_back(ranges[i]);
	}


	// Here we do the bottom up clustering. So compute the edges connecting our ranges.
	// We will simply say there are edges between ranges if and only if they are
	// immediately adjacent on the number line.
	std::vector<sample_pair> edges;
	for (size_t i = 1; i < ranges.size(); ++i)
	edges.push_back(sample_pair(i-1,i, distance(ranges[i-1],ranges[i])));
	std::sort(edges.begin(), edges.end(), order_by_distance<sample_pair>);

	disjoint_subsets sets;
	sets.set_size(ranges.size());

	// Now start merging nodes.
	for (auto edge : edges)
	{
	// find the next best thing to merge.
	unsigned long a = sets.find_set(edge.index1());
	unsigned long b = sets.find_set(edge.index2());

	// merge it if it doesn't result in an interval that's too big.
	auto m = merge(ranges[a], ranges[b]);
	if (m.width() <= max_range_width)
	{
	unsigned long news = sets.merge_sets(a,b);
	ranges[news] = m;
	}
	}

	// Now create a list of the final ranges. We will do this by keeping track of which
	// range we already added to final_ranges.
	std::vector<snl_range> final_ranges;
	std::vector<bool> already_output(ranges.size(), false);
	for (unsigned long i = 0; i < sets.size(); ++i)
	{
	auto s = sets.find_set(i);
	if (!already_output[s])
	{
	final_ranges.push_back(ranges[s]);
	already_output[s] = true;
	}
	}

	// only use the greedy clusters if they found a clustering with fewer clusters.
	// Otherwise, the bottom up clustering probably produced a more sensible clustering.
	if (final_ranges.size() <= greedy_final_ranges.size())
	return final_ranges;
	else
	return greedy_final_ranges;
	}

	// ----------------------------------------------------------------------------------------

	}

	#endif // DLIB_BOTTOM_uP_CLUSTER_Hh_