// Copyright (C) 2008 Davis E. King (davis@dlib.net), Steve Taylor // License: Boost Software License See LICENSE.txt for the full license. #ifndef DLIB_STATISTICs_ #define DLIB_STATISTICs_ #include "statistics_abstract.h" #include <limits> #include <cmath> #include "../algs.h" #include "../matrix.h" #include "../sparse_vector.h" namespace dlib { // ---------------------------------------------------------------------------------------- template < typename T > class running_stats { public: running_stats() { clear(); COMPILE_TIME_ASSERT (( is_same_type<float,T>::value || is_same_type<double,T>::value || is_same_type<long double,T>::value )); } void clear() { sum = 0; sum_sqr = 0; sum_cub = 0; sum_four = 0; n = 0; min_value = std::numeric_limits<T>::infinity(); max_value = -std::numeric_limits<T>::infinity(); } void add ( const T& val ) { sum += val; sum_sqr += val*val; sum_cub += cubed(val); sum_four += quaded(val); if (val < min_value) min_value = val; if (val > max_value) max_value = val; ++n; } T current_n ( ) const { return n; } T mean ( ) const { if (n != 0) return sum/n; else return 0; } T max ( ) const { // make sure requires clause is not broken DLIB_ASSERT(current_n() > 0, "\tT running_stats::max" << "\n\tyou have to add some numbers to this object first" << "\n\tthis: " << this ); return max_value; } T min ( ) const { // make sure requires clause is not broken DLIB_ASSERT(current_n() > 0, "\tT running_stats::min" << "\n\tyou have to add some numbers to this object first" << "\n\tthis: " << this ); return min_value; } T variance ( ) const { // make sure requires clause is not broken DLIB_ASSERT(current_n() > 1, "\tT running_stats::variance" << "\n\tyou have to add some numbers to this object first" << "\n\tthis: " << this ); T temp = 1/(n-1); temp = temp*(sum_sqr - sum*sum/n); // make sure the variance is never negative. This might // happen due to numerical errors. if (temp >= 0) return temp; else return 0; } T stddev ( ) const { // make sure requires clause is not broken DLIB_ASSERT(current_n() > 1, "\tT running_stats::stddev" << "\n\tyou have to add some numbers to this object first" << "\n\tthis: " << this ); return std::sqrt(variance()); } T skewness ( ) const { // make sure requires clause is not broken DLIB_ASSERT(current_n() > 2, "\tT running_stats::skewness" << "\n\tyou have to add some numbers to this object first" << "\n\tthis: " << this ); T temp = 1/n; T temp1 = std::sqrt(n*(n-1))/(n-2); temp = temp1*temp*(sum_cub - 3*sum_sqr*sum*temp + 2*cubed(sum)*temp*temp)/ (std::sqrt(std::pow(temp*(sum_sqr-sum*sum*temp),3))); return temp; } T ex_kurtosis ( ) const { // make sure requires clause is not broken DLIB_ASSERT(current_n() > 3, "\tT running_stats::kurtosis" << "\n\tyou have to add some numbers to this object first" << "\n\tthis: " << this ); T temp = 1/n; T m4 = temp*(sum_four - 4*sum_cub*sum*temp+6*sum_sqr*sum*sum*temp*temp -3*quaded(sum)*cubed(temp)); T m2 = temp*(sum_sqr-sum*sum*temp); temp = (n-1)*((n+1)*m4/(m2*m2)-3*(n-1))/((n-2)*(n-3)); return temp; } T scale ( const T& val ) const { // make sure requires clause is not broken DLIB_ASSERT(current_n() > 1, "\tT running_stats::variance" << "\n\tyou have to add some numbers to this object first" << "\n\tthis: " << this ); return (val-mean())/std::sqrt(variance()); } running_stats operator+ ( const running_stats& rhs ) const { running_stats temp(*this); temp.sum += rhs.sum; temp.sum_sqr += rhs.sum_sqr; temp.sum_cub += rhs.sum_cub; temp.sum_four += rhs.sum_four; temp.n += rhs.n; temp.min_value = std::min(rhs.min_value, min_value); temp.max_value = std::max(rhs.max_value, max_value); return temp; } template <typename U> friend void serialize ( const running_stats<U>& item, std::ostream& out ); template <typename U> friend void deserialize ( running_stats<U>& item, std::istream& in ); private: T sum; T sum_sqr; T sum_cub; T sum_four; T n; T min_value; T max_value; T cubed (const T& val) const {return val*val*val; } T quaded (const T& val) const {return val*val*val*val; } }; template <typename T> void serialize ( const running_stats<T>& item, std::ostream& out ) { int version = 2; serialize(version, out); serialize(item.sum, out); serialize(item.sum_sqr, out); serialize(item.sum_cub, out); serialize(item.sum_four, out); serialize(item.n, out); serialize(item.min_value, out); serialize(item.max_value, out); } template <typename T> void deserialize ( running_stats<T>& item, std::istream& in ) { int version = 0; deserialize(version, in); if (version != 2) throw dlib::serialization_error("Unexpected version number found while deserializing dlib::running_stats object."); deserialize(item.sum, in); deserialize(item.sum_sqr, in); deserialize(item.sum_cub, in); deserialize(item.sum_four, in); deserialize(item.n, in); deserialize(item.min_value, in); deserialize(item.max_value, in); } // ---------------------------------------------------------------------------------------- template < typename T > class running_scalar_covariance { public: running_scalar_covariance() { clear(); COMPILE_TIME_ASSERT (( is_same_type<float,T>::value || is_same_type<double,T>::value || is_same_type<long double,T>::value )); } void clear() { sum_xy = 0; sum_x = 0; sum_y = 0; sum_xx = 0; sum_yy = 0; n = 0; } void add ( const T& x, const T& y ) { sum_xy += x*y; sum_xx += x*x; sum_yy += y*y; sum_x += x; sum_y += y; n += 1; } T current_n ( ) const { return n; } T mean_x ( ) const { if (n != 0) return sum_x/n; else return 0; } T mean_y ( ) const { if (n != 0) return sum_y/n; else return 0; } T covariance ( ) const { // make sure requires clause is not broken DLIB_ASSERT(current_n() > 1, "\tT running_scalar_covariance::covariance()" << "\n\tyou have to add some numbers to this object first" << "\n\tthis: " << this ); return 1/(n-1) * (sum_xy - sum_y*sum_x/n); } T correlation ( ) const { // make sure requires clause is not broken DLIB_ASSERT(current_n() > 1, "\tT running_scalar_covariance::correlation()" << "\n\tyou have to add some numbers to this object first" << "\n\tthis: " << this ); return covariance() / std::sqrt(variance_x()*variance_y()); } T variance_x ( ) const { // make sure requires clause is not broken DLIB_ASSERT(current_n() > 1, "\tT running_scalar_covariance::variance_x()" << "\n\tyou have to add some numbers to this object first" << "\n\tthis: " << this ); T temp = 1/(n-1) * (sum_xx - sum_x*sum_x/n); // make sure the variance is never negative. This might // happen due to numerical errors. if (temp >= 0) return temp; else return 0; } T variance_y ( ) const { // make sure requires clause is not broken DLIB_ASSERT(current_n() > 1, "\tT running_scalar_covariance::variance_y()" << "\n\tyou have to add some numbers to this object first" << "\n\tthis: " << this ); T temp = 1/(n-1) * (sum_yy - sum_y*sum_y/n); // make sure the variance is never negative. This might // happen due to numerical errors. if (temp >= 0) return temp; else return 0; } T stddev_x ( ) const { // make sure requires clause is not broken DLIB_ASSERT(current_n() > 1, "\tT running_scalar_covariance::stddev_x()" << "\n\tyou have to add some numbers to this object first" << "\n\tthis: " << this ); return std::sqrt(variance_x()); } T stddev_y ( ) const { // make sure requires clause is not broken DLIB_ASSERT(current_n() > 1, "\tT running_scalar_covariance::stddev_y()" << "\n\tyou have to add some numbers to this object first" << "\n\tthis: " << this ); return std::sqrt(variance_y()); } running_scalar_covariance operator+ ( const running_scalar_covariance& rhs ) const { running_scalar_covariance temp(rhs); temp.sum_xy += sum_xy; temp.sum_x += sum_x; temp.sum_y += sum_y; temp.sum_xx += sum_xx; temp.sum_yy += sum_yy; temp.n += n; return temp; } private: T sum_xy; T sum_x; T sum_y; T sum_xx; T sum_yy; T n; }; // ---------------------------------------------------------------------------------------- template < typename T > class running_scalar_covariance_decayed { public: explicit running_scalar_covariance_decayed( T decay_halflife = 1000 ) { DLIB_ASSERT(decay_halflife > 0); sum_xy = 0; sum_x = 0; sum_y = 0; sum_xx = 0; sum_yy = 0; forget = std::pow(0.5, 1/decay_halflife); n = 0; COMPILE_TIME_ASSERT (( is_same_type<float,T>::value || is_same_type<double,T>::value || is_same_type<long double,T>::value )); } T forget_factor ( ) const { return forget; } void add ( const T& x, const T& y ) { sum_xy = sum_xy*forget + x*y; sum_xx = sum_xx*forget + x*x; sum_yy = sum_yy*forget + y*y; sum_x = sum_x*forget + x; sum_y = sum_y*forget + y; n = n*forget + 1; } T current_n ( ) const { return n; } T mean_x ( ) const { if (n != 0) return sum_x/n; else return 0; } T mean_y ( ) const { if (n != 0) return sum_y/n; else return 0; } T covariance ( ) const { // make sure requires clause is not broken DLIB_ASSERT(current_n() > 1, "\tT running_scalar_covariance_decayed::covariance()" << "\n\tyou have to add some numbers to this object first" << "\n\tthis: " << this ); return 1/(n-1) * (sum_xy - sum_y*sum_x/n); } T correlation ( ) const { // make sure requires clause is not broken DLIB_ASSERT(current_n() > 1, "\tT running_scalar_covariance_decayed::correlation()" << "\n\tyou have to add some numbers to this object first" << "\n\tthis: " << this ); T temp = std::sqrt(variance_x()*variance_y()); if (temp != 0) return covariance() / temp; else return 0; // just say it's zero if there isn't any variance in x or y. } T variance_x ( ) const { // make sure requires clause is not broken DLIB_ASSERT(current_n() > 1, "\tT running_scalar_covariance_decayed::variance_x()" << "\n\tyou have to add some numbers to this object first" << "\n\tthis: " << this ); T temp = 1/(n-1) * (sum_xx - sum_x*sum_x/n); // make sure the variance is never negative. This might // happen due to numerical errors. if (temp >= 0) return temp; else return 0; } T variance_y ( ) const { // make sure requires clause is not broken DLIB_ASSERT(current_n() > 1, "\tT running_scalar_covariance_decayed::variance_y()" << "\n\tyou have to add some numbers to this object first" << "\n\tthis: " << this ); T temp = 1/(n-1) * (sum_yy - sum_y*sum_y/n); // make sure the variance is never negative. This might // happen due to numerical errors. if (temp >= 0) return temp; else return 0; } T stddev_x ( ) const { // make sure requires clause is not broken DLIB_ASSERT(current_n() > 1, "\tT running_scalar_covariance_decayed::stddev_x()" << "\n\tyou have to add some numbers to this object first" << "\n\tthis: " << this ); return std::sqrt(variance_x()); } T stddev_y ( ) const { // make sure requires clause is not broken DLIB_ASSERT(current_n() > 1, "\tT running_scalar_covariance_decayed::stddev_y()" << "\n\tyou have to add some numbers to this object first" << "\n\tthis: " << this ); return std::sqrt(variance_y()); } private: T sum_xy; T sum_x; T sum_y; T sum_xx; T sum_yy; T n; T forget; }; // ---------------------------------------------------------------------------------------- template < typename T > class running_stats_decayed { public: explicit running_stats_decayed( T decay_halflife = 1000 ) { DLIB_ASSERT(decay_halflife > 0); sum_x = 0; sum_xx = 0; forget = std::pow(0.5, 1/decay_halflife); n = 0; COMPILE_TIME_ASSERT (( is_same_type<float,T>::value || is_same_type<double,T>::value || is_same_type<long double,T>::value )); } T forget_factor ( ) const { return forget; } void add ( const T& x ) { sum_xx = sum_xx*forget + x*x; sum_x = sum_x*forget + x; n = n*forget + 1; } T current_n ( ) const { return n; } T mean ( ) const { if (n != 0) return sum_x/n; else return 0; } T variance ( ) const { // make sure requires clause is not broken DLIB_ASSERT(current_n() > 1, "\tT running_stats_decayed::variance()" << "\n\tyou have to add some numbers to this object first" << "\n\tthis: " << this ); T temp = 1/(n-1) * (sum_xx - sum_x*sum_x/n); // make sure the variance is never negative. This might // happen due to numerical errors. if (temp >= 0) return temp; else return 0; } T stddev ( ) const { // make sure requires clause is not broken DLIB_ASSERT(current_n() > 1, "\tT running_stats_decayed::stddev()" << "\n\tyou have to add some numbers to this object first" << "\n\tthis: " << this ); return std::sqrt(variance()); } template <typename U> friend void serialize ( const running_stats_decayed<U>& item, std::ostream& out ); template <typename U> friend void deserialize ( running_stats_decayed<U>& item, std::istream& in ); private: T sum_x; T sum_xx; T n; T forget; }; template <typename T> void serialize ( const running_stats_decayed<T>& item, std::ostream& out ) { int version = 1; serialize(version, out); serialize(item.sum_x, out); serialize(item.sum_xx, out); serialize(item.n, out); serialize(item.forget, out); } template <typename T> void deserialize ( running_stats_decayed<T>& item, std::istream& in ) { int version = 0; deserialize(version, in); if (version != 1) throw dlib::serialization_error("Unexpected version number found while deserializing dlib::running_stats_decayed object."); deserialize(item.sum_x, in); deserialize(item.sum_xx, in); deserialize(item.n, in); deserialize(item.forget, in); } // ---------------------------------------------------------------------------------------- template < typename T, typename alloc > double mean_sign_agreement ( const std::vector<T,alloc>& a, const std::vector<T,alloc>& b ) { // make sure requires clause is not broken DLIB_ASSERT(a.size() == b.size(), "\t double mean_sign_agreement(a,b)" << "\n\t a and b must be the same length." << "\n\t a.size(): " << a.size() << "\n\t b.size(): " << b.size() ); double temp = 0; for (unsigned long i = 0; i < a.size(); ++i) { if ((a[i] >= 0 && b[i] >= 0) || (a[i] < 0 && b[i] < 0)) { temp += 1; } } return temp/a.size(); } // ---------------------------------------------------------------------------------------- template < typename T, typename alloc > double correlation ( const std::vector<T,alloc>& a, const std::vector<T,alloc>& b ) { // make sure requires clause is not broken DLIB_ASSERT(a.size() == b.size() && a.size() > 1, "\t double correlation(a,b)" << "\n\t a and b must be the same length and have more than one element." << "\n\t a.size(): " << a.size() << "\n\t b.size(): " << b.size() ); running_scalar_covariance<double> rs; for (unsigned long i = 0; i < a.size(); ++i) { rs.add(a[i], b[i]); } return rs.correlation(); } // ---------------------------------------------------------------------------------------- template < typename T, typename alloc > double covariance ( const std::vector<T,alloc>& a, const std::vector<T,alloc>& b ) { // make sure requires clause is not broken DLIB_ASSERT(a.size() == b.size() && a.size() > 1, "\t double covariance(a,b)" << "\n\t a and b must be the same length and have more than one element." << "\n\t a.size(): " << a.size() << "\n\t b.size(): " << b.size() ); running_scalar_covariance<double> rs; for (unsigned long i = 0; i < a.size(); ++i) { rs.add(a[i], b[i]); } return rs.covariance(); } // ---------------------------------------------------------------------------------------- template < typename T, typename alloc > double r_squared ( const std::vector<T,alloc>& a, const std::vector<T,alloc>& b ) { // make sure requires clause is not broken DLIB_ASSERT(a.size() == b.size() && a.size() > 1, "\t double r_squared(a,b)" << "\n\t a and b must be the same length and have more than one element." << "\n\t a.size(): " << a.size() << "\n\t b.size(): " << b.size() ); return std::pow(correlation(a,b),2.0); } // ---------------------------------------------------------------------------------------- template < typename T, typename alloc > double mean_squared_error ( const std::vector<T,alloc>& a, const std::vector<T,alloc>& b ) { // make sure requires clause is not broken DLIB_ASSERT(a.size() == b.size(), "\t double mean_squared_error(a,b)" << "\n\t a and b must be the same length." << "\n\t a.size(): " << a.size() << "\n\t b.size(): " << b.size() ); return mean(squared(matrix_cast<double>(mat(a))-matrix_cast<double>(mat(b)))); } // ---------------------------------------------------------------------------------------- template < typename matrix_type > class running_covariance { /*! INITIAL VALUE - vect_size == 0 - total_count == 0 CONVENTION - vect_size == in_vector_size() - total_count == current_n() - if (total_count != 0) - total_sum == the sum of all vectors given to add() - the covariance of all the elements given to add() is given by: - let avg == total_sum/total_count - covariance == total_cov/total_count - avg*trans(avg) !*/ public: typedef typename matrix_type::mem_manager_type mem_manager_type; typedef typename matrix_type::type scalar_type; typedef typename matrix_type::layout_type layout_type; typedef matrix<scalar_type,0,0,mem_manager_type,layout_type> general_matrix; typedef matrix<scalar_type,0,1,mem_manager_type,layout_type> column_matrix; running_covariance( ) { clear(); } void clear( ) { total_count = 0; vect_size = 0; total_sum.set_size(0); total_cov.set_size(0,0); } long in_vector_size ( ) const { return vect_size; } long current_n ( ) const { return static_cast<long>(total_count); } void set_dimension ( long size ) { // make sure requires clause is not broken DLIB_ASSERT( size > 0, "\t void running_covariance::set_dimension()" << "\n\t Invalid inputs were given to this function" << "\n\t size: " << size << "\n\t this: " << this ); clear(); vect_size = size; total_sum.set_size(size); total_cov.set_size(size,size); total_sum = 0; total_cov = 0; } template <typename T> typename disable_if<is_matrix<T> >::type add ( const T& val ) { // make sure requires clause is not broken DLIB_ASSERT(((long)max_index_plus_one(val) <= in_vector_size() && in_vector_size() > 0), "\t void running_covariance::add()" << "\n\t Invalid inputs were given to this function" << "\n\t max_index_plus_one(val): " << max_index_plus_one(val) << "\n\t in_vector_size(): " << in_vector_size() << "\n\t this: " << this ); for (typename T::const_iterator i = val.begin(); i != val.end(); ++i) { total_sum(i->first) += i->second; for (typename T::const_iterator j = val.begin(); j != val.end(); ++j) { total_cov(i->first, j->first) += i->second*j->second; } } ++total_count; } template <typename T> typename enable_if<is_matrix<T> >::type add ( const T& val ) { // make sure requires clause is not broken DLIB_ASSERT(is_col_vector(val) && (in_vector_size() == 0 || val.size() == in_vector_size()), "\t void running_covariance::add()" << "\n\t Invalid inputs were given to this function" << "\n\t is_col_vector(val): " << is_col_vector(val) << "\n\t in_vector_size(): " << in_vector_size() << "\n\t val.size(): " << val.size() << "\n\t this: " << this ); vect_size = val.size(); if (total_count == 0) { total_cov = val*trans(val); total_sum = val; } else { total_cov += val*trans(val); total_sum += val; } ++total_count; } const column_matrix mean ( ) const { // make sure requires clause is not broken DLIB_ASSERT( in_vector_size() != 0, "\t running_covariance::mean()" << "\n\t This object can not execute this function in its current state." << "\n\t in_vector_size(): " << in_vector_size() << "\n\t current_n(): " << current_n() << "\n\t this: " << this ); return total_sum/total_count; } const general_matrix covariance ( ) const { // make sure requires clause is not broken DLIB_ASSERT( in_vector_size() != 0 && current_n() > 1, "\t running_covariance::covariance()" << "\n\t This object can not execute this function in its current state." << "\n\t in_vector_size(): " << in_vector_size() << "\n\t current_n(): " << current_n() << "\n\t this: " << this ); return (total_cov - total_sum*trans(total_sum)/total_count)/(total_count-1); } const running_covariance operator+ ( const running_covariance& item ) const { // make sure requires clause is not broken DLIB_ASSERT((in_vector_size() == 0 || item.in_vector_size() == 0 || in_vector_size() == item.in_vector_size()), "\t running_covariance running_covariance::operator+()" << "\n\t The two running_covariance objects being added must have compatible parameters" << "\n\t in_vector_size(): " << in_vector_size() << "\n\t item.in_vector_size(): " << item.in_vector_size() << "\n\t this: " << this ); running_covariance temp(item); // make sure we ignore empty matrices if (total_count != 0 && temp.total_count != 0) { temp.total_cov += total_cov; temp.total_sum += total_sum; temp.total_count += total_count; } else if (total_count != 0) { temp.total_cov = total_cov; temp.total_sum = total_sum; temp.total_count = total_count; } return temp; } private: general_matrix total_cov; column_matrix total_sum; scalar_type total_count; long vect_size; }; // ---------------------------------------------------------------------------------------- template < typename matrix_type > class running_cross_covariance { /*! INITIAL VALUE - x_vect_size == 0 - y_vect_size == 0 - total_count == 0 CONVENTION - x_vect_size == x_vector_size() - y_vect_size == y_vector_size() - total_count == current_n() - if (total_count != 0) - sum_x == the sum of all x vectors given to add() - sum_y == the sum of all y vectors given to add() - total_cov == sum of all x*trans(y) given to add() !*/ public: typedef typename matrix_type::mem_manager_type mem_manager_type; typedef typename matrix_type::type scalar_type; typedef typename matrix_type::layout_type layout_type; typedef matrix<scalar_type,0,0,mem_manager_type,layout_type> general_matrix; typedef matrix<scalar_type,0,1,mem_manager_type,layout_type> column_matrix; running_cross_covariance( ) { clear(); } void clear( ) { total_count = 0; x_vect_size = 0; y_vect_size = 0; sum_x.set_size(0); sum_y.set_size(0); total_cov.set_size(0,0); } long x_vector_size ( ) const { return x_vect_size; } long y_vector_size ( ) const { return y_vect_size; } void set_dimensions ( long x_size, long y_size ) { // make sure requires clause is not broken DLIB_ASSERT( x_size > 0 && y_size > 0, "\t void running_cross_covariance::set_dimensions()" << "\n\t Invalid inputs were given to this function" << "\n\t x_size: " << x_size << "\n\t y_size: " << y_size << "\n\t this: " << this ); clear(); x_vect_size = x_size; y_vect_size = y_size; sum_x.set_size(x_size); sum_y.set_size(y_size); total_cov.set_size(x_size,y_size); sum_x = 0; sum_y = 0; total_cov = 0; } long current_n ( ) const { return static_cast<long>(total_count); } template <typename T, typename U> typename enable_if_c<!is_matrix<T>::value && !is_matrix<U>::value>::type add ( const T& x, const U& y ) { // make sure requires clause is not broken DLIB_ASSERT( ((long)max_index_plus_one(x) <= x_vector_size() && x_vector_size() > 0) && ((long)max_index_plus_one(y) <= y_vector_size() && y_vector_size() > 0) , "\t void running_cross_covariance::add()" << "\n\t Invalid inputs were given to this function" << "\n\t max_index_plus_one(x): " << max_index_plus_one(x) << "\n\t max_index_plus_one(y): " << max_index_plus_one(y) << "\n\t x_vector_size(): " << x_vector_size() << "\n\t y_vector_size(): " << y_vector_size() << "\n\t this: " << this ); for (typename T::const_iterator i = x.begin(); i != x.end(); ++i) { sum_x(i->first) += i->second; for (typename U::const_iterator j = y.begin(); j != y.end(); ++j) { total_cov(i->first, j->first) += i->second*j->second; } } // do sum_y += y for (typename U::const_iterator j = y.begin(); j != y.end(); ++j) { sum_y(j->first) += j->second; } ++total_count; } template <typename T, typename U> typename enable_if_c<is_matrix<T>::value && !is_matrix<U>::value>::type add ( const T& x, const U& y ) { // make sure requires clause is not broken DLIB_ASSERT( (is_col_vector(x) && x.size() == x_vector_size() && x_vector_size() > 0) && ((long)max_index_plus_one(y) <= y_vector_size() && y_vector_size() > 0) , "\t void running_cross_covariance::add()" << "\n\t Invalid inputs were given to this function" << "\n\t is_col_vector(x): " << is_col_vector(x) << "\n\t x.size(): " << x.size() << "\n\t max_index_plus_one(y): " << max_index_plus_one(y) << "\n\t x_vector_size(): " << x_vector_size() << "\n\t y_vector_size(): " << y_vector_size() << "\n\t this: " << this ); sum_x += x; for (long i = 0; i < x.size(); ++i) { for (typename U::const_iterator j = y.begin(); j != y.end(); ++j) { total_cov(i, j->first) += x(i)*j->second; } } // do sum_y += y for (typename U::const_iterator j = y.begin(); j != y.end(); ++j) { sum_y(j->first) += j->second; } ++total_count; } template <typename T, typename U> typename enable_if_c<!is_matrix<T>::value && is_matrix<U>::value>::type add ( const T& x, const U& y ) { // make sure requires clause is not broken DLIB_ASSERT( ((long)max_index_plus_one(x) <= x_vector_size() && x_vector_size() > 0) && (is_col_vector(y) && y.size() == (long)y_vector_size() && y_vector_size() > 0) , "\t void running_cross_covariance::add()" << "\n\t Invalid inputs were given to this function" << "\n\t max_index_plus_one(x): " << max_index_plus_one(x) << "\n\t is_col_vector(y): " << is_col_vector(y) << "\n\t y.size(): " << y.size() << "\n\t x_vector_size(): " << x_vector_size() << "\n\t y_vector_size(): " << y_vector_size() << "\n\t this: " << this ); for (typename T::const_iterator i = x.begin(); i != x.end(); ++i) { sum_x(i->first) += i->second; for (long j = 0; j < y.size(); ++j) { total_cov(i->first, j) += i->second*y(j); } } sum_y += y; ++total_count; } template <typename T, typename U> typename enable_if_c<is_matrix<T>::value && is_matrix<U>::value>::type add ( const T& x, const U& y ) { // make sure requires clause is not broken DLIB_ASSERT(is_col_vector(x) && (x_vector_size() == 0 || x.size() == x_vector_size()) && is_col_vector(y) && (y_vector_size() == 0 || y.size() == y_vector_size()) && x.size() != 0 && y.size() != 0, "\t void running_cross_covariance::add()" << "\n\t Invalid inputs were given to this function" << "\n\t is_col_vector(x): " << is_col_vector(x) << "\n\t x_vector_size(): " << x_vector_size() << "\n\t x.size(): " << x.size() << "\n\t is_col_vector(y): " << is_col_vector(y) << "\n\t y_vector_size(): " << y_vector_size() << "\n\t y.size(): " << y.size() << "\n\t this: " << this ); x_vect_size = x.size(); y_vect_size = y.size(); if (total_count == 0) { total_cov = x*trans(y); sum_x = x; sum_y = y; } else { total_cov += x*trans(y); sum_x += x; sum_y += y; } ++total_count; } const column_matrix mean_x ( ) const { // make sure requires clause is not broken DLIB_ASSERT( current_n() != 0, "\t running_cross_covariance::mean()" << "\n\t This object can not execute this function in its current state." << "\n\t current_n(): " << current_n() << "\n\t this: " << this ); return sum_x/total_count; } const column_matrix mean_y ( ) const { // make sure requires clause is not broken DLIB_ASSERT( current_n() != 0, "\t running_cross_covariance::mean()" << "\n\t This object can not execute this function in its current state." << "\n\t current_n(): " << current_n() << "\n\t this: " << this ); return sum_y/total_count; } const general_matrix covariance_xy ( ) const { // make sure requires clause is not broken DLIB_ASSERT( current_n() > 1, "\t running_cross_covariance::covariance()" << "\n\t This object can not execute this function in its current state." << "\n\t x_vector_size(): " << x_vector_size() << "\n\t y_vector_size(): " << y_vector_size() << "\n\t current_n(): " << current_n() << "\n\t this: " << this ); return (total_cov - sum_x*trans(sum_y)/total_count)/(total_count-1); } const running_cross_covariance operator+ ( const running_cross_covariance& item ) const { // make sure requires clause is not broken DLIB_ASSERT((x_vector_size() == 0 || item.x_vector_size() == 0 || x_vector_size() == item.x_vector_size()) && (y_vector_size() == 0 || item.y_vector_size() == 0 || y_vector_size() == item.y_vector_size()), "\t running_cross_covariance running_cross_covariance::operator+()" << "\n\t The two running_cross_covariance objects being added must have compatible parameters" << "\n\t x_vector_size(): " << x_vector_size() << "\n\t item.x_vector_size(): " << item.x_vector_size() << "\n\t y_vector_size(): " << y_vector_size() << "\n\t item.y_vector_size(): " << item.y_vector_size() << "\n\t this: " << this ); running_cross_covariance temp(item); // make sure we ignore empty matrices if (total_count != 0 && temp.total_count != 0) { temp.total_cov += total_cov; temp.sum_x += sum_x; temp.sum_y += sum_y; temp.total_count += total_count; } else if (total_count != 0) { temp.total_cov = total_cov; temp.sum_x = sum_x; temp.sum_y = sum_y; temp.total_count = total_count; } return temp; } private: general_matrix total_cov; column_matrix sum_x; column_matrix sum_y; scalar_type total_count; long x_vect_size; long y_vect_size; }; // ---------------------------------------------------------------------------------------- template < typename matrix_type > class vector_normalizer { public: typedef typename matrix_type::mem_manager_type mem_manager_type; typedef typename matrix_type::type scalar_type; typedef matrix_type result_type; template <typename vector_type> void train ( const vector_type& samples ) { // make sure requires clause is not broken DLIB_ASSERT(samples.size() > 0, "\tvoid vector_normalizer::train()" << "\n\tyou have to give a nonempty set of samples to this function" << "\n\tthis: " << this ); m = mean(mat(samples)); sd = reciprocal(sqrt(variance(mat(samples)))); DLIB_ASSERT(is_finite(m), "Some of the input vectors to vector_normalizer::train() have infinite or NaN values"); } long in_vector_size ( ) const { return m.nr(); } long out_vector_size ( ) const { return m.nr(); } const matrix_type& means ( ) const { return m; } const matrix_type& std_devs ( ) const { return sd; } const result_type& operator() ( const matrix_type& x ) const { // make sure requires clause is not broken DLIB_ASSERT(x.nr() == in_vector_size() && x.nc() == 1, "\tmatrix vector_normalizer::operator()" << "\n\t you have given invalid arguments to this function" << "\n\t x.nr(): " << x.nr() << "\n\t in_vector_size(): " << in_vector_size() << "\n\t x.nc(): " << x.nc() << "\n\t this: " << this ); temp_out = pointwise_multiply(x-m, sd); return temp_out; } void swap ( vector_normalizer& item ) { m.swap(item.m); sd.swap(item.sd); temp_out.swap(item.temp_out); } template <typename mt> friend void deserialize ( vector_normalizer<mt>& item, std::istream& in ); template <typename mt> friend void serialize ( const vector_normalizer<mt>& item, std::ostream& out ); private: // ------------------- private data members ------------------- matrix_type m, sd; // This is just a temporary variable that doesn't contribute to the // state of this object. mutable matrix_type temp_out; }; // ---------------------------------------------------------------------------------------- template < typename matrix_type > inline void swap ( vector_normalizer<matrix_type>& a, vector_normalizer<matrix_type>& b ) { a.swap(b); } // ---------------------------------------------------------------------------------------- template < typename matrix_type > void deserialize ( vector_normalizer<matrix_type>& item, std::istream& in ) { deserialize(item.m, in); deserialize(item.sd, in); // Keep deserializing the pca matrix for backwards compatibility. matrix<double> pca; deserialize(pca, in); if (pca.size() != 0) throw serialization_error("Error deserializing object of type vector_normalizer\n" "It looks like a serialized vector_normalizer_pca was accidentally deserialized into \n" "a vector_normalizer object."); } // ---------------------------------------------------------------------------------------- template < typename matrix_type > void serialize ( const vector_normalizer<matrix_type>& item, std::ostream& out ) { serialize(item.m, out); serialize(item.sd, out); // Keep serializing the pca matrix for backwards compatibility. matrix<double> pca; serialize(pca, out); } // ---------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------- template < typename matrix_type > class vector_normalizer_pca { public: typedef typename matrix_type::mem_manager_type mem_manager_type; typedef typename matrix_type::type scalar_type; typedef matrix<scalar_type,0,1,mem_manager_type> result_type; template <typename vector_type> void train ( const vector_type& samples, const double eps = 0.99 ) { // You are getting an error here because you are trying to apply PCA // to a vector of fixed length. But PCA is going to try and do // dimensionality reduction so you can't use a vector with a fixed dimension. COMPILE_TIME_ASSERT(matrix_type::NR == 0); // make sure requires clause is not broken DLIB_ASSERT(samples.size() > 0, "\tvoid vector_normalizer_pca::train_pca()" << "\n\tyou have to give a nonempty set of samples to this function" << "\n\tthis: " << this ); DLIB_ASSERT(0 < eps && eps <= 1, "\tvoid vector_normalizer_pca::train_pca()" << "\n\tyou have to give a nonempty set of samples to this function" << "\n\tthis: " << this ); train_pca_impl(mat(samples),eps); DLIB_ASSERT(is_finite(m), "Some of the input vectors to vector_normalizer_pca::train() have infinite or NaN values"); } long in_vector_size ( ) const { return m.nr(); } long out_vector_size ( ) const { return pca.nr(); } const matrix<scalar_type,0,1,mem_manager_type>& means ( ) const { return m; } const matrix<scalar_type,0,1,mem_manager_type>& std_devs ( ) const { return sd; } const matrix<scalar_type,0,0,mem_manager_type>& pca_matrix ( ) const { return pca; } const result_type& operator() ( const matrix_type& x ) const { // make sure requires clause is not broken DLIB_ASSERT(x.nr() == in_vector_size() && x.nc() == 1, "\tmatrix vector_normalizer_pca::operator()" << "\n\t you have given invalid arguments to this function" << "\n\t x.nr(): " << x.nr() << "\n\t in_vector_size(): " << in_vector_size() << "\n\t x.nc(): " << x.nc() << "\n\t this: " << this ); // If we have a pca transform matrix on hand then // also apply that. temp_out = pca*pointwise_multiply(x-m, sd); return temp_out; } void swap ( vector_normalizer_pca& item ) { m.swap(item.m); sd.swap(item.sd); pca.swap(item.pca); temp_out.swap(item.temp_out); } template <typename T> friend void deserialize ( vector_normalizer_pca<T>& item, std::istream& in ); template <typename T> friend void serialize ( const vector_normalizer_pca<T>& item, std::ostream& out ); private: template <typename mat_type> void train_pca_impl ( const mat_type& samples, const double eps ) { m = mean(samples); sd = reciprocal(sqrt(variance(samples))); // fill x with the normalized version of the input samples matrix<typename mat_type::type,0,1,mem_manager_type> x(samples); for (long r = 0; r < x.size(); ++r) x(r) = pointwise_multiply(x(r)-m, sd); matrix<scalar_type,0,0,mem_manager_type> temp, eigen; matrix<scalar_type,0,1,mem_manager_type> eigenvalues; // Compute the svd of the covariance matrix of the normalized inputs svd(covariance(x), temp, eigen, pca); eigenvalues = diag(eigen); rsort_columns(pca, eigenvalues); // figure out how many eigenvectors we want in our pca matrix const double thresh = sum(eigenvalues)*eps; long num_vectors = 0; double total = 0; for (long r = 0; r < eigenvalues.size() && total < thresh; ++r) { ++num_vectors; total += eigenvalues(r); } // So now we know we want to use num_vectors of the first eigenvectors. So // pull those out and discard the rest. pca = trans(colm(pca,range(0,num_vectors-1))); // Apply the pca transform to the data in x. Then we will normalize the // pca matrix below. for (long r = 0; r < x.nr(); ++r) { x(r) = pca*x(r); } // Here we just scale the output features from the pca transform so // that the variance of each feature is 1. So this doesn't really change // what the pca is doing, it just makes sure the output features are // normalized. pca = trans(scale_columns(trans(pca), reciprocal(sqrt(variance(x))))); } // ------------------- private data members ------------------- matrix<scalar_type,0,1,mem_manager_type> m, sd; matrix<scalar_type,0,0,mem_manager_type> pca; // This is just a temporary variable that doesn't contribute to the // state of this object. mutable result_type temp_out; }; template < typename matrix_type > inline void swap ( vector_normalizer_pca<matrix_type>& a, vector_normalizer_pca<matrix_type>& b ) { a.swap(b); } // ---------------------------------------------------------------------------------------- template < typename matrix_type > void deserialize ( vector_normalizer_pca<matrix_type>& item, std::istream& in ) { deserialize(item.m, in); deserialize(item.sd, in); deserialize(item.pca, in); if (item.pca.nc() != item.m.nr()) throw serialization_error("Error deserializing object of type vector_normalizer_pca\n" "It looks like a serialized vector_normalizer was accidentally deserialized into \n" "a vector_normalizer_pca object."); } template < typename matrix_type > void serialize ( const vector_normalizer_pca<matrix_type>& item, std::ostream& out ) { serialize(item.m, out); serialize(item.sd, out); serialize(item.pca, out); } // ---------------------------------------------------------------------------------------- inline double binomial_random_vars_are_different ( uint64_t k1, uint64_t n1, uint64_t k2, uint64_t n2 ) { DLIB_ASSERT(k1 <= n1, "k1: "<< k1 << " n1: "<< n1); DLIB_ASSERT(k2 <= n2, "k2: "<< k2 << " n2: "<< n2); const double p1 = k1/(double)n1; const double p2 = k2/(double)n2; const double p = (k1+k2)/(double)(n1+n2); auto ll = [](double p, uint64_t k, uint64_t n) { if (p == 0 || p == 1) return 0.0; return k*std::log(p) + (n-k)*std::log(1-p); }; auto logll = ll(p1,k1,n1) + ll(p2,k2,n2) - ll(p,k1,n1) - ll(p,k2,n2); // The basic statistic only tells you if the random variables are different. But // it's nice to know which way they are different, i.e., which one is bigger. So // stuff that information into the sign bit of the return value. if (p1>=p2) return logll; else return -logll; } // ---------------------------------------------------------------------------------------- inline double event_correlation ( uint64_t A_count, uint64_t B_count, uint64_t AB_count, uint64_t total_num_observations ) { DLIB_ASSERT(AB_count <= A_count && A_count <= total_num_observations, "AB_count: " << AB_count << ", A_count: "<< A_count << ", total_num_observations: " << total_num_observations); DLIB_ASSERT(AB_count <= B_count && B_count <= total_num_observations, "AB_count: " << AB_count << ", B_count: "<< B_count << ", total_num_observations: " << total_num_observations); if (total_num_observations == 0) return 0; DLIB_ASSERT(A_count + B_count - AB_count <= total_num_observations, "AB_count: " << AB_count << " A_count: " << A_count << ", B_count: "<< B_count << ", total_num_observations: " << total_num_observations); const auto AnotB_count = A_count - AB_count; const auto notB_count = total_num_observations - B_count; // How likely is it that the odds of A happening is different when conditioned on // whether or not B happened? return binomial_random_vars_are_different( AB_count, B_count, // A conditional on the presence of B AnotB_count, notB_count // A conditional on the absence of B ); } // ---------------------------------------------------------------------------------------- } #endif // DLIB_STATISTICs_