Spaces:
No application file
No application file
/* The C clustering library. | |
* Copyright (C) 2002 Michiel Jan Laurens de Hoon. | |
* | |
* This library was written at the Laboratory of DNA Information Analysis, | |
* Human Genome Center, Institute of Medical Science, University of Tokyo, | |
* 4-6-1 Shirokanedai, Minato-ku, Tokyo 108-8639, Japan. | |
* Contact: michiel.dehoon 'AT' riken.jp | |
* | |
* Permission to use, copy, modify, and distribute this software and its | |
* documentation with or without modifications and for any purpose and | |
* without fee is hereby granted, provided that any copyright notices | |
* appear in all copies and that both those copyright notices and this | |
* permission notice appear in supporting documentation, and that the | |
* names of the contributors or copyright holders not be used in | |
* advertising or publicity pertaining to distribution of the software | |
* without specific prior permission. | |
* | |
* THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL | |
* WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED | |
* WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE | |
* CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT | |
* OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS | |
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE | |
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE | |
* OR PERFORMANCE OF THIS SOFTWARE. | |
* | |
*/ | |
/* ************************************************************************ */ | |
/* SORTING FUNCTIONS */ | |
/* | |
* C qsort() is very slow, much slower than C++ std::sort(). | |
* This is because qsort() doesn't utilize data-type information at compile time, | |
* and it has redundant pointer dereference since it requires a compare function. | |
* For projects that use old C, it's impossible to convert to C++/newer C. | |
* | |
* So we implement a simple quicksort that is ~~25% faster than std::sort() | |
* with mostly random data, and much faster with structured/sorted data | |
*/ | |
static const int INF = INT_MAX; // 2^31 - 1 | |
static int TEMP_SWAP_INT; | |
/* For quicksort, we need to choose a random pivot. Any random function should work. Even bad ones. */ | |
static int | |
cheap_random() | |
{ | |
const int base = 2 * 100 * 1000 * 1000 + 33; | |
static int seed = 0; | |
seed = seed * 7 + 13; | |
if (seed > base) seed %= base; | |
return seed; | |
} | |
static inline int | |
median_index_of3_index(const double arr[], int index[], const int a, const int b, const int c) | |
{ | |
if (arr[index[a]] < arr[index[b]]) { | |
if (arr[index[b]] < arr[index[c]]) return b; | |
else if (arr[index[a]] < arr[index[c]]) return c; | |
else return a; | |
} | |
else { | |
if (arr[index[a]] < arr[index[c]]) return a; | |
else if (arr[index[b]] < arr[index[c]]) return c; | |
else return b; | |
} | |
} | |
/* Insertion sort is best when the array is small. */ | |
static void | |
insertion_sort_index(const double a[], int index[], int l, int r) | |
{ | |
int i, j, current_index; | |
double value; | |
if (r <= l) return; | |
i = l; j = r; | |
value = a[index[(l + r) >> 1]]; | |
while (i <= j) { | |
while (a[index[i]] < value) i++; | |
while (a[index[j]] > value) j--; | |
if (i <= j) { | |
swap_int(index[i], index[j]); | |
i++; | |
j--; | |
} | |
} | |
for (i = l + 1; i <= r; i++) { | |
j = i - 1; | |
value = a[index[i]]; | |
current_index = index[i]; | |
while (j >= l && a[index[j]] > value) { | |
index[j + 1] = index[j]; | |
j--; | |
} | |
index[j + 1] = current_index; | |
} | |
} | |
//*************** | |
static void | |
fastsort_partition_index(const double a[], int index[], const int left, const int right, int* first_end_ptr, int* second_start_ptr) { | |
int low, high, i, pivot, mid; | |
double value; | |
int increasing = 1, decreasing = 1; | |
/*******/ | |
/* choose a random way to choose pivot, to prevent all possible worst-cases*/ | |
if ((right - left) & 1) pivot = left + cheap_random() % (right - left); | |
else pivot = median_index_of3_index(a, index, left, (left + right) >> 1, right); | |
value = a[index[pivot]]; | |
/*******/ | |
/* Skip through smaller values on left and larger values on right*/ | |
low = left; high = right; | |
while (a[index[low]] < value) { | |
low++; | |
decreasing = 0; | |
if (a[index[low]] < a[index[low - 1]]) increasing = 0; | |
} | |
while (a[index[high]] > value) { | |
high--; | |
decreasing = 0; | |
if (a[index[high]] > a[index[high + 1]]) increasing = 0; | |
} | |
increasing &= a[index[high]] >= a[index[low]]; | |
decreasing &= a[index[high]] <= a[index[low]]; | |
/*******/ | |
/* Resolve degenerate input cases */ | |
if (increasing) { | |
if ((right - left) & 1) { | |
for (i = low + 1; i <= high; i++) if (a[index[i]] < a[index[i - 1]]) { | |
increasing = 0; | |
break; | |
} | |
} | |
else { | |
for (i = high; i >= low + 1; i--) if (a[index[i]] < a[index[i - 1]]) { | |
increasing = 0; | |
break; | |
} | |
} | |
if (increasing) { /* sorted */ | |
*first_end_ptr = INF; | |
return; | |
} | |
} | |
if (decreasing) { | |
if ((right - left) & 1) { | |
for (i = low + 1; i <= high; i++) if (a[index[i]] > a[index[i - 1]]) { | |
decreasing = 0; | |
break; | |
} | |
} | |
else { | |
for (i = high; i >= low + 1; i--) if (a[index[i]] > a[index[i - 1]]) { | |
decreasing = 0; | |
break; | |
} | |
} | |
if (decreasing) { | |
mid = (right - left + 1) >> 1; | |
for (i = 0; i < mid; i++) swap_int(index[left + i], index[right - i]); | |
*first_end_ptr = INF; | |
return; | |
} | |
} | |
/******/ | |
while (low <= high) { | |
while (a[index[low]] < value) low++; | |
while (a[index[high]] > value) high--; | |
if (low <= high) { | |
swap_int(index[low], index[high]); | |
low++; | |
high--; | |
} | |
} | |
*first_end_ptr = high; | |
*second_start_ptr = low; | |
} | |
//*************** | |
static void | |
fastsort_recursive_index(const double a[], int index[], int l, int r) | |
{ | |
int first_end, second_start; | |
while (l < r) { | |
if (r - l <= 70) { /* determined through experiments and benchmarks, not randomly. 70-150 works fine on random/mixed (hard) data */ | |
insertion_sort_index(a, index, l, r); | |
return; | |
} | |
fastsort_partition_index(a, index, l, r, &first_end, &second_start); | |
if (first_end == INF) return; /* sorted */ | |
/* Recurse into smaller branch to avoid stack overflow */ | |
if (first_end - l < r - second_start) { | |
fastsort_recursive_index(a, index, l, first_end); | |
l = second_start; | |
} | |
else { | |
fastsort_recursive_index(a, index, second_start, r); | |
r = first_end; | |
} | |
} | |
} | |
/* ************************************************************************ */ | |
double | |
mean(int n, double a[]) | |
/* | |
Add 4 elements at once instead of 1. The advantages are: | |
1. less loop overhead | |
2. compile with -O2 -> use SSE/AVX. | |
3. without AVX, still faster because 4 independent additions -> parallel instruction possible | |
4. smaller floating-point error | |
*/ | |
{ | |
double result = 0.; | |
int i; | |
double sum[4] = {0., 0., 0., 0.}; | |
int nstep4 = n - n % 4; | |
for (i = 0; i < nstep4; i += 4) { | |
sum[0] += a[i]; | |
sum[1] += a[i + 1]; | |
sum[2] += a[i + 2]; | |
sum[3] += a[i + 3]; | |
} | |
for (i = nstep4; i < n; i++) result += a[i]; | |
result += (sum[0] + sum[1]) + (sum[2] + sum[3]); | |
return result / n; | |
} | |
/* ************************************************************************ */ | |
double | |
median(int n, double x[]) | |
/* | |
Find the median of X(1), ... , X(N), using as much of the quicksort | |
algorithm as is needed to isolate it. | |
N.B. On exit, the array X is partially ordered. | |
Based on Alan J. Miller's median.f90 routine. | |
*/ | |
{ | |
int i, j; | |
int nr = n / 2; | |
int nl = nr - 1; | |
int even = 0; | |
/* hi & lo are position limits encompassing the median. */ | |
int lo = 0; | |
int hi = n-1; | |
if (n == 2*nr) even = 1; | |
if (n < 3) { | |
if (n < 1) return 0.; | |
if (n == 1) return x[0]; | |
return 0.5*(x[0]+x[1]); | |
} | |
/* Find median of 1st, middle & last values. */ | |
do { | |
int loop; | |
int mid = (lo + hi)/2; | |
double result = x[mid]; | |
double xlo = x[lo]; | |
double xhi = x[hi]; | |
if (xhi<xlo) { | |
double temp = xlo; | |
xlo = xhi; | |
xhi = temp; | |
} | |
if (result>xhi) result = xhi; | |
else if (result<xlo) result = xlo; | |
/* The basic quicksort algorithm to move all values <= the sort key | |
* (XMED) to the left-hand end, and all higher values to the other end. | |
*/ | |
i = lo; | |
j = hi; | |
do { | |
while (x[i]<result) i++; | |
while (x[j]>result) j--; | |
loop = 0; | |
if (i<j) { | |
double temp = x[i]; | |
x[i] = x[j]; | |
x[j] = temp; | |
i++; | |
j--; | |
if (i<=j) loop = 1; | |
} | |
} while (loop); /* Decide which half the median is in. */ | |
if (even) { | |
if (j == nl && i == nr) { | |
/* Special case, n even, j = n/2 & i = j + 1, so the median is | |
* between the two halves of the series. Find max. of the first | |
* half & min. of the second half, then average. | |
*/ | |
int k; | |
double xmax = x[0]; | |
double xmin = x[n-1]; | |
for (k = lo; k <= j; k++) xmax = max(xmax, x[k]); | |
for (k = i; k <= hi; k++) xmin = min(xmin, x[k]); | |
return 0.5*(xmin + xmax); | |
} | |
if (j<nl) lo = i; | |
if (i>nr) hi = j; | |
if (i == j) { | |
if (i == nl) lo = nl; | |
if (j == nr) hi = nr; | |
} | |
} | |
else { | |
if (j<nr) lo = i; | |
if (i>nr) hi = j; | |
/* Test whether median has been isolated. */ | |
if (i == j && i == nr) return result; | |
} | |
} | |
while (lo<hi-1); | |
if (even) return (0.5*(x[nl]+x[nr])); | |
if (x[lo]>x[hi]) { | |
double temp = x[lo]; | |
x[lo] = x[hi]; | |
x[hi] = temp; | |
} | |
return x[nr]; | |
} | |
/* ********************************************************************** */ | |
void | |
sort_index(int n, const double data[], int index[]) | |
/* Sets up an index table given the data, such that data[index[]] is in | |
* increasing order. Sorting is done on the indices; the array data | |
* is unchanged. | |
*/ | |
{ | |
int i; | |
for (i = 0; i < n; i++) index[i] = i; | |
fastsort_recursive_index(data, index, 0, n - 1); | |
} | |
/* ********************************************************************** */ | |
static double* | |
getrank(int n, const double data[], const double weight[]) | |
/* Calculates the ranks of the elements in the array data. Two elements with | |
* the same value get the same rank, equal to the average of the ranks had the | |
* elements different values. The ranks are returned as a newly allocated | |
* array that should be freed by the calling routine. If getrank fails due to | |
* a memory allocation error, it returns NULL. | |
*/ | |
{ | |
int i, j, k, l; | |
double* rank; | |
int* index; | |
double total = 0.0; | |
double subtotal; | |
double current; | |
double value; | |
rank = malloc(n*sizeof(double)); | |
if (!rank) return NULL; | |
index = malloc(n*sizeof(int)); | |
if (!index) { | |
free(rank); | |
return NULL; | |
} | |
/* Call sort to get an index table */ | |
sort_index(n, data, index); | |
/* Build a rank table */ | |
k = 0; | |
j = index[0]; | |
current = data[j]; | |
subtotal = weight[j]; | |
for (i = 1; i < n; i++) { | |
j = index[i]; | |
value = data[j]; | |
if (value != current) { | |
current = value; | |
value = total + (subtotal + 1.0) / 2.0; | |
for (l = k; l < i; l++) rank[index[l]] = value; | |
k = i; | |
total += subtotal; | |
subtotal = 0.0; | |
} | |
subtotal += weight[j]; | |
} | |
value = total + (subtotal + 1.0) / 2.0; | |
for (l = k; l < i; l++) rank[index[l]] = value; | |
free(index); | |
return rank; | |
} | |
/* ---------------------------------------------------------------------- */ | |
static int | |
makedatamask(int nrows, int ncols, double*** pdata, int*** pmask) | |
{ | |
int i; | |
double** data; | |
int** mask; | |
data = malloc(nrows*sizeof(double*)); | |
if (!data) return 0; | |
mask = malloc(nrows*sizeof(int*)); | |
if (!mask) { | |
free(data); | |
return 0; | |
} | |
for (i = 0; i < nrows; i++) { | |
data[i] = malloc(ncols*sizeof(double)); | |
if (!data[i]) break; | |
mask[i] = malloc(ncols*sizeof(int)); | |
if (!mask[i]) { | |
free(data[i]); | |
break; | |
} | |
} | |
if (i == nrows) { /* break not encountered */ | |
*pdata = data; | |
*pmask = mask; | |
return 1; | |
} | |
*pdata = NULL; | |
*pmask = NULL; | |
nrows = i; | |
for (i = 0; i < nrows; i++) { | |
free(data[i]); | |
free(mask[i]); | |
} | |
free(data); | |
free(mask); | |
return 0; | |
} | |
/* ---------------------------------------------------------------------- */ | |
static void | |
freedatamask(int n, double** data, int** mask) | |
{ | |
int i; | |
for (i = 0; i < n; i++) { | |
free(mask[i]); | |
free(data[i]); | |
} | |
free(mask); | |
free(data); | |
} | |
/* ---------------------------------------------------------------------- */ | |
static double | |
find_closest_pair(int n, double** distmatrix, int* ip, int* jp) | |
/* | |
This function searches the distance matrix to find the pair with the shortest | |
distance between them. The indices of the pair are returned in ip and jp; the | |
distance itself is returned by the function. | |
n (input) int | |
The number of elements in the distance matrix. | |
distmatrix (input) double** | |
A ragged array containing the distance matrix. The number of columns in each | |
row is one less than the row index. | |
ip (output) int* | |
A pointer to the integer that is to receive the first index of the pair with | |
the shortest distance. | |
jp (output) int* | |
A pointer to the integer that is to receive the second index of the pair with | |
the shortest distance. | |
*/ | |
{ | |
int i, j; | |
double temp; | |
double distance = distmatrix[1][0]; | |
*ip = 1; | |
*jp = 0; | |
for (i = 1; i < n; i++) { | |
for (j = 0; j < i; j++) { | |
temp = distmatrix[i][j]; | |
if (temp<distance) { | |
distance = temp; | |
*ip = i; | |
*jp = j; | |
} | |
} | |
} | |
return distance; | |
} | |
/* ********************************************************************* */ | |
static int | |
svd(int m, int n, double** u, double w[], double** vt) | |
/* | |
* This subroutine is a translation of the Algol procedure svd, described in | |
* G.H. Golub and C. Reinsch: | |
* "Singular Value Decomposition and Least Squares Solutions", | |
* published in | |
* Numerische Mathematik 14(5): page 403-420 (1970) | |
* and | |
* Handbook for Automatic Computation, Volume II: Linear Algebra, | |
* (John H. Wilkinson and C. Reinsch; edited by Friedrich L. Bauer and | |
* Alston S. Householder): page 134-151 (1971). | |
* t | |
* This subroutine determines the singular value decomposition A=usv of a | |
* real m by n rectangular matrix, where m is greater * than or equal to n. | |
* Householder bidiagonalization and a variant of the QR algorithm are used. | |
* | |
* On input: | |
* | |
* m is the number of rows of A (and u). | |
* | |
* n is the number of columns of A (and u) and the order of v. | |
* | |
* u contains the rectangular input matrix A to be decomposed. | |
* | |
* On output: | |
* | |
* the routine returns an integer ierr equal to | |
* 0: to indicate a normal return, | |
* k: if the k-th singular value has not been determined after 30 iterations, | |
* -1: if memory allocation fails. | |
* | |
* w contains the n (non-negative) singular values of a (the | |
* diagonal elements of s). they are unordered. | |
* If an error exit is made, the singular values should be correct for | |
* indices ierr+1, ierr+2, ..., n. | |
* | |
* u contains the matrix u (orthogonal column vectors) of the decomposition. | |
* If an error exit is made, the columns of u corresponding to indices of | |
* correct singular values should be correct. | |
* t | |
* vt contains the matrix v (orthogonal) of the decomposition. | |
* If an error exit is made, the columns of v corresponding to indices of | |
* correct singular values should be correct. | |
* | |
* | |
* Questions and comments should be directed to B. S. Garbow, | |
* Applied Mathematics division, Argonne National Laboratory | |
* | |
* Modified to eliminate machep | |
* | |
* Translated to C by Michiel de Hoon, Human Genome Center, | |
* University of Tokyo, for inclusion in the C Clustering Library. | |
* This routine is less general than the original svd routine, as | |
* it focuses on the singular value decomposition as needed for | |
* clustering. In particular, | |
* - We calculate both u and v in all cases | |
* - We pass the input array A via u; this array is subsequently overwritten. | |
* - We allocate for the array rv1, used as a working space, internally in | |
* this routine, instead of passing it as an argument. | |
* If the allocation fails, svd returns -1. | |
* 2003.06.05 | |
*/ | |
{ | |
int i, j, k, i1, k1, l1, its; | |
double c, f, h, s, x, y, z; | |
int l = 0; | |
int ierr = 0; | |
double g = 0.0; | |
double scale = 0.0; | |
double anorm = 0.0; | |
double* rv1; | |
rv1 = malloc(n*sizeof(double)); | |
if (!rv1) return -1; | |
if (m >= n) { | |
/* Householder reduction to bidiagonal form */ | |
for (i = 0; i < n; i++) { | |
l = i + 1; | |
rv1[i] = scale * g; | |
g = 0.0; | |
s = 0.0; | |
scale = 0.0; | |
for (k = i; k < m; k++) scale += fabs(u[k][i]); | |
if (scale != 0.0) { | |
for (k = i; k < m; k++) { | |
u[k][i] /= scale; | |
s += u[k][i]*u[k][i]; | |
} | |
f = u[i][i]; | |
g = (f >= 0) ? -sqrt(s) : sqrt(s); | |
h = f * g - s; | |
u[i][i] = f - g; | |
if (i < n-1) { | |
for (j = l; j < n; j++) { | |
s = 0.0; | |
for (k = i; k < m; k++) s += u[k][i] * u[k][j]; | |
f = s / h; | |
for (k = i; k < m; k++) u[k][j] += f * u[k][i]; | |
} | |
} | |
for (k = i; k < m; k++) u[k][i] *= scale; | |
} | |
w[i] = scale * g; | |
g = 0.0; | |
s = 0.0; | |
scale = 0.0; | |
if (i<n-1) { | |
for (k = l; k < n; k++) scale += fabs(u[i][k]); | |
if (scale != 0.0) { | |
for (k = l; k < n; k++) { | |
u[i][k] /= scale; | |
s += u[i][k] * u[i][k]; | |
} | |
f = u[i][l]; | |
g = (f >= 0) ? -sqrt(s) : sqrt(s); | |
h = f * g - s; | |
u[i][l] = f - g; | |
for (k = l; k < n; k++) rv1[k] = u[i][k] / h; | |
for (j = l; j < m; j++) { | |
s = 0.0; | |
for (k = l; k < n; k++) s += u[j][k] * u[i][k]; | |
for (k = l; k < n; k++) u[j][k] += s * rv1[k]; | |
} | |
for (k = l; k < n; k++) u[i][k] *= scale; | |
} | |
} | |
anorm = max(anorm, fabs(w[i])+fabs(rv1[i])); | |
} | |
/* accumulation of right-hand transformations */ | |
for (i = n-1; i >= 0; i--) { | |
if (i < n-1) { | |
if (g != 0.0) { | |
for (j = l; j < n; j++) vt[i][j] = (u[i][j] / u[i][l]) / g; | |
/* double division avoids possible underflow */ | |
for (j = l; j < n; j++) { | |
s = 0.0; | |
for (k = l; k < n; k++) s += u[i][k] * vt[j][k]; | |
for (k = l; k < n; k++) vt[j][k] += s * vt[i][k]; | |
} | |
} | |
} | |
for (j = l; j < n; j++) { | |
vt[j][i] = 0.0; | |
vt[i][j] = 0.0; | |
} | |
vt[i][i] = 1.0; | |
g = rv1[i]; | |
l = i; | |
} | |
/* accumulation of left-hand transformations */ | |
for (i = n-1; i >= 0; i--) { | |
l = i + 1; | |
g = w[i]; | |
if (i != n-1) | |
for (j = l; j < n; j++) u[i][j] = 0.0; | |
if (g != 0.0) { | |
if (i != n-1) { | |
for (j = l; j < n; j++) { | |
s = 0.0; | |
for (k = l; k < m; k++) s += u[k][i] * u[k][j]; | |
/* double division avoids possible underflow */ | |
f = (s / u[i][i]) / g; | |
for (k = i; k < m; k++) u[k][j] += f * u[k][i]; | |
} | |
} | |
for (j = i; j < m; j++) u[j][i] /= g; | |
} | |
else | |
for (j = i; j < m; j++) u[j][i] = 0.0; | |
u[i][i] += 1.0; | |
} | |
/* diagonalization of the bidiagonal form */ | |
for (k = n-1; k >= 0; k--) { | |
k1 = k-1; | |
its = 0; | |
while (1) { | |
/* test for splitting */ | |
for (l = k; l >= 0; l--) { | |
l1 = l-1; | |
if (fabs(rv1[l]) + anorm == anorm) break; | |
/* rv1[0] is always zero, so there is no exit | |
* through the bottom of the loop */ | |
if (fabs(w[l1]) + anorm == anorm) { | |
/* cancellation of rv1[l] if l greater than 0 */ | |
c = 0.0; | |
s = 1.0; | |
for (i = l; i <= k; i++) { | |
f = s * rv1[i]; | |
rv1[i] *= c; | |
if (fabs(f) + anorm == anorm) break; | |
g = w[i]; | |
h = sqrt(f*f+g*g); | |
w[i] = h; | |
c = g / h; | |
s = -f / h; | |
for (j = 0; j < m; j++) { | |
y = u[j][l1]; | |
z = u[j][i]; | |
u[j][l1] = y * c + z * s; | |
u[j][i] = -y * s + z * c; | |
} | |
} | |
break; | |
} | |
} | |
/* test for convergence */ | |
z = w[k]; | |
if (l == k) { /* convergence */ | |
if (z < 0.0) { | |
/* w[k] is made non-negative */ | |
w[k] = -z; | |
for (j = 0; j < n; j++) vt[k][j] = -vt[k][j]; | |
} | |
break; | |
} | |
else if (its == 30) { | |
ierr = k; | |
break; | |
} | |
else { | |
/* shift from bottom 2 by 2 minor */ | |
its++; | |
x = w[l]; | |
y = w[k1]; | |
g = rv1[k1]; | |
h = rv1[k]; | |
f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0*h*y); | |
g = sqrt(f*f+1.0); | |
f = ((x - z) * (x + z) | |
+ h * (y / (f + (f >= 0 ? g : -g)) - h)) / x; | |
/* next qr transformation */ | |
c = 1.0; | |
s = 1.0; | |
for (i1 = l; i1 <= k1; i1++) { | |
i = i1 + 1; | |
g = rv1[i]; | |
y = w[i]; | |
h = s * g; | |
g = c * g; | |
z = sqrt(f*f+h*h); | |
rv1[i1] = z; | |
c = f / z; | |
s = h / z; | |
f = x * c + g * s; | |
g = -x * s + g * c; | |
h = y * s; | |
y = y * c; | |
for (j = 0; j < n; j++) { | |
x = vt[i1][j]; | |
z = vt[i][j]; | |
vt[i1][j] = x * c + z * s; | |
vt[i][j] = -x * s + z * c; | |
} | |
z = sqrt(f*f+h*h); | |
w[i1] = z; | |
/* rotation can be arbitrary if z is zero */ | |
if (z != 0.0) { | |
c = f / z; | |
s = h / z; | |
} | |
f = c * g + s * y; | |
x = -s * g + c * y; | |
for (j = 0; j < m; j++) { | |
y = u[j][i1]; | |
z = u[j][i]; | |
u[j][i1] = y * c + z * s; | |
u[j][i] = -y * s + z * c; | |
} | |
} | |
rv1[l] = 0.0; | |
rv1[k] = f; | |
w[k] = x; | |
} | |
} | |
} | |
} | |
else /* m < n */ { | |
/* Householder reduction to bidiagonal form */ | |
for (i = 0; i < m; i++) { | |
l = i + 1; | |
rv1[i] = scale * g; | |
g = 0.0; | |
s = 0.0; | |
scale = 0.0; | |
for (k = i; k < n; k++) scale += fabs(u[i][k]); | |
if (scale != 0.0) { | |
for (k = i; k < n; k++) { | |
u[i][k] /= scale; | |
s += u[i][k]*u[i][k]; | |
} | |
f = u[i][i]; | |
g = (f >= 0) ? -sqrt(s) : sqrt(s); | |
h = f * g - s; | |
u[i][i] = f - g; | |
if (i < m-1) { | |
for (j = l; j < m; j++) { | |
s = 0.0; | |
for (k = i; k < n; k++) s += u[i][k] * u[j][k]; | |
f = s / h; | |
for (k = i; k < n; k++) u[j][k] += f * u[i][k]; | |
} | |
} | |
for (k = i; k < n; k++) u[i][k] *= scale; | |
} | |
w[i] = scale * g; | |
g = 0.0; | |
s = 0.0; | |
scale = 0.0; | |
if (i<m-1) { | |
for (k = l; k < m; k++) scale += fabs(u[k][i]); | |
if (scale != 0.0) { | |
for (k = l; k < m; k++) { | |
u[k][i] /= scale; | |
s += u[k][i] * u[k][i]; | |
} | |
f = u[l][i]; | |
g = (f >= 0) ? -sqrt(s) : sqrt(s); | |
h = f * g - s; | |
u[l][i] = f - g; | |
for (k = l; k < m; k++) rv1[k] = u[k][i] / h; | |
for (j = l; j < n; j++) { | |
s = 0.0; | |
for (k = l; k < m; k++) s += u[k][j] * u[k][i]; | |
for (k = l; k < m; k++) u[k][j] += s * rv1[k]; | |
} | |
for (k = l; k < m; k++) u[k][i] *= scale; | |
} | |
} | |
anorm = max(anorm, fabs(w[i])+fabs(rv1[i])); | |
} | |
/* accumulation of right-hand transformations */ | |
for (i = m-1; i >= 0; i--) { | |
if (i < m-1) { | |
if (g != 0.0) { | |
for (j = l; j < m; j++) vt[j][i] = (u[j][i] / u[l][i]) / g; | |
/* double division avoids possible underflow */ | |
for (j = l; j < m; j++) { | |
s = 0.0; | |
for (k = l; k < m; k++) s += u[k][i] * vt[k][j]; | |
for (k = l; k < m; k++) vt[k][j] += s * vt[k][i]; | |
} | |
} | |
} | |
for (j = l; j < m; j++) { | |
vt[i][j] = 0.0; | |
vt[j][i] = 0.0; | |
} | |
vt[i][i] = 1.0; | |
g = rv1[i]; | |
l = i; | |
} | |
/* accumulation of left-hand transformations */ | |
for (i = m-1; i >= 0; i--) { | |
l = i + 1; | |
g = w[i]; | |
if (i != m-1) | |
for (j = l; j < m; j++) u[j][i] = 0.0; | |
if (g != 0.0) { | |
if (i != m-1) { | |
for (j = l; j < m; j++) { | |
s = 0.0; | |
for (k = l; k < n; k++) s += u[i][k] * u[j][k]; | |
/* double division avoids possible underflow */ | |
f = (s / u[i][i]) / g; | |
for (k = i; k < n; k++) u[j][k] += f * u[i][k]; | |
} | |
} | |
for (j = i; j < n; j++) u[i][j] /= g; | |
} | |
else | |
for (j = i; j < n; j++) u[i][j] = 0.0; | |
u[i][i] += 1.0; | |
} | |
/* diagonalization of the bidiagonal form */ | |
for (k = m-1; k >= 0; k--) { | |
k1 = k-1; | |
its = 0; | |
while (1) { | |
/* test for splitting */ | |
for (l = k; l >= 0; l--) { | |
l1 = l-1; | |
if (fabs(rv1[l]) + anorm == anorm) break; | |
/* rv1[0] is always zero, so there is no exit | |
* through the bottom of the loop */ | |
if (fabs(w[l1]) + anorm == anorm) { | |
/* cancellation of rv1[l] if l greater than 0 */ | |
c = 0.0; | |
s = 1.0; | |
for (i = l; i <= k; i++) { | |
f = s * rv1[i]; | |
rv1[i] *= c; | |
if (fabs(f) + anorm == anorm) break; | |
g = w[i]; | |
h = sqrt(f*f+g*g); | |
w[i] = h; | |
c = g / h; | |
s = -f / h; | |
for (j = 0; j < n; j++) { | |
y = u[l1][j]; | |
z = u[i][j]; | |
u[l1][j] = y * c + z * s; | |
u[i][j] = -y * s + z * c; | |
} | |
} | |
break; | |
} | |
} | |
/* test for convergence */ | |
z = w[k]; | |
if (l == k) /* convergence */ { | |
if (z < 0.0) { | |
/* w[k] is made non-negative */ | |
w[k] = -z; | |
for (j = 0; j < m; j++) vt[j][k] = -vt[j][k]; | |
} | |
break; | |
} | |
else if (its == 30) { | |
ierr = k; | |
break; | |
} | |
else { | |
/* shift from bottom 2 by 2 minor */ | |
its++; | |
x = w[l]; | |
y = w[k1]; | |
g = rv1[k1]; | |
h = rv1[k]; | |
f = ((y - z) * (y + z) | |
+ (g - h) * (g + h)) / (2.0 * h * y); | |
g = sqrt(f*f+1.0); | |
f = ((x - z) * (x + z) | |
+ h * (y / (f + (f >= 0 ? g : -g)) - h)) / x; | |
/* next qr transformation */ | |
c = 1.0; | |
s = 1.0; | |
for (i1 = l; i1 <= k1; i1++) { | |
i = i1 + 1; | |
g = rv1[i]; | |
y = w[i]; | |
h = s * g; | |
g = c * g; | |
z = sqrt(f*f+h*h); | |
rv1[i1] = z; | |
c = f / z; | |
s = h / z; | |
f = x * c + g * s; | |
g = -x * s + g * c; | |
h = y * s; | |
y = y * c; | |
for (j = 0; j < m; j++) { | |
x = vt[j][i1]; | |
z = vt[j][i]; | |
vt[j][i1] = x * c + z * s; | |
vt[j][i] = -x * s + z * c; | |
} | |
z = sqrt(f*f+h*h); | |
w[i1] = z; | |
/* rotation can be arbitrary if z is zero */ | |
if (z != 0.0) { | |
c = f / z; | |
s = h / z; | |
} | |
f = c * g + s * y; | |
x = -s * g + c * y; | |
for (j = 0; j < n; j++) { | |
y = u[i1][j]; | |
z = u[i][j]; | |
u[i1][j] = y * c + z * s; | |
u[i][j] = -y * s + z * c; | |
} | |
} | |
rv1[l] = 0.0; | |
rv1[k] = f; | |
w[k] = x; | |
} | |
} | |
} | |
} | |
free(rv1); | |
return ierr; | |
} | |
/* ********************************************************************* */ | |
int | |
pca(int nrows, int ncolumns, double** u, double** v, double* w) | |
/* | |
Purpose | |
======= | |
This subroutine uses the singular value decomposition to perform principal | |
components analysis of a real nrows by ncolumns rectangular matrix. | |
Arguments | |
========= | |
nrows (input) int | |
The number of rows in the matrix u. | |
ncolumns (input) int | |
The number of columns in the matrix v. | |
u (input) double[nrows][ncolumns] | |
On input, the array containing the data to which the principal component | |
analysis should be applied. The function assumes that the mean has already been | |
subtracted of each column, and hence that the mean of each column is zero. | |
On output, see below. | |
v (input) double[n][n], where n = min(nrows, ncolumns) | |
Not used on input. | |
w (input) double[n], where n = min(nrows, ncolumns) | |
Not used on input. | |
Return value | |
============ | |
On output: | |
If nrows >= ncolumns, then | |
u contains the coordinates with respect to the principal components; | |
v contains the principal component vectors. | |
The dot product u . v reproduces the data that were passed in u. | |
If nrows < ncolumns, then | |
u contains the principal component vectors; | |
v contains the coordinates with respect to the principal components. | |
The dot product v . u reproduces the data that were passed in u. | |
The eigenvalues of the covariance matrix are returned in w. | |
The arrays u, v, and w are sorted according to eigenvalue, with the largest | |
eigenvalues appearing first. | |
The function returns 0 if successful, -1 if memory allocation fails, and a | |
positive integer if the singular value decomposition fails to converge. | |
*/ | |
{ | |
int i; | |
int j; | |
int error; | |
int* index = malloc(ncolumns*sizeof(int)); | |
double* temp = malloc(ncolumns*sizeof(double)); | |
if (!index || !temp) { | |
if (index) free(index); | |
if (temp) free(temp); | |
return -1; | |
} | |
error = svd(nrows, ncolumns, u, w, v); | |
if (error == 0) { | |
if (nrows >= ncolumns) { | |
for (j = 0; j < ncolumns; j++) { | |
const double s = w[j]; | |
for (i = 0; i < nrows; i++) u[i][j] *= s; | |
} | |
sort_index(ncolumns, w, index); | |
for (i = 0; i < ncolumns/2; i++) { | |
j = index[i]; | |
index[i] = index[ncolumns-1-i]; | |
index[ncolumns-1-i] = j; | |
} | |
for (i = 0; i < nrows; i++) { | |
for (j = 0; j < ncolumns; j++) temp[j] = u[i][index[j]]; | |
for (j = 0; j < ncolumns; j++) u[i][j] = temp[j]; | |
} | |
for (i = 0; i < ncolumns; i++) { | |
for (j = 0; j < ncolumns; j++) temp[j] = v[index[j]][i]; | |
for (j = 0; j < ncolumns; j++) v[j][i] = temp[j]; | |
} | |
for (i = 0; i < ncolumns; i++) temp[i] = w[index[i]]; | |
for (i = 0; i < ncolumns; i++) w[i] = temp[i]; | |
} | |
else /* nrows < ncolumns */ { | |
for (j = 0; j < nrows; j++) { | |
const double s = w[j]; | |
for (i = 0; i < nrows; i++) v[i][j] *= s; | |
} | |
sort_index(nrows, w, index); | |
for (i = 0; i < nrows/2; i++) { | |
j = index[i]; | |
index[i] = index[nrows-1-i]; | |
index[nrows-1-i] = j; | |
} | |
for (j = 0; j < ncolumns; j++) { | |
for (i = 0; i < nrows; i++) temp[i] = u[index[i]][j]; | |
for (i = 0; i < nrows; i++) u[i][j] = temp[i]; | |
} | |
for (j = 0; j < nrows; j++) { | |
for (i = 0; i < nrows; i++) temp[i] = v[j][index[i]]; | |
for (i = 0; i < nrows; i++) v[j][i] = temp[i]; | |
} | |
for (i = 0; i < nrows; i++) temp[i] = w[index[i]]; | |
for (i = 0; i < nrows; i++) w[i] = temp[i]; | |
} | |
} | |
free(index); | |
free(temp); | |
return error; | |
} | |
/* ********************************************************************* */ | |
static double | |
euclid(int n, double** data1, double** data2, int** mask1, int** mask2, | |
const double weight[], int index1, int index2, int transpose) | |
/* | |
Purpose | |
======= | |
The euclid routine calculates the weighted Euclidean distance between two | |
rows or columns in a matrix. | |
Arguments | |
========= | |
n (input) int | |
The number of elements in a row or column. If transpose == 0, then n is the | |
number of columns; otherwise, n is the number of rows. | |
data1 (input) double array | |
The data array containing the first vector. | |
data2 (input) double array | |
The data array containing the second vector. | |
mask1 (input) int array | |
This array which elements in data1 are missing. If mask1[i][j] == 0, then | |
data1[i][j] is missing. | |
mask2 (input) int array | |
This array which elements in data2 are missing. If mask2[i][j] == 0, then | |
data2[i][j] is missing. | |
weight (input) double[ncolumns] if transpose == 0, | |
double[nrows] otherwise | |
The weights that are used to calculate the distance. This is equivalent | |
to including the jth data point weight[j] times in the calculation. The | |
weights can be non-integer. | |
index1 (input) int | |
Index of the first row or column. | |
index2 (input) int | |
Index of the second row or column. | |
transpose (input) int | |
If transpose == 0, the distance between two rows in the matrix is calculated. | |
Otherwise, the distance between two columns in the matrix is calculated. | |
============================================================================ | |
*/ | |
{ | |
double result = 0.0; | |
double tweight = 0; | |
int i; | |
if (transpose == 0) /* Calculate the distance between two rows */ { | |
for (i = 0; i < n; i++) { | |
if (mask1[index1][i] && mask2[index2][i]) { | |
double term = data1[index1][i] - data2[index2][i]; | |
result += weight[i]*term*term; | |
tweight += weight[i]; | |
} | |
} | |
} | |
else { | |
for (i = 0; i < n; i++) { | |
if (mask1[i][index1] && mask2[i][index2]) { | |
double term = data1[i][index1] - data2[i][index2]; | |
result += weight[i]*term*term; | |
tweight += weight[i]; | |
} | |
} | |
} | |
if (!tweight) return 0; /* usually due to empty clusters */ | |
result /= tweight; | |
return result; | |
} | |
/* ********************************************************************* */ | |
static double | |
cityblock(int n, double** data1, double** data2, int** mask1, int** mask2, | |
const double weight[], int index1, int index2, int transpose) | |
/* | |
Purpose | |
======= | |
The cityblock routine calculates the weighted "City Block" distance between | |
two rows or columns in a matrix. City Block distance is defined as the | |
absolute value of X1-X2 plus the absolute value of Y1-Y2 plus..., which is | |
equivalent to taking an "up and over" path. | |
Arguments | |
========= | |
n (input) int | |
The number of elements in a row or column. If transpose == 0, then n is the | |
number of columns; otherwise, n is the number of rows. | |
data1 (input) double array | |
The data array containing the first vector. | |
data2 (input) double array | |
The data array containing the second vector. | |
mask1 (input) int array | |
This array which elements in data1 are missing. If mask1[i][j] == 0, then | |
data1[i][j] is missing. | |
mask2 (input) int array | |
This array which elements in data2 are missing. If mask2[i][j] == 0, then | |
data2[i][j] is missing. | |
weight (input) double[ncolumns] if transpose == 0, | |
double[nrows] otherwise | |
The weights that are used to calculate the distance. This is equivalent | |
to including the jth data point weight[j] times in the calculation. The | |
weights can be non-integer. | |
index1 (input) int | |
Index of the first row or column. | |
index2 (input) int | |
Index of the second row or column. | |
transpose (input) int | |
If transpose == 0, the distance between two rows in the matrix is calculated. | |
Otherwise, the distance between two columns in the matrix is calculated. | |
============================================================================ */ | |
{ | |
double result = 0.; | |
double tweight = 0; | |
int i; | |
if (transpose == 0) /* Calculate the distance between two rows */ { | |
for (i = 0; i < n; i++) { | |
if (mask1[index1][i] && mask2[index2][i]) { | |
double term = data1[index1][i] - data2[index2][i]; | |
result = result + weight[i]*fabs(term); | |
tweight += weight[i]; | |
} | |
} | |
} | |
else { | |
for (i = 0; i < n; i++) { | |
if (mask1[i][index1] && mask2[i][index2]) { | |
double term = data1[i][index1] - data2[i][index2]; | |
result = result + weight[i]*fabs(term); | |
tweight += weight[i]; | |
} | |
} | |
} | |
if (!tweight) return 0; /* usually due to empty clusters */ | |
result /= tweight; | |
return result; | |
} | |
/* ********************************************************************* */ | |
static double | |
correlation(int n, double** data1, double** data2, int** mask1, int** mask2, | |
const double weight[], int index1, int index2, int transpose) | |
/* | |
Purpose | |
======= | |
The correlation routine calculates the weighted Pearson distance between two | |
rows or columns in a matrix. We define the Pearson distance as one minus the | |
Pearson correlation. | |
This definition yields a semi-metric: d(a,b) >= 0, and d(a,b) = 0 iff a = b. | |
but the triangular inequality d(a,b) + d(b,c) >= d(a,c) does not hold | |
(e.g., choose b = a + c). | |
Arguments | |
========= | |
n (input) int | |
The number of elements in a row or column. If transpose == 0, then n is the | |
number of columns; otherwise, n is the number of rows. | |
data1 (input) double array | |
The data array containing the first vector. | |
data2 (input) double array | |
The data array containing the second vector. | |
mask1 (input) int array | |
This array which elements in data1 are missing. If mask1[i][j] == 0, then | |
data1[i][j] is missing. | |
mask2 (input) int array | |
This array which elements in data2 are missing. If mask2[i][j] == 0, then | |
data2[i][j] is missing. | |
weight (input) double[ncolumns] if transpose == 0, | |
double[nrows] otherwise | |
The weights that are used to calculate the distance. This is equivalent | |
to including the jth data point weight[j] times in the calculation. The | |
weights can be non-integer. | |
index1 (input) int | |
Index of the first row or column. | |
index2 (input) int | |
Index of the second row or column. | |
transpose (input) int | |
If transpose == 0, the distance between two rows in the matrix is calculated. | |
Otherwise, the distance between two columns in the matrix is calculated. | |
============================================================================ | |
*/ | |
{ | |
double result = 0.; | |
double sum1 = 0.; | |
double sum2 = 0.; | |
double denom1 = 0.; | |
double denom2 = 0.; | |
double tweight = 0.; | |
if (transpose == 0) /* Calculate the distance between two rows */ { | |
int i; | |
for (i = 0; i < n; i++) { | |
if (mask1[index1][i] && mask2[index2][i]) { | |
double term1 = data1[index1][i]; | |
double term2 = data2[index2][i]; | |
double w = weight[i]; | |
sum1 += w*term1; | |
sum2 += w*term2; | |
result += w*term1*term2; | |
denom1 += w*term1*term1; | |
denom2 += w*term2*term2; | |
tweight += w; | |
} | |
} | |
} | |
else { | |
int i; | |
for (i = 0; i < n; i++) { | |
if (mask1[i][index1] && mask2[i][index2]) { | |
double term1 = data1[i][index1]; | |
double term2 = data2[i][index2]; | |
double w = weight[i]; | |
sum1 += w*term1; | |
sum2 += w*term2; | |
result += w*term1*term2; | |
denom1 += w*term1*term1; | |
denom2 += w*term2*term2; | |
tweight += w; | |
} | |
} | |
} | |
if (!tweight) return 0; /* usually due to empty clusters */ | |
result -= sum1 * sum2 / tweight; | |
denom1 -= sum1 * sum1 / tweight; | |
denom2 -= sum2 * sum2 / tweight; | |
if (denom1 <= 0) return 1; /* include '<' to deal with roundoff errors */ | |
if (denom2 <= 0) return 1; /* include '<' to deal with roundoff errors */ | |
result = result / sqrt(denom1*denom2); | |
result = 1. - result; | |
return result; | |
} | |
/* ********************************************************************* */ | |
static double | |
acorrelation(int n, double** data1, double** data2, int** mask1, int** mask2, | |
const double weight[], int index1, int index2, int transpose) | |
/* | |
Purpose | |
======= | |
The acorrelation routine calculates the weighted Pearson distance between two | |
rows or columns, using the absolute value of the correlation. | |
This definition yields a semi-metric: d(a,b) >= 0, and d(a,b) = 0 iff a = b. | |
but the triangular inequality d(a,b) + d(b,c) >= d(a,c) does not hold | |
(e.g., choose b = a + c). | |
Arguments | |
========= | |
n (input) int | |
The number of elements in a row or column. If transpose == 0, then n is the | |
number of columns; otherwise, n is the number of rows. | |
data1 (input) double array | |
The data array containing the first vector. | |
data2 (input) double array | |
The data array containing the second vector. | |
mask1 (input) int array | |
This array which elements in data1 are missing. If mask1[i][j] == 0, then | |
data1[i][j] is missing. | |
mask2 (input) int array | |
This array which elements in data2 are missing. If mask2[i][j] == 0, then | |
data2[i][j] is missing. | |
weight (input) double[ncolumns] if transpose == 0, | |
double[nrows] otherwise | |
The weights that are used to calculate the distance. This is equivalent | |
to including the jth data point weight[j] times in the calculation. The | |
weights can be non-integer. | |
index1 (input) int | |
Index of the first row or column. | |
index2 (input) int | |
Index of the second row or column. | |
transpose (input) int | |
If transpose == 0, the distance between two rows in the matrix is calculated. | |
Otherwise, the distance between two columns in the matrix is calculated. | |
============================================================================ | |
*/ | |
{ | |
double result = 0.; | |
double sum1 = 0.; | |
double sum2 = 0.; | |
double denom1 = 0.; | |
double denom2 = 0.; | |
double tweight = 0.; | |
if (transpose == 0) /* Calculate the distance between two rows */ { | |
int i; | |
for (i = 0; i < n; i++) { | |
if (mask1[index1][i] && mask2[index2][i]) { | |
double term1 = data1[index1][i]; | |
double term2 = data2[index2][i]; | |
double w = weight[i]; | |
sum1 += w*term1; | |
sum2 += w*term2; | |
result += w*term1*term2; | |
denom1 += w*term1*term1; | |
denom2 += w*term2*term2; | |
tweight += w; | |
} | |
} | |
} | |
else { | |
int i; | |
for (i = 0; i < n; i++) { | |
if (mask1[i][index1] && mask2[i][index2]) { | |
double term1 = data1[i][index1]; | |
double term2 = data2[i][index2]; | |
double w = weight[i]; | |
sum1 += w*term1; | |
sum2 += w*term2; | |
result += w*term1*term2; | |
denom1 += w*term1*term1; | |
denom2 += w*term2*term2; | |
tweight += w; | |
} | |
} | |
} | |
if (!tweight) return 0; /* usually due to empty clusters */ | |
result -= sum1 * sum2 / tweight; | |
denom1 -= sum1 * sum1 / tweight; | |
denom2 -= sum2 * sum2 / tweight; | |
if (denom1 <= 0) return 1; /* include '<' to deal with roundoff errors */ | |
if (denom2 <= 0) return 1; /* include '<' to deal with roundoff errors */ | |
result = fabs(result) / sqrt(denom1*denom2); | |
result = 1. - result; | |
return result; | |
} | |
/* ********************************************************************* */ | |
static double | |
ucorrelation(int n, double** data1, double** data2, int** mask1, int** mask2, | |
const double weight[], int index1, int index2, int transpose) | |
/* | |
Purpose | |
======= | |
The ucorrelation routine calculates the weighted Pearson distance between two | |
rows or columns, using the uncentered version of the Pearson correlation. In | |
the uncentered Pearson correlation, a zero mean is used for both vectors even | |
if the actual mean is nonzero. | |
This definition yields a semi-metric: d(a,b) >= 0, and d(a,b) = 0 iff a = b. | |
but the triangular inequality d(a,b) + d(b,c) >= d(a,c) does not hold | |
(e.g., choose b = a + c). | |
Arguments | |
========= | |
n (input) int | |
The number of elements in a row or column. If transpose == 0, then n is the | |
number of columns; otherwise, n is the number of rows. | |
data1 (input) double array | |
The data array containing the first vector. | |
data2 (input) double array | |
The data array containing the second vector. | |
mask1 (input) int array | |
This array which elements in data1 are missing. If mask1[i][j] == 0, then | |
data1[i][j] is missing. | |
mask2 (input) int array | |
This array which elements in data2 are missing. If mask2[i][j] == 0, then | |
data2[i][j] is missing. | |
weight (input) double[ncolumns] if transpose == 0, | |
double[nrows] otherwise | |
The weights that are used to calculate the distance. This is equivalent | |
to including the jth data point weight[j] times in the calculation. The | |
weights can be non-integer. | |
index1 (input) int | |
Index of the first row or column. | |
index2 (input) int | |
Index of the second row or column. | |
transpose (input) int | |
If transpose == 0, the distance between two rows in the matrix is calculated. | |
Otherwise, the distance between two columns in the matrix is calculated. | |
============================================================================ | |
*/ | |
{ | |
double result = 0.; | |
double denom1 = 0.; | |
double denom2 = 0.; | |
int flag = 0; | |
/* flag will remain zero if no nonzero combinations of mask1 and mask2 are | |
* found. | |
*/ | |
if (transpose == 0) /* Calculate the distance between two rows */ { | |
int i; | |
for (i = 0; i < n; i++) { | |
if (mask1[index1][i] && mask2[index2][i]) { | |
double term1 = data1[index1][i]; | |
double term2 = data2[index2][i]; | |
double w = weight[i]; | |
result += w*term1*term2; | |
denom1 += w*term1*term1; | |
denom2 += w*term2*term2; | |
flag = 1; | |
} | |
} | |
} | |
else { | |
int i; | |
for (i = 0; i < n; i++) { | |
if (mask1[i][index1] && mask2[i][index2]) { | |
double term1 = data1[i][index1]; | |
double term2 = data2[i][index2]; | |
double w = weight[i]; | |
result += w*term1*term2; | |
denom1 += w*term1*term1; | |
denom2 += w*term2*term2; | |
flag = 1; | |
} | |
} | |
} | |
if (!flag) return 0.; | |
if (denom1 == 0.) return 1.; | |
if (denom2 == 0.) return 1.; | |
result = result / sqrt(denom1*denom2); | |
result = 1. - result; | |
return result; | |
} | |
/* ********************************************************************* */ | |
static double | |
uacorrelation(int n, double** data1, double** data2, int** mask1, int** mask2, | |
const double weight[], int index1, int index2, int transpose) | |
/* | |
Purpose | |
======= | |
The uacorrelation routine calculates the weighted Pearson distance between two | |
rows or columns, using the absolute value of the uncentered version of the | |
Pearson correlation. In the uncentered Pearson correlation, a zero mean is used | |
for both vectors even if the actual mean is nonzero. | |
This definition yields a semi-metric: d(a,b) >= 0, and d(a,b) = 0 iff a = b. | |
but the triangular inequality d(a,b) + d(b,c) >= d(a,c) does not hold | |
(e.g., choose b = a + c). | |
Arguments | |
========= | |
n (input) int | |
The number of elements in a row or column. If transpose == 0, then n is the | |
number of columns; otherwise, n is the number of rows. | |
data1 (input) double array | |
The data array containing the first vector. | |
data2 (input) double array | |
The data array containing the second vector. | |
mask1 (input) int array | |
This array which elements in data1 are missing. If mask1[i][j] == 0, then | |
data1[i][j] is missing. | |
mask2 (input) int array | |
This array which elements in data2 are missing. If mask2[i][j] == 0, then | |
data2[i][j] is missing. | |
weight (input) double[ncolumns] if transpose == 0, | |
double[nrows] otherwise | |
The weights that are used to calculate the distance. This is equivalent | |
to including the jth data point weight[j] times in the calculation. The | |
weights can be non-integer. | |
index1 (input) int | |
Index of the first row or column. | |
index2 (input) int | |
Index of the second row or column. | |
transpose (input) int | |
If transpose == 0, the distance between two rows in the matrix is calculated. | |
Otherwise, the distance between two columns in the matrix is calculated. | |
============================================================================ | |
*/ | |
{ | |
double result = 0.; | |
double denom1 = 0.; | |
double denom2 = 0.; | |
int flag = 0; | |
/* flag will remain zero if no nonzero combinations of mask1 and mask2 are | |
* found. | |
*/ | |
if (transpose == 0) /* Calculate the distance between two rows */ { | |
int i; | |
for (i = 0; i < n; i++) { | |
if (mask1[index1][i] && mask2[index2][i]) { | |
double term1 = data1[index1][i]; | |
double term2 = data2[index2][i]; | |
double w = weight[i]; | |
result += w*term1*term2; | |
denom1 += w*term1*term1; | |
denom2 += w*term2*term2; | |
flag = 1; | |
} | |
} | |
} | |
else { | |
int i; | |
for (i = 0; i < n; i++) { | |
if (mask1[i][index1] && mask2[i][index2]) { | |
double term1 = data1[i][index1]; | |
double term2 = data2[i][index2]; | |
double w = weight[i]; | |
result += w*term1*term2; | |
denom1 += w*term1*term1; | |
denom2 += w*term2*term2; | |
flag = 1; | |
} | |
} | |
} | |
if (!flag) return 0.; | |
if (denom1 == 0.) return 1.; | |
if (denom2 == 0.) return 1.; | |
result = fabs(result) / sqrt(denom1*denom2); | |
result = 1. - result; | |
return result; | |
} | |
/* ********************************************************************* */ | |
static double | |
spearman(int n, double** data1, double** data2, int** mask1, int** mask2, | |
const double weight[], int index1, int index2, int transpose) | |
/* | |
Purpose | |
======= | |
The spearman routine calculates the Spearman distance between two rows or | |
columns. The Spearman distance is defined as one minus the Spearman rank | |
correlation. | |
Arguments | |
========= | |
n (input) int | |
The number of elements in a row or column. If transpose == 0, then n is the | |
number of columns; otherwise, n is the number of rows. | |
data1 (input) double array | |
The data array containing the first vector. | |
data2 (input) double array | |
The data array containing the second vector. | |
mask1 (input) int array | |
This array which elements in data1 are missing. If mask1[i][j] == 0, then | |
data1[i][j] is missing. | |
mask2 (input) int array | |
This array which elements in data2 are missing. If mask2[i][j] == 0, then | |
data2[i][j] is missing. | |
weight (input) double[ncolumns] if transpose == 0, | |
double[nrows] otherwise | |
The weights that are used to calculate the distance. This is equivalent | |
to including the jth data point weight[j] times in the calculation. The | |
weights can be non-integer. | |
index1 (input) int | |
Index of the first row or column. | |
index2 (input) int | |
Index of the second row or column. | |
transpose (input) int | |
If transpose == 0, the distance between two rows in the matrix is calculated. | |
Otherwise, the distance between two columns in the matrix is calculated. | |
============================================================================ | |
*/ | |
{ | |
int i; | |
int m = 0; | |
double* rank1; | |
double* rank2; | |
double result = 0.; | |
double denom1 = 0.; | |
double denom2 = 0.; | |
double sum1 = 0.; | |
double sum2 = 0.; | |
double totalweight = 0.; | |
double* tdata1; | |
double* tdata2; | |
tdata1 = malloc(n*sizeof(double)); | |
if (!tdata1) return 0.0; /* Memory allocation error */ | |
tdata2 = malloc(n*sizeof(double)); | |
if (!tdata2) /* Memory allocation error */ { | |
free(tdata1); | |
return 0.0; | |
} | |
if (transpose == 0) { | |
for (i = 0; i < n; i++) { | |
if (mask1[index1][i] && mask2[index2][i]) { | |
tdata1[m] = data1[index1][i]; | |
tdata2[m] = data2[index2][i]; | |
m++; | |
} | |
} | |
} | |
else { | |
for (i = 0; i < n; i++) { | |
if (mask1[i][index1] && mask2[i][index2]) { | |
tdata1[m] = data1[i][index1]; | |
tdata2[m] = data2[i][index2]; | |
m++; | |
} | |
} | |
} | |
if (m == 0) { | |
free(tdata1); | |
free(tdata2); | |
return 0; | |
} | |
rank1 = getrank(m, tdata1, weight); | |
free(tdata1); | |
if (!rank1) { | |
free(tdata2); | |
return 0.0; /* Memory allocation error */ | |
} | |
rank2 = getrank(m, tdata2, weight); | |
free(tdata2); | |
if (!rank2) /* Memory allocation error */ { | |
free(rank1); | |
return 0.0; | |
} | |
for (i = 0; i < m; i++) { | |
const double term1 = rank1[i]; | |
const double term2 = rank2[i]; | |
const double w = weight[i]; | |
sum1 += term1 * w; | |
sum2 += term2 * w; | |
result += term1 * term2 * w; | |
denom1 += term1 * term1 * w; | |
denom2 += term2 * term2 * w; | |
totalweight += w; | |
} | |
/* Note: denom1 and denom2 cannot be calculated directly from the number | |
* of elements. If two elements have the same rank, the squared sum of | |
* their ranks will change. | |
*/ | |
free(rank1); | |
free(rank2); | |
if (!totalweight) return 0; /* usually due to empty clusters */ | |
result -= sum1 * sum2 / totalweight; | |
denom1 -= sum1 * sum1 / totalweight; | |
denom2 -= sum2 * sum2 / totalweight; | |
if (denom1 <= 0) return 1; /* include '<' to deal with roundoff errors */ | |
if (denom2 <= 0) return 1; /* include '<' to deal with roundoff errors */ | |
result = result / sqrt(denom1*denom2); | |
result = 1. - result; | |
return result; | |
} | |
/* ********************************************************************* */ | |
static double | |
kendall(int n, double** data1, double** data2, int** mask1, int** mask2, | |
const double weight[], int index1, int index2, int transpose) | |
/* | |
Purpose | |
======= | |
The kendall routine calculates the Kendall distance between two | |
rows or columns. The Kendall distance is defined as one minus Kendall's tau. | |
Arguments | |
========= | |
n (input) int | |
The number of elements in a row or column. If transpose == 0, then n is the | |
number of columns; otherwise, n is the number of rows. | |
data1 (input) double array | |
The data array containing the first vector. | |
data2 (input) double array | |
The data array containing the second vector. | |
mask1 (input) int array | |
This array which elements in data1 are missing. If mask1[i][j] == 0, then | |
data1[i][j] is missing. | |
mask2 (input) int array | |
This array which elements in data2 are missing. If mask2[i][j] == 0, then | |
data2[i][j] is missing. | |
weight (input) double[ncolumns] if transpose == 0, | |
double[nrows] otherwise | |
The weights that are used to calculate the distance. This is equivalent | |
to including the jth data point weight[j] times in the calculation. The | |
weights can be non-integer. | |
index1 (input) int | |
Index of the first row or column. | |
index2 (input) int | |
Index of the second row or column. | |
transpose (input) int | |
If transpose == 0, the distance between two rows in the matrix is calculated. | |
Otherwise, the distance between two columns in the matrix is calculated. | |
============================================================================ | |
*/ | |
{ | |
double con = 0; | |
double dis = 0; | |
double exx = 0; | |
double exy = 0; | |
int flag = 0; | |
/* flag will remain zero if no nonzero combinations of mask1 and mask2 are | |
* found. | |
*/ | |
double denomx; | |
double denomy; | |
double tau; | |
int i, j; | |
if (transpose == 0) { | |
for (i = 0; i < n; i++) { | |
if (mask1[index1][i] && mask2[index2][i]) { | |
for (j = 0; j < i; j++) { | |
if (mask1[index1][j] && mask2[index2][j]) { | |
const double x1 = data1[index1][i]; | |
const double x2 = data1[index1][j]; | |
const double y1 = data2[index2][i]; | |
const double y2 = data2[index2][j]; | |
const double w = weight[i] * weight[j]; | |
if (x1 < x2 && y1 < y2) con += w; | |
else if (x1 > x2 && y1 > y2) con += w; | |
else if (x1 < x2 && y1 > y2) dis += w; | |
else if (x1 > x2 && y1 < y2) dis += w; | |
else if (x1 == x2 && y1 != y2) exx += w; | |
else if (x1 != x2 && y1 == y2) exy += w; | |
flag = 1; | |
} | |
} | |
} | |
} | |
} | |
else { | |
for (i = 0; i < n; i++) { | |
if (mask1[i][index1] && mask2[i][index2]) { | |
for (j = 0; j < i; j++) { | |
if (mask1[j][index1] && mask2[j][index2]) { | |
const double x1 = data1[i][index1]; | |
const double x2 = data1[j][index1]; | |
const double y1 = data2[i][index2]; | |
const double y2 = data2[j][index2]; | |
const double w = weight[i] * weight[j]; | |
if (x1 < x2 && y1 < y2) con += w; | |
else if (x1 > x2 && y1 > y2) con += w; | |
else if (x1 < x2 && y1 > y2) dis += w; | |
else if (x1 > x2 && y1 < y2) dis += w; | |
else if (x1 == x2 && y1 != y2) exx += w; | |
else if (x1 != x2 && y1 == y2) exy += w; | |
flag = 1; | |
} | |
} | |
} | |
} | |
} | |
if (!flag) return 0.; | |
denomx = con + dis + exx; | |
denomy = con + dis + exy; | |
if (denomx == 0) return 1; | |
if (denomy == 0) return 1; | |
tau = (con-dis)/sqrt(denomx*denomy); | |
return 1.-tau; | |
} | |
/* ********************************************************************* */ | |
static double(*setmetric(char dist)) | |
(int, double**, double**, int**, int**, const double[], int, int, int) | |
{ | |
switch(dist) { | |
case 'e': return &euclid; | |
case 'b': return &cityblock; | |
case 'c': return &correlation; | |
case 'a': return &acorrelation; | |
case 'u': return &ucorrelation; | |
case 'x': return &uacorrelation; | |
case 's': return &spearman; | |
case 'k': return &kendall; | |
default: return &euclid; | |
} | |
} | |
/* ********************************************************************* */ | |
static double | |
uniform(void) | |
/* | |
Purpose | |
======= | |
This routine returns a uniform random number between 0.0 and 1.0. Both 0.0 | |
and 1.0 are excluded. This random number generator is described in: | |
Pierre l'Ecuyer | |
Efficient and Portable Combined Random Number Generators | |
Communications of the ACM, Volume 31, Number 6, June 1988, pages 742-749, 774. | |
The first time this routine is called, it initializes the random number | |
generator using the current time. First, the current epoch time in seconds is | |
used as a seed for the random number generator in the C library. The first two | |
random numbers generated by this generator are used to initialize the random | |
number generator implemented in this routine. | |
Arguments | |
========= | |
None. | |
Return value | |
============ | |
A double-precison number between 0.0 and 1.0. | |
============================================================================ | |
*/ | |
{ | |
int z; | |
static const int m1 = 2147483563; | |
static const int m2 = 2147483399; | |
const double scale = 1.0/m1; | |
static int s1 = 0; | |
static int s2 = 0; | |
if (s1 == 0 || s2 == 0) { | |
/* initialize */ | |
unsigned int initseed = (unsigned int) time(0); | |
srand(initseed); | |
s1 = rand(); | |
s2 = rand(); | |
} | |
do { | |
int k = s1/53668; | |
s1 = 40014*(s1-k*53668)-k*12211; | |
if (s1 < 0) s1+=m1; | |
k = s2/52774; | |
s2 = 40692*(s2-k*52774)-k*3791; | |
if (s2 < 0) s2 += m2; | |
z = s1-s2; | |
if (z < 1) z += (m1-1); | |
} while (z == m1); /* To avoid returning 1.0 */ | |
return z*scale; | |
} | |
/* ************************************************************************ */ | |
static int | |
binomial(int n, double p) | |
/* | |
Purpose | |
======= | |
This routine generates a random number between 0 and n inclusive, following | |
the binomial distribution with probability p and n trials. The routine is | |
based on the BTPE algorithm, described in: | |
Voratas Kachitvichyanukul and Bruce W. Schmeiser: | |
Binomial Random Variate Generation | |
Communications of the ACM, Volume 31, Number 2, February 1988, pages 216-222. | |
Arguments | |
========= | |
p (input) double | |
The probability of a single event. This probability should be less than or | |
equal to 0.5. | |
n (input) int | |
The number of trials. | |
Return value | |
============ | |
An integer drawn from a binomial distribution with parameters (p, n). | |
============================================================================ | |
*/ | |
{ | |
const double q = 1 - p; | |
if (n*p < 30.0) /* Algorithm BINV */ { | |
const double s = p/q; | |
const double a = (n+1)*s; | |
double r = exp(n*log(q)); /* pow() causes a crash on AIX */ | |
int x = 0; | |
double u = uniform(); | |
while (1) { | |
if (u < r) return x; | |
u -= r; | |
x++; | |
r *= (a/x)-s; | |
} | |
} | |
else /* Algorithm BTPE */ { | |
/* Step 0 */ | |
const double fm = n*p + p; | |
const int m = (int) fm; | |
const double p1 = floor(2.195*sqrt(n*p*q) -4.6*q) + 0.5; | |
const double xm = m + 0.5; | |
const double xl = xm - p1; | |
const double xr = xm + p1; | |
const double c = 0.134 + 20.5/(15.3+m); | |
const double a = (fm-xl)/(fm-xl*p); | |
const double b = (xr-fm)/(xr*q); | |
const double lambdal = a*(1.0+0.5*a); | |
const double lambdar = b*(1.0+0.5*b); | |
const double p2 = p1*(1+2*c); | |
const double p3 = p2 + c/lambdal; | |
const double p4 = p3 + c/lambdar; | |
while (1) { | |
/* Step 1 */ | |
int y; | |
int k; | |
double u = uniform(); | |
double v = uniform(); | |
u *= p4; | |
if (u <= p1) return (int)(xm-p1*v+u); | |
/* Step 2 */ | |
if (u > p2) { | |
/* Step 3 */ | |
if (u > p3) { | |
/* Step 4 */ | |
y = (int)(xr-log(v)/lambdar); | |
if (y > n) continue; | |
/* Go to step 5 */ | |
v = v*(u-p3)*lambdar; | |
} | |
else { | |
y = (int)(xl+log(v)/lambdal); | |
if (y < 0) continue; | |
/* Go to step 5 */ | |
v = v*(u-p2)*lambdal; | |
} | |
} | |
else { | |
const double x = xl + (u-p1)/c; | |
v = v*c + 1.0 - fabs(m-x+0.5)/p1; | |
if (v > 1) continue; | |
/* Go to step 5 */ | |
y = (int)x; | |
} | |
/* Step 5 */ | |
/* Step 5.0 */ | |
k = abs(y-m); | |
if (k > 20 && k < 0.5*n*p*q-1.0) { | |
/* Step 5.2 */ | |
double rho = (k/(n*p*q))*((k*(k/3.0 + 0.625) | |
+ 0.1666666666666)/(n*p*q)+0.5); | |
double t = -k*k/(2*n*p*q); | |
double A = log(v); | |
if (A < t-rho) return y; | |
else if (A > t+rho) continue; | |
else { | |
/* Step 5.3 */ | |
double x1 = y+1; | |
double f1 = m+1; | |
double z = n+1-m; | |
double w = n-y+1; | |
double x2 = x1*x1; | |
double f2 = f1*f1; | |
double z2 = z*z; | |
double w2 = w*w; | |
if (A > xm * log(f1/x1) + (n-m+0.5)*log(z/w) | |
+ (y-m)*log(w*p/(x1*q)) | |
+ (13860.-(462.-(132.-(99.-140./f2)/f2)/f2)/f2)/f1/166320. | |
+ (13860.-(462.-(132.-(99.-140./z2)/z2)/z2)/z2)/z/166320. | |
+ (13860.-(462.-(132.-(99.-140./x2)/x2)/x2)/x2)/x1/166320. | |
+ (13860.-(462.-(132.-(99.-140./w2)/w2)/w2)/w2)/w/166320.) | |
continue; | |
return y; | |
} | |
} | |
else { | |
/* Step 5.1 */ | |
int i; | |
const double s = p/q; | |
const double aa = s*(n+1); | |
double f = 1.0; | |
for (i = m; i < y; f *= (aa/(++i)-s)); | |
for (i = y; i < m; f /= (aa/(++i)-s)); | |
if (v > f) continue; | |
return y; | |
} | |
} | |
} | |
return -1; | |
} | |
/* ************************************************************************ */ | |
static void | |
randomassign(int nclusters, int nelements, int clusterid[]) | |
/* | |
Purpose | |
======= | |
The randomassign routine performs an initial random clustering, needed for | |
k-means or k-median clustering. Elements (genes or samples) are randomly | |
assigned to clusters. The number of elements in each cluster is chosen | |
randomly, making sure that each cluster will receive at least one element. | |
Arguments | |
========= | |
nclusters (input) int | |
The number of clusters. | |
nelements (input) int | |
The number of elements to be clustered (i.e., the number of genes or samples | |
to be clustered). | |
clusterid (output) int[nelements] | |
The cluster number to which an element was assigned. | |
============================================================================ | |
*/ | |
{ | |
int i, j; | |
int k = 0; | |
double p; | |
int n = nelements-nclusters; | |
/* Draw the number of elements in each cluster from a multinomial | |
* distribution, reserving ncluster elements to set independently | |
* in order to guarantee that none of the clusters are empty. | |
*/ | |
for (i = 0; i < nclusters-1; i++) { | |
p = 1.0/(nclusters-i); | |
j = binomial(n, p); | |
n -= j; | |
j += k+1; /* Assign at least one element to cluster i */ | |
for ( ; k < j; k++) clusterid[k] = i; | |
} | |
/* Assign the remaining elements to the last cluster */ | |
for ( ; k < nelements; k++) clusterid[k] = i; | |
/* Create a random permutation of the cluster assignments */ | |
for (i = 0; i < nelements; i++) { | |
j = (int) (i + (nelements-i)*uniform()); | |
k = clusterid[j]; | |
clusterid[j] = clusterid[i]; | |
clusterid[i] = k; | |
} | |
} | |
/* ********************************************************************* */ | |
static void | |
getclustermeans(int nclusters, int nrows, int ncolumns, | |
double** data, int** mask, int clusterid[], double** cdata, int** cmask, | |
int transpose) | |
/* | |
Purpose | |
======= | |
The getclustermeans routine calculates the cluster centroids, given to which | |
cluster each element belongs. The centroid is defined as the mean over all | |
elements for each dimension. | |
Arguments | |
========= | |
nclusters (input) int | |
The number of clusters. | |
nrows (input) int | |
The number of rows in the gene expression data matrix, equal to the number of | |
genes. | |
ncolumns (input) int | |
The number of columns in the gene expression data matrix, equal to the number | |
of samples. | |
data (input) double[nrows][ncolumns] | |
The array containing the gene expression data. | |
mask (input) int[nrows][ncolumns] | |
This array shows which data values are missing. If mask[i][j] == 0, then | |
data[i][j] is missing. | |
clusterid (output) int[nrows] if transpose == 0 | |
int[ncolumns] otherwise | |
The cluster number to which each element belongs. If transpose == 0, then the | |
dimension of clusterid is equal to nrows (the number of genes). Otherwise, it | |
is equal to ncolumns (the number of samples). | |
cdata (output) double[nclusters][ncolumns] if transpose == 0 | |
double[nrows][nclusters] otherwise | |
On exit of getclustermeans, this array contains the cluster centroids. | |
cmask (output) int[nclusters][ncolumns] if transpose == 0 | |
int[nrows][nclusters] otherwise | |
This array shows which data values of are missing for each centroid. If | |
cmask[i][j] == 0, then cdata[i][j] is missing. A data value is missing for a | |
centroid if all corresponding data values of the cluster members are missing. | |
transpose (input) int | |
If transpose == 0, clusters of rows (genes) are specified. Otherwise, clusters | |
of columns (samples) are specified. | |
======================================================================== | |
*/ | |
{ | |
int i, j, k; | |
if (transpose == 0) { | |
for (i = 0; i < nclusters; i++) { | |
for (j = 0; j < ncolumns; j++) { | |
cmask[i][j] = 0; | |
cdata[i][j] = 0.; | |
} | |
} | |
for (k = 0; k < nrows; k++) { | |
i = clusterid[k]; | |
for (j = 0; j < ncolumns; j++) { | |
if (mask[k][j] != 0) { | |
cdata[i][j] += data[k][j]; | |
cmask[i][j]++; | |
} | |
} | |
} | |
for (i = 0; i < nclusters; i++) { | |
for (j = 0; j < ncolumns; j++) { | |
if (cmask[i][j]>0) { | |
cdata[i][j] /= cmask[i][j]; | |
cmask[i][j] = 1; | |
} | |
} | |
} | |
} | |
else { | |
for (i = 0; i < nrows; i++) { | |
for (j = 0; j < nclusters; j++) { | |
cdata[i][j] = 0.; | |
cmask[i][j] = 0; | |
} | |
} | |
for (k = 0; k < ncolumns; k++) { | |
i = clusterid[k]; | |
for (j = 0; j < nrows; j++) { | |
if (mask[j][k] != 0) { | |
cdata[j][i] += data[j][k]; | |
cmask[j][i]++; | |
} | |
} | |
} | |
for (i = 0; i < nrows; i++) { | |
for (j = 0; j < nclusters; j++) { | |
if (cmask[i][j]>0) { | |
cdata[i][j] /= cmask[i][j]; | |
cmask[i][j] = 1; | |
} | |
} | |
} | |
} | |
} | |
/* ********************************************************************* */ | |
static void | |
getclustermedians(int nclusters, int nrows, int ncolumns, | |
double** data, int** mask, int clusterid[], double** cdata, int** cmask, | |
int transpose, double cache[]) | |
/* | |
Purpose | |
======= | |
The getclustermedians routine calculates the cluster centroids, given to which | |
cluster each element belongs. The centroid is defined as the median over all | |
elements for each dimension. | |
Arguments | |
========= | |
nclusters (input) int | |
The number of clusters. | |
nrows (input) int | |
The number of rows in the gene expression data matrix, equal to the number of | |
genes. | |
ncolumns (input) int | |
The number of columns in the gene expression data matrix, equal to the number | |
of samples. | |
data (input) double[nrows][ncolumns] | |
The array containing the gene expression data. | |
mask (input) int[nrows][ncolumns] | |
This array shows which data values are missing. If mask[i][j] == 0, then | |
data[i][j] is missing. | |
clusterid (output) int[nrows] if transpose == 0 | |
int[ncolumns] otherwise | |
The cluster number to which each element belongs. If transpose == 0, then the | |
dimension of clusterid is equal to nrows (the number of genes). Otherwise, it | |
is equal to ncolumns (the number of samples). | |
cdata (output) double[nclusters][ncolumns] if transpose == 0 | |
double[nrows][nclusters] otherwise | |
On exit of getclustermedians, this array contains the cluster centroids. | |
cmask (output) int[nclusters][ncolumns] if transpose == 0 | |
int[nrows][nclusters] otherwise | |
This array shows which data values of are missing for each centroid. If | |
cmask[i][j] == 0, then cdata[i][j] is missing. A data value is missing for | |
a centroid if all corresponding data values of the cluster members are missing. | |
transpose (input) int | |
If transpose == 0, clusters of rows (genes) are specified. Otherwise, clusters | |
of columns (samples) are specified. | |
cache (input) double[nrows] if transpose == 0 | |
double[ncolumns] otherwise | |
This array should be allocated before calling getclustermedians; its contents | |
on input is not relevant. This array is used as a temporary storage space when | |
calculating the medians. | |
======================================================================== | |
*/ | |
{ | |
int i, j, k; | |
if (transpose == 0) { | |
for (i = 0; i < nclusters; i++) { | |
for (j = 0; j < ncolumns; j++) { | |
int count = 0; | |
for (k = 0; k < nrows; k++) { | |
if (i == clusterid[k] && mask[k][j]) { | |
cache[count] = data[k][j]; | |
count++; | |
} | |
} | |
if (count>0) { | |
cdata[i][j] = median(count, cache); | |
cmask[i][j] = 1; | |
} | |
else { | |
cdata[i][j] = 0.; | |
cmask[i][j] = 0; | |
} | |
} | |
} | |
} | |
else { | |
for (i = 0; i < nclusters; i++) { | |
for (j = 0; j < nrows; j++) { | |
int count = 0; | |
for (k = 0; k < ncolumns; k++) { | |
if (i == clusterid[k] && mask[j][k]) { | |
cache[count] = data[j][k]; | |
count++; | |
} | |
} | |
if (count>0) { | |
cdata[j][i] = median(count, cache); | |
cmask[j][i] = 1; | |
} | |
else { | |
cdata[j][i] = 0.; | |
cmask[j][i] = 0; | |
} | |
} | |
} | |
} | |
} | |
/* ********************************************************************* */ | |
int | |
getclustercentroids(int nclusters, int nrows, int ncolumns, | |
double** data, int** mask, int clusterid[], double** cdata, int** cmask, | |
int transpose, char method) | |
/* | |
Purpose | |
======= | |
The getclustercentroids routine calculates the cluster centroids, given to | |
which cluster each element belongs. Depending on the argument method, the | |
centroid is defined as either the mean or the median for each dimension over | |
all elements belonging to a cluster. | |
Arguments | |
========= | |
nclusters (input) int | |
The number of clusters. | |
nrows (input) int | |
The number of rows in the gene expression data matrix, equal to the number of | |
genes. | |
ncolumns (input) int | |
The number of columns in the gene expression data matrix, equal to the number | |
of samples. | |
data (input) double[nrows][ncolumns] | |
The array containing the gene expression data. | |
mask (input) int[nrows][ncolumns] | |
This array shows which data values are missing. If mask[i][j] == 0, then | |
data[i][j] is missing. | |
clusterid (output) int[nrows] if transpose == 0 | |
int[ncolumns] otherwise | |
The cluster number to which each element belongs. If transpose == 0, then the | |
dimension of clusterid is equal to nrows (the number of genes). Otherwise, it | |
is equal to ncolumns (the number of samples). | |
cdata (output) double[nclusters][ncolumns] if transpose == 0 | |
double[nrows][nclusters] otherwise | |
On exit of getclustercentroids, this array contains the cluster centroids. | |
cmask (output) int[nclusters][ncolumns] if transpose == 0 | |
int[nrows][nclusters] otherwise | |
This array shows which data values of are missing for each centroid. If | |
cmask[i][j] == 0, then cdata[i][j] is missing. A data value is missing for | |
a centroid if all corresponding data values of the cluster members are missing. | |
transpose (input) int | |
If transpose == 0, clusters of rows (genes) are specified. Otherwise, clusters | |
of columns (samples) are specified. | |
method (input) char | |
For method == 'a', the centroid is defined as the mean over all elements | |
belonging to a cluster for each dimension. | |
For method == 'm', the centroid is defined as the median over all elements | |
belonging to a cluster for each dimension. | |
Return value | |
============ | |
The function returns an integer to indicate success or failure. If a | |
memory error occurs, or if method is not 'm' or 'a', getclustercentroids | |
returns 0. If successful, getclustercentroids returns 1. | |
======================================================================== | |
*/ | |
{ | |
switch(method) { | |
case 'm': { | |
const int nelements = (transpose == 0) ? nrows : ncolumns; | |
double* cache = malloc(nelements*sizeof(double)); | |
if (!cache) return 0; | |
getclustermedians(nclusters, nrows, ncolumns, data, mask, | |
clusterid, cdata, cmask, transpose, cache); | |
free(cache); | |
return 1; | |
} | |
case 'a': { | |
getclustermeans(nclusters, nrows, ncolumns, data, mask, | |
clusterid, cdata, cmask, transpose); | |
return 1; | |
} | |
} | |
return 0; | |
} | |
/* ********************************************************************* */ | |
void | |
getclustermedoids(int nclusters, int nelements, double** distance, | |
int clusterid[], int centroids[], double errors[]) | |
/* | |
Purpose | |
======= | |
The getclustermedoids routine calculates the cluster centroids, given to which | |
cluster each element belongs. The centroid is defined as the element with the | |
smallest sum of distances to the other elements. | |
Arguments | |
========= | |
nclusters (input) int | |
The number of clusters. | |
nelements (input) int | |
The total number of elements. | |
distmatrix (input) double array, ragged | |
(number of rows is nelements, number of columns is equal to the row number) | |
The distance matrix. To save space, the distance matrix is given in the | |
form of a ragged array. The distance matrix is symmetric and has zeros | |
on the diagonal. See distancematrix for a description of the content. | |
clusterid (output) int[nelements] | |
The cluster number to which each element belongs. | |
centroid (output) int[nclusters] | |
The index of the element that functions as the centroid for each cluster. | |
errors (output) double[nclusters] | |
The within-cluster sum of distances between the items and the cluster | |
centroid. | |
======================================================================== | |
*/ | |
{ | |
int i, j, k; | |
for (j = 0; j < nclusters; j++) errors[j] = DBL_MAX; | |
for (i = 0; i < nelements; i++) { | |
double d = 0.0; | |
j = clusterid[i]; | |
for (k = 0; k < nelements; k++) { | |
if (i == k || clusterid[k]!=j) continue; | |
d += (i < k ? distance[k][i] : distance[i][k]); | |
if (d > errors[j]) break; | |
} | |
if (d < errors[j]) { | |
errors[j] = d; | |
centroids[j] = i; | |
} | |
} | |
} | |
/* ********************************************************************* */ | |
static int | |
kmeans(int nclusters, int nrows, int ncolumns, double** data, int** mask, | |
double weight[], int transpose, int npass, char dist, | |
double** cdata, int** cmask, int clusterid[], double* error, | |
int tclusterid[], int counts[], int mapping[]) | |
{ | |
int i, j, k; | |
const int nelements = (transpose == 0) ? nrows : ncolumns; | |
const int ndata = (transpose == 0) ? ncolumns : nrows; | |
int ifound = 1; | |
int ipass = 0; | |
/* Set the metric function as indicated by dist */ | |
double (*metric) (int, double**, double**, int**, int**, | |
const double[], int, int, int) = setmetric(dist); | |
/* Save the clustering solution periodically and check if it reappears */ | |
int* saved = malloc(nelements*sizeof(int)); | |
if (saved == NULL) return -1; | |
*error = DBL_MAX; | |
do { | |
double total = DBL_MAX; | |
int counter = 0; | |
int period = 10; | |
/* Perform the EM algorithm. | |
* First, randomly assign elements to clusters. */ | |
if (npass != 0) randomassign(nclusters, nelements, tclusterid); | |
for (i = 0; i < nclusters; i++) counts[i] = 0; | |
for (i = 0; i < nelements; i++) counts[tclusterid[i]]++; | |
/* Start the loop */ | |
while (1) { | |
double previous = total; | |
total = 0.0; | |
if (counter % period == 0) { | |
/* Save the current cluster assignments */ | |
for (i = 0; i < nelements; i++) saved[i] = tclusterid[i]; | |
if (period < INT_MAX / 2) period *= 2; | |
} | |
counter++; | |
/* Find the center */ | |
getclustermeans(nclusters, nrows, ncolumns, data, mask, tclusterid, | |
cdata, cmask, transpose); | |
for (i = 0; i < nelements; i++) { | |
double distance; | |
/* Calculate the distances */ | |
k = tclusterid[i]; | |
if (counts[k] == 1) continue; | |
/* No reassignment if that would lead to an empty cluster */ | |
/* Treat the present cluster as a special case */ | |
distance = metric(ndata, data, cdata, mask, cmask, weight, | |
i, k, transpose); | |
for (j = 0; j < nclusters; j++) { | |
double tdistance; | |
if (j == k) continue; | |
tdistance = metric(ndata, data, cdata, mask, cmask, weight, | |
i, j, transpose); | |
if (tdistance < distance) { | |
distance = tdistance; | |
counts[tclusterid[i]]--; | |
tclusterid[i] = j; | |
counts[j]++; | |
} | |
} | |
total += distance; | |
} | |
if (total >= previous) break; | |
/* total >= previous is FALSE on some machines even if total and | |
* previous are bitwise identical. */ | |
for (i = 0; i < nelements; i++) | |
if (saved[i]!=tclusterid[i]) break; | |
if (i == nelements) | |
break; /* Identical solution found; break out of this loop */ | |
} | |
if (npass <= 1) { | |
*error = total; | |
break; | |
} | |
for (i = 0; i < nclusters; i++) mapping[i] = -1; | |
for (i = 0; i < nelements; i++) { | |
j = tclusterid[i]; | |
k = clusterid[i]; | |
if (mapping[k] == -1) mapping[k] = j; | |
else if (mapping[k] != j) { | |
if (total < *error) { | |
ifound = 1; | |
*error = total; | |
for (j = 0; j < nelements; j++) | |
clusterid[j] = tclusterid[j]; | |
} | |
break; | |
} | |
} | |
if (i == nelements) ifound++; /* break statement not encountered */ | |
} while (++ipass < npass); | |
free(saved); | |
return ifound; | |
} | |
/* ---------------------------------------------------------------------- */ | |
static int | |
kmedians(int nclusters, int nrows, int ncolumns, double** data, int** mask, | |
double weight[], int transpose, int npass, char dist, | |
double** cdata, int** cmask, int clusterid[], double* error, | |
int tclusterid[], int counts[], int mapping[], double cache[]) | |
{ | |
int i, j, k; | |
const int nelements = (transpose == 0) ? nrows : ncolumns; | |
const int ndata = (transpose == 0) ? ncolumns : nrows; | |
int ifound = 1; | |
int ipass = 0; | |
int* saved; | |
/* Set the metric function as indicated by dist */ | |
double (*metric) (int, double**, double**, int**, int**, | |
const double[], int, int, int) = setmetric(dist); | |
/* Save the clustering solution periodically and check if it reappears */ | |
saved = malloc(nelements*sizeof(int)); | |
if (saved == NULL) return -1; | |
*error = DBL_MAX; | |
do { | |
double total = DBL_MAX; | |
int counter = 0; | |
int period = 10; | |
/* Perform the EM algorithm. | |
* First, randomly assign elements to clusters. */ | |
if (npass != 0) randomassign(nclusters, nelements, tclusterid); | |
for (i = 0; i < nclusters; i++) counts[i] = 0; | |
for (i = 0; i < nelements; i++) counts[tclusterid[i]]++; | |
/* Start the loop */ | |
while (1) { | |
double previous = total; | |
total = 0.0; | |
if (counter % period == 0) { | |
/* Save the current cluster assignments */ | |
for (i = 0; i < nelements; i++) saved[i] = tclusterid[i]; | |
if (period < INT_MAX / 2) period *= 2; | |
} | |
counter++; | |
/* Find the center */ | |
getclustermedians(nclusters, nrows, ncolumns, data, mask, | |
tclusterid, cdata, cmask, transpose, cache); | |
for (i = 0; i < nelements; i++) { | |
/* Calculate the distances */ | |
double distance; | |
k = tclusterid[i]; | |
if (counts[k] == 1) continue; | |
/* No reassignment if that would lead to an empty cluster */ | |
/* Treat the present cluster as a special case */ | |
distance = metric(ndata, data, cdata, mask, cmask, weight, | |
i, k, transpose); | |
for (j = 0; j < nclusters; j++) { | |
double tdistance; | |
if (j == k) continue; | |
tdistance = metric(ndata, data, cdata, mask, cmask, weight, | |
i, j, transpose); | |
if (tdistance < distance) { | |
distance = tdistance; | |
counts[tclusterid[i]]--; | |
tclusterid[i] = j; | |
counts[j]++; | |
} | |
} | |
total += distance; | |
} | |
if (total >= previous) break; | |
/* total >= previous is FALSE on some machines even if total and | |
* previous are bitwise identical. */ | |
for (i = 0; i < nelements; i++) | |
if (saved[i]!=tclusterid[i]) break; | |
if (i == nelements) | |
break; /* Identical solution found; break out of this loop */ | |
} | |
if (npass <= 1) { | |
*error = total; | |
break; | |
} | |
for (i = 0; i < nclusters; i++) mapping[i] = -1; | |
for (i = 0; i < nelements; i++) { | |
j = tclusterid[i]; | |
k = clusterid[i]; | |
if (mapping[k] == -1) mapping[k] = j; | |
else if (mapping[k] != j) { | |
if (total < *error) { | |
ifound = 1; | |
*error = total; | |
for (j = 0; j < nelements; j++) | |
clusterid[j] = tclusterid[j]; | |
} | |
break; | |
} | |
} | |
if (i == nelements) ifound++; /* break statement not encountered */ | |
} while (++ipass < npass); | |
free(saved); | |
return ifound; | |
} | |
/* ********************************************************************* */ | |
void | |
kcluster(int nclusters, int nrows, int ncolumns, double** data, int** mask, | |
double weight[], int transpose, int npass, char method, char dist, | |
int clusterid[], double* error, int* ifound) | |
/* | |
Purpose | |
======= | |
The kcluster routine performs k-means or k-median clustering on a given set of | |
elements, using the specified distance measure. The number of clusters is given | |
by the user. Multiple passes are being made to find the optimal clustering | |
solution, each time starting from a different initial clustering. | |
Arguments | |
========= | |
nclusters (input) int | |
The number of clusters to be found. | |
data (input) double[nrows][ncolumns] | |
The array containing the data of the elements to be clustered (i.e., the gene | |
expression data). | |
mask (input) int[nrows][ncolumns] | |
This array shows which data values are missing. If | |
mask[i][j] == 0, then data[i][j] is missing. | |
nrows (input) int | |
The number of rows in the data matrix, equal to the number of genes. | |
ncolumns (input) int | |
The number of columns in the data matrix, equal to the number of samples. | |
weight (input) double[ncolumns] if transpose == 0, | |
double[nrows] otherwise | |
The weights that are used to calculate the distance. This is equivalent | |
to including the jth data point weight[j] times in the calculation. The | |
weights can be non-integer. | |
transpose (input) int | |
If transpose == 0, the rows of the matrix are clustered. Otherwise, columns | |
of the matrix are clustered. | |
npass (input) int | |
The number of times clustering is performed. Clustering is performed npass | |
times, each time starting from a different (random) initial assignment of | |
genes to clusters. The clustering solution with the lowest within-cluster sum | |
of distances is chosen. | |
If npass == 0, then the clustering algorithm will be run once, where the | |
initial assignment of elements to clusters is taken from the clusterid array. | |
method (input) char | |
Defines whether the arithmetic mean (method == 'a') or the median | |
(method == 'm') is used to calculate the cluster center. | |
dist (input) char | |
Defines which distance measure is used, as given by the table: | |
dist == 'e': Euclidean distance | |
dist == 'b': City-block distance | |
dist == 'c': correlation | |
dist == 'a': absolute value of the correlation | |
dist == 'u': uncentered correlation | |
dist == 'x': absolute uncentered correlation | |
dist == 's': Spearman's rank correlation | |
dist == 'k': Kendall's tau | |
For other values of dist, the default (Euclidean distance) is used. | |
clusterid (output; input) int[nrows] if transpose == 0 | |
int[ncolumns] otherwise | |
The cluster number to which a gene or microarray was assigned. If npass == 0, | |
then on input clusterid contains the initial clustering assignment from which | |
the clustering algorithm starts. On output, it contains the clustering solution | |
that was found. | |
error (output) double* | |
The sum of distances to the cluster center of each item in the optimal k-means | |
clustering solution that was found. | |
ifound (output) int* | |
The number of times the optimal clustering solution was | |
found. The value of ifound is at least 1; its maximum value is npass. If the | |
number of clusters is larger than the number of elements being clustered, | |
*ifound is set to 0 as an error code. If a memory allocation error occurs, | |
*ifound is set to -1. | |
======================================================================== | |
*/ | |
{ | |
const int nelements = (transpose == 0) ? nrows : ncolumns; | |
const int ndata = (transpose == 0) ? ncolumns : nrows; | |
int i; | |
int ok; | |
int* tclusterid; | |
int* mapping = NULL; | |
double** cdata; | |
int** cmask; | |
int* counts; | |
if (nelements < nclusters) { | |
*ifound = 0; | |
return; | |
} | |
/* More clusters asked for than elements available */ | |
*ifound = -1; | |
/* This will contain the number of elements in each cluster, which is | |
* needed to check for empty clusters. */ | |
counts = malloc(nclusters*sizeof(int)); | |
if (!counts) return; | |
/* Find out if the user specified an initial clustering */ | |
if (npass <= 1) tclusterid = clusterid; | |
else { | |
tclusterid = malloc(nelements*sizeof(int)); | |
if (!tclusterid) { | |
free(counts); | |
return; | |
} | |
mapping = malloc(nclusters*sizeof(int)); | |
if (!mapping) { | |
free(counts); | |
free(tclusterid); | |
return; | |
} | |
for (i = 0; i < nelements; i++) clusterid[i] = 0; | |
} | |
/* Allocate space to store the centroid data */ | |
if (transpose == 0) ok = makedatamask(nclusters, ndata, &cdata, &cmask); | |
else ok = makedatamask(ndata, nclusters, &cdata, &cmask); | |
if (!ok) { | |
free(counts); | |
if (npass>1) { | |
free(tclusterid); | |
free(mapping); | |
} | |
return; | |
} | |
if (method == 'm') { | |
double* cache = malloc(nelements*sizeof(double)); | |
if (cache) { | |
*ifound = kmedians(nclusters, nrows, ncolumns, data, mask, weight, | |
transpose, npass, dist, cdata, cmask, clusterid, | |
error, tclusterid, counts, mapping, cache); | |
free(cache); | |
} | |
} | |
else | |
*ifound = kmeans(nclusters, nrows, ncolumns, data, mask, weight, | |
transpose, npass, dist, cdata, cmask, clusterid, | |
error, tclusterid, counts, mapping); | |
/* Deallocate temporarily used space */ | |
if (npass > 1) { | |
free(mapping); | |
free(tclusterid); | |
} | |
if (transpose == 0) freedatamask(nclusters, cdata, cmask); | |
else freedatamask(ndata, cdata, cmask); | |
free(counts); | |
} | |
/* *********************************************************************** */ | |
void | |
kmedoids(int nclusters, int nelements, double** distmatrix, int npass, | |
int clusterid[], double* error, int* ifound) | |
/* | |
Purpose | |
======= | |
The kmedoids routine performs k-medoids clustering on a given set of elements, | |
using the distance matrix and the number of clusters passed by the user. | |
Multiple passes are being made to find the optimal clustering solution, each | |
time starting from a different initial clustering. | |
Arguments | |
========= | |
nclusters (input) int | |
The number of clusters to be found. | |
nelements (input) int | |
The number of elements to be clustered. | |
distmatrix (input) double array, ragged | |
(number of rows is nelements, number of columns is equal to the row number) | |
The distance matrix. To save space, the distance matrix is given in the | |
form of a ragged array. The distance matrix is symmetric and has zeros | |
on the diagonal. See distancematrix for a description of the content. | |
npass (input) int | |
The number of times clustering is performed. Clustering is performed npass | |
times, each time starting from a different (random) initial assignment of genes | |
to clusters. The clustering solution with the lowest within-cluster sum of | |
distances is chosen. | |
If npass == 0, then the clustering algorithm will be run once, where the | |
initial assignment of elements to clusters is taken from the clusterid array. | |
clusterid (output; input) int[nelements] | |
On input, if npass == 0, then clusterid contains the initial clustering | |
assignment from which the clustering algorithm starts; all numbers in clusterid | |
should be between zero and nelements-1 inclusive. If npass != 0, clusterid is | |
ignored on input. | |
On output, clusterid contains the clustering solution that was found: clusterid | |
contains the number of the cluster to which each item was assigned. On output, | |
the number of a cluster is defined as the item number of the centroid of the | |
cluster. | |
error (output) double | |
The sum of distances to the cluster center of each item in the optimal | |
k-medoids clustering solution that was found. | |
ifound (output) int | |
If kmedoids is successful: the number of times the optimal clustering solution | |
was found. The value of ifound is at least 1; its maximum value is npass. | |
If the user requested more clusters than elements available, ifound is set | |
to 0. If kmedoids fails due to a memory allocation error, ifound is set to -1. | |
======================================================================== | |
*/ | |
{ | |
int i, j, icluster; | |
int* tclusterid; | |
int* saved; | |
int* centroids; | |
double* errors; | |
int ipass = 0; | |
if (nelements < nclusters) { | |
*ifound = 0; | |
return; | |
} /* More clusters asked for than elements available */ | |
*ifound = -1; | |
/* Save the clustering solution periodically and check if it reappears */ | |
saved = malloc(nelements*sizeof(int)); | |
if (saved == NULL) return; | |
centroids = malloc(nclusters*sizeof(int)); | |
if (!centroids) { | |
free(saved); | |
return; | |
} | |
errors = malloc(nclusters*sizeof(double)); | |
if (!errors) { | |
free(saved); | |
free(centroids); | |
return; | |
} | |
/* Find out if the user specified an initial clustering */ | |
if (npass <= 1) tclusterid = clusterid; | |
else { | |
tclusterid = malloc(nelements*sizeof(int)); | |
if (!tclusterid) { | |
free(saved); | |
free(centroids); | |
free(errors); | |
return; | |
} | |
for (i = 0; i < nelements; i++) clusterid[i] = -1; | |
} | |
*error = DBL_MAX; | |
do /* Start the loop */ { | |
double total = DBL_MAX; | |
int counter = 0; | |
int period = 10; | |
if (npass != 0) randomassign(nclusters, nelements, tclusterid); | |
while (1) { | |
double previous = total; | |
total = 0.0; | |
if (counter % period == 0) { | |
/* Save the current cluster assignments */ | |
for (i = 0; i < nelements; i++) saved[i] = tclusterid[i]; | |
if (period < INT_MAX / 2) period *= 2; | |
} | |
counter++; | |
/* Find the center */ | |
getclustermedoids(nclusters, nelements, distmatrix, tclusterid, | |
centroids, errors); | |
for (i = 0; i < nelements; i++) { | |
/* Find the closest cluster */ | |
double distance = DBL_MAX; | |
for (icluster = 0; icluster < nclusters; icluster++) { | |
double tdistance; | |
j = centroids[icluster]; | |
if (i == j) { | |
distance = 0.0; | |
tclusterid[i] = icluster; | |
break; | |
} | |
tdistance = (i > j) ? distmatrix[i][j] : distmatrix[j][i]; | |
if (tdistance < distance) { | |
distance = tdistance; | |
tclusterid[i] = icluster; | |
} | |
} | |
total += distance; | |
} | |
if (total >= previous) break; | |
/* total >= previous is FALSE on some machines even if total and | |
* previous are bitwise identical. */ | |
for (i = 0; i < nelements; i++) | |
if (saved[i] != tclusterid[i]) break; | |
if (i == nelements) | |
break; /* Identical solution found; break out of this loop */ | |
} | |
if (npass <= 1) { | |
*ifound = 1; | |
*error = total; | |
/* Replace by the centroid in each cluster. */ | |
for (j = 0; j < nelements; j++) { | |
clusterid[j] = centroids[tclusterid[j]]; | |
} | |
break; | |
} | |
for (i = 0; i < nelements; i++) { | |
if (clusterid[i]!=centroids[tclusterid[i]]) { | |
if (total < *error) { | |
*ifound = 1; | |
*error = total; | |
/* Replace by the centroid in each cluster. */ | |
for (j = 0; j < nelements; j++) { | |
clusterid[j] = centroids[tclusterid[j]]; | |
} | |
} | |
break; | |
} | |
} | |
if (i == nelements) (*ifound)++; /* break statement not encountered */ | |
} while (++ipass < npass); | |
/* Deallocate temporarily used space */ | |
if (npass > 1) free(tclusterid); | |
free(saved); | |
free(centroids); | |
free(errors); | |
} | |
/* ******************************************************************** */ | |
void | |
distancematrix(int nrows, int ncolumns, double** data, int** mask, | |
double weights[], char dist, int transpose, double** matrix) | |
/* | |
Purpose | |
======= | |
The distancematrix routine calculates the distance matrix between genes or | |
samples using their measured gene expression data. Several distance measures | |
can be used. As the distance matrix is symmetric, with zeros on the diagonal, | |
only the lower triangular half of the distance matrix is stored. | |
Space for the distance matrix should be allocated before calling this routine. | |
If the parameter transpose is set to a nonzero value, the distances between | |
columns of the data matrix are calculated, otherwise distances between the rows | |
are calculated. | |
Arguments | |
========= | |
nrows (input) int | |
The number of rows in the gene expression data matrix (i.e., the number of | |
genes) | |
ncolumns (input) int | |
The number of columns in the gene expression data matrix (i.e., the number of | |
samples) | |
data (input) double[nrows][ncolumns] | |
The array containing the gene expression data. | |
mask (input) int[nrows][ncolumns] | |
This array shows which data values are missing. If mask[i][j] == 0, then | |
data[i][j] is missing. | |
weight (input) double[ncolumns] if transpose == 0, | |
double[nrows] otherwise | |
The weights that are used to calculate the distance. This is equivalent | |
to including the jth data point weight[j] times in the calculation. The | |
weights can be non-integer. | |
dist (input) char | |
Defines which distance measure is used, as given by the table: | |
dist == 'e': Euclidean distance | |
dist == 'b': City-block distance | |
dist == 'c': correlation | |
dist == 'a': absolute value of the correlation | |
dist == 'u': uncentered correlation | |
dist == 'x': absolute uncentered correlation | |
dist == 's': Spearman's rank correlation | |
dist == 'k': Kendall's tau | |
For other values of dist, the default (Euclidean distance) is used. | |
transpose (input) int | |
If transpose is equal to zero, the distances between the rows is | |
calculated. Otherwise, the distances between the columns is calculated. | |
The former is needed when genes are being clustered; the latter is used | |
when samples are being clustered. | |
distmatrix (output) double** | |
A ragged array, with the number of columns in each row is equal to the | |
row index (so distmatrix[i] has i columns). Upon return, the values of | |
the distance matrix are stored in this array. | |
======================================================================== | |
*/ | |
{ | |
/* First determine the size of the distance matrix */ | |
const int n = (transpose == 0) ? nrows : ncolumns; | |
const int ndata = (transpose == 0) ? ncolumns : nrows; | |
int i, j; | |
/* Set the metric function as indicated by dist */ | |
double (*metric) (int, double**, double**, int**, int**, | |
const double[], int, int, int) = setmetric(dist); | |
/* Calculate the distances and save them in the ragged array */ | |
for (i = 1; i < n; i++) | |
for (j = 0; j < i; j++) | |
matrix[i][j] = metric(ndata, data, data, mask, mask, weights, | |
i, j, transpose); | |
} | |
/* ******************************************************************** */ | |
double* | |
calculate_weights(int nrows, int ncolumns, double** data, int** mask, | |
double weights[], int transpose, char dist, double cutoff, double exponent) | |
/* | |
Purpose | |
======= | |
This function calculates the weights using the weighting scheme proposed by | |
Michael Eisen: | |
w[i] = 1.0 / sum_{j where d[i][j]<cutoff} (1 - d[i][j]/cutoff)^exponent | |
where the cutoff and the exponent are specified by the user. | |
Arguments | |
========= | |
nrows (input) int | |
The number of rows in the gene expression data matrix, equal to the number of | |
genes. | |
ncolumns (input) int | |
The number of columns in the gene expression data matrix, equal to the number | |
of samples. | |
data (input) double[nrows][ncolumns] | |
The array containing the gene expression data. | |
mask (input) int[nrows][ncolumns] | |
This array shows which data values are missing. If mask[i][j] == 0, then | |
data[i][j] is missing. | |
weight (input) double[ncolumns] if transpose == 0, | |
double[nrows] otherwise | |
The weights that are used to calculate the distance. This is equivalent | |
to including the jth data point weight[j] times in the calculation. The | |
weights can be non-integer. | |
transpose (input) int | |
If transpose == 0, the weights of the rows of the data matrix are calculated. | |
Otherwise, the weights of the columns of the data matrix are calculated. | |
dist (input) char | |
Defines which distance measure is used, as given by the table: | |
dist == 'e': Euclidean distance | |
dist == 'b': City-block distance | |
dist == 'c': correlation | |
dist == 'a': absolute value of the correlation | |
dist == 'u': uncentered correlation | |
dist == 'x': absolute uncentered correlation | |
dist == 's': Spearman's rank correlation | |
dist == 'k': Kendall's tau | |
For other values of dist, the default (Euclidean distance) is used. | |
cutoff (input) double | |
The cutoff to be used to calculate the weights. | |
exponent (input) double | |
The exponent to be used to calculate the weights. | |
Return value | |
============ | |
The function returns a pointer to a newly allocated array containing the | |
calculated weights for the rows (if transpose == 0) or columns (if | |
transpose != 0). If not enough memory could be allocated to store the | |
weights array, the function returns NULL. | |
======================================================================== | |
*/ | |
{ | |
int i, j; | |
const int ndata = (transpose == 0) ? ncolumns : nrows; | |
const int nelements = (transpose == 0) ? nrows : ncolumns; | |
/* Set the metric function as indicated by dist */ | |
double (*metric) (int, double**, double**, int**, int**, | |
const double[], int, int, int) = setmetric(dist); | |
double* result; | |
result = malloc(nelements*sizeof(double)); | |
if (!result) return NULL; | |
memset(result, 0, nelements*sizeof(double)); | |
for (i = 0; i < nelements; i++) { | |
result[i] += 1.0; | |
for (j = 0; j < i; j++) { | |
const double distance = metric(ndata, data, data, mask, mask, | |
weights, i, j, transpose); | |
if (distance < cutoff) { | |
const double dweight = exp(exponent*log(1-distance/cutoff)); | |
/* pow() causes a crash on AIX */ | |
result[i] += dweight; | |
result[j] += dweight; | |
} | |
} | |
} | |
for (i = 0; i < nelements; i++) result[i] = 1.0/result[i]; | |
return result; | |
} | |
/* ******************************************************************** */ | |
int | |
cuttree(int nelements, const Node* tree, int nclusters, int clusterid[]) | |
/* | |
Purpose | |
======= | |
The cuttree routine takes the output of a hierarchical clustering routine, and | |
divides the elements in the tree structure into clusters based on the | |
hierarchical clustering result. The number of clusters is specified by the | |
user. | |
Arguments | |
========= | |
nelements (input) int | |
The number of elements that were clustered. | |
tree (input) Node[nelements-1] | |
The clustering solution. Each node in the array describes one linking event, | |
with tree[i].left and tree[i].right representing the elements that were joined. | |
The original elements are numbered 0..nelements-1, nodes are numbered | |
-1..-(nelements-1). | |
nclusters (input) int | |
The number of clusters to be formed. | |
clusterid (output) int[nelements] | |
The number of the cluster to which each element was assigned. Clusters are | |
numbered 0..nclusters-1 in the left-to-right order in which they appear in the | |
hierarchical clustering tree. Space for the clusterid array should be allocated | |
before calling the cuttree routine. | |
Return value | |
============ | |
If no errors occur, cuttree returns 1. | |
If a memory error occurs, cuttree returns 0. | |
======================================================================== | |
*/ | |
{ | |
int i = -nelements+1; /* top node */ | |
int j; | |
int k = -1; | |
int previous = nelements; | |
const int n = nelements-nclusters; /* number of nodes to join */ | |
int* parents; | |
if (nclusters == 1) { | |
for (i = 0; i < nelements; i++) clusterid[i] = 0; | |
return 1; | |
} | |
parents = malloc((nelements-1)*sizeof(int)); | |
if (!parents) return 0; | |
while (1) { | |
if (i >= 0) { | |
clusterid[i] = k; | |
j = i; | |
i = previous; | |
previous = j; | |
} | |
else { | |
j = -i-1; | |
if (previous == tree[j].left) { | |
previous = i; | |
i = tree[j].right; | |
if (j >= n && (i >= 0 || -i-1 < n)) k++; | |
} | |
else if (previous == tree[j].right) { | |
previous = i; | |
i = parents[j]; | |
if (i == nelements) break; | |
} | |
else { | |
parents[j] = previous; | |
previous = i; | |
i = tree[j].left; | |
if (j >= n && (i >= 0 || -i-1 < n)) k++; | |
} | |
} | |
} | |
free(parents); | |
return 1; | |
} | |
/* ******************************************************************** */ | |
static Node* | |
pclcluster(int nrows, int ncolumns, double** data, int** mask, double weight[], | |
double** distmatrix, char dist, int transpose) | |
/* | |
Purpose | |
======= | |
The pclcluster routine performs clustering using pairwise centroid-linking on a | |
given set of gene expression data, using the distance metric given by dist. | |
Arguments | |
========= | |
nrows (input) int | |
The number of rows in the gene expression data matrix, equal to the number of | |
genes. | |
ncolumns (input) int | |
The number of columns in the gene expression data matrix, equal to the number | |
of samples. | |
data (input) double[nrows][ncolumns] | |
The array containing the gene expression data. | |
mask (input) int[nrows][ncolumns] | |
This array shows which data values are missing. If | |
mask[i][j] == 0, then data[i][j] is missing. | |
weight (input) double[ncolumns] if transpose == 0; | |
double[nrows] otherwise | |
The weights that are used to calculate the distance. This is equivalent | |
to including the jth data point weight[j] times in the calculation. The | |
weights can be non-integer. | |
transpose (input) int | |
If transpose == 0, the rows of the matrix are clustered. Otherwise, columns | |
of the matrix are clustered. | |
dist (input) char | |
Defines which distance measure is used, as given by the table: | |
dist == 'e': Euclidean distance | |
dist == 'b': City-block distance | |
dist == 'c': correlation | |
dist == 'a': absolute value of the correlation | |
dist == 'u': uncentered correlation | |
dist == 'x': absolute uncentered correlation | |
dist == 's': Spearman's rank correlation | |
dist == 'k': Kendall's tau | |
For other values of dist, the default (Euclidean distance) is used. | |
distmatrix (input) double** | |
The distance matrix. This matrix is precalculated by the calling routine | |
treecluster. The pclcluster routine modifies the contents of distmatrix, but | |
does not deallocate it. | |
Return value | |
============ | |
A pointer to a newly allocated array of Node structs, describing the | |
hierarchical clustering solution consisting of nelements-1 nodes. Depending | |
on whether genes (rows) or samples (columns) were clustered, nelements is | |
equal to nrows or ncolumns. See src/cluster.h for a description of the Node | |
structure. | |
If a memory error occurs, pclcluster returns NULL. | |
======================================================================== | |
*/ | |
{ | |
int i, j; | |
const int nelements = (transpose == 0) ? nrows : ncolumns; | |
int inode; | |
const int ndata = transpose ? nrows : ncolumns; | |
const int nnodes = nelements - 1; | |
Node* result; | |
double** newdata; | |
int** newmask; | |
int* distid; | |
/* Set the metric function as indicated by dist */ | |
double (*metric) (int, double**, double**, int**, int**, | |
const double[], int, int, int) = setmetric(dist); | |
distid = malloc(nelements*sizeof(int)); | |
if (!distid) return NULL; | |
result = malloc(nnodes*sizeof(Node)); | |
if (!result) { | |
free(distid); | |
return NULL; | |
} | |
if (!makedatamask(nelements, ndata, &newdata, &newmask)) { | |
free(result); | |
free(distid); | |
return NULL; | |
} | |
for (i = 0; i < nelements; i++) distid[i] = i; | |
/* To remember which row/column in the distance matrix contains what */ | |
/* Storage for node data */ | |
if (transpose) { | |
for (i = 0; i < nelements; i++) { | |
for (j = 0; j < ndata; j++) { | |
newdata[i][j] = data[j][i]; | |
newmask[i][j] = mask[j][i]; | |
} | |
} | |
data = newdata; | |
mask = newmask; | |
} | |
else { | |
for (i = 0; i < nelements; i++) { | |
memcpy(newdata[i], data[i], ndata*sizeof(double)); | |
memcpy(newmask[i], mask[i], ndata*sizeof(int)); | |
} | |
data = newdata; | |
mask = newmask; | |
} | |
for (inode = 0; inode < nnodes; inode++) { | |
/* Find the pair with the shortest distance */ | |
int is = 1; | |
int js = 0; | |
result[inode].distance = find_closest_pair(nelements-inode, distmatrix, | |
&is, &js); | |
result[inode].left = distid[js]; | |
result[inode].right = distid[is]; | |
/* Make node js the new node */ | |
for (i = 0; i < ndata; i++) { | |
data[js][i] = data[js][i]*mask[js][i] + data[is][i]*mask[is][i]; | |
mask[js][i] += mask[is][i]; | |
if (mask[js][i]) data[js][i] /= mask[js][i]; | |
} | |
free(data[is]); | |
free(mask[is]); | |
data[is] = data[nnodes-inode]; | |
mask[is] = mask[nnodes-inode]; | |
/* Fix the distances */ | |
distid[is] = distid[nnodes-inode]; | |
for (i = 0; i < is; i++) | |
distmatrix[is][i] = distmatrix[nnodes-inode][i]; | |
for (i = is + 1; i < nnodes-inode; i++) | |
distmatrix[i][is] = distmatrix[nnodes-inode][i]; | |
distid[js] = -inode-1; | |
for (i = 0; i < js; i++) | |
distmatrix[js][i] = metric(ndata, data, data, mask, mask, weight, | |
js, i, 0); | |
for (i = js + 1; i < nnodes-inode; i++) | |
distmatrix[i][js] = metric(ndata, data, data, mask, mask, weight, | |
js, i, 0); | |
} | |
/* Free temporarily allocated space */ | |
free(data[0]); | |
free(mask[0]); | |
free(data); | |
free(mask); | |
free(distid); | |
return result; | |
} | |
/* ******************************************************************** */ | |
static int | |
nodecompare(const void* a, const void* b) | |
/* Helper function for qsort. */ | |
{ | |
const Node* node1 = (const Node*)a; | |
const Node* node2 = (const Node*)b; | |
const double term1 = node1->distance; | |
const double term2 = node2->distance; | |
if (term1 < term2) return -1; | |
if (term1 > term2) return +1; | |
return 0; | |
} | |
/* ---------------------------------------------------------------------- */ | |
static Node* | |
pslcluster(int nrows, int ncolumns, double** data, int** mask, | |
double weight[], double** distmatrix, char dist, int transpose) | |
/* | |
Purpose | |
======= | |
The pslcluster routine performs single-linkage hierarchical clustering, using | |
either the distance matrix directly, if available, or by calculating the | |
distances from the data array. This implementation is based on the SLINK | |
algorithm, described in: | |
Sibson, R. (1973). SLINK: An optimally efficient algorithm for the single-link | |
cluster method. The Computer Journal, 16(1): 30-34. | |
The output of this algorithm is identical to conventional single-linkage | |
hierarchical clustering, but is much more memory-efficient and faster. Hence, | |
it can be applied to large data sets, for which the conventional single- | |
linkage algorithm fails due to lack of memory. | |
Arguments | |
========= | |
nrows (input) int | |
The number of rows in the gene expression data matrix, equal to the number of | |
genes. | |
ncolumns (input) int | |
The number of columns in the gene expression data matrix, equal to the number | |
of samples. | |
data (input) double[nrows][ncolumns] | |
The array containing the gene expression data. | |
mask (input) int[nrows][ncolumns] | |
This array shows which data values are missing. If | |
mask[i][j] == 0, then data[i][j] is missing. | |
weight (input) double[ncolumns] if transpose == 0, | |
double[nrows] otherwise | |
The weights that are used to calculate the distance. This is equivalent | |
to including the jth data point weight[j] times in the calculation. The | |
weights can be non-integer. | |
transpose (input) int | |
If transpose == 0, the rows of the matrix are clustered. Otherwise, columns | |
of the matrix are clustered. | |
dist (input) char | |
Defines which distance measure is used, as given by the table: | |
dist == 'e': Euclidean distance | |
dist == 'b': City-block distance | |
dist == 'c': correlation | |
dist == 'a': absolute value of the correlation | |
dist == 'u': uncentered correlation | |
dist == 'x': absolute uncentered correlation | |
dist == 's': Spearman's rank correlation | |
dist == 'k': Kendall's tau | |
For other values of dist, the default (Euclidean distance) is used. | |
distmatrix (input) double** | |
The distance matrix. If the distance matrix is passed by the calling routine | |
treecluster, it is used by pslcluster to speed up the clustering calculation. | |
The pslcluster routine does not modify the contents of distmatrix, and does | |
not deallocate it. If distmatrix is NULL, the pairwise distances are calculated | |
by the pslcluster routine from the gene expression data (the data and mask | |
arrays) and stored in temporary arrays. If distmatrix is passed, the original | |
gene expression data (specified by the data and mask arguments) are not needed | |
and are therefore ignored. | |
Return value | |
============ | |
A pointer to a newly allocated array of Node structs, describing the | |
hierarchical clustering solution consisting of nelements-1 nodes. Depending | |
on whether genes (rows) or samples (columns) were clustered, nelements is | |
equal to nrows or ncolumns. See src/cluster.h for a description of the Node | |
structure. | |
If a memory error occurs, pslcluster returns NULL. | |
======================================================================== | |
*/ | |
{ | |
int i, j, k; | |
const int nelements = transpose ? ncolumns : nrows; | |
const int nnodes = nelements - 1; | |
int* vector; | |
double* temp; | |
int* index; | |
Node* result; | |
temp = malloc(nnodes*sizeof(double)); | |
if (!temp) return NULL; | |
index = malloc(nelements*sizeof(int)); | |
if (!index) { | |
free(temp); | |
return NULL; | |
} | |
vector = malloc(nnodes*sizeof(int)); | |
if (!vector) { | |
free(index); | |
free(temp); | |
return NULL; | |
} | |
result = malloc(nelements*sizeof(Node)); | |
if (!result) { | |
free(vector); | |
free(index); | |
free(temp); | |
return NULL; | |
} | |
for (i = 0; i < nnodes; i++) vector[i] = i; | |
if (distmatrix) { | |
for (i = 0; i < nrows; i++) { | |
result[i].distance = DBL_MAX; | |
for (j = 0; j < i; j++) temp[j] = distmatrix[i][j]; | |
for (j = 0; j < i; j++) { | |
k = vector[j]; | |
if (result[j].distance >= temp[j]) { | |
if (result[j].distance < temp[k]) | |
temp[k] = result[j].distance; | |
result[j].distance = temp[j]; | |
vector[j] = i; | |
} | |
else if (temp[j] < temp[k]) temp[k] = temp[j]; | |
} | |
for (j = 0; j < i; j++) { | |
if (result[j].distance >= result[vector[j]].distance) | |
vector[j] = i; | |
} | |
} | |
} | |
else { | |
const int ndata = transpose ? nrows : ncolumns; | |
/* Set the metric function as indicated by dist */ | |
double (*metric) (int, double**, double**, int**, int**, | |
const double[], int, int, int) = setmetric(dist); | |
for (i = 0; i < nelements; i++) { | |
result[i].distance = DBL_MAX; | |
for (j = 0; j < i; j++) temp[j] = | |
metric(ndata, data, data, mask, mask, weight, i, j, transpose); | |
for (j = 0; j < i; j++) { | |
k = vector[j]; | |
if (result[j].distance >= temp[j]) { | |
if (result[j].distance < temp[k]) | |
temp[k] = result[j].distance; | |
result[j].distance = temp[j]; | |
vector[j] = i; | |
} | |
else if (temp[j] < temp[k]) temp[k] = temp[j]; | |
} | |
for (j = 0; j < i; j++) | |
if (result[j].distance >= result[vector[j]].distance) | |
vector[j] = i; | |
} | |
} | |
free(temp); | |
for (i = 0; i < nnodes; i++) result[i].left = i; | |
qsort(result, nnodes, sizeof(Node), nodecompare); | |
for (i = 0; i < nelements; i++) index[i] = i; | |
for (i = 0; i < nnodes; i++) { | |
j = result[i].left; | |
k = vector[j]; | |
result[i].left = index[j]; | |
result[i].right = index[k]; | |
index[k] = -i-1; | |
} | |
free(vector); | |
free(index); | |
result = realloc(result, nnodes*sizeof(Node)); | |
return result; | |
} | |
/* ******************************************************************** */ | |
static Node* | |
pmlcluster(int nelements, double** distmatrix) | |
/* | |
Purpose | |
======= | |
The pmlcluster routine performs clustering using pairwise maximum- (complete-) | |
linking on the given distance matrix. | |
Arguments | |
========= | |
nelements (input) int | |
The number of elements to be clustered. | |
distmatrix (input) double** | |
The distance matrix, with nelements rows, each row being filled up to the | |
diagonal. The elements on the diagonal are not used, as they are assumed to be | |
zero. The distance matrix will be modified by this routine. | |
Return value | |
============ | |
A pointer to a newly allocated array of Node structs, describing the | |
hierarchical clustering solution consisting of nelements-1 nodes. Depending on | |
whether genes (rows) or samples (columns) were clustered, nelements is equal | |
to nrows or ncolumns. See src/cluster.h for a description of the Node | |
structure. | |
If a memory error occurs, pmlcluster returns NULL. | |
======================================================================== | |
*/ | |
{ | |
int j; | |
int n; | |
int* clusterid; | |
Node* result; | |
clusterid = malloc(nelements*sizeof(int)); | |
if (!clusterid) return NULL; | |
result = malloc((nelements-1)*sizeof(Node)); | |
if (!result) { | |
free(clusterid); | |
return NULL; | |
} | |
/* Setup a list specifying to which cluster a gene belongs */ | |
for (j = 0; j < nelements; j++) clusterid[j] = j; | |
for (n = nelements; n > 1; n--) { | |
int is = 1; | |
int js = 0; | |
result[nelements-n].distance = find_closest_pair(n, distmatrix, | |
&is, &js); | |
/* Fix the distances */ | |
for (j = 0; j < js; j++) | |
distmatrix[js][j] = max(distmatrix[is][j], distmatrix[js][j]); | |
for (j = js+1; j < is; j++) | |
distmatrix[j][js] = max(distmatrix[is][j], distmatrix[j][js]); | |
for (j = is+1; j < n; j++) | |
distmatrix[j][js] = max(distmatrix[j][is], distmatrix[j][js]); | |
for (j = 0; j < is; j++) distmatrix[is][j] = distmatrix[n-1][j]; | |
for (j = is+1; j < n-1; j++) distmatrix[j][is] = distmatrix[n-1][j]; | |
/* Update clusterids */ | |
result[nelements-n].left = clusterid[is]; | |
result[nelements-n].right = clusterid[js]; | |
clusterid[js] = n-nelements-1; | |
clusterid[is] = clusterid[n-1]; | |
} | |
free(clusterid); | |
return result; | |
} | |
/* ******************************************************************* */ | |
static Node* | |
palcluster(int nelements, double** distmatrix) | |
/* | |
Purpose | |
======= | |
The palcluster routine performs clustering using pairwise average | |
linking on the given distance matrix. | |
Arguments | |
========= | |
nelements (input) int | |
The number of elements to be clustered. | |
distmatrix (input) double** | |
The distance matrix, with nelements rows, each row being filled up to the | |
diagonal. The elements on the diagonal are not used, as they are assumed to be | |
zero. The distance matrix will be modified by this routine. | |
Return value | |
============ | |
A pointer to a newly allocated array of Node structs, describing the | |
hierarchical clustering solution consisting of nelements-1 nodes. Depending on | |
whether genes (rows) or samples (columns) were clustered, nelements is equal | |
to nrows or ncolumns. See src/cluster.h for a description of the Node | |
structure. | |
If a memory error occurs, palcluster returns NULL. | |
======================================================================== | |
*/ | |
{ | |
int j; | |
int n; | |
int* clusterid; | |
int* number; | |
Node* result; | |
clusterid = malloc(nelements*sizeof(int)); | |
if (!clusterid) return NULL; | |
number = malloc(nelements*sizeof(int)); | |
if (!number) { | |
free(clusterid); | |
return NULL; | |
} | |
result = malloc((nelements-1)*sizeof(Node)); | |
if (!result) { | |
free(clusterid); | |
free(number); | |
return NULL; | |
} | |
/* Setup a list specifying to which cluster a gene belongs, and keep track | |
* of the number of elements in each cluster (needed to calculate the | |
* average). */ | |
for (j = 0; j < nelements; j++) { | |
number[j] = 1; | |
clusterid[j] = j; | |
} | |
for (n = nelements; n > 1; n--) { | |
int sum; | |
int is = 1; | |
int js = 0; | |
result[nelements-n].distance = find_closest_pair(n, distmatrix, | |
&is, &js); | |
/* Save result */ | |
result[nelements-n].left = clusterid[is]; | |
result[nelements-n].right = clusterid[js]; | |
/* Fix the distances */ | |
sum = number[is] + number[js]; | |
for (j = 0; j < js; j++) { | |
distmatrix[js][j] = distmatrix[is][j]*number[is] | |
+ distmatrix[js][j]*number[js]; | |
distmatrix[js][j] /= sum; | |
} | |
for (j = js+1; j < is; j++) { | |
distmatrix[j][js] = distmatrix[is][j]*number[is] | |
+ distmatrix[j][js]*number[js]; | |
distmatrix[j][js] /= sum; | |
} | |
for (j = is+1; j < n; j++) { | |
distmatrix[j][js] = distmatrix[j][is]*number[is] | |
+ distmatrix[j][js]*number[js]; | |
distmatrix[j][js] /= sum; | |
} | |
for (j = 0; j < is; j++) distmatrix[is][j] = distmatrix[n-1][j]; | |
for (j = is+1; j < n-1; j++) distmatrix[j][is] = distmatrix[n-1][j]; | |
/* Update number of elements in the clusters */ | |
number[js] = sum; | |
number[is] = number[n-1]; | |
/* Update clusterids */ | |
clusterid[js] = n-nelements-1; | |
clusterid[is] = clusterid[n-1]; | |
} | |
free(clusterid); | |
free(number); | |
return result; | |
} | |
/* ******************************************************************* */ | |
Node* | |
treecluster(int nrows, int ncolumns, double** data, int** mask, | |
double weight[], int transpose, char dist, char method, | |
double** distmatrix) | |
/* | |
Purpose | |
======= | |
The treecluster routine performs hierarchical clustering using pairwise | |
single-, maximum-, centroid-, or average-linkage, as defined by method, on a | |
given set of gene expression data, using the distance metric given by dist. | |
If successful, the function returns a pointer to a newly allocated Tree struct | |
containing the hierarchical clustering solution, and NULL if a memory error | |
occurs. The pointer should be freed by the calling routine to prevent memory | |
leaks. | |
Arguments | |
========= | |
nrows (input) int | |
The number of rows in the data matrix, equal to the number of genes. | |
ncolumns (input) int | |
The number of columns in the data matrix, equal to the number of samples. | |
data (input) double[nrows][ncolumns] | |
The array containing the data of the vectors to be clustered. | |
mask (input) int[nrows][ncolumns] | |
This array shows which data values are missing. If mask[i][j] == 0, then | |
data[i][j] is missing. | |
weight (input) double[ncolumns] if transpose == 0, | |
double[nrows] otherwise | |
The weights that are used to calculate the distance. This is equivalent | |
to including the jth data point weight[j] times in the calculation. The | |
weights can be non-integer. | |
transpose (input) int | |
If transpose == 0, the rows of the matrix are clustered. Otherwise, columns | |
of the matrix are clustered. | |
dist (input) char | |
Defines which distance measure is used, as given by the table: | |
dist == 'e': Euclidean distance | |
dist == 'b': City-block distance | |
dist == 'c': correlation | |
dist == 'a': absolute value of the correlation | |
dist == 'u': uncentered correlation | |
dist == 'x': absolute uncentered correlation | |
dist == 's': Spearman's rank correlation | |
dist == 'k': Kendall's tau | |
For other values of dist, the default (Euclidean distance) is used. | |
method (input) char | |
Defines which hierarchical clustering method is used: | |
method == 's': pairwise single-linkage clustering | |
method == 'm': pairwise maximum- (or complete-) linkage clustering | |
method == 'a': pairwise average-linkage clustering | |
method == 'c': pairwise centroid-linkage clustering | |
For the first three, either the distance matrix or the gene expression data is | |
sufficient to perform the clustering algorithm. For pairwise centroid-linkage | |
clustering, however, the gene expression data are always needed, even if the | |
distance matrix itself is available. | |
distmatrix (input) double** | |
The distance matrix. If the distance matrix is zero initially, the distance | |
matrix will be allocated and calculated from the data by treecluster, and | |
deallocated before treecluster returns. If the distance matrix is passed by the | |
calling routine, treecluster will modify the contents of the distance matrix as | |
part of the clustering algorithm, but will not deallocate it. The calling | |
routine should deallocate the distance matrix after the return from | |
treecluster. | |
Return value | |
============ | |
A pointer to a newly allocated array of Node structs, describing the | |
hierarchical clustering solution consisting of nelements-1 nodes. Depending on | |
whether genes (rows) or samples (columns) were clustered, nelements is equal | |
to nrows or ncolumns. See src/cluster.h for a description of the Node | |
structure. | |
If a memory error occurs, treecluster returns NULL. | |
======================================================================== | |
*/ | |
{ | |
Node* result = NULL; | |
const int nelements = (transpose == 0) ? nrows : ncolumns; | |
const int ldistmatrix = (distmatrix == NULL && method != 's') ? 1 : 0; | |
if (nelements < 2) return NULL; | |
/* Calculate the distance matrix if the user didn't give it */ | |
if (ldistmatrix) { | |
/* Set up the ragged array */ | |
int i; | |
distmatrix = malloc(nelements*sizeof(double*)); | |
if (distmatrix == NULL) return NULL; /* Not enough memory available */ | |
distmatrix[0] = NULL; | |
for (i = 1; i < nelements; i++) { | |
distmatrix[i] = malloc(i*sizeof(double)); | |
if (distmatrix[i] == NULL) /* Not enough memory available */ { | |
while (--i > 0) free(distmatrix[i]); | |
free(distmatrix); | |
return NULL; | |
} | |
} | |
distancematrix(nrows, ncolumns, data, mask, weight, dist, transpose, | |
distmatrix); | |
} | |
switch(method) { | |
case 's': | |
result = pslcluster(nrows, ncolumns, data, mask, weight, | |
distmatrix, dist, transpose); | |
break; | |
case 'm': | |
result = pmlcluster(nelements, distmatrix); | |
break; | |
case 'a': | |
result = palcluster(nelements, distmatrix); | |
break; | |
case 'c': | |
result = pclcluster(nrows, ncolumns, data, mask, weight, | |
distmatrix, dist, transpose); | |
break; | |
} | |
/* Deallocate space for distance matrix if allocated by treecluster */ | |
if (ldistmatrix) { | |
int i; | |
for (i = 1; i < nelements; i++) free(distmatrix[i]); | |
free(distmatrix); | |
} | |
return result; | |
} | |
/* ******************************************************************* */ | |
int | |
sorttree(const int nnodes, Node* tree, const double order[], int indices[]) | |
/* | |
Purpose | |
======= | |
The sorttree routine sorts the items in a hierarchical clustering solution | |
based on their order values, while remaining consistent with the hierchical | |
clustering solution. | |
Arguments | |
========= | |
nnodes (input) int | |
The number of nodes in the hierarchical clustering tree. | |
tree (input) Node[nnodes] | |
The hierarchical clustering tree describing the clustering solution. | |
order (input) double[nnodes+1] | |
The preferred order of the items. | |
indices (output) int* | |
The indices of each item after sorting, with item i appearing at indices[i] | |
after sorting. | |
Return value | |
============ | |
If no errors occur, sorttree returns 1. | |
If a memory error occurs, sorttree returns 0. | |
======================================================================== | |
*/ | |
{ | |
int i; | |
int index; | |
int i1, i2; | |
double order1, order2; | |
int counts1, counts2; | |
int* nodecounts; | |
nodecounts = malloc(nnodes*sizeof(int)); | |
if (!nodecounts) return 0; | |
if (order) { | |
double* nodeorder = malloc(nnodes*sizeof(double)); | |
if (!nodeorder) { | |
free(nodecounts); | |
return 0; | |
} | |
for (i = 0; i < nnodes; i++) { | |
i1 = tree[i].left; | |
i2 = tree[i].right; | |
/* i1 and i2 are the elements that are to be joined */ | |
if (i1 < 0) { | |
index = -i1-1; | |
order1 = nodeorder[index]; | |
counts1 = nodecounts[index]; | |
} | |
else { | |
order1 = order[i1]; | |
counts1 = 1; | |
} | |
if (i2 < 0) { | |
index = -i2-1; | |
order2 = nodeorder[index]; | |
counts2 = nodecounts[index]; | |
} | |
else { | |
order2 = order[i2]; | |
counts2 = 1; | |
} | |
if (order1 > order2) { | |
tree[i].left = i2; | |
tree[i].right = i1; | |
} | |
nodecounts[i] = counts1 + counts2; | |
nodeorder[i] = (counts1*order1+counts2*order2) / (counts1+counts2); | |
} | |
free(nodeorder); | |
} | |
else { | |
for (i = 0; i < nnodes; i++) { | |
i1 = tree[i].left; | |
i2 = tree[i].right; | |
/* i1 and i2 are the elements that are to be joined */ | |
counts1 = (i1 < 0) ? nodecounts[-i1-1] : 1; | |
counts2 = (i2 < 0) ? nodecounts[-i2-1] : 1; | |
nodecounts[i] = counts1 + counts2; | |
} | |
} | |
i--; | |
nodecounts[i] = 0; | |
for ( ; i >= 0; i--) { | |
i1 = tree[i].left; | |
i2 = tree[i].right; | |
counts1 = (i1<0) ? nodecounts[-i1-1] : 1; | |
index = nodecounts[i]; | |
if (i1 >= 0) indices[index] = i1; | |
else nodecounts[-i1-1] = index; | |
index += counts1; | |
if (i2 >= 0) indices[index] = i2; | |
else nodecounts[-i2-1] = index; | |
} | |
free(nodecounts); | |
return 1; | |
} | |
/* ******************************************************************* */ | |
static void | |
somworker(int nrows, int ncolumns, double** data, int** mask, | |
const double weights[], int transpose, int nxgrid, int nygrid, | |
double inittau, double*** celldata, int niter, char dist) | |
{ | |
const int nelements = (transpose == 0) ? nrows : ncolumns; | |
const int ndata = (transpose == 0) ? ncolumns : nrows; | |
int i, j; | |
int** dummymask; | |
int ix, iy; | |
int* index; | |
int iter; | |
/* Maximum radius in which nodes are adjusted */ | |
double maxradius = sqrt(nxgrid*nxgrid+nygrid*nygrid); | |
double* stddata = calloc(nelements, sizeof(double)); | |
/* Set the metric function as indicated by dist */ | |
double (*metric) (int, double**, double**, int**, int**, | |
const double[], int, int, int) = setmetric(dist); | |
/* Calculate the standard deviation for each row or column */ | |
if (transpose == 0) { | |
for (i = 0; i < nelements; i++) { | |
int n = 0; | |
for (j = 0; j < ndata; j++) { | |
if (mask[i][j]) { | |
double term = data[i][j]; | |
term = term * term; | |
stddata[i] += term; | |
n++; | |
} | |
} | |
if (stddata[i] > 0) stddata[i] = sqrt(stddata[i]/n); | |
else stddata[i] = 1; | |
} | |
} | |
else { | |
for (i = 0; i < nelements; i++) { | |
int n = 0; | |
for (j = 0; j < ndata; j++) { | |
if (mask[j][i]) { | |
double term = data[j][i]; | |
term = term * term; | |
stddata[i] += term; | |
n++; | |
} | |
} | |
if (stddata[i] > 0) stddata[i] = sqrt(stddata[i]/n); | |
else stddata[i] = 1; | |
} | |
} | |
if (transpose == 0) { | |
dummymask = malloc(nygrid*sizeof(int*)); | |
for (i = 0; i < nygrid; i++) { | |
dummymask[i] = malloc(ndata*sizeof(int)); | |
for (j = 0; j < ndata; j++) dummymask[i][j] = 1; | |
} | |
} | |
else { | |
dummymask = malloc(ndata*sizeof(int*)); | |
for (i = 0; i < ndata; i++) { | |
dummymask[i] = malloc(sizeof(int)); | |
dummymask[i][0] = 1; | |
} | |
} | |
/* Randomly initialize the nodes */ | |
for (ix = 0; ix < nxgrid; ix++) { | |
for (iy = 0; iy < nygrid; iy++) { | |
double sum = 0.; | |
for (i = 0; i < ndata; i++) { | |
double term = -1.0 + 2.0*uniform(); | |
celldata[ix][iy][i] = term; | |
sum += term * term; | |
} | |
sum = sqrt(sum/ndata); | |
for (i = 0; i < ndata; i++) celldata[ix][iy][i] /= sum; | |
} | |
} | |
/* Randomize the order in which genes or arrays will be used */ | |
index = malloc(nelements*sizeof(int)); | |
for (i = 0; i < nelements; i++) index[i] = i; | |
for (i = 0; i < nelements; i++) { | |
j = (int) (i + (nelements-i)*uniform()); | |
ix = index[j]; | |
index[j] = index[i]; | |
index[i] = ix; | |
} | |
/* Start the iteration */ | |
for (iter = 0; iter < niter; iter++) { | |
int ixbest = 0; | |
int iybest = 0; | |
int iobject = iter % nelements; | |
iobject = index[iobject]; | |
if (transpose == 0) { | |
double closest = metric(ndata, data, celldata[ixbest], mask, | |
dummymask, weights, iobject, iybest, | |
transpose); | |
double radius = maxradius * (1. - ((double)iter)/((double)niter)); | |
double tau = inittau * (1. - ((double)iter)/((double)niter)); | |
for (ix = 0; ix < nxgrid; ix++) { | |
for (iy = 0; iy < nygrid; iy++) { | |
double distance = metric(ndata, data, celldata[ix], mask, | |
dummymask, weights, iobject, iy, | |
transpose); | |
if (distance < closest) { | |
ixbest = ix; | |
iybest = iy; | |
closest = distance; | |
} | |
} | |
} | |
for (ix = 0; ix < nxgrid; ix++) { | |
for (iy = 0; iy < nygrid; iy++) { | |
if (sqrt((ix-ixbest)*(ix-ixbest)+(iy-iybest)*(iy-iybest)) < | |
radius) { | |
double sum = 0.; | |
for (i = 0; i < ndata; i++) { | |
if (mask[iobject][i] == 0) continue; | |
celldata[ix][iy][i] += | |
tau * (data[iobject][i]/stddata[iobject] | |
-celldata[ix][iy][i]); | |
} | |
for (i = 0; i < ndata; i++) { | |
double term = celldata[ix][iy][i]; | |
term = term * term; | |
sum += term; | |
} | |
if (sum>0) { | |
sum = sqrt(sum/ndata); | |
for (i = 0; i < ndata; i++) | |
celldata[ix][iy][i] /= sum; | |
} | |
} | |
} | |
} | |
} | |
else { | |
double closest; | |
double** celldatavector = malloc(ndata*sizeof(double*)); | |
double radius = maxradius * (1. - ((double)iter)/((double)niter)); | |
double tau = inittau * (1. - ((double)iter)/((double)niter)); | |
for (i = 0; i < ndata; i++) | |
celldatavector[i] = &(celldata[ixbest][iybest][i]); | |
closest = metric(ndata, data, celldatavector, mask, dummymask, | |
weights, iobject, 0, transpose); | |
for (ix = 0; ix < nxgrid; ix++) { | |
for (iy = 0; iy < nygrid; iy++) { | |
double distance; | |
for (i = 0; i < ndata; i++) | |
celldatavector[i] = &(celldata[ixbest][iybest][i]); | |
distance = metric(ndata, data, celldatavector, mask, | |
dummymask, weights, iobject, 0, | |
transpose); | |
if (distance < closest) { | |
ixbest = ix; | |
iybest = iy; | |
closest = distance; | |
} | |
} | |
} | |
free(celldatavector); | |
for (ix = 0; ix < nxgrid; ix++) { | |
for (iy = 0; iy < nygrid; iy++) { | |
if (sqrt((ix-ixbest)*(ix-ixbest)+(iy-iybest)*(iy-iybest)) < | |
radius) { | |
double sum = 0.; | |
for (i = 0; i < ndata; i++) { | |
if (mask[i][iobject] == 0) continue; | |
celldata[ix][iy][i] += | |
tau * (data[i][iobject]/stddata[iobject] | |
-celldata[ix][iy][i]); | |
} | |
for (i = 0; i < ndata; i++) { | |
double term = celldata[ix][iy][i]; | |
term = term * term; | |
sum += term; | |
} | |
if (sum>0) { | |
sum = sqrt(sum/ndata); | |
for (i = 0; i < ndata; i++) | |
celldata[ix][iy][i] /= sum; | |
} | |
} | |
} | |
} | |
} | |
} | |
if (transpose == 0) | |
for (i = 0; i < nygrid; i++) free(dummymask[i]); | |
else | |
for (i = 0; i < ndata; i++) free(dummymask[i]); | |
free(dummymask); | |
free(stddata); | |
free(index); | |
} | |
/* ******************************************************************* */ | |
static void | |
somassign(int nrows, int ncolumns, double** data, int** mask, | |
const double weights[], int transpose, int nxgrid, int nygrid, | |
double*** celldata, char dist, int clusterid[][2]) | |
/* Collect clusterids */ | |
{ | |
const int ndata = (transpose == 0) ? ncolumns : nrows; | |
int i, j; | |
/* Set the metric function as indicated by dist */ | |
double (*metric) (int, double**, double**, int**, int**, | |
const double[], int, int, int) = setmetric(dist); | |
if (transpose == 0) { | |
int** dummymask = malloc(nygrid*sizeof(int*)); | |
for (i = 0; i < nygrid; i++) { | |
dummymask[i] = malloc(ncolumns*sizeof(int)); | |
for (j = 0; j < ncolumns; j++) dummymask[i][j] = 1; | |
} | |
for (i = 0; i < nrows; i++) { | |
int ixbest = 0; | |
int iybest = 0; | |
double closest = metric(ndata, data, celldata[ixbest], mask, | |
dummymask, weights, i, iybest, transpose); | |
int ix, iy; | |
for (ix = 0; ix < nxgrid; ix++) { | |
for (iy = 0; iy < nygrid; iy++) { | |
double distance = metric(ndata, data, celldata[ix], mask, | |
dummymask, weights, i, iy, | |
transpose); | |
if (distance < closest) { | |
ixbest = ix; | |
iybest = iy; | |
closest = distance; | |
} | |
} | |
} | |
clusterid[i][0] = ixbest; | |
clusterid[i][1] = iybest; | |
} | |
for (i = 0; i < nygrid; i++) free(dummymask[i]); | |
free(dummymask); | |
} | |
else { | |
double** celldatavector = malloc(ndata*sizeof(double*)); | |
int** dummymask = malloc(nrows*sizeof(int*)); | |
int ixbest = 0; | |
int iybest = 0; | |
for (i = 0; i < nrows; i++) { | |
dummymask[i] = malloc(sizeof(int)); | |
dummymask[i][0] = 1; | |
} | |
for (i = 0; i < ncolumns; i++) { | |
double closest; | |
int ix, iy; | |
for (j = 0; j < ndata; j++) | |
celldatavector[j] = &(celldata[ixbest][iybest][j]); | |
closest = metric(ndata, data, celldatavector, mask, dummymask, | |
weights, i, 0, transpose); | |
for (ix = 0; ix < nxgrid; ix++) { | |
for (iy = 0; iy < nygrid; iy++) { | |
double distance; | |
for (j = 0; j < ndata; j++) | |
celldatavector[j] = &(celldata[ix][iy][j]); | |
distance = metric(ndata, data, celldatavector, mask, | |
dummymask, weights, i, 0, transpose); | |
if (distance < closest) { | |
ixbest = ix; | |
iybest = iy; | |
closest = distance; | |
} | |
} | |
} | |
clusterid[i][0] = ixbest; | |
clusterid[i][1] = iybest; | |
} | |
free(celldatavector); | |
for (i = 0; i < nrows; i++) free(dummymask[i]); | |
free(dummymask); | |
} | |
} | |
/* ******************************************************************* */ | |
void | |
somcluster(int nrows, int ncolumns, double** data, int** mask, | |
const double weight[], int transpose, int nxgrid, int nygrid, | |
double inittau, int niter, char dist, double*** celldata, | |
int clusterid[][2]) | |
/* | |
Purpose | |
======= | |
The somcluster routine implements a self-organizing map (Kohonen) on a | |
rectangular grid, using a given set of vectors. The distance measure to be | |
used to find the similarity between genes and nodes is given by dist. | |
Arguments | |
========= | |
nrows (input) int | |
The number of rows in the data matrix, equal to the number of genes. | |
ncolumns (input) int | |
The number of columns in the data matrix, equal to the number of samples. | |
data (input) double[nrows][ncolumns] | |
The array containing the gene expression data. | |
mask (input) int[nrows][ncolumns] | |
This array shows which data values are missing. If | |
mask[i][j] == 0, then data[i][j] is missing. | |
weight (input) double[ncolumns] if transpose == 0; | |
double[nrows] otherwise | |
The weights that are used to calculate the distance. This is equivalent | |
to including the jth data point weight[j] times in the calculation. The | |
weights can be non-integer. | |
transpose (input) int | |
If transpose == 0, the rows (genes) of the matrix are clustered. Otherwise, | |
columns (samples) of the matrix are clustered. | |
nxgrid (input) int | |
The number of grid cells horizontally in the rectangular topology of clusters. | |
nygrid (input) int | |
The number of grid cells horizontally in the rectangular topology of clusters. | |
inittau (input) double | |
The initial value of tau, representing the neighborhood function. | |
niter (input) int | |
The number of iterations to be performed. | |
dist (input) char | |
Defines which distance measure is used, as given by the table: | |
dist == 'e': Euclidean distance | |
dist == 'b': City-block distance | |
dist == 'c': correlation | |
dist == 'a': absolute value of the correlation | |
dist == 'u': uncentered correlation | |
dist == 'x': absolute uncentered correlation | |
dist == 's': Spearman's rank correlation | |
dist == 'k': Kendall's tau | |
For other values of dist, the default (Euclidean distance) is used. | |
celldata (output) double[nxgrid][nygrid][ncolumns] if transpose == 0; | |
double[nxgrid][nygrid][nrows] otherwise | |
The gene expression data for each node (cell) in the 2D grid. This can be | |
interpreted as the centroid for the cluster corresponding to that cell. If | |
celldata is NULL, then the centroids are not returned. If celldata is not | |
NULL, enough space should be allocated to store the centroid data before | |
calling somcluster. | |
clusterid (output) int[nrows][2] if transpose == 0; | |
int[ncolumns][2] otherwise | |
For each item (gene or microarray) that is clustered, the coordinates of the | |
cell in the 2D grid to which the item was assigned. If clusterid is NULL, the | |
cluster assignments are not returned. If clusterid is not NULL, enough memory | |
should be allocated to store the clustering information before calling | |
somcluster. | |
======================================================================== | |
*/ | |
{ | |
const int nobjects = (transpose == 0) ? nrows : ncolumns; | |
const int ndata = (transpose == 0) ? ncolumns : nrows; | |
int i, j; | |
const int lcelldata = (celldata == NULL) ? 0 : 1; | |
if (nobjects < 2) return; | |
if (lcelldata == 0) { | |
celldata = malloc(nxgrid*nygrid*ndata*sizeof(double**)); | |
for (i = 0; i < nxgrid; i++) { | |
celldata[i] = malloc(nygrid*ndata*sizeof(double*)); | |
for (j = 0; j < nygrid; j++) | |
celldata[i][j] = malloc(ndata*sizeof(double)); | |
} | |
} | |
somworker(nrows, ncolumns, data, mask, weight, transpose, nxgrid, nygrid, | |
inittau, celldata, niter, dist); | |
if (clusterid) | |
somassign(nrows, ncolumns, data, mask, weight, transpose, | |
nxgrid, nygrid, celldata, dist, clusterid); | |
if (lcelldata == 0) { | |
for (i = 0; i < nxgrid; i++) | |
for (j = 0; j < nygrid; j++) | |
free(celldata[i][j]); | |
for (i = 0; i < nxgrid; i++) | |
free(celldata[i]); | |
free(celldata); | |
} | |
} | |
/* ******************************************************************** */ | |
double | |
clusterdistance(int nrows, int ncolumns, double** data, int** mask, | |
double weight[], int n1, int n2, int index1[], int index2[], | |
char dist, char method, int transpose) | |
/* | |
Purpose | |
======= | |
The clusterdistance routine calculates the distance between two clusters | |
containing genes or samples using the measured gene expression vectors. The | |
distance between clusters, given the genes/samples in each cluster, can be | |
defined in several ways. Several distance measures can be used. | |
The routine returns the distance in double precision. | |
If the parameter transpose is set to a nonzero value, the clusters are | |
interpreted as clusters of samples, otherwise as clusters of gene. | |
Arguments | |
========= | |
nrows (input) int | |
The number of rows (i.e., the number of genes) in the gene expression data | |
matrix. | |
ncolumns (input) int | |
The number of columns (i.e., the number of samples) in the gene expression | |
data matrix. | |
data (input) double[nrows][ncolumns] | |
The array containing the data of the vectors. | |
mask (input) int[nrows][ncolumns] | |
This array shows which data values are missing. If mask[i][j] == 0, then | |
data[i][j] is missing. | |
weight (input) double[ncolumns] if transpose == 0; | |
double[nrows] otherwise | |
The weights that are used to calculate the distance. This is equivalent | |
to including the jth data point weight[j] times in the calculation. The | |
weights can be non-integer. | |
n1 (input) int | |
The number of elements in the first cluster. | |
n2 (input) int | |
The number of elements in the second cluster. | |
index1 (input) int[n1] | |
Identifies which genes/samples belong to the first cluster. | |
index2 (input) int[n2] | |
Identifies which genes/samples belong to the second cluster. | |
dist (input) char | |
Defines which distance measure is used, as given by the table: | |
dist == 'e': Euclidean distance | |
dist == 'b': City-block distance | |
dist == 'c': correlation | |
dist == 'a': absolute value of the correlation | |
dist == 'u': uncentered correlation | |
dist == 'x': absolute uncentered correlation | |
dist == 's': Spearman's rank correlation | |
dist == 'k': Kendall's tau | |
For other values of dist, the default (Euclidean distance) is used. | |
method (input) char | |
Defines how the distance between two clusters is defined, given which genes | |
belong to which cluster: | |
method == 'a': the distance between the arithmetic means of the two clusters | |
method == 'm': the distance between the medians of the two clusters | |
method == 's': the smallest pairwise distance between members of the two | |
clusters | |
method == 'x': the largest pairwise distance between members of the two | |
clusters | |
method == 'v': average of the pairwise distances between members of the two | |
clusters | |
transpose (input) int | |
If transpose is equal to zero, the distances between the rows is | |
calculated. Otherwise, the distances between the columns is calculated. | |
The former is needed when genes are being clustered; the latter is used | |
when samples are being clustered. | |
======================================================================== | |
*/ | |
{ | |
/* Set the metric function as indicated by dist */ | |
double (*metric) (int, double**, double**, int**, int**, | |
const double[], int, int, int) = setmetric(dist); | |
/* if one or both clusters are empty, return */ | |
if (n1 < 1 || n2 < 1) return -1.0; | |
/* Check the indices */ | |
if (transpose == 0) { | |
int i; | |
for (i = 0; i < n1; i++) { | |
int index = index1[i]; | |
if (index < 0 || index >= nrows) return -1.0; | |
} | |
for (i = 0; i < n2; i++) { | |
int index = index2[i]; | |
if (index < 0 || index >= nrows) return -1.0; | |
} | |
} | |
else { | |
int i; | |
for (i = 0; i < n1; i++) { | |
int index = index1[i]; | |
if (index < 0 || index >= ncolumns) return -1.0; | |
} | |
for (i = 0; i < n2; i++) { | |
int index = index2[i]; | |
if (index < 0 || index >= ncolumns) return -1.0; | |
} | |
} | |
switch (method) { | |
case 'a': { | |
/* Find the center */ | |
int i, j, k; | |
if (transpose == 0) { | |
double distance; | |
double* cdata[2]; | |
int* cmask[2]; | |
int* count[2]; | |
count[0] = calloc(ncolumns, sizeof(int)); | |
count[1] = calloc(ncolumns, sizeof(int)); | |
cdata[0] = calloc(ncolumns, sizeof(double)); | |
cdata[1] = calloc(ncolumns, sizeof(double)); | |
cmask[0] = malloc(ncolumns*sizeof(int)); | |
cmask[1] = malloc(ncolumns*sizeof(int)); | |
for (i = 0; i < n1; i++) { | |
k = index1[i]; | |
for (j = 0; j < ncolumns; j++) | |
if (mask[k][j] != 0) { | |
cdata[0][j] = cdata[0][j] + data[k][j]; | |
count[0][j] = count[0][j] + 1; | |
} | |
} | |
for (i = 0; i < n2; i++) { | |
k = index2[i]; | |
for (j = 0; j < ncolumns; j++) | |
if (mask[k][j] != 0) { | |
cdata[1][j] = cdata[1][j] + data[k][j]; | |
count[1][j] = count[1][j] + 1; | |
} | |
} | |
for (i = 0; i < 2; i++) | |
for (j = 0; j < ncolumns; j++) { | |
if (count[i][j]>0) { | |
cdata[i][j] = cdata[i][j] / count[i][j]; | |
cmask[i][j] = 1; | |
} | |
else | |
cmask[i][j] = 0; | |
} | |
distance = metric(ncolumns, cdata, cdata, cmask, cmask, weight, | |
0, 1, 0); | |
for (i = 0; i < 2; i++) { | |
free(cdata[i]); | |
free(cmask[i]); | |
free(count[i]); | |
} | |
return distance; | |
} | |
else { | |
double distance; | |
int** count = malloc(nrows*sizeof(int*)); | |
double** cdata = malloc(nrows*sizeof(double*)); | |
int** cmask = malloc(nrows*sizeof(int*)); | |
for (i = 0; i < nrows; i++) { | |
count[i] = calloc(2, sizeof(int)); | |
cdata[i] = calloc(2, sizeof(double)); | |
cmask[i] = malloc(2*sizeof(int)); | |
} | |
for (i = 0; i < n1; i++) { | |
k = index1[i]; | |
for (j = 0; j < nrows; j++) { | |
if (mask[j][k] != 0) { | |
cdata[j][0] += data[j][k]; | |
count[j][0]++; | |
} | |
} | |
} | |
for (i = 0; i < n2; i++) { | |
k = index2[i]; | |
for (j = 0; j < nrows; j++) { | |
if (mask[j][k] != 0) { | |
cdata[j][1] += data[j][k]; | |
count[j][1]++; | |
} | |
} | |
} | |
for (i = 0; i < nrows; i++) | |
for (j = 0; j < 2; j++) | |
if (count[i][j]>0) { | |
cdata[i][j] /= count[i][j]; | |
cmask[i][j] = 1; | |
} | |
else | |
cmask[i][j] = 0; | |
distance = metric(nrows, cdata, cdata, cmask, cmask, weight, | |
0, 1, 1); | |
for (i = 0; i < nrows; i++) { | |
free(count[i]); | |
free(cdata[i]); | |
free(cmask[i]); | |
} | |
free(count); | |
free(cdata); | |
free(cmask); | |
return distance; | |
} | |
} | |
case 'm': { | |
int i, j, k; | |
if (transpose == 0) { | |
double distance; | |
double* temp = malloc(nrows*sizeof(double)); | |
double* cdata[2]; | |
int* cmask[2]; | |
for (i = 0; i < 2; i++) { | |
cdata[i] = malloc(ncolumns*sizeof(double)); | |
cmask[i] = malloc(ncolumns*sizeof(int)); | |
} | |
for (j = 0; j < ncolumns; j++) { | |
int count = 0; | |
for (k = 0; k < n1; k++) { | |
i = index1[k]; | |
if (mask[i][j]) { | |
temp[count] = data[i][j]; | |
count++; | |
} | |
} | |
if (count>0) { | |
cdata[0][j] = median(count, temp); | |
cmask[0][j] = 1; | |
} | |
else { | |
cdata[0][j] = 0.; | |
cmask[0][j] = 0; | |
} | |
} | |
for (j = 0; j < ncolumns; j++) { | |
int count = 0; | |
for (k = 0; k < n2; k++) { | |
i = index2[k]; | |
if (mask[i][j]) { | |
temp[count] = data[i][j]; | |
count++; | |
} | |
} | |
if (count>0) { | |
cdata[1][j] = median(count, temp); | |
cmask[1][j] = 1; | |
} | |
else { | |
cdata[1][j] = 0.; | |
cmask[1][j] = 0; | |
} | |
} | |
distance = metric(ncolumns, cdata, cdata, cmask, cmask, weight, | |
0, 1, 0); | |
for (i = 0; i < 2; i++) { | |
free(cdata[i]); | |
free(cmask[i]); | |
} | |
free(temp); | |
return distance; | |
} | |
else { | |
double distance; | |
double* temp = malloc(ncolumns*sizeof(double)); | |
double** cdata = malloc(nrows*sizeof(double*)); | |
int** cmask = malloc(nrows*sizeof(int*)); | |
for (i = 0; i < nrows; i++) { | |
cdata[i] = malloc(2*sizeof(double)); | |
cmask[i] = malloc(2*sizeof(int)); | |
} | |
for (j = 0; j < nrows; j++) { | |
int count = 0; | |
for (k = 0; k < n1; k++) { | |
i = index1[k]; | |
if (mask[j][i]) { | |
temp[count] = data[j][i]; | |
count++; | |
} | |
} | |
if (count>0) { | |
cdata[j][0] = median(count, temp); | |
cmask[j][0] = 1; | |
} | |
else { | |
cdata[j][0] = 0.; | |
cmask[j][0] = 0; | |
} | |
} | |
for (j = 0; j < nrows; j++) { | |
int count = 0; | |
for (k = 0; k < n2; k++) { | |
i = index2[k]; | |
if (mask[j][i]) { | |
temp[count] = data[j][i]; | |
count++; | |
} | |
} | |
if (count>0) { | |
cdata[j][1] = median(count, temp); | |
cmask[j][1] = 1; | |
} | |
else { | |
cdata[j][1] = 0.; | |
cmask[j][1] = 0; | |
} | |
} | |
distance = metric(nrows, cdata, cdata, cmask, cmask, weight, | |
0, 1, 1); | |
for (i = 0; i < nrows; i++) { | |
free(cdata[i]); | |
free(cmask[i]); | |
} | |
free(cdata); | |
free(cmask); | |
free(temp); | |
return distance; | |
} | |
} | |
case 's': { | |
int i1, i2, j1, j2; | |
const int n = (transpose == 0) ? ncolumns : nrows; | |
double mindistance = DBL_MAX; | |
for (i1 = 0; i1 < n1; i1++) | |
for (i2 = 0; i2 < n2; i2++) { | |
double distance; | |
j1 = index1[i1]; | |
j2 = index2[i2]; | |
distance = metric(n, data, data, mask, mask, weight, | |
j1, j2, transpose); | |
if (distance < mindistance) mindistance = distance; | |
} | |
return mindistance; | |
} | |
case 'x': { | |
int i1, i2, j1, j2; | |
const int n = (transpose == 0) ? ncolumns : nrows; | |
double maxdistance = 0; | |
for (i1 = 0; i1 < n1; i1++) | |
for (i2 = 0; i2 < n2; i2++) { | |
double distance; | |
j1 = index1[i1]; | |
j2 = index2[i2]; | |
distance = metric(n, data, data, mask, mask, weight, | |
j1, j2, transpose); | |
if (distance > maxdistance) maxdistance = distance; | |
} | |
return maxdistance; | |
} | |
case 'v': { | |
int i1, i2, j1, j2; | |
const int n = (transpose == 0) ? ncolumns : nrows; | |
double distance = 0; | |
for (i1 = 0; i1 < n1; i1++) | |
for (i2 = 0; i2 < n2; i2++) { | |
j1 = index1[i1]; | |
j2 = index2[i2]; | |
distance += metric(n, data, data, mask, mask, weight, | |
j1, j2, transpose); | |
} | |
distance /= (n1*n2); | |
return distance; | |
} | |
} | |
/* Never get here */ | |
return -2.0; | |
} | |