Spaces:
No application file
No application file
// | |
// Original Code | |
// Copyright (C) Jason Vertrees | |
// Modifications | |
// Copyright (C) Joao Rodrigues. | |
// Modifications include removal of RMSD calculation code and associated | |
// dependencies. Output of the module is now the best paths. | |
// | |
// All rights reserved. | |
// Redistribution and use in source and binary forms, with or without | |
// modification, are permitted provided that the following conditions are | |
// met: | |
// | |
// * Redistributions of source code must retain the above copyright | |
// notice, this list of conditions and the following disclaimer. | |
// | |
// * Redistributions in binary form must reproduce the above copyright | |
// notice, this list of conditions and the following disclaimer in | |
// the documentation and/or other materials provided with the | |
// distribution. | |
// | |
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS | |
// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED | |
// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A | |
// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER | |
// OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | |
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | |
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | |
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF | |
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | |
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | |
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
// The following notice is provided since the code was adapted from | |
// open-source Pymol. | |
// Open-Source PyMOL Copyright Notice | |
// ================================== | |
// The Open-Source PyMOL source code is copyrighted, but you can freely | |
// use and copy it as long as you don't change or remove any of the | |
// Copyright notices. The Open-Source PyMOL product is made available | |
// under the following open-source license terms: | |
// ---------------------------------------------------------------------- | |
// Open-Source PyMOL is Copyright (C) Schrodinger, LLC. | |
// All Rights Reserved | |
// Permission to use, copy, modify, distribute, and distribute modified | |
// versions of this software and its built-in documentation for any | |
// purpose and without fee is hereby granted, provided that the above | |
// copyright notice appears in all copies and that both the copyright | |
// notice and this permission notice appear in supporting documentation, | |
// and that the name of Schrodinger, LLC not be used in advertising or | |
// publicity pertaining to distribution of the software without specific, | |
// written prior permission. | |
// SCHRODINGER, LLC DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, | |
// INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN | |
// NO EVENT SHALL SCHRODINGER, LLC BE LIABLE FOR ANY SPECIAL, INDIRECT OR | |
// CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS | |
// OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE | |
// OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE | |
// USE OR PERFORMANCE OF THIS SOFTWARE. | |
// ---------------------------------------------------------------------- | |
// PyMOL Trademark Notice | |
// ====================== | |
// PyMOL(TM) is a trademark of Schrodinger, LLC. Derivative | |
// software which contains PyMOL source code must be plainly | |
// distinguished from any and all PyMOL products distributed by Schrodinger, | |
// LLC in all publicity, advertising, and documentation. | |
// The slogans, "Includes PyMOL(TM).", "Based on PyMOL(TM) technology.", | |
// "Contains PyMOL(TM) source code.", and "Built using PyMOL(TM).", may | |
// be used in advertising, publicity, and documentation of derivative | |
// software provided that the notice, "PyMOL is a trademark of Schrodinger, | |
// LLC.", is included in a footnote or at the end of the | |
// document. | |
// All other endorsements employing the PyMOL trademark require specific, | |
// written prior permission. | |
// | |
// Typical XYZ point and array of points.
//
// cePoint  - one point in 3D space.
// pcePoint - pointer to a cePoint; used both for a single point and for the
//            first element of an array of points.
typedef struct {
    double x; // X coordinate
    double y; // Y coordinate
    double z; // Z coordinate
} cePoint, *pcePoint;
// An AFP (aligned fragment pair), and list/pointer.
//
// afp       - one pair of start indices: `first` indexes structure A,
//             `second` indexes structure B.
// path      - pointer to an afp; used as an array of afp entries.
// pathCache - pointer to a path; used as an array of paths.
typedef struct {
    int first;  // index into structure A
    int second; // index into structure B
} afp, *path, **pathCache;
// Calculate Distance Matrix | |
double ** | |
calcDM(pcePoint coords, int len) { | |
double **dm = (double **)malloc(sizeof(double *) * len); | |
for (int i = 0; i < len; i++) | |
dm[i] = (double *)malloc(sizeof(double) * len); | |
for (int row = 0; row < len; row++) { | |
for (int col = row; col < len; col++) { | |
double xd = coords[row].x - coords[col].x; | |
double yd = coords[row].y - coords[col].y; | |
double zd = coords[row].z - coords[col].z; | |
double distsq = (xd * xd) + (yd * yd) + (zd * zd); | |
dm[row][col] = dm[col][row] = sqrt(distsq); | |
} | |
} | |
return dm; | |
} | |
// Calculate Score Matrix
//
// This is where the magic of CE comes out. For each i and j, S[i][j] says how
// well the window of wSize residues starting at i in protein A matches the
// window of wSize residues starting at j in protein B. A value of 0 means
// absolute match; a value >> 1 means bad match. Cells whose window would run
// past the end of either protein are set to -1.0.
//
// Parameters:
//   d1    - intra-structure distance matrix of protein A (lenA x lenA).
//   d2    - intra-structure distance matrix of protein B (lenB x lenB).
//   lenA  - number of residues in A (rows of S).
//   lenB  - number of residues in B (columns of S).
//   wSize - window (fragment) length.
//
// Returns a newly allocated lenA x lenB matrix (array of row pointers) owned
// by the caller, or NULL if an allocation fails (nothing is leaked).
//
// Note: the score is the mean over (wSize-1)*(wSize-2)/2 distance pairs. For
// wSize of 1 or 2 there are no such pairs and the division produces a
// non-finite value; callers are expected to use wSize >= 3.
double **
calcS(double **d1, double **d2, int lenA, int lenB, int wSize) {
    const double winSize = (double)wSize;
    // Number of (row, col) pairs summed per window, see the loops below.
    const double sumSize = (winSize - 1.0) * (winSize - 2.0) / 2.0;
    // Last start index at which a full window still fits.
    const int lenA_m_wSize = lenA - wSize;
    const int lenB_m_wSize = lenB - wSize;

    // Request at least one element so NULL always means allocation failure.
    size_t rows = (lenA > 0) ? (size_t)lenA : 1;
    size_t cols = (lenB > 0) ? (size_t)lenB : 1;

    // initialize the 2D similarity matrix
    double **S = (double **)malloc(sizeof(double *) * rows);
    if (!S)
        return NULL;
    for (int i = 0; i < lenA; i++) {
        S[i] = (double *)malloc(sizeof(double) * cols);
        if (!S[i]) {
            while (i-- > 0)
                free(S[i]);
            free(S);
            return NULL;
        }
    }

    for (int iA = 0; iA < lenA; iA++) {
        for (int iB = 0; iB < lenB; iB++) {
            S[iA][iB] = -1.0;
            if (iA > lenA_m_wSize || iB > lenB_m_wSize)
                continue;

            double score = 0.0;
            //
            // We always skip the distance from a residue to its immediate
            // neighbour (col starts at row + 2). This is a time-saving
            // heuristic: almost all alpha carbons of neighbouring residues
            // are 3.8 Angstroms apart, so that term carries no information.
            //
            for (int row = 0; row < wSize - 2; row++) {
                for (int col = row + 2; col < wSize; col++) {
                    score +=
                        fabs(d1[iA + row][iA + col] - d2[iB + row][iB + col]);
                }
            }
            S[iA][iB] = score / sumSize;
        }
    }
    return S;
}
pcePoint | |
getCoords(PyObject *L, int length) { | |
// make space for the current coords | |
pcePoint coords = (pcePoint)malloc(sizeof(cePoint) * length); | |
if (!coords) | |
return NULL; | |
// loop through the arguments, pulling out the | |
// XYZ coordinates. | |
for (int i = 0; i < length; i++) { | |
PyObject *curCoord = PyList_GetItem(L, i); | |
Py_INCREF(curCoord); | |
PyObject *curVal = PyList_GetItem(curCoord, 0); | |
Py_INCREF(curVal); | |
coords[i].x = PyFloat_AsDouble(curVal); | |
Py_DECREF(curVal); | |
curVal = PyList_GetItem(curCoord, 1); | |
Py_INCREF(curVal); | |
coords[i].y = PyFloat_AsDouble(curVal); | |
Py_DECREF(curVal); | |
curVal = PyList_GetItem(curCoord, 2); | |
Py_INCREF(curVal); | |
coords[i].z = PyFloat_AsDouble(curVal); | |
Py_DECREF(curVal); | |
Py_DECREF(curCoord); | |
} | |
return coords; | |
} | |
// Find the best N alignment paths | |
PyObject * | |
findPath(double **S, double **dA, double **dB, int lenA, int lenB, | |
int winSize, int gapMax) { | |
const double D0 = 3.0; | |
const double D1 = 4.0; | |
int i, j; | |
// Score of the best Path | |
double bestPathScore = 1e6; | |
int bestPathLength = 0; | |
// Length of longest possible alignment | |
int smaller = (lenA < lenB) ? lenA : lenB; | |
int winSum = (winSize - 1) * (winSize - 2) / 2; | |
path bestPath = (path)malloc(sizeof(afp) * smaller); | |
for (i = 0; i < smaller; i++) { | |
bestPath[i].first = -1; | |
bestPath[i].second = -1; | |
} | |
//====================================================================== | |
// for storing the best N paths | |
int bufferIndex = 0; | |
int bufferSize = 0; | |
int lenBuffer[MAX_PATHS]; | |
double scoreBuffer[MAX_PATHS]; | |
pathCache pathBuffer = (pathCache)malloc(sizeof(path *) * MAX_PATHS); | |
for (i = 0; i < MAX_PATHS; i++) { | |
// initialize the paths | |
scoreBuffer[i] = 1e6; | |
lenBuffer[i] = 0; | |
pathBuffer[i] = 0; | |
} | |
// winCache | |
// this array stores a list of residues seen. We use it to calculate the | |
// total score of a path from 1..M and then add it to M+1..N. | |
int *winCache = (int *)malloc(sizeof(int) * smaller); | |
for (i = 0; i < smaller; i++) | |
winCache[i] = (i + 1) * i * winSize / 2 + (i + 1) * winSum; | |
// allScoreBuffer | |
// this 2D array keeps track of all partial gapped scores | |
double **allScoreBuffer = (double **)malloc(sizeof(double *) * smaller); | |
for (i = 0; i < smaller; i++) { | |
allScoreBuffer[i] = | |
(double *)malloc((gapMax * 2 + 1) * sizeof(double)); | |
// initialize the ASB | |
for (j = 0; j < gapMax * 2 + 1; j++) | |
allScoreBuffer[i][j] = 1e6; | |
} | |
int *tIndex = (int *)malloc(sizeof(int) * smaller); | |
int gapBestIndex = -1; | |
//====================================================================== | |
// Start the search through the CE matrix. | |
// | |
int iA, iB; | |
for (iA = 0; iA < lenA; iA++) { | |
if (iA > lenA - winSize * (bestPathLength - 1)) | |
break; | |
for (iB = 0; iB < lenB; iB++) { | |
if (S[iA][iB] >= D0) | |
continue; | |
if (S[iA][iB] == -1.0) | |
continue; | |
if (iB > lenB - winSize * (bestPathLength - 1)) | |
break; | |
// | |
// Restart curPath here. | |
// | |
path curPath = (path)malloc(sizeof(afp) * smaller); | |
for (i = 0; i < smaller; i++) { | |
curPath[i].first = -1; | |
curPath[i].second = -1; | |
} | |
curPath[0].first = iA; | |
curPath[0].second = iB; | |
int curPathLength = 1; | |
tIndex[curPathLength - 1] = 0; | |
double curTotalScore = 0.0; | |
// | |
// Check all possible paths starting from iA, iB | |
// | |
int done = 0; | |
while (!done) { | |
double gapBestScore = 1e6; | |
gapBestIndex = -1; | |
// | |
// Check all possible gaps [1..gapMax] from here | |
// | |
for (int g = 0; g < (gapMax * 2) + 1; g++) { | |
int jA = curPath[curPathLength - 1].first + winSize; | |
int jB = curPath[curPathLength - 1].second + winSize; | |
if ((g + 1) % 2 == 0) { | |
jA += (g + 1) / 2; | |
} else { // ( g odd ) | |
jB += (g + 1) / 2; | |
} | |
// | |
// Following are three heuristics to ensure high quality | |
// long paths and make sure we don't run over the end of | |
// the S, matrix. | |
// 1st: If jA and jB are at the end of the matrix | |
if (jA > lenA - winSize || jB > lenB - winSize) { | |
// FIXME, was: jA > lenA-winSize-1 || jB > | |
// lenB-winSize-1 | |
continue; | |
} | |
// 2nd: If this gapped octapeptide is bad, ignore it. | |
if (S[jA][jB] > D0) | |
continue; | |
// 3rd: if too close to end, ignore it. | |
if (S[jA][jB] == -1.0) | |
continue; | |
double curScore = 0.0; | |
for (int s = 0; s < curPathLength; s++) { | |
curScore += fabs(dA[curPath[s].first][jA] - | |
dB[curPath[s].second][jB]); | |
curScore += fabs(dA[curPath[s].first + (winSize - 1)] | |
[jA + (winSize - 1)] - | |
dB[curPath[s].second + (winSize - 1)] | |
[jB + (winSize - 1)]); | |
for (int k = 1; k < winSize - 1; k++) | |
curScore += fabs(dA[curPath[s].first + k] | |
[jA + (winSize - 1) - k] - | |
dB[curPath[s].second + k] | |
[jB + (winSize - 1) - k]); | |
} | |
curScore /= (double)winSize * (double)curPathLength; | |
if (curScore >= D1) { | |
continue; | |
} | |
// store GAPPED best | |
if (curScore < gapBestScore) { | |
curPath[curPathLength].first = jA; | |
curPath[curPathLength].second = jB; | |
gapBestScore = curScore; | |
gapBestIndex = g; | |
allScoreBuffer[curPathLength - 1][g] = curScore; | |
} | |
} /// ROF -- END GAP SEARCHING | |
// | |
// DONE GAPPING: | |
// | |
// calculate curTotalScore | |
curTotalScore = 0.0; | |
int jGap, gA, gB; | |
double score1 = 0.0, score2 = 0.0; | |
if (gapBestIndex != -1) { | |
jGap = (gapBestIndex + 1) / 2; | |
if ((gapBestIndex + 1) % 2 == 0) { | |
gA = curPath[curPathLength - 1].first + winSize + jGap; | |
gB = curPath[curPathLength - 1].second + winSize; | |
} else { | |
gA = curPath[curPathLength - 1].first + winSize; | |
gB = | |
curPath[curPathLength - 1].second + winSize + jGap; | |
} | |
// perfect | |
score1 = (allScoreBuffer[curPathLength - 1][gapBestIndex] * | |
winSize * curPathLength + | |
S[gA][gB] * winSum) / | |
(winSize * curPathLength + winSum); | |
// perfect | |
score2 = | |
((curPathLength > 1 | |
? (allScoreBuffer[curPathLength - 2] | |
[tIndex[curPathLength - 1]]) | |
: S[iA][iB]) * | |
winCache[curPathLength - 1] + | |
score1 * (winCache[curPathLength] - | |
winCache[curPathLength - 1])) / | |
winCache[curPathLength]; | |
curTotalScore = score2; | |
// heuristic -- path is getting sloppy, stop looking | |
if (curTotalScore > D1) { | |
done = 1; | |
gapBestIndex = -1; | |
break; | |
} else { | |
allScoreBuffer[curPathLength - 1][gapBestIndex] = | |
curTotalScore; | |
tIndex[curPathLength] = gapBestIndex; | |
curPathLength++; | |
} | |
} else { | |
// if here, then there was no good gapped path | |
// so quit and restart from iA, iB+1 | |
done = 1; | |
curPathLength--; | |
break; | |
} | |
// | |
// test this gapped path against the best seen | |
// starting from iA, iB | |
// | |
// if our currently best gapped path from iA and iB is LONGER | |
// than the current best; or, it's equal length and the score's | |
// better, keep the new path. | |
if (curPathLength > bestPathLength || | |
(curPathLength == bestPathLength && | |
curTotalScore < bestPathScore)) { | |
bestPathLength = curPathLength; | |
bestPathScore = curTotalScore; | |
// deep copy curPath | |
path tempPath = (path)malloc(sizeof(afp) * smaller); | |
for (int i = 0; i < smaller; i++) { | |
tempPath[i].first = curPath[i].first; | |
tempPath[i].second = curPath[i].second; | |
} | |
free(bestPath); | |
bestPath = tempPath; | |
} | |
} /// END WHILE | |
// | |
// At this point, we've found the best path starting at iA, iB. | |
// | |
if (bestPathLength > lenBuffer[bufferIndex] || | |
(bestPathLength == lenBuffer[bufferIndex] && | |
bestPathScore < scoreBuffer[bufferIndex])) { | |
// we're going to add an entry to the ring-buffer. | |
// Adjust maxSize values and curIndex accordingly. | |
bufferIndex = | |
(bufferIndex == MAX_PATHS - 1) ? 0 : bufferIndex + 1; | |
bufferSize = | |
(bufferSize < MAX_PATHS) ? (bufferSize) + 1 : MAX_PATHS; | |
path pathCopy = (path)malloc(sizeof(afp) * smaller); | |
int i; | |
for (i = 0; i < smaller; i++) { | |
pathCopy[i].first = bestPath[i].first; | |
pathCopy[i].second = bestPath[i].second; | |
} | |
if (bufferIndex == 0 && (bufferSize) == MAX_PATHS) { | |
if (pathBuffer[MAX_PATHS - 1]) | |
free(pathBuffer[MAX_PATHS - 1]); | |
pathBuffer[MAX_PATHS - 1] = pathCopy; | |
scoreBuffer[MAX_PATHS - 1] = bestPathScore; | |
lenBuffer[MAX_PATHS - 1] = bestPathLength; | |
} else { | |
if (pathBuffer[bufferIndex - 1]) | |
free(pathBuffer[bufferIndex - 1]); | |
pathBuffer[bufferIndex - 1] = pathCopy; | |
scoreBuffer[bufferIndex - 1] = bestPathScore; | |
lenBuffer[bufferIndex - 1] = bestPathLength; | |
} | |
} | |
free(curPath); | |
curPath = 0; | |
} // ROF -- end for iB | |
} // ROF -- end for iA | |
// To make it simpler to use this code and more portable, we are decoupling | |
// the path finding (the actual CEAlign innovation) from the RMSD | |
// calculation. | |
// | |
// As such, we return the N best paths to Python-land. Since the paths are | |
// encoded as structs, it's simpler to return the each path as a list of | |
// lists with the corresponding atom indices. e.g. [path1, path2, path3, | |
// ..., pathN], where pathN is defined as, | |
// [[Ai, Aj, Ak, ...], [Bi, Bj, Bk, ...], where An and Bn are equivalent | |
// coordinates for structures A and B. | |
PyObject *result = PyList_New(MAX_PATHS); // List to store all paths | |
Py_INCREF(result); | |
for (int o = 0; o < bufferSize; o++) { | |
// Make a new list to store this path | |
PyObject *pathAList = PyList_New(0); | |
PyObject *pathBList = PyList_New(0); | |
Py_INCREF(pathAList); | |
Py_INCREF(pathBList); | |
int j = 0; | |
int it = 0; | |
// Grab the current path | |
while (j < smaller) { | |
if (pathBuffer[o][j].first != -1) { | |
int idxA = pathBuffer[o][j].first; | |
int idxB = pathBuffer[o][j].second; | |
for (int k = 0; k < winSize; k++) { | |
PyObject *v = Py_BuildValue("i", idxA + k); | |
PyList_Append(pathAList, v); | |
Py_DECREF(v); | |
v = Py_BuildValue("i", idxB + k); | |
PyList_Append(pathBList, v); | |
Py_DECREF(v); | |
it++; | |
} | |
j++; | |
} else { | |
break; | |
} | |
} | |
PyObject *pairList = Py_BuildValue("[NN]", pathAList, pathBList); | |
Py_INCREF(pairList); | |
PyList_SET_ITEM(result, o, pairList); | |
} | |
// free memory | |
for (i = 0; i < smaller; i++) | |
free(allScoreBuffer[i]); | |
free(allScoreBuffer); | |
free(tIndex); | |
free(winCache); | |
free(bestPath); | |
free(pathBuffer); | |
return result; | |
} | |
// Frees a matrix stored as an array of `rows` row pointers. Accepts NULL.
static void
freeRowMatrix(double **m, int rows) {
    if (!m)
        return;
    for (int i = 0; i < rows; i++)
        free(m[i]);
    free(m);
}

// Main Function
//
// Python entry point. Expects (listA, listB[, windowSize[, gapMax]]):
//   listA, listB - Python lists of coordinates for structures A and B.
//   windowSize   - fragment length, default 8; must be >= 1.
//   gapMax       - maximum gap between fragment pairs, default 30.
//
// Builds the two distance matrices and the similarity matrix, runs the path
// search and returns its result (a list of paths). Two empty-handed cases:
//   - either input list is empty: returns an empty list;
//   - bad arguments or an allocation failure: returns NULL with a Python
//     exception set.
PyObject *
PyCealign(PyObject *Py_UNUSED(self), PyObject *args) {
    int windowSize = 8;
    int gapMax = 30;
    PyObject *listA = NULL, *listB = NULL, *result = NULL;
    pcePoint coordsA = NULL, coordsB = NULL;
    double **dmA = NULL, **dmB = NULL, **S = NULL;

    /* Unpack the arguments from Python; "O!" enforces that both are lists */
    if (!PyArg_ParseTuple(args, "O!O!|ii", &PyList_Type, &listA,
                          &PyList_Type, &listB, &windowSize, &gapMax))
        return NULL;

    // A non-positive window makes the path search index outside its
    // matrices, so reject it up front.
    if (windowSize < 1) {
        PyErr_SetString(PyExc_ValueError,
                        "windowSize must be a positive integer");
        return NULL;
    }

    /* Get the list lengths */
    const int lenA = (int)PyList_Size(listA);
    const int lenB = (int)PyList_Size(listB);

    // Nothing to align: no paths can exist.
    if (lenA == 0 || lenB == 0)
        return PyList_New(0);

    /* get the coodinates from the Python objects */
    coordsA = (pcePoint)getCoords(listA, lenA);
    if (!coordsA)
        goto fail;
    coordsB = (pcePoint)getCoords(listB, lenB);
    if (!coordsB)
        goto fail;

    /* calculate the distance matrix for each protein */
    dmA = (double **)calcDM(coordsA, lenA);
    if (!dmA)
        goto fail;
    dmB = (double **)calcDM(coordsB, lenB);
    if (!dmB)
        goto fail;

    /* calculate the CE Similarity matrix */
    S = (double **)calcS(dmA, dmB, lenA, lenB, windowSize);
    if (!S)
        goto fail;

    // Calculate Top N Paths
    result = (PyObject *)findPath(S, dmA, dmB, lenA, lenB, windowSize, gapMax);
    goto cleanup;

fail:
    // A helper reported failure; if it did not set an exception itself, the
    // cause was a failed allocation.
    if (!PyErr_Occurred())
        PyErr_NoMemory();

cleanup:
    /* release memory */
    free(coordsA);
    free(coordsB);
    /* distance matrices */
    freeRowMatrix(dmA, lenA);
    freeRowMatrix(dmB, lenB);
    /* similarity matrix */
    freeRowMatrix(S, lenA);

    return result;
}
// | |
// Python Interface | |
// | |
PyDoc_STRVAR(method_doc, | |
"run_cealign(coordsA, coordsB, windowSize, gapMax) -> list\ | |
\n\n\ | |
Find the optimal alignments between two structures, using CEAlign.\ | |
\n\n\ | |
Arguments:\n\ | |
- listA: List of lists with coordinates for structure A.\n\ | |
- listB: List of lists with coordinates for structure B.\n\ | |
- windowSize: Length of fragments to be used in alignment.\n\ | |
- gapMax: Maximum gap allowed between two aligned fragment pairs."); | |
static PyMethodDef CEAlignMethods[] = { | |
{"run_cealign", PyCealign, METH_VARARGS, method_doc}, | |
{NULL, NULL, 0, NULL}}; | |
PyDoc_STRVAR(module_doc, | |
"Pairwise structure alignment of 3D structures using combinatorial extension.\ | |
\n\n\ | |
This module implements a single function: run_cealign. \ | |
Refer to its docstring for more documentation on usage and implementation."); | |
PyObject *PyInit_ccealign(void) { | |
static struct PyModuleDef moduledef = {PyModuleDef_HEAD_INIT, | |
"ccealign", | |
module_doc, | |
-1, | |
CEAlignMethods, | |
NULL, | |
NULL, | |
NULL, | |
NULL}; | |
return PyModule_Create(&moduledef); | |
} | |