// aakash0017's picture
// Upload folder using huggingface_hub
// b7731cd
//
// Original Code
// Copyright (C) Jason Vertrees
// Modifications
// Copyright (C) Joao Rodrigues.
// Modifications include removal of RMSD calculation code and associated
// dependencies. Output of the module is now the best paths.
//
// All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in
// the documentation and/or other materials provided with the
// distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
// OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// The following notice is provided since the code was adapted from
// open-source Pymol.
// Open-Source PyMOL Copyright Notice
// ==================================
// The Open-Source PyMOL source code is copyrighted, but you can freely
// use and copy it as long as you don't change or remove any of the
// Copyright notices. The Open-Source PyMOL product is made available
// under the following open-source license terms:
// ----------------------------------------------------------------------
// Open-Source PyMOL is Copyright (C) Schrodinger, LLC.
// All Rights Reserved
// Permission to use, copy, modify, distribute, and distribute modified
// versions of this software and its built-in documentation for any
// purpose and without fee is hereby granted, provided that the above
// copyright notice appears in all copies and that both the copyright
// notice and this permission notice appear in supporting documentation,
// and that the name of Schrodinger, LLC not be used in advertising or
// publicity pertaining to distribution of the software without specific,
// written prior permission.
// SCHRODINGER, LLC DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
// INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN
// NO EVENT SHALL SCHRODINGER, LLC BE LIABLE FOR ANY SPECIAL, INDIRECT OR
// CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
// OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
// OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE
// USE OR PERFORMANCE OF THIS SOFTWARE.
// ----------------------------------------------------------------------
// PyMOL Trademark Notice
// ======================
// PyMOL(TM) is a trademark of Schrodinger, LLC. Derivative
// software which contains PyMOL source code must be plainly
// distinguished from any and all PyMOL products distributed by Schrodinger,
// LLC in all publicity, advertising, and documentation.
// The slogans, "Includes PyMOL(TM).", "Based on PyMOL(TM) technology.",
// "Contains PyMOL(TM) source code.", and "Built using PyMOL(TM).", may
// be used in advertising, publicity, and documentation of derivative
// software provided that the notice, "PyMOL is a trademark of Schrodinger,
// LLC.", is included in a footnote or at the end of the
// document.
// All other endorsements employing the PyMOL trademark require specific,
// written prior permission.
//
#include "Python.h"
// Number of slots in the ring buffer of best paths that findPath keeps
// (its pathBuffer, scoreBuffer and lenBuffer arrays all have this size).
#define MAX_PATHS 20
// Cartesian coordinates of one point. pcePoint is the pointer form, used
// both for a single point and for an array of points.
typedef struct {
    double x, y, z;
} cePoint, *pcePoint;
// One aligned fragment pair (AFP): `first` is a position in structure A and
// `second` the position paired with it in structure B. `path` points to an
// array of AFPs and `pathCache` to an array of paths.
typedef struct {
    int first, second;
} afp, *path, **pathCache;
// Calculate Distance Matrix
//
// Builds the symmetric len x len matrix of Euclidean distances between
// every pair of points in `coords`.
//
// Parameters:
//   coords  array of at least `len` points
//   len     number of points to use
//
// Returns a newly allocated array of `len` row pointers, each row holding
// `len` doubles. The caller owns the matrix and must free every row and
// then the row array itself. Returns NULL when an allocation fails; rows
// that were already allocated are released first.
double **
calcDM(pcePoint coords, int len) {
    // Always request at least one element so that an empty input cannot be
    // mistaken for an allocation failure where malloc(0) returns NULL.
    size_t slots = (size_t)(len > 0 ? len : 1);

    double **dm = (double **)malloc(sizeof(double *) * slots);
    if (!dm)
        return NULL;

    for (int i = 0; i < len; i++) {
        dm[i] = (double *)malloc(sizeof(double) * slots);
        if (!dm[i]) {
            // Undo the rows allocated so far before reporting failure.
            while (i-- > 0)
                free(dm[i]);
            free(dm);
            return NULL;
        }
    }

    // Only the upper triangle is computed; each value is mirrored into the
    // lower triangle.
    for (int row = 0; row < len; row++) {
        for (int col = row; col < len; col++) {
            double xd = coords[row].x - coords[col].x;
            double yd = coords[row].y - coords[col].y;
            double zd = coords[row].z - coords[col].z;
            double distsq = (xd * xd) + (yd * yd) + (zd * zd);
            dm[row][col] = dm[col][row] = sqrt(distsq);
        }
    }
    return dm;
}
// Calculate Score Matrix
//
// For every pair (iA, iB), compares the window of `wSize` residues starting
// at iA in structure A with the window starting at iB in structure B, using
// the intra-structure distance matrices d1 (A) and d2 (B).
//
// Parameters:
//   d1, d2      distance matrices of structures A and B
//   lenA, lenB  number of residues in A and B
//   wSize       window (fragment) length
//
// Returns a newly allocated lenA x lenB matrix owned by the caller (free
// each row, then the row array). A cell holds the mean absolute distance
// difference of the two windows (0 means a perfect match), or -1.0 when a
// full window does not fit at iA or iB. Returns NULL when an allocation
// fails; rows that were already allocated are released first.
//
// Note: for wSize of 1 or 2 no distance pairs are compared and sumSize is
// zero, so the cells that are computed hold the result of 0.0 / 0.0.
double **
calcS(double **d1, double **d2, int lenA, int lenB, int wSize) {
    double winSize = (double)wSize;
    // Number of (row, col) pairs visited inside one window, see below.
    double sumSize = (winSize - 1.0) * (winSize - 2.0) / 2.0;

    // Always request at least one element so that an empty input cannot be
    // mistaken for an allocation failure where malloc(0) returns NULL.
    size_t rowSlots = (size_t)(lenA > 0 ? lenA : 1);
    size_t colSlots = (size_t)(lenB > 0 ? lenB : 1);

    // initialize the 2D similarity matrix
    double **S = (double **)malloc(sizeof(double *) * rowSlots);
    if (!S)
        return NULL;
    for (int i = 0; i < lenA; i++) {
        S[i] = (double *)malloc(sizeof(double) * colSlots);
        if (!S[i]) {
            // Undo the rows allocated so far before reporting failure.
            while (i-- > 0)
                free(S[i]);
            free(S);
            return NULL;
        }
    }

    //
    // This is where the magic of CE comes out. In the similarity matrix,
    // for each i and j, the value of S[i][j] is how well the residues
    // i - i+winSize in protein A match residues j - j+winSize in protein
    // B. A value of 0 means absolute match; a value >> 1 means bad match.
    //
    int lenA_m_wSize = lenA - wSize;
    int lenB_m_wSize = lenB - wSize;

    for (int iA = 0; iA < lenA; iA++) {
        for (int iB = 0; iB < lenB; iB++) {
            S[iA][iB] = -1.0;
            // A full window does not fit here; leave the -1.0 marker.
            if (iA > lenA_m_wSize || iB > lenB_m_wSize)
                continue;

            double score = 0.0;
            //
            // We always skip the distance from a residue to its direct
            // neighbour (col starts at row + 2). This is a time-saving
            // heuristic: almost all alpha carbons of neighbouring residues
            // are 3.8 Angstroms apart, so that distance carries no
            // information.
            //
            for (int row = 0; row < wSize - 2; row++) {
                for (int col = row + 2; col < wSize; col++) {
                    score +=
                        fabs(d1[iA + row][iA + col] - d2[iB + row][iB + col]);
                }
            }
            S[iA][iB] = score / sumSize;
        }
    }
    return S;
}
// Convert a Python list of [x, y, z] lists into a C array of points.
//
// Parameters:
//   L       borrowed reference to a list whose first `length` items are
//           lists holding at least three numbers
//   length  number of points to read from L
//
// Returns a malloc'ed array of `length` points owned by the caller (release
// it with free()). On any failure returns NULL with a Python exception set
// and nothing left allocated.
pcePoint
getCoords(PyObject *L, int length) {
    if (!PyList_Check(L)) {
        PyErr_SetString(PyExc_TypeError,
                        "coordinates must be a list of [x, y, z] lists");
        return NULL;
    }

    // make space for the current coords; request at least one element so
    // an empty list cannot be mistaken for an allocation failure
    pcePoint coords =
        (pcePoint)malloc(sizeof(cePoint) * (size_t)(length > 0 ? length : 1));
    if (!coords) {
        PyErr_NoMemory();
        return NULL;
    }

    // loop through the arguments, pulling out the XYZ coordinates.
    for (int i = 0; i < length; i++) {
        // Borrowed reference; NULL (with IndexError set) if L holds fewer
        // than `length` items.
        PyObject *curCoord = PyList_GetItem(L, i);
        if (!curCoord) {
            free(coords);
            return NULL;
        }
        if (!PyList_Check(curCoord) || PyList_Size(curCoord) < 3) {
            PyErr_SetString(PyExc_TypeError,
                            "each coordinate must be a list of three numbers");
            free(coords);
            return NULL;
        }

        // Hold strong references while converting: PyFloat_AsDouble may
        // call the object's __float__ method.
        Py_INCREF(curCoord);
        double xyz[3];
        int failed = 0;
        for (int axis = 0; axis < 3 && !failed; axis++) {
            PyObject *curVal = PyList_GetItem(curCoord, axis);
            if (!curVal) {
                failed = 1;
                break;
            }
            Py_INCREF(curVal);
            xyz[axis] = PyFloat_AsDouble(curVal);
            Py_DECREF(curVal);
            // -1.0 is also a valid coordinate, so the error indicator has
            // to be consulted to tell the two cases apart.
            if (xyz[axis] == -1.0 && PyErr_Occurred())
                failed = 1;
        }
        Py_DECREF(curCoord);

        if (failed) {
            free(coords);
            return NULL;
        }
        coords[i].x = xyz[0];
        coords[i].y = xyz[1];
        coords[i].z = xyz[2];
    }
    return coords;
}
// Append the C int `value` to the Python list `list`.
// Returns 0 on success, -1 with a Python exception set on failure.
static int
appendIndex(PyObject *list, int value) {
    PyObject *v = Py_BuildValue("i", value);
    if (!v)
        return -1;
    int status = PyList_Append(list, v);
    Py_DECREF(v); // PyList_Append does not steal; drop our own reference
    return status;
}

// Expand one stored path into the Python pair [[A indices], [B indices]].
// Every AFP in `p` contributes `winSize` consecutive indices to each list.
// At most `maxLen` AFPs are read; reading stops at the first unused slot.
// Returns a new reference, or NULL with a Python exception set.
static PyObject *
pathToPyLists(path p, int maxLen, int winSize) {
    PyObject *pairList = NULL;
    PyObject *pathAList = PyList_New(0);
    PyObject *pathBList = PyList_New(0);
    if (!pathAList || !pathBList)
        goto fail;

    // Unused trailing slots of a path keep their initial -1 marker.
    for (int j = 0; j < maxLen && p[j].first != -1; j++) {
        int idxA = p[j].first;
        int idxB = p[j].second;
        for (int k = 0; k < winSize; k++) {
            if (appendIndex(pathAList, idxA + k) < 0 ||
                appendIndex(pathBList, idxB + k) < 0)
                goto fail;
        }
    }

    pairList = PyList_New(2);
    if (!pairList)
        goto fail;
    // PyList_SET_ITEM steals the references to both index lists.
    PyList_SET_ITEM(pairList, 0, pathAList);
    PyList_SET_ITEM(pairList, 1, pathBList);
    return pairList;

fail:
    Py_XDECREF(pathAList);
    Py_XDECREF(pathBList);
    return NULL;
}

// Find the best N alignment paths
//
// Walks the similarity matrix S and, from every acceptable start cell
// (iA, iB), greedily chains aligned fragment pairs (AFPs), trying every gap
// up to gapMax on either structure at each step. The best paths seen are
// kept in a ring buffer of MAX_PATHS entries.
//
// Parameters:
//   S           similarity matrix indexed [iA][iB]; cells equal to -1.0 or
//               not below 3.0 are never used as a path start
//   dA, dB      distance matrices of structures A and B
//   lenA, lenB  number of residues in A and B
//   winSize     fragment length; callers are expected to pass >= 1
//   gapMax      largest gap tried between two consecutive fragments
//
// Returns a new reference to a Python list with one entry per stored path
// (at most MAX_PATHS, possibly none). Each entry is [[Ai, Aj, ...],
// [Bi, Bj, ...]] where An and Bn are equivalent positions in A and B.
// Returns NULL with a Python exception set if memory runs out or a Python
// object cannot be created.
PyObject *
findPath(double **S, double **dA, double **dB, int lenA, int lenB,
         int winSize, int gapMax) {
    const double D0 = 3.0;
    const double D1 = 4.0;
    int i, j;
    int iA, iB;
    int gapBestIndex = -1;
    PyObject *result = NULL;

    // Score of the best Path
    double bestPathScore = 1e6;
    int bestPathLength = 0;

    // Length of longest possible alignment
    int smaller = (lenA < lenB) ? lenA : lenB;
    int winSum = (winSize - 1) * (winSize - 2) / 2;
    // Number of gap variants tried per extension step
    int numGaps = (gapMax * 2) + 1;

    // Allocation sizes are clamped to at least one element so degenerate
    // inputs never turn into a zero-sized or negative request.
    size_t pathSlots = (size_t)(smaller > 0 ? smaller : 1);
    size_t gapSlots = (size_t)(numGaps > 0 ? numGaps : 1);

    //======================================================================
    // for storing the best N paths
    int bufferIndex = 0;
    int bufferSize = 0;
    int lenBuffer[MAX_PATHS];
    double scoreBuffer[MAX_PATHS];
    path pathBuffer[MAX_PATHS];
    for (i = 0; i < MAX_PATHS; i++) {
        // initialize the paths
        scoreBuffer[i] = 1e6;
        lenBuffer[i] = 0;
        pathBuffer[i] = NULL;
    }

    path bestPath = (path)malloc(sizeof(afp) * pathSlots);
    // Scratch path, reset at every start cell
    path curPath = (path)malloc(sizeof(afp) * pathSlots);
    // winCache
    // this array stores a list of residues seen. We use it to calculate the
    // total score of a path from 1..M and then add it to M+1..N.
    int *winCache = (int *)malloc(sizeof(int) * pathSlots);
    int *tIndex = (int *)malloc(sizeof(int) * pathSlots);
    // allScoreBuffer
    // this 2D array keeps track of all partial gapped scores
    double **allScoreBuffer =
        (double **)malloc(sizeof(double *) * pathSlots);
    if (allScoreBuffer) {
        // NULL rows make the cleanup below safe after a partial failure.
        for (i = 0; i < smaller; i++)
            allScoreBuffer[i] = NULL;
    }
    if (!bestPath || !curPath || !winCache || !tIndex || !allScoreBuffer) {
        PyErr_NoMemory();
        goto cleanup;
    }

    for (i = 0; i < smaller; i++) {
        bestPath[i].first = -1;
        bestPath[i].second = -1;
    }
    for (i = 0; i < smaller; i++)
        winCache[i] = (i + 1) * i * winSize / 2 + (i + 1) * winSum;
    for (i = 0; i < smaller; i++) {
        allScoreBuffer[i] = (double *)malloc(gapSlots * sizeof(double));
        if (!allScoreBuffer[i]) {
            PyErr_NoMemory();
            goto cleanup;
        }
        // initialize the ASB
        for (j = 0; j < numGaps; j++)
            allScoreBuffer[i][j] = 1e6;
    }

    //======================================================================
    // Start the search through the CE matrix.
    //
    for (iA = 0; iA < lenA; iA++) {
        if (iA > lenA - winSize * (bestPathLength - 1))
            break;

        for (iB = 0; iB < lenB; iB++) {
            if (S[iA][iB] >= D0)
                continue;
            if (S[iA][iB] == -1.0)
                continue;
            if (iB > lenB - winSize * (bestPathLength - 1))
                break;

            //
            // Restart curPath here.
            //
            for (i = 0; i < smaller; i++) {
                curPath[i].first = -1;
                curPath[i].second = -1;
            }
            curPath[0].first = iA;
            curPath[0].second = iB;
            int curPathLength = 1;
            tIndex[curPathLength - 1] = 0;
            double curTotalScore = 0.0;

            //
            // Check all possible paths starting from iA, iB
            //
            int done = 0;
            while (!done) {
                double gapBestScore = 1e6;
                gapBestIndex = -1;

                //
                // Check all possible gaps [1..gapMax] from here
                //
                for (int g = 0; g < numGaps; g++) {
                    int jA = curPath[curPathLength - 1].first + winSize;
                    int jB = curPath[curPathLength - 1].second + winSize;

                    // Odd g opens the gap in A, even g opens it in B;
                    // g == 0 is the ungapped continuation.
                    if ((g + 1) % 2 == 0) {
                        jA += (g + 1) / 2;
                    } else {
                        jB += (g + 1) / 2;
                    }

                    //
                    // Following are three heuristics to ensure high quality
                    // long paths and make sure we don't run over the end of
                    // the S matrix.
                    // 1st: If jA and jB are at the end of the matrix
                    if (jA > lenA - winSize || jB > lenB - winSize) {
                        // FIXME, was: jA > lenA-winSize-1 || jB >
                        // lenB-winSize-1
                        continue;
                    }
                    // 2nd: If this gapped octapeptide is bad, ignore it.
                    if (S[jA][jB] > D0)
                        continue;
                    // 3rd: if too close to end, ignore it.
                    if (S[jA][jB] == -1.0)
                        continue;

                    double curScore = 0.0;
                    for (int s = 0; s < curPathLength; s++) {
                        curScore += fabs(dA[curPath[s].first][jA] -
                                         dB[curPath[s].second][jB]);
                        curScore += fabs(dA[curPath[s].first + (winSize - 1)]
                                           [jA + (winSize - 1)] -
                                         dB[curPath[s].second + (winSize - 1)]
                                           [jB + (winSize - 1)]);
                        for (int k = 1; k < winSize - 1; k++)
                            curScore += fabs(dA[curPath[s].first + k]
                                               [jA + (winSize - 1) - k] -
                                             dB[curPath[s].second + k]
                                               [jB + (winSize - 1) - k]);
                    }
                    curScore /= (double)winSize * (double)curPathLength;

                    if (curScore >= D1) {
                        continue;
                    }
                    // store GAPPED best
                    if (curScore < gapBestScore) {
                        curPath[curPathLength].first = jA;
                        curPath[curPathLength].second = jB;
                        gapBestScore = curScore;
                        gapBestIndex = g;
                        allScoreBuffer[curPathLength - 1][g] = curScore;
                    }
                } /// ROF -- END GAP SEARCHING

                //
                // DONE GAPPING:
                //
                // calculate curTotalScore
                curTotalScore = 0.0;
                int jGap, gA, gB;
                double score1 = 0.0, score2 = 0.0;

                if (gapBestIndex != -1) {
                    jGap = (gapBestIndex + 1) / 2;
                    if ((gapBestIndex + 1) % 2 == 0) {
                        gA = curPath[curPathLength - 1].first + winSize + jGap;
                        gB = curPath[curPathLength - 1].second + winSize;
                    } else {
                        gA = curPath[curPathLength - 1].first + winSize;
                        gB =
                            curPath[curPathLength - 1].second + winSize + jGap;
                    }

                    // perfect
                    score1 = (allScoreBuffer[curPathLength - 1][gapBestIndex] *
                                  winSize * curPathLength +
                              S[gA][gB] * winSum) /
                             (winSize * curPathLength + winSum);

                    // perfect
                    score2 =
                        ((curPathLength > 1
                              ? (allScoreBuffer[curPathLength - 2]
                                               [tIndex[curPathLength - 1]])
                              : S[iA][iB]) *
                             winCache[curPathLength - 1] +
                         score1 * (winCache[curPathLength] -
                                   winCache[curPathLength - 1])) /
                        winCache[curPathLength];

                    curTotalScore = score2;

                    // heuristic -- path is getting sloppy, stop looking
                    if (curTotalScore > D1) {
                        done = 1;
                        gapBestIndex = -1;
                        break;
                    } else {
                        allScoreBuffer[curPathLength - 1][gapBestIndex] =
                            curTotalScore;
                        tIndex[curPathLength] = gapBestIndex;
                        curPathLength++;
                    }
                } else {
                    // if here, then there was no good gapped path
                    // so quit and restart from iA, iB+1
                    done = 1;
                    curPathLength--;
                    break;
                }

                //
                // test this gapped path against the best seen
                // starting from iA, iB
                //
                // if our currently best gapped path from iA and iB is LONGER
                // than the current best; or, it's equal length and the score's
                // better, keep the new path.
                if (curPathLength > bestPathLength ||
                    (curPathLength == bestPathLength &&
                     curTotalScore < bestPathScore)) {
                    bestPathLength = curPathLength;
                    bestPathScore = curTotalScore;
                    // deep copy curPath; both arrays hold `smaller` slots
                    for (int c = 0; c < smaller; c++)
                        bestPath[c] = curPath[c];
                }
            } /// END WHILE

            //
            // At this point, we've found the best path starting at iA, iB.
            //
            if (bestPathLength > lenBuffer[bufferIndex] ||
                (bestPathLength == lenBuffer[bufferIndex] &&
                 bestPathScore < scoreBuffer[bufferIndex])) {
                path pathCopy = (path)malloc(sizeof(afp) * pathSlots);
                if (!pathCopy) {
                    PyErr_NoMemory();
                    goto cleanup;
                }
                for (i = 0; i < smaller; i++)
                    pathCopy[i] = bestPath[i];

                // we're going to add an entry to the ring-buffer.
                // Adjust maxSize values and curIndex accordingly.
                bufferIndex =
                    (bufferIndex == MAX_PATHS - 1) ? 0 : bufferIndex + 1;
                bufferSize =
                    (bufferSize < MAX_PATHS) ? (bufferSize) + 1 : MAX_PATHS;

                // The entry goes into the slot just before the advanced
                // index, wrapping to the last slot once the buffer is full.
                int slot = (bufferIndex == 0 && bufferSize == MAX_PATHS)
                               ? MAX_PATHS - 1
                               : bufferIndex - 1;
                free(pathBuffer[slot]); // drop the path being overwritten
                pathBuffer[slot] = pathCopy;
                scoreBuffer[slot] = bestPathScore;
                lenBuffer[slot] = bestPathLength;
            }
        } // ROF -- end for iB
    }     // ROF -- end for iA

    // To make it simpler to use this code and more portable, we are decoupling
    // the path finding (the actual CEAlign innovation) from the RMSD
    // calculation.
    //
    // As such, we return the best paths to Python-land. Since the paths are
    // encoded as structs, it's simpler to return each path as a list of
    // lists with the corresponding atom indices.
    //
    // The list is sized to the number of paths actually stored, so it never
    // contains an unset (NULL) item.
    result = PyList_New(bufferSize);
    if (!result)
        goto cleanup;
    for (int o = 0; o < bufferSize; o++) {
        PyObject *pairList = pathToPyLists(pathBuffer[o], smaller, winSize);
        if (!pairList) {
            Py_DECREF(result);
            result = NULL;
            goto cleanup;
        }
        // PyList_SET_ITEM steals the reference to pairList.
        PyList_SET_ITEM(result, o, pairList);
    }

cleanup:
    // free memory; every pointer is either valid or NULL at this point
    if (allScoreBuffer) {
        for (i = 0; i < smaller; i++)
            free(allScoreBuffer[i]);
        free(allScoreBuffer);
    }
    free(tIndex);
    free(winCache);
    free(curPath);
    free(bestPath);
    for (i = 0; i < MAX_PATHS; i++)
        free(pathBuffer[i]);
    return result;
}
// Release a matrix made of `rows` separately allocated rows plus the row
// array itself. Accepts NULL.
static void
freeMatrix(double **m, int rows) {
    if (!m)
        return;
    for (int i = 0; i < rows; i++)
        free(m[i]);
    free(m);
}

// Main Function
//
// Python entry point behind run_cealign(listA, listB[, windowSize, gapMax]).
// Reads the two coordinate lists, builds both distance matrices and the
// similarity matrix, and returns whatever findPath produces.
//
// Defaults: windowSize = 8, gapMax = 30.
//
// Returns a new reference to the list of paths, or NULL with a Python
// exception set: TypeError for arguments that are not lists, ValueError for
// a non-positive windowSize or a negative gapMax, OverflowError for lists
// too long for a C int, MemoryError when an allocation fails. Empty input
// lists yield an empty result list.
PyObject *
PyCealign(PyObject *Py_UNUSED(self), PyObject *args) {
    int windowSize = 8;
    int gapMax = 30;
    PyObject *listA = NULL, *listB = NULL, *result = NULL;
    pcePoint coordsA = NULL, coordsB = NULL;
    double **dmA = NULL, **dmB = NULL, **S = NULL;

    /* Unpack the arguments from Python; "O!" rejects anything but a list */
    if (!PyArg_ParseTuple(args, "O!O!|ii", &PyList_Type, &listA,
                          &PyList_Type, &listB, &windowSize, &gapMax))
        return NULL;

    if (windowSize < 1) {
        PyErr_SetString(PyExc_ValueError,
                        "windowSize must be greater than 0");
        return NULL;
    }
    if (gapMax < 0) {
        PyErr_SetString(PyExc_ValueError, "gapMax must be zero or positive");
        return NULL;
    }

    /* Get the list lengths */
    const Py_ssize_t sizeA = PyList_Size(listA);
    const Py_ssize_t sizeB = PyList_Size(listB);
    const int lenA = (int)sizeA;
    const int lenB = (int)sizeB;
    /* The helpers take int lengths; refuse lists that do not fit */
    if ((Py_ssize_t)lenA != sizeA || (Py_ssize_t)lenB != sizeB) {
        PyErr_SetString(PyExc_OverflowError,
                        "coordinate lists are too long");
        return NULL;
    }

    /* Nothing can be aligned against an empty structure */
    if (lenA == 0 || lenB == 0)
        return PyList_New(0);

    /* get the coordinates from the Python objects */
    coordsA = (pcePoint)getCoords(listA, lenA);
    if (!coordsA || PyErr_Occurred())
        goto cleanup;
    coordsB = (pcePoint)getCoords(listB, lenB);
    if (!coordsB || PyErr_Occurred())
        goto cleanup;

    /* calculate the distance matrix for each protein */
    dmA = (double **)calcDM(coordsA, lenA);
    if (!dmA)
        goto cleanup;
    dmB = (double **)calcDM(coordsB, lenB);
    if (!dmB)
        goto cleanup;

    /* calculate the CE Similarity matrix */
    S = (double **)calcS(dmA, dmB, lenA, lenB, windowSize);
    if (!S)
        goto cleanup;

    // Calculate Top N Paths
    result = (PyObject *)findPath(S, dmA, dmB, lenA, lenB, windowSize, gapMax);

cleanup:
    /* release memory */
    free(coordsA);
    free(coordsB);
    /* distance matrices */
    freeMatrix(dmA, lenA);
    freeMatrix(dmB, lenB);
    /* similarity matrix */
    freeMatrix(S, lenA);

    /* A helper that failed without raising can only have run out of memory */
    if (!result && !PyErr_Occurred())
        PyErr_NoMemory();
    return result;
}
//
// Python Interface
//
// Docstring of run_cealign, referenced from the method table. Adjacent
// string literals are concatenated by the compiler into a single string.
PyDoc_STRVAR(method_doc,
             "run_cealign(coordsA, coordsB, windowSize, gapMax) -> list"
             "\n\n"
             "Find the optimal alignments between two structures, using CEAlign."
             "\n\n"
             "Arguments:\n"
             "- listA: List of lists with coordinates for structure A.\n"
             "- listB: List of lists with coordinates for structure B.\n"
             "- windowSize: Length of fragments to be used in alignment.\n"
             "- gapMax: Maximum gap allowed between two aligned fragment pairs.");
static PyMethodDef CEAlignMethods[] = {
{"run_cealign", PyCealign, METH_VARARGS, method_doc},
{NULL, NULL, 0, NULL}};
// Module-level docstring. Adjacent string literals are concatenated by the
// compiler into a single string.
PyDoc_STRVAR(module_doc,
             "Pairwise structure alignment of 3D structures using combinatorial extension."
             "\n\n"
             "This module implements a single function: run_cealign. "
             "Refer to its docstring for more documentation on usage and implementation.");
PyObject *PyInit_ccealign(void) {
static struct PyModuleDef moduledef = {PyModuleDef_HEAD_INIT,
"ccealign",
module_doc,
-1,
CEAlignMethods,
NULL,
NULL,
NULL,
NULL};
return PyModule_Create(&moduledef);
}