Spaces:
Runtime error
Runtime error
File size: 7,870 Bytes
b4346be d940adf b4346be |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 |
import os
import multiprocessing
from multiprocessing import Pool
from turtle import width
import numpy as np
from moleculekit.molecule import Molecule
from scipy.spatial import KDTree
from sklearn.cluster import AgglomerativeClustering
def create_grid_fromBB(boundingBox, voxelSize=1):
    """Create a regular grid of points spanning a bounding box.

    Parameters
    ----------
    boundingBox : sequence
        Two corner points ``[[xmin, ymin, zmin], [xmax, ymax, zmax]]``
        (the layout returned by ``get_bb``).
    voxelSize : float
        Spacing between neighbouring grid points along every axis.

    Returns
    -------
    gridpoints : numpy.ndarray
        Array of shape (nx * ny * nz, 3) holding the coordinates of every
        grid point. Points are ordered with x varying slowest and z varying
        fastest.
    box_N : tuple of int
        Number of grid points along each axis, ``(nx, ny, nz)``.
    """
    lower, upper = boundingBox[0], boundingBox[1]
    # np.arange excludes its stop value; padding the upper bound by 0.5 keeps
    # the maximum coordinate inside the sampled range.
    axes = [
        np.arange(lower[dim], upper[dim] + 0.5, step=voxelSize) for dim in range(3)
    ]
    # "ij" indexing followed by a C-order reshape reproduces the nesting
    # "for x: for y: for z" without a Python-level loop.
    mesh = np.meshgrid(*axes, indexing="ij")
    gridpoints = (
        np.stack(mesh, axis=-1).reshape(-1, 3).astype(np.float64, copy=False)
    )
    return gridpoints, tuple(len(axis) for axis in axes)
def get_bb(points):
    """Compute the axis-aligned bounding box of a set of points.

    Parameters
    ----------
    points : numpy.ndarray
        Coordinates of shape (N, 3); only the first three columns are read.

    Returns
    -------
    boundingBox : list
        Two corners ``[[xmin, ymin, zmin], [xmax, ymax, zmax]]``.
    """
    axes = range(3)
    lower_corner = [np.min(points[:, axis]) for axis in axes]
    upper_corner = [np.max(points[:, axis]) for axis in axes]
    return [lower_corner, upper_corner]
def get_all_protein_resids(pdb_file):
    """Return the ``index`` of every protein C-alpha atom in a pdb file.

    The structure is loaded, restricted to the ``protein`` selection with
    ``Molecule.filter`` and then queried for the ``index`` field of all atoms
    matching ``name CA``.

    Parameters
    ----------
    pdb_file : str
        Path to pdb file

    Returns
    -------
    resids : numpy.ndarray
        Result of ``prot.get("index", sel="name CA")`` on the filtered
        structure (presumably one entry per residue; verify against the
        moleculekit documentation).

    Raises
    ------
    SystemExit
        With the message ``"could not read file"`` when the file cannot be
        loaded.
    """
    try:
        prot = Molecule(pdb_file)
    except Exception as err:
        # Same exit behaviour as before, but without relying on the
        # site-provided ``exit`` helper and without swallowing
        # KeyboardInterrupt the way a bare ``except`` does.
        raise SystemExit("could not read file") from err
    prot.filter("protein")
    return prot.get("index", sel="name CA")
def get_all_metalbinding_resids(pdb_file):
    """Return the ``index`` of the C-alpha atom of every potentially metal-binding residue.

    The structure is loaded from ``pdb_file``, restricted to the ``protein``
    selection and queried for C-alpha atoms whose residue name is one of
    HIS, HID, HIE, HIP, CYS, CYX, GLU, GLH, GLN, ASP, ASH, ASN or MET.

    Parameters
    ----------
    pdb_file : str
        Path to pdb file

    Returns
    -------
    resids : numpy.ndarray
        Result of ``prot.get("index", sel=...)`` for the selection above,
        evaluated on the filtered structure.

    Raises
    ------
    SystemExit
        With the message ``"could not read file"`` when the file cannot be
        loaded.
    """
    try:
        # Load the file the caller asked for; the argument used to be ignored
        # in favour of a hard-coded example structure.
        prot = Molecule(pdb_file)
    except Exception as err:
        # Same exit behaviour as before, but without relying on the
        # site-provided ``exit`` helper and without swallowing
        # KeyboardInterrupt the way a bare ``except`` does.
        raise SystemExit("could not read file") from err
    prot.filter("protein")
    return prot.get(
        "index",
        sel="name CA and resname HIS HID HIE HIP CYS CYX GLU GLH GLN ASP ASH ASN GLN MET",
    )
def compute_average_p_fast(point, cutoff=1):
    """Average the probabilities stored near a query point.

    Looks up at most 15 neighbours of ``point`` within ``cutoff`` in the
    module-level ``tree`` and averages the matching entries of the
    module-level ``output_v``. Both globals are set by
    ``get_probability_mean``.

    Parameters
    ----------
    point : numpy.ndarray
        Point of shape (3,)
    cutoff : float
        Cutoff distance in Angstrom

    Returns
    -------
    average_p : float
        Mean of the neighbouring probabilities, or ``0`` when no neighbour
        lies within the cutoff.
    """
    distances, neighbor_ids = tree.query(
        point, k=15, distance_upper_bound=cutoff, workers=1
    )
    # Slots for which no neighbour was found are reported with an infinite
    # distance; only the remaining slots carry a usable index.
    found = distances != np.inf
    if not found.any():
        return 0
    return np.mean(output_v[neighbor_ids[found]])
def _init_probability_worker(kdtree, values):
    """Install the shared KD-tree and probability values in a pool worker."""
    global tree, output_v
    tree = kdtree
    output_v = values


def get_probability_mean(grid, prot_centers, pvalues):
    """Compute the mean probability at every point of the global grid.

    A KD-tree is built over ``prot_centers`` and ``compute_average_p_fast``
    is evaluated for every row of ``grid`` in a process pool with one worker
    per CPU.

    Parameters
    ----------
    grid : numpy.ndarray
        Grid points of shape (M, 3); each row is passed to
        ``compute_average_p_fast`` as one query point.
    prot_centers : numpy.ndarray
        Coordinates of shape (N, 3) the KD-tree is built from.
    pvalues : numpy.ndarray
        Probability values, indexed by the row number of ``prot_centers``.

    Returns
    -------
    mean_p : numpy.ndarray
        One averaged probability per row of ``grid``.

    Notes
    -----
    The module-level names ``output_v``, ``prot_v`` and ``tree`` are
    (re)assigned as a side effect.
    """
    global output_v, prot_v, tree
    output_v = pvalues
    prot_v = prot_centers
    tree = KDTree(prot_v)
    cpu_count = multiprocessing.cpu_count()
    # The initializer hands the lookup data to every worker explicitly, so the
    # workers do not depend on inheriting module globals through ``fork``.
    # The context manager shuts the pool down once the map has finished
    # instead of leaking the worker processes.
    with Pool(
        cpu_count,
        initializer=_init_probability_worker,
        initargs=(tree, output_v),
    ) as pool:
        results = pool.map(compute_average_p_fast, grid)
    return np.array(results)
def write_cubefile(bb, pvalues, box_N, outname="Metal3D_pmap.cube", gridres=1):
    """Write a probability map as a Gaussian-style cube file.

    Lengths are multiplied by 1.89 to convert Angstrom to bohr. Values are
    written with x as the outer loop, y as the middle loop and z as the inner
    loop, six values per line, and every z-run is terminated by a newline.

    Parameters
    ----------
    bb : list
        Bounding box ``[[xmin, ymin, zmin], [xmax, ymax, zmax]]``; only the
        first corner is used, as the origin of the volume.
    pvalues : numpy.ndarray
        Probability values; reshaped to ``box_N`` before writing.
    box_N : tuple
        Number of voxels in each dimension
    outname : str
        Name of the output file
    gridres : float
        Resolution of the grid used for writing the voxels
    """
    with open(outname, "w") as cube:
        cube.write(" Metal3D Cube File\n")
        cube.write(" Outer Loop: X, Middle Loop y, inner Loop z\n")

        # Header line 3: atom count followed by the origin in bohr.
        bohr_per_angstrom = 1.89
        origin_fields = [f"{bb[0][axis]*bohr_per_angstrom: .6f}" for axis in range(3)]
        cube.write(" 1 " + " ".join(origin_fields) + "\n")

        # Header lines 4-6: voxel count and step vector of each axis.
        for axis in range(3):
            step_vector = ["0.000000", "0.000000", "0.000000"]
            step_vector[axis] = f"{1.890000*gridres:.9f}"
            count = str(box_N[axis]).rjust(5)
            cube.write(count + " " + " ".join(step_vector) + "\n")

        # Single placeholder atom record.
        cube.write(" 1 1.000000 0.000000 0.000000 0.000000\n")

        volume = pvalues.reshape(box_N)
        for plane in volume:
            for z_run in plane:
                pieces = []
                for z_index, value in enumerate(z_run):
                    pieces.append(f" {value: .5E}")
                    if z_index % 6 == 5:
                        pieces.append("\n")
                pieces.append("\n")
                cube.write("".join(pieces))
def find_unique_sites(
    pvalues, grid, writeprobes=False, probefile="probes.pdb", threshold=5, p=0.75
):
    """Reduce the voxels above a probability cutoff to one site per cluster.

    The probability voxels are points and one voxel cloud may contain several
    metals. Voxels with a probability above ``p`` are grouped with
    complete-linkage ``AgglomerativeClustering``; for every cluster the voxel
    with the highest probability is kept as the site. ``threshold`` is the
    maximum distance between two points of the same cluster and can be
    lowered to obtain more metal sites.

    Parameters
    ----------
    pvalues : numpy.ndarray
        Probability values of shape (N,), one per row of ``grid``
    grid : numpy.ndarray
        Grid of shape (N, 3)
    writeprobes : bool
        If True, write the sites as zinc HETATM records to ``probefile``
    probefile : str
        Name of the output file
    threshold : float
        Maximum distance between two points in the same cluster
    p : float
        Minimum probability of a point to be considered a unique site

    Returns
    -------
    message : str
        ``"no metals found"`` when no voxel exceeds ``p``, otherwise a summary
        of the cutoff and the number of sites found.
    """
    above_cutoff = pvalues > p
    points = grid[above_cutoff]
    point_p = pvalues[above_cutoff]
    if len(points) == 0:
        return "no metals found"

    if len(points) == 1:
        # AgglomerativeClustering needs at least two samples; a lone voxel is
        # trivially its own cluster.
        labels = np.zeros(1, dtype=int)
        n_clusters = 1
    else:
        clustering = AgglomerativeClustering(
            n_clusters=None, linkage="complete", distance_threshold=threshold
        ).fit(points)
        labels = clustering.labels_
        n_clusters = clustering.n_clusters_
    message = f"min metal p={p}, n(metals) found: {n_clusters}"

    sites = []
    for label in range(n_clusters):
        members = labels == label
        member_points = points[members]
        member_p = point_p[members]
        best = np.argmax(member_p)
        sites.append((member_points[best], member_p[best]))

    if writeprobes:
        print(f"writing probes to {probefile}")
        with open(probefile, "w") as f:
            for serial, (position, probability) in enumerate(sites, start=1):
                # Fixed-column PDB HETATM record: serial in columns 7-11, atom
                # name from 13, residue name in 18-20, chain in 22, residue
                # number in 23-26, coordinates in 31-54, probability in the
                # occupancy field (55-60), B-factor in 61-66, element and
                # charge in 77-80.
                f.write(
                    f"HETATM{serial:5d} ZN    ZN A{serial:4d}    "
                    f"{position[0]:8.3f}{position[1]:8.3f}{position[2]:8.3f}"
                    f"{probability:6.2f}  0.00          ZN2+\n"
                )
    return message
|