# metal3d/utils/helpers.py
# Simon Duerr
# fix voxelization issue with structures containing H
# 46124fc
import os
import multiprocessing
from multiprocessing import Pool
from turtle import width
import numpy as np
from moleculekit.molecule import Molecule
from scipy.spatial import KDTree
from sklearn.cluster import AgglomerativeClustering
def create_grid_fromBB(boundingBox, voxelSize=1):
    """Create a regular grid of points spanning a bounding box.

    Parameters
    ----------
    boundingBox : sequence
        Two opposite corners of the box in the form
        ``[[xmin, ymin, zmin], [xmax, ymax, zmax]]``.
    voxelSize : float
        Spacing between neighbouring grid points along each axis, in Angstrom.

    Returns
    -------
    gridpoints : numpy.ndarray
        Grid point coordinates of shape (nx * ny * nz, 3), ordered with x as
        the outer loop, y as the middle loop and z as the inner loop.
    box_N : tuple of int
        Number of grid points along each axis, ``(nx, ny, nz)``.
    """
    lower, upper = boundingBox[0], boundingBox[1]
    # increase grid by 0.5 to sample everything
    xrange = np.arange(lower[0], upper[0] + 0.5, step=voxelSize)
    yrange = np.arange(lower[1], upper[1] + 0.5, step=voxelSize)
    zrange = np.arange(lower[2], upper[2] + 0.5, step=voxelSize)
    box_N = (xrange.shape[0], yrange.shape[0], zrange.shape[0])

    # Fill an (nx, ny, nz, 3) array by broadcasting each axis into its own
    # coordinate column; flattening it in C order yields the x-outer /
    # z-inner point ordering without a Python-level triple loop.
    gridpoints = np.zeros(box_N + (3,))
    gridpoints[..., 0] = xrange[:, None, None]
    gridpoints[..., 1] = yrange[None, :, None]
    gridpoints[..., 2] = zrange[None, None, :]
    return gridpoints.reshape(-1, 3), box_N
def get_bb(points):
    """Compute the axis-aligned bounding box of a point cloud.

    Parameters
    ----------
    points : numpy.ndarray
        Coordinates of shape (N, 3); only the first three columns are read.

    Returns
    -------
    boundingBox : list
        The two extreme corners of the box,
        ``[[xmin, ymin, zmin], [xmax, ymax, zmax]]``.
    """
    lower_corner = [np.min(points[:, axis]) for axis in range(3)]
    upper_corner = [np.max(points[:, axis]) for axis in range(3)]
    return [lower_corner, upper_corner]
def get_all_protein_resids(pdb_file):
    """Return the atom indexes of all protein CA atoms in a pdb file.

    The structure is first reduced to ``protein and not hydrogen``; the
    returned indexes are those reported for that filtered structure.

    Parameters
    ----------
    pdb_file : str
        Path to pdb file

    Returns
    -------
    resids : numpy.ndarray
        indexes of the ``name CA`` atoms

    Raises
    ------
    SystemExit
        With the message "could not read file" if the structure cannot be
        loaded.
    """
    try:
        prot = Molecule(pdb_file)
    except Exception as err:
        # Catch only real errors (a bare ``except`` would also trap
        # KeyboardInterrupt) and raise SystemExit directly instead of using
        # the interactive-only ``exit()`` helper.
        raise SystemExit("could not read file") from err
    prot.filter("protein and not hydrogen")
    return prot.get("index", sel="name CA")
def get_all_metalbinding_resids(pdb_file):
    """Return the CA atom indexes of all potentially metal binding residues.

    The structure is first reduced to ``protein and not hydrogen``; CA atoms
    are then selected by residue name (HIS/CYS/GLU/GLN/ASP/ASN/MET and the
    protonation-state variants listed in the selection string).

    Parameters
    ----------
    pdb_file : str
        Path to pdb file

    Returns
    -------
    resids : numpy.ndarray
        indexes of the ``name CA`` atoms whose residue name is in the list

    Raises
    ------
    SystemExit
        With the message "could not read file" if the structure cannot be
        loaded.
    """
    try:
        prot = Molecule(pdb_file)
    except Exception as err:
        # Catch only real errors (a bare ``except`` would also trap
        # KeyboardInterrupt) and raise SystemExit directly instead of using
        # the interactive-only ``exit()`` helper.
        raise SystemExit("could not read file") from err
    prot.filter("protein and not hydrogen")
    # NOTE(review): GLN appears twice in the residue list; harmless, kept as is.
    return prot.get(
        "index",
        sel="name CA and resname HIS HID HIE HIP CYS CYX GLU GLH GLN ASP ASH ASN GLN MET",
    )
def get_all_resids_from_list(pdb_file, resids):
    """Return the CA atom indexes of a user supplied set of residues.

    The structure is first reduced to ``protein and not hydrogen``.

    Parameters
    ----------
    pdb_file : str
        Path to pdb file
    resids : str or sequence
        Residue ids to select. A string is inserted into the selection
        unchanged (e.g. ``"10 25 31"``); any other value is flattened and
        joined with spaces, so ``[10, 25, 31]`` selects the same residues.

    Returns
    -------
    resids : numpy.ndarray
        indexes of the ``name CA`` atoms of the requested residues

    Raises
    ------
    SystemExit
        With the message "could not read file" if the structure cannot be
        loaded.
    """
    try:
        prot = Molecule(pdb_file)
    except Exception as err:
        # Catch only real errors (a bare ``except`` would also trap
        # KeyboardInterrupt) and raise SystemExit directly instead of using
        # the interactive-only ``exit()`` helper.
        raise SystemExit("could not read file") from err
    prot.filter("protein and not hydrogen")

    # Interpolating a list directly would put "[1, 2, 3]" (brackets and
    # commas) into the selection, so build a space separated id list instead.
    if isinstance(resids, str):
        resid_sel = resids
    else:
        resid_sel = " ".join(str(r) for r in np.ravel(resids))
    return prot.get(
        "index",
        sel=f"name CA and resid {resid_sel}",
    )
def compute_average_p_fast(point, cutoff=1):
    """Average the probabilities of the tree points close to ``point``.

    Relies on the module level globals ``tree`` (a KDTree) and ``output_v``
    (the values indexed by the tree's point indices), which must have been
    set before this function is called.

    Parameters
    ----------
    point : numpy.ndarray
        Query point of shape (3,)
    cutoff : float
        Upper bound on the neighbour distance passed to the tree query

    Returns
    -------
    average_p : float
        Mean of ``output_v`` over the (at most 15) neighbours returned by the
        query, or 0 when every returned distance is infinite.
    """
    distances, neighbor_ids = tree.query(
        point, k=15, distance_upper_bound=cutoff, workers=1
    )
    # Slots without a neighbour are reported with an infinite distance.
    if np.min(distances) == np.inf:
        return 0
    within_cutoff = distances != np.inf
    return np.mean(output_v[neighbor_ids[within_cutoff]])
def get_probability_mean(grid, prot_centers, pvalues, cutoff=1, k=15):
    """Compute the mean probability at every global grid point.

    For each grid point the (at most ``k``) nearest entries of
    ``prot_centers`` closer than ``cutoff`` are looked up and their
    probabilities are averaged; grid points without any such neighbour get 0.

    The lookup is done with batched ``KDTree.query`` calls instead of a
    process pool, so no worker processes are spawned (or leaked) and the
    result does not depend on globals being inherited by child processes.

    Parameters
    ----------
    grid : numpy.ndarray
        Grid point coordinates of shape (M, 3)
    prot_centers : numpy.ndarray
        Coordinates the probabilities belong to, shape (N, 3)
    pvalues : numpy.ndarray
        Probability values, one entry (or row) per center: shape (N,) or
        (N, 1). Rows with several columns are averaged per row.
    cutoff : float
        Maximum neighbour distance in Angstrom
    k : int
        Maximum number of neighbours averaged per grid point

    Returns
    -------
    mean_p : numpy.ndarray
        Mean probability per grid point, shape (M,)
    """
    # Kept for backward compatibility: compute_average_p_fast reads these.
    global output_v, prot_v, tree
    output_v = pvalues
    prot_v = prot_centers
    tree = KDTree(prot_v)

    grid = np.asarray(grid)
    n_points = len(grid)
    mean_p = np.zeros(n_points, dtype=float)
    values = np.asarray(pvalues, dtype=float)
    if n_points == 0 or values.size == 0:
        return mean_p
    if values.ndim > 1:
        # One scalar per center, so it can be indexed by the tree's point ids.
        values = values.reshape(values.shape[0], -1).mean(axis=1)

    # Query in chunks to bound the size of the (chunk, k) result arrays.
    chunk = 65536
    for start in range(0, n_points, chunk):
        block = grid[start:start + chunk]
        distances, neighbor_ids = tree.query(
            block, k=k, distance_upper_bound=cutoff, workers=-1
        )
        # k == 1 returns 1-D arrays; normalise to (len(block), k).
        distances = distances.reshape(len(block), -1)
        neighbor_ids = neighbor_ids.reshape(len(block), -1)

        # Missing neighbours have infinite distance and an out-of-range
        # index; redirect those to index 0 and mask their contribution.
        found = distances != np.inf
        safe_ids = np.where(found, neighbor_ids, 0)
        totals = np.where(found, values[safe_ids], 0.0).sum(axis=1)
        counts = found.sum(axis=1)
        np.divide(
            totals, counts, out=mean_p[start:start + chunk], where=counts > 0
        )
    return mean_p
def write_cubefile(bb, pvalues, box_N, outname="Metal3D_pmap.cube", gridres=1):
    """Write a cube file from a probability map

    The cube specification from gaussian is used, distance are converted to bohr.
    Values are written with x as the outer loop, y as the middle loop and z as
    the inner loop, six values per line, starting a new line for every (x, y)
    row.

    Parameters
    ----------
    bb : list
        Bounding box corners; only the lower corner ``bb[0]`` =
        ``[xmin, ymin, zmin]`` is used, as the origin of the grid.
    pvalues : numpy.ndarray
        Probability values; reshaped to ``box_N`` before writing
    box_N : tuple
        Number of voxels in each dimension
    outname : str
        Name of the output file
    gridres : float
        Resolution of the grid used for writing the voxels
    """
    # Conversion factor used for both the origin and the voxel spacing.
    # NOTE(review): 1.89 is a rounded value (1 Angstrom = 1.8897259886 bohr);
    # kept unchanged so existing output stays the same.
    angstromToBohr = 1.89
    # Reshape before opening the file so a shape mismatch does not leave a
    # header-only file behind.
    o = pvalues.reshape(box_N)
    step = angstromToBohr * gridres
    with open(outname, "w") as cube:
        cube.write(" Metal3D Cube File\n")
        cube.write(" Outer Loop: X, Middle Loop y, inner Loop z\n")
        cube.write(
            f" 1 {bb[0][0]*angstromToBohr: .6f} {bb[0][1]*angstromToBohr: .6f} {bb[0][2]*angstromToBohr: .6f}\n"
        )
        cube.write(f"{str(box_N[0]).rjust(5)} {step:.9f} 0.000000 0.000000\n")
        cube.write(f"{str(box_N[1]).rjust(5)} 0.000000 {step:.9f} 0.000000\n")
        cube.write(f"{str(box_N[2]).rjust(5)} 0.000000 0.000000 {step:.9f}\n")
        cube.write(" 1 1.000000 0.000000 0.000000 0.000000\n")

        for x in range(box_N[0]):
            for y in range(box_N[1]):
                row = o[x][y]
                # Six values per line; each line is terminated exactly once,
                # so no blank line appears when nz is a multiple of six.
                for start in range(0, box_N[2], 6):
                    cube.write(
                        "".join(f" {v: .5E}" for v in row[start:start + 6]) + "\n"
                    )
def find_unique_sites(
    pvalues, grid, writeprobes=False, probefile="probes.pdb", threshold=5, p=0.75
):
    """The probability voxels are points and the voxel clouds may contain multiple metals

    This function finds the unique sites and takes, for each cluster, the
    coordinates of the point with the highest p.
    It uses the AgglomerativeClustering algorithm (complete linkage) to find
    the unique sites.
    The threshold is the maximum distance between two points in the same
    cluster; it can be changed to get more metal points.

    Parameters
    ----------
    pvalues : numpy.ndarray
        Probability values of shape (N,), one per grid point
    grid : numpy.ndarray
        Grid of shape (N, 3)
    writeprobes : bool
        If True, write the probes to a pdb file
    probefile : str
        Name of the output file
    threshold : float
        Maximum distance between two points in the same cluster
    p : float
        Minimum probability of a point to be considered a unique site

    Returns
    -------
    message : str
        ``"no metals found"`` when no point exceeds ``p``, otherwise a summary
        with the probability cutoff and the number of sites found. The site
        coordinates themselves are only written to ``probefile``.
    """
    above = pvalues > p
    points = grid[above]
    point_p = pvalues[above]
    if len(points) == 0:
        return "no metals found"

    if len(points) == 1:
        # The clustering estimator needs at least two samples; a single
        # candidate voxel trivially forms one site.
        labels = np.zeros(1, dtype=int)
        n_clusters = 1
    else:
        clustering = AgglomerativeClustering(
            n_clusters=None, linkage="complete", distance_threshold=threshold
        ).fit(points)
        labels = clustering.labels_
        n_clusters = clustering.n_clusters_
    message = f"min metal p={p}, n(metals) found: {n_clusters}"

    # One site per cluster: the member with the highest probability.
    sites = []
    for i in range(n_clusters):
        in_cluster = labels == i
        c_points = points[in_cluster]
        c_points_p = point_p[in_cluster]
        position = c_points[np.argmax(c_points_p)]
        sites.append((position, np.max(c_points_p)))

    if writeprobes:
        print(f"writing probes to {probefile}")
        # NOTE(review): PDB is a fixed-column format; verify the spacing of
        # this record against the format specification.
        with open(probefile, "w") as f:
            for i, site in enumerate(sites):
                f.write(
                    f"HETATM {i+1:3} ZN ZN A {i+1:3} {site[0][0]: 8.3f}{site[0][1]: 8.3f}{site[0][2]: 8.3f} {site[1]:.2f} 0.0 ZN2+\n"
                )
    return message