Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
File size: 4,342 Bytes
c0ec7e6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 |
"""
MinHashed Atom-pair Fingerprint, MAP
Original paper: Capecchi, Alice, Daniel Probst, and Jean-Louis Reymond. "One molecular fingerprint to rule them all: drugs, biomolecules, and the metabolome." Journal of Cheminformatics 12.1 (2020): 1-15. Original code: https://github.com/reymond-group/map4 (thanks to the authors for their original work).
A small bug is fixed: https://github.com/reymond-group/map4/issues/6
"""
# Category label for this fingerprint module.
# NOTE(review): presumably read by external code that groups descriptors by
# family; nothing in this file uses it -- confirm against callers.
_type = 'topological-based'
import itertools
from collections import defaultdict
import tmap as tm
from mhfp.encoder import MHFPEncoder
from rdkit import Chem
from rdkit.Chem import rdmolops
from rdkit.Chem.rdmolops import GetDistanceMatrix
def to_smiles(mol):
    """Return the canonical, non-isomeric SMILES string of an rdkit mol."""
    smiles_options = {'canonical': True, 'isomericSmiles': False}
    return Chem.MolToSmiles(mol, **smiles_options)
class MAP4Calculator:
    """MinHashed atom-pair (MAP) fingerprint calculator.

    Every atom is described by the SMILES of its environments at radii
    1..``radius``. Each unordered atom pair then contributes one shingle per
    radius of the form ``env|distance|env``, and the de-duplicated shingles
    are handed to the configured encoder.
    """

    def __init__(self, dimensions=2048, radius=2, is_counted=False, is_folded=False, fold_dimensions=2048):
        """
        MAP4 calculator class

        Arguments:
            dimensions -- size passed to the encoder constructor
            radius -- largest environment radius collected per atom
            is_counted -- if True, a running occurrence counter is appended to each shingle
            is_folded -- if True, hash with MHFPEncoder and fold the result;
                otherwise use tmap's Minhash
            fold_dimensions -- length passed to the encoder's fold call
                (only read in folded mode)
        """
        self.dimensions = dimensions
        self.radius = radius
        self.is_counted = is_counted
        self.is_folded = is_folded
        self.fold_dimensions = fold_dimensions
        # Folded mode hashes with MHFPEncoder; unfolded mode with tmap's Minhash.
        self.encoder = MHFPEncoder(dimensions) if self.is_folded else tm.Minhash(dimensions)

    def calculate(self, mol):
        """Calculates the atom pair minhashed fingerprint

        Arguments:
            mol -- rdkit mol object
        Returns:
            unfolded mode: the result of the encoder's from_string_array
            (tmap VectorUint); folded mode: the result of the encoder's fold
        """
        shingles = self._calculate(mol)
        if not self.is_folded:
            return self.encoder.from_string_array(shingles)
        return self._fold(shingles)

    def calculate_many(self, mols):
        """Calculates the atom pair minhashed fingerprint of several molecules

        Arguments:
            mols -- list of mols
        Returns:
            unfolded mode: the result of the encoder's batch_from_string_array;
            folded mode: a list with one folded fingerprint per molecule
        """
        # All shingle lists are built first, then encoded in one go.
        shingle_lists = [self._calculate(mol) for mol in mols]
        if not self.is_folded:
            return self.encoder.batch_from_string_array(shingle_lists)
        return [self._fold(shingles) for shingles in shingle_lists]

    def _calculate(self, mol):
        """Build the list of encoded atom-pair shingles for one molecule."""
        envs_by_atom = self._get_atom_envs(mol)
        return self._all_pairs(mol, envs_by_atom)

    def _fold(self, pairs):
        """Hash the set of shingles and fold the hashes to fold_dimensions."""
        hashed = self.encoder.hash(set(pairs))
        return self.encoder.fold(hashed, self.fold_dimensions)

    def _get_atom_envs(self, mol):
        """Map each atom index to its environment SMILES for radii 1..radius."""
        envs_by_atom = {}
        for atom in mol.GetAtoms():
            atom_idx = atom.GetIdx()
            for env_radius in range(1, self.radius + 1):
                # The entry is created lazily, so no key exists when radius < 1.
                atom_envs = envs_by_atom.setdefault(atom_idx, [])
                atom_envs.append(MAP4Calculator._find_env(mol, atom_idx, env_radius))
        return envs_by_atom

    @classmethod
    def _find_env(cls, mol, idx, radius):
        """Return the SMILES of the environment of atom idx at the given radius.

        The SMILES is rooted at the atom and is canonical and non-isomeric.
        An empty string is returned when the atom is not part of the
        extracted sub-molecule.
        """
        bond_ids = rdmolops.FindAtomEnvironmentOfRadiusN(mol, radius, idx)
        atom_map = {}
        submol = Chem.PathToSubmol(mol, bond_ids, atomMap=atom_map)
        if idx not in atom_map:
            return ''
        return Chem.MolToSmiles(submol, rootedAtAtom=atom_map[idx], canonical=True, isomericSmiles=False)

    def _all_pairs(self, mol, atoms_env):
        """Create the utf-8 encoded shingles of all atom pairs.

        Arguments:
            mol -- rdkit mol object
            atoms_env -- dict mapping atom index to its list of environment SMILES
        Returns:
            list of unique shingles as bytes
        """
        shingles = []
        distances = GetDistanceMatrix(mol)
        atom_count = mol.GetNumAtoms()
        seen_counts = defaultdict(int)
        for first, second in itertools.combinations(range(atom_count), 2):
            dist = str(int(distances[first][second]))
            for level in range(self.radius):
                # Sorting makes the shingle independent of the atom order in the pair.
                low, high = sorted((atoms_env[first][level], atoms_env[second][level]))
                shingle = f'{low}|{dist}|{high}'
                if self.is_counted:
                    # Repeated shingles are told apart by their running count.
                    seen_counts[shingle] += 1
                    shingle = f'{shingle}|{seen_counts[shingle]}'
                shingles.append(shingle.encode('utf-8'))
        return list(set(shingles))
def GetMAP4(mol, nBits=2048, radius=2, fold_dimensions=None):
    """
    Compute the folded MAP fingerprint of a molecule as a boolean array.

    MAP4: radius=2

    Arguments:
        mol -- rdkit mol object
        nBits -- forwarded to MAP4Calculator as ``dimensions``
        radius -- largest atom-environment radius
        fold_dimensions -- length the fingerprint is folded to; defaults to
            ``nBits`` when None
    Returns:
        the folded fingerprint from ``MAP4Calculator.calculate`` converted
        with ``astype(bool)``
    """
    # Identity test: ``== None`` would dispatch to a custom ``__eq__``.
    if fold_dimensions is None:
        fold_dimensions = nBits
    calc = MAP4Calculator(
        dimensions=nBits,
        radius=radius,
        is_counted=False,
        is_folded=True,
        fold_dimensions=fold_dimensions,
    )
    return calc.calculate(mol).astype(bool)
|