libokj's picture
Upload 110 files
c0ec7e6
raw
history blame
4.34 kB
"""
MinHashed Atom-pair Fingerprint, MAP
Original paper: Capecchi, Alice, Daniel Probst, and Jean-Louis Reymond. "One molecular fingerprint to rule them all: drugs, biomolecules, and the metabolome." Journal of Cheminformatics 12.1 (2020): 1-15.
Original code: https://github.com/reymond-group/map4 (thanks to the authors for their original work).
A small bug is fixed: https://github.com/reymond-group/map4/issues/6
"""
# Category label for this fingerprint. It is not referenced anywhere else in
# this file; presumably it is read by code outside this module -- TODO confirm.
_type = 'topological-based'
import itertools
from collections import defaultdict
import tmap as tm
from mhfp.encoder import MHFPEncoder
from rdkit import Chem
from rdkit.Chem import rdmolops
from rdkit.Chem.rdmolops import GetDistanceMatrix
def to_smiles(mol):
    """Return the canonical, non-isomeric SMILES string for ``mol``.

    Arguments:
        mol -- rdkit mol object
    Returns:
        the value produced by ``Chem.MolToSmiles`` with ``canonical=True``
        and ``isomericSmiles=False``
    """
    smiles_options = {'canonical': True, 'isomericSmiles': False}
    return Chem.MolToSmiles(mol, **smiles_options)
class MAP4Calculator:
    """MinHashed atom-pair (MAP4) fingerprint calculator.

    For every pair of atoms a "shingle" string is built from the two atoms'
    rooted environment SMILES and their topological distance. The resulting
    set of shingles is handed to the configured encoder: ``tm.Minhash`` in the
    default mode, or ``MHFPEncoder`` (hash + fold) when ``is_folded`` is set.
    """

    def __init__(self, dimensions=2048, radius=2, is_counted=False, is_folded=False, fold_dimensions=2048):
        """
        MAP4 calculator class

        Arguments:
            dimensions -- size passed to the encoder constructor
            radius -- largest environment radius; radii 1..radius are used
            is_counted -- if True, repeated shingles get an occurrence counter
                appended so that duplicates stay distinguishable
            is_folded -- if True, use MHFPEncoder and fold the hashed shingles
            fold_dimensions -- length passed to ``encoder.fold`` (folded mode only)
        """
        self.dimensions = dimensions
        self.radius = radius
        self.is_counted = is_counted
        self.is_folded = is_folded
        self.fold_dimensions = fold_dimensions

        # The two modes rely on different encoder APIs: folded mode calls
        # ``hash``/``fold``, unfolded mode calls ``from_string_array`` and
        # ``batch_from_string_array``.
        if self.is_folded:
            self.encoder = MHFPEncoder(dimensions)
        else:
            self.encoder = tm.Minhash(dimensions)

    def calculate(self, mol):
        """Calculates the atom pair minhashed fingerprint

        Arguments:
            mol -- rdkit mol object
        Returns:
            folded mode: the result of ``encoder.fold`` for the molecule's shingles
            otherwise: tmap VectorUint -- minhashed fingerprint
        """
        atom_env_pairs = self._calculate(mol)
        if self.is_folded:
            return self._fold(atom_env_pairs)
        return self.encoder.from_string_array(atom_env_pairs)

    def calculate_many(self, mols):
        """Calculates the atom pair minhashed fingerprint for several molecules

        Arguments:
            mols -- list of mols
        Returns:
            folded mode: list with one ``encoder.fold`` result per molecule
            otherwise: the result of ``encoder.batch_from_string_array``
                (list of tmap VectorUint -- minhashed fingerprints list)
        """
        atom_env_pairs_list = [self._calculate(mol) for mol in mols]
        if self.is_folded:
            return [self._fold(pairs) for pairs in atom_env_pairs_list]
        return self.encoder.batch_from_string_array(atom_env_pairs_list)

    def _calculate(self, mol):
        """Return the de-duplicated shingles (bytes) of ``mol``."""
        return self._all_pairs(mol, self._get_atom_envs(mol))

    def _fold(self, pairs):
        """Hash the unique ``pairs`` with the encoder and fold to ``fold_dimensions``."""
        fp_hash = self.encoder.hash(set(pairs))
        return self.encoder.fold(fp_hash, self.fold_dimensions)

    def _get_atom_envs(self, mol):
        """Map every atom index to its environment SMILES for radii 1..radius.

        The list stored for an atom is ordered by increasing radius, so
        position ``i`` holds the environment of radius ``i + 1``.
        """
        atoms_env = {}
        for atom in mol.GetAtoms():
            idx = atom.GetIdx()
            atoms_env[idx] = [
                MAP4Calculator._find_env(mol, idx, radius)
                for radius in range(1, self.radius + 1)
            ]
        return atoms_env

    @classmethod
    def _find_env(cls, mol, idx, radius):
        """Return the SMILES of the radius-``radius`` environment rooted at atom ``idx``.

        The environment reported by ``FindAtomEnvironmentOfRadiusN`` is
        extracted as a sub-molecule and written as canonical, non-isomeric
        SMILES rooted at the atom's position inside that sub-molecule.
        Returns ``''`` when the atom does not appear in the sub-molecule's
        atom map.
        """
        env = rdmolops.FindAtomEnvironmentOfRadiusN(mol, radius, idx)
        # Filled by PathToSubmol: original atom index -> index in the sub-molecule.
        atom_map = {}
        submol = Chem.PathToSubmol(mol, env, atomMap=atom_map)
        if idx not in atom_map:
            return ''
        return Chem.MolToSmiles(submol, rootedAtAtom=atom_map[idx], canonical=True, isomericSmiles=False)

    def _all_pairs(self, mol, atoms_env):
        """Build the unique shingles for all atom pairs of ``mol``.

        Arguments:
            mol -- rdkit mol object
            atoms_env -- mapping atom index -> list of environment SMILES,
                as produced by ``_get_atom_envs``
        Returns:
            list of UTF-8 encoded shingles without duplicates (order not defined)
        """
        atom_pairs = []
        distance_matrix = GetDistanceMatrix(mol)
        num_atoms = mol.GetNumAtoms()
        # Occurrence counter per shingle; only consulted in counted mode.
        shingle_dict = defaultdict(int)
        for idx1, idx2 in itertools.combinations(range(num_atoms), 2):
            dist = str(int(distance_matrix[idx1][idx2]))
            for i in range(self.radius):
                env_a = atoms_env[idx1][i]
                env_b = atoms_env[idx2][i]
                # Sort the two environments so the shingle does not depend on
                # which atom of the pair comes first.
                ordered = sorted([env_a, env_b])
                shingle = '{}|{}|{}'.format(ordered[0], dist, ordered[1])
                if self.is_counted:
                    # Suffix the running count so repeated shingles stay distinct.
                    shingle_dict[shingle] += 1
                    shingle += '|' + str(shingle_dict[shingle])
                atom_pairs.append(shingle.encode('utf-8'))
        return list(set(atom_pairs))
def GetMAP4(mol, nBits=2048, radius=2, fold_dimensions=None):
    """
    MAP4: radius=2

    Computes the folded, uncounted MAP4 fingerprint of a single molecule.

    Arguments:
        mol -- rdkit mol object
        nBits -- ``dimensions`` given to MAP4Calculator
        radius -- largest atom-environment radius
        fold_dimensions -- length of the folded fingerprint; defaults to ``nBits``
    Returns:
        the folded fingerprint converted with ``.astype(bool)``
    """
    # Identity test: ``== None`` would be evaluated element-wise for
    # array-like arguments.
    if fold_dimensions is None:
        fold_dimensions = nBits
    calc = MAP4Calculator(dimensions=nBits, radius=radius, is_counted=False, is_folded=True,
                          fold_dimensions=fold_dimensions)
    arr = calc.calculate(mol)
    return arr.astype(bool)