# Copyright Generate Biomedicines, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import print_function import numpy as np import torch import torch.linalg import torch.nn as nn from chroma.layers import graph from chroma.layers.linalg import eig_leading from chroma.layers.structure import geometry, protein_graph class CrossRMSD(nn.Module): """Compute optimal RMSDs between two sets of structures. This module uses the quaternion-based approach for calculating RMSDs as described in `Using Quaternions to Calculate RMSD`, 2004, by Coutsias, Seok, and Dill. The minimal RMSD and associated rotation are computed in terms of the most positive eigenvalue and associated eigvector of a special 4x4 matrix. Args: method (str, optional): Method for calculating the most postive eigenvalue. Can be `power` or `symeig`. If `symeig`, this will use `torch.symeig`, which is the most accurate method but tends to be very slow on GPU for large batches of RMSDs. If `power`, then use power iteration to estimate leading eigenvalues. Default is `power`. method_iter (int, optional): When the method is `power`, this argument sets the number of power iterations used for approximation. The default is 50, which has tended to produce estimates of optimal RMSD with sub-angstrom accuracy on test problems. Note: Convergence rates of power iteration can be highly variable dependening on the system. If accuracy is important, it is recommended to compare outputs with `symeig`-based RMSDs. 当使用 "power" 方法时,此参数设置幂迭代的次数 Inputs: X_mobile (Tensor): Mobile coordinates, i.e. the "mobile" coordinates, with shape `(num_source, num_atoms, 3)`. X_target (Tensor): Target coordinates with shape `(num_target, num_atoms, 3)`. Outputs: RMSD (Tensors): RMSDs after optimal superposition for all pairs of source and target structures with shape `(num_source, num_target)`. While `forward` returns the Cartesian product of all possible alignments, i.e. (`num_source * num_target` alignments), the `pairedRMSD` will do the same calculation for zipped batches, i.e. `num_source` total alignments. """ """ method:计算最大特征值的方法,可以是 "power" 或 "symeig"。 method_iter:当使用 "power" 方法时,此参数设置幂迭代的次数。 _eps:一个小的正数,用于避免除以零的错误。 dither:一个布尔值,用于决定是否在计算中加入随机扰动。 """ def __init__(self, method="power", method_iter=50, dither=True): super(CrossRMSD, self).__init__() self.method = method self.method_iter = method_iter self._eps = 1e-5 self.dither = dither # R_to_F converts xyz cross-covariance matrices (3x3) to the (4x4) F # matrix of Coutsias et al. This F matrix encodes the optimal RMSD in # its spectra; namely, the eigenvector associated with the most # positive eigenvalue of F is the quaternion encoding the optimal # 3D rotation for superposition. # fmt: off R_to_F = np.zeros((9, 16)).astype("f") F_nonzero = [ [(0,0,1.),(1,1,1.),(2,2,1.)], [(1,2,1.),(2,1,-1.)], [(2,0,1.),(0,2,-1.)], [(0,1,1.),(1,0,-1.)], [(1,2,1.),(2,1,-1.)], [(0,0,1.),(1,1,-1.),(2,2,-1.)], [(0,1,1.),(1,0,1.)], [(0,2,1.),(2,0,1.)], [(2,0,1.),(0,2,-1.)], [(0,1,1.),(1,0,1.)], [(0,0,-1.),(1,1,1.),(2,2,-1.)], [(1,2,1.),(2,1,1.)], [(0,1,1.),(1,0,-1.)], [(0,2,1.),(2,0,1.)], [(1,2,1.),(2,1,1.)], [(0,0,-1.),(1,1,-1.),(2,2,1.)] ] # fmt: on for F_ij, nonzero in enumerate(F_nonzero): for R_i, R_j, sign in nonzero: R_to_F[R_i * 3 + R_j, F_ij] = sign self.register_buffer("R_to_F", torch.tensor(R_to_F)) """ 在这个方法中,首先对坐标进行中心化处理,然后计算交叉协方差矩阵, R 展平并与 R_to_F 矩阵相乘得到 F 矩阵。 之后,根据 method 参数选择的方法计算 F 矩阵的最大特征值,并使用这个特征值来计算 RMSD. """ def forward(self, X_mobile, X_target): num_source = X_mobile.size(0) num_target = X_target.size(0) num_atoms = X_mobile.size(1) # Center coordinates X_mobile = X_mobile - X_mobile.mean(dim=1, keepdim=True) X_target = X_target - X_target.mean(dim=1, keepdim=True) # CrossCov matrices contract over atoms R = torch.einsum("sai,taj->stij", [X_mobile, X_target]) # F Matrix has leading eigenvector as optimal quaternion R_flat = R.reshape(num_source, num_target, 9) F = torch.matmul(R_flat, self.R_to_F).reshape(num_source, num_target, 4, 4) # Compute optimal quaternion by extracting leading eigenvector if self.method == "symeig": top_eig = torch.linalg.eigvalsh(F)[:, :, 3] elif self.method == "power": top_eig, vec = eig_leading(F, num_iterations=self.method_iter) else: raise NotImplementedError # Compute RMSD in terms of RMSD using the scheme of Coutsias et al norms = (X_mobile ** 2).sum(dim=[-1, -2]).unsqueeze(1) + (X_target ** 2).sum( dim=[-1, -2] ).unsqueeze(0) sqRMSD = torch.relu((norms - 2 * top_eig) / (num_atoms + self._eps)) RMSD = torch.sqrt(sqRMSD) return RMSD def pairedRMSD( self, X_mobile, X_target, mask=None, compute_alignment=False, align_unmasked=False, ): """Compute optimal RMSDs between each corresponding batch members. Args: X_mobile (Tensor): Mobile coordinates with shape `(..., num_atoms, 3)`. X_target (Tensor): Target coordinates with shape `(..., num_atoms, 3)`. mask (Tensor, optional): Binary mask tensor for missing atoms with shape `(..., num_atoms)`. compute_alignment (boolean, optional): If True, also return the superposed coordinates. Returns: RMSD (Tensors): Optimal RMSDs after superposition for all pairs of input structures with shape `(...)`. X_mobile_transform (Tensor, optional): Superposed coordinates with shape `(..., num_atoms, 3)`. Requires `compute_alignment` = True`. """ # Collapse all leading batch dimensions num_atoms = X_mobile.size(-2) batch_dims = list(X_mobile.shape)[:-2] X_mobile = X_mobile.reshape([-1, num_atoms, 3]) X_target = X_target.reshape([-1, num_atoms, 3]) num_batch = X_mobile.size(0) if mask is not None: mask = mask.reshape([-1, num_atoms]) # Center coordinates if mask is None: X_mobile_mean = X_mobile.mean(dim=1, keepdim=True) X_target_mean = X_target.mean(dim=1, keepdim=True) else: mask_expand = mask.unsqueeze(-1) X_mobile_mean = torch.sum(mask_expand * X_mobile, 1, keepdim=True) / ( torch.sum(mask_expand, 1, keepdim=True) + self._eps ) X_target_mean = torch.sum(mask_expand * X_target, 1, keepdim=True) / ( torch.sum(mask_expand, 1, keepdim=True) + self._eps ) X_mobile_center = X_mobile - X_mobile_mean X_target_center = X_target - X_target_mean if mask is not None: X_mobile_center = mask_expand * X_mobile_center X_target_center = mask_expand * X_target_center # Cross-covariance matrices contract over atoms R = torch.einsum("sai,saj->sij", [X_mobile_center, X_target_center]) # F Matrix has leading eigenvector as optimal quaternion R_flat = R.reshape(num_batch, 9) R_to_F = self.R_to_F.type(R_flat.dtype) F = torch.matmul(R_flat, R_to_F).reshape(num_batch, 4, 4) if self.dither: F = F + 1e-5 * torch.randn_like(F) # Compute optimal quaternion by extracting leading eigenvector if self.method == "symeig": L, V = torch.linalg.eigh(F) top_eig = L[:, 3] vec = V[:, :, 3] elif self.method == "power": top_eig, vec = eig_leading(F, num_iterations=self.method_iter) else: raise NotImplementedError # Compute RMSD using top eigenvalue norms = (X_mobile_center ** 2).sum(dim=[-1, -2]) + (X_target_center ** 2).sum( dim=[-1, -2] ) sqRMSD = torch.relu((norms - 2 * top_eig) / (num_atoms + self._eps)) rmsd = torch.sqrt(sqRMSD) if not compute_alignment: # Unpack leading batch dimensions rmsd = rmsd.reshape(batch_dims) return rmsd else: R = geometry.rotations_from_quaternions(vec, normalize=False) X_mobile_transform = torch.einsum("bxr,bir->bix", R, X_mobile_center) X_mobile_transform = X_mobile_transform + X_target_mean if mask is not None: X_mobile_transform = mask_expand * X_mobile_transform # Return the RMSD of the transformed coordinates rmsd_direct = rmsd_unaligned(X_mobile_transform, X_target, mask) # Unpack leading batch dimensions rmsd_direct = rmsd_direct.reshape(batch_dims) X_mobile_transform = X_mobile_transform.reshape(batch_dims + [num_atoms, 3]) if align_unmasked: X_mobile_transform = X_mobile - X_mobile_mean X_mobile_transform = torch.einsum( "bxr, bir -> bix", R, X_mobile_transform.view(X_mobile.size(0), -1, 3), ) X_mobile_transform = X_mobile_transform + X_target_mean return rmsd_direct, X_mobile_transform class BackboneRMSD(nn.Module): """Compute optimal RMSDs between two sets of backbones. This wraps `CrossRMSD` for use with XCS-formatted protein data. Args: method (str, optional): Method for calculating the most postive eigenvalue. Can be `power` or `symeig`. Default is `power`. method_iter (int, optional): Number of power iterations for eigenvalue approximation. Requires `method=power`. Default is 50. Inputs: X_mobile (Tensor): Mobile coordinates with shape `(num_source, num_atoms, 4, 3)`. X_target (Tensor): Target coordinates with shape `(num_target, num_atoms, 4, 3)`. C (Tensor): Chain map with shape `(num_batch, num_residues)`. Outputs: X_aligned (Tensor, optional): Superposed `X_mobile` with shape `(num_batch, num_atoms, 3)`. rmsd (Tensors): Optimal RMSDs after superposition with shape `(num_batch)`. """ def __init__(self, method="symeig"): super(BackboneRMSD, self).__init__() self.rmsd = CrossRMSD(method=method) """ 在 align 方法中,首先根据链映射 C 创建一个掩码 mask。这个掩码用于确定蛋白质中哪些部分将被用于对齐计算。 接着,将输入的蛋白质坐标 X_mobile 和 X_target 重塑为适合 RMSD 计算的格式。 然后,使用 CrossRMSD 实例的 pairedRMSD 方法计算 RMSD 并获取对齐后的坐标。 最后,将对齐后的坐标重新塑形为原始蛋白质坐标的格式并返回. """ def align(self, X_mobile, X_target, C, align_unmasked=False): mask = (C > 0).type(torch.float32) mask_flat = mask.unsqueeze(-1).expand(-1, -1, 4).reshape(mask.shape[0], -1) X_mobile_flat = X_mobile.reshape(X_mobile.size(0), -1, 3) X_target_flat = X_target.reshape(X_target.size(0), -1, 3) rmsd, X_aligned = self.rmsd.pairedRMSD( X_mobile_flat, X_target_flat, mask=mask_flat, compute_alignment=True, align_unmasked=align_unmasked, ) X_aligned = X_aligned.reshape(X_mobile.size()).contiguous() return X_aligned, rmsd class LossFragmentRMSD(nn.Module): """Compute optimal fragment-pair RMSDs between two sets of backbones. Args: fragment_k (int, option): Fram method (str, optional): Method for calculating the most postive eigenvalue. Can be `power` or `symeig`. Default is `power`. method_iter (int, optional): Number of power iterations for eigenvalue approximation. Requires `method=power`. Default is 50. Inputs: X_mobile (Tensor): Mobile coordinates with shape `(num_source, num_atoms, 4, 3)`. X_target (Tensor): Target coordinates with shape `(num_target, num_atoms, 4, 3)`. edge_idx C (Tensor): Chain map with shape `(num_batch, num_residues)`. Outputs: rmsd (Tensor, optional): Per-site fragment RMSDs with shape `(num_batch)`. """ def __init__(self, k=7, method="symeig", method_iter=50): super(LossFragmentRMSD, self).__init__() self.k = k self.rmsd = CrossRMSD(method=method, method_iter=method_iter) """ X_mobile 和 X_target:分别表示待对齐的蛋白质和目标蛋白质的坐标。 C:表示链映射,用于确定蛋白质中哪些残基(residues)应该被考虑在对齐过程中。 return_coords:一个布尔值,指示是否返回对齐后的坐标。 在 forward 方法中,首先将输入的蛋白质坐标 X_mobile 和 X_target 限制在背骨原子上。 然后,使用 _collect_X_fragments 函数(这个函数没有在代码中定义,可能是在其他地方定义的)从每个蛋白质中收集片段,并根据链映射 C 创建掩码。 之后,使用 CrossRMSD 实例的 pairedRMSD 方法计算每个片段对的 RMSD,并根据 return_coords 参数决定是否返回对齐后的坐标. """ def forward(self, X_mobile, X_target, C, return_coords=False): # Discard potential sidechain coordinates X_mobile = X_mobile[:, :, :4, :] X_target = X_target[:, :, :4, :] # Build graph and pair fragments X_fragment_mobile, C_fragment_mobile = _collect_X_fragments(X_mobile, C, self.k) X_fragment_target, C_fragment_target = _collect_X_fragments(X_target, C, self.k) shape = list(C.shape) + [-1, 3] X_fragment_mobile = X_fragment_mobile.reshape(shape) X_fragment_target = X_fragment_target.reshape(shape) mask = (C_fragment_mobile > 0).float() rmsd, X_fragment_mobile_align = self.rmsd.pairedRMSD( X_fragment_mobile, X_fragment_target, mask, compute_alignment=True ) if return_coords: return rmsd, X_fragment_target, X_fragment_mobile, X_fragment_mobile_align else: return rmsd class LossFragmentPairRMSD(nn.Module): """Compute optimal fragment-pair RMSDs between two sets of backbones. Args: fragment_k (int, option): Fram method (str, optional): Method for calculating the most postive eigenvalue. Can be `power` or `symeig`. Default is `power`. method_iter (int, optional): Number of power iterations for eigenvalue approximation. Requires `method=power`. Default is 50. Inputs: X_mobile (Tensor): Mobile coordinates with shape `(num_source, num_atoms, 4, 3)`. X_target (Tensor): Target coordinates with shape `(num_target, num_atoms, 4, 3)`. edge_idx C (Tensor): Chain map with shape `(num_batch, num_residues)`. Outputs: rmsd (Tensor, optional): Per-site fragment RMSDs with shape `(num_batch)`. """ def __init__(self, k=7, method="symeig", method_iter=50, graph_num_neighbors=30): super(LossFragmentPairRMSD, self).__init__() self.k = k self.rmsd = CrossRMSD(method=method, method_iter=method_iter) self.graph_builder = protein_graph.ProteinGraph( num_neighbors=graph_num_neighbors ) def _stack_neighbor(self, node_h, edge_idx): neighbor_h = graph.collect_neighbors(node_h, edge_idx) node_h = node_h[:, :, None, :].expand(neighbor_h.shape) edge_h = torch.cat([neighbor_h, node_h], dim=-1) return edge_h def _collect_X_fragment_pairs(self, X, C, edge_idx): X_kmer, C_kmer = _collect_X_fragments(X, C, self.k) X_pair = self._stack_neighbor(X_kmer, edge_idx) C_pair = self._stack_neighbor(C_kmer, edge_idx) X_pair = X_pair.reshape(list(X_pair.shape)[:-1] + [-1, 3]) return X_pair, C_pair def forward(self, X_mobile, X_target, C, return_coords=False): # Discard potential sidechain coordinates X_mobile = X_mobile[:, :, :4, :] X_target = X_target[:, :, :4, :] # Build graph and pair fragments edge_idx, mask_ij = self.graph_builder(X_target, C) X_pair_mobile, C_pair_mobile = self._collect_X_fragment_pairs( X_mobile, C, edge_idx ) X_pair_target, C_pair_target = self._collect_X_fragment_pairs( X_target, C, edge_idx ) mask = (C_pair_mobile > 0).float() rmsd, X_pair_mobile_align = self.rmsd.pairedRMSD( X_pair_mobile, X_pair_target, mask, compute_alignment=True ) if return_coords: return rmsd, mask_ij, X_pair_target, X_pair_mobile, X_pair_mobile_align else: return rmsd, mask_ij class LossNeighborhoodRMSD(nn.Module): """Compute optimal fragment-pair RMSDs between two sets of backbones. Args: fragment_k (int, option): Fram method (str, optional): Method for calculating the most postive eigenvalue. Can be `power` or `symeig`. Default is `power`. method_iter (int, optional): Number of power iterations for eigenvalue approximation. Requires `method=power`. Default is 50. Inputs: X_mobile (Tensor): Mobile coordinates with shape `(num_source, num_atoms, 4, 3)`. X_target (Tensor): Target coordinates with shape `(num_target, num_atoms, 4, 3)`. edge_idx C (Tensor): Chain map with shape `(num_batch, num_residues)`. Outputs: rmsd (Tensor, optional): Per-site fragment RMSDs with shape `(num_batch)`. """ def __init__(self, method="symeig", method_iter=50, graph_num_neighbors=30): super(LossNeighborhoodRMSD, self).__init__() self.rmsd = CrossRMSD(method=method, method_iter=method_iter) self.graph_builder = protein_graph.ProteinGraph( num_neighbors=graph_num_neighbors ) def _collect_X_neighborhood(self, X, C, edge_idx): num_batch, num_nodes, num_atoms, _ = X.shape shape_flat = [num_batch, num_nodes, -1] X_flat = X.reshape(shape_flat) C_flat = C[..., None].expand([-1, -1, num_atoms]) X_neighborhood = graph.collect_neighbors(X_flat, edge_idx).reshape( [num_batch, num_nodes, -1, 3] ) C_neighborhood = graph.collect_neighbors(C_flat, edge_idx).reshape( [num_batch, num_nodes, -1] ) return X_neighborhood, C_neighborhood def forward(self, X_mobile, X_target, C, return_coords=False): # Discard potential sidechain coordinates X_mobile = X_mobile[:, :, :4, :] X_target = X_target[:, :, :4, :] # Build graph and pair fragments edge_idx, mask_ij = self.graph_builder(X_target, C) X_neighborhood_mobile, C_neighborhood_mobile = self._collect_X_neighborhood( X_mobile, C, edge_idx ) X_neighborhood_target, C_neighborhood_target = self._collect_X_neighborhood( X_target, C, edge_idx ) mask = (C_neighborhood_mobile > 0).float() rmsd, X_neighborhood_mobile_align = self.rmsd.pairedRMSD( X_neighborhood_mobile, X_neighborhood_target, mask, compute_alignment=True ) mask = (mask.sum(-1) > 0).float() if return_coords: return ( rmsd, mask, X_neighborhood_target, X_neighborhood_mobile, X_neighborhood_mobile_align, ) else: return rmsd, mask def rmsd_unaligned(X_a, X_b, mask=None, eps=1e-5, _min_rmsd=1e-8): """Compute RMSD between two coordinate sets without alignment. Args: X_a (Tensor): Coordinate set 1 with shape `(..., num_points, 3)`. X_b (Tensor): Coordinate set 2 with shape `(..., num_points, 3)`. mask (Tensor, optional): Mask with shape `(..., num_points)`. eps (float, optional): Small number to prevent division by zero. default is 1E-5. Returns: rmsd (Tensor): Root mean squared deviations (raw) with shape `(...)`. """ squared_dev = ((X_a - X_b) ** 2).sum(-1) if mask is None: rmsd = torch.sqrt(squared_dev.mean(-1).clamp(min=_min_rmsd)) else: rmsd = torch.sqrt( (mask * squared_dev).sum(-1).clamp(min=_min_rmsd) / (mask.sum(-1) + eps) ) return rmsd """ 这两个函数是处理蛋白质结构数据的关键部分,特别是在需要从蛋白质结构中提取和分析特定长度片段的情况下。 _collect_X_fragments 函数处理蛋白质的坐标和链映射信息,以收集和处理特定长度的片段, 而 _collect_kmers 函数则是一个更通用的工具,用于从任何给定的节点特征矩阵中收集 k-mers. _collect_X_fragments: 函数首先将 X 和 C 转换为扁平形状。 然后,使用 _collect_kmers 函数从 X_flat 和 C_flat 中收集 k-mers,这些 k-mers 本质上是局部的、长度为 k 的片段。 最后,函数使用 torch.where 来处理非连续原子,将它们视为缺失,并返回处理后的 X_kmer 和 C_kmer。 _collect_kmers: 函数的主要步骤包括: 构建索引以定位 k-mers。首先,创建一个长度为 k 的索引数组 k_idx。 然后,使用这个索引和节点的索引 node_idx 生成 k-mers 的索引 kmer_idx。 使用 kmer_idx 从 node_h 中收集相邻节点的特征,形成新的 k-mer 特征矩阵 kmer_h。 这个函数的关键在于它能够从原始的节点特征矩阵中构建出包含局部邻居信息的新矩阵,这对于处理基于图的结构(如蛋白质结构)特别有用。 """ def _collect_X_fragments(X, C, k): num_batch, num_nodes, num_atoms, _ = X.shape shape_flat = [num_batch, num_nodes, -1] X_flat = X.reshape(shape_flat) C_flat = C[..., None].expand([-1, -1, num_atoms]) # Grab local kmers X_kmer = _collect_kmers(X_flat, k).reshape(shape_flat) C_kmer = _collect_kmers(C_flat, k).reshape(shape_flat) # Treat noncontiguous atoms as missing C_kmer = torch.where(C[..., None].eq(C_kmer), C_kmer, -C_kmer.abs()) return X_kmer, C_kmer def _collect_kmers(node_h, k): """Gather `(B,I,H) => (B,I,K,H)`""" device = node_h.device num_batch, num_nodes, _ = node_h.shape # Build indices k_idx = torch.arange(k, device=device) - (k - 1) // 2 node_idx = torch.arange(node_h.shape[1], device=device) kmer_idx = node_idx[None, :, None] - k_idx[None, None, :] kmer_idx = kmer_idx.clamp(min=0, max=num_nodes - 1).long() kmer_idx = kmer_idx.expand([num_batch, -1, k]) # Collect neighbors kmer_h = graph.collect_neighbors(node_h, kmer_idx) return kmer_h