# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Modules and code used in the core part of AlphaFold.

The structure generation code is in 'folding.py'.
"""

import functools

from alphafold.common import residue_constants
from alphafold.model import all_atom
from alphafold.model import common_modules
from alphafold.model import folding
from alphafold.model import layer_stack
from alphafold.model import lddt
from alphafold.model import mapping
from alphafold.model import prng
from alphafold.model import quat_affine
from alphafold.model import utils
import haiku as hk
import jax
import jax.numpy as jnp
from alphafold.model.r3 import Rigids, Rots, Vecs

def softmax_cross_entropy(logits, labels): | |
"""Computes softmax cross entropy given logits and one-hot class labels.""" | |
loss = -jnp.sum(labels * jax.nn.log_softmax(logits), axis=-1) | |
return jnp.asarray(loss) | |
def sigmoid_cross_entropy(logits, labels): | |
"""Computes sigmoid cross entropy given logits and multiple class labels.""" | |
log_p = jax.nn.log_sigmoid(logits) | |
# log(1 - sigmoid(x)) = log_sigmoid(-x), the latter is more numerically stable | |
log_not_p = jax.nn.log_sigmoid(-logits) | |
loss = -labels * log_p - (1. - labels) * log_not_p | |
return jnp.asarray(loss) | |
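
# --- Illustrative example (not part of the original AlphaFold module). ---
# A minimal sketch showing the expected shapes of the two cross-entropy
# helpers above on toy inputs. `_example_cross_entropies` is a hypothetical
# helper added purely for illustration.
def _example_cross_entropies():
  logits = jnp.array([[2.0, 0.5, -1.0]])
  one_hot_labels = jax.nn.one_hot(jnp.array([0]), 3)
  ce = softmax_cross_entropy(logits, one_hot_labels)   # shape [1]
  multi_labels = jnp.array([[1.0, 0.0, 1.0]])
  bce = sigmoid_cross_entropy(logits, multi_labels)    # shape [1, 3]
  return ce, bce
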
def apply_dropout(*, tensor, safe_key, rate, is_training, broadcast_dim=None): | |
"""Applies dropout to a tensor.""" | |
if is_training: # and rate != 0.0: | |
shape = list(tensor.shape) | |
if broadcast_dim is not None: | |
shape[broadcast_dim] = 1 | |
keep_rate = 1.0 - rate | |
keep = jax.random.bernoulli(safe_key.get(), keep_rate, shape=shape) | |
return keep * tensor / keep_rate | |
else: | |
return tensor | |
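
# --- Illustrative example (not part of the original AlphaFold module). ---
# A minimal sketch of "shared" dropout: with broadcast_dim=0 the same dropout
# mask is reused along the first axis, which is how 'per_row' modules are
# handled in dropout_wrapper below. `_example_shared_dropout` is a
# hypothetical helper added purely for illustration.
def _example_shared_dropout():
  x = jnp.ones((4, 8, 16))
  key = prng.SafeKey(jax.random.PRNGKey(0))
  return apply_dropout(tensor=x, safe_key=key, rate=0.25,
                       is_training=True, broadcast_dim=0)
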
def dropout_wrapper(module, | |
input_act, | |
mask, | |
safe_key, | |
global_config, | |
output_act=None, | |
is_training=True, | |
scale_rate=1.0, | |
**kwargs): | |
"""Applies module + dropout + residual update.""" | |
if output_act is None: | |
output_act = input_act | |
gc = global_config | |
residual = module(input_act, mask, is_training=is_training, **kwargs) | |
dropout_rate = 0.0 if gc.deterministic else module.config.dropout_rate | |
if module.config.shared_dropout: | |
if module.config.orientation == 'per_row': | |
broadcast_dim = 0 | |
else: | |
broadcast_dim = 1 | |
else: | |
broadcast_dim = None | |
residual = apply_dropout(tensor=residual, | |
safe_key=safe_key, | |
rate=dropout_rate * scale_rate, | |
is_training=is_training, | |
broadcast_dim=broadcast_dim) | |
new_act = output_act + residual | |
return new_act | |
def create_extra_msa_feature(batch):
  """Expand extra_msa into 1hot and concat with other extra msa features.

  We do this as late as possible as the one_hot extra msa can be very large.

  Arguments:
    batch: a dictionary with the following keys:
     * 'extra_msa': [N_extra_seq, N_res] MSA that wasn't selected as a cluster
       centre. Note that this is not one-hot encoded.
     * 'extra_has_deletion': [N_extra_seq, N_res] Whether there is a deletion to
       the left of each position in the extra MSA.
     * 'extra_deletion_value': [N_extra_seq, N_res] The number of deletions to
       the left of each position in the extra MSA.

  Returns:
    Concatenated tensor of extra MSA features.
  """
  # 23 = 20 amino acids + 'X' for unknown + gap + bert mask
  msa_1hot = jax.nn.one_hot(batch['extra_msa'], 23)
  msa_feat = [msa_1hot,
              jnp.expand_dims(batch['extra_has_deletion'], axis=-1),
              jnp.expand_dims(batch['extra_deletion_value'], axis=-1)]
  return jnp.concatenate(msa_feat, axis=-1)
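
# --- Illustrative example (not part of the original AlphaFold module). ---
# The concatenated extra-MSA feature has 23 + 1 + 1 = 25 channels per
# position. `_example_extra_msa_feature` is a hypothetical helper added purely
# for illustration.
def _example_extra_msa_feature():
  n_extra_seq, n_res = 5, 7
  batch = {
      'extra_msa': jnp.zeros((n_extra_seq, n_res), dtype=jnp.int32),
      'extra_has_deletion': jnp.zeros((n_extra_seq, n_res)),
      'extra_deletion_value': jnp.zeros((n_extra_seq, n_res)),
  }
  return create_extra_msa_feature(batch).shape  # (5, 7, 25)
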
class AlphaFoldIteration(hk.Module): | |
"""A single recycling iteration of AlphaFold architecture. | |
Computes ensembled (averaged) representations from the provided features. | |
These representations are then passed to the various heads | |
that have been requested by the configuration file. Each head also returns a | |
loss which is combined as a weighted sum to produce the total loss. | |
Jumper et al. (2021) Suppl. Alg. 2 "Inference" lines 3-22 | |
""" | |
def __init__(self, config, global_config, name='alphafold_iteration'): | |
super().__init__(name=name) | |
self.config = config | |
self.global_config = global_config | |
def __call__(self, | |
ensembled_batch, | |
non_ensembled_batch, | |
is_training, | |
compute_loss=False, | |
ensemble_representations=False, | |
return_representations=False): | |
num_ensemble = jnp.asarray(ensembled_batch['seq_length'].shape[0]) | |
if not ensemble_representations: | |
assert ensembled_batch['seq_length'].shape[0] == 1 | |
def slice_batch(i): | |
b = {k: v[i] for k, v in ensembled_batch.items()} | |
b.update(non_ensembled_batch) | |
return b | |
# Compute representations for each batch element and average. | |
evoformer_module = EmbeddingsAndEvoformer( | |
self.config.embeddings_and_evoformer, self.global_config) | |
batch0 = slice_batch(0) | |
representations = evoformer_module(batch0, is_training) | |
    # MSA representations are not ensembled so
    # we don't pass the tensor into the loop.
msa_representation = representations['msa'] | |
del representations['msa'] | |
# Average the representations (except MSA) over the batch dimension. | |
if ensemble_representations: | |
def body(x): | |
"""Add one element to the representations ensemble.""" | |
i, current_representations = x | |
feats = slice_batch(i) | |
representations_update = evoformer_module( | |
feats, is_training) | |
new_representations = {} | |
for k in current_representations: | |
new_representations[k] = ( | |
current_representations[k] + representations_update[k]) | |
return i+1, new_representations | |
if hk.running_init(): | |
# When initializing the Haiku module, run one iteration of the | |
# while_loop to initialize the Haiku modules used in `body`. | |
_, representations = body((1, representations)) | |
else: | |
_, representations = hk.while_loop( | |
lambda x: x[0] < num_ensemble, | |
body, | |
(1, representations)) | |
for k in representations: | |
if k != 'msa': | |
representations[k] /= num_ensemble.astype(representations[k].dtype) | |
representations['msa'] = msa_representation | |
batch = batch0 # We are not ensembled from here on. | |
if jnp.issubdtype(ensembled_batch['aatype'].dtype, jnp.integer): | |
_, num_residues = ensembled_batch['aatype'].shape | |
else: | |
_, num_residues, _ = ensembled_batch['aatype'].shape | |
if self.config.use_struct: | |
struct_module = folding.StructureModule | |
else: | |
struct_module = folding.dummy | |
heads = {} | |
for head_name, head_config in sorted(self.config.heads.items()): | |
if not head_config.weight: | |
continue # Do not instantiate zero-weight heads. | |
head_factory = { | |
'masked_msa': MaskedMsaHead, | |
'distogram': DistogramHead, | |
'structure_module': functools.partial(struct_module, compute_loss=compute_loss), | |
'predicted_lddt': PredictedLDDTHead, | |
'predicted_aligned_error': PredictedAlignedErrorHead, | |
'experimentally_resolved': ExperimentallyResolvedHead, | |
}[head_name] | |
heads[head_name] = (head_config, | |
head_factory(head_config, self.global_config)) | |
total_loss = 0. | |
ret = {} | |
ret['representations'] = representations | |
def loss(module, head_config, ret, name, filter_ret=True): | |
if filter_ret: | |
value = ret[name] | |
else: | |
value = ret | |
loss_output = module.loss(value, batch) | |
ret[name].update(loss_output) | |
loss = head_config.weight * ret[name]['loss'] | |
return loss | |
for name, (head_config, module) in heads.items(): | |
# Skip PredictedLDDTHead and PredictedAlignedErrorHead until | |
# StructureModule is executed. | |
if name in ('predicted_lddt', 'predicted_aligned_error'): | |
continue | |
else: | |
ret[name] = module(representations, batch, is_training) | |
if 'representations' in ret[name]: | |
# Extra representations from the head. Used by the structure module | |
# to provide activations for the PredictedLDDTHead. | |
representations.update(ret[name].pop('representations')) | |
if compute_loss: | |
total_loss += loss(module, head_config, ret, name) | |
if self.config.use_struct: | |
if self.config.heads.get('predicted_lddt.weight', 0.0): | |
# Add PredictedLDDTHead after StructureModule executes. | |
name = 'predicted_lddt' | |
# Feed all previous results to give access to structure_module result. | |
head_config, module = heads[name] | |
ret[name] = module(representations, batch, is_training) | |
if compute_loss: | |
total_loss += loss(module, head_config, ret, name, filter_ret=False) | |
if ('predicted_aligned_error' in self.config.heads | |
and self.config.heads.get('predicted_aligned_error.weight', 0.0)): | |
# Add PredictedAlignedErrorHead after StructureModule executes. | |
name = 'predicted_aligned_error' | |
# Feed all previous results to give access to structure_module result. | |
head_config, module = heads[name] | |
ret[name] = module(representations, batch, is_training) | |
if compute_loss: | |
total_loss += loss(module, head_config, ret, name, filter_ret=False) | |
if compute_loss: | |
return ret, total_loss | |
else: | |
return ret | |
class AlphaFold(hk.Module): | |
"""AlphaFold model with recycling. | |
Jumper et al. (2021) Suppl. Alg. 2 "Inference" | |
""" | |
def __init__(self, config, name='alphafold'): | |
super().__init__(name=name) | |
self.config = config | |
self.global_config = config.global_config | |
def __call__( | |
self, | |
batch, | |
is_training, | |
compute_loss=False, | |
ensemble_representations=False, | |
return_representations=False): | |
"""Run the AlphaFold model. | |
Arguments: | |
batch: Dictionary with inputs to the AlphaFold model. | |
is_training: Whether the system is in training or inference mode. | |
compute_loss: Whether to compute losses (requires extra features | |
to be present in the batch and knowing the true structure). | |
ensemble_representations: Whether to use ensembling of representations. | |
return_representations: Whether to also return the intermediate | |
representations. | |
Returns: | |
When compute_loss is True: | |
a tuple of loss and output of AlphaFoldIteration. | |
When compute_loss is False: | |
just output of AlphaFoldIteration. | |
The output of AlphaFoldIteration is a nested dictionary containing | |
predictions from the various heads. | |
""" | |
if "scale_rate" not in batch: | |
batch["scale_rate"] = jnp.ones((1,)) | |
impl = AlphaFoldIteration(self.config, self.global_config) | |
if jnp.issubdtype(batch['aatype'].dtype, jnp.integer): | |
batch_size, num_residues = batch['aatype'].shape | |
else: | |
batch_size, num_residues, _ = batch['aatype'].shape | |
def get_prev(ret): | |
new_prev = { | |
'prev_msa_first_row': ret['representations']['msa_first_row'], | |
'prev_pair': ret['representations']['pair'], | |
'prev_dgram': ret["distogram"]["logits"], | |
} | |
if self.config.use_struct: | |
new_prev.update({'prev_pos': ret['structure_module']['final_atom_positions'], | |
'prev_plddt': ret["predicted_lddt"]["logits"]}) | |
if "predicted_aligned_error" in ret: | |
new_prev["prev_pae"] = ret["predicted_aligned_error"]["logits"] | |
if not self.config.backprop_recycle: | |
for k in ["prev_pos","prev_msa_first_row","prev_pair"]: | |
if k in new_prev: | |
new_prev[k] = jax.lax.stop_gradient(new_prev[k]) | |
return new_prev | |
def do_call(prev, | |
recycle_idx, | |
compute_loss=compute_loss): | |
if self.config.resample_msa_in_recycling: | |
num_ensemble = batch_size // (self.config.num_recycle + 1) | |
def slice_recycle_idx(x): | |
start = recycle_idx * num_ensemble | |
size = num_ensemble | |
return jax.lax.dynamic_slice_in_dim(x, start, size, axis=0) | |
ensembled_batch = jax.tree_map(slice_recycle_idx, batch) | |
else: | |
num_ensemble = batch_size | |
ensembled_batch = batch | |
non_ensembled_batch = jax.tree_map(lambda x: x, prev) | |
return impl(ensembled_batch=ensembled_batch, | |
non_ensembled_batch=non_ensembled_batch, | |
is_training=is_training, | |
compute_loss=compute_loss, | |
ensemble_representations=ensemble_representations) | |
emb_config = self.config.embeddings_and_evoformer | |
prev = { | |
'prev_msa_first_row': jnp.zeros([num_residues, emb_config.msa_channel]), | |
'prev_pair': jnp.zeros([num_residues, num_residues, emb_config.pair_channel]), | |
'prev_dgram': jnp.zeros([num_residues, num_residues, 64]), | |
} | |
if self.config.use_struct: | |
prev.update({'prev_pos': jnp.zeros([num_residues, residue_constants.atom_type_num, 3]), | |
'prev_plddt': jnp.zeros([num_residues, 50]), | |
'prev_pae': jnp.zeros([num_residues, num_residues, 64])}) | |
for k in ["pos","msa_first_row","pair","dgram"]: | |
if f"init_{k}" in batch: prev[f"prev_{k}"] = batch[f"init_{k}"][0] | |
if self.config.num_recycle: | |
if 'num_iter_recycling' in batch: | |
# Training time: num_iter_recycling is in batch. | |
# The value for each ensemble batch is the same, so arbitrarily taking | |
# 0-th. | |
num_iter = batch['num_iter_recycling'][0] | |
# Add insurance that we will not run more | |
# recyclings than the model is configured to run. | |
num_iter = jnp.minimum(num_iter, self.config.num_recycle) | |
else: | |
# Eval mode or tests: use the maximum number of iterations. | |
num_iter = self.config.num_recycle | |
def add_prev(p,p_): | |
p_["prev_dgram"] += p["prev_dgram"] | |
if self.config.use_struct: | |
p_["prev_plddt"] += p["prev_plddt"] | |
p_["prev_pae"] += p["prev_pae"] | |
return p_ | |
############################################################## | |
def body(p, i): | |
p_ = get_prev(do_call(p, recycle_idx=i, compute_loss=False)) | |
if self.config.add_prev: | |
p_ = add_prev(p, p_) | |
return p_, None | |
if hk.running_init(): | |
prev,_ = body(prev, 0) | |
else: | |
prev,_ = hk.scan(body, prev, jnp.arange(num_iter)) | |
############################################################## | |
else: | |
num_iter = 0 | |
ret = do_call(prev=prev, recycle_idx=num_iter) | |
if self.config.add_prev: | |
prev_ = get_prev(ret) | |
if compute_loss: | |
ret = ret[0], [ret[1]] | |
if not return_representations: | |
del (ret[0] if compute_loss else ret)['representations'] # pytype: disable=unsupported-operands | |
if self.config.add_prev and num_iter > 0: | |
prev_ = add_prev(prev, prev_) | |
ret["distogram"]["logits"] = prev_["prev_dgram"]/(num_iter+1) | |
if self.config.use_struct: | |
ret["predicted_lddt"]["logits"] = prev_["prev_plddt"]/(num_iter+1) | |
if "predicted_aligned_error" in ret: | |
ret["predicted_aligned_error"]["logits"] = prev_["prev_pae"]/(num_iter+1) | |
return ret | |
class TemplatePairStack(hk.Module): | |
"""Pair stack for the templates. | |
Jumper et al. (2021) Suppl. Alg. 16 "TemplatePairStack" | |
""" | |
def __init__(self, config, global_config, name='template_pair_stack'): | |
super().__init__(name=name) | |
self.config = config | |
self.global_config = global_config | |
def __call__(self, pair_act, pair_mask, is_training, safe_key=None, scale_rate=1.0): | |
"""Builds TemplatePairStack module. | |
Arguments: | |
pair_act: Pair activations for single template, shape [N_res, N_res, c_t]. | |
pair_mask: Pair mask, shape [N_res, N_res]. | |
is_training: Whether the module is in training mode. | |
safe_key: Safe key object encapsulating the random number generation key. | |
Returns: | |
Updated pair_act, shape [N_res, N_res, c_t]. | |
""" | |
if safe_key is None: | |
safe_key = prng.SafeKey(hk.next_rng_key()) | |
gc = self.global_config | |
c = self.config | |
if not c.num_block: | |
return pair_act | |
def block(x): | |
"""One block of the template pair stack.""" | |
pair_act, safe_key = x | |
dropout_wrapper_fn = functools.partial( | |
dropout_wrapper, is_training=is_training, global_config=gc, scale_rate=scale_rate) | |
safe_key, *sub_keys = safe_key.split(6) | |
sub_keys = iter(sub_keys) | |
pair_act = dropout_wrapper_fn( | |
TriangleAttention(c.triangle_attention_starting_node, gc, | |
name='triangle_attention_starting_node'), | |
pair_act, | |
pair_mask, | |
next(sub_keys)) | |
pair_act = dropout_wrapper_fn( | |
TriangleAttention(c.triangle_attention_ending_node, gc, | |
name='triangle_attention_ending_node'), | |
pair_act, | |
pair_mask, | |
next(sub_keys)) | |
pair_act = dropout_wrapper_fn( | |
TriangleMultiplication(c.triangle_multiplication_outgoing, gc, | |
name='triangle_multiplication_outgoing'), | |
pair_act, | |
pair_mask, | |
next(sub_keys)) | |
pair_act = dropout_wrapper_fn( | |
TriangleMultiplication(c.triangle_multiplication_incoming, gc, | |
name='triangle_multiplication_incoming'), | |
pair_act, | |
pair_mask, | |
next(sub_keys)) | |
pair_act = dropout_wrapper_fn( | |
Transition(c.pair_transition, gc, name='pair_transition'), | |
pair_act, | |
pair_mask, | |
next(sub_keys)) | |
return pair_act, safe_key | |
if gc.use_remat: | |
block = hk.remat(block) | |
res_stack = layer_stack.layer_stack(c.num_block)(block) | |
pair_act, safe_key = res_stack((pair_act, safe_key)) | |
return pair_act | |
class Transition(hk.Module): | |
"""Transition layer. | |
Jumper et al. (2021) Suppl. Alg. 9 "MSATransition" | |
Jumper et al. (2021) Suppl. Alg. 15 "PairTransition" | |
""" | |
def __init__(self, config, global_config, name='transition_block'): | |
super().__init__(name=name) | |
self.config = config | |
self.global_config = global_config | |
def __call__(self, act, mask, is_training=True): | |
"""Builds Transition module. | |
Arguments: | |
act: A tensor of queries of size [batch_size, N_res, N_channel]. | |
mask: A tensor denoting the mask of size [batch_size, N_res]. | |
is_training: Whether the module is in training mode. | |
Returns: | |
A float32 tensor of size [batch_size, N_res, N_channel]. | |
""" | |
_, _, nc = act.shape | |
num_intermediate = int(nc * self.config.num_intermediate_factor) | |
mask = jnp.expand_dims(mask, axis=-1) | |
act = hk.LayerNorm( | |
axis=[-1], | |
create_scale=True, | |
create_offset=True, | |
name='input_layer_norm')( | |
act) | |
transition_module = hk.Sequential([ | |
common_modules.Linear( | |
num_intermediate, | |
initializer='relu', | |
name='transition1'), jax.nn.relu, | |
common_modules.Linear( | |
nc, | |
initializer=utils.final_init(self.global_config), | |
name='transition2') | |
]) | |
act = mapping.inference_subbatch( | |
transition_module, | |
self.global_config.subbatch_size, | |
batched_args=[act], | |
nonbatched_args=[], | |
low_memory=not is_training) | |
return act | |
def glorot_uniform(): | |
return hk.initializers.VarianceScaling(scale=1.0, | |
mode='fan_avg', | |
distribution='uniform') | |
class Attention(hk.Module): | |
"""Multihead attention.""" | |
def __init__(self, config, global_config, output_dim, name='attention'): | |
super().__init__(name=name) | |
self.config = config | |
self.global_config = global_config | |
self.output_dim = output_dim | |
def __call__(self, q_data, m_data, bias, nonbatched_bias=None): | |
"""Builds Attention module. | |
Arguments: | |
q_data: A tensor of queries, shape [batch_size, N_queries, q_channels]. | |
m_data: A tensor of memories from which the keys and values are | |
projected, shape [batch_size, N_keys, m_channels]. | |
bias: A bias for the attention, shape [batch_size, N_queries, N_keys]. | |
nonbatched_bias: Shared bias, shape [N_queries, N_keys]. | |
Returns: | |
A float32 tensor of shape [batch_size, N_queries, output_dim]. | |
""" | |
# Sensible default for when the config keys are missing | |
key_dim = self.config.get('key_dim', int(q_data.shape[-1])) | |
value_dim = self.config.get('value_dim', int(m_data.shape[-1])) | |
num_head = self.config.num_head | |
assert key_dim % num_head == 0 | |
assert value_dim % num_head == 0 | |
key_dim = key_dim // num_head | |
value_dim = value_dim // num_head | |
q_weights = hk.get_parameter( | |
'query_w', shape=(q_data.shape[-1], num_head, key_dim), | |
init=glorot_uniform()) | |
k_weights = hk.get_parameter( | |
'key_w', shape=(m_data.shape[-1], num_head, key_dim), | |
init=glorot_uniform()) | |
v_weights = hk.get_parameter( | |
'value_w', shape=(m_data.shape[-1], num_head, value_dim), | |
init=glorot_uniform()) | |
q = jnp.einsum('bqa,ahc->bqhc', q_data, q_weights) * key_dim**(-0.5) | |
k = jnp.einsum('bka,ahc->bkhc', m_data, k_weights) | |
v = jnp.einsum('bka,ahc->bkhc', m_data, v_weights) | |
logits = jnp.einsum('bqhc,bkhc->bhqk', q, k) + bias | |
if nonbatched_bias is not None: | |
logits += jnp.expand_dims(nonbatched_bias, axis=0) | |
weights = jax.nn.softmax(logits) | |
weighted_avg = jnp.einsum('bhqk,bkhc->bqhc', weights, v) | |
if self.global_config.zero_init: | |
init = hk.initializers.Constant(0.0) | |
else: | |
init = glorot_uniform() | |
if self.config.gating: | |
gating_weights = hk.get_parameter( | |
'gating_w', | |
shape=(q_data.shape[-1], num_head, value_dim), | |
init=hk.initializers.Constant(0.0)) | |
gating_bias = hk.get_parameter( | |
'gating_b', | |
shape=(num_head, value_dim), | |
init=hk.initializers.Constant(1.0)) | |
gate_values = jnp.einsum('bqc, chv->bqhv', q_data, | |
gating_weights) + gating_bias | |
gate_values = jax.nn.sigmoid(gate_values) | |
weighted_avg *= gate_values | |
o_weights = hk.get_parameter( | |
'output_w', shape=(num_head, value_dim, self.output_dim), | |
init=init) | |
o_bias = hk.get_parameter('output_b', shape=(self.output_dim,), | |
init=hk.initializers.Constant(0.0)) | |
output = jnp.einsum('bqhc,hco->bqo', weighted_avg, o_weights) + o_bias | |
return output | |
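
# --- Illustrative example (not part of the original AlphaFold module). ---
# A minimal Haiku harness around the Attention module above, run on toy
# shapes. The config values here are illustrative assumptions, not the
# AlphaFold defaults; `_example_attention` is a hypothetical helper added
# purely for illustration.
def _example_attention():
  import ml_collections  # ml-collections is already an AlphaFold dependency.
  cfg = ml_collections.ConfigDict({'num_head': 4, 'gating': True})
  gcfg = ml_collections.ConfigDict({'zero_init': True})

  def fwd(q_data, m_data, bias):
    return Attention(cfg, gcfg, output_dim=8)(q_data, m_data, bias)

  fwd_t = hk.transform(fwd)
  q = jnp.ones((2, 5, 8))         # [batch, N_queries, q_channels]
  m = jnp.ones((2, 7, 8))         # [batch, N_keys, m_channels]
  bias = jnp.zeros((2, 1, 5, 7))  # must broadcast to [batch, head, q, k]
  params = fwd_t.init(jax.random.PRNGKey(0), q, m, bias)
  return fwd_t.apply(params, None, q, m, bias)  # shape [2, 5, 8]
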
class GlobalAttention(hk.Module): | |
"""Global attention. | |
Jumper et al. (2021) Suppl. Alg. 19 "MSAColumnGlobalAttention" lines 2-7 | |
""" | |
def __init__(self, config, global_config, output_dim, name='attention'): | |
super().__init__(name=name) | |
self.config = config | |
self.global_config = global_config | |
self.output_dim = output_dim | |
def __call__(self, q_data, m_data, q_mask, bias): | |
"""Builds GlobalAttention module. | |
Arguments: | |
q_data: A tensor of queries with size [batch_size, N_queries, | |
q_channels] | |
      m_data: A tensor of memories from which the keys and values are
        projected. Size [batch_size, N_keys, m_channels]
q_mask: A binary mask for q_data with zeros in the padded sequence | |
elements and ones otherwise. Size [batch_size, N_queries, q_channels] | |
(or broadcastable to this shape). | |
bias: A bias for the attention. | |
Returns: | |
A float32 tensor of size [batch_size, N_queries, output_dim]. | |
""" | |
# Sensible default for when the config keys are missing | |
key_dim = self.config.get('key_dim', int(q_data.shape[-1])) | |
value_dim = self.config.get('value_dim', int(m_data.shape[-1])) | |
num_head = self.config.num_head | |
assert key_dim % num_head == 0 | |
assert value_dim % num_head == 0 | |
key_dim = key_dim // num_head | |
value_dim = value_dim // num_head | |
q_weights = hk.get_parameter( | |
'query_w', shape=(q_data.shape[-1], num_head, key_dim), | |
init=glorot_uniform()) | |
k_weights = hk.get_parameter( | |
'key_w', shape=(m_data.shape[-1], key_dim), | |
init=glorot_uniform()) | |
v_weights = hk.get_parameter( | |
'value_w', shape=(m_data.shape[-1], value_dim), | |
init=glorot_uniform()) | |
v = jnp.einsum('bka,ac->bkc', m_data, v_weights) | |
q_avg = utils.mask_mean(q_mask, q_data, axis=1) | |
q = jnp.einsum('ba,ahc->bhc', q_avg, q_weights) * key_dim**(-0.5) | |
k = jnp.einsum('bka,ac->bkc', m_data, k_weights) | |
bias = (1e9 * (q_mask[:, None, :, 0] - 1.)) | |
logits = jnp.einsum('bhc,bkc->bhk', q, k) + bias | |
weights = jax.nn.softmax(logits) | |
weighted_avg = jnp.einsum('bhk,bkc->bhc', weights, v) | |
if self.global_config.zero_init: | |
init = hk.initializers.Constant(0.0) | |
else: | |
init = glorot_uniform() | |
o_weights = hk.get_parameter( | |
'output_w', shape=(num_head, value_dim, self.output_dim), | |
init=init) | |
o_bias = hk.get_parameter('output_b', shape=(self.output_dim,), | |
init=hk.initializers.Constant(0.0)) | |
if self.config.gating: | |
gating_weights = hk.get_parameter( | |
'gating_w', | |
shape=(q_data.shape[-1], num_head, value_dim), | |
init=hk.initializers.Constant(0.0)) | |
gating_bias = hk.get_parameter( | |
'gating_b', | |
shape=(num_head, value_dim), | |
init=hk.initializers.Constant(1.0)) | |
gate_values = jnp.einsum('bqc, chv->bqhv', q_data, gating_weights) | |
gate_values = jax.nn.sigmoid(gate_values + gating_bias) | |
weighted_avg = weighted_avg[:, None] * gate_values | |
output = jnp.einsum('bqhc,hco->bqo', weighted_avg, o_weights) + o_bias | |
else: | |
output = jnp.einsum('bhc,hco->bo', weighted_avg, o_weights) + o_bias | |
output = output[:, None] | |
return output | |
class MSARowAttentionWithPairBias(hk.Module): | |
"""MSA per-row attention biased by the pair representation. | |
Jumper et al. (2021) Suppl. Alg. 7 "MSARowAttentionWithPairBias" | |
""" | |
def __init__(self, config, global_config, | |
name='msa_row_attention_with_pair_bias'): | |
super().__init__(name=name) | |
self.config = config | |
self.global_config = global_config | |
def __call__(self, | |
msa_act, | |
msa_mask, | |
pair_act, | |
is_training=False): | |
"""Builds MSARowAttentionWithPairBias module. | |
Arguments: | |
msa_act: [N_seq, N_res, c_m] MSA representation. | |
msa_mask: [N_seq, N_res] mask of non-padded regions. | |
pair_act: [N_res, N_res, c_z] pair representation. | |
is_training: Whether the module is in training mode. | |
Returns: | |
Update to msa_act, shape [N_seq, N_res, c_m]. | |
""" | |
c = self.config | |
assert len(msa_act.shape) == 3 | |
assert len(msa_mask.shape) == 2 | |
assert c.orientation == 'per_row' | |
bias = (1e9 * (msa_mask - 1.))[:, None, None, :] | |
assert len(bias.shape) == 4 | |
msa_act = hk.LayerNorm( | |
axis=[-1], create_scale=True, create_offset=True, name='query_norm')( | |
msa_act) | |
pair_act = hk.LayerNorm( | |
axis=[-1], | |
create_scale=True, | |
create_offset=True, | |
name='feat_2d_norm')( | |
pair_act) | |
init_factor = 1. / jnp.sqrt(int(pair_act.shape[-1])) | |
weights = hk.get_parameter( | |
'feat_2d_weights', | |
shape=(pair_act.shape[-1], c.num_head), | |
init=hk.initializers.RandomNormal(stddev=init_factor)) | |
nonbatched_bias = jnp.einsum('qkc,ch->hqk', pair_act, weights) | |
attn_mod = Attention( | |
c, self.global_config, msa_act.shape[-1]) | |
msa_act = mapping.inference_subbatch( | |
attn_mod, | |
self.global_config.subbatch_size, | |
batched_args=[msa_act, msa_act, bias], | |
nonbatched_args=[nonbatched_bias], | |
low_memory=not is_training) | |
return msa_act | |
class MSAColumnAttention(hk.Module): | |
"""MSA per-column attention. | |
Jumper et al. (2021) Suppl. Alg. 8 "MSAColumnAttention" | |
""" | |
def __init__(self, config, global_config, name='msa_column_attention'): | |
super().__init__(name=name) | |
self.config = config | |
self.global_config = global_config | |
def __call__(self, | |
msa_act, | |
msa_mask, | |
is_training=False): | |
"""Builds MSAColumnAttention module. | |
Arguments: | |
msa_act: [N_seq, N_res, c_m] MSA representation. | |
msa_mask: [N_seq, N_res] mask of non-padded regions. | |
is_training: Whether the module is in training mode. | |
Returns: | |
Update to msa_act, shape [N_seq, N_res, c_m] | |
""" | |
c = self.config | |
assert len(msa_act.shape) == 3 | |
assert len(msa_mask.shape) == 2 | |
assert c.orientation == 'per_column' | |
msa_act = jnp.swapaxes(msa_act, -2, -3) | |
msa_mask = jnp.swapaxes(msa_mask, -1, -2) | |
bias = (1e9 * (msa_mask - 1.))[:, None, None, :] | |
assert len(bias.shape) == 4 | |
msa_act = hk.LayerNorm( | |
axis=[-1], create_scale=True, create_offset=True, name='query_norm')( | |
msa_act) | |
attn_mod = Attention( | |
c, self.global_config, msa_act.shape[-1]) | |
msa_act = mapping.inference_subbatch( | |
attn_mod, | |
self.global_config.subbatch_size, | |
batched_args=[msa_act, msa_act, bias], | |
nonbatched_args=[], | |
low_memory=not is_training) | |
msa_act = jnp.swapaxes(msa_act, -2, -3) | |
return msa_act | |
class MSAColumnGlobalAttention(hk.Module): | |
"""MSA per-column global attention. | |
Jumper et al. (2021) Suppl. Alg. 19 "MSAColumnGlobalAttention" | |
""" | |
def __init__(self, config, global_config, name='msa_column_global_attention'): | |
super().__init__(name=name) | |
self.config = config | |
self.global_config = global_config | |
def __call__(self, | |
msa_act, | |
msa_mask, | |
is_training=False): | |
"""Builds MSAColumnGlobalAttention module. | |
Arguments: | |
msa_act: [N_seq, N_res, c_m] MSA representation. | |
msa_mask: [N_seq, N_res] mask of non-padded regions. | |
is_training: Whether the module is in training mode. | |
Returns: | |
Update to msa_act, shape [N_seq, N_res, c_m]. | |
""" | |
c = self.config | |
assert len(msa_act.shape) == 3 | |
assert len(msa_mask.shape) == 2 | |
assert c.orientation == 'per_column' | |
msa_act = jnp.swapaxes(msa_act, -2, -3) | |
msa_mask = jnp.swapaxes(msa_mask, -1, -2) | |
bias = (1e9 * (msa_mask - 1.))[:, None, None, :] | |
assert len(bias.shape) == 4 | |
msa_act = hk.LayerNorm( | |
axis=[-1], create_scale=True, create_offset=True, name='query_norm')( | |
msa_act) | |
attn_mod = GlobalAttention( | |
c, self.global_config, msa_act.shape[-1], | |
name='attention') | |
# [N_seq, N_res, 1] | |
msa_mask = jnp.expand_dims(msa_mask, axis=-1) | |
msa_act = mapping.inference_subbatch( | |
attn_mod, | |
self.global_config.subbatch_size, | |
batched_args=[msa_act, msa_act, msa_mask, bias], | |
nonbatched_args=[], | |
low_memory=not is_training) | |
msa_act = jnp.swapaxes(msa_act, -2, -3) | |
return msa_act | |
class TriangleAttention(hk.Module): | |
"""Triangle Attention. | |
Jumper et al. (2021) Suppl. Alg. 13 "TriangleAttentionStartingNode" | |
Jumper et al. (2021) Suppl. Alg. 14 "TriangleAttentionEndingNode" | |
""" | |
def __init__(self, config, global_config, name='triangle_attention'): | |
super().__init__(name=name) | |
self.config = config | |
self.global_config = global_config | |
def __call__(self, pair_act, pair_mask, is_training=False): | |
"""Builds TriangleAttention module. | |
Arguments: | |
pair_act: [N_res, N_res, c_z] pair activations tensor | |
pair_mask: [N_res, N_res] mask of non-padded regions in the tensor. | |
is_training: Whether the module is in training mode. | |
Returns: | |
Update to pair_act, shape [N_res, N_res, c_z]. | |
""" | |
c = self.config | |
assert len(pair_act.shape) == 3 | |
assert len(pair_mask.shape) == 2 | |
assert c.orientation in ['per_row', 'per_column'] | |
if c.orientation == 'per_column': | |
pair_act = jnp.swapaxes(pair_act, -2, -3) | |
pair_mask = jnp.swapaxes(pair_mask, -1, -2) | |
bias = (1e9 * (pair_mask - 1.))[:, None, None, :] | |
assert len(bias.shape) == 4 | |
pair_act = hk.LayerNorm( | |
axis=[-1], create_scale=True, create_offset=True, name='query_norm')( | |
pair_act) | |
init_factor = 1. / jnp.sqrt(int(pair_act.shape[-1])) | |
weights = hk.get_parameter( | |
'feat_2d_weights', | |
shape=(pair_act.shape[-1], c.num_head), | |
init=hk.initializers.RandomNormal(stddev=init_factor)) | |
nonbatched_bias = jnp.einsum('qkc,ch->hqk', pair_act, weights) | |
attn_mod = Attention( | |
c, self.global_config, pair_act.shape[-1]) | |
pair_act = mapping.inference_subbatch( | |
attn_mod, | |
self.global_config.subbatch_size, | |
batched_args=[pair_act, pair_act, bias], | |
nonbatched_args=[nonbatched_bias], | |
low_memory=not is_training) | |
if c.orientation == 'per_column': | |
pair_act = jnp.swapaxes(pair_act, -2, -3) | |
return pair_act | |
class MaskedMsaHead(hk.Module): | |
"""Head to predict MSA at the masked locations. | |
The MaskedMsaHead employs a BERT-style objective to reconstruct a masked | |
version of the full MSA, based on a linear projection of | |
the MSA representation. | |
Jumper et al. (2021) Suppl. Sec. 1.9.9 "Masked MSA prediction" | |
""" | |
def __init__(self, config, global_config, name='masked_msa_head'): | |
super().__init__(name=name) | |
self.config = config | |
self.global_config = global_config | |
def __call__(self, representations, batch, is_training): | |
"""Builds MaskedMsaHead module. | |
Arguments: | |
representations: Dictionary of representations, must contain: | |
* 'msa': MSA representation, shape [N_seq, N_res, c_m]. | |
batch: Batch, unused. | |
is_training: Whether the module is in training mode. | |
Returns: | |
Dictionary containing: | |
        * 'logits': logits of shape [N_seq, N_res, N_aatype] with
            (unnormalized) log probabilities of the predicted aatype at each
            position.
""" | |
del batch | |
logits = common_modules.Linear( | |
self.config.num_output, | |
initializer=utils.final_init(self.global_config), | |
name='logits')( | |
representations['msa']) | |
return dict(logits=logits) | |
def loss(self, value, batch): | |
errors = softmax_cross_entropy( | |
labels=jax.nn.one_hot(batch['true_msa'], num_classes=23), | |
logits=value['logits']) | |
loss = (jnp.sum(errors * batch['bert_mask'], axis=(-2, -1)) / | |
(1e-8 + jnp.sum(batch['bert_mask'], axis=(-2, -1)))) | |
return {'loss': loss} | |
class PredictedLDDTHead(hk.Module): | |
"""Head to predict the per-residue LDDT to be used as a confidence measure. | |
Jumper et al. (2021) Suppl. Sec. 1.9.6 "Model confidence prediction (pLDDT)" | |
Jumper et al. (2021) Suppl. Alg. 29 "predictPerResidueLDDT_Ca" | |
""" | |
def __init__(self, config, global_config, name='predicted_lddt_head'): | |
super().__init__(name=name) | |
self.config = config | |
self.global_config = global_config | |
def __call__(self, representations, batch, is_training): | |
"""Builds ExperimentallyResolvedHead module. | |
Arguments: | |
representations: Dictionary of representations, must contain: | |
* 'structure_module': Single representation from the structure module, | |
shape [N_res, c_s]. | |
batch: Batch, unused. | |
is_training: Whether the module is in training mode. | |
Returns: | |
      Dictionary containing:
        * 'logits': logits of shape [N_res, N_bins] with
            (unnormalized) log probabilities of binned predicted lDDT.
""" | |
act = representations['structure_module'] | |
act = hk.LayerNorm( | |
axis=[-1], | |
create_scale=True, | |
create_offset=True, | |
name='input_layer_norm')( | |
act) | |
act = common_modules.Linear( | |
self.config.num_channels, | |
initializer='relu', | |
name='act_0')( | |
act) | |
act = jax.nn.relu(act) | |
act = common_modules.Linear( | |
self.config.num_channels, | |
initializer='relu', | |
name='act_1')( | |
act) | |
act = jax.nn.relu(act) | |
logits = common_modules.Linear( | |
self.config.num_bins, | |
initializer=utils.final_init(self.global_config), | |
name='logits')( | |
act) | |
# Shape (batch_size, num_res, num_bins) | |
return dict(logits=logits) | |
def loss(self, value, batch): | |
# Shape (num_res, 37, 3) | |
pred_all_atom_pos = value['structure_module']['final_atom_positions'] | |
# Shape (num_res, 37, 3) | |
true_all_atom_pos = batch['all_atom_positions'] | |
# Shape (num_res, 37) | |
all_atom_mask = batch['all_atom_mask'] | |
# Shape (num_res,) | |
lddt_ca = lddt.lddt( | |
# Shape (batch_size, num_res, 3) | |
predicted_points=pred_all_atom_pos[None, :, 1, :], | |
# Shape (batch_size, num_res, 3) | |
true_points=true_all_atom_pos[None, :, 1, :], | |
# Shape (batch_size, num_res, 1) | |
true_points_mask=all_atom_mask[None, :, 1:2].astype(jnp.float32), | |
cutoff=15., | |
per_residue=True)[0] | |
lddt_ca = jax.lax.stop_gradient(lddt_ca) | |
num_bins = self.config.num_bins | |
bin_index = jnp.floor(lddt_ca * num_bins).astype(jnp.int32) | |
# protect against out of range for lddt_ca == 1 | |
bin_index = jnp.minimum(bin_index, num_bins - 1) | |
lddt_ca_one_hot = jax.nn.one_hot(bin_index, num_classes=num_bins) | |
# Shape (num_res, num_channel) | |
logits = value['predicted_lddt']['logits'] | |
errors = softmax_cross_entropy(labels=lddt_ca_one_hot, logits=logits) | |
# Shape (num_res,) | |
mask_ca = all_atom_mask[:, residue_constants.atom_order['CA']] | |
mask_ca = mask_ca.astype(jnp.float32) | |
loss = jnp.sum(errors * mask_ca) / (jnp.sum(mask_ca) + 1e-8) | |
if self.config.filter_by_resolution: | |
# NMR & distillation have resolution = 0 | |
loss *= ((batch['resolution'] >= self.config.min_resolution) | |
& (batch['resolution'] <= self.config.max_resolution)).astype( | |
jnp.float32) | |
output = {'loss': loss} | |
return output | |
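
# --- Illustrative example (not part of the original AlphaFold module). ---
# A minimal sketch of the lDDT binning used in PredictedLDDTHead.loss above:
# without the clamp, a perfect lDDT of 1.0 would index one past the last bin.
# `_example_plddt_binning` is a hypothetical helper added purely for
# illustration.
def _example_plddt_binning():
  lddt_ca = jnp.array([0.0, 0.49, 1.0])
  num_bins = 50
  bin_index = jnp.minimum(jnp.floor(lddt_ca * num_bins).astype(jnp.int32),
                          num_bins - 1)
  return jax.nn.one_hot(bin_index, num_classes=num_bins)  # bins 0, 24, 49
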
class PredictedAlignedErrorHead(hk.Module): | |
"""Head to predict the distance errors in the backbone alignment frames. | |
Can be used to compute predicted TM-Score. | |
Jumper et al. (2021) Suppl. Sec. 1.9.7 "TM-score prediction" | |
""" | |
def __init__(self, config, global_config, | |
name='predicted_aligned_error_head'): | |
super().__init__(name=name) | |
self.config = config | |
self.global_config = global_config | |
def __call__(self, representations, batch, is_training): | |
"""Builds PredictedAlignedErrorHead module. | |
Arguments: | |
representations: Dictionary of representations, must contain: | |
* 'pair': pair representation, shape [N_res, N_res, c_z]. | |
batch: Batch, unused. | |
is_training: Whether the module is in training mode. | |
Returns: | |
Dictionary containing: | |
* logits: logits for aligned error, shape [N_res, N_res, N_bins]. | |
        * breaks: array containing bin breaks, shape [N_bins - 1].
""" | |
act = representations['pair'] | |
# Shape (num_res, num_res, num_bins) | |
logits = common_modules.Linear( | |
self.config.num_bins, | |
initializer=utils.final_init(self.global_config), | |
name='logits')(act) | |
# Shape (num_bins,) | |
breaks = jnp.linspace( | |
0., self.config.max_error_bin, self.config.num_bins - 1) | |
return dict(logits=logits, breaks=breaks) | |
def loss(self, value, batch): | |
# Shape (num_res, 7) | |
predicted_affine = quat_affine.QuatAffine.from_tensor( | |
value['structure_module']['final_affines']) | |
# Shape (num_res, 7) | |
true_affine = quat_affine.QuatAffine.from_tensor( | |
batch['backbone_affine_tensor']) | |
# Shape (num_res) | |
mask = batch['backbone_affine_mask'] | |
# Shape (num_res, num_res) | |
square_mask = mask[:, None] * mask[None, :] | |
num_bins = self.config.num_bins | |
# (1, num_bins - 1) | |
breaks = value['predicted_aligned_error']['breaks'] | |
# (1, num_bins) | |
logits = value['predicted_aligned_error']['logits'] | |
# Compute the squared error for each alignment. | |
def _local_frame_points(affine): | |
points = [jnp.expand_dims(x, axis=-2) for x in affine.translation] | |
return affine.invert_point(points, extra_dims=1) | |
error_dist2_xyz = [ | |
jnp.square(a - b) | |
for a, b in zip(_local_frame_points(predicted_affine), | |
_local_frame_points(true_affine))] | |
error_dist2 = sum(error_dist2_xyz) | |
# Shape (num_res, num_res) | |
# First num_res are alignment frames, second num_res are the residues. | |
error_dist2 = jax.lax.stop_gradient(error_dist2) | |
sq_breaks = jnp.square(breaks) | |
true_bins = jnp.sum(( | |
error_dist2[..., None] > sq_breaks).astype(jnp.int32), axis=-1) | |
errors = softmax_cross_entropy( | |
labels=jax.nn.one_hot(true_bins, num_bins, axis=-1), logits=logits) | |
loss = (jnp.sum(errors * square_mask, axis=(-2, -1)) / | |
(1e-8 + jnp.sum(square_mask, axis=(-2, -1)))) | |
if self.config.filter_by_resolution: | |
# NMR & distillation have resolution = 0 | |
loss *= ((batch['resolution'] >= self.config.min_resolution) | |
& (batch['resolution'] <= self.config.max_resolution)).astype( | |
jnp.float32) | |
output = {'loss': loss} | |
return output | |
class ExperimentallyResolvedHead(hk.Module): | |
"""Predicts if an atom is experimentally resolved in a high-res structure. | |
Only trained on high-resolution X-ray crystals & cryo-EM. | |
Jumper et al. (2021) Suppl. Sec. 1.9.10 '"Experimentally resolved" prediction' | |
""" | |
def __init__(self, config, global_config, | |
name='experimentally_resolved_head'): | |
super().__init__(name=name) | |
self.config = config | |
self.global_config = global_config | |
def __call__(self, representations, batch, is_training): | |
"""Builds ExperimentallyResolvedHead module. | |
Arguments: | |
representations: Dictionary of representations, must contain: | |
* 'single': Single representation, shape [N_res, c_s]. | |
batch: Batch, unused. | |
is_training: Whether the module is in training mode. | |
Returns: | |
Dictionary containing: | |
* 'logits': logits of shape [N_res, 37], | |
log probability that an atom is resolved in atom37 representation, | |
can be converted to probability by applying sigmoid. | |
""" | |
logits = common_modules.Linear( | |
37, # atom_exists.shape[-1] | |
initializer=utils.final_init(self.global_config), | |
name='logits')(representations['single']) | |
return dict(logits=logits) | |
def loss(self, value, batch): | |
logits = value['logits'] | |
assert len(logits.shape) == 2 | |
# Does the atom appear in the amino acid? | |
atom_exists = batch['atom37_atom_exists'] | |
# Is the atom resolved in the experiment? Subset of atom_exists, | |
# *except for OXT* | |
all_atom_mask = batch['all_atom_mask'].astype(jnp.float32) | |
xent = sigmoid_cross_entropy(labels=all_atom_mask, logits=logits) | |
loss = jnp.sum(xent * atom_exists) / (1e-8 + jnp.sum(atom_exists)) | |
if self.config.filter_by_resolution: | |
# NMR & distillation examples have resolution = 0. | |
loss *= ((batch['resolution'] >= self.config.min_resolution) | |
& (batch['resolution'] <= self.config.max_resolution)).astype( | |
jnp.float32) | |
output = {'loss': loss} | |
return output | |
class TriangleMultiplication(hk.Module): | |
"""Triangle multiplication layer ("outgoing" or "incoming"). | |
Jumper et al. (2021) Suppl. Alg. 11 "TriangleMultiplicationOutgoing" | |
Jumper et al. (2021) Suppl. Alg. 12 "TriangleMultiplicationIncoming" | |
""" | |
def __init__(self, config, global_config, name='triangle_multiplication'): | |
super().__init__(name=name) | |
self.config = config | |
self.global_config = global_config | |
def __call__(self, act, mask, is_training=True): | |
"""Builds TriangleMultiplication module. | |
Arguments: | |
act: Pair activations, shape [N_res, N_res, c_z] | |
mask: Pair mask, shape [N_res, N_res]. | |
is_training: Whether the module is in training mode. | |
Returns: | |
Outputs, same shape/type as act. | |
""" | |
del is_training | |
c = self.config | |
gc = self.global_config | |
mask = mask[..., None] | |
act = hk.LayerNorm(axis=[-1], create_scale=True, create_offset=True, | |
name='layer_norm_input')(act) | |
input_act = act | |
left_projection = common_modules.Linear( | |
c.num_intermediate_channel, | |
name='left_projection') | |
left_proj_act = mask * left_projection(act) | |
right_projection = common_modules.Linear( | |
c.num_intermediate_channel, | |
name='right_projection') | |
right_proj_act = mask * right_projection(act) | |
left_gate_values = jax.nn.sigmoid(common_modules.Linear( | |
c.num_intermediate_channel, | |
bias_init=1., | |
initializer=utils.final_init(gc), | |
name='left_gate')(act)) | |
right_gate_values = jax.nn.sigmoid(common_modules.Linear( | |
c.num_intermediate_channel, | |
bias_init=1., | |
initializer=utils.final_init(gc), | |
name='right_gate')(act)) | |
left_proj_act *= left_gate_values | |
right_proj_act *= right_gate_values | |
# "Outgoing" edges equation: 'ikc,jkc->ijc' | |
# "Incoming" edges equation: 'kjc,kic->ijc' | |
# Note on the Suppl. Alg. 11 & 12 notation: | |
# For the "outgoing" edges, a = left_proj_act and b = right_proj_act | |
# For the "incoming" edges, it's swapped: | |
# b = left_proj_act and a = right_proj_act | |
act = jnp.einsum(c.equation, left_proj_act, right_proj_act) | |
act = hk.LayerNorm( | |
axis=[-1], | |
create_scale=True, | |
create_offset=True, | |
name='center_layer_norm')( | |
act) | |
output_channel = int(input_act.shape[-1]) | |
act = common_modules.Linear( | |
output_channel, | |
initializer=utils.final_init(gc), | |
name='output_projection')(act) | |
gate_values = jax.nn.sigmoid(common_modules.Linear( | |
output_channel, | |
bias_init=1., | |
initializer=utils.final_init(gc), | |
name='gating_linear')(input_act)) | |
act *= gate_values | |
return act | |
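
# --- Illustrative example (not part of the original AlphaFold module). ---
# A minimal sketch of the two einsum equations referenced in
# TriangleMultiplication above ("outgoing" vs "incoming" edges); the toy
# tensors stand in for the gated left/right projections.
# `_example_triangle_equations` is a hypothetical helper added purely for
# illustration.
def _example_triangle_equations():
  left_proj_act = jnp.ones((4, 4, 3))
  right_proj_act = jnp.ones((4, 4, 3))
  outgoing = jnp.einsum('ikc,jkc->ijc', left_proj_act, right_proj_act)
  incoming = jnp.einsum('kjc,kic->ijc', left_proj_act, right_proj_act)
  return outgoing.shape, incoming.shape  # both (4, 4, 3)
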
class DistogramHead(hk.Module): | |
"""Head to predict a distogram. | |
Jumper et al. (2021) Suppl. Sec. 1.9.8 "Distogram prediction" | |
""" | |
def __init__(self, config, global_config, name='distogram_head'): | |
super().__init__(name=name) | |
self.config = config | |
self.global_config = global_config | |
def __call__(self, representations, batch, is_training): | |
"""Builds DistogramHead module. | |
Arguments: | |
representations: Dictionary of representations, must contain: | |
* 'pair': pair representation, shape [N_res, N_res, c_z]. | |
batch: Batch, unused. | |
is_training: Whether the module is in training mode. | |
Returns: | |
Dictionary containing: | |
* logits: logits for distogram, shape [N_res, N_res, N_bins]. | |
        * bin_edges: array containing the bin edges, shape [N_bins - 1,].
""" | |
half_logits = common_modules.Linear( | |
self.config.num_bins, | |
initializer=utils.final_init(self.global_config), | |
name='half_logits')( | |
representations['pair']) | |
logits = half_logits + jnp.swapaxes(half_logits, -2, -3) | |
breaks = jnp.linspace(self.config.first_break, self.config.last_break, | |
self.config.num_bins - 1) | |
return dict(logits=logits, bin_edges=breaks) | |
def loss(self, value, batch): | |
return _distogram_log_loss(value['logits'], value['bin_edges'], | |
batch, self.config.num_bins) | |
def _distogram_log_loss(logits, bin_edges, batch, num_bins): | |
"""Log loss of a distogram.""" | |
assert len(logits.shape) == 3 | |
positions = batch['pseudo_beta'] | |
mask = batch['pseudo_beta_mask'] | |
assert positions.shape[-1] == 3 | |
sq_breaks = jnp.square(bin_edges) | |
dist2 = jnp.sum( | |
jnp.square( | |
jnp.expand_dims(positions, axis=-2) - | |
jnp.expand_dims(positions, axis=-3)), | |
axis=-1, | |
keepdims=True) | |
true_bins = jnp.sum(dist2 > sq_breaks, axis=-1) | |
errors = softmax_cross_entropy( | |
labels=jax.nn.one_hot(true_bins, num_bins), logits=logits) | |
square_mask = jnp.expand_dims(mask, axis=-2) * jnp.expand_dims(mask, axis=-1) | |
avg_error = ( | |
jnp.sum(errors * square_mask, axis=(-2, -1)) / | |
(1e-6 + jnp.sum(square_mask, axis=(-2, -1)))) | |
dist2 = dist2[..., 0] | |
return dict(loss=avg_error, true_dist=jnp.sqrt(1e-6 + dist2)) | |
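
# --- Illustrative example (not part of the original AlphaFold module). ---
# A minimal sketch of the distance binning inside _distogram_log_loss: a pair
# is assigned to bin i when its squared distance exceeds the first i squared
# bin edges. The edge values follow the published distogram settings (an
# assumption here); `_example_distance_binning` is a hypothetical helper added
# purely for illustration.
def _example_distance_binning():
  bin_edges = jnp.linspace(2.3125, 21.6875, 63)
  positions = jnp.array([[0., 0., 0.], [4., 0., 0.]])
  dist2 = jnp.sum(
      jnp.square(positions[:, None] - positions[None, :]),
      axis=-1, keepdims=True)
  true_bins = jnp.sum(dist2 > jnp.square(bin_edges), axis=-1)
  return true_bins  # shape (2, 2); the off-diagonal pairs land in bin 6
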
class OuterProductMean(hk.Module): | |
"""Computes mean outer product. | |
Jumper et al. (2021) Suppl. Alg. 10 "OuterProductMean" | |
""" | |
def __init__(self, | |
config, | |
global_config, | |
num_output_channel, | |
name='outer_product_mean'): | |
super().__init__(name=name) | |
self.global_config = global_config | |
self.config = config | |
self.num_output_channel = num_output_channel | |
def __call__(self, act, mask, is_training=True): | |
"""Builds OuterProductMean module. | |
Arguments: | |
act: MSA representation, shape [N_seq, N_res, c_m]. | |
mask: MSA mask, shape [N_seq, N_res]. | |
is_training: Whether the module is in training mode. | |
Returns: | |
Update to pair representation, shape [N_res, N_res, c_z]. | |
""" | |
gc = self.global_config | |
c = self.config | |
mask = mask[..., None] | |
act = hk.LayerNorm([-1], True, True, name='layer_norm_input')(act) | |
left_act = mask * common_modules.Linear( | |
c.num_outer_channel, | |
initializer='linear', | |
name='left_projection')( | |
act) | |
right_act = mask * common_modules.Linear( | |
c.num_outer_channel, | |
initializer='linear', | |
name='right_projection')( | |
act) | |
if gc.zero_init: | |
init_w = hk.initializers.Constant(0.0) | |
else: | |
init_w = hk.initializers.VarianceScaling(scale=2., mode='fan_in') | |
output_w = hk.get_parameter( | |
'output_w', | |
shape=(c.num_outer_channel, c.num_outer_channel, | |
self.num_output_channel), | |
init=init_w) | |
output_b = hk.get_parameter( | |
'output_b', shape=(self.num_output_channel,), | |
init=hk.initializers.Constant(0.0)) | |
def compute_chunk(left_act): | |
# This is equivalent to | |
# | |
# act = jnp.einsum('abc,ade->dceb', left_act, right_act) | |
# act = jnp.einsum('dceb,cef->bdf', act, output_w) + output_b | |
# | |
# but faster. | |
left_act = jnp.transpose(left_act, [0, 2, 1]) | |
act = jnp.einsum('acb,ade->dceb', left_act, right_act) | |
act = jnp.einsum('dceb,cef->dbf', act, output_w) + output_b | |
return jnp.transpose(act, [1, 0, 2]) | |
act = mapping.inference_subbatch( | |
compute_chunk, | |
c.chunk_size, | |
batched_args=[left_act], | |
nonbatched_args=[], | |
low_memory=True, | |
input_subbatch_dim=1, | |
output_subbatch_dim=0) | |
epsilon = 1e-3 | |
norm = jnp.einsum('abc,adc->bdc', mask, mask) | |
act /= epsilon + norm | |
return act | |
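
# --- Illustrative example (not part of the original AlphaFold module). ---
# A minimal check of the equivalence claimed in the compute_chunk comment
# above (bias term omitted for brevity).
# `_example_outer_product_equivalence` is a hypothetical helper added purely
# for illustration.
def _example_outer_product_equivalence():
  left_act = jnp.arange(2 * 3 * 4.).reshape(2, 3, 4)
  right_act = jnp.arange(2 * 3 * 5.).reshape(2, 3, 5)
  output_w = jnp.ones((4, 5, 6))
  ref = jnp.einsum('abc,ade->dceb', left_act, right_act)
  ref = jnp.einsum('dceb,cef->bdf', ref, output_w)
  fast = jnp.einsum('acb,ade->dceb',
                    jnp.transpose(left_act, [0, 2, 1]), right_act)
  fast = jnp.transpose(jnp.einsum('dceb,cef->dbf', fast, output_w), [1, 0, 2])
  return jnp.allclose(ref, fast)  # True
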
def dgram_from_positions(positions, num_bins, min_bin, max_bin): | |
"""Compute distogram from amino acid positions. | |
Arguments: | |
positions: [N_res, 3] Position coordinates. | |
num_bins: The number of bins in the distogram. | |
min_bin: The left edge of the first bin. | |
max_bin: The left edge of the final bin. The final bin catches | |
everything larger than `max_bin`. | |
Returns: | |
Distogram with the specified number of bins. | |
""" | |
def squared_difference(x, y): | |
return jnp.square(x - y) | |
lower_breaks = jnp.linspace(min_bin, max_bin, num_bins) | |
lower_breaks = jnp.square(lower_breaks) | |
upper_breaks = jnp.concatenate([lower_breaks[1:],jnp.array([1e8], dtype=jnp.float32)], axis=-1) | |
dist2 = jnp.sum( | |
squared_difference( | |
jnp.expand_dims(positions, axis=-2), | |
jnp.expand_dims(positions, axis=-3)), | |
axis=-1, keepdims=True) | |
return ((dist2 > lower_breaks).astype(jnp.float32) * (dist2 < upper_breaks).astype(jnp.float32)) | |
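
# --- Illustrative example (not part of the original AlphaFold module). ---
# A one-hot distogram of a two-residue toy chain. The bin settings below match
# the recycling embedder's `prev_pos` defaults (an assumption here). Note that
# a self-distance of 0 falls below `min_bin`, so the diagonal rows are all
# zero. `_example_dgram` is a hypothetical helper added purely for
# illustration.
def _example_dgram():
  positions = jnp.array([[0., 0., 0.], [5., 0., 0.]])
  dgram = dgram_from_positions(positions, num_bins=15, min_bin=3.25,
                               max_bin=20.75)
  return dgram.shape, dgram.sum(-1)  # (2, 2, 15); sums are [[0, 1], [1, 0]]
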
def dgram_from_positions_soft(positions, num_bins, min_bin, max_bin, temp=2.0): | |
  """Soft (differentiable) positions-to-distogram converter."""
lower_breaks = jnp.append(-1e8,jnp.linspace(min_bin, max_bin, num_bins)) | |
upper_breaks = jnp.append(lower_breaks[1:],1e8) | |
dist = jnp.sqrt(jnp.square(positions[...,:,None,:] - positions[...,None,:,:]).sum(-1,keepdims=True) + 1e-8) | |
o = jax.nn.sigmoid((dist - lower_breaks)/temp) * jax.nn.sigmoid((upper_breaks - dist)/temp) | |
o = o/(o.sum(-1,keepdims=True) + 1e-8) | |
return o[...,1:] | |
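
# --- Illustrative example (not part of the original AlphaFold module). ---
# The soft variant above is differentiable w.r.t. the positions, so gradients
# can flow through a recycled distogram (as used when `backprop_dgram` is
# enabled). Bin settings are the same illustrative values as above;
# `_example_soft_dgram_grad` is a hypothetical helper added purely for
# illustration.
def _example_soft_dgram_grad():
  def soft_dgram_mean(positions):
    return dgram_from_positions_soft(positions, num_bins=15, min_bin=3.25,
                                     max_bin=20.75).mean()
  positions = jnp.array([[0., 0., 0.], [5., 0., 0.]])
  return jax.grad(soft_dgram_mean)(positions)  # finite, generally non-zero
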
def pseudo_beta_fn(aatype, all_atom_positions, all_atom_masks): | |
"""Create pseudo beta features.""" | |
ca_idx = residue_constants.atom_order['CA'] | |
cb_idx = residue_constants.atom_order['CB'] | |
if jnp.issubdtype(aatype.dtype, jnp.integer): | |
is_gly = jnp.equal(aatype, residue_constants.restype_order['G']) | |
is_gly_tile = jnp.tile(is_gly[..., None], [1] * len(is_gly.shape) + [3]) | |
pseudo_beta = jnp.where(is_gly_tile, all_atom_positions[..., ca_idx, :], all_atom_positions[..., cb_idx, :]) | |
if all_atom_masks is not None: | |
pseudo_beta_mask = jnp.where(is_gly, all_atom_masks[..., ca_idx], all_atom_masks[..., cb_idx]) | |
pseudo_beta_mask = pseudo_beta_mask.astype(jnp.float32) | |
return pseudo_beta, pseudo_beta_mask | |
else: | |
return pseudo_beta | |
else: | |
is_gly = aatype[...,residue_constants.restype_order['G']] | |
ca_pos = all_atom_positions[...,ca_idx,:] | |
cb_pos = all_atom_positions[...,cb_idx,:] | |
pseudo_beta = is_gly[...,None] * ca_pos + (1-is_gly[...,None]) * cb_pos | |
if all_atom_masks is not None: | |
ca_mask = all_atom_masks[...,ca_idx] | |
cb_mask = all_atom_masks[...,cb_idx] | |
pseudo_beta_mask = is_gly * ca_mask + (1-is_gly) * cb_mask | |
return pseudo_beta, pseudo_beta_mask | |
else: | |
return pseudo_beta | |
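
# --- Illustrative example (not part of the original AlphaFold module). ---
# For glycine (which has no CB atom) the pseudo-beta position falls back to
# CA; for every other residue type CB is used. `_example_pseudo_beta` is a
# hypothetical helper added purely for illustration.
def _example_pseudo_beta():
  n_res = 2
  aatype = jnp.array([residue_constants.restype_order['G'],
                      residue_constants.restype_order['A']])
  positions = jnp.zeros((n_res, residue_constants.atom_type_num, 3))
  masks = jnp.ones((n_res, residue_constants.atom_type_num))
  pseudo_beta, pseudo_beta_mask = pseudo_beta_fn(aatype, positions, masks)
  return pseudo_beta.shape, pseudo_beta_mask.shape  # (2, 3), (2,)
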
class EvoformerIteration(hk.Module): | |
"""Single iteration (block) of Evoformer stack. | |
Jumper et al. (2021) Suppl. Alg. 6 "EvoformerStack" lines 2-10 | |
""" | |
def __init__(self, config, global_config, is_extra_msa, | |
name='evoformer_iteration'): | |
super().__init__(name=name) | |
self.config = config | |
self.global_config = global_config | |
self.is_extra_msa = is_extra_msa | |
def __call__(self, activations, masks, is_training=True, safe_key=None, scale_rate=1.0): | |
"""Builds EvoformerIteration module. | |
Arguments: | |
activations: Dictionary containing activations: | |
* 'msa': MSA activations, shape [N_seq, N_res, c_m]. | |
* 'pair': pair activations, shape [N_res, N_res, c_z]. | |
masks: Dictionary of masks: | |
* 'msa': MSA mask, shape [N_seq, N_res]. | |
* 'pair': pair mask, shape [N_res, N_res]. | |
is_training: Whether the module is in training mode. | |
safe_key: prng.SafeKey encapsulating rng key. | |
Returns: | |
Outputs, same shape/type as act. | |
""" | |
c = self.config | |
gc = self.global_config | |
msa_act, pair_act = activations['msa'], activations['pair'] | |
if safe_key is None: | |
safe_key = prng.SafeKey(hk.next_rng_key()) | |
msa_mask, pair_mask = masks['msa'], masks['pair'] | |
dropout_wrapper_fn = functools.partial( | |
dropout_wrapper, | |
is_training=is_training, | |
global_config=gc, | |
scale_rate=scale_rate) | |
safe_key, *sub_keys = safe_key.split(10) | |
sub_keys = iter(sub_keys) | |
msa_act = dropout_wrapper_fn( | |
MSARowAttentionWithPairBias( | |
c.msa_row_attention_with_pair_bias, gc, | |
name='msa_row_attention_with_pair_bias'), | |
msa_act, | |
msa_mask, | |
safe_key=next(sub_keys), | |
pair_act=pair_act) | |
if not self.is_extra_msa: | |
attn_mod = MSAColumnAttention( | |
c.msa_column_attention, gc, name='msa_column_attention') | |
else: | |
attn_mod = MSAColumnGlobalAttention( | |
c.msa_column_attention, gc, name='msa_column_global_attention') | |
msa_act = dropout_wrapper_fn( | |
attn_mod, | |
msa_act, | |
msa_mask, | |
safe_key=next(sub_keys)) | |
msa_act = dropout_wrapper_fn( | |
Transition(c.msa_transition, gc, name='msa_transition'), | |
msa_act, | |
msa_mask, | |
safe_key=next(sub_keys)) | |
pair_act = dropout_wrapper_fn( | |
OuterProductMean( | |
config=c.outer_product_mean, | |
global_config=self.global_config, | |
num_output_channel=int(pair_act.shape[-1]), | |
name='outer_product_mean'), | |
msa_act, | |
msa_mask, | |
safe_key=next(sub_keys), | |
output_act=pair_act) | |
pair_act = dropout_wrapper_fn( | |
TriangleMultiplication(c.triangle_multiplication_outgoing, gc, | |
name='triangle_multiplication_outgoing'), | |
pair_act, | |
pair_mask, | |
safe_key=next(sub_keys)) | |
pair_act = dropout_wrapper_fn( | |
TriangleMultiplication(c.triangle_multiplication_incoming, gc, | |
name='triangle_multiplication_incoming'), | |
pair_act, | |
pair_mask, | |
safe_key=next(sub_keys)) | |
pair_act = dropout_wrapper_fn( | |
TriangleAttention(c.triangle_attention_starting_node, gc, | |
name='triangle_attention_starting_node'), | |
pair_act, | |
pair_mask, | |
safe_key=next(sub_keys)) | |
pair_act = dropout_wrapper_fn( | |
TriangleAttention(c.triangle_attention_ending_node, gc, | |
name='triangle_attention_ending_node'), | |
pair_act, | |
pair_mask, | |
safe_key=next(sub_keys)) | |
pair_act = dropout_wrapper_fn( | |
Transition(c.pair_transition, gc, name='pair_transition'), | |
pair_act, | |
pair_mask, | |
safe_key=next(sub_keys)) | |
return {'msa': msa_act, 'pair': pair_act} | |
class EmbeddingsAndEvoformer(hk.Module): | |
"""Embeds the input data and runs Evoformer. | |
Produces the MSA, single and pair representations. | |
Jumper et al. (2021) Suppl. Alg. 2 "Inference" line 5-18 | |
""" | |
def __init__(self, config, global_config, name='evoformer'): | |
super().__init__(name=name) | |
self.config = config | |
self.global_config = global_config | |
def __call__(self, batch, is_training, safe_key=None): | |
c = self.config | |
gc = self.global_config | |
if safe_key is None: | |
safe_key = prng.SafeKey(hk.next_rng_key()) | |
# Embed clustered MSA. | |
# Jumper et al. (2021) Suppl. Alg. 2 "Inference" line 5 | |
# Jumper et al. (2021) Suppl. Alg. 3 "InputEmbedder" | |
preprocess_1d = common_modules.Linear( | |
c.msa_channel, name='preprocess_1d')( | |
batch['target_feat']) | |
preprocess_msa = common_modules.Linear( | |
c.msa_channel, name='preprocess_msa')( | |
batch['msa_feat']) | |
msa_activations = jnp.expand_dims(preprocess_1d, axis=0) + preprocess_msa | |
left_single = common_modules.Linear( | |
c.pair_channel, name='left_single')( | |
batch['target_feat']) | |
right_single = common_modules.Linear( | |
c.pair_channel, name='right_single')( | |
batch['target_feat']) | |
pair_activations = left_single[:, None] + right_single[None] | |
mask_2d = batch['seq_mask'][:, None] * batch['seq_mask'][None, :] | |
# Inject previous outputs for recycling. | |
# Jumper et al. (2021) Suppl. Alg. 2 "Inference" line 6 | |
# Jumper et al. (2021) Suppl. Alg. 32 "RecyclingEmbedder" | |
if "prev_pos" in batch: | |
# use predicted position input | |
prev_pseudo_beta = pseudo_beta_fn(batch['aatype'], batch['prev_pos'], None) | |
if c.backprop_dgram: | |
dgram = dgram_from_positions_soft(prev_pseudo_beta, temp=c.backprop_dgram_temp, **c.prev_pos) | |
else: | |
dgram = dgram_from_positions(prev_pseudo_beta, **c.prev_pos) | |
elif 'prev_dgram' in batch: | |
# use predicted distogram input (from Sergey) | |
dgram = jax.nn.softmax(batch["prev_dgram"]) | |
      dgram_map = jax.nn.one_hot(
          jnp.repeat(jnp.append(0, jnp.arange(15)), 4), 15).at[:, 0].set(0)
dgram = dgram @ dgram_map | |
    # Only project the distogram features when a previous structure or a
    # previous distogram was actually provided.
    if "prev_pos" in batch or "prev_dgram" in batch:
      pair_activations += common_modules.Linear(
          c.pair_channel, name='prev_pos_linear')(dgram)
if c.recycle_features: | |
if 'prev_msa_first_row' in batch: | |
prev_msa_first_row = hk.LayerNorm([-1], | |
True, | |
True, | |
name='prev_msa_first_row_norm')( | |
batch['prev_msa_first_row']) | |
msa_activations = msa_activations.at[0].add(prev_msa_first_row) | |
if 'prev_pair' in batch: | |
pair_activations += hk.LayerNorm([-1], | |
True, | |
True, | |
name='prev_pair_norm')( | |
batch['prev_pair']) | |
# Relative position encoding. | |
# Jumper et al. (2021) Suppl. Alg. 4 "relpos" | |
# Jumper et al. (2021) Suppl. Alg. 5 "one_hot" | |
if c.max_relative_feature: | |
# Add one-hot-encoded clipped residue distances to the pair activations. | |
if "rel_pos" in batch: | |
rel_pos = batch['rel_pos'] | |
else: | |
if "offset" in batch: | |
offset = batch['offset'] | |
else: | |
pos = batch['residue_index'] | |
offset = pos[:, None] - pos[None, :] | |
rel_pos = jax.nn.one_hot( | |
jnp.clip( | |
offset + c.max_relative_feature, | |
a_min=0, | |
a_max=2 * c.max_relative_feature), | |
2 * c.max_relative_feature + 1) | |
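      # NOTE: the misspelled module name 'pair_activiations' below is left
      # unchanged on purpose; renaming it would change the Haiku parameter
      # path and break loading of the released AlphaFold weights.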
      pair_activations += common_modules.Linear(
          c.pair_channel, name='pair_activiations')(rel_pos)
# Embed templates into the pair activations. | |
# Jumper et al. (2021) Suppl. Alg. 2 "Inference" lines 9-13 | |
if c.template.enabled: | |
template_batch = {k: batch[k] for k in batch if k.startswith('template_')} | |
template_pair_representation = TemplateEmbedding(c.template, gc)( | |
pair_activations, | |
template_batch, | |
mask_2d, | |
is_training=is_training, | |
scale_rate=batch["scale_rate"]) | |
pair_activations += template_pair_representation | |
# Embed extra MSA features. | |
# Jumper et al. (2021) Suppl. Alg. 2 "Inference" lines 14-16 | |
extra_msa_feat = create_extra_msa_feature(batch) | |
extra_msa_activations = common_modules.Linear( | |
c.extra_msa_channel, | |
name='extra_msa_activations')( | |
extra_msa_feat) | |
# Extra MSA Stack. | |
# Jumper et al. (2021) Suppl. Alg. 18 "ExtraMsaStack" | |
extra_msa_stack_input = { | |
'msa': extra_msa_activations, | |
'pair': pair_activations, | |
} | |
extra_msa_stack_iteration = EvoformerIteration( | |
c.evoformer, gc, is_extra_msa=True, name='extra_msa_stack') | |
def extra_msa_stack_fn(x): | |
act, safe_key = x | |
safe_key, safe_subkey = safe_key.split() | |
extra_evoformer_output = extra_msa_stack_iteration( | |
activations=act, | |
masks={ | |
'msa': batch['extra_msa_mask'], | |
'pair': mask_2d | |
}, | |
is_training=is_training, | |
safe_key=safe_subkey, scale_rate=batch["scale_rate"]) | |
return (extra_evoformer_output, safe_key) | |
if gc.use_remat: | |
extra_msa_stack_fn = hk.remat(extra_msa_stack_fn) | |
extra_msa_stack = layer_stack.layer_stack( | |
c.extra_msa_stack_num_block)( | |
extra_msa_stack_fn) | |
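    # layer_stack runs extra_msa_stack_fn for c.extra_msa_stack_num_block
    # iterations, each with its own parameters; when gc.use_remat is set, the
    # iteration function is wrapped in hk.remat so activations are recomputed
    # on the backward pass to save memory.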
extra_msa_output, safe_key = extra_msa_stack( | |
(extra_msa_stack_input, safe_key)) | |
pair_activations = extra_msa_output['pair'] | |
evoformer_input = { | |
'msa': msa_activations, | |
'pair': pair_activations, | |
} | |
evoformer_masks = {'msa': batch['msa_mask'], 'pair': mask_2d} | |
#################################################################### | |
#################################################################### | |
# Append num_templ rows to msa_activations with template embeddings. | |
# Jumper et al. (2021) Suppl. Alg. 2 "Inference" lines 7-8 | |
if c.template.enabled and c.template.embed_torsion_angles: | |
if jnp.issubdtype(batch['template_aatype'].dtype, jnp.integer): | |
num_templ, num_res = batch['template_aatype'].shape | |
      # Embed the template aatypes.
aatype = batch['template_aatype'] | |
aatype_one_hot = jax.nn.one_hot(batch['template_aatype'], 22, axis=-1) | |
else: | |
num_templ, num_res, _ = batch['template_aatype'].shape | |
aatype = batch['template_aatype'].argmax(-1) | |
aatype_one_hot = batch['template_aatype'] | |
      # Embed the template aatype, torsion angles and masks.
# Shape (templates, residues, msa_channels) | |
ret = all_atom.atom37_to_torsion_angles( | |
aatype=aatype, | |
all_atom_pos=batch['template_all_atom_positions'], | |
all_atom_mask=batch['template_all_atom_masks'], | |
# Ensure consistent behaviour during testing: | |
placeholder_for_undefined=not gc.zero_init) | |
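      # Per-residue template features: 22 (aatype one-hot) + 14 (sin/cos of
      # 7 torsion angles) + 14 (alternative torsions) + 7 (torsion mask) = 57
      # channels.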
template_features = jnp.concatenate([ | |
aatype_one_hot, | |
jnp.reshape(ret['torsion_angles_sin_cos'], [num_templ, num_res, 14]), | |
jnp.reshape(ret['alt_torsion_angles_sin_cos'], [num_templ, num_res, 14]), | |
ret['torsion_angles_mask']], axis=-1) | |
template_activations = common_modules.Linear( | |
c.msa_channel, | |
initializer='relu', | |
name='template_single_embedding')(template_features) | |
template_activations = jax.nn.relu(template_activations) | |
template_activations = common_modules.Linear( | |
c.msa_channel, | |
initializer='relu', | |
name='template_projection')(template_activations) | |
# Concatenate the templates to the msa. | |
      evoformer_input['msa'] = jnp.concatenate(
          [evoformer_input['msa'], template_activations], axis=0)
      # Concatenate the template masks to the MSA masks.
# Use mask from the psi angle, as it only depends on the backbone atoms | |
# from a single residue. | |
torsion_angle_mask = ret['torsion_angles_mask'][:, :, 2] | |
torsion_angle_mask = torsion_angle_mask.astype(evoformer_masks['msa'].dtype) | |
      evoformer_masks['msa'] = jnp.concatenate(
          [evoformer_masks['msa'], torsion_angle_mask], axis=0)
#################################################################### | |
#################################################################### | |
# Main trunk of the network | |
# Jumper et al. (2021) Suppl. Alg. 2 "Inference" lines 17-18 | |
evoformer_iteration = EvoformerIteration( | |
c.evoformer, gc, is_extra_msa=False, name='evoformer_iteration') | |
def evoformer_fn(x): | |
act, safe_key = x | |
safe_key, safe_subkey = safe_key.split() | |
evoformer_output = evoformer_iteration( | |
activations=act, | |
masks=evoformer_masks, | |
is_training=is_training, | |
safe_key=safe_subkey, scale_rate=batch["scale_rate"]) | |
return (evoformer_output, safe_key) | |
if gc.use_remat: | |
evoformer_fn = hk.remat(evoformer_fn) | |
evoformer_stack = layer_stack.layer_stack(c.evoformer_num_block)(evoformer_fn) | |
evoformer_output, safe_key = evoformer_stack((evoformer_input, safe_key)) | |
msa_activations = evoformer_output['msa'] | |
pair_activations = evoformer_output['pair'] | |
single_activations = common_modules.Linear( | |
c.seq_channel, name='single_activations')(msa_activations[0]) | |
num_sequences = batch['msa_feat'].shape[0] | |
output = { | |
'single': single_activations, | |
'pair': pair_activations, | |
# Crop away template rows such that they are not used in MaskedMsaHead. | |
'msa': msa_activations[:num_sequences, :, :], | |
'msa_first_row': msa_activations[0], | |
} | |
return output | |
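# Usage sketch (illustrative only, not part of the module): the embedder is
# normally constructed inside an hk.transform'd forward function, e.g.
#
#   def _forward(batch, is_training):
#     return EmbeddingsAndEvoformer(
#         model_config.embeddings_and_evoformer, model_config.global_config)(
#             batch, is_training)
#
#   forward = hk.transform(_forward)
#
# where `model_config` is assumed to expose 'embeddings_and_evoformer' and
# 'global_config' sub-configs.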
#################################################################### | |
#################################################################### | |
class SingleTemplateEmbedding(hk.Module): | |
"""Embeds a single template. | |
Jumper et al. (2021) Suppl. Alg. 2 "Inference" lines 9+11 | |
""" | |
def __init__(self, config, global_config, name='single_template_embedding'): | |
super().__init__(name=name) | |
self.config = config | |
self.global_config = global_config | |
def __call__(self, query_embedding, batch, mask_2d, is_training, scale_rate=1.0): | |
"""Build the single template embedding. | |
Arguments: | |
query_embedding: Query pair representation, shape [N_res, N_res, c_z]. | |
batch: A batch of template features (note the template dimension has been | |
stripped out as this module only runs over a single template). | |
mask_2d: Padding mask (Note: this doesn't care if a template exists, | |
unlike the template_pseudo_beta_mask). | |
      is_training: Whether the module is in training mode.
      scale_rate: Scalar multiplier applied to the dropout rate inside the
        template pair stack.
Returns: | |
A template embedding [N_res, N_res, c_z]. | |
""" | |
assert mask_2d.dtype == query_embedding.dtype | |
dtype = query_embedding.dtype | |
num_res = batch['template_aatype'].shape[0] | |
num_channels = (self.config.template_pair_stack | |
.triangle_attention_ending_node.value_dim) | |
template_mask = batch['template_pseudo_beta_mask'] | |
template_mask_2d = template_mask[:, None] * template_mask[None, :] | |
template_mask_2d = template_mask_2d.astype(dtype) | |
if "template_dgram" in batch: | |
template_dgram = batch["template_dgram"] | |
else: | |
if self.config.backprop_dgram: | |
template_dgram = dgram_from_positions_soft(batch['template_pseudo_beta'], | |
temp=self.config.backprop_dgram_temp, | |
**self.config.dgram_features) | |
else: | |
template_dgram = dgram_from_positions(batch['template_pseudo_beta'], | |
**self.config.dgram_features) | |
template_dgram = template_dgram.astype(dtype) | |
to_concat = [template_dgram, template_mask_2d[:, :, None]] | |
if jnp.issubdtype(batch['template_aatype'].dtype, jnp.integer): | |
aatype = jax.nn.one_hot(batch['template_aatype'], 22, axis=-1, dtype=dtype) | |
else: | |
aatype = batch['template_aatype'] | |
to_concat.append(jnp.tile(aatype[None, :, :], [num_res, 1, 1])) | |
to_concat.append(jnp.tile(aatype[:, None, :], [1, num_res, 1])) | |
# Backbone affine mask: whether the residue has C, CA, N | |
# (the template mask defined above only considers pseudo CB). | |
n, ca, c = [residue_constants.atom_order[a] for a in ('N', 'CA', 'C')] | |
template_mask = ( | |
batch['template_all_atom_masks'][..., n] * | |
batch['template_all_atom_masks'][..., ca] * | |
batch['template_all_atom_masks'][..., c]) | |
template_mask_2d = template_mask[:, None] * template_mask[None, :] | |
# compute unit_vector (not used by default) | |
if self.config.use_template_unit_vector: | |
rot, trans = quat_affine.make_transform_from_reference( | |
n_xyz=batch['template_all_atom_positions'][:, n], | |
ca_xyz=batch['template_all_atom_positions'][:, ca], | |
c_xyz=batch['template_all_atom_positions'][:, c]) | |
affines = quat_affine.QuatAffine( | |
quaternion=quat_affine.rot_to_quat(rot, unstack_inputs=True), | |
translation=trans, | |
rotation=rot, | |
unstack_inputs=True) | |
points = [jnp.expand_dims(x, axis=-2) for x in affines.translation] | |
affine_vec = affines.invert_point(points, extra_dims=1) | |
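      # For every residue pair, affine_vec holds the CA-to-CA displacement
      # expressed in the first residue's local backbone frame, as a list of
      # x, y, z components, each of shape [N_res, N_res].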
inv_distance_scalar = jax.lax.rsqrt(1e-6 + sum([jnp.square(x) for x in affine_vec])) | |
inv_distance_scalar *= template_mask_2d.astype(inv_distance_scalar.dtype) | |
unit_vector = [(x * inv_distance_scalar)[..., None] for x in affine_vec] | |
else: | |
      unit_vector = [jnp.zeros((num_res, num_res, 1))] * 3
unit_vector = [x.astype(dtype) for x in unit_vector] | |
to_concat.extend(unit_vector) | |
template_mask_2d = template_mask_2d.astype(dtype) | |
to_concat.append(template_mask_2d[..., None]) | |
act = jnp.concatenate(to_concat, axis=-1) | |
# Mask out non-template regions so we don't get arbitrary values in the | |
# distogram for these regions. | |
act *= template_mask_2d[..., None] | |
# Jumper et al. (2021) Suppl. Alg. 2 "Inference" line 9 | |
act = common_modules.Linear( | |
num_channels, | |
initializer='relu', | |
name='embedding2d')(act) | |
# Jumper et al. (2021) Suppl. Alg. 2 "Inference" line 11 | |
    act = TemplatePairStack(
        self.config.template_pair_stack, self.global_config)(
            act, mask_2d, is_training, scale_rate=scale_rate)
act = hk.LayerNorm([-1], True, True, name='output_layer_norm')(act) | |
return act | |
class TemplateEmbedding(hk.Module): | |
"""Embeds a set of templates. | |
Jumper et al. (2021) Suppl. Alg. 2 "Inference" lines 9-12 | |
Jumper et al. (2021) Suppl. Alg. 17 "TemplatePointwiseAttention" | |
""" | |
def __init__(self, config, global_config, name='template_embedding'): | |
super().__init__(name=name) | |
self.config = config | |
self.global_config = global_config | |
def __call__(self, query_embedding, template_batch, mask_2d, is_training, scale_rate=1.0): | |
"""Build TemplateEmbedding module. | |
Arguments: | |
query_embedding: Query pair representation, shape [N_res, N_res, c_z]. | |
template_batch: A batch of template features. | |
mask_2d: Padding mask (Note: this doesn't care if a template exists, | |
unlike the template_pseudo_beta_mask). | |
      is_training: Whether the module is in training mode.
      scale_rate: Scalar multiplier applied to the dropout rate inside each
        single-template embedder.
Returns: | |
A template embedding [N_res, N_res, c_z]. | |
""" | |
num_templates = template_batch['template_mask'].shape[0] | |
num_channels = (self.config.template_pair_stack | |
.triangle_attention_ending_node.value_dim) | |
num_res = query_embedding.shape[0] | |
dtype = query_embedding.dtype | |
template_mask = template_batch['template_mask'] | |
template_mask = template_mask.astype(dtype) | |
query_num_channels = query_embedding.shape[-1] | |
# Make sure the weights are shared across templates by constructing the | |
# embedder here. | |
# Jumper et al. (2021) Suppl. Alg. 2 "Inference" lines 9-12 | |
template_embedder = SingleTemplateEmbedding(self.config, self.global_config) | |
def map_fn(batch): | |
return template_embedder(query_embedding, batch, mask_2d, is_training, scale_rate=scale_rate) | |
template_pair_representation = mapping.sharded_map(map_fn, in_axes=0)(template_batch) | |
# Cross attend from the query to the templates along the residue | |
# dimension by flattening everything else into the batch dimension. | |
# Jumper et al. (2021) Suppl. Alg. 17 "TemplatePointwiseAttention" | |
    flat_query = jnp.reshape(
        query_embedding, [num_res * num_res, 1, query_num_channels])
flat_templates = jnp.reshape( | |
jnp.transpose(template_pair_representation, [1, 2, 0, 3]), | |
[num_res * num_res, num_templates, num_channels]) | |
bias = (1e9 * (template_mask[None, None, None, :] - 1.)) | |
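    # flat_query: [N_res*N_res, 1, c_z]; flat_templates: [N_res*N_res,
    # N_templ, c_t]. The additive bias sends masked templates to ~-1e9 so
    # they receive effectively zero attention weight.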
template_pointwise_attention_module = Attention( | |
self.config.attention, self.global_config, query_num_channels) | |
nonbatched_args = [bias] | |
batched_args = [flat_query, flat_templates] | |
embedding = mapping.inference_subbatch( | |
template_pointwise_attention_module, | |
self.config.subbatch_size, | |
batched_args=batched_args, | |
nonbatched_args=nonbatched_args, | |
low_memory=not is_training) | |
    embedding = jnp.reshape(embedding, [num_res, num_res, query_num_channels])
# No gradients if no templates. | |
embedding *= (jnp.sum(template_mask) > 0.).astype(embedding.dtype) | |
return embedding | |
#################################################################### | |