aksell committed
Commit 9f086ee · 1 Parent(s): f7e76de

WIP visualize EC number and attention to it


Visualizes EC tags as spheres on the protein structure.
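
The four EC components are placed as spheres stacked outward from the first residue of the chain, along the direction pointing away from the second residue, spaced two radii apart. A minimal sketch of that placement, distilled from the diff below (the residue coordinates here are made-up example values):

import numpy as np

res_1 = np.array([10.0, 5.0, 3.0])  # CA of the chain's first residue (example values)
res_2 = np.array([11.5, 5.0, 3.0])  # CA of the second residue (example values)
radius = 1.0

# Unit vector pointing from residue 2 towards residue 1, i.e. away from the chain
direction = (res_1 - res_2) / np.linalg.norm(res_1 - res_2)

# Four sphere centres, two radii apart; the first sits on residue 1 itself
coordinates = [res_1 + i * 2 * radius * direction for i in range(4)]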

hexviz/attention.py CHANGED
@@ -6,6 +6,7 @@ import streamlit as st
 import torch
 from Bio.PDB import PDBParser, Polypeptide, Structure
 
+from hexviz.ec_number import ECNumber
 from hexviz.models import (
     ModelType,
     get_prot_bert,
@@ -98,11 +99,39 @@ def clean_and_validate_sequence(sequence: str) -> tuple[str, str | None]:
     return cleaned_sequence, None
 
 
+def remove_special_tokens_and_periods(attentions_tuple, sequence, tokenizer):
+    tokens = tokenizer.tokenize(sequence)
+
+    indices_to_remove = [
+        i
+        for i, token in enumerate(tokens)
+        if token in {".", "<sep>", "<start>", "<end>", "<pad>"}
+    ]
+
+    new_attentions = []
+
+    for attentions in attentions_tuple:
+        # Remove rows and columns corresponding to special tokens and periods
+        for idx in sorted(indices_to_remove, reverse=True):
+            attentions = torch.cat(
+                (attentions[:, :, :idx], attentions[:, :, idx + 1 :]), dim=2
+            )
+            attentions = torch.cat(
+                (attentions[:, :, :, :idx], attentions[:, :, :, idx + 1 :]), dim=3
+            )
+
+        # Append the modified attentions tensor to the new_attentions list
+        new_attentions.append(attentions)
+
+    return new_attentions
+
+
 @st.cache
 def get_attention(
     sequence: str,
     model_type: ModelType = ModelType.TAPE_BERT,
     remove_special_tokens: bool = True,
+    ec_number: list[ECNumber] = None,
 ):
     """
     Returns a tensor of shape [n_layers, n_heads, n_res, n_res] with attention weights
@@ -122,6 +151,10 @@ def get_attention(
 
     elif model_type == ModelType.ZymCTRL:
         tokenizer, model = get_zymctrl()
+
+        if ec_number:
+            sequence = f"{'.'.join([ec.number for ec in ec_number])}<sep><start>{sequence}<end><pad>"
+
         inputs = tokenizer(sequence, return_tensors="pt").input_ids.to(device)
         attention_mask = tokenizer(sequence, return_tensors="pt").attention_mask.to(
             device
@@ -133,6 +166,12 @@ def get_attention(
         )
         attentions = outputs.attentions
 
+        if ec_number:
+            # Remove attention to special tokens and periods separating EC number components
+            attentions = remove_special_tokens_and_periods(
+                attentions, sequence, tokenizer
+            )
+
     # torch.Size([1, n_heads, n_res, n_res]) -> torch.Size([n_heads, n_res, n_res])
     attention_squeezed = [torch.squeeze(attention) for attention in attentions]
     # ([n_heads, n_res, n_res]*n_layers) -> [n_layers, n_heads, n_res, n_res]
@@ -202,6 +241,7 @@ def get_attention_pairs(
     threshold: int = 0.2,
     model_type: ModelType = ModelType.TAPE_BERT,
     top_n: int = 2,
+    ec_number: list[ECNumber] | None = None,
 ):
     structure = PDBParser().get_structure("pdb", StringIO(pdb_str))
     if chain_ids:
@@ -213,7 +253,9 @@ def get_attention_pairs(
     top_residues = []
     for chain in chains:
         sequence = get_sequence(chain)
-        attention = get_attention(sequence=sequence, model_type=model_type)
+        attention = get_attention(
+            sequence=sequence, model_type=model_type, ec_number=ec_number
+        )
         attention_unidirectional = unidirectional_avg_filtered(
             attention, layer, head, threshold
         )
@@ -222,8 +264,19 @@ def get_attention_pairs(
         residue_attention = {}
         for attn_value, res_1, res_2 in attention_unidirectional:
             try:
-                coord_1 = chain[res_1]["CA"].coord.tolist()
-                coord_2 = chain[res_2]["CA"].coord.tolist()
+                if not ec_number:
+                    coord_1 = chain[res_1]["CA"].coord.tolist()
+                    coord_2 = chain[res_2]["CA"].coord.tolist()
+                else:
+                    if res_1 < 4:
+                        coord_1 = ec_number[res_1].coordinate
+                    else:
+                        coord_1 = chain[res_1 - 4]["CA"].coord.tolist()
+                    if res_2 < 4:
+                        coord_2 = ec_number[res_2].coordinate
+                    else:
+                        coord_2 = chain[res_2 - 4]["CA"].coord.tolist()
+
             except KeyError:
                 continue
 
@@ -236,7 +289,14 @@ def get_attention_pairs(
         )[:top_n]
 
         for res, attn_sum in top_n_residues:
-            coord = chain[res]["CA"].coord.tolist()
+            if not ec_number:
+                coord = chain[res]["CA"].coord.tolist()
+            else:
+                if res < 4:
+                    # Ignore EC tag chars as these can't be labeled
+                    continue
+                else:
+                    coord = chain[res - 4]["CA"].coord.tolist()
             top_residues.append((attn_sum, coord, chain.id, res))
 
     return attention_pairs, top_residues
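
For ZymCTRL the EC prompt adds periods and special tokens to the input, so remove_special_tokens_and_periods drops the matching rows and columns from every attention tensor before the usual squeezing and stacking. A self-contained sketch of the same trimming using index selection instead of the repeated torch.cat above (shapes and indices are illustrative, not taken from the commit):

import torch

attn = torch.rand(1, 4, 10, 10)     # one layer: [batch, n_heads, seq_len, seq_len]
indices_to_remove = [0, 4, 9]       # e.g. positions of "." and special tokens
keep = [i for i in range(attn.shape[-1]) if i not in indices_to_remove]

trimmed = attn[:, :, keep, :][:, :, :, keep]  # drop rows, then columns
print(trimmed.shape)                # torch.Size([1, 4, 7, 7])
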
hexviz/ec_number.py ADDED
@@ -0,0 +1,11 @@
+class ECNumber:
+    def __init__(self, number, coordinate, color, radius):
+        self.number = number
+        self.coordinate = coordinate
+        self.color = color
+        self.radius = radius
+
+    def __str__(self):
+        return (
+            f"(EC: {self.number}, Coordinate: {self.coordinate}, Color: {self.color})"
+        )
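
Example usage, with placeholder coordinates and radius (the real values are computed in 🧬Attention_Visualization.py below):

from hexviz.ec_number import ECNumber

components = "1.2.3.21".split(".")
colors = ["blue", "green", "orange", "red"]
ec_tag = [
    ECNumber(number=num, coordinate=[2.0 * i, 0.0, 0.0], color=color, radius=1)
    for i, (num, color) in enumerate(zip(components, colors))
]
print(ec_tag[0])  # (EC: 1, Coordinate: [0.0, 0.0, 0.0], Color: blue)
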
hexviz/🧬Attention_Visualization.py CHANGED
@@ -1,3 +1,6 @@
+import re
+
+import numpy as np
 import pandas as pd
 import py3Dmol
 import stmol
@@ -9,10 +12,10 @@ from hexviz.attention import (
     get_attention_pairs,
     get_chains,
 )
+from hexviz.config import URL
+from hexviz.ec_number import ECNumber
 from hexviz.models import Model, ModelType
 from hexviz.view import menu_items, select_model, select_pdb, select_protein
-from hexviz.config import URL
-
 
 st.set_page_config(layout="centered", menu_items=menu_items)
 st.title("Attention Visualization on proteins")
@@ -110,15 +113,60 @@ with right:
     )
     head = head_one - 1
 
-ec_class = ""
+ec_number = ""
 if selected_model.name == ModelType.ZymCTRL:
+    st.sidebar.markdown(
+        """
+        ZymCTRL EC number
+        ---
+        """
+    )
     try:
-        ec_class = structure.header["compound"]["1"]["ec"]
+        ec_number = structure.header["compound"]["1"]["ec"]
     except KeyError:
         pass
-    ec_class = st.sidebar.text_input(
-        "Enzyme classification number fetched from PDB", ec_class
-    )
+    ec_number = st.sidebar.text_input("Enzyme Comission number (EC)", ec_number)
+
+    # Validate EC number
+    if not re.match(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", ec_number):
+        st.sidebar.error(
+            "Please enter a valid Enzyme Commission number in the format of 4 integers separated by periods (e.g., 1.2.3.21)"
+        )
+
+    if ec_number:
+        if selected_chains:
+            all_chains = [
+                ch for ch in structure.get_chains() if ch.id in selected_chains
+            ]
+        else:
+            all_chains = list(structure.get_chains())
+        the_chain = all_chains[0]
+        res_1 = the_chain[1]["CA"].coord.tolist()
+        res_2 = the_chain[2]["CA"].coord.tolist()
+
+        # Calculate the vector from res_1 to res_2
+        vector = [res_2[i] - res_1[i] for i in range(3)]
+
+        # Reverse the vector
+        reverse_vector = [-v for v in vector]
+
+        # Normalize the reverse vector
+        reverse_vector_normalized = np.array(reverse_vector) / np.linalg.norm(
+            reverse_vector
+        )
+        radius = 1
+        coordinates = [
+            [res_1[j] + i * 2 * radius * reverse_vector_normalized[j] for j in range(3)]
+            for i in range(4)
+        ]
+        colors = ["blue", "green", "orange", "red"]
+        EC_numbers = ec_number.split(".")
+        EC_tag = [
+            ECNumber(number=num, coordinate=coord, color=color, radius=radius)
+            for num, coord, color in zip(EC_numbers, coordinates, colors)
+        ]
+        EC_colored = [f":{color}[{EC.number}]" for EC, color in zip(EC_tag, colors)]
+        st.sidebar.write("Visualized as colored spheres: " + ".".join(EC_colored))
 
 
 attention_pairs, top_residues = get_attention_pairs(
@@ -129,6 +177,7 @@ attention_pairs, top_residues = get_attention_pairs(
     threshold=min_attn,
     model_type=selected_model.name,
     top_n=n_highest_resis,
+    ec_number=EC_tag if ec_number else None,
 )
 
 sorted_by_attention = sorted(attention_pairs, key=lambda x: x[0], reverse=True)
@@ -169,6 +218,15 @@ def get_3dview(pdb):
             dashed=False,
         )
 
+    if selected_model.name == ModelType.ZymCTRL and ec_number:
+        for EC_num in EC_tag:
+            stmol.add_sphere(
+                xyzview,
+                spcenter=EC_num.coordinate,
+                radius=EC_num.radius,
+                spColor=EC_num.color,
+            )
+
     if label_resi:
         for hl_resi in hl_resi_list:
             xyzview.addResLabels(
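
Because the ZymCTRL prompt prepends the four EC components, the residue indices that come back from get_attention are shifted by four when an EC tag is passed, and indices 0-3 refer to the EC tag spheres. A hypothetical helper, not part of the commit, spelling out that mapping:

def index_to_coordinate(idx, chain, ec_tag=None):
    """Map an attention index to a 3D coordinate, honouring the EC tag offset."""
    if ec_tag and idx < len(ec_tag):
        return ec_tag[idx].coordinate  # one of the four EC tag spheres
    offset = len(ec_tag) if ec_tag else 0
    return chain[idx - offset]["CA"].coord.tolist()  # CA atom of the shifted residue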