from typing import Optional, Tuple

import pandas as pd
from biopandas.pdb import PandasPdb
from prody import parsePDBHeader




def read_pdb_to_dataframe(
    pdb_path: str,
    model_index: int = 1,
    parse_header: bool = True,
) -> Tuple[pd.DataFrame, Optional[dict]]:
    """
    Read a PDB file and return its atomic coordinates and metadata as a Pandas DataFrame.

    Args:
        pdb_path (str): Path to a local PDB file to read.
        model_index (int, optional): Index of the model to extract from the PDB file, in case
            it contains multiple models. Defaults to 1.
        parse_header (bool, optional): Whether to parse the PDB header and extract metadata.
            Defaults to True.

    Returns:
        Tuple[pd.DataFrame, Optional[dict]]: A DataFrame with one row per atom (ATOM and
            HETATM records concatenated), and the parsed header dictionary (None if
            parse_header is False).
    """
    atomic_df = PandasPdb().read_pdb(pdb_path)
    if parse_header:
        header = parsePDBHeader(pdb_path)
    else:
        header = None
    atomic_df = atomic_df.get_model(model_index)
    if len(atomic_df.df["ATOM"]) == 0:
        raise ValueError(f"No model found for index: {model_index}")

    return pd.concat([atomic_df.df["ATOM"], atomic_df.df["HETATM"]]), header
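
# Example usage (a sketch; assumes a local PDB file such as "3nir.pdb" has been
# downloaded beforehand, e.g. from the RCSB):
#
#     df, header = read_pdb_to_dataframe("3nir.pdb")
#     print(df[["atom_name", "residue_name", "chain_id", "x_coord", "y_coord", "z_coord"]].head())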

from graphein.protein.graphs import label_node_id

def process_dataframe(df: pd.DataFrame, granularity: str = 'CA') -> pd.DataFrame:
    """
    Process a DataFrame of protein structure data to reduce ambiguity and simplify analysis.

    This function performs the following steps:
    1. Handles alternate locations for an atom, keeping only the first one ('A') if multiple exist.
    2. Assigns a unique node_id to each residue in the DataFrame, using the helper function label_node_id.
    3. Filters the DataFrame to the specified granularity (defaults to 'CA' for alpha carbons).

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame containing protein structure data to process. It is expected to contain
        the columns 'alt_loc' and 'atom_name'.
    granularity : str, optional
        The atom type to retain for each residue. Defaults to 'CA' (alpha carbon).

    Returns
    -------
    pd.DataFrame
        The processed DataFrame, with one row per residue at the chosen granularity.
    """
    # Handle alternative locations for an atom: default to the first one ('A')
    if 'alt_loc' in df.columns:
        df['alt_loc'] = df['alt_loc'].replace('', 'A')
        df = df.loc[df['alt_loc'] == 'A']
    df = label_node_id(df, granularity)
    df = df.loc[df['atom_name'] == granularity]
    return df
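
# Example (a sketch; assumes df comes from read_pdb_to_dataframe above): the full
# atom table is reduced to one alpha-carbon row per residue.
#
#     ca_df = process_dataframe(df, granularity='CA')
#     print(len(df), "atoms ->", len(ca_df), "CA atoms (one per residue)")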


from graphein.protein.graphs import initialise_graph_with_metadata
from graphein.protein.graphs import add_nodes_to_graph
from graphein.protein.visualisation import plotly_protein_structure_graph
from PIL import Image
import networkx as nx

def take_care(pdb_path):
    """
    Build a residue-level (CA) graph from a PDB file, add backbone edges,
    render it with Plotly, and return the rendering as a PIL Image.
    """
    df, header = read_pdb_to_dataframe(pdb_path)
    process_df = process_dataframe(df)

    g = initialise_graph_with_metadata(
        protein_df=process_df,  # processed (CA-only) DataFrame from process_dataframe
        raw_pdb_df=df,          # keep the raw atomic DataFrame for traceability
        pdb_code='3nir',        # PDB code, hardcoded for this example structure
        granularity='CA',       # record what kind of graph this is
    )
    g = add_nodes_to_graph(g)
    
    def add_backbone_edges(G: nx.Graph) -> nx.Graph:
        # Iterate over every chain
        for chain_id in G.graph["chain_ids"]:
            # Collect the residues (nodes) belonging to this chain
            chain_residues = [
                (n, v) for n, v in G.nodes(data=True) if v["chain_id"] == chain_id
            ]
            # Iterate over every residue in the chain
            for i, residue in enumerate(chain_residues):
                # Skip the chain terminus: there is no following residue to bond to
                if i == len(chain_residues) - 1:
                    continue
                next_residue = chain_residues[i + 1]
                # Both residues must be on the same chain
                cond_1 = residue[1]["chain_id"] == next_residue[1]["chain_id"]
                # Residue numbers must be adjacent
                cond_2 = abs(residue[1]["residue_number"] - next_residue[1]["residue_number"]) == 1

                # If both conditions hold, add a peptide (backbone) bond
                # between the current residue and the next one, keyed by node id
                if cond_1 and cond_2:
                    if G.has_edge(residue[0], next_residue[0]):
                        G.edges[residue[0], next_residue[0]]["kind"].add('backbone_bond')
                    else:
                        G.add_edge(residue[0], next_residue[0], kind={'backbone_bond'})
        return G

    g = add_backbone_edges(g)

    p = plotly_protein_structure_graph(
        g,
        colour_edges_by="kind",
        colour_nodes_by="seq_position",
        label_node_ids=False,
        plot_title="Backbone Protein Graph",
        node_size_multiplier=1,
    )
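    # Note: Plotly's write_image requires a static-image export backend
    # (the kaleido package in recent Plotly versions).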
    image_file = "protein_graph.png"
    p.write_image(image_file, format='png')


    # Load the PNG image into a PIL image
    image = Image.open(image_file)

    return image
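

if __name__ == "__main__":
    # Minimal end-to-end sketch. The path below is an assumption: substitute any
    # local PDB file (e.g. "3nir.pdb" downloaded from the RCSB beforehand).
    image = take_care("3nir.pdb")
    image.show()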