Spaces:
Runtime error
Runtime error
File size: 5,110 Bytes
9a1bde7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 |
import pandas as pd
from biopandas.pdb import PandasPdb
from prody import parsePDBHeader
def read_pdb_to_dataframe(
    pdb_path,
    model_index: int = 1,
    parse_header: bool = True,
) -> "tuple[pd.DataFrame, dict | None]":
    """
    Read a PDB file and return its atom records together with the parsed header.

    Args:
        pdb_path: Path to a local PDB file to read. Required.
        model_index (int, optional): Index of the model to extract from the PDB file, in case
            it contains multiple models. Defaults to 1.
        parse_header (bool, optional): Whether to parse the PDB header with
            ``parsePDBHeader``. Defaults to True.

    Returns:
        A ``(atoms, header)`` tuple. ``atoms`` is a DataFrame with the "ATOM" records of
        the selected model followed by its "HETATM" records, one row per atom. ``header``
        is whatever ``parsePDBHeader(pdb_path)`` returned, or ``None`` when
        ``parse_header`` is False.

    Raises:
        ValueError: If the selected model contains no "ATOM" records.
    """
    pdb = PandasPdb().read_pdb(pdb_path)
    header = parsePDBHeader(pdb_path) if parse_header else None

    # Restrict the parsed structure to the requested model before validating it.
    model = pdb.get_model(model_index)
    atoms = model.df["ATOM"]
    if atoms.empty:
        raise ValueError(f"No model found for index: {model_index}")

    return pd.concat([atoms, model.df["HETATM"]]), header
from graphein.protein.graphs import label_node_id
def process_dataframe(df: pd.DataFrame, granularity='CA') -> pd.DataFrame:
    """
    Reduce a DataFrame of protein structure data to one row per node of interest.

    This function performs the following steps:
    1. If an 'alt_loc' column is present, treats an empty alternate-location value as 'A'
       and keeps only the rows whose alternate location is 'A'.
    2. Passes the result through ``label_node_id`` together with ``granularity``.
    3. Keeps only the rows whose 'atom_name' equals ``granularity``.

    The input DataFrame is never modified; all work happens on new frames.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame containing protein structure data to process. It must contain an
        'atom_name' column; 'alt_loc' is handled when present.
    granularity : str, optional
        Atom name used both for node labelling and for the final row filter.
        Defaults to 'CA' (alpha carbon).

    Returns
    -------
    pd.DataFrame
        The filtered rows, as returned by ``label_node_id`` and restricted to
        ``atom_name == granularity``.
    """
    if 'alt_loc' in df.columns:
        # assign() builds a new frame, so the caller's DataFrame keeps its
        # original alt_loc values instead of being overwritten in place.
        df = df.assign(alt_loc=df['alt_loc'].replace('', 'A'))
        df = df.loc[df['alt_loc'] == 'A']

    # Hand label_node_id an explicit copy: if it writes columns in place, the write
    # must not land on the caller's frame or on a filtered slice of it.
    df = label_node_id(df.copy(), granularity)
    return df.loc[df['atom_name'] == granularity]
from graphein.protein.graphs import initialise_graph_with_metadata
from graphein.protein.graphs import add_nodes_to_graph
from graphein.protein.visualisation import plotly_protein_structure_graph
from PIL import Image
import networkx as nx
def _add_backbone_edges(G: "nx.Graph") -> "nx.Graph":
    """Connect sequence-adjacent residues of each chain with a 'backbone_bond' edge.

    For every chain id listed in ``G.graph["chain_ids"]``, the nodes of that chain are
    taken in graph iteration order and each consecutive pair whose ``residue_number``
    values differ by exactly 1 is linked. An existing edge gets 'backbone_bond' added
    to its ``kind`` set; otherwise a new edge is created with ``kind={'backbone_bond'}``.
    The graph is modified in place and also returned.
    """
    for chain_id in G.graph["chain_ids"]:
        chain_residues = [
            (node_id, data)
            for node_id, data in G.nodes(data=True)
            if data["chain_id"] == chain_id
        ]
        # Pair each residue with its successor; zip() stops before the chain terminus,
        # so no index can run past the end of the list.
        for (node_a, data_a), (node_b, data_b) in zip(chain_residues, chain_residues[1:]):
            # Only residues with adjacent numbering share a peptide bond.
            if abs(data_a["residue_number"] - data_b["residue_number"]) != 1:
                continue
            # Look the edge up by node id (not by list position) so an existing edge
            # is extended rather than having its "kind" set overwritten.
            if G.has_edge(node_a, node_b):
                G.edges[node_a, node_b].setdefault("kind", set()).add('backbone_bond')
            else:
                G.add_edge(node_a, node_b, kind={'backbone_bond'})
    return G


def take_care(pdb_path, pdb_code='3nir', image_file="protein_graph.png"):
    """Render the backbone graph of a PDB file to a PNG and return it as a PIL image.

    The structure is read from ``pdb_path``, reduced to alpha-carbon ('CA') rows,
    turned into a graph with one node per remaining row, linked with backbone edges,
    plotted with ``plotly_protein_structure_graph`` and written to ``image_file``.

    Args:
        pdb_path: Path to the local PDB file to visualise.
        pdb_code (str, optional): Identifier stored in the graph metadata.
            Defaults to '3nir'.
        image_file (str, optional): Path the PNG is written to. Defaults to
            "protein_graph.png".

    Returns:
        The rendered PNG, opened with ``PIL.Image.open`` and fully loaded.
    """
    granularity = 'CA'  # single source of truth for filtering and graph metadata

    # The header is not used here, so skip parsing it.
    raw_df, _ = read_pdb_to_dataframe(pdb_path, parse_header=False)
    process_df = process_dataframe(raw_df, granularity)

    g = initialise_graph_with_metadata(
        protein_df=process_df,
        raw_pdb_df=raw_df,  # Store this for traceability
        pdb_code=pdb_code,
        granularity=granularity,  # Store this so we know what kind of graph we have
    )
    g = add_nodes_to_graph(g)
    g = _add_backbone_edges(g)

    p = plotly_protein_structure_graph(
        g,
        colour_edges_by="kind",
        colour_nodes_by="seq_position",
        label_node_ids=False,
        plot_title="Backbone Protein Graph",
        node_size_multiplier=1,
    )
    p.write_image(image_file, format='png')

    # Image.open is lazy; load the pixel data now so the returned image does not
    # depend on the file staying unchanged (a later call rewrites the same path).
    image = Image.open(image_file)
    image.load()
    return image