from typing import Optional, Tuple

import pandas as pd
from biopandas.pdb import PandasPdb
from prody import parsePDBHeader




def read_pdb_to_dataframe(
    pdb_path: str,
    model_index: int = 1,
    parse_header: bool = True,
) -> Tuple[pd.DataFrame, Optional[dict]]:
    """
    Read a PDB file and return its atomic coordinates and metadata as a Pandas DataFrame.

    Args:
        pdb_path (str): Path to a local PDB file to read.
        model_index (int, optional): Index of the model to extract from the PDB file, in case
            it contains multiple models. Defaults to 1.
        parse_header (bool, optional): Whether to parse the PDB header and extract metadata.
            Defaults to True.

    Returns:
        Tuple[pd.DataFrame, Optional[dict]]: A DataFrame with one row per atom (ATOM and
            HETATM records concatenated), and the parsed header dictionary (None if
            parse_header is False).
    """
    atomic_df = PandasPdb().read_pdb(pdb_path)
    if parse_header:
        header = parsePDBHeader(pdb_path)
    else:
        header = None
    atomic_df = atomic_df.get_model(model_index)
    if len(atomic_df.df["ATOM"]) == 0:
        raise ValueError(f"No model found for index: {model_index}")

    return pd.concat([atomic_df.df["ATOM"], atomic_df.df["HETATM"]]), header
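
# Example usage (a sketch; assumes a local PDB file such as "3nir.pdb" has been
# downloaded beforehand, e.g. from the RCSB):
#
#     df, header = read_pdb_to_dataframe("3nir.pdb")
#     print(df[["atom_name", "residue_name", "chain_id", "x_coord", "y_coord", "z_coord"]].head())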

from graphein.protein.graphs import label_node_id

def process_dataframe(df: pd.DataFrame, granularity: str = 'CA') -> pd.DataFrame:
    """
    Process a DataFrame of protein structure data to reduce ambiguity and simplify analysis.

    This function performs the following steps:
    1. Handles alternate locations for an atom, keeping only the first one ('A') if multiple exist.
    2. Assigns a unique node_id to each residue in the DataFrame, using the helper function label_node_id.
    3. Filters the DataFrame to the specified granularity (defaults to 'CA' for alpha carbons).

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame containing protein structure data to process. It is expected to contain
        the columns 'alt_loc' and 'atom_name'.
    granularity : str, optional
        The atom type to retain for each residue. Defaults to 'CA' (alpha carbon).

    Returns
    -------
    pd.DataFrame
        The processed DataFrame, with one row per residue at the chosen granularity.
    """
    # Handle alternative locations for an atom: default to the first one ('A')
    if 'alt_loc' in df.columns:
        df['alt_loc'] = df['alt_loc'].replace('', 'A')
        df = df.loc[df['alt_loc'] == 'A']
    df = label_node_id(df, granularity)
    df = df.loc[df['atom_name'] == granularity]
    return df
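
# Example (a sketch; assumes df comes from read_pdb_to_dataframe above): the full
# atom table is reduced to one alpha-carbon row per residue.
#
#     ca_df = process_dataframe(df, granularity='CA')
#     print(len(df), "atoms ->", len(ca_df), "CA atoms (one per residue)")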


from graphein.protein.graphs import initialise_graph_with_metadata
from graphein.protein.graphs import add_nodes_to_graph
from graphein.protein.visualisation import plotly_protein_structure_graph
from PIL import Image
import networkx as nx

def take_care(pdb_path):
    """
    Build a residue-level (CA) graph from a PDB file, add backbone edges,
    render it with Plotly, and return the rendering as a PIL Image.
    """
    df, header = read_pdb_to_dataframe(pdb_path)
    process_df = process_dataframe(df)

    g = initialise_graph_with_metadata(
        protein_df=process_df,  # processed (CA-only) DataFrame from process_dataframe
        raw_pdb_df=df,          # keep the raw atomic DataFrame for traceability
        pdb_code='3nir',        # PDB code, hardcoded for this example structure
        granularity='CA',       # record what kind of graph this is
    )
    g = add_nodes_to_graph(g)
    
    def add_backbone_edges(G: nx.Graph) -> nx.Graph:
        # Iterate over every chain
        for chain_id in G.graph["chain_ids"]:
            # Collect the residues (nodes) belonging to this chain
            chain_residues = [
                (n, v) for n, v in G.nodes(data=True) if v["chain_id"] == chain_id
            ]
            # Iterate over every residue in the chain
            for i, residue in enumerate(chain_residues):
                # Skip the chain terminus: there is no following residue to bond to
                if i == len(chain_residues) - 1:
                    continue
                next_residue = chain_residues[i + 1]
                # Both residues must be on the same chain
                cond_1 = residue[1]["chain_id"] == next_residue[1]["chain_id"]
                # Residue numbers must be adjacent
                cond_2 = abs(residue[1]["residue_number"] - next_residue[1]["residue_number"]) == 1

                # If both conditions hold, add a peptide (backbone) bond
                # between the current residue and the next one, keyed by node id
                if cond_1 and cond_2:
                    if G.has_edge(residue[0], next_residue[0]):
                        G.edges[residue[0], next_residue[0]]["kind"].add('backbone_bond')
                    else:
                        G.add_edge(residue[0], next_residue[0], kind={'backbone_bond'})
        return G

    g = add_backbone_edges(g)

    p = plotly_protein_structure_graph(
        g,
        colour_edges_by="kind",
        colour_nodes_by="seq_position",
        label_node_ids=False,
        plot_title="Backbone Protein Graph",
        node_size_multiplier=1,
    )
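    # Note: Plotly's write_image requires a static-image export backend
    # (the kaleido package in recent Plotly versions).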
    image_file = "protein_graph.png"
    p.write_image(image_file, format='png')


    # Load the PNG image into a PIL image
    image = Image.open(image_file)

    return image
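

if __name__ == "__main__":
    # Minimal end-to-end sketch. The path below is an assumption: substitute any
    # local PDB file (e.g. "3nir.pdb" downloaded from the RCSB beforehand).
    image = take_care("3nir.pdb")
    image.show()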