# Spaces: anonymousforpaper / M3Site (Sleeping)
#
# File size: 8,058 Bytes
#
# 224a33f

"""Utilities for interacting with InterPro."""
import itertools
import re
from dataclasses import dataclass
from enum import IntEnum, auto
from functools import cached_property
from pathlib import Path

import networkx as nx
import numpy as np
import pandas as pd

from esm.utils.constants import esm3 as C


def parse_go_terms(text: str) -> list[str]:
    """Extracts every GO term identifier that occurs in a string.

    A match is the literal prefix "GO:" followed by a run of digits. GO
    identifiers are written with 7 digits; the pattern accepts runs of seven
    or more digits and returns the whole run.

    Args:
        text: Free text that may contain GO terms.
          Example: "GO:0008309, GO:1902267"
    Returns:
        The GO terms in order of appearance (duplicates are kept).
        Example: ['GO:0008309', 'GO:1902267']
    """
    go_term_pattern = re.compile(r"GO:(?:\d{7,})")
    return [match.group(0) for match in go_term_pattern.finditer(text)]


def _parse_interpro2go(path: str) -> dict[str, list[str]]:
    """Parses InterPro2GO file into map.

    NOTE: this file has a very strange, non-standard format, so it is scanned
    line by line with regular expressions rather than parsed as a table.

    A line contributes to the result only if it
      * does not start with "!" (those lines are skipped),
      * mentions exactly one InterPro accession (``IPR`` followed by digits), and
      * contains at least one GO term.

    Args:
        path: path to InterPro2GO file from: https://www.ebi.ac.uk/GOA/InterPro2GO
    Returns:
        Mapping from InterPro to list of associated GO terms. GO terms of all
        lines that share an accession are concatenated in file order; keys are
        ordered by accession. An input without any usable line yields ``{}``.
    """
    go_terms_by_accession: dict[str, list[str]] = {}
    with Path(path).open("r") as f:
        for line in f:
            if line.startswith("!"):
                continue
            interpro_ids = re.findall(r"IPR\d+", line)
            # Lines naming zero or several accessions are ambiguous; drop them.
            if len(interpro_ids) != 1:
                continue
            go_ids = parse_go_terms(line)
            if not go_ids:
                continue
            # Group all mappings for the same accession into a single list.
            go_terms_by_accession.setdefault(interpro_ids[0], []).extend(go_ids)

    # Emit keys in sorted order so the result does not depend on line order.
    return {
        interpro_id: go_terms_by_accession[interpro_id]
        for interpro_id in sorted(go_terms_by_accession)
    }


class InterProEntryType(IntEnum):
    """Kinds of InterPro entry, with their representation counts:

    Family                    21,942
    Domain                    14,053
    Homologous_superfamily     3,446
    Conserved_site               728
    Repeat                       374
    Active_site                  133
    Binding_site                  75
    PTM                           17

    Members are numbered consecutively from 0 in the order listed below;
    UNKNOWN is the last value.
    """

    ACTIVE_SITE = 0
    BINDING_SITE = 1
    CONSERVED_SITE = 2
    DOMAIN = 3
    FAMILY = 4
    HOMOLOGOUS_SUPERFAMILY = 5
    PTM = 6
    REPEAT = 7
    UNKNOWN = 8


@dataclass
class InterProEntry:
    """Represents an InterPro entry.

    A plain record: accession, entry type, short name and an optional
    description.
    """

    id: str  # InterPro accession. Example: IPR000006
    type: InterProEntryType  # Kind of entry, e.g. FAMILY or DOMAIN.
    name: str  # Short name. Example: "Metallothionein, vertebrate"
    description: str | None = None  # Optional description; None when not provided.


@dataclass(frozen=True)
class InterProRangeAnnotation:
    """Represents an InterPro annotation along a range of residues in a protein.

    Instances are frozen, so fields cannot be reassigned after construction.
    """

    # InterPro accession the range is annotated with.
    interpro_accession: str
    # Start of the range. `parse_interpro_features` emits zero-indexed,
    # inclusive starts.
    start_idx: int
    # End of the range. `parse_interpro_features` emits zero-indexed,
    # exclusive ends.
    end_idx: int

class InterPro:
    """Convenience class interacting with InterPro ontology/data.

    All data is loaded lazily: each of `interpro2go`, `entries_frame`,
    `entries` and `graph` reads its file on first access and caches the result
    on the instance.
    """

    def __init__(
        self,
        entries_path: str | None = None,
        hierarchy_path: str | None = None,
        interpro2go_path: str | None = None,
    ):
        """Constructs interface to query InterPro entries.

        Args:
            entries_path: Tab-separated InterPro entry list. Defaults to
              `C.data_root() / C.INTERPRO_ENTRY`.
            hierarchy_path: InterPro hierarchy file. Defaults to
              `C.data_root() / C.INTERPRO_HIERARCHY`.
            interpro2go_path: InterPro2GO mapping file. Defaults to
              `C.data_root() / C.INTERPRO2GO`.
        """
        # Resolve defaults only for paths the caller did not supply, so that
        # `C.data_root()` is not consulted when every path is given explicitly.
        if entries_path is None:
            entries_path = str(C.data_root() / C.INTERPRO_ENTRY)
        if hierarchy_path is None:
            hierarchy_path = str(C.data_root() / C.INTERPRO_HIERARCHY)
        if interpro2go_path is None:
            interpro2go_path = str(C.data_root() / C.INTERPRO2GO)

        self.entries_path = entries_path
        self.hierarchy_graph_path = hierarchy_path
        self.interpro2go_path = interpro2go_path

    @cached_property
    def interpro2go(self) -> dict[str, list[str]]:
        """Reads the InterPro to GO term mapping."""
        assert self.interpro2go_path is not None
        return _parse_interpro2go(self.interpro2go_path)

    @cached_property
    def entries_frame(self) -> pd.DataFrame:
        """Loads full InterPro entry set as a DataFrame.

        The file must provide the columns ENTRY_AC, ENTRY_TYPE and ENTRY_NAME,
        which are renamed as follows (any other column is kept as is):
            - "id": str InterPro accession / id.
            - "type": InterProEntryType representing the type of annotation.
              NOTE(review): pandas may store these values as plain integers
              rather than enum members — confirm before relying on the dtype.
            - "name": Short name of the entry.

        Raises:
            AssertionError: if a required column is missing.
            KeyError: if an entry type (upper-cased) is not an
              InterProEntryType member name.
        """
        with Path(self.entries_path).open("r") as f:
            df = pd.read_csv(f, sep="\t")

        required_columns = ["ENTRY_AC", "ENTRY_TYPE", "ENTRY_NAME"]
        missing_columns = [col for col in required_columns if col not in df.columns]
        assert not missing_columns, f"Missing columns: {missing_columns}"

        df = df.rename(
            columns={
                "ENTRY_AC": "id",
                "ENTRY_TYPE": "type",
                "ENTRY_NAME": "name",
            }
        )
        # Entry types are matched against enum member names, which are upper case.
        df["type"] = (
            df["type"].str.upper().apply(lambda type_name: InterProEntryType[type_name])
        )
        return df

    @cached_property
    def entries(self) -> dict[str, InterProEntry]:
        """Returns all InterPro entries, keyed by InterPro accession."""
        frame = self.entries_frame
        return {
            entry_id: InterProEntry(
                id=entry_id,
                # Coerce back to the enum in case the column holds plain integers.
                type=InterProEntryType(entry_type),
                name=entry_name,
            )
            for entry_id, entry_type, entry_name in zip(
                frame["id"], frame["type"], frame["name"]
            )
        }

    def lookup_name(self, interpro_id: str) -> str | None:
        """Short name / title for an interpro id, or None if the id is unknown."""
        entry = self.entries.get(interpro_id)
        return None if entry is None else entry.name

    def lookup_entry_type(self, interpro_id: str) -> InterProEntryType:
        """Looks up entry-type for an interpro id.

        Returns InterProEntryType.UNKNOWN when the id is not a known entry.
        """
        entry = self.entries.get(interpro_id)
        return InterProEntryType.UNKNOWN if entry is None else entry.type

    @cached_property
    def graph(self) -> nx.DiGraph:
        """Reads the InterPro hierarchy into a directed graph.

        Each non-blank line is expected to start with an accession, optionally
        preceded by "-" characters and followed by "::". Two leading dashes
        correspond to one level of nesting. Every accession becomes a node and
        every nested accession gets an edge pointing from child to parent.
        """
        graph = nx.DiGraph()
        with Path(self.hierarchy_graph_path).open("r") as f:
            # Accessions on the path from the root down to the previous line.
            ancestors: list[str] = []
            for raw_line in f:
                # Drop the line terminator so it can never leak into a node
                # name, and ignore blank lines instead of adding empty nodes.
                line = raw_line.rstrip()
                if not line:
                    continue
                prefixed_accession = line.split("::", maxsplit=1)[0]
                accession = prefixed_accession.lstrip("-")
                level = (len(prefixed_accession) - len(accession)) // 2
                # Keep only the ancestors above the current nesting level.
                ancestors = ancestors[:level]
                graph.add_node(accession)
                if ancestors:
                    graph.add_edge(accession, ancestors[-1])
                ancestors.append(accession)
        return graph


def parse_interpro_features(
    interpro_accessions: list[str],
    interpro_starts: list[int],
    interpro_ends: list[int],
) -> list[InterProRangeAnnotation]:
    """Parses raw InterPro ranges.

    Args:
        interpro_accessions: list of InterPro accessions
        interpro_starts: list of one-indexed inclusive residue locations where the
          annotation from `interpro_accessions` begin.
        interpro_ends: list of one-indexed *inclusive* residue locations where the
          annotation from `interpro_accessions` end.
    Returns:
        Collated InterProRangeAnnotations, in input order, with unintegrated
        entries (accession "-") left out. NOTE that index conversion will
        convert range bounds to zero-indexed [inclusive, exclusive) start/end
        indices, stored as plain Python ints.
    Raises:
        AssertionError: if the three input lists differ in length.
    """
    assert len(interpro_accessions) == len(interpro_starts) == len(interpro_ends)

    # Residue locations from Uniprot/InterPro are [inclusive, inclusive] and
    # 1-indexed. We want Python's convention of [inclusive, exclusive) and
    # 0-indexing. The conversion ends up being:
    #   ```python
    #   end_idcs += 1  # [inclusive, inclusive] -> [inclusive, exclusive)
    #   start_idcs -= 1  # 1 -> 0 indexing
    #   end_idcs -= 1  # 1 -> 0 indexing
    #   ```
    # Which simply results in shifting the starts and leaving the ends alone.
    # `.tolist()` turns the NumPy scalars into Python ints so the annotations
    # hold the type their fields declare.
    start_idcs = (np.array(interpro_starts).astype(int) - 1).tolist()
    end_idcs = np.array(interpro_ends).astype(int).tolist()

    return [
        InterProRangeAnnotation(
            interpro_accession=interpro_accession,
            start_idx=start_idx,
            end_idx=end_idx,
        )
        for interpro_accession, start_idx, end_idx in zip(
            interpro_accessions, start_idcs, end_idcs
        )
        # NOTE: Skip unintegrated Interpro labels, for now.
        if interpro_accession != "-"
    ]