Spaces:
Sleeping
Sleeping
"""Utilities for interacting with InterPro.""" | |
import itertools | |
import re | |
from dataclasses import dataclass | |
from enum import IntEnum, auto | |
from functools import cached_property | |
from pathlib import Path | |
import networkx as nx | |
import numpy as np | |
import pandas as pd | |
from esm.utils.constants import esm3 as C | |
def parse_go_terms(text: str) -> list[str]:
    """Extracts every GO accession mentioned in a string.

    Args:
        text: Free-form text that may contain GO terms, for example
            "GO:0008309, GO:1902267".

    Returns:
        The matched accessions in order of appearance, for example
        ['GO:0008309', 'GO:1902267']. The list is empty when nothing matches.
    """
    # GO accessions are written with seven digits; the pattern accepts a run of
    # seven or more digits after the "GO:" prefix.
    go_pattern = re.compile(r"GO:(?:\d{7,})")
    return go_pattern.findall(text)
def _parse_interpro2go(path: str) -> dict[str, list[str]]:
    """Parses InterPro2GO file into map.

    NOTE: this file has a very strange, non-standard format, so it is scanned
    line by line with regular expressions instead of a tabular parser.

    Args:
        path: path to InterPro2GO file from: https://www.ebi.ac.uk/GOA/InterPro2GO

    Returns:
        Mapping from InterPro accession to the list of associated GO terms.
        GO terms of all lines that mention the same accession are concatenated
        in file order (duplicates are kept). Keys are sorted by accession.
    """
    mapping: dict[str, list[str]] = {}
    with Path(path).open("r") as f:
        for line in f:
            # Lines starting with "!" are header/comment lines.
            if line.startswith("!"):
                continue
            interpro_ids = re.findall(r"IPR\d+", line)
            go_ids = parse_go_terms(line)
            # Only keep lines that name exactly one InterPro accession and at
            # least one GO term; anything else is ambiguous or empty.
            if len(interpro_ids) != 1 or not go_ids:
                continue
            # Group all mappings of the same accession together.
            mapping.setdefault(interpro_ids[0], []).extend(go_ids)
    # Sort by accession so the key order is deterministic.
    return dict(sorted(mapping.items()))
class InterProEntryType(IntEnum):
    """Kinds of InterPro entries, with their representation counts:

    Family                    21,942
    Domain                    14,053
    Homologous_superfamily     3,446
    Conserved_site               728
    Repeat                       374
    Active_site                  133
    Binding_site                  75
    PTM                           17
    """

    # Members are listed alphabetically and numbered consecutively from zero;
    # UNKNOWN comes last and is used for accessions without a known entry.
    ACTIVE_SITE = 0
    BINDING_SITE = 1
    CONSERVED_SITE = 2
    DOMAIN = 3
    FAMILY = 4
    HOMOLOGOUS_SUPERFAMILY = 5
    PTM = 6
    REPEAT = 7
    UNKNOWN = 8
@dataclass
class InterProEntry:
    """Represents an InterPro entry."""

    # InterPro accession. Example: IPR000006
    id: str
    # Kind of entry (family, domain, ...).
    type: InterProEntryType
    # Short name. Example: "Metallothionein, vertebrate"
    name: str
    # Optional free-text description.
    description: str | None = None
@dataclass
class InterProRangeAnnotation:
    """Represents an InterPro annotation along a range of residues in a protein."""

    # InterPro accession the range is annotated with.
    interpro_accession: str
    # First residue index of the annotated range.
    start_idx: int
    # End residue index of the annotated range.
    end_idx: int
class InterPro:
    """Convenience class interacting with InterPro ontology/data.

    The file-backed accessors (`interpro2go`, `entries_frame`, `entries` and
    `graph`) are cached properties: each file is parsed on first access and the
    result is reused for the lifetime of the instance.
    """

    def __init__(
        self,
        entries_path: str | None = None,
        hierarchy_path: str | None = None,
        interpro2go_path: str | None = None,
    ):
        """Constructs interface to query InterPro entries.

        Args:
            entries_path: path to the tab-separated InterPro entry list.
            hierarchy_path: path to the InterPro hierarchy file.
            interpro2go_path: path to the InterPro2GO mapping file.

        Any path left as None falls back to the corresponding file under
        `C.data_root()`.
        """
        # Defaults are resolved lazily so that `C.data_root()` is only consulted
        # for paths the caller did not supply.
        self.entries_path = (
            entries_path
            if entries_path is not None
            else str(C.data_root() / C.INTERPRO_ENTRY)
        )
        self.hierarchy_graph_path = (
            hierarchy_path
            if hierarchy_path is not None
            else str(C.data_root() / C.INTERPRO_HIERARCHY)
        )
        self.interpro2go_path = (
            interpro2go_path
            if interpro2go_path is not None
            else str(C.data_root() / C.INTERPRO2GO)
        )

    @cached_property
    def interpro2go(self) -> dict[str, list[str]]:
        """Reads the InterPro to GO term mapping."""
        assert self.interpro2go_path is not None
        return _parse_interpro2go(self.interpro2go_path)

    @cached_property
    def entries_frame(self) -> pd.DataFrame:
        """Loads full InterPro entry set as a DataFrame.

        Columns are
            - "id": str InterPro accession / id.
            - "type": InterProEntryType representing the type of annotation.
            - "name": Short name of the entry.

        Raises:
            AssertionError: if the file lacks one of the ENTRY_AC, ENTRY_TYPE or
                ENTRY_NAME columns.
            KeyError: if an entry type is not a member of InterProEntryType.
        """
        with Path(self.entries_path).open("r") as f:
            df = pd.read_csv(f, sep="\t")
        assert all(
            col in df.columns for col in ["ENTRY_AC", "ENTRY_TYPE", "ENTRY_NAME"]
        )
        df = df.rename(
            columns={
                "ENTRY_AC": "id",
                "ENTRY_TYPE": "type",
                "ENTRY_NAME": "name",
            }
        )
        # Enum members are upper-case, so normalise the type name before lookup.
        df["type"] = df.type.str.upper().apply(
            lambda type_name: InterProEntryType[type_name]
        )
        return df

    @cached_property
    def entries(self) -> dict[str, InterProEntry]:
        """Returns all InterPro entries, keyed by accession."""
        return {
            row.id: InterProEntry(  # type: ignore
                id=row.id,  # type: ignore
                type=row.type,  # type: ignore
                name=row.name,  # type: ignore
            )
            for row in self.entries_frame.itertuples()
        }

    def lookup_name(self, interpro_id: str) -> str | None:
        """Short name / title for an interpro id, or None if the id is unknown."""
        entry = self.entries.get(interpro_id)
        return None if entry is None else entry.name

    def lookup_entry_type(self, interpro_id: str) -> InterProEntryType:
        """Looks up entry-type for an interpro id.

        Returns InterProEntryType.UNKNOWN when the id is not a known entry.
        """
        entry = self.entries.get(interpro_id)
        if entry is None:
            return InterProEntryType.UNKNOWN
        return entry.type

    @cached_property
    def graph(self) -> nx.DiGraph:
        """Reads the InterPro hierarchy as a directed graph.

        Each line of the hierarchy file starts with an accession followed by
        "::"; the accession is prefixed with two dashes per nesting level.
        Every accession becomes a node and edges point from child to parent.
        """
        graph = nx.DiGraph()
        with Path(self.hierarchy_graph_path).open("r") as f:
            # Stack of ancestors of the current line, outermost first.
            parents: list[str] = []
            for line in f:
                # Blank lines carry no accession; skip them instead of adding a
                # whitespace-only node.
                if not line.strip():
                    continue
                ipr = line.split("::", maxsplit=1)[0]
                ipr_strip = ipr.lstrip("-")
                # Two leading dashes per nesting level.
                level = (len(ipr) - len(ipr_strip)) // 2
                # Drop ancestors that are at the same depth or deeper.
                parents = parents[:level]
                graph.add_node(ipr_strip)
                if parents:
                    graph.add_edge(ipr_strip, parents[-1])
                parents.append(ipr_strip)
        return graph
def parse_interpro_features(
    interpro_accessions: list[str],
    interpro_starts: list[int],
    interpro_ends: list[int],
) -> list[InterProRangeAnnotation]:
    """Parses raw InterPro ranges.

    Args:
        interpro_accessions: list of InterPro accessions
        interpro_starts: list of one-indexed inclusive residue locations where the
            annotation from `interpro_accessions` begin.
        interpro_ends: list of one-indexed *inclusive* residue locations where the
            annotation from `interpro_accessions` end.

    Returns:
        Collated InterProRangeAnnotations. NOTE that index conversion will convert range
        bounds to zero-indexed [inclusive, exclusive) start/end indices. Entries
        whose accession is "-" are dropped.

    Raises:
        AssertionError: if the three input lists differ in length.
    """
    assert len(interpro_accessions) == len(interpro_starts) == len(interpro_ends)

    # Residue locations from Uniprot/InterPro are [inclusive, inclusive] and 1-indexed.
    # We want Python's convention of [inclusive, exclusive) and 0-indexing:
    #   end   + 1 (inclusive -> exclusive) - 1 (1 -> 0 indexing) == end
    #   start - 1 (1 -> 0 indexing)
    # so only the start needs shifting. `.tolist()` yields builtin ints, matching
    # the `int` annotations on InterProRangeAnnotation.
    start_idcs = (np.array(interpro_starts).astype(int) - 1).tolist()
    end_idcs = np.array(interpro_ends).astype(int).tolist()

    return [
        InterProRangeAnnotation(
            interpro_accession=interpro_accession,
            start_idx=start_idx,
            end_idx=end_idx,
        )
        for interpro_accession, start_idx, end_idx in zip(
            interpro_accessions, start_idcs, end_idcs
        )
        # NOTE: Skip unintegrated Interpro labels, for now.
        if interpro_accession != "-"
    ]