Spaces:

anonymousforpaper
/

M3Site

Sleeping

App Files Files Community

M3Site / esm /utils /function /interpro.py

anonymousforpaper

Upload 103 files

224a33f verified 2 months ago

raw

history blame contribute delete

8.06 kB

	"""Utilities for interacting with InterPro."""
	import itertools
	import re
	from dataclasses import dataclass
	from enum import IntEnum, auto
	from functools import cached_property
	from pathlib import Path

	import networkx as nx
	import numpy as np
	import pandas as pd

	from esm.utils.constants import esm3 as C


	def parse_go_terms(text: str) -> list[str]:
	    """Extracts every GO term identifier occurring in a string.

	    A match is the literal prefix "GO:" followed by a run of seven or more digits
	    (GO identifiers themselves are seven digits long).

	    Args:
	        text: Free text that may mention GO terms.
	            Example: "GO:0008309, GO:1902267"
	    Returns:
	        The matched identifiers, in order of appearance.
	        Example: ['GO:0008309', 'GO:1902267']
	    """
	    go_term_pattern = re.compile(r"GO:(?:\d{7,})")
	    return [match.group(0) for match in go_term_pattern.finditer(text)]


	def _parse_interpro2go(path: str) -> dict[str, list[str]]:
	    """Parses InterPro2GO file into map.

	    NOTE: this file has a very strange, non-standard format, so it is scanned line
	    by line with regular expressions instead of a structured parser.

	    Lines starting with "!" are treated as comments. A line contributes to the
	    mapping only if it mentions exactly one InterPro accession ("IPR" followed by
	    digits) and at least one GO term.

	    Args:
	        path: path to InterPro2GO file from: https://www.ebi.ac.uk/GOA/InterPro2GO
	    Returns:
	        Mapping from InterPro accession to the list of associated GO terms. GO
	        terms appear in file order (duplicates are kept) and the keys are sorted.
	    """
	    interpro_pattern = re.compile(r"IPR\d+")
	    mapping: dict[str, list[str]] = {}

	    with Path(path).open("r") as f:
	        for line in f:
	            if line.startswith("!"):
	                continue
	            interpro_ids = interpro_pattern.findall(line)
	            go_ids = parse_go_terms(line)
	            # Skip lines that are ambiguous (zero or several accessions) or that
	            # carry no GO term at all.
	            if len(interpro_ids) != 1 or not go_ids:
	                continue
	            # Group all mappings for the same accession together.
	            mapping.setdefault(interpro_ids[0], []).extend(go_ids)

	    # Return the accessions in sorted order.
	    return {interpro_id: mapping[interpro_id] for interpro_id in sorted(mapping)}


	class InterProEntryType(IntEnum):
	    """Kinds of InterPro entries, numbered consecutively from zero.

	    Representation counts per InterPro type:

	    Family                  21,942
	    Domain                  14,053
	    Homologous_superfamily   3,446
	    Conserved_site             728
	    Repeat                     374
	    Active_site                133
	    Binding_site                75
	    PTM                         17
	    """

	    # Values are written out explicitly; members are in alphabetical order with
	    # UNKNOWN last.
	    ACTIVE_SITE = 0
	    BINDING_SITE = 1
	    CONSERVED_SITE = 2
	    DOMAIN = 3
	    FAMILY = 4
	    HOMOLOGOUS_SUPERFAMILY = 5
	    PTM = 6
	    REPEAT = 7
	    UNKNOWN = 8


	@dataclass
	class InterProEntry:
	    """Represents an InterPro entry."""

	    id: str  # InterPro accession. Example: IPR000006
	    type: InterProEntryType  # Kind of entry (family, domain, ...).
	    name: str  # Short name. Example: "Metallothionein, vertebrate"
	    description: str | None = None  # Optional description; None when not given.


	@dataclass(frozen=True)
	class InterProRangeAnnotation:
	    """An InterPro annotation covering a span of residues in a protein.

	    Instances are immutable (frozen dataclass) and therefore hashable.
	    """

	    # InterPro accession the annotation refers to.
	    interpro_accession: str
	    # Residue index at which the annotated span starts.
	    start_idx: int
	    # Residue index at which the annotated span ends.
	    end_idx: int


	class InterPro:
	    """Convenience class interacting with InterPro ontology/data.

	    The InterPro-to-GO mapping, the entry table and the hierarchy graph are each
	    read from disk on first access and then cached on the instance.
	    """

	    def __init__(
	        self,
	        entries_path: str | None = None,
	        hierarchy_path: str | None = None,
	        interpro2go_path: str | None = None,
	    ):
	        """Constructs interface to query InterPro entries.

	        Args:
	            entries_path: path to the tab-separated InterPro entry list. When None,
	                `C.data_root() / C.INTERPRO_ENTRY` is used.
	            hierarchy_path: path to the InterPro hierarchy file. When None,
	                `C.data_root() / C.INTERPRO_HIERARCHY` is used.
	            interpro2go_path: path to the InterPro2GO file. When None,
	                `C.data_root() / C.INTERPRO2GO` is used.
	        """
	        # Defaults are resolved lazily: C.data_root() is only called for paths the
	        # caller did not supply.
	        self.entries_path = (
	            entries_path
	            if entries_path is not None
	            else str(C.data_root() / C.INTERPRO_ENTRY)
	        )
	        self.hierarchy_graph_path = (
	            hierarchy_path
	            if hierarchy_path is not None
	            else str(C.data_root() / C.INTERPRO_HIERARCHY)
	        )
	        self.interpro2go_path = (
	            interpro2go_path
	            if interpro2go_path is not None
	            else str(C.data_root() / C.INTERPRO2GO)
	        )

	    @cached_property
	    def interpro2go(self) -> dict[str, list[str]]:
	        """Reads the InterPro to GO term mapping."""
	        assert self.interpro2go_path is not None
	        return _parse_interpro2go(self.interpro2go_path)

	    @cached_property
	    def entries_frame(self) -> pd.DataFrame:
	        """Loads full InterPro entry set as a DataFrame.

	        The file must provide the columns "ENTRY_AC", "ENTRY_TYPE" and
	        "ENTRY_NAME"; they are renamed so that the returned columns are
	        - "id": str InterPro accession / id.
	        - "type": InterProEntryType representing the type of annotation.
	        - "name": Short name of the entry.

	        Raises:
	            AssertionError: if one of the required columns is missing.
	            KeyError: if an upper-cased ENTRY_TYPE value is not the name of an
	                InterProEntryType member.
	        """
	        with Path(self.entries_path).open("r") as f:
	            df = pd.read_csv(f, sep="\t")
	        assert all(
	            col in df.columns for col in ["ENTRY_AC", "ENTRY_TYPE", "ENTRY_NAME"]
	        )
	        df = df.rename(
	            columns={
	                "ENTRY_AC": "id",
	                "ENTRY_TYPE": "type",
	                "ENTRY_NAME": "name",
	            }
	        )
	        # Type names are matched against enum member names after upper-casing,
	        # e.g. "Active_site" -> InterProEntryType.ACTIVE_SITE.
	        df["type"] = df["type"].str.upper().apply(
	            lambda type_name: InterProEntryType[type_name]
	        )
	        return df

	    @cached_property
	    def entries(self) -> dict[str, InterProEntry]:
	        """Returns all InterPro entries, keyed by InterPro id."""
	        return {
	            row.id: InterProEntry(  # type: ignore
	                id=row.id,  # type: ignore
	                type=row.type,  # type: ignore
	                name=row.name,  # type: ignore
	            )
	            for row in self.entries_frame.itertuples()
	        }

	    def lookup_name(self, interpro_id: str) -> str | None:
	        """Short name / title for an interpro id, or None if the id is unknown."""
	        entry = self.entries.get(interpro_id)
	        return None if entry is None else entry.name

	    def lookup_entry_type(self, interpro_id: str) -> InterProEntryType:
	        """Looks up entry-type for an interpro id.

	        Returns InterProEntryType.UNKNOWN when the id is not a known entry.
	        """
	        entry = self.entries.get(interpro_id)
	        return InterProEntryType.UNKNOWN if entry is None else entry.type

	    @cached_property
	    def graph(self) -> nx.DiGraph:
	        """Reads the InterPro hierarchy into a directed graph.

	        Each line is expected to start with an accession, optionally prefixed by
	        "-" characters that encode nesting depth (two per level) and followed by
	        "::"-separated fields. Every accession becomes a node, and an edge is added
	        from each entry to its direct parent.
	        """
	        graph = nx.DiGraph()
	        with Path(self.hierarchy_graph_path).open("r") as f:
	            # Stack of ancestors of the line currently being processed.
	            parents = []
	            for line in f:
	                # Drop the trailing newline and ignore blank lines so that they do
	                # not end up as bogus nodes.
	                line = line.strip()
	                if not line:
	                    continue
	                ipr = line.split("::", maxsplit=1)[0]
	                ipr_strip = ipr.lstrip("-")
	                level = (len(ipr) - len(ipr_strip)) // 2
	                # Unwind the stack to the ancestors of this nesting level.
	                parents = parents[:level]
	                graph.add_node(ipr_strip)
	                if parents:
	                    graph.add_edge(ipr_strip, parents[-1])
	                parents.append(ipr_strip)
	        return graph


	def parse_interpro_features(
	    interpro_accessions: list[str],
	    interpro_starts: list[int],
	    interpro_ends: list[int],
	) -> list[InterProRangeAnnotation]:
	    """Parses raw InterPro ranges.

	    Args:
	        interpro_accessions: list of InterPro accessions
	        interpro_starts: list of one-indexed inclusive residue locations where the
	            annotation from `interpro_accessions` begin.
	        interpro_ends: list of one-indexed inclusive residue locations where the
	            annotation from `interpro_accessions` end.
	    Returns:
	        Collated InterProRangeAnnotations. NOTE that index conversion will convert
	        range bounds to zero-indexed [inclusive, exclusive) start/end indices.
	        Entries whose accession is "-" are omitted. Indices are plain Python ints.
	    Raises:
	        AssertionError: if the three input lists differ in length.
	    """
	    assert len(interpro_accessions) == len(interpro_starts) == len(interpro_ends)

	    # Residue locations from Uniprot/InterPro are [inclusive, inclusive] and
	    # 1-indexed, while we want Python's [inclusive, exclusive) 0-indexed convention.
	    # The conversion would be:
	    #   end   += 1  # [inclusive, inclusive] -> [inclusive, exclusive)
	    #   start -= 1  # 1 -> 0 indexing
	    #   end   -= 1  # 1 -> 0 indexing
	    # which simplifies to shifting only the start.
	    # `.tolist()` turns the numpy scalars into Python ints so that the dataclass
	    # fields really hold `int` values.
	    start_idcs = (np.asarray(interpro_starts).astype(int) - 1).tolist()
	    end_idcs = np.asarray(interpro_ends).astype(int).tolist()

	    return [
	        InterProRangeAnnotation(
	            interpro_accession=interpro_accession,
	            start_idx=start_idx,
	            end_idx=end_idx,
	        )
	        for interpro_accession, start_idx, end_idx in zip(
	            interpro_accessions, start_idcs, end_idcs
	        )
	        # NOTE: Skip unintegrated Interpro labels, for now.
	        if interpro_accession != "-"
	    ]