Spaces:

aakash0017
/

DrVai-Rag-Testing

No application file

App Files Files Community

DrVai-Rag-Testing / myenv /lib /python3.10 /site-packages /Bio /SearchIO /HmmerIO /hmmer3_text.py

aakash0017

Upload folder using huggingface_hub

b7731cd over 1 year ago

raw

history blame contribute delete

18.7 kB

	# Copyright 2012 by Wibowo Arindrarto. All rights reserved.
	# This file is part of the Biopython distribution and governed by your
	# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
	# Please see the LICENSE file that should have been included as part of this
	# package.
	"""Bio.SearchIO parser for HMMER plain text output format."""

	import re

	from Bio.SearchIO._utils import read_forward
	from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment

	from ._base import _BaseHmmerTextIndexer

	__all__ = ("Hmmer3TextParser", "Hmmer3TextIndexer")


	# precompile regex patterns for faster processing
	# regex for program name capture
	_RE_PROGRAM = re.compile(r"^# (\whmm\w+) :: .$")
	# regex for version string capture
	_RE_VERSION = re.compile(r"# \w+ ([\w+\.]+) .; http.$")
	# regex for option string capture
	_RE_OPT = re.compile(r"^# (.+):\s+(.+)$")
	# regex for parsing query id and length, for parsing
	_QRE_ID_LEN_PTN = r"^Query:\s(.)\s+\[\w=(\d+)\]"
	_QRE_ID_LEN = re.compile(_QRE_ID_LEN_PTN)
	# regex for hsp validation
	_HRE_VALIDATE = re.compile(r"score:\s(-?\d+\.?\d+)\sbits.value:\s(.)")
	# regexes for parsing hsp alignment blocks
	_HRE_ANNOT_LINE = re.compile(r"^(\s+)(.+)\s(\w+)")
	_HRE_ID_LINE = re.compile(r"^(\s+\S+\s+[0-9-]+ )(.+?)(\s+[0-9-]+)")


	class Hmmer3TextParser:
	    """Parser for the HMMER 3.0 text output.

	    The parser keeps the most recently read line in ``self.line`` and moves
	    through the handle one line at a time. Iterating over an instance yields
	    one ``QueryResult`` per query block.
	    """

	    def __init__(self, handle):
	        """Initialize the class.

	        Reads the first line from ``handle`` and parses the preamble into
	        ``self._meta`` right away.
	        """
	        self.handle = handle
	        self.line = read_forward(self.handle)
	        self._meta = self._parse_preamble()

	    def __iter__(self):
	        """Iterate over query results."""
	        yield from self._parse_qresult()

	    @staticmethod
	    def _has_marker(line, marker):
	        """Check if a line starts with a marker, ignoring spacing (PRIVATE).

	        HMMER indents its section markers and table delimiters and separates
	        delimiter columns with runs of spaces. Leading whitespace is dropped
	        and every internal whitespace run in ``line`` is collapsed to a single
	        space before comparing, so ``marker`` must be written without leading
	        whitespace and with single spaces only.
	        """
	        return " ".join(line.split()).startswith(marker)

	    def _read_until(self, bool_func):
	        """Read the file handle until the given function returns True (PRIVATE).

	        Also stops when ``self.line`` becomes empty (treated as end of file).
	        The current line is tested before anything is read.
	        """
	        while self.line and not bool_func(self.line):
	            self.line = read_forward(self.handle)

	    def _parse_preamble(self):
	        """Parse HMMER preamble (lines beginning with '#') (PRIVATE).

	        Returns a dict that may contain ``program``, ``version``, ``target``
	        and one entry per other option line (keyed by the option text).
	        """
	        meta = {}
	        # state flag: True while we are inside the options section, which is
	        # delimited by two dashed ("- - -") lines
	        has_opts = False
	        # no pound sign means we've left the preamble
	        while self.line.startswith("#"):
	            if "- - -" in self.line:
	                if has_opts:
	                    # second dashed line: end of options, stop here without
	                    # consuming another line
	                    break
	                # first dashed line: entering the options section
	                has_opts = True
	            elif has_opts:
	                regx = _RE_OPT.search(self.line)
	                # skip lines that do not look like "# name:   value" instead
	                # of failing on a missing match object
	                if regx:
	                    # any option mentioning 'target' is stored under 'target'
	                    if "target" in regx.group(1):
	                        meta["target"] = regx.group(2).strip()
	                    else:
	                        meta[regx.group(1)] = regx.group(2)
	            else:
	                # banner lines: try parsing program name and version
	                regx = _RE_PROGRAM.search(self.line)
	                if regx:
	                    meta["program"] = regx.group(1)
	                regx = _RE_VERSION.search(self.line)
	                if regx:
	                    meta["version"] = regx.group(1)

	            self.line = read_forward(self.handle)

	        return meta

	    def _parse_qresult(self):
	        """Parse HMMER3 query blocks, yielding QueryResult objects (PRIVATE)."""
	        self._read_until(lambda line: line.startswith("Query:"))

	        while self.line:
	            regx = _QRE_ID_LEN.search(self.line)
	            while not regx:
	                self.line = read_forward(self.handle)
	                if not self.line:
	                    # end of file without another query header; stop instead
	                    # of re-reading the exhausted handle forever
	                    return
	                regx = _QRE_ID_LEN.search(self.line)

	            # get query id and length
	            qid = regx.group(1).strip()
	            # store qresult attributes
	            qresult_attrs = {
	                "seq_len": int(regx.group(2)),
	                "program": self._meta.get("program"),
	                "version": self._meta.get("version"),
	                "target": self._meta.get("target"),
	            }

	            # get description and accession, if they exist
	            qdesc = "<unknown description>"  # placeholder
	            # the emptiness check guards against a truncated file
	            while self.line and not self.line.startswith("Scores for "):
	                self.line = read_forward(self.handle)

	                if self.line.startswith("Accession:"):
	                    acc = self.line.strip().split(" ", 1)[1]
	                    qresult_attrs["accession"] = acc.strip()
	                elif self.line.startswith("Description:"):
	                    qdesc = self.line.strip().split(" ", 1)[1].strip()
	                    qresult_attrs["description"] = qdesc

	            # parse the query hits; stays empty if the block was truncated
	            hit_list = []
	            while self.line and "//" not in self.line:
	                hit_list = self._parse_hit(qid, qdesc)
	                # read through the statistics summary
	                # TODO: parse and store this information?
	                if self.line.startswith("Internal pipeline"):
	                    while self.line and "//" not in self.line:
	                        self.line = read_forward(self.handle)

	            # create qresult, set its attributes and yield
	            qresult = QueryResult(id=qid, hits=hit_list)
	            for attr, value in qresult_attrs.items():
	                setattr(qresult, attr, value)
	            yield qresult
	            self.line = read_forward(self.handle)

	            # Skip line beginning with '# Alignment of', which are output
	            # when running phmmer with the '-A' flag.
	            if self.line.startswith("#"):
	                self.line = self.handle.readline()

	            # HMMER >= 3.1 outputs '[ok]' at the end of all results file,
	            # which means we can break the main loop when we see the line
	            if "[ok]" in self.line:
	                break

	    def _parse_hit(self, qid, qdesc):
	        """Parse a HMMER3 hit block, beginning with the hit table (PRIVATE).

	        Returns the list of hits built by ``_create_hits``, or an empty list
	        when there are no hits or the input ends early.
	        """
	        # get to the end of the hit table delimiter and read one more line
	        self._read_until(
	            lambda line: self._has_marker(line, "------- ------ -----")
	        )
	        self.line = read_forward(self.handle)

	        # assume every hit is in inclusion threshold until the inclusion
	        # threshold line is encountered
	        is_included = True

	        # parse the hit table
	        hit_attr_list = []
	        while True:
	            if not self.line:
	                return []
	            elif self._has_marker(self.line, "------ inclusion"):
	                is_included = False
	                # the line after the threshold marker is parsed as a row below
	                self.line = read_forward(self.handle)
	            # if there are no hits, then there are no hsps
	            # so we forward-read until 'Internal pipeline..'
	            elif self._has_marker(
	                self.line, "[No hits detected that satisfy reporting"
	            ):
	                while True:
	                    self.line = read_forward(self.handle)
	                    # also stop at end of file so truncated input cannot hang
	                    if not self.line or self.line.startswith("Internal pipeline"):
	                        assert len(hit_attr_list) == 0
	                        return []
	            elif self.line.startswith("Domain annotation for each "):
	                return self._create_hits(hit_attr_list, qid, qdesc)
	            # entering hit results row
	            # parse the columns into a list
	            row = [x for x in self.line.strip().split(" ") if x]
	            # join the description words if it's >1 word
	            if len(row) > 10:
	                row[9] = " ".join(row[9:])
	            # if there's no description, set it to an empty string
	            elif len(row) < 10:
	                row.append("")
	                assert len(row) == 10
	            # collect the hit attributes; Hit objects are created later, once
	            # their HSPs have been parsed
	            hit_attrs = {
	                "id": row[8],
	                "query_id": qid,
	                "evalue": float(row[0]),
	                "bitscore": float(row[1]),
	                "bias": float(row[2]),
	                # row[3:6] is not parsed, since the info is available
	                # at the HSP level
	                "domain_exp_num": float(row[6]),
	                "domain_obs_num": int(row[7]),
	                "description": row[9],
	                "is_included": is_included,
	            }
	            hit_attr_list.append(hit_attrs)

	            self.line = read_forward(self.handle)

	    def _create_hits(self, hit_attrs, qid, qdesc):
	        """Parse a HMMER3 hsp block, beginning with the hsp table (PRIVATE).

	        ``hit_attrs`` is the list of attribute dicts collected from the hit
	        table. It is consumed from the front, one dict per ``>>`` section, and
	        is expected to be empty once the statistics summary is reached.
	        Returns the list of created Hit objects.
	        """
	        # read through until the beginning of the hsp block
	        self._read_until(
	            lambda line: line.startswith(("Internal pipeline", ">>"))
	        )
	        # the program name decides which side (hit or query) is the HMM
	        program = self._meta.get("program")

	        # start parsing the hsp block
	        hit_list = []
	        while True:
	            if self.line.startswith("Internal pipeline"):
	                # by this time we should've emptied the hit attr list
	                assert len(hit_attrs) == 0
	                return hit_list
	            assert self.line.startswith(">>")
	            # ">> <hit id> <description>"; partition() tolerates a missing
	            # description where unpacking a split() result would raise
	            hid, _, hdesc = self.line[len(">> ") :].partition(" ")
	            hid = hid.strip()
	            hdesc = hdesc.strip()

	            # read through the hsp table header and move one more line
	            self._read_until(
	                lambda line: self._has_marker(line, "--- ------ ----- --------")
	                or self._has_marker(line, "[No individual domains")
	            )
	            self.line = read_forward(self.handle)

	            # parse the hsp table for the current hit
	            hsp_list = []
	            while True:
	                # break out of hsp parsing if there are no hits, it's the last
	                # hsp or it's the start of a new hit
	                if (
	                    self._has_marker(self.line, "[No targets detected that satisfy")
	                    or self._has_marker(self.line, "[No individual domains")
	                    or self.line.startswith("Internal pipeline statistics summary:")
	                    or self._has_marker(self.line, "Alignments for each domain:")
	                    or self.line.startswith(">>")
	                ):
	                    hit_attr = hit_attrs.pop(0)
	                    hit = Hit(hsp_list)
	                    for attr, value in hit_attr.items():
	                        if attr == "description":
	                            # keep the description the hit already has when
	                            # the hit table value is only a prefix of it
	                            cur_val = getattr(hit, attr)
	                            if cur_val and value and cur_val.startswith(value):
	                                continue
	                        setattr(hit, attr, value)
	                    if not hit:
	                        hit.query_description = qdesc
	                    hit_list.append(hit)
	                    break

	                parsed = [x for x in self.line.strip().split(" ") if x]
	                assert len(parsed) == 16
	                # parsed column order, as used below:
	                #  0 index        1 is_included  2 bitscore   3 bias
	                #  4 evalue_cond  5 evalue       6 hmmfrom    7 hmmto
	                #  8 hmm endtype  9 alifrom     10 alito     11 ali endtype
	                # 12 envfrom     13 envto       14 env endtype  15 acc_avg
	                frag = HSPFragment(hid, qid)
	                # set query and hit descriptions if they are defined /
	                # nonempty string
	                if qdesc:
	                    frag.query_description = qdesc
	                if hdesc:
	                    frag.hit_description = hdesc
	                # HMMER3 results are always protein
	                frag.molecule_type = "protein"
	                # depending on whether the program is hmmsearch, hmmscan, or
	                # phmmer, {hmm,ali}{from,to} can either be hit_{from,to} or
	                # query_{from,to}; for hmmscan, hit is the hmm profile and
	                # query is the sequence
	                if program == "hmmscan":
	                    # adjust 'from' and 'to' coordinates to 0-based ones
	                    frag.hit_start = int(parsed[6]) - 1
	                    frag.hit_end = int(parsed[7])
	                    frag.query_start = int(parsed[9]) - 1
	                    frag.query_end = int(parsed[10])
	                elif program in ("hmmsearch", "phmmer"):
	                    # adjust 'from' and 'to' coordinates to 0-based ones
	                    frag.hit_start = int(parsed[9]) - 1
	                    frag.hit_end = int(parsed[10])
	                    frag.query_start = int(parsed[6]) - 1
	                    frag.query_end = int(parsed[7])
	                # strand is always 0, since HMMER now only handles protein
	                frag.hit_strand = frag.query_strand = 0

	                hsp = HSP([frag])
	                hsp.domain_index = int(parsed[0])
	                hsp.is_included = parsed[1] == "!"
	                hsp.bitscore = float(parsed[2])
	                hsp.bias = float(parsed[3])
	                hsp.evalue_cond = float(parsed[4])
	                hsp.evalue = float(parsed[5])
	                # end type markers follow the same hmm/ali <-> hit/query
	                # mapping as the coordinates above
	                if program == "hmmscan":
	                    hsp.hit_endtype = parsed[8]
	                    hsp.query_endtype = parsed[11]
	                elif program in ("hmmsearch", "phmmer"):
	                    hsp.hit_endtype = parsed[11]
	                    hsp.query_endtype = parsed[8]
	                # adjust 'from' and 'to' coordinates to 0-based ones
	                hsp.env_start = int(parsed[12]) - 1
	                hsp.env_end = int(parsed[13])
	                hsp.env_endtype = parsed[14]
	                hsp.acc_avg = float(parsed[15])

	                hsp_list.append(hsp)
	                self.line = read_forward(self.handle)

	            # parse the hsp alignments
	            if self._has_marker(self.line, "Alignments for each domain:"):
	                self._parse_aln_block(hid, hit.hsps)

	    def _parse_aln_block(self, hid, hsp_list):
	        """Parse a HMMER3 HSP alignment block (PRIVATE).

	        Fills the first fragment of each HSP in ``hsp_list`` with its aligned
	        sequences and annotations, in domain order, and returns ``hsp_list``.
	        ``hid`` is accepted for interface compatibility and is not used.
	        """
	        self.line = read_forward(self.handle)
	        program = self._meta.get("program")
	        block_end = (">>", "Internal pipeline")
	        dom_counter = 0
	        while True:
	            # an empty line here means end of file (truncated input)
	            if not self.line or self.line.startswith(block_end):
	                return hsp_list
	            assert self._has_marker(self.line, "== domain %i" % (dom_counter + 1))
	            # alias the fragment to a local var; we are changing the attrs of
	            # the actual object inside the hsp, not a copy
	            frag = hsp_list[dom_counter][0]
	            # XXX: the domain header could be validated against the hsp table
	            # with _HRE_VALIDATE here, at the cost of one more regex per domain
	            hmmseq = ""
	            aliseq = ""
	            annot = {}
	            # readline() rather than read_forward(): lines are consumed one by
	            # one inside the alignment, including whitespace-only ones
	            self.line = self.handle.readline()

	            # parse all the alignment blocks in the hsp
	            while True:
	                # check for hit or query line
	                # we don't check for the hit or query id specifically
	                # to anticipate special cases where query id == hit id
	                regx = _HRE_ID_LINE.search(self.line)
	                if regx:
	                    # the first hit/query line we encounter is the hmmseq
	                    if len(hmmseq) == len(aliseq):
	                        hmmseq += regx.group(2)
	                    # and for subsequent lines, len(hmmseq) is either
	                    # > or == len(aliseq)
	                    elif len(hmmseq) > len(aliseq):
	                        aliseq += regx.group(2)
	                    assert len(hmmseq) >= len(aliseq)
	                # check for start of new domain, end of the hit's alignments,
	                # or end of file (readline() keeps returning '' there, which
	                # would otherwise loop forever)
	                elif (
	                    not self.line
	                    or self._has_marker(self.line, "== domain")
	                    or self.line.startswith(block_end)
	                ):
	                    frag.aln_annotation = annot
	                    if program == "hmmscan":
	                        frag.hit = hmmseq
	                        frag.query = aliseq
	                    elif program in ("hmmsearch", "phmmer"):
	                        frag.hit = aliseq
	                        frag.query = hmmseq
	                    dom_counter += 1
	                    break
	                # otherwise check if it's an annotation line and parse it
	                # len(hmmseq) is only != len(aliseq) when the cursor is parsing
	                # the similarity character. Since we're not parsing that, we
	                # check for when the condition is False (i.e. when it's ==)
	                elif len(hmmseq) == len(aliseq):
	                    regx = _HRE_ANNOT_LINE.search(self.line)
	                    if regx:
	                        annot_name = regx.group(3)
	                        annot[annot_name] = annot.get(annot_name, "") + regx.group(2)

	                self.line = self.handle.readline()


	class Hmmer3TextIndexer(_BaseHmmerTextIndexer):
	    """Indexer class for HMMER plain text output."""

	    # parser class paired with this indexer; presumably used by the base
	    # indexer to parse a single record -- confirm against _BaseHmmerTextIndexer
	    _parser = Hmmer3TextParser
	    # byte markers that open and close one query result block
	    qresult_start = b"Query: "
	    qresult_end = b"//"

	    def __iter__(self):
	        """Iterate over Hmmer3TextIndexer; yields query results' key, offsets, 0.

	        Each yielded item is ``(query id, start offset, 0)``, emitted when the
	        end marker of a query result block is reached.
	        """
	        handle = self._handle
	        handle.seek(0)
	        record_start = handle.tell()
	        # the handle is read as bytes, so the id pattern is compiled as bytes
	        id_pattern = re.compile(_QRE_ID_LEN_PTN.encode())

	        while True:
	            line = read_forward(handle)
	            line_end = handle.tell()

	            # an empty read means there is nothing left to index
	            if not line:
	                break

	            if line.startswith(self.qresult_start):
	                match = id_pattern.search(line)
	                qresult_key = match.group(1).strip()
	                # the record starts where this (start mark) line begins
	                record_start = line_end - len(line)
	            elif line.startswith(self.qresult_end):
	                yield qresult_key.decode(), record_start, 0
	                record_start = line_end


	# When executed directly instead of being imported, run the doctests.
	if __name__ == "__main__":
	    from Bio._utils import run_doctest

	    run_doctest()