Spaces:

aakash0017
/

DrVai-Rag-Testing

No application file

App Files Files Community

DrVai-Rag-Testing / myenv /lib /python3.10 /site-packages /Bio /SearchIO /HmmerIO /hmmer2_text.py

aakash0017

Upload folder using huggingface_hub

b7731cd over 1 year ago

raw

history blame contribute delete

12.8 kB

	# Copyright 2012 by Kai Blin. All rights reserved.
	# This file is part of the Biopython distribution and governed by your
	# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
	# Please see the LICENSE file that should have been included as part of this
	# package.
	"""Bio.SearchIO parser for HMMER 2 text output."""

	import re

	from Bio.SearchIO._utils import read_forward
	from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment

	from ._base import _BaseHmmerTextIndexer

	__all__ = ("Hmmer2TextParser", "Hmmer2TextIndexer")


	# Header line of a single alignment block: "<id>: domain <i> of <n>".
	# Group 1 is the id used to look the hit up in the query result, group 2 the
	# 1-based domain index, and group 3 the number of domains reported for it.
	_HSP_ALIGN_LINE = re.compile(r"(\S+):\s+domain (\d+) of (\d+)")


	class _HitPlaceholder:
	    """Holder for hit-table values until the matching HSPs have been parsed.

	    Instances are plain attribute bags; the parser assigns ``id_``,
	    ``evalue``, ``bitscore``, ``description`` and ``domain_obs_num`` on them
	    before ``createHit`` is called.
	    """

	    def createHit(self, hsp_list):
	        """Return a Hit built from ``hsp_list`` carrying the stored values.

	        The description is only copied when it is non-empty.
	        """
	        new_hit = Hit(hsp_list)
	        for attr_name in ("id_", "evalue", "bitscore"):
	            setattr(new_hit, attr_name, getattr(self, attr_name))
	        if self.description:
	            new_hit.description = self.description
	        new_hit.domain_obs_num = self.domain_obs_num
	        return new_hit


	class Hmmer2TextParser:
	    """Iterator for the HMMER 2.0 text output.

	    The preamble is parsed when the object is created; iterating then yields
	    one query result per ``Query`` block. Lines are consumed with
	    ``read_next`` and may be un-read with ``push_back``. The line most
	    recently handed out by ``read_next`` is available as ``self.line``.
	    """

	    def __init__(self, handle):
	        """Initialize the class.

	        :param handle: file-like object whose ``readline()`` returns text.
	        """
	        self.handle = handle
	        self.buf = []
	        self._meta = self.parse_preamble()

	    def __iter__(self):
	        """Iterate over Hmmer2TextParser, yields query results."""
	        for qresult in self.parse_qresult():
	            # Values collected from the preamble apply to every query result.
	            qresult.program = self._meta.get("program")
	            qresult.target = self._meta.get("target")
	            qresult.version = self._meta.get("version")
	            yield qresult

	    def read_next(self, rstrip=True):
	        """Return the next non-empty line, trailing whitespace removed.

	        Lines stored with ``push_back`` are returned first, unchanged.
	        With ``rstrip=False`` blank lines are not skipped and the line is
	        returned exactly as read. An empty string is returned at end of
	        input. ``self.line`` always holds the returned value.
	        """
	        if self.buf:
	            # Keep self.line in sync with the line being handed out, so that
	            # code inspecting self.line after a push_back sees that line.
	            self.line = self.buf.pop()
	            return self.line
	        self.line = self.handle.readline()
	        if rstrip:
	            while self.line and not self.line.strip():
	                self.line = self.handle.readline()
	            self.line = self.line.rstrip()
	        return self.line

	    def push_back(self, line):
	        """Un-read a line that should not be parsed yet."""
	        self.buf.append(line)

	    def parse_key_value(self):
	        """Parse key-value pair separated by colon.

	        Splits ``self.line`` at the first colon and returns the stripped
	        ``(key, value)`` pair; raises ValueError if there is no colon.
	        """
	        key, value = self.line.split(":", 1)
	        return key.strip(), value.strip()

	    def parse_preamble(self):
	        """Parse HMMER2 preamble.

	        Returns a dict with the program name, the version, a ``target``
	        entry (sequence database for hmmsearch, HMM file for hmmpfam) and
	        any other ``key: value`` option lines found.
	        """
	        meta = {}
	        state = "GENERIC"
	        while self.read_next():
	            if state == "GENERIC":
	                if self.line.startswith("hmm"):
	                    meta["program"] = self.line.split("-")[0].strip()
	                elif self.line.startswith("HMMER is"):
	                    continue
	                elif self.line.startswith("HMMER"):
	                    meta["version"] = self.line.split()[1]
	                elif self.line.count("-") == 36:
	                    # separator line that opens the option block
	                    state = "OPTIONS"
	                continue

	            assert state == "OPTIONS"
	            assert "program" in meta

	            # separator line that closes the option block
	            if self.line.count("-") == 32:
	                break

	            key, value = self.parse_key_value()
	            if meta["program"] == "hmmsearch":
	                if key == "Sequence database":
	                    meta["target"] = value
	                    continue
	            elif meta["program"] == "hmmpfam":
	                if key == "HMM file":
	                    meta["target"] = value
	                    continue
	            meta[key] = value

	        return meta

	    def parse_qresult(self):
	        """Parse a HMMER2 query block.

	        Generator yielding one query result per block that starts with a
	        ``Query`` line; stops at the first line that does not.
	        """
	        while self.read_next():
	            if not self.line.startswith("Query"):
	                return
	            _, id_ = self.parse_key_value()
	            self.qresult = QueryResult(id=id_)

	            description = None

	            while self.read_next() and not self.line.startswith("Scores"):
	                if self.line.startswith("Accession"):
	                    self.qresult.accession = self.parse_key_value()[1]
	                if self.line.startswith("Description"):
	                    description = self.parse_key_value()[1]

	            hit_placeholders = self.parse_hits()
	            if hit_placeholders:
	                self.parse_hsps(hit_placeholders)
	                self.parse_hsp_alignments()

	            # Skip the remainder of this block, then hand the next "Query"
	            # line (or the empty end-of-input marker) back to the outer loop.
	            while not self.line.startswith("Query"):
	                self.read_next()
	                if not self.line:
	                    break
	            self.push_back(self.line)

	            if description is not None:
	                self.qresult.description = description
	            yield self.qresult

	    def parse_hits(self):
	        """Parse a HMMER2 hit block, beginning with the hit table.

	        Returns a list of _HitPlaceholder objects in table order; the list
	        is empty when the table reports no hits.
	        """
	        hit_placeholders = []
	        while self.read_next():
	            if self.line.startswith("Parsed"):
	                break
	            if "no hits" in self.line:
	                break

	            # table header and ruler lines
	            if self.line.startswith(("Sequence", "Model", "-------- ")):
	                continue

	            # The first column is the id and the last three columns are
	            # score, E-value and domain count; anything in between is the
	            # (possibly empty) description.
	            fields = self.line.split()
	            id_ = fields.pop(0)
	            domain_obs_num = int(fields.pop())
	            evalue = float(fields.pop())
	            bitscore = float(fields.pop())
	            description = " ".join(fields).strip()

	            hit = _HitPlaceholder()
	            hit.id_ = id_
	            hit.evalue = evalue
	            hit.bitscore = bitscore
	            hit.description = description
	            hit.domain_obs_num = domain_obs_num
	            hit_placeholders.append(hit)

	        return hit_placeholders

	    def parse_hsps(self, hit_placeholders):
	        """Parse a HMMER2 hsp block, beginning with the hsp table.

	        Builds one HSP per table row, groups them into Hit objects created
	        from ``hit_placeholders`` and appends the hits to ``self.qresult``
	        in placeholder order.
	        """
	        # HSPs may occur in different order than the hits
	        # so store Hit objects separately first
	        unordered_hits = {}
	        while self.read_next():
	            if (
	                self.line.startswith(("Alignments", "Histogram"))
	                or self.line == "//"
	            ):
	                break
	            # table header and ruler lines
	            if self.line.startswith(("Model", "Sequence", "--------")):
	                continue

	            (
	                id_,
	                domain,
	                seq_f,
	                seq_t,
	                seq_compl,
	                hmm_f,
	                hmm_t,
	                hmm_compl,
	                score,
	                evalue,
	            ) = self.line.split()

	            # Start values are decremented by one, end values kept as given.
	            # For hmmpfam the HMM columns describe the hit and the sequence
	            # columns the query; for hmmsearch it is the other way round.
	            frag = HSPFragment(id_, self.qresult.id)
	            frag.molecule_type = "protein"
	            if self._meta["program"] == "hmmpfam":
	                frag.hit_start = int(hmm_f) - 1
	                frag.hit_end = int(hmm_t)
	                frag.query_start = int(seq_f) - 1
	                frag.query_end = int(seq_t)
	            elif self._meta["program"] == "hmmsearch":
	                frag.query_start = int(hmm_f) - 1
	                frag.query_end = int(hmm_t)
	                frag.hit_start = int(seq_f) - 1
	                frag.hit_end = int(seq_t)

	            hsp = HSP([frag])
	            hsp.evalue = float(evalue)
	            hsp.bitscore = float(score)
	            # the domain column has the form "<index>/<total>"
	            hsp.domain_index = int(domain.split("/")[0])
	            if self._meta["program"] == "hmmpfam":
	                hsp.hit_endtype = hmm_compl
	                hsp.query_endtype = seq_compl
	            elif self._meta["program"] == "hmmsearch":
	                hsp.query_endtype = hmm_compl
	                hsp.hit_endtype = seq_compl

	            if id_ not in unordered_hits:
	                placeholder = [p for p in hit_placeholders if p.id_ == id_][0]
	                hit = placeholder.createHit([hsp])
	                unordered_hits[id_] = hit
	            else:
	                hit = unordered_hits[id_]
	                hsp.hit_description = hit.description
	                hit.append(hsp)

	        # The placeholder list is in the correct order, so use that order for
	        # the Hit objects in the qresult
	        for p in hit_placeholders:
	            self.qresult.append(unordered_hits[p.id_])

	    def parse_hsp_alignments(self):
	        """Parse a HMMER2 HSP alignment block.

	        Does nothing unless the current line starts with ``Alignments``.
	        For each alignment header, the HMM, consensus and sequence rows are
	        collected and stored on the matching HSP fragment.
	        """
	        if not self.line.startswith("Alignments"):
	            return

	        while self.read_next():
	            if self.line == "//" or self.line.startswith("Histogram"):
	                break

	            match = _HSP_ALIGN_LINE.search(self.line)
	            if match is None:
	                continue

	            id_ = match.group(1)
	            idx = int(match.group(2))
	            num = int(match.group(3))

	            hit = self.qresult[id_]
	            if hit.domain_obs_num != num:
	                continue

	            frag = hit[idx - 1][0]

	            hmmseq = ""
	            consensus = ""
	            otherseq = ""
	            structureseq = ""
	            pad = 0
	            while self.read_next() and self.line.startswith(" "):
	                # if there's structure information, parse that
	                if self.line[16:18] == "CS":
	                    structureseq += self.line[19:].strip()

	                    if not self.read_next():
	                        break

	                # skip the *-> start marker if it exists
	                if self.line[19:22] == "*->":
	                    seq = self.line[22:]
	                    pad = 3
	                else:
	                    seq = self.line[19:]
	                    pad = 0

	                hmmseq += seq
	                line_len = len(seq)
	                # The consensus row is read unstripped so that its columns
	                # stay aligned with the HMM row.
	                if not self.read_next(rstrip=False):
	                    break
	                consensus += self.line[19 + pad : 19 + pad + line_len]
	                # If there's no consensus sequence, hmmer2 doesn't
	                # bother to put spaces here, so add extra padding
	                extra_padding = len(hmmseq) - len(consensus)
	                consensus += " " * extra_padding

	                if not self.read_next():
	                    break

	                # if we have a line break in the end marker, we get a
	                # whitespace-only otherseq line, making split()[0] return
	                # the end coordinate. That'll be a -, which is a valid character
	                # in the sequence, meaning we can't just strip it.
	                parts = self.line[19:].split()
	                if len(parts) == 2:
	                    otherseq += parts[0]

	            # The line that ended the inner loop belongs to the next block.
	            self.push_back(self.line)

	            # get rid of the end marker
	            if hmmseq.endswith("<-*"):
	                hmmseq = hmmseq[:-3]
	                consensus = consensus[:-3]

	            # add similarity sequence to annotation
	            frag.aln_annotation["similarity"] = consensus

	            # if there's structure information, add it to the fragment
	            if structureseq:
	                frag.aln_annotation["CS"] = structureseq

	            if self._meta["program"] == "hmmpfam":
	                frag.hit = hmmseq
	                frag.query = otherseq
	            else:
	                frag.hit = otherseq
	                frag.query = hmmseq


	class Hmmer2TextIndexer(_BaseHmmerTextIndexer):
	    """Indexer for hmmer2-text format."""

	    _parser = Hmmer2TextParser
	    qresult_start = b"Query"
	    # End mark of a query result. hmmsearch output is handled separately in
	    # __iter__, where its single result is also emitted at end of input.
	    qresult_end = b"//"

	    def __iter__(self):
	        """Iterate over Hmmer2TextIndexer; yields query results' key, offsets, 0.

	        Each item is ``(query id as str, start offset of the query block, 0)``.
	        """
	        handle = self._handle
	        handle.seek(0)
	        start_offset = handle.tell()
	        # The id follows "Query sequence:" or "Query HMM:"; "sequence" and
	        # "HMM" are alternatives, so the pipe must not be escaped.
	        regex_id = re.compile(rb"Query\s*(?:sequence|HMM)?:\s*(.*)")

	        # determine flag for hmmsearch
	        line = read_forward(handle)
	        is_hmmsearch = line.startswith(b"hmmsearch")

	        while True:
	            end_offset = handle.tell()

	            if line.startswith(self.qresult_start):
	                regx = regex_id.search(line)
	                qresult_key = regx.group(1).strip()
	                # qresult start offset is the offset of this line
	                # (starts with the start mark)
	                start_offset = end_offset - len(line)
	            elif line.startswith(self.qresult_end):
	                yield qresult_key.decode(), start_offset, 0
	                start_offset = end_offset
	            elif not line:
	                # HACK: since hmmsearch can only have one query result
	                if is_hmmsearch:
	                    yield qresult_key.decode(), start_offset, 0
	                break

	            line = read_forward(handle)


	# When executed directly as a script rather than imported, run the doctests.
	if __name__ == "__main__":
	    from Bio._utils import run_doctest

	    run_doctest()