Spaces:

aakash0017
/

DrVai-Rag-Testing

No application file

App Files Files Community

DrVai-Rag-Testing / myenv /lib /python3.10 /site-packages /Bio /SearchIO /HHsuiteIO /hhsuite2_text.py

aakash0017

Upload folder using huggingface_hub

b7731cd over 1 year ago

raw

history blame contribute delete

9.26 kB

	# Copyright 2019 by Jens Thomas. All rights reserved.
	# This file is part of the Biopython distribution and governed by your
	# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
	# Please see the LICENSE file that should have been included as part of this
	# package.
	"""Bio.SearchIO parser for HHSUITE version 2 and 3 plain text output format."""

	import re
	import warnings

	from Bio.SearchIO._utils import read_forward
	from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment

	__all__ = ("Hhsuite2TextParser",)

	# Precompiled regex patterns for faster processing.
	# Query name line ("Query <name>"); group 1 is the query identifier.
	_RE_QUERY = re.compile(r"^Query\s+(.+)\s?$")

	# Start of a hit block ("No <n>"); group 1 is the hit number.
	# The trailing \s+ means the line terminator (or other whitespace) must be
	# present after the number.
	_RE_HIT_BLOCK_START = re.compile(r"^No +(\d+)\s+$")

	# Hit header line: ">" then the hit id (group 1) and full description (group 2).
	_RE_HIT_BLOCK_DESC = re.compile(r">(\S+)\s+(.*)$")

	# Sequence alignment lines, e.g.
	# Q sp|Q9BSU1|CP07 229 DAKMRVFERSVYFGDSCQDVLSMLGSPHKV 258 (422)
	# Groups: 1 = sequence id, 2 = start, 3 = aligned residues, 4 = end.
	# The trailing "(N)" field is matched as literal parentheses and not captured.
	_RE_MATCH_BLOCK_QUERY_SEQ = re.compile(
	    r"^Q\s+(.+) +(\d+) +([A-Z-]+) +(\d+) +\(\d+\)$"
	)
	_RE_MATCH_BLOCK_HIT_SEQ = re.compile(
	    r"^T\s+(.+) +(\d+) +([A-Z-]+) +(\d+) +\(\d+\)$"
	)

	# Line prefix that marks the end of the HHSUITE output (not always present).
	_END_OF_FILE_MARKER = "Done!"

	_PROGRAM = "HHSUITE"

	# Maximum number of lines to read before expecting a hit block
	# This determines the maximum number of hits that would be allowed in
	# the initial hit table.
	MAX_READ_UNTIL = 5000


	class Hhsuite2TextParser:
	    """Parser for the HHSUITE version 2 and 3 text output.

	    Iterating over an instance parses the whole handle and yields the
	    resulting query result object(s) - there will only ever be one.
	    """

	    def __init__(self, handle):
	        """Initialize the class.

	        :param handle: text handle to read the HHSUITE output from; the first
	            line is fetched immediately with ``read_forward``.
	        """
	        self.handle = handle
	        self.line = read_forward(self.handle)
	        self.done = False
	        self.query_id = None
	        self.seq_len = None

	    def __iter__(self):
	        """Iterate over query results - there will only ever be one."""
	        yield from self._parse_qresult()

	    def _read_until(self, bool_func, stop_on_blank=True, max_read_until=None):
	        """Read the file handle until the given function returns True (PRIVATE).

	        :param bool_func: callable applied to the current line; reading stops
	            as soon as it returns a truthy value.
	        :param stop_on_blank: when True, also stop on an empty current line.
	        :param max_read_until: maximum number of loop passes before giving up;
	            ``None`` (the default) uses the module-level ``MAX_READ_UNTIL``.
	        :raises RuntimeError: if the limit is reached without a match.
	        """
	        if max_read_until is None:
	            # Looked up at call time so that changes to the module-level
	            # limit take effect without redefining the class.
	            max_read_until = MAX_READ_UNTIL
	        count = 0
	        while True:
	            if stop_on_blank and not self.line:
	                return
	            if bool_func(self.line):
	                return
	            self.line = read_forward(self.handle)
	            count += 1
	            if count >= max_read_until:
	                raise RuntimeError("Exceeded max_read_until in _read_until")

	    def _parse_qresult(self):
	        """Parse HHSUITE output file (PRIVATE).

	        Reads the preamble, skips forward to the first hit block, then parses
	        hit blocks until the end of the output is detected.

	        :returns: the list produced by ``_create_qresult``.
	        """
	        hit_block_data = []
	        self._parse_preamble()
	        # Blank lines may precede the first "No <n>" line, so do not stop on them.
	        self._read_until(
	            lambda line: _RE_HIT_BLOCK_START.search(line), stop_on_blank=False
	        )
	        while not self.done:
	            hit_block_data.append(self._parse_hit_block())
	        return self._create_qresult(hit_block_data)

	    def _parse_preamble(self):
	        """Parse metadata about query (PRIVATE).

	        Consumes lines up to the first empty one, recording the query id and
	        the ``Match_columns`` value (stored as ``seq_len``) on the instance.

	        :returns: a dict that is currently always empty.
	        """
	        meta = {}
	        while self.line:
	            regx = _RE_QUERY.search(self.line)
	            if regx:
	                self.query_id = regx.group(1)
	            if self.line.startswith("Match_columns"):
	                self.seq_len = int(self.line.strip().split()[1])
	            # Stripping turns a whitespace-only line into "", ending the loop.
	            self.line = self.handle.readline().strip()
	        return meta

	    def _parse_hit_block(self):
	        """Parse a hit block (PRIVATE).

	        Expects the current line to be the "No <n>" line of the block. Sets
	        ``self.done`` when the end of the output is reached.

	        :returns: dict with the id, description, scores, coordinates and
	            aligned sequences of the hit.
	        :raises RuntimeError: if the line after "No <n>" is not a hit header.
	        """
	        self.line = read_forward(self.handle)
	        match = _RE_HIT_BLOCK_DESC.search(self.line)
	        if not match:
	            raise RuntimeError(
	                f"Unexpected content in HIT_BLOCK_DESC line'{self.line}'"
	            )
	        hit_data = {
	            "hit_id": match.group(1),
	            "description": match.group(2).lstrip(" ;"),
	            "evalue": None,
	            "hit_start": None,
	            "hit_end": None,
	            "hit_seq": "",
	            "prob": None,
	            "query_start": None,
	            "query_end": None,
	            "query_seq": "",
	            "score": None,
	        }
	        # The score line directly follows the hit header.
	        self.line = self.handle.readline()
	        self._process_score_line(self.line, hit_data)
	        while True:
	            self.line = read_forward(self.handle)
	            if not self.line.strip() or self.line.startswith(_END_OF_FILE_MARKER):
	                # _END_OF_FILE_MARKER isn't always present
	                self.done = True
	                return hit_data
	            if _RE_HIT_BLOCK_START.search(self.line):
	                # Next hit begins; the caller parses it on its next pass.
	                return hit_data
	            self._parse_hit_match_block(hit_data)

	    @staticmethod
	    def _process_score_line(line, hit_data):
	        """Parse the scores from the line and populate hit_data dict (PRIVATE).

	        Lines are of the form:
	        Probab=99.95 E-value=3.7e-34 Score=210.31 Aligned_cols=171 Identities=100% Similarity=2.050 Sum_probs=166.9

	        E-value could be in decimal or scientific notation, so split the string
	        rather than use regexp - this also means we should be tolerant of
	        additional fields being added/removed.

	        :param line: the score line of a hit block.
	        :param hit_data: dict updated in place; only the "evalue", "score"
	            and "prob" keys are written.

	        Tokens without "=" and unknown keys are ignored. A known key whose
	        value is not a valid float triggers a warning and leaves the existing
	        entry untouched.
	        """
	        score_map = {"E-value": "evalue", "Score": "score", "Probab": "prob"}
	        for score_pair in line.strip().split():
	            # partition never raises, unlike unpacking the result of split("=")
	            key, sep, value = score_pair.partition("=")
	            if not sep or key not in score_map:
	                continue
	            try:
	                hit_data[score_map[key]] = float(value)
	            except ValueError:
	                # We trigger warnings here as it's not a big enough problem to
	                # crash, but indicates something unexpected.
	                warnings.warn(
	                    f"HHsuite parser: unable to extract {key} from line: {line}"
	                )

	    def _parse_hit_match_block(self, hit_match_data):
	        """Parse a single block of hit sequence data (PRIVATE).

	        Parses block such as ::

	            Q ss_pred             ceecchHHHHHHHHHHHHHHHHHHHhhhhhcCCCCccc
	            Q 4P79:A|PDBID|C  160 YELGPALYLGWSASLLSILGGICVFSTAAASSKEEPAT  197 (198)
	            Q Consensus       160 ~~~g~sf~l~~~~~~l~~~~~~l~~~~~~~~~~~~~~~  197 (198)
	                                  .++|||||++|++.++.+++++++++..+..++++..+
	            T Consensus       327 ~~~GwS~~l~~~s~~l~lia~~l~~~~~~~~~~~~~~~  364 (364)
	            T 5B2G_A          327 REMGASLYVGWAASGLLLLGGGLLCCSGPSSGENLYFQ  364 (364)
	            T ss_dssp             EEECTHHHHHHHHHHHHHHHHHHHHCC-----------
	            T ss_pred             cccchHHHHHHHHHHHHHHHHHHHHhcCCCCCCccccC

	        :param hit_match_data: hit dict updated in place. Matched sequence
	            chunks are appended to "query_seq"/"hit_seq", the start is set
	            only by the first chunk and the end by every chunk.

	        Reading stops at the first blank line, which ends the block.
	        """
	        targets = (
	            (_RE_MATCH_BLOCK_QUERY_SEQ, "query"),
	            (_RE_MATCH_BLOCK_HIT_SEQ, "hit"),
	        )
	        while self.line.strip():  # blank lines indicate the end of a hit block
	            for pattern, prefix in targets:
	                match = pattern.match(self.line)
	                # It's not possible to distinguish a sequence line from a
	                # Consensus line with a regexp, so check the ID column.
	                if match and match.group(1).strip() != "Consensus":
	                    hit_match_data[f"{prefix}_seq"] += match.group(3).strip()
	                    if hit_match_data[f"{prefix}_start"] is None:
	                        hit_match_data[f"{prefix}_start"] = int(match.group(2))
	                    hit_match_data[f"{prefix}_end"] = int(match.group(4))
	                    break
	            self.line = self.handle.readline()

	    def _create_qresult(self, hit_blocks):
	        """Create the Biopython data structures from the parsed data (PRIVATE).

	        :param hit_blocks: list of dicts as returned by ``_parse_hit_block``.
	        :returns: a one-element list holding the QueryResult.

	        Blocks sharing a hit id become several HSPs of a single Hit; the Hit
	        takes its description and scores from the first such block.
	        """
	        query_id = self.query_id
	        # NOTE: every hit is flagged as included; whether that is always
	        # appropriate is an open question carried over from the original code.
	        is_included = True
	        hit_dict = {}

	        for output_index, block in enumerate(hit_blocks):
	            hit_id = block["hit_id"]

	            frag = HSPFragment(hit_id, query_id)
	            frag.molecule_type = "protein"
	            # Parsed starts are decremented by one; ends are stored as parsed.
	            frag.query_start = block["query_start"] - 1
	            frag.query_end = block["query_end"]
	            frag.hit_start = block["hit_start"] - 1
	            frag.hit_end = block["hit_end"]
	            frag.hit = block["hit_seq"]
	            frag.query = block["query_seq"]

	            hsp = HSP([frag])
	            hsp.hit_id = hit_id
	            hsp.output_index = output_index
	            hsp.query_id = query_id
	            hsp.hit_description = block["description"]
	            hsp.is_included = is_included
	            hsp.evalue = block["evalue"]
	            hsp.score = block["score"]
	            hsp.prob = block["prob"]

	            if hit_id not in hit_dict:
	                hit = Hit([hsp], hit_id)
	                hit.description = block["description"]
	                hit.is_included = is_included
	                hit.evalue = block["evalue"]
	                hit.score = block["score"]
	                hit_dict[hit_id] = hit
	            else:
	                hit_dict[hit_id].append(hsp)

	        qresult = QueryResult(hit_dict.values(), query_id)
	        qresult.program = _PROGRAM
	        qresult.seq_len = self.seq_len
	        return [qresult]