Spaces:

aakash0017
/

DrVai-Rag-Testing

No application file

App Files Files Community

DrVai-Rag-Testing / myenv /lib /python3.10 /site-packages /Bio /Align /hhr.py

aakash0017

Upload folder using huggingface_hub

b7731cd over 1 year ago

raw

history blame contribute delete

9.5 kB

	# Copyright 2022 by Michiel de Hoon. All rights reserved.
	#
	# This file is part of the Biopython distribution and governed by your
	# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
	# Please see the LICENSE file that should have been included as part of this
	# package.
	"""Bio.Align support for hhr files generated by HHsearch or HHblits in HH-suite.

	This module provides support for output in the hhr file format generated by
	HHsearch or HHblits in HH-suite.

	You are expected to use this module via the Bio.Align functions.
	"""
	from Bio.Align import Alignment
	from Bio.Align import interfaces
	from Bio.Seq import Seq
	from Bio.SeqRecord import SeqRecord


	class AlignmentIterator(interfaces.AlignmentIterator):
	    """Alignment iterator for hhr output files generated by HHsearch or HHblits.

	    HHsearch and HHblits are part of the HH-suite of programs for Hidden Markov
	    Models. An output file in the hhr format contains multiple pairwise
	    alignments for a single query sequence.

	    Malformed input is reported as ValueError rather than through ``assert``
	    statements, so the consistency checks stay active under ``python -O``.
	    """

	    fmt = "hhr"

	    # Column titles expected on the first line of the hit summary table.
	    _HIT_TABLE_COLUMNS = [
	        "No",
	        "Hit",
	        "Prob",
	        "E-value",
	        "P-value",
	        "Score",
	        "SS",
	        "Cols",
	        "Query",
	        "HMM",
	        "Template",
	        "HMM",
	    ]

	    def _read_header(self, stream):
	        """Parse the file header and the hit summary table.

	        Sets ``self.query_name`` from the "Query" line, stores the remaining
	        header fields in ``self.metadata``, stores the number of rows of the
	        hit summary table in ``self._length``, and resets ``self._counter``
	        (the number of alignment blocks seen so far) to zero.

	        Raises:
	            ValueError: if a header key is unknown, the file ends before the
	                hit table, the hit table title line is not the expected one,
	                or the hit table rows are not numbered consecutively from 1.
	        """
	        metadata = {}
	        for line in stream:
	            line = line.strip()
	            if not line:
	                # A blank line terminates the key/value header section.
	                break
	            key, value = line.split(None, 1)
	            if key == "Query":
	                self.query_name = value
	            elif key in ("Match_columns", "Searched_HMMs"):
	                metadata[key] = int(value)
	            elif key == "No_of_seqs":
	                value1, value2 = value.split(" out of ")
	                metadata[key] = (int(value1), int(value2))
	            elif key in ("Neff", "Template_Neff"):
	                metadata[key] = float(value)
	            elif key == "Date":
	                metadata["Rundate"] = value
	            elif key == "Command":
	                metadata["Command line"] = value
	            else:
	                raise ValueError(f"Unknown key '{key}'")
	        self.metadata = metadata
	        try:
	            line = next(stream)
	        except StopIteration:
	            raise ValueError("Truncated file.") from None
	        if line.split() != self._HIT_TABLE_COLUMNS:
	            raise ValueError(f"Unexpected hit table header '{line.strip()}'")
	        # Count the rows of the hit table; the table ends at a blank line.
	        counter = 0
	        for line in stream:
	            if line.strip() == "":
	                break
	            counter += 1
	            word, _ = line.split(None, 1)
	            if int(word) != counter:
	                raise ValueError(
	                    f"Expected hit number {counter} in the hit table, found {word}"
	                )
	        self._length = counter
	        self._counter = 0

	    @staticmethod
	    def _parse_residue_line(line):
	        """Split a six-field residue line into its useful parts.

	        The line must consist of a row tag, a name, a 1-based start position,
	        the residues, an end position and the total length in parentheses.

	        Returns:
	            A tuple ``(name, start, residues, length)`` where ``start`` has
	            been converted to a 0-based offset.

	        Raises:
	            ValueError: if the line does not have exactly six fields, a
	                position is not an integer, or the length is not enclosed in
	                parentheses.
	        """
	        _, name, start, residues, end, total = line.split()
	        start = int(start) - 1
	        int(end)  # only validated; the end follows from start and residues
	        if not (total.startswith("(") and total.endswith(")")):
	            raise ValueError(
	                f"Expected sequence length in parentheses, found '{total}'"
	            )
	        return name, start, residues, int(total[1:-1])

	    def _read_next_alignment(self, stream):
	        """Parse the next alignment block from the stream.

	        Lines are consumed until the "No <n>" line that opens the following
	        block, or until the stream is exhausted. The rows collected so far
	        are then converted into an Alignment of the target (first row) and
	        the query (second row).

	        Returns:
	            The next Alignment, or None if no aligned sequence rows were
	            collected.

	        Raises:
	            ValueError: if the file is truncated or malformed, if data follow
	                the "Done!" marker, or if the number of alignment blocks does
	                not match the number of rows in the hit summary table.
	        """

	        def create_alignment():
	            """Build an Alignment from the rows accumulated by the enclosing call."""
	            n = len(target_sequence)
	            if len(query_sequence) != n:
	                raise ValueError(
	                    "Target and query alignment rows differ in length "
	                    f"({n} and {len(query_sequence)})"
	                )
	            if n == 0:
	                return
	            coordinates = Alignment.infer_coordinates([target_sequence, query_sequence])
	            # The inferred coordinates start at 0; shift them to the positions
	            # at which the aligned regions start in the full sequences.
	            coordinates[0, :] += target_start
	            coordinates[1, :] += query_start
	            # Only the aligned region is known, so each sequence is defined
	            # by a single segment placed at its start offset.
	            sequence = {query_start: query_sequence.replace("-", "")}
	            query_seq = Seq(sequence, length=query_length)
	            query = SeqRecord(query_seq, id=self.query_name)
	            sequence = {target_start: target_sequence.replace("-", "")}
	            target_seq = Seq(sequence, length=target_length)
	            target_annotations = {
	                "hmm_name": hmm_name,
	                "hmm_description": hmm_description,
	            }
	            target = SeqRecord(
	                target_seq, id=target_name, annotations=target_annotations
	            )
	            # Pad the per-letter annotations with spaces on both sides so that
	            # they span the full sequence length.
	            fmt = f"{' ' * target_start}%-{target_length - target_start}s"
	            target.letter_annotations["Consensus"] = fmt % target_consensus.replace(
	                "-", ""
	            )
	            target.letter_annotations["ss_pred"] = fmt % target_ss_pred.replace("-", "")
	            target.letter_annotations["ss_dssp"] = fmt % target_ss_dssp.replace("-", "")
	            target.letter_annotations["Confidence"] = fmt % confidence.replace(" ", "")
	            fmt = f"{' ' * query_start}%-{query_length - query_start}s"
	            query.letter_annotations["Consensus"] = fmt % query_consensus.replace(
	                "-", ""
	            )
	            query.letter_annotations["ss_pred"] = fmt % query_ss_pred.replace("-", "")
	            records = [target, query]
	            alignment = Alignment(records, coordinates=coordinates)
	            alignment.annotations = alignment_annotations
	            alignment.column_annotations = {}
	            alignment.column_annotations["column score"] = column_score
	            return alignment

	        # Accumulators for the current alignment block; an alignment may be
	        # printed in several chunks, so rows are concatenated as they appear.
	        query_start = None
	        query_sequence = ""
	        query_consensus = ""
	        query_ss_pred = ""
	        target_start = None
	        target_sequence = ""
	        target_consensus = ""
	        target_ss_pred = ""
	        target_ss_dssp = ""
	        column_score = ""
	        confidence = ""
	        for line in stream:
	            line = line.rstrip()
	            if not line:
	                pass
	            elif line.startswith(">"):
	                # The template description is optional; a hit may consist of
	                # a name only.
	                words = line[1:].split(None, 1)
	                if not words:
	                    raise ValueError("Missing template name after '>'")
	                hmm_name = words[0]
	                hmm_description = words[1] if len(words) > 1 else ""
	                # The line after the template name holds key=value statistics.
	                try:
	                    line = next(stream)
	                except StopIteration:
	                    raise ValueError("Truncated file.") from None
	                alignment_annotations = {}
	                for word in line.split():
	                    key, value = word.split("=")
	                    if key == "Aligned_cols":
	                        continue  # can be obtained from coordinates
	                    if key == "Identities":
	                        value = value.rstrip("%")
	                    alignment_annotations[key] = float(value)
	            elif line == "Done!":
	                try:
	                    next(stream)
	                except StopIteration:
	                    pass
	                else:
	                    raise ValueError(
	                        "Found additional data after 'Done!'; corrupt file?"
	                    )
	            elif line.startswith(" "):
	                column_score += line.strip()
	            elif line.startswith("No "):
	                counter = self._counter
	                self._counter += 1
	                key, value = line.split()
	                if int(value) != self._counter:
	                    raise ValueError(
	                        f"Expected alignment number {self._counter}, found {value}"
	                    )
	                if self._counter > self._length:
	                    raise ValueError(
	                        f"Expected {self._length} alignments, found {self._counter}"
	                    )
	                if counter > 0:
	                    # This line opens the next block, so the rows collected
	                    # so far form a complete alignment.
	                    return create_alignment()
	            elif line.startswith("Confidence"):
	                key, value = line.split(None, 1)
	                confidence += value
	            elif line.startswith("Q ss_pred "):
	                key, value = line.rsplit(None, 1)
	                query_ss_pred += value
	            elif line.startswith("Q Consensus "):
	                _, _, consensus, _ = self._parse_residue_line(line)
	                query_consensus += consensus
	            elif line.startswith("Q "):
	                name, start, sequence, query_length = self._parse_residue_line(line)
	                if not self.query_name.startswith(name):
	                    raise ValueError(
	                        f"Query name '{name}' does not match '{self.query_name}'"
	                    )
	                if query_length != self.metadata["Match_columns"]:
	                    raise ValueError(
	                        f"Query length {query_length} differs from Match_columns "
	                        f"{self.metadata['Match_columns']}"
	                    )
	                if query_start is None:
	                    query_start = start
	                query_sequence += sequence
	            elif line.startswith("T ss_pred "):
	                key, value = line.rsplit(None, 1)
	                target_ss_pred += value
	            elif line.startswith("T ss_dssp "):
	                key, value = line.rsplit(None, 1)
	                target_ss_dssp += value
	            elif line.startswith("T Consensus "):
	                _, _, consensus, _ = self._parse_residue_line(line)
	                target_consensus += consensus
	            elif line.startswith("T "):
	                target_name, start, sequence, target_length = (
	                    self._parse_residue_line(line)
	                )
	                if target_start is None:
	                    target_start = start
	                target_sequence += sequence
	            else:
	                raise ValueError(f"Failed to parse line '{line[:30]}...'")
	        # The stream is exhausted: whatever was collected is the last alignment.
	        alignment = create_alignment()
	        length = self._length
	        counter = self._counter
	        if length == counter:
	            self._close()
	            del self._counter
	        if alignment is None and length > 0:
	            raise ValueError(f"Expected {length} alignments, found {counter}")
	        return alignment

	    def __len__(self):
	        """Return the number of hits listed in the hit summary table."""
	        return self._length