# Copyright 2022 by Michiel de Hoon. All rights reserved. # # This file is part of the Biopython distribution and governed by your # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". # Please see the LICENSE file that should have been included as part of this # package. """Bio.Align support for hhr files generated by HHsearch or HHblits in HH-suite. This module provides support for output in the hhr file format generated by HHsearch or HHblits in HH-suite. You are expected to use this module via the Bio.Align functions. """ from Bio.Align import Alignment from Bio.Align import interfaces from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord class AlignmentIterator(interfaces.AlignmentIterator): """Alignment iterator for hhr output files generated by HHsearch or HHblits. HHsearch and HHblits are part of the HH-suite of programs for Hidden Markov Models. An output files in the hhr format contains multiple pairwise alignments for a single query sequence. """ fmt = "hhr" def _read_header(self, stream): metadata = {} for line in stream: line = line.strip() if line == "": break key, value = line.split(None, 1) if key == "Query": self.query_name = value elif key == "Match_columns": metadata[key] = int(value) elif key == "No_of_seqs": value1, value2 = value.split(" out of ") metadata[key] = (int(value1), int(value2)) elif key in ("Neff", "Template_Neff"): metadata[key] = float(value) elif key == "Searched_HMMs": metadata[key] = int(value) elif key == "Date": metadata["Rundate"] = value elif key == "Command": metadata["Command line"] = value else: raise ValueError("Unknown key '%s'" % key) self.metadata = metadata try: line = next(stream) except StopIteration: raise ValueError("Truncated file.") from None assert line.split() == [ "No", "Hit", "Prob", "E-value", "P-value", "Score", "SS", "Cols", "Query", "HMM", "Template", "HMM", ] counter = 0 for line in stream: if line.strip() == "": break counter += 1 word, _ = line.split(None, 1) assert int(word) == counter self._length = counter self._counter = 0 def _read_next_alignment(self, stream): def create_alignment(): n = len(target_sequence) assert len(query_sequence) == n if n == 0: return coordinates = Alignment.infer_coordinates([target_sequence, query_sequence]) coordinates[0, :] += target_start coordinates[1, :] += query_start sequence = {query_start: query_sequence.replace("-", "")} query_seq = Seq(sequence, length=query_length) query = SeqRecord(query_seq, id=self.query_name) sequence = {target_start: target_sequence.replace("-", "")} target_seq = Seq(sequence, length=target_length) target_annotations = { "hmm_name": hmm_name, "hmm_description": hmm_description, } target = SeqRecord( target_seq, id=target_name, annotations=target_annotations ) fmt = f"{' ' * target_start}%-{target_length - target_start}s" target.letter_annotations["Consensus"] = fmt % target_consensus.replace( "-", "" ) target.letter_annotations["ss_pred"] = fmt % target_ss_pred.replace("-", "") target.letter_annotations["ss_dssp"] = fmt % target_ss_dssp.replace("-", "") target.letter_annotations["Confidence"] = fmt % confidence.replace(" ", "") fmt = f"{' ' * query_start}%-{query_length - query_start}s" query.letter_annotations["Consensus"] = fmt % query_consensus.replace( "-", "" ) query.letter_annotations["ss_pred"] = fmt % query_ss_pred.replace("-", "") records = [target, query] alignment = Alignment(records, coordinates=coordinates) alignment.annotations = alignment_annotations alignment.column_annotations = {} alignment.column_annotations["column score"] = column_score return alignment query_start = None query_sequence = "" query_consensus = "" query_ss_pred = "" target_start = None target_sequence = "" target_consensus = "" target_ss_pred = "" target_ss_dssp = "" column_score = "" confidence = "" for line in stream: line = line.rstrip() if not line: pass elif line.startswith(">"): hmm_name, hmm_description = line[1:].split(None, 1) line = next(stream) words = line.split() alignment_annotations = {} for word in words: key, value = word.split("=") if key == "Aligned_cols": continue # can be obtained from coordinates if key == "Identities": value = value.rstrip("%") value = float(value) alignment_annotations[key] = value elif line == "Done!": try: next(stream) except StopIteration: pass else: raise ValueError( "Found additional data after 'Done!'; corrupt file?" ) elif line.startswith(" "): column_score += line.strip() elif line.startswith("No "): counter = self._counter self._counter += 1 key, value = line.split() assert int(value) == self._counter if self._counter > self._length: raise ValueError( "Expected %d alignments, found %d" % (self._length, self._counter) ) if counter > 0: return create_alignment() elif line.startswith("Confidence"): key, value = line.split(None, 1) confidence += value elif line.startswith("Q ss_pred "): key, value = line.rsplit(None, 1) query_ss_pred += value elif line.startswith("Q Consensus "): key1, key2, start, consensus, end, total = line.split() start = int(start) - 1 end = int(end) assert total.startswith("(") assert total.endswith(")") total = int(total[1:-1]) query_consensus += consensus elif line.startswith("Q "): key1, key2, start, sequence, end, total = line.split() assert self.query_name.startswith(key2) start = int(start) - 1 end = int(end) assert total.startswith("(") assert total.endswith(")") query_length = int(total[1:-1]) assert query_length == self.metadata["Match_columns"] if query_start is None: query_start = start query_sequence += sequence elif line.startswith("T ss_pred "): key, value = line.rsplit(None, 1) target_ss_pred += value elif line.startswith("T ss_dssp "): key, value = line.rsplit(None, 1) target_ss_dssp += value elif line.startswith("T Consensus "): key1, key2, start, consensus, end, total = line.split() start = int(start) - 1 end = int(end) assert total.startswith("(") assert total.endswith(")") total = int(total[1:-1]) target_consensus += consensus elif line.startswith("T "): key, name, start, sequence, end, total = line.split() assert key == "T" target_name = name start = int(start) - 1 end = int(end) assert total.startswith("(") assert total.endswith(")") target_length = int(total[1:-1]) if target_start is None: target_start = start target_sequence += sequence else: raise ValueError("Failed to parse line '%s...'" % line[:30]) alignment = create_alignment() length = self._length counter = self._counter if length == counter: self._close() del self._counter if alignment is None and length > 0: raise ValueError("Expected %d alignments, found %d" % (length, counter)) return alignment def __len__(self): return self._length