Spaces:
No application file
No application file
# Copyright 2012 by Kai Blin. All rights reserved. | |
# This file is part of the Biopython distribution and governed by your | |
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". | |
# Please see the LICENSE file that should have been included as part of this | |
# package. | |
"""Bio.SearchIO parser for HMMER 2 text output.""" | |
import re | |
from Bio.SearchIO._utils import read_forward | |
from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment | |
from ._base import _BaseHmmerTextIndexer | |
__all__ = ("Hmmer2TextParser", "Hmmer2TextIndexer") | |
_HSP_ALIGN_LINE = re.compile(r"(\S+):\s+domain (\d+) of (\d+)") | |
class _HitPlaceholder: | |
def createHit(self, hsp_list): | |
hit = Hit(hsp_list) | |
hit.id_ = self.id_ | |
hit.evalue = self.evalue | |
hit.bitscore = self.bitscore | |
if self.description: | |
hit.description = self.description | |
hit.domain_obs_num = self.domain_obs_num | |
return hit | |
class Hmmer2TextParser: | |
"""Iterator for the HMMER 2.0 text output.""" | |
def __init__(self, handle): | |
"""Initialize the class.""" | |
self.handle = handle | |
self.buf = [] | |
self._meta = self.parse_preamble() | |
def __iter__(self): | |
"""Iterate over Hmmer2TextParser, yields query results.""" | |
for qresult in self.parse_qresult(): | |
qresult.program = self._meta.get("program") | |
qresult.target = self._meta.get("target") | |
qresult.version = self._meta.get("version") | |
yield qresult | |
def read_next(self, rstrip=True): | |
"""Return the next non-empty line, trailing whitespace removed.""" | |
if len(self.buf) > 0: | |
return self.buf.pop() | |
self.line = self.handle.readline() | |
while self.line and rstrip and not self.line.strip(): | |
self.line = self.handle.readline() | |
if self.line: | |
if rstrip: | |
self.line = self.line.rstrip() | |
return self.line | |
def push_back(self, line): | |
"""Un-read a line that should not be parsed yet.""" | |
self.buf.append(line) | |
def parse_key_value(self): | |
"""Parse key-value pair separated by colon.""" | |
key, value = self.line.split(":", 1) | |
return key.strip(), value.strip() | |
def parse_preamble(self): | |
"""Parse HMMER2 preamble.""" | |
meta = {} | |
state = "GENERIC" | |
while self.read_next(): | |
if state == "GENERIC": | |
if self.line.startswith("hmm"): | |
meta["program"] = self.line.split("-")[0].strip() | |
elif self.line.startswith("HMMER is"): | |
continue | |
elif self.line.startswith("HMMER"): | |
meta["version"] = self.line.split()[1] | |
elif self.line.count("-") == 36: | |
state = "OPTIONS" | |
continue | |
assert state == "OPTIONS" | |
assert "program" in meta | |
if self.line.count("-") == 32: | |
break | |
key, value = self.parse_key_value() | |
if meta["program"] == "hmmsearch": | |
if key == "Sequence database": | |
meta["target"] = value | |
continue | |
elif meta["program"] == "hmmpfam": | |
if key == "HMM file": | |
meta["target"] = value | |
continue | |
meta[key] = value | |
return meta | |
def parse_qresult(self): | |
"""Parse a HMMER2 query block.""" | |
while self.read_next(): | |
if not self.line.startswith("Query"): | |
return | |
_, id_ = self.parse_key_value() | |
self.qresult = QueryResult(id=id_) | |
description = None | |
while self.read_next() and not self.line.startswith("Scores"): | |
if self.line.startswith("Accession"): | |
self.qresult.accession = self.parse_key_value()[1] | |
if self.line.startswith("Description"): | |
description = self.parse_key_value()[1] | |
hit_placeholders = self.parse_hits() | |
if len(hit_placeholders) > 0: | |
self.parse_hsps(hit_placeholders) | |
self.parse_hsp_alignments() | |
while not self.line.startswith("Query"): | |
self.read_next() | |
if not self.line: | |
break | |
self.buf.append(self.line) | |
if description is not None: | |
self.qresult.description = description | |
yield self.qresult | |
def parse_hits(self): | |
"""Parse a HMMER2 hit block, beginning with the hit table.""" | |
hit_placeholders = [] | |
while self.read_next(): | |
if self.line.startswith("Parsed"): | |
break | |
if self.line.find("no hits") > -1: | |
break | |
if ( | |
self.line.startswith("Sequence") | |
or self.line.startswith("Model") | |
or self.line.startswith("-------- ") | |
): | |
continue | |
fields = self.line.split() | |
id_ = fields.pop(0) | |
domain_obs_num = int(fields.pop()) | |
evalue = float(fields.pop()) | |
bitscore = float(fields.pop()) | |
description = " ".join(fields).strip() | |
hit = _HitPlaceholder() | |
hit.id_ = id_ | |
hit.evalue = evalue | |
hit.bitscore = bitscore | |
hit.description = description | |
hit.domain_obs_num = domain_obs_num | |
hit_placeholders.append(hit) | |
return hit_placeholders | |
def parse_hsps(self, hit_placeholders): | |
"""Parse a HMMER2 hsp block, beginning with the hsp table.""" | |
# HSPs may occur in different order than the hits | |
# so store Hit objects separately first | |
unordered_hits = {} | |
while self.read_next(): | |
if ( | |
self.line.startswith("Alignments") | |
or self.line.startswith("Histogram") | |
or self.line == "//" | |
): | |
break | |
if ( | |
self.line.startswith("Model") | |
or self.line.startswith("Sequence") | |
or self.line.startswith("--------") | |
): | |
continue | |
( | |
id_, | |
domain, | |
seq_f, | |
seq_t, | |
seq_compl, | |
hmm_f, | |
hmm_t, | |
hmm_compl, | |
score, | |
evalue, | |
) = self.line.split() | |
frag = HSPFragment(id_, self.qresult.id) | |
frag.molecule_type = "protein" | |
if self._meta["program"] == "hmmpfam": | |
frag.hit_start = int(hmm_f) - 1 | |
frag.hit_end = int(hmm_t) | |
frag.query_start = int(seq_f) - 1 | |
frag.query_end = int(seq_t) | |
elif self._meta["program"] == "hmmsearch": | |
frag.query_start = int(hmm_f) - 1 | |
frag.query_end = int(hmm_t) | |
frag.hit_start = int(seq_f) - 1 | |
frag.hit_end = int(seq_t) | |
hsp = HSP([frag]) | |
hsp.evalue = float(evalue) | |
hsp.bitscore = float(score) | |
hsp.domain_index = int(domain.split("/")[0]) | |
if self._meta["program"] == "hmmpfam": | |
hsp.hit_endtype = hmm_compl | |
hsp.query_endtype = seq_compl | |
elif self._meta["program"] == "hmmsearch": | |
hsp.query_endtype = hmm_compl | |
hsp.hit_endtype = seq_compl | |
if id_ not in unordered_hits: | |
placeholder = [p for p in hit_placeholders if p.id_ == id_][0] | |
hit = placeholder.createHit([hsp]) | |
unordered_hits[id_] = hit | |
else: | |
hit = unordered_hits[id_] | |
hsp.hit_description = hit.description | |
hit.append(hsp) | |
# The placeholder list is in the correct order, so use that order for | |
# the Hit objects in the qresult | |
for p in hit_placeholders: | |
self.qresult.append(unordered_hits[p.id_]) | |
def parse_hsp_alignments(self): | |
"""Parse a HMMER2 HSP alignment block.""" | |
if not self.line.startswith("Alignments"): | |
return | |
while self.read_next(): | |
if self.line == "//" or self.line.startswith("Histogram"): | |
break | |
match = re.search(_HSP_ALIGN_LINE, self.line) | |
if match is None: | |
continue | |
id_ = match.group(1) | |
idx = int(match.group(2)) | |
num = int(match.group(3)) | |
hit = self.qresult[id_] | |
if hit.domain_obs_num != num: | |
continue | |
frag = hit[idx - 1][0] | |
hmmseq = "" | |
consensus = "" | |
otherseq = "" | |
structureseq = "" | |
pad = 0 | |
while self.read_next() and self.line.startswith(" "): | |
# if there's structure information, parse that | |
if self.line[16:18] == "CS": | |
structureseq += self.line[19:].strip() | |
if not self.read_next(): | |
break | |
# skip the *-> start marker if it exists | |
if self.line[19:22] == "*->": | |
seq = self.line[22:] | |
pad = 3 | |
else: | |
seq = self.line[19:] | |
pad = 0 | |
hmmseq += seq | |
line_len = len(seq) | |
if not self.read_next(rstrip=False): | |
break | |
consensus += self.line[19 + pad : 19 + pad + line_len] | |
# If there's no consensus sequence, hmmer2 doesn't | |
# bother to put spaces here, so add extra padding | |
extra_padding = len(hmmseq) - len(consensus) | |
consensus += " " * extra_padding | |
if not self.read_next(): | |
break | |
# if we have a line break in the end marker, we get a | |
# whitespace-only otherseq line, making split()[0] return | |
# the end coordinate. That'll be a -, which is a valid character | |
# in the sequence, meaning we can't just strip it. | |
parts = self.line[19:].split() | |
if len(parts) == 2: | |
otherseq += self.line[19:].split()[0].strip() | |
self.push_back(self.line) | |
# get rid of the end marker | |
if hmmseq.endswith("<-*"): | |
hmmseq = hmmseq[:-3] | |
consensus = consensus[:-3] | |
# add similarity sequence to annotation | |
frag.aln_annotation["similarity"] = consensus | |
# if there's structure information, add it to the fragment | |
if structureseq: | |
frag.aln_annotation["CS"] = structureseq | |
if self._meta["program"] == "hmmpfam": | |
frag.hit = hmmseq | |
frag.query = otherseq | |
else: | |
frag.hit = otherseq | |
frag.query = hmmseq | |
class Hmmer2TextIndexer(_BaseHmmerTextIndexer): | |
"""Indexer for hmmer2-text format.""" | |
_parser = Hmmer2TextParser | |
qresult_start = b"Query" | |
# qresults_ends for hmmpfam and hmmsearch | |
# need to anticipate both since hmmsearch have different query end mark | |
qresult_end = b"//" | |
def __iter__(self): | |
"""Iterate over Hmmer2TextIndexer; yields query results' key, offsets, 0.""" | |
handle = self._handle | |
handle.seek(0) | |
start_offset = handle.tell() | |
regex_id = re.compile(rb"Query\s*(?:sequence|HMM)?:\s*(.*)") | |
# determine flag for hmmsearch | |
is_hmmsearch = False | |
line = read_forward(handle) | |
if line.startswith(b"hmmsearch"): | |
is_hmmsearch = True | |
while True: | |
end_offset = handle.tell() | |
if line.startswith(self.qresult_start): | |
regx = re.search(regex_id, line) | |
qresult_key = regx.group(1).strip() | |
# qresult start offset is the offset of this line | |
# (starts with the start mark) | |
start_offset = end_offset - len(line) | |
elif line.startswith(self.qresult_end): | |
yield qresult_key.decode(), start_offset, 0 | |
start_offset = end_offset | |
elif not line: | |
# HACK: since hmmsearch can only have one query result | |
if is_hmmsearch: | |
yield qresult_key.decode(), start_offset, 0 | |
break | |
line = read_forward(handle) | |
# if not used as a module, run the doctest | |
if __name__ == "__main__": | |
from Bio._utils import run_doctest | |
run_doctest() | |