# aakash0017's picture
# Upload folder using huggingface_hub
# b7731cd
# Copyright 2012 by Wibowo Arindrarto. All rights reserved.
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Bio.SearchIO parser for HMMER plain text output format."""
import re
from Bio.SearchIO._utils import read_forward
from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment
from ._base import _BaseHmmerTextIndexer
__all__ = ("Hmmer3TextParser", "Hmmer3TextIndexer")


# Regex patterns are compiled once at import time so the parser does not
# have to rebuild them for every line it inspects.

# preamble banner line: group 1 is the program name (a word containing
# "hmm") that precedes the " :: " separator
_RE_PROGRAM = re.compile(r"^# (\w*hmm\w+) :: .*$")
# preamble version line: group 1 is the version token that follows the
# first word and precedes the "; http..." tail
_RE_VERSION = re.compile(r"# \w+ ([\w+\.]+) .*; http.*$")
# preamble option line of the form "# <name>: <value>";
# group 1 is the option name, group 2 its value
_RE_OPT = re.compile(r"^# (.+):\s+(.+)$")
# "Query:" line: group 1 is the query ID, group 2 the length given inside
# the square brackets. The raw pattern string is kept separately because the
# indexer re-compiles it as a bytes pattern.
_QRE_ID_LEN_PTN = r"^Query:\s*(.*)\s+\[\w=(\d+)\]"
_QRE_ID_LEN = re.compile(_QRE_ID_LEN_PTN)

# regex for HSP validation (score / value on a domain header line).
# NOTE(review): in the visible part of this file it is referenced only from
# commented-out code in the alignment block parser.
_HRE_VALIDATE = re.compile(r"score:\s(-?\d+\.?\d+)\sbits.*value:\s(.*)")
# regexes for parsing HSP alignment blocks
# annotation line: group 2 is the annotation string, group 3 the trailing
# word used as the annotation name
_HRE_ANNOT_LINE = re.compile(r"^(\s+)(.+)\s(\w+)")
# hit/query sequence line: group 1 is the ID and start coordinate prefix,
# group 2 the aligned sequence, group 3 the end coordinate
_HRE_ID_LINE = re.compile(r"^(\s+\S+\s+[0-9-]+ )(.+?)(\s+[0-9-]+)")
class Hmmer3TextParser:
    """Parser for the HMMER 3.0 text output.

    The parser keeps a one-line cursor in ``self.line`` and advances it
    through the handle. Iterating over an instance yields one
    ``QueryResult`` per query block found in the file.

    Truncated input no longer makes the parser spin forever: reaching the
    end of the file in the middle of a block either stops the iteration or
    raises an exception instead of re-reading the empty line endlessly.

    NOTE(review): several line markers matched below start with a single
    space. HMMER reports are column-aligned, so the whitespace inside these
    literals may have been collapsed in this copy of the file -- verify them
    against real HMMER output before relying on them.
    """

    def __init__(self, handle):
        """Initialize the class.

        :param handle: text handle positioned at the start of the HMMER
            output; the preamble is consumed immediately.
        """
        self.handle = handle
        self.line = read_forward(self.handle)
        self._meta = self._parse_preamble()

    def __iter__(self):
        """Iterate over query results."""
        yield from self._parse_qresult()

    def _read_until(self, bool_func):
        """Read the file handle until the given function returns True (PRIVATE).

        Stops without error when the end of the file is reached first, in
        which case ``self.line`` is left as the empty string.
        """
        while self.line and not bool_func(self.line):
            self.line = read_forward(self.handle)

    def _parse_preamble(self):
        """Parse HMMER preamble (lines beginning with '#') (PRIVATE).

        Returns a dictionary holding the program name, the version and the
        options listed in the preamble. The option whose name contains
        'target' is stored under the key ``"target"``.
        """
        meta = {}
        # state flag: True while the cursor is inside the options section
        has_opts = False
        # no pound sign means we've left the preamble
        while self.line.startswith("#"):
            if "- - -" in self.line:
                # dashed separators delimit the options section; the first
                # one opens it and the second one ends the preamble
                if has_opts:
                    break
                has_opts = True
            elif not has_opts:
                # banner lines: try the program name, then the version
                regx = _RE_PROGRAM.search(self.line)
                if regx:
                    meta["program"] = regx.group(1)
                regx = _RE_VERSION.search(self.line)
                if regx:
                    meta["version"] = regx.group(1)
            else:
                regx = _RE_OPT.search(self.line)
                # lines that are not 'name: value' pairs are skipped rather
                # than crashing on a missing match object
                if regx is not None:
                    if "target" in regx.group(1):
                        meta["target"] = regx.group(2).strip()
                    else:
                        meta[regx.group(1)] = regx.group(2)
            self.line = read_forward(self.handle)

        return meta

    def _parse_qresult(self):
        """Parse a HMMER3 query block (PRIVATE).

        Generator yielding one ``QueryResult`` per query block.

        :raises ValueError: if the file ends after a 'Query:' line but
            before the hit table of that query.
        """
        self._read_until(lambda line: line.startswith("Query:"))

        while self.line:
            regx = _QRE_ID_LEN.search(self.line)
            while not regx:
                self.line = read_forward(self.handle)
                if not self.line:
                    # end of file without another query block: we're done
                    return
                regx = _QRE_ID_LEN.search(self.line)

            # get query id and length
            qid = regx.group(1).strip()
            # store qresult attributes
            qresult_attrs = {
                "seq_len": int(regx.group(2)),
                "program": self._meta.get("program"),
                "version": self._meta.get("version"),
                "target": self._meta.get("target"),
            }

            # get description and accession, if they exist
            qdesc = "<unknown description>"  # placeholder
            while not self.line.startswith("Scores for "):
                self.line = read_forward(self.handle)
                if not self.line:
                    raise ValueError(
                        "Unexpected end of file while parsing query %r" % qid
                    )
                if self.line.startswith("Accession:"):
                    acc = self.line.strip().split(" ", 1)[1]
                    qresult_attrs["accession"] = acc.strip()
                elif self.line.startswith("Description:"):
                    qdesc = self.line.strip().split(" ", 1)[1].strip()
                    qresult_attrs["description"] = qdesc

            # parse the query hits; the cursor sits on the 'Scores for' line
            # here, so this loop always runs at least once
            while self.line and "//" not in self.line:
                hit_list = self._parse_hit(qid, qdesc)
                # read through the statistics summary
                # TODO: parse and store this information?
                if self.line.startswith("Internal pipeline"):
                    while self.line and "//" not in self.line:
                        self.line = read_forward(self.handle)

            # create qresult, set its attributes and yield
            # not initializing hit_list directly to handle empty hits
            # (i.e. need to set its query description manually)
            qresult = QueryResult(id=qid, hits=hit_list)
            for attr, value in qresult_attrs.items():
                setattr(qresult, attr, value)
            yield qresult
            self.line = read_forward(self.handle)

            # Skip line beginning with '# Alignment of', which are output
            # when running phmmer with the '-A' flag.
            if self.line.startswith("#"):
                self.line = self.handle.readline()

            # HMMER >= 3.1 outputs '[ok]' at the end of all results file,
            # which means we can break the main loop when we see the line
            if "[ok]" in self.line:
                break

    def _parse_hit(self, qid, qdesc):
        """Parse a HMMER3 hit block, beginning with the hit table (PRIVATE).

        Returns the list of ``Hit`` objects of the current query, or an empty
        list when there are no hits or the file ends inside the table.
        """
        # get to the end of the hit table delimiter and read one more line
        self._read_until(lambda line: line.startswith(" ------- ------ -----"))
        self.line = read_forward(self.handle)

        # assume every hit is in inclusion threshold until the inclusion
        # threshold line is encountered
        is_included = True

        # parse the hit table
        hit_attr_list = []
        while True:
            if not self.line:
                return []
            if self.line.startswith(" ------ inclusion"):
                # every row below this line is outside the threshold; the
                # line read next is handled as a table row straight away
                is_included = False
                self.line = read_forward(self.handle)
            elif self.line.startswith(" [No hits detected that satisfy reporting"):
                # if there are no hits, then there are no hsps
                # so we forward-read until 'Internal pipeline..'
                while True:
                    self.line = read_forward(self.handle)
                    if not self.line:
                        # truncated file: same outcome as the EOF check above
                        return []
                    if self.line.startswith("Internal pipeline"):
                        assert len(hit_attr_list) == 0
                        return []
            elif self.line.startswith("Domain annotation for each "):
                return self._create_hits(hit_attr_list, qid, qdesc)

            # entering hit results row
            # parse the columns into a list
            row = [x for x in self.line.strip().split(" ") if x]
            if len(row) > 10:
                # join the description words if it's >1 word
                row[9] = " ".join(row[9:])
            elif len(row) < 10:
                # if there's no description, set it to an empty string
                row.append("")
                assert len(row) == 10
            # collect the hit attributes; the Hit itself is created later,
            # once its HSPs have been parsed
            hit_attr_list.append(
                {
                    "id": row[8],
                    "query_id": qid,
                    "evalue": float(row[0]),
                    "bitscore": float(row[1]),
                    "bias": float(row[2]),
                    # row[3:6] is not parsed, since the info is available
                    # at the HSP level
                    "domain_exp_num": float(row[6]),
                    "domain_obs_num": int(row[7]),
                    "description": row[9],
                    "is_included": is_included,
                }
            )

            self.line = read_forward(self.handle)

    def _create_hits(self, hit_attrs, qid, qdesc):
        """Parse a HMMER3 hsp block, beginning with the hsp table (PRIVATE).

        ``hit_attrs`` is the list of attribute dictionaries built from the
        hit table; one entry is consumed (popped from the front) for every
        '>>' block parsed. Returns the list of created ``Hit`` objects.
        """
        # markers that end the hsp table of the current hit: no hits, the
        # last hsp, or the start of a new hit
        table_end_markers = (
            " [No targets detected that satisfy",
            " [No individual domains",
            "Internal pipeline statistics summary:",
            " Alignments for each domain:",
            ">>",
        )
        # depending on whether the program is hmmsearch, hmmscan, or phmmer
        # {hmm,ali}{from,to} can either be hit_{from,to} or query_{from,to}
        # for hmmscan, hit is the hmm profile, query is the sequence
        program = self._meta.get("program")
        hmm_is_hit = program == "hmmscan"
        hmm_is_query = program in ["hmmsearch", "phmmer"]

        # read through until the beginning of the hsp block
        self._read_until(lambda line: line.startswith(("Internal pipeline", ">>")))

        # start parsing the hsp block
        hit_list = []
        while True:
            if self.line.startswith("Internal pipeline"):
                # by this time we should've emptied the hit attr list
                assert len(hit_attrs) == 0
                return hit_list
            assert self.line.startswith(">>")
            hid, hdesc = self.line[len(">> ") :].split(" ", 1)
            hdesc = hdesc.strip()

            # read through the hsp table header and move one more line
            self._read_until(
                lambda line: line.startswith(
                    (" --- ------ ----- --------", " [No individual domains")
                )
            )
            self.line = read_forward(self.handle)

            # parse the hsp table for the current hit
            hsp_list = []
            while not self.line.startswith(table_end_markers):
                parsed = [x for x in self.line.strip().split(" ") if x]
                assert len(parsed) == 16
                # parsed column order:
                # index, is_included, bitscore, bias, evalue_cond, evalue
                # hmmfrom, hmmto, query_ends, hit_ends, alifrom, alito,
                # envfrom, envto, acc_avg
                frag = HSPFragment(hid, qid)
                # set query and hit descriptions if they are defined /
                # nonempty string
                if qdesc:
                    frag.query_description = qdesc
                if hdesc:
                    frag.hit_description = hdesc
                # HMMER3 results are always protein
                frag.molecule_type = "protein"
                # adjust 'from' and 'to' coordinates to 0-based ones
                if hmm_is_hit:
                    frag.hit_start = int(parsed[6]) - 1
                    frag.hit_end = int(parsed[7])
                    frag.query_start = int(parsed[9]) - 1
                    frag.query_end = int(parsed[10])
                elif hmm_is_query:
                    frag.hit_start = int(parsed[9]) - 1
                    frag.hit_end = int(parsed[10])
                    frag.query_start = int(parsed[6]) - 1
                    frag.query_end = int(parsed[7])
                # strand is always 0, since HMMER now only handles protein
                frag.hit_strand = frag.query_strand = 0

                hsp = HSP([frag])
                hsp.domain_index = int(parsed[0])
                hsp.is_included = parsed[1] == "!"
                hsp.bitscore = float(parsed[2])
                hsp.bias = float(parsed[3])
                hsp.evalue_cond = float(parsed[4])
                hsp.evalue = float(parsed[5])
                # end-type markers follow the same hmm/ali assignment as the
                # coordinates above
                if hmm_is_hit:
                    hsp.hit_endtype = parsed[8]
                    hsp.query_endtype = parsed[11]
                elif hmm_is_query:
                    hsp.hit_endtype = parsed[11]
                    hsp.query_endtype = parsed[8]
                # adjust 'from' and 'to' coordinates to 0-based ones
                hsp.env_start = int(parsed[12]) - 1
                hsp.env_end = int(parsed[13])
                hsp.env_endtype = parsed[14]
                hsp.acc_avg = float(parsed[15])
                hsp_list.append(hsp)
                self.line = read_forward(self.handle)

            # end of the hsp table: build the hit from the collected hsps
            # and the matching row of the hit table
            hit_attr = hit_attrs.pop(0)
            hit = Hit(hsp_list)
            for attr, value in hit_attr.items():
                if attr == "description":
                    # keep the description already on the hit when the hit
                    # table value is only a prefix of it
                    cur_val = getattr(hit, attr)
                    if cur_val and value and cur_val.startswith(value):
                        continue
                setattr(hit, attr, value)
            if not hit:
                # a hit without hsps cannot inherit the query description
                # from its fragments, so set it explicitly
                hit.query_description = qdesc
            hit_list.append(hit)

            # parse the hsp alignments
            if self.line.startswith(" Alignments for each domain:"):
                self._parse_aln_block(hid, hit.hsps)

    def _parse_aln_block(self, hid, hsp_list):
        """Parse a HMMER3 HSP alignment block (PRIVATE).

        Fills in the sequences and alignment annotations of the first
        fragment of every HSP in ``hsp_list`` (modified in place) and
        returns the list. ``hid`` is accepted but not used.
        """
        block_end_markers = (">>", "Internal pipeline")
        domain_end_markers = (" == domain", ">>", "Internal pipeline")
        program = self._meta.get("program")

        self.line = read_forward(self.handle)
        dom_counter = 0
        while True:
            # an empty line means the file ended inside the block; stop here
            # instead of reading the exhausted handle forever
            if not self.line or self.line.startswith(block_end_markers):
                return hsp_list
            assert self.line.startswith(" == domain %i" % (dom_counter + 1))
            # alias hsp to local var
            # but note that we're still changing the attrs of the actual
            # hsp inside the qresult as we're not creating a copy
            frag = hsp_list[dom_counter][0]
            # XXX: should we validate again here? regex is expensive..
            # regx = re.search(_HRE_VALIDATE, self.line)
            # assert hsp.bitscore == float(regx.group(1))
            # assert hsp.evalue_cond == float(regx.group(2))
            hmmseq = ""
            aliseq = ""
            annot = {}
            self.line = self.handle.readline()

            # parse all the alignment blocks in the hsp
            while True:
                # check for hit or query line
                # we don't check for the hit or query id specifically
                # to anticipate special cases where query id == hit id
                regx = _HRE_ID_LINE.search(self.line)
                if regx:
                    # the first hit/query line we encounter is the hmmseq
                    if len(hmmseq) == len(aliseq):
                        hmmseq += regx.group(2)
                    # and for subsequent lines, len(hmmseq) is either
                    # > or == len(aliseq)
                    elif len(hmmseq) > len(aliseq):
                        aliseq += regx.group(2)
                    assert len(hmmseq) >= len(aliseq)
                # check for start of new domain, end of the block or
                # end of the file
                elif not self.line or self.line.startswith(domain_end_markers):
                    frag.aln_annotation = annot
                    if program == "hmmscan":
                        frag.hit = hmmseq
                        frag.query = aliseq
                    elif program in ["hmmsearch", "phmmer"]:
                        frag.hit = aliseq
                        frag.query = hmmseq
                    dom_counter += 1
                    break
                # otherwise check if it's an annotation line and parse it
                # len(hmmseq) is only != len(aliseq) when the cursor is parsing
                # the similarity character. Since we're not parsing that, we
                # check for when the condition is False (i.e. when it's ==)
                elif len(hmmseq) == len(aliseq):
                    regx = _HRE_ANNOT_LINE.search(self.line)
                    if regx:
                        annot_name = regx.group(3)
                        if annot_name in annot:
                            annot[annot_name] += regx.group(2)
                        else:
                            annot[annot_name] = regx.group(2)

                self.line = self.handle.readline()
class Hmmer3TextIndexer(_BaseHmmerTextIndexer):
    """Indexer class for HMMER plain text output."""

    _parser = Hmmer3TextParser
    # byte markers delimiting one query result block
    qresult_start = b"Query: "
    qresult_end = b"//"

    def __iter__(self):
        """Iterate over Hmmer3TextIndexer; yields query results' key, offsets, 0."""
        stream = self._handle
        stream.seek(0)
        block_start = stream.tell()
        # the handle yields bytes, so the query pattern is compiled as bytes
        id_pattern = re.compile(_QRE_ID_LEN_PTN.encode())

        while True:
            line = read_forward(stream)
            if not line:
                # end of file
                break
            line_end = stream.tell()

            if line.startswith(self.qresult_start):
                match = id_pattern.search(line)
                qresult_key = match.group(1).strip()
                # qresult start offset is the offset of this line
                # (starts with the start mark)
                block_start = line_end - len(line)
            elif line.startswith(self.qresult_end):
                yield qresult_key.decode(), block_start, 0
                block_start = line_end
# When this file is executed as a script rather than imported as a module,
# run its doctests through the run_doctest helper from Bio._utils.
if __name__ == "__main__":
    from Bio._utils import run_doctest

    run_doctest()