aakash0017's picture
Upload folder using huggingface_hub
b7731cd
# Copyright 2012 by Wibowo Arindrarto. All rights reserved.
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Bio.SearchIO parser for HMMER domain table output format."""
from itertools import chain
from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment
from .hmmer3_tab import Hmmer3TabParser, Hmmer3TabIndexer
__all__ = (
"Hmmer3DomtabHmmhitParser",
"Hmmer3DomtabHmmqueryParser",
"Hmmer3DomtabHmmhitIndexer",
"Hmmer3DomtabHmmqueryIndexer",
"Hmmer3DomtabHmmhitWriter",
"Hmmer3DomtabHmmqueryWriter",
)
class Hmmer3DomtabParser(Hmmer3TabParser):
"""Base hmmer3-domtab iterator."""
def _parse_row(self):
"""Return a dictionary of parsed row values (PRIVATE)."""
assert self.line
cols = [x for x in self.line.strip().split(" ") if x]
# if len(cols) > 23, we have extra description columns
# combine them all into one string in the 19th column
if len(cols) > 23:
cols[22] = " ".join(cols[22:])
elif len(cols) < 23:
cols.append("")
assert len(cols) == 23
# assign parsed column data into qresult, hit, and hsp dicts
qresult = {}
qresult["id"] = cols[3] # query name
qresult["accession"] = cols[4] # query accession
qresult["seq_len"] = int(cols[5]) # qlen
hit = {}
hit["id"] = cols[0] # target name
hit["accession"] = cols[1] # target accession
hit["seq_len"] = int(cols[2]) # tlen
hit["evalue"] = float(cols[6]) # evalue
hit["bitscore"] = float(cols[7]) # score
hit["bias"] = float(cols[8]) # bias
hit["description"] = cols[22] # description of target
hsp = {}
hsp["domain_index"] = int(cols[9]) # # (domain number)
# not parsing cols[10] since it's basically len(hit)
hsp["evalue_cond"] = float(cols[11]) # c-evalue
hsp["evalue"] = float(cols[12]) # i-evalue
hsp["bitscore"] = float(cols[13]) # score
hsp["bias"] = float(cols[14]) # bias
hsp["env_start"] = int(cols[19]) - 1 # env from
hsp["env_end"] = int(cols[20]) # env to
hsp["acc_avg"] = float(cols[21]) # acc
frag = {}
# strand is always 0, since HMMER now only handles protein
frag["hit_strand"] = frag["query_strand"] = 0
frag["hit_start"] = int(cols[15]) - 1 # hmm from
frag["hit_end"] = int(cols[16]) # hmm to
frag["query_start"] = int(cols[17]) - 1 # ali from
frag["query_end"] = int(cols[18]) # ali to
# HMMER results are always protein
frag["molecule_type"] = "protein"
# switch hmm<-->ali coordinates if hmm is not hit
if not self.hmm_as_hit:
frag["hit_end"], frag["query_end"] = (frag["query_end"], frag["hit_end"])
frag["hit_start"], frag["query_start"] = (
frag["query_start"],
frag["hit_start"],
)
return {"qresult": qresult, "hit": hit, "hsp": hsp, "frag": frag}
def _parse_qresult(self):
"""Return QueryResult objects (PRIVATE)."""
# state values, determines what to do for each line
state_EOF = 0
state_QRES_NEW = 1
state_QRES_SAME = 3
state_HIT_NEW = 2
state_HIT_SAME = 4
# dummies for initial states
qres_state = None
hit_state = None
file_state = None
# dummies for initial id caches
prev_qid = None
prev_hid = None
# dummies for initial parsed value containers
cur, prev = None, None
hit_list, hsp_list = [], []
cur_qid = None
cur_hid = None
while True:
# store previous line's parsed values, for every line after the 1st
if cur is not None:
prev = cur
prev_qid = cur_qid
prev_hid = cur_hid
# only parse the line if it's not EOF
if self.line and not self.line.startswith("#"):
cur = self._parse_row()
cur_qid = cur["qresult"]["id"]
cur_hid = cur["hit"]["id"]
else:
file_state = state_EOF
# mock ID values since the line is empty
cur_qid, cur_hid = None, None
# get the state of hit and qresult
if prev_qid != cur_qid:
qres_state = state_QRES_NEW
else:
qres_state = state_QRES_SAME
# new hits are hits with different ids or hits in a new qresult
if prev_hid != cur_hid or qres_state == state_QRES_NEW:
hit_state = state_HIT_NEW
else:
hit_state = state_HIT_SAME
# start creating objects after the first line (i.e. prev is filled)
if prev is not None:
# each line is basically an HSP with one HSPFragment
frag = HSPFragment(prev_hid, prev_qid)
for attr, value in prev["frag"].items():
setattr(frag, attr, value)
hsp = HSP([frag])
for attr, value in prev["hsp"].items():
setattr(hsp, attr, value)
hsp_list.append(hsp)
# create hit object when we've finished parsing all its hsps
# i.e. when hit state is state_HIT_NEW
if hit_state == state_HIT_NEW:
hit = Hit(hsp_list)
for attr, value in prev["hit"].items():
setattr(hit, attr, value)
hit_list.append(hit)
hsp_list = []
# create qresult and yield if we're at a new qresult or EOF
if qres_state == state_QRES_NEW or file_state == state_EOF:
qresult = QueryResult(hit_list, prev_qid)
for attr, value in prev["qresult"].items():
setattr(qresult, attr, value)
yield qresult
# if current line is EOF, break
if file_state == state_EOF:
break
hit_list = []
self.line = self.handle.readline()
class Hmmer3DomtabHmmhitParser(Hmmer3DomtabParser):
"""HMMER domain table parser using hit coordinates.
Parser for the HMMER domain table format that assumes HMM profile
coordinates are hit coordinates.
"""
hmm_as_hit = True
class Hmmer3DomtabHmmqueryParser(Hmmer3DomtabParser):
"""HMMER domain table parser using query coordinates.
Parser for the HMMER domain table format that assumes HMM profile
coordinates are query coordinates.
"""
hmm_as_hit = False
class Hmmer3DomtabHmmhitIndexer(Hmmer3TabIndexer):
"""HMMER domain table indexer using hit coordinates.
Indexer class for HMMER domain table output that assumes HMM profile
coordinates are hit coordinates.
"""
_parser = Hmmer3DomtabHmmhitParser
_query_id_idx = 3
class Hmmer3DomtabHmmqueryIndexer(Hmmer3TabIndexer):
"""HMMER domain table indexer using query coordinates.
Indexer class for HMMER domain table output that assumes HMM profile
coordinates are query coordinates.
"""
_parser = Hmmer3DomtabHmmqueryParser
_query_id_idx = 3
class Hmmer3DomtabHmmhitWriter:
"""HMMER domain table writer using hit coordinates.
Writer for hmmer3-domtab output format which writes hit coordinates
as HMM profile coordinates.
"""
hmm_as_hit = True
def __init__(self, handle):
"""Initialize the class."""
self.handle = handle
def write_file(self, qresults):
"""Write to the handle.
Returns a tuple of how many QueryResult, Hit, and HSP objects were written.
"""
handle = self.handle
qresult_counter, hit_counter, hsp_counter, frag_counter = 0, 0, 0, 0
try:
first_qresult = next(qresults)
except StopIteration:
handle.write(self._build_header())
else:
# write header
handle.write(self._build_header(first_qresult))
# and then the qresults
for qresult in chain([first_qresult], qresults):
if qresult:
handle.write(self._build_row(qresult))
qresult_counter += 1
hit_counter += len(qresult)
hsp_counter += sum(len(hit) for hit in qresult)
frag_counter += sum(len(hit.fragments) for hit in qresult)
return qresult_counter, hit_counter, hsp_counter, frag_counter
def _build_header(self, first_qresult=None):
"""Return the header string of a domain HMMER table output (PRIVATE)."""
# calculate whitespace required
# adapted from HMMER's source: src/p7_tophits.c#L1157
if first_qresult:
# qnamew = max(20, len(first_qresult.id))
qnamew = 20
tnamew = max(20, len(first_qresult[0].id))
try:
qaccw = max(10, len(first_qresult.acc))
taccw = max(10, len(first_qresult[0].acc))
except AttributeError:
qaccw, taccw = 10, 10
else:
qnamew, tnamew, qaccw, taccw = 20, 20, 10, 10
# Turn black code style off
# fmt: off
header = ("#%*s %22s %40s %11s %11s %11s\n"
% (tnamew + qnamew - 1 + 15 + taccw + qaccw, "", "--- full sequence ---",
"-------------- this domain -------------", "hmm coord",
"ali coord", "env coord"))
header += ("#%-*s %-*s %5s %-*s %-*s %5s %9s %6s %5s %3s %3s %9s "
"%9s %6s %5s %5s %5s %5s %5s %5s %5s %4s %s\n"
% (tnamew - 1,
" target name", taccw, "accession", "tlen", qnamew,
"query name", qaccw, "accession", "qlen", "E-value", "score",
"bias", "#", "of", "c-Evalue", "i-Evalue", "score", "bias",
"from", "to", "from", "to", "from", "to", "acc",
"description of target"))
header += ("#%*s %*s %5s %*s %*s %5s %9s %6s %5s %3s %3s %9s %9s "
"%6s %5s %5s %5s %5s %5s %5s %5s %4s %s\n"
% (tnamew - 1,
"-------------------", taccw, "----------", "-----",
qnamew, "--------------------", qaccw, "----------",
"-----", "---------", "------", "-----", "---", "---",
"---------", "---------", "------", "-----", "-----", "-----",
"-----", "-----", "-----", "-----", "----",
"---------------------"))
# Turn black code style on
# fmt: on
return header
def _build_row(self, qresult):
"""Return a string or one row or more of the QueryResult object (PRIVATE)."""
rows = ""
# calculate whitespace required
# adapted from HMMER's source: src/p7_tophits.c#L1083
qnamew = max(20, len(qresult.id))
tnamew = max(20, len(qresult[0].id))
try:
qaccw = max(10, len(qresult.accession))
taccw = max(10, len(qresult[0].accession))
qresult_acc = qresult.accession
except AttributeError:
qaccw, taccw = 10, 10
qresult_acc = "-"
for hit in qresult:
# try to get hit accession
try:
hit_acc = hit.accession
except AttributeError:
hit_acc = "-"
for hsp in hit.hsps:
if self.hmm_as_hit:
hmm_to = hsp.hit_end
hmm_from = hsp.hit_start + 1
ali_to = hsp.query_end
ali_from = hsp.query_start + 1
else:
hmm_to = hsp.query_end
hmm_from = hsp.query_start + 1
ali_to = hsp.hit_end
ali_from = hsp.hit_start + 1
rows += (
"%-*s %-*s %5d %-*s %-*s %5d %9.2g %6.1f %5.1f %3d"
" %3d %9.2g %9.2g %6.1f %5.1f %5d %5d %5ld %5ld"
" %5d %5d %4.2f %s\n"
% (
tnamew,
hit.id,
taccw,
hit_acc,
hit.seq_len,
qnamew,
qresult.id,
qaccw,
qresult_acc,
qresult.seq_len,
hit.evalue,
hit.bitscore,
hit.bias,
hsp.domain_index,
len(hit.hsps),
hsp.evalue_cond,
hsp.evalue,
hsp.bitscore,
hsp.bias,
hmm_from,
hmm_to,
ali_from,
ali_to,
hsp.env_start + 1,
hsp.env_end,
hsp.acc_avg,
hit.description,
)
)
return rows
class Hmmer3DomtabHmmqueryWriter(Hmmer3DomtabHmmhitWriter):
"""HMMER domain table writer using query coordinates.
Writer for hmmer3-domtab output format which writes query coordinates
as HMM profile coordinates.
"""
hmm_as_hit = False
# if not used as a module, run the doctest
if __name__ == "__main__":
from Bio._utils import run_doctest
run_doctest()