# aakash0017's picture
# Upload folder using huggingface_hub
# b7731cd
# Copyright 2012 by Wibowo Arindrarto. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Bio.SearchIO parser for BLAST+ tab output format, with or without comments."""
import re
from Bio.SearchIO._index import SearchIndexer
from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment
__all__ = ("BlastTabIndexer", "BlastTabParser", "BlastTabWriter")
# longname-shortname map
# maps the column names shown in a commented output to its short name
# (the one used in the command line)
_LONG_SHORT_MAP = {
"query id": "qseqid",
"query acc.": "qacc",
"query acc.ver": "qaccver",
"query length": "qlen",
"subject id": "sseqid",
"subject acc.": "sacc",
"subject acc.ver": "saccver",
"subject length": "slen",
"alignment length": "length",
"bit score": "bitscore",
"score": "score",
"evalue": "evalue",
"identical": "nident",
"% identity": "pident",
"positives": "positive",
"% positives": "ppos",
"mismatches": "mismatch",
"gaps": "gaps",
"q. start": "qstart",
"q. end": "qend",
"s. start": "sstart",
"s. end": "send",
"query frame": "qframe",
"sbjct frame": "sframe",
"query/sbjct frames": "frames",
"query seq": "qseq",
"subject seq": "sseq",
"gap opens": "gapopen",
"query gi": "qgi",
"subject ids": "sallseqid",
"subject gi": "sgi",
"subject gis": "sallgi",
"BTOP": "btop",
"subject accs.": "sallacc",
"subject tax ids": "staxids",
"subject sci names": "sscinames",
"subject com names": "scomnames",
"subject blast names": "sblastnames",
"subject super kingdoms": "sskingdoms",
"subject title": "stitle",
"subject titles": "salltitles",
"subject strand": "sstrand",
"% subject coverage": "qcovs",
"% hsp coverage": "qcovhsp",
}
# function to create a list from semicolon-delimited string
# used in BlastTabParser._parse_result_row
def _list_semicol(s):
return s.split(";")
def _list_diamond(s):
return s.split("<>")
# column to class attribute map
_COLUMN_QRESULT = {
"qseqid": ("id", str),
"qacc": ("accession", str),
"qaccver": ("accession_version", str),
"qlen": ("seq_len", int),
"qgi": ("gi", str),
}
_COLUMN_HIT = {
"sseqid": ("id", str),
"sallseqid": ("id_all", _list_semicol),
"sacc": ("accession", str),
"saccver": ("accession_version", str),
"sallacc": ("accession_all", _list_semicol),
"sgi": ("gi", str),
"sallgi": ("gi_all", str),
"slen": ("seq_len", int),
"staxids": ("tax_ids", _list_semicol),
"sscinames": ("sci_names", _list_semicol),
"scomnames": ("com_names", _list_semicol),
"sblastnames": ("blast_names", _list_semicol),
"sskingdoms": ("super_kingdoms", _list_semicol),
"stitle": ("title", str),
"salltitles": ("title_all", _list_diamond),
# set strand as HSP property?
"sstrand": ("strand", str),
"qcovs": ("query_coverage", float),
}
_COLUMN_HSP = {
"bitscore": ("bitscore", float),
"score": ("bitscore_raw", int),
"evalue": ("evalue", float),
"nident": ("ident_num", int),
"pident": ("ident_pct", float),
"positive": ("pos_num", int),
"ppos": ("pos_pct", float),
"mismatch": ("mismatch_num", int),
"gaps": ("gap_num", int),
"gapopen": ("gapopen_num", int),
"btop": ("btop", str),
"qcovhsp": ("query_coverage", float),
}
_COLUMN_FRAG = {
"length": ("aln_span", int),
"qstart": ("query_start", int),
"qend": ("query_end", int),
"sstart": ("hit_start", int),
"send": ("hit_end", int),
"qframe": ("query_frame", int),
"sframe": ("hit_frame", int),
"frames": ("frames", str),
"qseq": ("query", str),
"sseq": ("hit", str),
}
_SUPPORTED_FIELDS = set(
list(_COLUMN_QRESULT) + list(_COLUMN_HIT) + list(_COLUMN_HSP) + list(_COLUMN_FRAG)
)
# column order in the non-commented tabular output variant
# values must be keys inside the column-attribute maps above
_DEFAULT_FIELDS = [
"qseqid",
"sseqid",
"pident",
"length",
"mismatch",
"gapopen",
"qstart",
"qend",
"sstart",
"send",
"evalue",
"bitscore",
]
# one field from each of the following sets must exist in order for the
# parser to work
_MIN_QUERY_FIELDS = {"qseqid", "qacc", "qaccver"}
_MIN_HIT_FIELDS = {"sseqid", "sacc", "saccver", "sallseqid"}
# simple function to create BLAST HSP attributes that may be computed if
# other certain attributes are present
# This was previously implemented in the HSP objects in the old model
_RE_GAPOPEN = re.compile(r"\w-")
def _compute_gapopen_num(hsp):
"""Return the number of gap openings in the given HSP (PRIVATE)."""
gapopen = 0
for seq_type in ("query", "hit"):
seq = str(getattr(hsp, seq_type).seq)
gapopen += len(re.findall(_RE_GAPOPEN, seq))
return gapopen
def _augment_blast_hsp(hsp, attr):
"""Calculate the given HSP attribute, for writing (PRIVATE)."""
if not hasattr(hsp, attr) and not attr.endswith("_pct"):
# aln_span is number of identical matches + mismatches + gaps
if attr == "aln_span":
hsp.aln_span = hsp.ident_num + hsp.mismatch_num + hsp.gap_num
# ident and gap requires the num values to be computed first
elif attr.startswith("ident"):
setattr(hsp, attr, hsp.aln_span - hsp.mismatch_num - hsp.gap_num)
elif attr.startswith("gap"):
setattr(hsp, attr, hsp.aln_span - hsp.ident_num - hsp.mismatch_num)
elif attr == "mismatch_num":
setattr(hsp, attr, hsp.aln_span - hsp.ident_num - hsp.gap_num)
elif attr == "gapopen_num":
if not hasattr(hsp, "query") or not hasattr(hsp, "hit"):
raise AttributeError
hsp.gapopen_num = _compute_gapopen_num(hsp)
# if the attr is a percent value, calculate it
if attr == "ident_pct":
hsp.ident_pct = hsp.ident_num / hsp.aln_span * 100
elif attr == "pos_pct":
hsp.pos_pct = hsp.pos_num / hsp.aln_span * 100
elif attr == "gap_pct":
hsp.gap_pct = hsp.gap_num / hsp.aln_span * 100
class BlastTabParser:
"""Parser for the BLAST tabular format."""
def __init__(self, handle, comments=False, fields=_DEFAULT_FIELDS):
"""Initialize the class."""
self.handle = handle
self.has_comments = comments
self.fields = self._prep_fields(fields)
self.line = self.handle.readline().strip()
def __iter__(self):
"""Iterate over BlastTabParser, yields query results."""
# stop iteration if file has no lines
if not self.line:
return
# determine which iterator to use
elif self.has_comments:
iterfunc = self._parse_commented_qresult
else:
if self.line.startswith("#"):
raise ValueError(
"Encountered unexpected character '#' at the beginning of a line. "
"Set comments=True if the file is a commented file."
)
iterfunc = self._parse_qresult
yield from iterfunc()
def _prep_fields(self, fields):
"""Validate and format the given fields for use by the parser (PRIVATE)."""
# cast into list if fields is a space-separated string
if isinstance(fields, str):
fields = fields.strip().split(" ")
# blast allows 'std' as a proxy for the standard default lists
# we want to transform 'std' to its proper column names
if "std" in fields:
idx = fields.index("std")
fields = fields[:idx] + _DEFAULT_FIELDS + fields[idx + 1 :]
# if set(fields) has a null intersection with minimum required
# fields for hit and query, raise an exception
if not set(fields).intersection(_MIN_QUERY_FIELDS) or not set(
fields
).intersection(_MIN_HIT_FIELDS):
raise ValueError("Required query and/or hit ID field not found.")
return fields
def _parse_commented_qresult(self):
"""Yield ``QueryResult`` objects from a commented file (PRIVATE)."""
while True:
comments = self._parse_comments()
if comments:
try:
self.fields = comments["fields"]
# iterator for the query results
qres_iter = self._parse_qresult()
except KeyError:
# no fields means the query has no results
assert "fields" not in comments
# create an iterator returning one empty qresult
# if the query has no results
qres_iter = iter([QueryResult()])
for qresult in qres_iter:
for key, value in comments.items():
setattr(qresult, key, value)
yield qresult
else:
break
def _parse_comments(self):
"""Return a dictionary containing tab file comments (PRIVATE)."""
comments = {}
while True:
# parse program and version
# example: # BLASTX 2.2.26+
if "BLAST" in self.line and "processed" not in self.line:
program_line = self.line[len(" #") :].split(" ")
comments["program"] = program_line[0].lower()
comments["version"] = program_line[1]
# parse query id and description (if available)
# example: # Query: gi|356995852 Mus musculus POU domain
elif "Query" in self.line:
query_line = self.line[len("# Query: ") :].split(" ", 1)
comments["id"] = query_line[0]
if len(query_line) == 2:
comments["description"] = query_line[1]
# parse target database
# example: # Database: db/minirefseq_protein
elif "Database" in self.line:
comments["target"] = self.line[len("# Database: ") :]
# parse RID (from remote searches)
elif "RID" in self.line:
comments["rid"] = self.line[len("# RID: ") :]
# parse column order, required for parsing the result lines
# example: # Fields: query id, query gi, query acc., query length
elif "Fields" in self.line:
comments["fields"] = self._parse_fields_line()
# if the line has these strings, it's either the end of a comment
# or the end of a file, so we return all the comments we've parsed
elif " hits found" in self.line or "processed" in self.line:
self.line = self.handle.readline().strip()
return comments
self.line = self.handle.readline()
if not self.line:
return comments
else:
self.line = self.line.strip()
def _parse_fields_line(self):
"""Return column short names line from 'Fields' comment line (PRIVATE)."""
raw_field_str = self.line[len("# Fields: ") :]
long_fields = raw_field_str.split(", ")
fields = [_LONG_SHORT_MAP[long_name] for long_name in long_fields]
return self._prep_fields(fields)
def _parse_result_row(self):
"""Return a dictionary of parsed row values (PRIVATE)."""
fields = self.fields
columns = self.line.strip().split("\t")
if len(fields) != len(columns):
raise ValueError(
"Expected %i columns, found: %i" % (len(fields), len(columns))
)
qresult, hit, hsp, frag = {}, {}, {}, {}
for idx, value in enumerate(columns):
sname = fields[idx]
# flag to check if any of the _COLUMNs contain sname
in_mapping = False
# iterate over each dict, mapping pair to determine
# attribute name and value of each column
for parsed_dict, mapping in (
(qresult, _COLUMN_QRESULT),
(hit, _COLUMN_HIT),
(hsp, _COLUMN_HSP),
(frag, _COLUMN_FRAG),
):
# process parsed value according to mapping
if sname in mapping:
attr_name, caster = mapping[sname]
if caster is not str:
value = caster(value)
parsed_dict[attr_name] = value
in_mapping = True
# make sure that any unhandled field is not supported
if not in_mapping:
assert sname not in _SUPPORTED_FIELDS
return {"qresult": qresult, "hit": hit, "hsp": hsp, "frag": frag}
def _get_id(self, parsed):
"""Return the value used for a QueryResult or Hit ID from a parsed row (PRIVATE)."""
# use 'id', with 'id_all', 'accession' and 'accession_version'
# fallbacks one of these must have a value since we've checked whether
# they exist or not when parsing the comments
id_cache = parsed.get("id")
if id_cache is None and "id_all" in parsed:
id_cache = parsed.get("id_all")[0]
if id_cache is None:
id_cache = parsed.get("accession")
if id_cache is None:
id_cache = parsed.get("accession_version")
return id_cache
def _parse_qresult(self):
"""Yield QueryResult objects (PRIVATE)."""
# state values, used to determine what to do with each line
state_EOF = 0
state_QRES_NEW = 1
state_QRES_SAME = 3
state_HIT_NEW = 2
state_HIT_SAME = 4
# dummies for initial states
qres_state = None
hit_state = None
file_state = None
cur_qid = None
cur_hid = None
# dummies for initial id caches
prev_qid = None
prev_hid = None
# dummies for initial parsed value containers
cur, prev = None, None
hit_list, hsp_list = [], []
while True:
# store previous line's parsed values if we've past the first line
if cur is not None:
prev = cur
prev_qid = cur_qid
prev_hid = cur_hid
# only parse the line if it's not EOF or not a comment line
if self.line and not self.line.startswith("#"):
cur = self._parse_result_row()
cur_qid = self._get_id(cur["qresult"])
cur_hid = self._get_id(cur["hit"])
else:
file_state = state_EOF
# mock values for cur_qid and cur_hid since the line is empty
cur_qid, cur_hid = None, None
# get the state of hit and qresult
if prev_qid != cur_qid:
qres_state = state_QRES_NEW
else:
qres_state = state_QRES_SAME
# new hits are hits with different id or hits in a new qresult
if prev_hid != cur_hid or qres_state == state_QRES_NEW:
hit_state = state_HIT_NEW
else:
hit_state = state_HIT_SAME
# we're creating objects for the previously parsed line(s),
# so nothing is done in the first parsed line (prev == None)
if prev is not None:
# every line is essentially an HSP with one fragment, so we
# create both of these for every line
frag = HSPFragment(prev_hid, prev_qid)
for attr, value in prev["frag"].items():
# adjust coordinates to Python range
# NOTE: this requires both start and end coords to be
# present, otherwise a KeyError will be raised.
# Without this limitation, we might misleadingly set the
# start / end coords
for seq_type in ("query", "hit"):
if attr == seq_type + "_start":
value = min(value, prev["frag"][seq_type + "_end"]) - 1
elif attr == seq_type + "_end":
value = max(value, prev["frag"][seq_type + "_start"])
setattr(frag, attr, value)
# strand and frame setattr require the full parsed values
# to be set first
for seq_type in ("hit", "query"):
# try to set hit and query frame
frame = self._get_frag_frame(frag, seq_type, prev["frag"])
setattr(frag, "%s_frame" % seq_type, frame)
# try to set hit and query strand
strand = self._get_frag_strand(frag, seq_type, prev["frag"])
setattr(frag, "%s_strand" % seq_type, strand)
hsp = HSP([frag])
for attr, value in prev["hsp"].items():
setattr(hsp, attr, value)
hsp_list.append(hsp)
# create hit and append to temp hit container if hit_state
# says we're not at the same hit or at a new query
if hit_state == state_HIT_NEW:
hit = Hit(hsp_list)
for attr, value in prev["hit"].items():
if attr != "id_all":
setattr(hit, attr, value)
else:
# not setting hit ID since it's already set from the
# prev_hid above
setattr(hit, "_id_alt", value[1:])
hit_list.append(hit)
hsp_list = []
# create qresult and yield if we're at a new qresult or EOF
if qres_state == state_QRES_NEW or file_state == state_EOF:
qresult = QueryResult(hit_list, prev_qid)
for attr, value in prev["qresult"].items():
setattr(qresult, attr, value)
yield qresult
# if current line is EOF, break
if file_state == state_EOF:
break
hit_list = []
self.line = self.handle.readline().strip()
def _get_frag_frame(self, frag, seq_type, parsedict):
"""Return fragment frame for given object (PRIVATE).
Returns ``HSPFragment`` frame given the object, its sequence type,
and its parsed dictionary values.
"""
assert seq_type in ("query", "hit")
frame = getattr(frag, "%s_frame" % seq_type, None)
if frame is not None:
return frame
else:
if "frames" in parsedict:
# frames is 'x1/x2' string, x1 is query frame, x2 is hit frame
idx = 0 if seq_type == "query" else 1
return int(parsedict["frames"].split("/")[idx])
# else implicit None return
def _get_frag_strand(self, frag, seq_type, parsedict):
"""Return fragment strand for given object (PRIVATE).
Returns ``HSPFragment`` strand given the object, its sequence type,
and its parsed dictionary values.
"""
# NOTE: this will never set the strands as 0 for protein
# queries / hits, since we can't detect the blast flavors
# from the columns alone.
assert seq_type in ("query", "hit")
strand = getattr(frag, "%s_strand" % seq_type, None)
if strand is not None:
return strand
else:
# using parsedict instead of the fragment object since
# we need the unadjusted coordinated values
start = parsedict.get("%s_start" % seq_type)
end = parsedict.get("%s_end" % seq_type)
if start is not None and end is not None:
return 1 if start <= end else -1
# else implicit None return
class BlastTabIndexer(SearchIndexer):
"""Indexer class for BLAST+ tab output."""
_parser = BlastTabParser
def __init__(self, filename, comments=False, fields=_DEFAULT_FIELDS):
"""Initialize the class."""
SearchIndexer.__init__(self, filename, comments=comments, fields=fields)
# if the file doesn't have comments,
# get index of column used as the key (qseqid / qacc / qaccver)
if not self._kwargs["comments"]:
if "qseqid" in fields:
self._key_idx = fields.index("qseqid")
elif "qacc" in fields:
self._key_idx = fields.index("qacc")
elif "qaccver" in fields:
self._key_idx = fields.index("qaccver")
else:
raise ValueError(
"Custom fields is missing an ID column. One of these must be "
"present: 'qseqid', 'qacc', or 'qaccver'."
)
def __iter__(self):
"""Iterate over the file handle; yields key, start offset, and length."""
handle = self._handle
handle.seek(0)
if not self._kwargs["comments"]:
iterfunc = self._qresult_index
else:
iterfunc = self._qresult_index_commented
for key, offset, length in iterfunc():
yield key.decode(), offset, length
def _qresult_index_commented(self):
"""Indexer for commented BLAST tabular files (PRIVATE)."""
handle = self._handle
handle.seek(0)
start_offset = 0
# mark of a new query
query_mark = None
# mark of the query's ID
qid_mark = b"# Query: "
# mark of the last line
end_mark = b"# BLAST processed"
while True:
end_offset = handle.tell()
line = handle.readline()
if query_mark is None:
query_mark = line
start_offset = end_offset
elif line.startswith(qid_mark):
qresult_key = line[len(qid_mark) :].split()[0]
elif line == query_mark or line.startswith(end_mark):
yield qresult_key, start_offset, end_offset - start_offset
start_offset = end_offset
elif not line:
break
def _qresult_index(self):
"""Indexer for noncommented BLAST tabular files (PRIVATE)."""
handle = self._handle
handle.seek(0)
start_offset = 0
qresult_key = None
key_idx = self._key_idx
while True:
# get end offset here since we only know a qresult ends after
# encountering the next one
end_offset = handle.tell()
# line = handle.readline()
line = handle.readline()
if qresult_key is None:
qresult_key = line.split(b"\t")[key_idx]
else:
try:
curr_key = line.split(b"\t")[key_idx]
except IndexError:
curr_key = b""
if curr_key != qresult_key:
yield qresult_key, start_offset, end_offset - start_offset
qresult_key = curr_key
start_offset = end_offset
# break if we've reached EOF
if not line:
break
def get_raw(self, offset):
"""Return the raw bytes string of a QueryResult object from the given offset."""
if self._kwargs["comments"]:
getfunc = self._get_raw_qresult_commented
else:
getfunc = self._get_raw_qresult
return getfunc(offset)
def _get_raw_qresult(self, offset):
"""Return the raw bytes string of a single QueryResult from a noncommented file (PRIVATE)."""
handle = self._handle
handle.seek(offset)
qresult_raw = b""
key_idx = self._key_idx
qresult_key = None
while True:
line = handle.readline()
# get the key if the first line (qresult key)
if qresult_key is None:
qresult_key = line.split(b"\t")[key_idx]
else:
try:
curr_key = line.split(b"\t")[key_idx]
except IndexError:
curr_key = b""
# only break when qresult is finished (key is different)
if curr_key != qresult_key:
break
# append to the raw string as long as qresult is the same
qresult_raw += line
return qresult_raw
def _get_raw_qresult_commented(self, offset):
"""Return the bytes raw string of a single QueryResult from a commented file (PRIVATE)."""
handle = self._handle
handle.seek(offset)
qresult_raw = b""
end_mark = b"# BLAST processed"
# query mark is the line marking a new query
# something like '# TBLASTN 2.2.25+'
query_mark = None
line = handle.readline()
while line:
# since query_mark depends on the BLAST search, we need to obtain it
# first
if query_mark is None:
query_mark = line
# break when we've reached the next qresult or the search ends
elif line == query_mark or line.startswith(end_mark):
break
qresult_raw += line
line = handle.readline()
return qresult_raw
class BlastTabWriter:
"""Writer for blast-tab output format."""
def __init__(self, handle, comments=False, fields=_DEFAULT_FIELDS):
"""Initialize the class."""
self.handle = handle
self.has_comments = comments
self.fields = fields
def write_file(self, qresults):
"""Write to the handle, return how many QueryResult objects were written."""
handle = self.handle
qresult_counter, hit_counter, hsp_counter, frag_counter = 0, 0, 0, 0
for qresult in qresults:
if self.has_comments:
handle.write(self._build_comments(qresult))
if qresult:
handle.write(self._build_rows(qresult))
if not self.has_comments:
qresult_counter += 1
hit_counter += len(qresult)
hsp_counter += sum(len(hit) for hit in qresult)
frag_counter += sum(len(hit.fragments) for hit in qresult)
# if it's commented and there are no hits in the qresult, we still
# increment the counter
if self.has_comments:
qresult_counter += 1
# commented files have a line saying how many queries were processed
if self.has_comments:
handle.write("# BLAST processed %i queries" % qresult_counter)
return qresult_counter, hit_counter, hsp_counter, frag_counter
def _build_rows(self, qresult):
"""Return a string containing tabular rows of the QueryResult object (PRIVATE)."""
coordinates = {"qstart", "qend", "sstart", "send"}
qresult_lines = ""
for hit in qresult:
for hsp in hit:
line = []
for field in self.fields:
# get the column value ~ could either be an attribute
# of qresult, hit, or hsp
if field in _COLUMN_QRESULT:
value = getattr(qresult, _COLUMN_QRESULT[field][0])
elif field in _COLUMN_HIT:
if field == "sallseqid":
value = getattr(hit, "id_all")
else:
value = getattr(hit, _COLUMN_HIT[field][0])
# special case, since 'frames' can be determined from
# query frame and hit frame
elif field == "frames":
value = "%i/%i" % (hsp.query_frame, hsp.hit_frame)
elif field in _COLUMN_HSP:
try:
value = getattr(hsp, _COLUMN_HSP[field][0])
except AttributeError:
attr = _COLUMN_HSP[field][0]
_augment_blast_hsp(hsp, attr)
value = getattr(hsp, attr)
elif field in _COLUMN_FRAG:
value = getattr(hsp, _COLUMN_FRAG[field][0])
else:
assert field not in _SUPPORTED_FIELDS
continue
# adjust from and to according to strand, if from and to
# is included in the output field
if field in coordinates:
value = self._adjust_coords(field, value, hsp)
# adjust output formatting
value = self._adjust_output(field, value)
line.append(value)
hsp_line = "\t".join(line)
qresult_lines += hsp_line + "\n"
return qresult_lines
def _adjust_coords(self, field, value, hsp):
"""Adjust start and end coordinates according to strand (PRIVATE)."""
assert field in ("qstart", "qend", "sstart", "send")
# determine sequence type to operate on based on field's first letter
seq_type = "query" if field.startswith("q") else "hit"
strand = getattr(hsp, "%s_strand" % seq_type, None)
if strand is None:
raise ValueError(
"Required attribute %r not found." % ("%s_strand" % (seq_type))
)
# switch start <--> end coordinates if strand is -1
if strand < 0:
if field.endswith("start"):
value = getattr(hsp, "%s_end" % seq_type)
elif field.endswith("end"):
value = getattr(hsp, "%s_start" % seq_type) + 1
elif field.endswith("start"):
# adjust start coordinate for positive strand
value += 1
return value
def _adjust_output(self, field, value):
"""Adjust formatting of given field and value to mimic native tab output (PRIVATE)."""
# qseq and sseq are stored as SeqRecord, but here we only need the str
if field in ("qseq", "sseq"):
value = str(value.seq)
# evalue formatting, adapted from BLAST+ source:
# src/objtools/align_format/align_format_util.cpp#L668
elif field == "evalue":
if value < 1.0e-180:
value = "0.0"
elif value < 1.0e-99:
value = "%2.0e" % value
elif value < 0.0009:
value = "%3.0e" % value
elif value < 0.1:
value = "%4.3f" % value
elif value < 1.0:
value = "%3.2f" % value
elif value < 10.0:
value = "%2.1f" % value
else:
value = "%5.0f" % value
# pident and ppos formatting
elif field in ("pident", "ppos"):
value = "%.2f" % value
# evalue formatting, adapted from BLAST+ source:
# src/objtools/align_format/align_format_util.cpp#L723
elif field == "bitscore":
if value > 9999:
value = "%4.3e" % value
elif value > 99.9:
value = "%4.0d" % value
else:
value = "%4.1f" % value
# coverages have no comma (using floats still ~ a more proper
# representation)
elif field in ("qcovhsp", "qcovs"):
value = "%.0f" % value
# list into '<>'-delimited string
elif field == "salltitles":
value = "<>".join(value)
# list into ';'-delimited string
elif field in (
"sallseqid",
"sallacc",
"staxids",
"sscinames",
"scomnames",
"sblastnames",
"sskingdoms",
):
value = ";".join(value)
# everything else
else:
value = str(value)
return value
def _build_comments(self, qres):
"""Return QueryResult tabular comment as a string (PRIVATE)."""
comments = []
# inverse mapping of the long-short name map, required
# for writing comments
inv_field_map = {v: k for k, v in _LONG_SHORT_MAP.items()}
# try to anticipate qress without version
program = qres.program.upper()
try:
version = qres.version
except AttributeError:
program_line = "# %s" % program
else:
program_line = f"# {program} {version}"
comments.append(program_line)
# description may or may not be None
if qres.description is None:
comments.append("# Query: %s" % qres.id)
else:
comments.append(f"# Query: {qres.id} {qres.description}")
# try appending RID line, if present
try:
comments.append("# RID: %s" % qres.rid)
except AttributeError:
pass
comments.append("# Database: %s" % qres.target)
# qresults without hits don't show the Fields comment
if qres:
comments.append(
"# Fields: %s"
% ", ".join(inv_field_map[field] for field in self.fields)
)
comments.append("# %i hits found" % len(qres))
return "\n".join(comments) + "\n"
# if not used as a module, run the doctest
if __name__ == "__main__":
from Bio._utils import run_doctest
run_doctest()