Spaces:
No application file
No application file
# Copyright 2012 by Wibowo Arindrarto. All rights reserved. | |
# This file is part of the Biopython distribution and governed by your | |
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". | |
# Please see the LICENSE file that should have been included as part of this | |
# package. | |
"""Bio.SearchIO objects to model high scoring regions between query and hit.""" | |
import warnings | |
from operator import ge, le | |
from Bio import BiopythonWarning | |
from Bio.Align import MultipleSeqAlignment | |
from Bio.Seq import Seq | |
from Bio.SeqRecord import SeqRecord | |
from Bio.SearchIO._utils import ( | |
singleitem, | |
allitems, | |
fullcascade, | |
fragcascade, | |
getattr_str, | |
) | |
from ._base import _BaseHSP | |
class HSP(_BaseHSP): | |
"""Class representing high-scoring region(s) between query and hit. | |
HSP (high-scoring pair) objects are contained by Hit objects (see Hit). | |
In most cases, HSP objects store the bulk of the statistics and results | |
(e.g. e-value, bitscores, query sequence, etc.) produced by a search | |
program. | |
Depending on the search output file format, a given HSP will contain one | |
or more HSPFragment object(s). Examples of search programs that produce HSP | |
with one HSPFragments are BLAST, HMMER, and FASTA. Other programs such as | |
BLAT or Exonerate may produce HSPs containing more than one HSPFragment. | |
However, their native terminologies may differ: in BLAT these fragments | |
are called 'blocks' while in Exonerate they are called exons or NER. | |
Here are examples from each type of HSP. The first one comes from a BLAST | |
search:: | |
>>> from Bio import SearchIO | |
>>> blast_qresult = next(SearchIO.parse('Blast/mirna.xml', 'blast-xml')) | |
>>> blast_hsp = blast_qresult[1][0] # the first HSP from the second hit | |
>>> blast_hsp | |
HSP(hit_id='gi|301171311|ref|NR_035856.1|', query_id='33211', 1 fragments) | |
>>> print(blast_hsp) | |
Query: 33211 mir_1 | |
Hit: gi|301171311|ref|NR_035856.1| Pan troglodytes microRNA mir-520b ... | |
Query range: [1:61] (1) | |
Hit range: [0:60] (1) | |
Quick stats: evalue 1.7e-22; bitscore 109.49 | |
Fragments: 1 (60 columns) | |
Query - CCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG | |
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| | |
Hit - CCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG | |
For HSPs with a single HSPFragment, you can invoke ``print`` on it and see the | |
underlying sequence alignment, if it exists. This is not the case for HSPs | |
with more than one HSPFragment. Below is an example, using an HSP from a | |
BLAT search. Invoking ``print`` on these HSPs will instead show a table of the | |
HSPFragment objects it contains:: | |
>>> blat_qresult = SearchIO.read('Blat/mirna.pslx', 'blat-psl', pslx=True) | |
>>> blat_hsp = blat_qresult[1][0] # the first HSP from the second hit | |
>>> blat_hsp | |
HSP(hit_id='chr11', query_id='blat_1', 2 fragments) | |
>>> print(blat_hsp) | |
Query: blat_1 <unknown description> | |
Hit: chr11 <unknown description> | |
Query range: [42:67] (-1) | |
Hit range: [59018929:59018955] (1) | |
Quick stats: evalue ?; bitscore ? | |
Fragments: --- -------------- ---------------------- ---------------------- | |
# Span Query range Hit range | |
--- -------------- ---------------------- ---------------------- | |
0 6 [61:67] [59018929:59018935] | |
1 16 [42:58] [59018939:59018955] | |
Notice that in HSPs with more than one HSPFragment, the HSP's ``query_range`` and
``hit_range`` properties encompass all fragments it contains.
You can check whether an HSP has more than one HSPFragments or not using the | |
``is_fragmented`` property:: | |
>>> blast_hsp.is_fragmented | |
False | |
>>> blat_hsp.is_fragmented | |
True | |
Since HSP objects are also containers similar to Python lists, you can | |
access a single fragment in an HSP using its integer index:: | |
>>> blat_fragment = blat_hsp[0] | |
>>> print(blat_fragment) | |
Query: blat_1 <unknown description> | |
Hit: chr11 <unknown description> | |
Query range: [61:67] (-1) | |
Hit range: [59018929:59018935] (1) | |
Fragments: 1 (6 columns) | |
Query - tatagt | |
Hit - tatagt | |
This applies to HSPs objects with a single fragment as well:: | |
>>> blast_fragment = blast_hsp[0] | |
>>> print(blast_fragment) | |
Query: 33211 mir_1 | |
Hit: gi|301171311|ref|NR_035856.1| Pan troglodytes microRNA mir-520b ... | |
Query range: [1:61] (1) | |
Hit range: [0:60] (1) | |
Fragments: 1 (60 columns) | |
Query - CCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG | |
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| | |
Hit - CCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG | |
Regardless of the search output file format, HSP objects provide the | |
properties listed below. These properties always return values in a list, | |
due to the HSP object itself being a list-like container. However, for | |
HSP objects with a single HSPFragment, shortcut properties that fetch
the item from the list are also provided.
+----------------------+---------------------+-----------------------------+ | |
| Property | Shortcut | Value | | |
+======================+=====================+=============================+ | |
| aln_all | aln | HSP alignments as | | |
| | | MultipleSeqAlignment object | | |
+----------------------+---------------------+-----------------------------+ | |
| aln_annotation_all | aln_annotation | dictionary of annotation(s) | | |
| | | of all fragments' alignments| | |
+----------------------+---------------------+-----------------------------+ | |
| fragments | fragment | HSPFragment objects | | |
+----------------------+---------------------+-----------------------------+ | |
| hit_all | hit | hit sequence as SeqRecord | | |
| | | objects | | |
+----------------------+---------------------+-----------------------------+ | |
| hit_features_all | hit_features | SeqFeatures of all hit | | |
| | | fragments | | |
+----------------------+---------------------+-----------------------------+ | |
| hit_start_all | hit_start* | start coordinates of the | | |
| | | hit fragments | | |
+----------------------+---------------------+-----------------------------+ | |
| hit_end_all | hit_end* | end coordinates of the hit | | |
| | | fragments | | |
+----------------------+---------------------+-----------------------------+ | |
| hit_span_all | hit_span* | sizes of each hit fragments | | |
+----------------------+---------------------+-----------------------------+ | |
| hit_strand_all | hit_strand | strand orientations of the | | |
| | | hit fragments | | |
+----------------------+---------------------+-----------------------------+ | |
| hit_frame_all | hit_frame | reading frames of the hit | | |
| | | fragments | | |
+----------------------+---------------------+-----------------------------+ | |
| hit_range_all | hit_range | tuples of start and end | | |
| | | coordinates of each hit | | |
| | | fragment | | |
+----------------------+---------------------+-----------------------------+ | |
| query_all | query | query sequence as SeqRecord | | |
| | | object | | |
+----------------------+---------------------+-----------------------------+ | |
| query_features_all | query_features | SeqFeatures of all query | | |
| | | fragments | | |
+----------------------+---------------------+-----------------------------+ | |
| query_start_all | query_start* | start coordinates of the | | |
| | | fragments | | |
+----------------------+---------------------+-----------------------------+ | |
| query_end_all | query_end* | end coordinates of the | | |
| | | query fragments | | |
+----------------------+---------------------+-----------------------------+ | |
| query_span_all | query_span* | sizes of each query | | |
| | | fragments | | |
+----------------------+---------------------+-----------------------------+ | |
| query_strand_all | query_strand | strand orientations of the | | |
| | | query fragments | | |
+----------------------+---------------------+-----------------------------+ | |
| query_frame_all | query_frame | reading frames of the query | | |
| | | fragments | | |
+----------------------+---------------------+-----------------------------+ | |
| query_range_all | query_range | tuples of start and end | | |
| | | coordinates of each query | | |
| | | fragment | | |
+----------------------+---------------------+-----------------------------+ | |
For all types of HSP objects, the property will return the values in a list. | |
Shortcuts are only applicable for HSPs with one fragment. Except for the ones
noted, if they are used on an HSP with more than one fragment, an exception
will be raised.
For properties that may be used in HSPs with multiple or single fragments
(``*_start``, ``*_end``, and ``*_span`` properties), their interpretation depends
on how many fragments the HSP has:
+------------+---------------------------------------------------+ | |
| Property | Value | | |
+============+===================================================+ | |
| hit_start | smallest coordinate value of all hit fragments | | |
+------------+---------------------------------------------------+ | |
| hit_end | largest coordinate value of all hit fragments | | |
+------------+---------------------------------------------------+ | |
| hit_span | difference between ``hit_start`` and ``hit_end`` | | |
+------------+---------------------------------------------------+ | |
| query_start| smallest coordinate value of all query fragments | | |
+------------+---------------------------------------------------+ | |
| query_end | largest coordinate value of all query fragments | | |
+------------+---------------------------------------------------+ | |
| query_span | difference between ``query_start`` and | | |
| | ``query_end`` | | |
+------------+---------------------------------------------------+ | |
In addition to the objects listed above, HSP objects also provide the | |
following properties and/or attributes: | |
+--------------------+------------------------------------------------------+ | |
| Property | Value | | |
+====================+======================================================+ | |
| aln_span | total number of residues in all HSPFragment objects | | |
+--------------------+------------------------------------------------------+ | |
| molecule_type | molecule_type of the hit and query SeqRecord objects | | |
+--------------------+------------------------------------------------------+ | |
| is_fragmented | boolean, whether there are multiple fragments or not | | |
+--------------------+------------------------------------------------------+ | |
| hit_id | ID of the hit sequence | | |
+--------------------+------------------------------------------------------+ | |
| hit_description | description of the hit sequence | | |
+--------------------+------------------------------------------------------+ | |
| hit_inter_ranges | list of hit sequence coordinates of the regions | | |
| | between fragments | | |
+--------------------+------------------------------------------------------+ | |
| hit_inter_spans | list of lengths of the regions between hit fragments | | |
+--------------------+------------------------------------------------------+ | |
| output_index | 0-based index for storing the order by which the HSP | | |
| | appears in the output file (default: -1). | | |
+--------------------+------------------------------------------------------+ | |
| query_id | ID of the query sequence | | |
+--------------------+------------------------------------------------------+ | |
| query_description | description of the query sequence | | |
+--------------------+------------------------------------------------------+ | |
| query_inter_ranges | list of query sequence coordinates of the regions | | |
| | between fragments | | |
+--------------------+------------------------------------------------------+ | |
| query_inter_spans | list of lengths of the regions between query | | |
| | fragments | | |
+--------------------+------------------------------------------------------+ | |
Shortcut properties marked with ``*`` in the first table above may also be
used in HSPs with multiple fragments.
""" | |
# Attribute names that must not be transferred when a new object is created
# from this one; ``_items`` is the internal list holding the HSPFragment objects.
# NOTE(review): the transfer itself presumably happens in ``_transfer_attrs``,
# which is not defined in this part of the file -- confirm there.
_NON_STICKY_ATTRS = ("_items",)
def __init__(self, fragments=(), output_index=-1): | |
"""Initialize an HSP object. | |
:param fragments: fragments contained in the HSP object | |
:type fragments: iterable yielding HSPFragment | |
:param output_index: optional index / ordering of the HSP fragment in | |
the original input file. | |
:type output_index: integer | |
HSP objects must be initialized with a list containing at least one | |
HSPFragment object. If multiple HSPFragment objects are used for | |
initialization, they must all have the same ``query_id``, | |
``query_description``, ``hit_id``, ``hit_description``, and | |
``molecule_type`` properties. | |
""" | |
if not fragments: | |
raise ValueError("HSP objects must have at least one HSPFragment object.") | |
# TODO - Move this into the for look in case hsps is a single use | |
# iterable? | |
# check that all fragments contain the same IDs, descriptions, | |
# molecule_type | |
for attr in ( | |
"query_id", | |
"query_description", | |
"hit_id", | |
"hit_description", | |
"molecule_type", | |
): | |
if len({getattr(frag, attr) for frag in fragments}) != 1: | |
raise ValueError( | |
"HSP object can not contain fragments with more than one %s." % attr | |
) | |
self.output_index = output_index | |
self._items = [] | |
for fragment in fragments: | |
self._validate_fragment(fragment) | |
self._items.append(fragment) | |
def __repr__(self): | |
"""Return string representation of HSP object.""" | |
return "%s(hit_id=%r, query_id=%r, %r fragments)" % ( | |
self.__class__.__name__, | |
self.hit_id, | |
self.query_id, | |
len(self), | |
) | |
def __iter__(self): | |
"""Iterate over HSP items.""" | |
return iter(self._items) | |
def __contains__(self, fragment): | |
"""Return True if HSPFragment is on HSP items.""" | |
return fragment in self._items | |
def __len__(self): | |
"""Return number of HSPs items.""" | |
return len(self._items) | |
def __bool__(self): | |
"""Return True if it has HSPs.""" | |
return bool(self._items) | |
def __str__(self):
    """Return a human readable summary of the HSP object.

    A single-fragment HSP shows the header, the quick statistics and the
    fragment's alignment; a multi-fragment HSP shows a table with one row
    per fragment instead of the alignment.
    """
    # headline statistics, formatted by getattr_str
    stat_items = [
        "evalue " + getattr_str(self, "evalue", fmt="%.2g"),
        "bitscore " + getattr_str(self, "bitscore", fmt="%.2f"),
    ]
    body = ["Quick stats: " + "; ".join(stat_items)]

    fragments = self.fragments
    if len(fragments) == 1:
        return "\n".join(
            [self._str_hsp_header(), "\n".join(body), fragments[0]._str_aln()]
        )

    # table of fragments: ruler, column titles, ruler, then one row each
    ruler = ("-" * 3, "-" * 14, "-" * 22, "-" * 22)
    row_format = "%16s %14s %22s %22s"
    body.append(" Fragments: %s %s %s %s" % ruler)
    body.append(row_format % ("#", "Span", "Query range", "Hit range"))
    body.append(row_format % ruler)
    for number, frag in enumerate(fragments):
        cells = [str(number), getattr_str(frag, "aln_span")]
        for seq_type in ("query", "hit"):
            begin = getattr_str(frag, seq_type + "_start")
            finish = getattr_str(frag, seq_type + "_end")
            shown = "[%s:%s]" % (begin, finish)
            # ranges wider than the 22-character column are cut and
            # marked with a trailing "~]"
            if len(shown) > 22:
                shown = shown[:20] + "~]"
            cells.append(shown)
        body.append(row_format % tuple(cells))

    return self._str_hsp_header() + "\n" + "\n".join(body)
def __getitem__(self, idx): | |
"""Return object of index idx.""" | |
# if key is slice, return a new HSP instance | |
if isinstance(idx, slice): | |
obj = self.__class__(self._items[idx]) | |
self._transfer_attrs(obj) | |
return obj | |
return self._items[idx] | |
def __setitem__(self, idx, fragments): | |
"""Set an item of index idx with the given fragments.""" | |
# handle case if hsps is a list of hsp | |
if isinstance(fragments, (list, tuple)): | |
for fragment in fragments: | |
self._validate_fragment(fragment) | |
else: | |
self._validate_fragment(fragments) | |
self._items[idx] = fragments | |
def __delitem__(self, idx): | |
"""Delete item of index idx.""" | |
# note that this may result in an empty HSP object, which should be | |
# invalid | |
del self._items[idx] | |
def _validate_fragment(self, fragment):
    """Check that the given object may be stored in this HSP (PRIVATE).

    :param fragment: candidate item
    :type fragment: HSPFragment
    :raises TypeError: if ``fragment`` is not an HSPFragment
    :raises ValueError: if the HSP already holds fragments and the
                        candidate's hit ID, hit description, query ID or
                        query description differs from the HSP's
    """
    if not isinstance(fragment, HSPFragment):
        raise TypeError("HSP objects can only contain HSPFragment objects.")
    # HACK: to make validation during __init__ work -- while the HSP is
    # still empty there is nothing to compare against
    if not self._items:
        return
    # The expected values are read from the HSP's own ``hit_id`` /
    # ``hit_description`` / ``query_id`` / ``query_description`` attributes;
    # HSP defines no ``id`` or ``description`` attribute in this file, so
    # the messages must not reference those.
    for attr, label in (
        ("hit_id", "hit ID"),
        ("hit_description", "hit description"),
        ("query_id", "query ID"),
        ("query_description", "query description"),
    ):
        expected = getattr(self, attr)
        found = getattr(fragment, attr)
        if found != expected:
            raise ValueError(
                "Expected HSPFragment with %s %r, found %r instead."
                % (label, expected, found)
            )
def _aln_span_get(self):
    """Return the sum of the ``aln_span`` values of all fragments."""
    # each fragment reports its own alignment span; the HSP-level value is
    # simply their total
    total = 0
    for frag in self.fragments:
        total += frag.aln_span
    return total

aln_span = property(
    fget=_aln_span_get,
    doc="Total number of columns in all HSPFragment objects.",
)
# coordinate properties # | |
def _get_coords(self, seq_type, coord_type): | |
assert seq_type in ("hit", "query") | |
assert coord_type in ("start", "end") | |
coord_name = "%s_%s" % (seq_type, coord_type) | |
coords = [getattr(frag, coord_name) for frag in self.fragments] | |
if None in coords: | |
warnings.warn( | |
"'None' exist in %s coordinates; ignored" % (coord_name), | |
BiopythonWarning, | |
) | |
return coords | |
def _hit_start_get(self):
    """Return the lowest start coordinate among the hit fragments."""
    starts = self._get_coords("hit", "start")
    return min(starts)

hit_start = property(
    fget=_hit_start_get,
    doc="Smallest coordinate value of all hit fragments.",
)
def _query_start_get(self):
    """Return the lowest start coordinate among the query fragments."""
    starts = self._get_coords("query", "start")
    return min(starts)

query_start = property(
    fget=_query_start_get,
    doc="Smallest coordinate value of all query fragments.",
)
def _hit_end_get(self):
    """Return the highest end coordinate among the hit fragments."""
    ends = self._get_coords("hit", "end")
    return max(ends)

hit_end = property(
    fget=_hit_end_get,
    doc="Largest coordinate value of all hit fragments.",
)
def _query_end_get(self):
    """Return the highest end coordinate among the query fragments."""
    return max(self._get_coords("query", "end"))

# The property doc describes the query side; it previously said "hit
# fragments", copied from the ``hit_end`` property.
query_end = property(
    fget=_query_end_get, doc="Largest coordinate value of all query fragments."
)
# coordinate-dependent properties # | |
def _hit_span_get(self):
    """Return ``hit_end - hit_start``, or None if that raises TypeError."""
    try:
        span = self.hit_end - self.hit_start
    except TypeError:
        # triggered if any of the coordinates are None
        span = None
    return span

hit_span = property(
    fget=_hit_span_get,
    doc="The number of hit residues covered by the HSP.",
)
def _query_span_get(self):
    """Return ``query_end - query_start``, or None if that raises TypeError."""
    try:
        span = self.query_end - self.query_start
    except TypeError:
        # triggered if any of the coordinates are None
        span = None
    return span

query_span = property(
    fget=_query_span_get,
    doc="The number of query residues covered by the HSP.",
)
def _hit_range_get(self):
    """Return the pair ``(hit_start, hit_end)``."""
    lower = self.hit_start
    upper = self.hit_end
    return lower, upper

hit_range = property(
    fget=_hit_range_get,
    doc="Tuple of HSP hit start and end coordinates.",
)
def _query_range_get(self):
    """Return the pair ``(query_start, query_end)``."""
    lower = self.query_start
    upper = self.query_end
    return lower, upper

query_range = property(
    fget=_query_range_get,
    doc="Tuple of HSP query start and end coordinates.",
)
def _inter_ranges_get(self, seq_type): | |
# this property assumes that there are no mixed strands in a hit/query | |
assert seq_type in ("query", "hit") | |
strand = getattr(self, "%s_strand_all" % seq_type)[0] | |
coords = getattr(self, "%s_range_all" % seq_type) | |
# determine function used to set inter range | |
# start and end coordinates, given two pairs | |
# of fragment start and end coordinates | |
if strand == -1: | |
startfunc, endfunc = min, max | |
else: | |
startfunc, endfunc = max, min | |
inter_coords = [] | |
for idx, coord in enumerate(coords[:-1]): | |
start = startfunc(coords[idx]) | |
end = endfunc(coords[idx + 1]) | |
inter_coords.append((min(start, end), max(start, end))) | |
return inter_coords | |
def _hit_inter_ranges_get(self):
    """Return the between-fragment ranges for the hit sequence."""
    ranges = self._inter_ranges_get("hit")
    return ranges

hit_inter_ranges = property(
    fget=_hit_inter_ranges_get,
    doc="Hit sequence coordinates of the regions between fragments.",
)
def _query_inter_ranges_get(self):
    """Return the between-fragment ranges for the query sequence."""
    ranges = self._inter_ranges_get("query")
    return ranges

query_inter_ranges = property(
    fget=_query_inter_ranges_get,
    doc="Query sequence coordinates of the regions between fragments.",
)
def _inter_spans_get(self, seq_type): | |
assert seq_type in ("query", "hit") | |
attr_name = "%s_inter_ranges" % seq_type | |
return [coord[1] - coord[0] for coord in getattr(self, attr_name)] | |
def _hit_inter_spans_get(self):
    """Return the lengths of the regions between hit fragments."""
    spans = self._inter_spans_get("hit")
    return spans

hit_inter_spans = property(
    fget=_hit_inter_spans_get,
    doc="Lengths of regions between hit fragments.",
)
def _query_inter_spans_get(self):
    """Return the lengths of the regions between query fragments."""
    spans = self._inter_spans_get("query")
    return spans

query_inter_spans = property(
    fget=_query_inter_spans_get,
    doc="Lengths of regions between query fragments.",
)
# shortcuts for fragments' properties # | |
# True when the HSP holds two or more fragments
is_fragmented = property(
    fget=lambda self: 1 < len(self),
    doc="Whether the HSP has more than one HSPFragment objects.",
)
# first item properties with setters
# NOTE(review): ``fullcascade`` is imported from Bio.SearchIO._utils and is
# not shown here; presumably reading returns the first fragment's value and
# writing sets the value on every fragment -- confirm in _utils.
hit_description = fullcascade(
    "hit_description", doc="Description of the hit sequence."
)
query_description = fullcascade(
    "query_description", doc="Description of the query sequence."
)
hit_id = fullcascade("hit_id", doc="ID of the hit sequence.")
query_id = fullcascade("query_id", doc="ID of the query sequence.")
molecule_type = fullcascade(
    "molecule_type", doc="molecule_type of the hit and query SeqRecord objects."
)
# properties for single-fragment HSPs
# NOTE(review): ``singleitem`` is imported from Bio.SearchIO._utils and is
# not shown here; per the class documentation these shortcuts are meant for
# HSPs with exactly one fragment and raise an exception otherwise -- confirm
# the exact behaviour in _utils.
fragment = singleitem(doc="HSPFragment object, first fragment.")
hit = singleitem("hit", doc="Hit sequence as a SeqRecord object, first fragment.")
query = singleitem(
    "query", doc="Query sequence as a SeqRecord object, first fragment."
)
aln = singleitem(
    "aln", doc="Alignment of the first fragment as a MultipleSeqAlignment object."
)
aln_annotation = singleitem(
    "aln_annotation",
    doc="Dictionary of annotation(s) of the first fragment's alignment.",
)
hit_features = singleitem(
    "hit_features", doc="Hit sequence features, first fragment."
)
query_features = singleitem(
    "query_features", doc="Query sequence features, first fragment."
)
hit_strand = singleitem("hit_strand", doc="Hit strand orientation, first fragment.")
query_strand = singleitem(
    "query_strand", doc="Query strand orientation, first fragment."
)
hit_frame = singleitem(
    "hit_frame", doc="Hit sequence reading frame, first fragment."
)
query_frame = singleitem(
    "query_frame", doc="Query sequence reading frame, first fragment."
)
# properties for multi-fragment HSPs
# NOTE(review): ``allitems`` is imported from Bio.SearchIO._utils and is not
# shown here; per the class documentation each of these returns one value
# per fragment, collected in a list -- confirm in _utils.
fragments = allitems(doc="List of all HSPFragment objects.")
hit_all = allitems(
    "hit", doc="List of all fragments' hit sequences as SeqRecord objects."
)
query_all = allitems(
    "query", doc="List of all fragments' query sequences as SeqRecord objects."
)
aln_all = allitems(
    "aln", doc="List of all fragments' alignments as MultipleSeqAlignment objects."
)
# NOTE(review): the doc below says "Dictionary" while every sibling property
# here is documented as a list -- verify what is actually returned.
aln_annotation_all = allitems(
    "aln_annotation",
    doc="Dictionary of annotation(s) of all fragments' alignments.",
)
hit_features_all = allitems(
    "hit_features", doc="List of all hit sequence features."
)
query_features_all = allitems(
    "query_features", doc="List of all query sequence features."
)
hit_strand_all = allitems(
    "hit_strand", doc="List of all fragments' hit sequence strands."
)
query_strand_all = allitems(
    "query_strand", doc="List of all fragments' query sequence strands"
)
hit_frame_all = allitems(
    "hit_frame", doc="List of all fragments' hit sequence reading frames."
)
query_frame_all = allitems(
    "query_frame", doc="List of all fragments' query sequence reading frames."
)
hit_start_all = allitems(
    "hit_start", doc="List of all fragments' hit start coordinates."
)
query_start_all = allitems(
    "query_start", doc="List of all fragments' query start coordinates."
)
hit_end_all = allitems("hit_end", doc="List of all fragments' hit end coordinates.")
query_end_all = allitems(
    "query_end", doc="List of all fragments' query end coordinates."
)
hit_span_all = allitems("hit_span", doc="List of all fragments' hit sequence size.")
query_span_all = allitems(
    "query_span", doc="List of all fragments' query sequence size."
)
hit_range_all = allitems(
    "hit_range", doc="List of all fragments' hit start and end coordinates."
)
query_range_all = allitems(
    "query_range", doc="List of all fragments' query start and end coordinates."
)
class HSPFragment(_BaseHSP): | |
"""Class representing a contiguous alignment of hit-query sequence. | |
HSPFragment forms the core of any parsed search output file. Depending on
the search output file format, it may contain the actual query and/or hit
sequences that produce the search hits. These sequences are stored as
SeqRecord objects (see SeqRecord):
>>> from Bio import SearchIO | |
>>> qresult = next(SearchIO.parse('Blast/mirna.xml', 'blast-xml')) | |
>>> fragment = qresult[0][0][0] # first hit, first hsp, first fragment | |
>>> print(fragment) | |
Query: 33211 mir_1 | |
Hit: gi|262205317|ref|NR_030195.1| Homo sapiens microRNA 520b (MIR520... | |
Query range: [0:61] (1) | |
Hit range: [0:61] (1) | |
Fragments: 1 (61 columns) | |
Query - CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG | |
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| | |
Hit - CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTTTAGAGGG | |
# the query sequence is a SeqRecord object | |
>>> fragment.query.__class__ | |
<class 'Bio.SeqRecord.SeqRecord'> | |
>>> print(fragment.query) | |
ID: 33211 | |
Name: aligned query sequence | |
Description: mir_1 | |
Number of features: 0 | |
/molecule_type=DNA | |
Seq('CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTT...GGG') | |
# the hit sequence is a SeqRecord object as well | |
>>> fragment.hit.__class__ | |
<class 'Bio.SeqRecord.SeqRecord'> | |
>>> print(fragment.hit) | |
ID: gi|262205317|ref|NR_030195.1| | |
Name: aligned hit sequence | |
Description: Homo sapiens microRNA 520b (MIR520B), microRNA | |
Number of features: 0 | |
/molecule_type=DNA | |
Seq('CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAGTGCTTCCTTT...GGG') | |
# when both query and hit are present, we get a MultipleSeqAlignment object | |
>>> fragment.aln.__class__ | |
<class 'Bio.Align.MultipleSeqAlignment'> | |
>>> print(fragment.aln) | |
Alignment with 2 rows and 61 columns | |
CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAG...GGG 33211 | |
CCCTCTACAGGGAAGCGCTTTCTGTTGTCTGAAAGAAAAGAAAG...GGG gi|262205317|ref|NR_030195.1| | |
""" | |
def __init__( | |
self, | |
hit_id="<unknown id>", | |
query_id="<unknown id>", | |
hit=None, | |
query=None, | |
molecule_type=None, | |
): | |
"""Initialize the class.""" | |
self._molecule_type = molecule_type | |
self.aln_annotation = {} | |
self._hit_id = hit_id | |
self._query_id = query_id | |
for seq_type in ("query", "hit"): | |
# query or hit attributes default attributes | |
setattr(self, "_%s_description" % seq_type, "<unknown description>") | |
setattr(self, "_%s_features" % seq_type, []) | |
# query or hit attributes whose default attribute is None | |
for attr in ("strand", "frame", "start", "end"): | |
setattr(self, "%s_%s" % (seq_type, attr), None) | |
# self.query or self.hit | |
if eval(seq_type): | |
setattr(self, seq_type, eval(seq_type)) | |
else: | |
setattr(self, seq_type, None) | |
def __repr__(self): | |
"""Return HSPFragment info; hit id, query id, number of columns.""" | |
info = "hit_id=%r, query_id=%r" % (self.hit_id, self.query_id) | |
try: | |
info += ", %i columns" % len(self) | |
except AttributeError: | |
pass | |
return "%s(%s)" % (self.__class__.__name__, info) | |
def __len__(self): | |
"""Return alignment span.""" | |
return self.aln_span | |
def __str__(self): | |
"""Return string of HSP header and alignments.""" | |
return self._str_hsp_header() + "\n" + self._str_aln() | |
def __getitem__(self, idx): | |
"""Return object of index idx.""" | |
if self.aln is not None: | |
obj = self.__class__( | |
hit_id=self.hit_id, | |
query_id=self.query_id, | |
molecule_type=self.molecule_type, | |
) | |
# transfer query and hit attributes | |
# let SeqRecord handle feature slicing, then retrieve the sliced | |
# features into the sliced HSPFragment | |
if self.query is not None: | |
obj.query = self.query[idx] | |
obj.query_features = obj.query.features | |
if self.hit is not None: | |
obj.hit = self.hit[idx] | |
obj.hit_features = obj.hit.features | |
# description, strand, frame | |
for attr in ("description", "strand", "frame"): | |
for seq_type in ("hit", "query"): | |
attr_name = "%s_%s" % (seq_type, attr) | |
self_val = getattr(self, attr_name) | |
setattr(obj, attr_name, self_val) | |
# alignment annotation should be transferred, since we can compute | |
# the resulting annotation | |
obj.aln_annotation = {} | |
for key, value in self.aln_annotation.items(): | |
assert len(value[idx]) == len(obj) | |
obj.aln_annotation[key] = value[idx] | |
return obj | |
else: | |
raise TypeError( | |
"Slicing for HSP objects without alignment is not supported." | |
) | |
def _str_aln(self):
    """Return the alignment part of the fragment summary (PRIVATE).

    The first line reports the number of columns. The query and hit
    sequences (and a string-valued "similarity" annotation, if any) follow
    only when both sequences are present; alignments longer than 67
    columns are shown abbreviated.
    """
    out = [" Fragments: 1 (%s columns)" % getattr_str(self, "aln_span")]
    if self.query is None or self.hit is None:
        return "\n".join(out)

    # fall back to "?" when a sequence object exposes no ``seq``
    qseq = getattr(self.query, "seq", "?")
    hseq = getattr(self.hit, "seq", "?")

    # similarity line, only used when it is a plain string
    simil = ""
    if "similarity" in self.aln_annotation and isinstance(
        self.aln_annotation.get("similarity"), str
    ):
        simil = self.aln_annotation["similarity"]

    if self.aln_span <= 67:
        out.append("%10s - %s" % ("Query", qseq))
        if simil:
            out.append(" %s" % simil)
        out.append("%10s - %s" % ("Hit", hseq))
        return "\n".join(out)

    # adjust continuation character length, so we don't display
    # the same residues twice
    cont = "~" * min(3, self.aln_span - 66)
    out.append("%10s - %s%s%s" % ("Query", qseq[:59], cont, qseq[-5:]))
    if simil:
        out.append(" %s%s%s" % (simil[:59], cont, simil[-5:]))
    out.append("%10s - %s%s%s" % ("Hit", hseq[:59], cont, hseq[-5:]))
    return "\n".join(out)
# sequence properties # | |
def _set_seq(self, seq, seq_type): | |
"""Check the given sequence for attribute setting (PRIVATE). | |
:param seq: sequence to check | |
:type seq: string or SeqRecord | |
:param seq_type: sequence type | |
:type seq_type: string, choice of 'hit' or 'query' | |
""" | |
assert seq_type in ("hit", "query") | |
if seq is None: | |
return seq # return immediately if seq is None | |
else: | |
if not isinstance(seq, (str, SeqRecord)): | |
raise TypeError( | |
"%s sequence must be a string or a SeqRecord object." % seq_type | |
) | |
# check length if the opposite sequence is not None | |
opp_type = "hit" if seq_type == "query" else "query" | |
opp_seq = getattr(self, "_%s" % opp_type, None) | |
if opp_seq is not None: | |
if len(seq) != len(opp_seq): | |
raise ValueError( | |
"Sequence lengths do not match. Expected: %r (%s); found: %r (%s)." | |
% (len(opp_seq), opp_type, len(seq), seq_type) | |
) | |
seq_id = getattr(self, "%s_id" % seq_type) | |
seq_desc = getattr(self, "%s_description" % seq_type) | |
seq_feats = getattr(self, "%s_features" % seq_type) | |
seq_name = "aligned %s sequence" % seq_type | |
if isinstance(seq, SeqRecord): | |
seq.id = seq_id | |
seq.description = seq_desc | |
seq.name = seq_name | |
seq.features = seq_feats | |
seq.annotations["molecule_type"] = self.molecule_type | |
elif isinstance(seq, str): | |
seq = SeqRecord( | |
Seq(seq), | |
id=seq_id, | |
name=seq_name, | |
description=seq_desc, | |
features=seq_feats, | |
annotations={"molecule_type": self.molecule_type}, | |
) | |
return seq | |
def _hit_get(self):
    """Return the stored hit sequence (PRIVATE)."""
    return self._hit

def _hit_set(self, value):
    """Pass the value through ``_set_seq`` and store the result (PRIVATE)."""
    checked = self._set_seq(value, "hit")
    self._hit = checked

hit = property(
    _hit_get,
    _hit_set,
    doc="Hit sequence as a SeqRecord object, defaults to None.",
)
def _query_get(self):
    """Return the stored query sequence (PRIVATE)."""
    return self._query

def _query_set(self, value):
    """Pass the value through ``_set_seq`` and store the result (PRIVATE)."""
    checked = self._set_seq(value, "query")
    self._query = checked

query = property(
    _query_get,
    _query_set,
    doc="Query sequence as a SeqRecord object, defaults to None.",
)
def _aln_get(self):
    """Build the query-hit alignment from the stored sequences (PRIVATE).

    :returns: an alignment holding the query and/or the hit record (query
        first), or None when neither sequence is set
    """
    # keep only the records that are actually present, query before hit
    records = [record for record in (self.query, self.hit) if record is not None]
    if not records:
        return None

    msa = MultipleSeqAlignment(records)
    # only tag the alignment when a molecule type is known
    molecule_type = self.molecule_type
    if molecule_type is not None:
        msa.molecule_type = molecule_type
    return msa

aln = property(
    _aln_get,
    doc="Query-hit alignment as a MultipleSeqAlignment object, defaults to None.",
)
def _molecule_type_get(self):
    """Return the stored molecule type (PRIVATE)."""
    return self._molecule_type

def _molecule_type_set(self, value):
    """Store the molecule type and copy it to the sequence records (PRIVATE)."""
    self._molecule_type = value
    for seq_type in ("query", "hit"):
        try:
            record = getattr(self, seq_type)
            record.annotations["molecule_type"] = value
        except AttributeError:
            # the sequence is None or not available; nothing to annotate
            continue

molecule_type = property(
    _molecule_type_get,
    _molecule_type_set,
    doc="molecule type used in the fragment's "
    "sequence records and alignment, defaults to None.",
)
def _aln_span_get(self):
    """Return the length of the alignment, gaps included (PRIVATE).

    An explicitly stored span takes precedence. Otherwise the span is taken
    from the length of the query, or of the hit if there is no query, and
    remembered. If neither is available, the final attribute lookup raises
    AttributeError.
    """
    if not hasattr(self, "_aln_span"):
        # no stored span yet; fall back to a sequence length
        record = self.query
        if record is None:
            record = self.hit
        if record is not None:
            self._aln_span = len(record)
    return self._aln_span

def _aln_span_set(self, value):
    """Store the alignment span as given (PRIVATE)."""
    self._aln_span = value

aln_span = property(
    _aln_span_get,
    _aln_span_set,
    doc="The number of alignment columns covered by the fragment.",
)
# id, description, and features properties # | |
# each property below is created by the fragcascade helper from an
# attribute name and a sequence type ('hit' or 'query')
hit_description = fragcascade(
    "description",
    "hit",
    doc="Hit sequence description.",
)
query_description = fragcascade(
    "description",
    "query",
    doc="Query sequence description.",
)
hit_id = fragcascade(
    "id",
    "hit",
    doc="Hit sequence ID.",
)
query_id = fragcascade(
    "id",
    "query",
    doc="Query sequence ID.",
)
hit_features = fragcascade(
    "features",
    "hit",
    doc="Hit sequence features.",
)
query_features = fragcascade(
    "features",
    "query",
    doc="Query sequence features.",
)
# strand properties # | |
def _prep_strand(self, strand): | |
# follow SeqFeature's convention | |
if strand not in (-1, 0, 1, None): | |
raise ValueError("Strand should be -1, 0, 1, or None; not %r" % strand) | |
return strand | |
def _get_strand(self, seq_type): | |
assert seq_type in ("hit", "query") | |
strand = getattr(self, "_%s_strand" % seq_type) | |
if strand is None: | |
# try to compute strand from frame | |
frame = getattr(self, "%s_frame" % seq_type) | |
if frame is not None: | |
try: | |
strand = frame // abs(frame) | |
except ZeroDivisionError: | |
strand = 0 | |
setattr(self, "%s_strand" % seq_type, strand) | |
return strand | |
def _hit_strand_get(self):
    """Return the hit strand as resolved by ``_get_strand`` (PRIVATE)."""
    return self._get_strand("hit")

def _hit_strand_set(self, value):
    """Check the value with ``_prep_strand`` and store it (PRIVATE)."""
    checked = self._prep_strand(value)
    self._hit_strand = checked

hit_strand = property(
    _hit_strand_get,
    _hit_strand_set,
    doc="Hit sequence strand, defaults to None.",
)
def _query_strand_get(self):
    """Return the query strand as resolved by ``_get_strand`` (PRIVATE)."""
    return self._get_strand("query")

def _query_strand_set(self, value):
    """Check the value with ``_prep_strand`` and store it (PRIVATE)."""
    checked = self._prep_strand(value)
    self._query_strand = checked

query_strand = property(
    _query_strand_get,
    _query_strand_set,
    doc="Query sequence strand, defaults to None.",
)
# frame properties # | |
def _prep_frame(self, frame): | |
if frame not in (-3, -2, -1, 0, 1, 2, 3, None): | |
raise ValueError( | |
"Strand should be an integer between -3 and 3, or None; not %r" % frame | |
) | |
return frame | |
def _hit_frame_get(self):
    """Return the stored hit reading frame (PRIVATE)."""
    return self._hit_frame

def _hit_frame_set(self, value):
    """Check the value with ``_prep_frame`` and store it (PRIVATE)."""
    checked = self._prep_frame(value)
    self._hit_frame = checked

hit_frame = property(
    _hit_frame_get,
    _hit_frame_set,
    doc="Hit sequence reading frame, defaults to None.",
)
def _query_frame_get(self):
    """Return the stored query reading frame (PRIVATE)."""
    return self._query_frame

def _query_frame_set(self, value):
    """Check the value with ``_prep_frame`` and store it (PRIVATE)."""
    checked = self._prep_frame(value)
    self._query_frame = checked

query_frame = property(
    _query_frame_get,
    _query_frame_set,
    doc="Query sequence reading frame, defaults to None.",
)
# coordinate properties # | |
def _prep_coord(self, coord, opp_coord_name, op): | |
# coord must either be None or int | |
if coord is None: | |
return coord | |
assert isinstance(coord, int) | |
# try to get opposite coordinate, if it's not present, return | |
try: | |
opp_coord = getattr(self, opp_coord_name) | |
except AttributeError: | |
return coord | |
# if opposite coordinate is None, return | |
if opp_coord is None: | |
return coord | |
# otherwise compare it to coord ('>=' or '<=') | |
else: | |
assert op(coord, opp_coord) | |
return coord | |
def _hit_start_get(self):
    """Return the stored start coordinate of the hit sequence (PRIVATE)."""
    return self._hit_start

def _hit_start_set(self, value):
    """Check the start against ``hit_end`` (<=) and store it (PRIVATE)."""
    checked = self._prep_coord(value, "hit_end", le)
    self._hit_start = checked

hit_start = property(
    _hit_start_get,
    _hit_start_set,
    doc="Hit sequence start coordinate, defaults to None.",
)
def _query_start_get(self):
    """Return the stored start coordinate of the query sequence (PRIVATE)."""
    return self._query_start

def _query_start_set(self, value):
    """Check the start against ``query_end`` (<=) and store it (PRIVATE)."""
    checked = self._prep_coord(value, "query_end", le)
    self._query_start = checked

query_start = property(
    _query_start_get,
    _query_start_set,
    doc="Query sequence start coordinate, defaults to None.",
)
def _hit_end_get(self):
    """Return the stored end coordinate of the hit sequence (PRIVATE)."""
    return self._hit_end

def _hit_end_set(self, value):
    """Check the end against ``hit_start`` (>=) and store it (PRIVATE)."""
    checked = self._prep_coord(value, "hit_start", ge)
    self._hit_end = checked

hit_end = property(
    _hit_end_get,
    _hit_end_set,
    doc="Hit sequence end coordinate, defaults to None.",
)
def _query_end_get(self):
    """Return the stored end coordinate of the query sequence (PRIVATE)."""
    return self._query_end

def _query_end_set(self, value):
    """Check the end against ``query_start`` (>=) and store it (PRIVATE)."""
    checked = self._prep_coord(value, "query_start", ge)
    self._query_end = checked

query_end = property(
    _query_end_get,
    _query_end_set,
    doc="Query sequence end coordinate, defaults to None.",
)
# coordinate-dependent properties # | |
def _hit_span_get(self):
    """Compute the hit span as end minus start (PRIVATE).

    :returns: the difference of the hit end and start coordinates, or None
        if the subtraction raises TypeError (e.g. a coordinate is None)
    """
    try:
        span = self.hit_end - self.hit_start
    except TypeError:  # at least one of the coordinates is None
        span = None
    return span

hit_span = property(
    _hit_span_get,
    doc="The number of residues covered by the hit sequence.",
)
def _query_span_get(self):
    """Compute the query span as end minus start (PRIVATE).

    :returns: the difference of the query end and start coordinates, or
        None if the subtraction raises TypeError (e.g. a coordinate is None)
    """
    try:
        span = self.query_end - self.query_start
    except TypeError:  # at least one of the coordinates is None
        span = None
    return span

query_span = property(
    _query_span_get,
    doc="The number of residues covered by the query sequence.",
)
def _hit_range_get(self):
    """Pair up the hit start and end coordinates (PRIVATE)."""
    start, end = self.hit_start, self.hit_end
    return (start, end)

hit_range = property(
    _hit_range_get,
    doc="Tuple of hit start and end coordinates.",
)
def _query_range_get(self):
    """Pair up the query start and end coordinates (PRIVATE)."""
    start, end = self.query_start, self.query_end
    return (start, end)

query_range = property(
    _query_range_get,
    doc="Tuple of query start and end coordinates.",
)
# if not used as a module, run the doctest | |
if __name__ == "__main__":
    # executed as a script rather than imported: run the doctests
    from Bio._utils import run_doctest

    run_doctest()