Spaces:
No application file
No application file
# Copyright 2022 by Michiel de Hoon.  All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Bio.Align support for the "psl" pairwise alignment format.

The Pattern Space Layout (PSL) format, described by UCSC, stores a series
of pairwise alignments in a single file. Typically they are used for
transcript to genome alignments. PSL files store the alignment positions
and alignment scores, but do not store the aligned sequences.

See http://genome.ucsc.edu/FAQ/FAQformat.html#format2

You are expected to use this module via the Bio.Align functions.

Coordinates in the PSL format are defined in terms of zero-based start
positions (like Python) and aligning region sizes.

A minimal aligned region of length one and starting at first position in the
source sequence would have ``start == 0`` and ``size == 1``.

As we can see in this example, ``start + size`` will give one more than the
zero-based end position. We can therefore manipulate ``start`` and
``start + size`` as python list slice boundaries.
"""
from itertools import chain | |
import numpy | |
from Bio.Align import Alignment | |
from Bio.Align import interfaces | |
from Bio.Seq import Seq, reverse_complement, UndefinedSequenceError | |
from Bio.SeqRecord import SeqRecord | |
from Bio.SeqFeature import SeqFeature, ExactPosition, SimpleLocation, CompoundLocation | |
class AlignmentWriter(interfaces.AlignmentWriter):
    """Alignment file writer for the Pattern Space Layout (PSL) file format."""

    fmt = "PSL"

    def __init__(self, target, header=True, mask=None, wildcard="N"):
        """Create an AlignmentWriter object.

        Arguments:
         - target   - output stream or file name
         - header   - If True (default), write the PSL header consisting of
                      five lines containing the PSL format version and a
                      header for each column.
                      If False, suppress the PSL header, resulting in a simple
                      tab-delimited file.
         - mask     - Specify if repeat regions in the target sequence are
                      masked and should be reported in the `repMatches` field
                      of the PSL file instead of in the `matches` field.
                      Acceptable values are
                      None   : no masking (default);
                      "lower": masking by lower-case characters;
                      "upper": masking by upper-case characters.
         - wildcard - Report alignments to the wildcard character in the
                      target or query sequence in the `nCount` field of the
                      PSL file instead of in the `matches`, `misMatches`, or
                      `repMatches` fields.
                      Default value is 'N'.

        """
        super().__init__(target)
        self.header = header
        if wildcard is not None:
            # Store the wildcard as a byte value in the same case that
            # format_alignment folds the sequences to before comparing.
            if mask == "upper":
                wildcard = ord(wildcard.lower())
            else:
                wildcard = ord(wildcard.upper())
        self.wildcard = wildcard
        self.mask = mask

    def write_header(self, alignments):
        """Write the PSL header.

        Nothing is written if the writer was created with ``header=False``.
        The version number is taken from the "psLayout version" entry of
        ``alignments.metadata`` if available, and defaults to "3" otherwise.
        """
        if not self.header:
            return
        metadata = getattr(alignments, "metadata", None) or {}
        version = metadata.get("psLayout version", "3")
        # The header consists of exactly five lines (version, blank line, two
        # lines with column titles, and a line of dashes); the PSL parser in
        # this module skips five lines when it finds a "psLayout" header.
        # Columns are tab-delimited; escapes are used instead of literal tabs
        # so that editors and formatters cannot corrupt them.
        header_lines = (
            f"psLayout version {version}\n",
            "\n",
            "match\tmis- \trep. \tN's\tQ gap\tQ gap\tT gap\tT gap\tstrand"
            "\tQ        \tQ   \tQ    \tQ  \tT        \tT   \tT    \tT  "
            "\tblock\tblockSizes \tqStarts\t tStarts\n",
            "     \tmatch\tmatch\t   \tcount\tbases\tcount\tbases\t      "
            "\tname     \tsize\tstart\tend\tname     \tsize\tstart\tend"
            "\tcount\n",
            "---------------------------------------------------------------------------------------------------------------------------------------------------------------\n",
        )
        self.stream.write("".join(header_lines))

    @staticmethod
    def _sequence_bytes(sequence):
        """Return the letters of sequence as bytes (None if undefined)."""
        try:
            return bytes(sequence)
        except TypeError:  # plain Python string
            return bytes(sequence, "ASCII")
        except UndefinedSequenceError:  # sequence contents is unknown
            return None

    def _count_letters(self, tSeq, qSeq):
        """Classify the aligned letters of one block.

        Arguments tSeq and qSeq are bytes objects with the target and query
        letters of the block. Returns the tuple
        (matches, misMatches, repMatches, nCount) for this block.
        """
        wildcard = self.wildcard
        mask = self.mask
        masked = mask in ("lower", "upper")
        # Fold both sequences to the case used for unmasked letters, so that
        # a target letter that differs from its folded form is a masked one.
        if mask == "upper":
            tFolded, qFolded = tSeq.lower(), qSeq.lower()
        else:
            tFolded, qFolded = tSeq.upper(), qSeq.upper()
        matches = misMatches = repMatches = nCount = 0
        for tLetter, qLetter, tOriginal in zip(tFolded, qFolded, tSeq):
            if tLetter == wildcard or qLetter == wildcard:
                nCount += 1
            elif tLetter != qLetter:
                misMatches += 1
            elif masked and tLetter != tOriginal:
                repMatches += 1
            else:
                matches += 1
        return matches, misMatches, repMatches, nCount

    def format_alignment(self, alignment):
        """Return a string with a single alignment formatted as one PSL line.

        An empty string is returned if the alignment does not contain any
        aligned block (i.e. it has no coordinates or consists of gaps only).
        If the alignment has the attributes ``matches``, ``misMatches``,
        ``repMatches``, or ``nCount``, these are written to the PSL line
        instead of the values calculated from the sequences.

        Raises a TypeError if alignment is not an Alignment object.
        """
        if not isinstance(alignment, Alignment):
            raise TypeError("Expected an Alignment object")
        coordinates = alignment.coordinates
        if not coordinates.size:  # alignment consists of gaps only
            return ""
        target, query = alignment.sequences
        # The sequences may be SeqRecord-like objects (with .id and .seq) or
        # plain sequences; fall back on generic names for the latter.
        qName = getattr(query, "id", "query")
        query = getattr(query, "seq", query)
        tName = getattr(target, "id", "target")
        target = getattr(target, "seq", target)
        tSize = len(target)
        qSize = len(query)
        # dnax is set to True for translated DNA aligned to protein,
        # and to False for DNA/RNA aligned to DNA/RNA
        dnax = None
        if coordinates[1, 0] > coordinates[1, -1]:
            # DNA/RNA mapped to reverse strand of DNA/RNA
            strand = "-"
            query = reverse_complement(query, inplace=False)
            coordinates = coordinates.copy()
            coordinates[1, :] = qSize - coordinates[1, :]
        elif coordinates[0, 0] > coordinates[0, -1]:
            # protein mapped to reverse strand of DNA
            strand = "-"
            target = reverse_complement(target, inplace=False)
            coordinates = coordinates.copy()
            coordinates[0, :] = tSize - coordinates[0, :]
            dnax = True
        else:
            # mapped to forward strand
            strand = "+"
        # variable names follow those in the PSL file format specification
        matches = 0
        misMatches = 0
        repMatches = 0
        nCount = 0
        qNumInsert = 0
        qBaseInsert = 0
        tNumInsert = 0
        tBaseInsert = 0
        blockSizes = []
        qStarts = []
        tStarts = []
        # end of the most recent aligned block in target and query
        tBlockEnd = None
        qBlockEnd = None
        tStart, qStart = coordinates[:, 0]
        for tEnd, qEnd in coordinates[:, 1:].transpose():
            if tStart == tEnd:
                # insertion in the query; only counted if it is internal
                if qStart > 0 and qEnd < qSize:
                    qNumInsert += 1
                    qBaseInsert += qEnd - qStart
                qStart = qEnd
            elif qStart == qEnd:
                # insertion in the target; only counted if it is internal
                if tStart > 0 and tEnd < tSize:
                    tNumInsert += 1
                    tBaseInsert += tEnd - tStart
                tStart = tEnd
            else:
                tCount = tEnd - tStart
                qCount = qEnd - qStart
                tStarts.append(tStart)
                qStarts.append(qStart)
                blockSizes.append(qCount)
                if tCount == qCount:
                    assert dnax is not True
                    dnax = False
                else:
                    # translated DNA aligned to protein, typically generated by
                    # blat -t=dnax -q=prot
                    assert tCount == 3 * qCount
                    assert dnax is not False
                    dnax = True
                tSeq = self._sequence_bytes(target[tStart:tEnd])
                qSeq = self._sequence_bytes(query[qStart:qEnd])
                if tSeq is None or qSeq is None:
                    # contents of at least one sequence is unknown;
                    # count all aligned letters as matches:
                    matches += qCount
                else:
                    counts = self._count_letters(tSeq, qSeq)
                    matches += counts[0]
                    misMatches += counts[1]
                    repMatches += counts[2]
                    nCount += counts[3]
                tBlockEnd = tEnd
                qBlockEnd = qEnd
                tStart = tEnd
                qStart = qEnd
        if not blockSizes:
            # the coordinates describe gaps only; there is nothing to report
            return ""
        # values stored on the alignment (for example by the PSL parser)
        # take precedence over the values calculated above
        matches = getattr(alignment, "matches", matches)
        misMatches = getattr(alignment, "misMatches", misMatches)
        repMatches = getattr(alignment, "repMatches", repMatches)
        nCount = getattr(alignment, "nCount", nCount)
        tStart = tStarts[0]  # start of alignment in target
        qStart = qStarts[0]  # start of alignment in query
        tEnd = tBlockEnd  # end of alignment in target
        qEnd = qBlockEnd  # end of alignment in query
        if strand == "-":
            # start and end are reported on the forward strand, whereas
            # the block starts are reported on the reverse strand
            if dnax is True:
                tStart, tEnd = tSize - tEnd, tSize - tStart
            else:
                qStart, qEnd = qSize - qEnd, qSize - qStart
        blockCount = len(blockSizes)
        blockSizes = ",".join(map(str, blockSizes)) + ","
        qStarts = ",".join(map(str, qStarts)) + ","
        tStarts = ",".join(map(str, tStarts)) + ","
        if dnax:
            # two-character strand: query strand followed by target strand
            strand = "+" + strand
        words = [
            str(matches),
            str(misMatches),
            str(repMatches),
            str(nCount),
            str(qNumInsert),
            str(qBaseInsert),
            str(tNumInsert),
            str(tBaseInsert),
            strand,
            qName,
            str(qSize),
            str(qStart),
            str(qEnd),
            tName,
            str(tSize),
            str(tStart),
            str(tEnd),
            str(blockCount),
            blockSizes,
            qStarts,
            tStarts,
        ]
        line = "\t".join(words) + "\n"
        return line
class AlignmentIterator(interfaces.AlignmentIterator):
    """Alignment iterator for Pattern Space Layout (PSL) files.

    Each line in the file contains one pairwise alignment, which are loaded
    and returned incrementally.  Alignment score information such as the number
    of matches and mismatches are stored as attributes of each alignment.
    """

    fmt = "PSL"

    def _read_header(self, stream):
        """Read the optional five-line PSL header from stream.

        If the first line starts with "psLayout ", the format version is
        stored in ``self.metadata`` and the remaining four header lines are
        skipped. Otherwise the file has no header, and the first line is
        kept in ``self._line`` to be parsed as the first alignment.

        Raises a ValueError if the header is malformed or truncated.
        """
        line = next(stream)
        if not line.startswith("psLayout "):
            # no header; this line is the first alignment
            self._line = line
            return
        words = line.split()
        if len(words) < 2 or words[1] != "version":
            unexpected = words[1] if len(words) > 1 else ""
            raise ValueError("Unexpected word '%s' in header line" % unexpected)
        if len(words) < 3:
            raise ValueError("Missing version number in header line")
        self.metadata = {"psLayout version": words[2]}
        # skip the blank line and the two lines with column titles; the
        # fourth line read here must be the line of dashes ending the header
        try:
            for _ in range(4):
                line = next(stream)
        except StopIteration:
            raise ValueError("End of header not found") from None
        if line.lstrip("-").strip() != "":
            raise ValueError("End of header not found")

    @staticmethod
    def _block_coordinates(tStarts, qStarts, tBlockSizes, qBlockSizes):
        """Return the alignment coordinates for the given aligned blocks.

        The result is a numpy array with two rows (target, query), with one
        column for each position where an aligned block or a gap starts
        or ends.
        """
        qPosition = qStarts[0]
        tPosition = tStarts[0]
        coordinates = [[tPosition, qPosition]]
        for tBlockSize, qBlockSize, tStart, qStart in zip(
            tBlockSizes, qBlockSizes, tStarts, qStarts
        ):
            if tStart != tPosition:
                # gap in the query before this block
                coordinates.append([tStart, qPosition])
                tPosition = tStart
            if qStart != qPosition:
                # gap in the target before this block
                coordinates.append([tPosition, qStart])
                qPosition = qStart
            tPosition += tBlockSize
            qPosition += qBlockSize
            coordinates.append([tPosition, qPosition])
        return numpy.array(coordinates).transpose()

    @staticmethod
    def _count_inserts(coordinates, tSize, qSize):
        """Count the inserts implied by the alignment coordinates.

        Returns the tuple (qNumInsert, qBaseInsert, tNumInsert, tBaseInsert).
        Gaps touching either end of a sequence are not counted as inserts.
        """
        qNumInsert = 0
        qBaseInsert = 0
        tNumInsert = 0
        tBaseInsert = 0
        tStart, qStart = coordinates[:, 0]
        for tEnd, qEnd in coordinates[:, 1:].transpose():
            tCount = tEnd - tStart
            qCount = qEnd - qStart
            if tCount == 0:
                if qStart > 0 and qEnd < qSize:
                    qNumInsert += 1
                    qBaseInsert += qCount
            elif qCount == 0:
                if tStart > 0 and tEnd < tSize:
                    tNumInsert += 1
                    tBaseInsert += tCount
            tStart = tEnd
            qStart = qEnd
        return qNumInsert, qBaseInsert, tNumInsert, tBaseInsert

    @staticmethod
    def _create_cds_feature(coordinates, tSeqs, strand):
        """Return a CDS feature for a protein aligned to translated DNA.

        The feature location covers the aligned blocks in the target; the
        translated target sequences of the blocks (tSeqs) are concatenated
        and stored under the "translation" qualifier. The strand argument
        is the PSL strand field, either "++" or "+-".
        """
        forward = strand == "++"
        locations = []
        tPrevious, qPrevious = coordinates[:, 0]
        for tNext, qNext in coordinates[:, 1:].transpose():
            # on the reverse strand, the target coordinates are decreasing
            if forward:
                tStart, tEnd = tPrevious, tNext
            else:
                tStart, tEnd = tNext, tPrevious
            if qPrevious < qNext and tStart < tEnd:
                location = SimpleLocation(
                    ExactPosition(tStart),
                    ExactPosition(tEnd),
                    strand=+1 if forward else -1,
                )
                locations.append(location)
            tPrevious = tNext
            qPrevious = qNext
        if len(locations) == 1:
            location = locations[0]
        else:
            location = CompoundLocation(locations, "join")
        qualifiers = {"translation": ["".join(tSeqs)]}
        return SeqFeature(location, type="CDS", qualifiers=qualifiers)

    def _read_next_alignment(self, stream):
        """Parse the next line in stream and return it as an Alignment.

        Returns None if there are no more lines. The numbers of matches,
        mismatches, repeat matches, and wildcard letters stored in the line
        are saved as the attributes ``matches``, ``misMatches``,
        ``repMatches``, and ``nCount`` of the alignment.

        Raises a ValueError if the line does not have 21 (PSL) or 23 (PSLX)
        columns, or if its columns are inconsistent with each other.
        """
        try:
            line = self._line
        except AttributeError:
            line = next(stream, None)
            if line is None:
                return None
        else:
            # first alignment line, saved when looking for the header
            del self._line
        words = line.split()
        if len(words) not in (21, 23):
            raise ValueError("line has %d columns; expected 21 or 23" % len(words))
        # PSLX files have two additional columns with the aligned sequences
        pslx = len(words) == 23
        strand = words[8]
        qName = words[9]
        qSize = int(words[10])
        tName = words[13]
        tSize = int(words[14])
        blockCount = int(words[17])
        blockSizes = [int(blockSize) for blockSize in words[18].rstrip(",").split(",")]
        qStarts = [int(start) for start in words[19].rstrip(",").split(",")]
        tStarts = [int(start) for start in words[20].rstrip(",").split(",")]
        if len(blockSizes) != blockCount:
            raise ValueError(
                "Inconsistent number of blocks (%d found, expected %d)"
                % (len(blockSizes), blockCount)
            )
        if len(qStarts) != blockCount:
            raise ValueError(
                "Inconsistent number of query start positions (%d found, expected %d)"
                % (len(qStarts), blockCount)
            )
        if len(tStarts) != blockCount:
            raise ValueError(
                "Inconsistent number of target start positions (%d found, expected %d)"
                % (len(tStarts), blockCount)
            )
        qStarts = numpy.array(qStarts)
        tStarts = numpy.array(tStarts)
        qBlockSizes = numpy.array(blockSizes)
        translated = strand in ("++", "+-")
        if translated:
            # protein sequence aligned against translated DNA sequence
            tBlockSizes = 3 * qBlockSizes
        else:
            tBlockSizes = qBlockSizes
        coordinates = self._block_coordinates(
            tStarts, qStarts, tBlockSizes, qBlockSizes
        )
        # check the insert columns against the values implied by the blocks
        inserts = self._count_inserts(coordinates, tSize, qSize)
        names = ("qNumInsert", "qBaseInsert", "tNumInsert", "tBaseInsert")
        for name, word, expected in zip(names, words[4:8], inserts):
            if expected != int(word):
                raise ValueError(
                    "Inconsistent %s found (%s, expected %d)" % (name, word, expected)
                )
        qStart = int(words[11])
        qEnd = int(words[12])
        tStart = int(words[15])
        tEnd = int(words[16])
        # For alignments to the reverse strand, the block starts in the file
        # are given on the reverse strand, while the start and end columns
        # are given on the forward strand; convert the coordinates to the
        # forward strand, in which they run in decreasing order.
        if strand == "-":
            qStart, qEnd = qEnd, qStart
            coordinates[1, :] = qSize - coordinates[1, :]
        elif strand == "+-":
            tStart, tEnd = tEnd, tStart
            coordinates[0, :] = tSize - coordinates[0, :]
        for name, found, expected in (
            ("tStart", tStart, coordinates[0, 0]),
            ("tEnd", tEnd, coordinates[0, -1]),
            ("qStart", qStart, coordinates[1, 0]),
            ("qEnd", qEnd, coordinates[1, -1]),
        ):
            if found != expected:
                raise ValueError(
                    "Inconsistent %s found (%d, expected %d)" % (name, found, expected)
                )
        feature = None
        if pslx:
            qSeqs = words[21].rstrip(",").split(",")
            tSeqs = words[22].rstrip(",").split(",")
            # partially defined sequence: block start -> letters of the block
            qSeq = dict(zip(qStarts, qSeqs))
            query_sequence = Seq(qSeq, length=qSize)
            if translated:
                # The target column contains the translated sequence, so the
                # DNA sequence itself remains undefined; the translation is
                # stored in a CDS feature on the target instead.
                target_sequence = Seq(None, length=tSize)
                feature = self._create_cds_feature(coordinates, tSeqs, strand)
            else:
                tSeq = dict(zip(tStarts, tSeqs))
                target_sequence = Seq(tSeq, length=tSize)
                if strand == "-":
                    query_sequence = query_sequence.reverse_complement()
        else:
            # plain PSL files do not store the aligned sequences
            target_sequence = Seq(None, length=tSize)
            query_sequence = Seq(None, length=qSize)
        target_record = SeqRecord(target_sequence, id=tName, description="")
        query_record = SeqRecord(query_sequence, id=qName, description="")
        if feature is not None:
            target_record.features.append(feature)
        records = [target_record, query_record]
        alignment = Alignment(records, coordinates)
        alignment.matches = int(words[0])
        alignment.misMatches = int(words[1])
        alignment.repMatches = int(words[2])
        alignment.nCount = int(words[3])
        return alignment