Spaces:
No application file
No application file
File size: 4,802 Bytes
b7731cd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 |
#!/usr/bin/env python
# Copyright 2004-2005 by Michael Hoffman. All rights reserved.
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Run and process output from the Wise2 package tool dnal.
Bio.Wise contains modules for running and processing the output of
some of the models in the Wise2 package by Ewan Birney available from:
ftp://ftp.ebi.ac.uk/pub/software/unix/wise2/
http://www.ebi.ac.uk/Wise2/
Bio.Wise.psw is for protein Smith-Waterman alignments
Bio.Wise.dnal is for Smith-Waterman DNA alignments
"""
import re
# Importing with leading underscore as not intended to be exposed
from subprocess import getoutput as _getoutput
from Bio import Wise
_SCORE_MATCH = 4
_SCORE_MISMATCH = -1
_SCORE_GAP_START = -5
_SCORE_GAP_EXTENSION = -1
_CMDLINE_DNAL = ["dnal", "-alb", "-nopretty"]
def _build_dnal_cmdline(match, mismatch, gap, extension):
res = _CMDLINE_DNAL[:]
res.extend(["-match", str(match)])
res.extend(["-mis", str(mismatch)])
res.extend(["-gap", str(-gap)]) # negative: convert score to penalty
res.extend(["-ext", str(-extension)]) # negative: convert score to penalty
return res
_CMDLINE_FGREP_COUNT = "fgrep -c '%s' %s"
def _fgrep_count(pattern, file):
return int(_getoutput(_CMDLINE_FGREP_COUNT % (pattern, file)))
_re_alb_line2coords = re.compile(r"^\[([^:]+):[^\[]+\[([^:]+):")
def _alb_line2coords(line):
return tuple(
int(coord) + 1 # one-based -> zero-based
for coord in _re_alb_line2coords.match(line).groups()
)
def _get_coords(filename):
alb = open(filename)
start_line = None
end_line = None
for line in alb:
if line.startswith("["):
if not start_line:
start_line = line # rstrip not needed
else:
end_line = line
if end_line is None: # sequence is too short
return [(0, 0), (0, 0)]
return list(
zip(*map(_alb_line2coords, [start_line, end_line]))
) # returns [(start0, end0), (start1, end1)]
class Statistics:
"""Calculate statistics from an ALB report."""
def __init__(self, filename, match, mismatch, gap, extension):
"""Initialize the class."""
self.matches = _fgrep_count(f'"SEQUENCE" {match}', filename)
self.mismatches = _fgrep_count(f'"SEQUENCE" {mismatch}', filename)
self.gaps = _fgrep_count(f'"INSERT" {gap}', filename)
if gap == extension:
self.extensions = 0
else:
self.extensions = _fgrep_count(f'"INSERT" {extension}', filename)
self.score = (
match * self.matches
+ mismatch * self.mismatches
+ gap * self.gaps
+ extension * self.extensions
)
if self.matches or self.mismatches or self.gaps or self.extensions:
self.coords = _get_coords(filename)
else:
self.coords = [(0, 0), (0, 0)]
def identity_fraction(self):
"""Calculate the fraction of matches."""
return self.matches / (self.matches + self.mismatches)
header = "identity_fraction\tmatches\tmismatches\tgaps\textensions"
def __str__(self):
"""Statistics as a tab separated string."""
return "\t".join(
str(x)
for x in (
self.identity_fraction(),
self.matches,
self.mismatches,
self.gaps,
self.extensions,
)
)
def align(
pair,
match=_SCORE_MATCH,
mismatch=_SCORE_MISMATCH,
gap=_SCORE_GAP_START,
extension=_SCORE_GAP_EXTENSION,
**keywds,
):
"""Align a pair of DNA files using dnal and calculate the statistics of the alignment."""
cmdline = _build_dnal_cmdline(match, mismatch, gap, extension)
temp_file = Wise.align(cmdline, pair, **keywds)
try:
return Statistics(temp_file.name, match, mismatch, gap, extension)
except AttributeError:
try:
keywds["dry_run"]
return None
except KeyError:
raise
def main():
"""Command line implementation."""
import sys
stats = align(sys.argv[1:3])
print(
"\n".join(
f"{attr}: {getattr(stats, attr)}"
for attr in ("matches", "mismatches", "gaps", "extensions")
)
)
print(f"identity_fraction: {stats.identity_fraction()}")
print(f"coords: {stats.coords}")
def _test(*args, **keywds):
import doctest
import sys
doctest.testmod(sys.modules[__name__], *args, **keywds)
if __name__ == "__main__":
if __debug__:
_test()
main()
|