Spaces:
No application file
No application file
# Copyright 2013 by Zheng Ruan ([email protected]). | |
# All rights reserved. | |
# This code is part of the Biopython distribution and governed by its | |
# license. Please see the LICENSE file that should have been included | |
# as part of this package. | |
"""Code for dealing with Codon Alignments.""" | |
import copy | |
from collections.abc import Mapping, Iterable | |
from Bio import BiopythonWarning | |
from Bio import BiopythonExperimentalWarning | |
from Bio.SeqRecord import SeqRecord | |
from Bio.Data import CodonTable | |
from Bio.codonalign.codonseq import CodonSeq | |
from Bio.codonalign.codonalignment import CodonAlignment, mktest | |
import warnings | |
# Issued at import time: flags the whole package as experimental so that
# users know the API may still change.
warnings.warn(
    "Bio.codonalign is an experimental module which may undergo "
    "significant changes prior to its future official release.",
    BiopythonExperimentalWarning,
)
def build(
    pro_align,
    nucl_seqs,
    corr_dict=None,
    gap_char="-",
    unknown="X",
    codon_table=None,
    complete_protein=False,
    anchor_len=10,
    max_score=10,
):
    """Build a codon alignment from protein alignment and corresponding nucleotides.

    Arguments:
     - pro_align  - a protein MultipleSeqAlignment object
     - nucl_seqs - an object returned by SeqIO.parse or SeqIO.index
       or a collection of SeqRecord.
     - corr_dict  - a dict that maps protein id to nucleotide id
     - gap_char - gap character used in the protein alignment; it is
       forwarded to the matching helpers
     - unknown - kept for backward compatibility; this function does
       not use it
     - codon_table - CodonTable used for matching; defaults to
       ``CodonTable.generic_by_id[1]`` when None
     - complete_protein - whether the sequence begins with a start
       codon
     - anchor_len - length (in amino acids) of the anchors used when a
       direct match fails; forwarded to ``_check_corr``
     - max_score - mismatch tolerance forwarded to ``_get_codon_rec``

    Return a CodonAlignment object.

    Raises TypeError if ``pro_align``, ``nucl_seqs`` or ``corr_dict`` have
    an unsupported type, ValueError if records cannot be paired (too few
    nucleotide records, an id without a partner, a duplicate nucleotide
    id) and RuntimeError if ``corr_dict`` has fewer entries than there
    are protein records.

    The example below answers this Biostars question: https://www.biostars.org/p/89741/

    >>> from Bio.Seq import Seq
    >>> from Bio.SeqRecord import SeqRecord
    >>> from Bio.Align import MultipleSeqAlignment
    >>> from Bio.codonalign import build
    >>> seq1 = SeqRecord(Seq('ATGTCTCGT'), id='pro1')
    >>> seq2 = SeqRecord(Seq('ATGCGT'), id='pro2')
    >>> pro1 = SeqRecord(Seq('MSR'), id='pro1')
    >>> pro2 = SeqRecord(Seq('M-R'), id='pro2')
    >>> aln = MultipleSeqAlignment([pro1, pro2])
    >>> codon_aln = build(aln, [seq1, seq2])
    >>> print(codon_aln)
    CodonAlignment with 2 rows and 9 columns (3 codons)
    ATGTCTCGT pro1
    ATG---CGT pro2

    """
    # TODO
    # add an option to allow the user to specify the returned object?

    from Bio.Align import MultipleSeqAlignment

    # check the type of object of pro_align
    if not isinstance(pro_align, MultipleSeqAlignment):
        raise TypeError("the first argument should be a MultipleSeqAlignment object")
    # check whether the number of seqs in pro_align and nucl_seqs is
    # the same
    pro_num = len(pro_align)
    if corr_dict is None:
        try:
            nucl_num = len(nucl_seqs)
        except TypeError:
            # nucl_seqs will be an iterator if returned by SeqIO.parse()
            nucl_seqs = tuple(nucl_seqs)
            nucl_num = len(nucl_seqs)
        if pro_num > nucl_num:
            raise ValueError(
                f"Higher Number of SeqRecords in Protein Alignment ({pro_num}) "
                f"than the Number of Nucleotide SeqRecords ({nucl_num}) are found!"
            )

        # Determine the protein sequences and nucl sequences
        # correspondence. If nucl_seqs is a list, tuple or read by
        # SeqIO.parse(), we assume the order of sequences in pro_align
        # and nucl_seqs are the same. If nucl_seqs is a dict or read by
        # SeqIO.index(), we match seqs in pro_align and those in
        # nucl_seq by their id.
        if isinstance(nucl_seqs, Mapping):
            corr_method = 1
        elif isinstance(nucl_seqs, Iterable):
            corr_method = 0
        else:
            raise TypeError(
                "Nucl Sequences Error, Unknown type to assign correspondence method"
            )
    else:
        if not isinstance(corr_dict, dict):
            raise TypeError(
                "corr_dict should be a dict that corresponds "
                "protein id to nucleotide id!"
            )
        if len(corr_dict) < pro_num:
            raise RuntimeError(
                f"Number of items in corr_dict ({len(corr_dict)}) "
                f"is less than number of protein records ({pro_num})"
            )
        if not isinstance(nucl_seqs, Mapping):
            # Index the nucleotide records by id so that corr_dict values
            # can be looked up directly.
            by_id = {}
            for record in nucl_seqs:
                key = record.id
                if key in by_id:
                    raise ValueError(f"Duplicate key '{key}'")
                by_id[key] = record
            nucl_seqs = by_id
        corr_method = 2

    # set up pro-nucl correspondence based on corr_method
    # corr_method = 0, consecutive pairing
    if corr_method == 0:
        pro_nucl_pair = zip(pro_align, nucl_seqs)
    # corr_method = 1, keyword pairing
    elif corr_method == 1:
        nucl_id = set(nucl_seqs.keys())
        pro_id = {i.id for i in pro_align}
        # check if there is pro_id that does not have a nucleotide match
        diff = pro_id - nucl_id
        if diff:
            raise ValueError(
                f"Protein Record {', '.join(diff)} cannot find a "
                "nucleotide sequence match, please check the id"
            )
        pro_nucl_pair = [(pro_rec, nucl_seqs[pro_rec.id]) for pro_rec in pro_align]
    # corr_method = 2, dict pairing
    elif corr_method == 2:
        pro_nucl_pair = []
        for pro_rec in pro_align:
            try:
                nucl_id = corr_dict[pro_rec.id]
            except KeyError:
                # A library function must not terminate the interpreter;
                # report the unmapped protein id to the caller instead.
                raise ValueError(
                    f"Protein record ({pro_rec.id}) is not in corr_dict!"
                ) from None
            pro_nucl_pair.append((pro_rec, nucl_seqs[nucl_id]))

    if codon_table is None:
        codon_table = CodonTable.generic_by_id[1]

    codon_aln = []
    shift = False
    for pair in pro_nucl_pair:
        # Beware that the following span corresponds to an ungapped
        # nucleotide sequence.
        corr_span = _check_corr(
            pair[0],
            pair[1],
            gap_char=gap_char,
            codon_table=codon_table,
            complete_protein=complete_protein,
            anchor_len=anchor_len,
        )
        if not corr_span:
            raise ValueError(
                f"Protein Record {pair[0].id} and "
                f"Nucleotide Record {pair[1].id} do not match!"
            )
        codon_rec = _get_codon_rec(
            pair[0],
            pair[1],
            corr_span,
            gap_char=gap_char,
            complete_protein=complete_protein,
            codon_table=codon_table,
            max_score=max_score,
        )
        codon_aln.append(codon_rec)
        # mode 2 means a frameshift was detected for this record
        if corr_span[1] == 2:
            shift = True
    if shift:
        # records containing frameshifts must be re-aligned to each other
        return CodonAlignment(_align_shift_recs(codon_aln))
    return CodonAlignment(codon_aln)
def _codons2re(codons): | |
"""Generate regular expression based on a given list of codons (PRIVATE).""" | |
reg = "" | |
for i in zip(*codons): | |
if len(set(i)) == 1: | |
reg += "".join(set(i)) | |
else: | |
reg += "[" + "".join(set(i)) + "]" | |
return reg | |
def _get_aa_regex(codon_table, stop="*", unknown="X"):
    """Map every amino acid of a CodonTable to a codon regular expression (PRIVATE).

    The returned dict has one entry per amino acid found in the forward
    table of ``codon_table``, one entry under ``stop`` built from the
    table's stop codons, and one entry under ``unknown`` that matches
    any three characters.

    >>> from Bio.Data.CodonTable import generic_by_id
    >>> p = generic_by_id[1]
    >>> t = _get_aa_regex(p)
    >>> print(t['A'][0])
    G
    >>> print(t['A'][1])
    C
    >>> print(sorted(list(t['A'][2:])))
    ['A', 'C', 'G', 'T', 'U', '[', ']']
    >>> print(sorted(list(t['L'][:5])))
    ['C', 'T', 'U', '[', ']']
    >>> print(sorted(list(t['L'][5:9])))
    ['T', 'U', '[', ']']
    >>> print(sorted(list(t['L'][9:])))
    ['A', 'C', 'G', 'T', 'U', '[', ']']

    """
    from Bio.Data.CodonTable import CodonTable

    if not isinstance(codon_table, CodonTable):
        raise TypeError("Input table is not a instance of Bio.Data.CodonTable object")

    # group the codons by the residue they encode
    codons_by_residue = {}
    for triplet, residue in codon_table.forward_table.items():
        if residue not in codons_by_residue:
            codons_by_residue[residue] = []
        codons_by_residue[residue].append(triplet)

    # collapse each group of codons into a single pattern
    regex_by_residue = {
        residue: _codons2re(triplets)
        for residue, triplets in codons_by_residue.items()
    }
    regex_by_residue[stop] = _codons2re(codon_table.stop_codons)
    regex_by_residue[unknown] = "..."
    return regex_by_residue
def _check_corr(
    pro, nucl, gap_char, codon_table, complete_protein=False, anchor_len=10
):
    """Check if the nucleotide can be translated into the protein (PRIVATE).

    Expects two SeqRecord objects.

    The ungapped protein is turned into a regular expression over codons
    and searched for in the upper-cased, ungapped nucleotide sequence.
    Three strategies are tried in turn and the return value tells which
    one succeeded:

     - ``(span, 0)`` - the full protein pattern matched directly
     - ``(span, 1)`` - the pattern matched after anchors that could not
       be located were replaced by wildcard codons
     - ``(span, 2, match)`` - the pattern matched after anchors were
       replaced by frameshift-tolerant patterns; the match object is
       returned as the third element

    ``span`` is the ``(start, end)`` of the match in the ungapped
    nucleotide sequence.

    Raises TypeError if either argument is not a SeqRecord, and
    RuntimeError if none of the strategies produces a match.
    """
    import re

    if not isinstance(pro, SeqRecord) or not isinstance(nucl, SeqRecord):
        raise TypeError(
            "_check_corr accepts two SeqRecord object. Please check your input."
        )

    aa2re = _get_aa_regex(codon_table)
    # pattern for the whole protein: one codon pattern per non-gap residue
    pro_re = ""
    for aa in pro.seq:
        if aa != gap_char:
            pro_re += aa2re[aa]

    nucl_seq = str(nucl.seq.upper().replace(gap_char, ""))
    match = re.search(pro_re, nucl_seq)
    if match:
        # mode = 0, direct match
        return (match.span(), 0)
    else:
        # The direct match failed, possibly because of mismatches or a
        # frameshift; retry using anchors.
        # anchor_len = 10 # adjust this value to test performance
        pro_seq = str(pro.seq).replace(gap_char, "")
        # consecutive, non-overlapping slices of anchor_len residues
        anchors = [
            pro_seq[i : (i + anchor_len)] for i in range(0, len(pro_seq), anchor_len)
        ]
        # if the last anchor is less than the specified anchor
        # size, we combine the penultimate and the last anchor
        # together as the last one.
        # TODO: modify this to deal with short sequence with only
        # one anchor.
        # NOTE(review): with a single anchor shorter than anchor_len,
        # anchors[-2] raises IndexError - confirm whether callers can
        # reach this with short proteins.
        if len(anchors[-1]) < anchor_len:
            anchors[-1] = anchors[-2] + anchors[-1]

        # from here on pro_re is a list holding one pattern per anchor
        pro_re = []
        # NOTE(review): anchor_distance is assigned but never read in
        # this function.
        anchor_distance = 0
        # (match start, match end, anchor index) for every anchor found
        anchor_pos = []
        for i, anchor in enumerate(anchors):
            this_anchor_len = len(anchor)
            # qcodon: exact pattern for the anchor
            # fncodon: wildcard pattern of the same length
            qcodon = ""
            fncodon = ""
            # dirty code to deal with the last anchor
            # as the last anchor is combined in the steps
            # above, we need to get the true last anchor to
            # pro_re
            if this_anchor_len == anchor_len:
                for aa in anchor:
                    # NOTE(review): this condition does not depend on aa,
                    # so the start codon pattern is used for every
                    # residue of the first anchor - confirm whether only
                    # the first residue was intended.
                    if complete_protein and i == 0:
                        qcodon += _codons2re(codon_table.start_codons)
                        fncodon += aa2re["X"]
                        continue
                    qcodon += aa2re[aa]
                    fncodon += aa2re["X"]
                match = re.search(qcodon, nucl_seq)
            elif this_anchor_len > anchor_len:
                # combined last anchor: only the residues after the
                # first anchor_len ones are searched for
                last_qcodon = ""
                last_fcodon = ""
                for j in range(anchor_len, len(anchor)):
                    last_qcodon += aa2re[anchor[j]]
                    last_fcodon += aa2re["X"]
                match = re.search(last_qcodon, nucl_seq)
            # build full_pro_re from anchors
            # matched anchors keep their exact pattern, the others fall
            # back to the wildcard pattern
            if match:
                anchor_pos.append((match.start(), match.end(), i))
                if this_anchor_len == anchor_len:
                    pro_re.append(qcodon)
                else:
                    pro_re.append(last_qcodon)
            else:
                if this_anchor_len == anchor_len:
                    pro_re.append(fncodon)
                else:
                    pro_re.append(last_fcodon)
        full_pro_re = "".join(pro_re)
        match = re.search(full_pro_re, nucl_seq)
        if match:
            # mode = 1, mismatch
            return (match.span(), 1)
        else:
            # check frames of anchors
            # ten frameshift events are allowed in a sequence
            # first_anchor is always True at this point
            first_anchor = True
            # index of the next unused frameshift group name
            shift_id_pos = 0
            # check the first anchor
            # NOTE(review): anchor_pos[0] raises IndexError when no
            # anchor was located at all - confirm.
            if first_anchor and anchor_pos[0][2] != 0:
                # candidate shifts; the trailing 0 ends the search
                # without a result
                shift_val_lst = [1, 2, 3 * anchor_len - 2, 3 * anchor_len - 1, 0]
                sh_anc = anchors[0]
                for shift_val in shift_val_lst:
                    if shift_val == 0:
                        qcodon = None
                        break
                    if shift_val in (1, 2):
                        sh_nuc_len = anchor_len * 3 + shift_val
                    elif shift_val in (3 * anchor_len - 2, 3 * anchor_len - 1):
                        sh_nuc_len = anchor_len * 3 - (3 * anchor_len - shift_val)
                    # nucleotides directly in front of the first located
                    # anchor
                    if anchor_pos[0][0] >= sh_nuc_len:
                        sh_nuc = nucl_seq[
                            anchor_pos[0][0] - sh_nuc_len : anchor_pos[0][0]
                        ]
                    else:
                        # this is unlikely to produce the correct output
                        sh_nuc = nucl_seq[: anchor_pos[0][0]]
                    qcodon, shift_id_pos = _get_shift_anchor_re(
                        sh_anc, sh_nuc, shift_val, aa2re, anchor_len, shift_id_pos
                    )
                    if qcodon is not None and qcodon != -1:
                        # pro_re[0] should be '.'*anchor_len, therefore I
                        # replace it.
                        pro_re[0] = qcodon
                        break
                if qcodon == -1:
                    warnings.warn(
                        f"first frameshift detection failed for {nucl.id}",
                        BiopythonWarning,
                    )
            # check anchors in the middle
            for i in range(len(anchor_pos) - 1):
                # a distance between two located anchors that is not a
                # multiple of 3 * anchor_len is treated as a frameshift
                shift_val = (anchor_pos[i + 1][0] - anchor_pos[i][0]) % (3 * anchor_len)
                sh_anc = "".join(anchors[anchor_pos[i][2] : anchor_pos[i + 1][2]])
                sh_nuc = nucl_seq[anchor_pos[i][0] : anchor_pos[i + 1][0]]
                qcodon = None
                if shift_val != 0:
                    qcodon, shift_id_pos = _get_shift_anchor_re(
                        sh_anc, sh_nuc, shift_val, aa2re, anchor_len, shift_id_pos
                    )
                if qcodon is not None and qcodon != -1:
                    # one shifted pattern replaces all anchor patterns
                    # between the two located anchors
                    pro_re[anchor_pos[i][2] : anchor_pos[i + 1][2]] = [qcodon]
                    qcodon = None
                elif qcodon == -1:
                    warnings.warn(
                        f"middle frameshift detection failed for {nucl.id}",
                        BiopythonWarning,
                    )
            # check the last anchor
            if anchor_pos[-1][2] + 1 == len(anchors) - 1:
                sh_anc = anchors[-1]
                this_anchor_len = len(sh_anc)
                shift_val_lst = [
                    1,
                    2,
                    3 * this_anchor_len - 2,
                    3 * this_anchor_len - 1,
                    0,
                ]
                for shift_val in shift_val_lst:
                    if shift_val == 0:
                        qcodon = None
                        break
                    if shift_val in (1, 2):
                        sh_nuc_len = this_anchor_len * 3 + shift_val
                    elif shift_val in (
                        3 * this_anchor_len - 2,
                        3 * this_anchor_len - 1,
                    ):
                        sh_nuc_len = this_anchor_len * 3 - (
                            3 * this_anchor_len - shift_val
                        )
                    # nucleotides starting at the last located anchor
                    if len(nucl_seq) - anchor_pos[-1][0] >= sh_nuc_len:
                        sh_nuc = nucl_seq[
                            anchor_pos[-1][0] : anchor_pos[-1][0] + sh_nuc_len
                        ]
                    else:
                        # this is unlikely to produce the correct output
                        sh_nuc = nucl_seq[anchor_pos[-1][0] :]
                    qcodon, shift_id_pos = _get_shift_anchor_re(
                        sh_anc, sh_nuc, shift_val, aa2re, this_anchor_len, shift_id_pos
                    )
                    if qcodon is not None and qcodon != -1:
                        # the shifted pattern replaces the last two
                        # entries of pro_re
                        pro_re.pop()
                        pro_re[-1] = qcodon
                        break
                if qcodon == -1:
                    warnings.warn(
                        f"last frameshift detection failed for {nucl.id}",
                        BiopythonWarning,
                    )
            # try global match
            full_pro_re = "".join(pro_re)
            match = re.search(full_pro_re, nucl_seq)
            if match:
                # mode = 2, frameshift; the match object carries the
                # named frameshift groups
                return (match.span(), 2, match)
            else:
                raise RuntimeError(
                    f"Protein SeqRecord ({pro.id}) and "
                    f"Nucleotide SeqRecord ({nucl.id}) do not match!"
                )
def _get_shift_anchor_re(sh_anc, sh_nuc, shift_val, aa2re, anchor_len, shift_id_pos): | |
"""Find a regular expression matching a potentially shifted anchor (PRIVATE). | |
Arguments: | |
- sh_anc - shifted anchor sequence | |
- sh_nuc - potentially corresponding nucleotide sequence | |
of sh_anc | |
- shift_val - 1 or 2 indicates forward frame shift, whereas | |
3*anchor_len-1 or 3*anchor_len-2 indicates | |
backward shift | |
- aa2re - aa to codon re dict | |
- anchor_len - length of the anchor | |
- shift_id_pos - specify current shift name we are at | |
""" | |
import re | |
shift_id = [chr(i) for i in range(97, 107)] | |
if 0 < shift_val < 3 * anchor_len - 2: | |
# if shift_val in (1, 2): | |
for j in range(len(sh_anc)): | |
qcodon = "^" | |
for k, aa in enumerate(sh_anc): | |
if k == j: | |
qcodon += aa2re[aa] + "(?P<" + shift_id[shift_id_pos] + ">..*)" | |
else: | |
qcodon += aa2re[aa] | |
qcodon += "$" | |
match = re.search(qcodon, sh_nuc) | |
if match: | |
qcodon = qcodon.replace("^", "").replace("$", "") | |
shift_id_pos += 1 | |
return qcodon, shift_id_pos | |
if not match: | |
# failed to find a match (frameshift) | |
return -1, shift_id_pos | |
elif shift_val in (3 * anchor_len - 1, 3 * anchor_len - 2): | |
shift_val = 3 * anchor_len - shift_val | |
# obtain shifted anchor and corresponding nucl | |
# first check if the shifted pos is just at the end of the | |
# previous anchor. | |
for j in range(1, len(sh_anc)): | |
qcodon = "^" | |
for k, aa in enumerate(sh_anc): | |
if k == j - 1: | |
# will be considered in the next step | |
pass | |
elif k == j: | |
qcodon += _merge_aa2re( | |
sh_anc[j - 1], | |
sh_anc[j], | |
shift_val, | |
aa2re, | |
shift_id[shift_id_pos].upper(), | |
) | |
else: | |
qcodon += aa2re[aa] | |
qcodon += "$" | |
match = re.search(qcodon, sh_nuc) | |
if match: | |
qcodon = qcodon.replace("^", "").replace("$", "") | |
shift_id_pos += 1 | |
return qcodon, shift_id_pos | |
if not match: | |
# failed to find a match (frameshift) | |
return -1, shift_id_pos | |
def _merge_aa2re(aa1, aa2, shift_val, aa2re, reid): | |
"""Merge two amino acids based on detected frame shift value (PRIVATE).""" | |
def get_aa_from_codonre(re_aa): | |
aas = [] | |
m = 0 | |
for i in re_aa: | |
if i == "[": | |
m = -1 | |
aas.append("") | |
elif i == "]": | |
m = 0 | |
continue | |
elif m == -1: | |
aas[-1] = aas[-1] + i | |
elif m == 0: | |
aas.append(i) | |
return aas | |
scodon = list(map(get_aa_from_codonre, (aa2re[aa1], aa2re[aa2]))) | |
if shift_val == 1: | |
intersect = "".join(set(scodon[0][2]) & set(scodon[1][0])) | |
scodonre = "(?P<" + reid + ">" | |
scodonre += ( | |
"[" | |
+ scodon[0][0] | |
+ "]" | |
+ "[" | |
+ scodon[0][1] | |
+ "]" | |
+ "[" | |
+ intersect | |
+ "]" | |
+ "[" | |
+ scodon[1][1] | |
+ "]" | |
+ "[" | |
+ scodon[1][2] | |
+ "]" | |
) | |
elif shift_val == 2: | |
intersect1 = "".join(set(scodon[0][1]) & set(scodon[1][0])) | |
intersect2 = "".join(set(scodon[0][2]) & set(scodon[1][1])) | |
scodonre = "(?P<" + reid + ">" | |
scodonre += ( | |
"[" | |
+ scodon[0][0] | |
+ "]" | |
+ "[" | |
+ intersect1 | |
+ "]" | |
+ "[" | |
+ intersect2 | |
+ "]" | |
+ "[" | |
+ scodon[1][2] | |
+ "]" | |
) | |
scodonre += ")" | |
return scodonre | |
def _get_codon_rec(
    pro, nucl, span_mode, gap_char, codon_table, complete_protein=False, max_score=10
):
    """Generate codon alignment based on regular re match (PRIVATE).

    span_mode is a tuple returned by _check_corr. The first element
    is the span of a re search, and the second element is the mode
    for the match.

    mode
     - 0: direct match
     - 1: mismatch (no indels)
     - 2: frameshift

    For mode 2 the tuple carries the match object as a third element;
    its named groups give the positions of the frameshifts.

    Arguments:
     - pro - protein SeqRecord, possibly containing ``gap_char``
     - nucl - nucleotide SeqRecord; ``gap_char`` is removed before use
     - gap_char - gap character of the protein; every protein gap
       becomes ``"---"`` in the codon sequence
     - codon_table - table used to check each codon against its residue
     - complete_protein - if true, the first codon is checked against
       the start codons of the table instead of being translated
     - max_score - decremented for each mismatch or frameshift codon;
       RuntimeError is raised when it reaches 0

    Returns a SeqRecord holding a CodonSeq, with the id of ``nucl``.

    Raises ValueError in mode 0/1 if the span length is not three times
    the ungapped protein length, and RuntimeError when ``max_score`` is
    exhausted.
    """
    import re

    nucl_seq = nucl.seq.replace(gap_char, "")
    span = span_mode[0]
    mode = span_mode[1]
    # Called for its type check of codon_table; the mapping itself is
    # not needed in this function.
    _get_aa_regex(codon_table)
    if mode in (0, 1):
        if len(pro.seq.replace(gap_char, "")) * 3 != (span[1] - span[0]):
            raise ValueError(
                f"Protein Record {pro.id} and "
                f"Nucleotide Record {nucl.id} do not match!"
            )
        aa_num = 0
        codon_seq = CodonSeq()
        for aa in pro.seq:
            # honour the caller's gap character, as the length check
            # above already does
            if aa == gap_char:
                codon_seq += "---"
            elif complete_protein and aa_num == 0:
                this_codon = nucl_seq[span[0] : span[0] + 3]
                if not re.search(
                    _codons2re(codon_table.start_codons), str(this_codon.upper())
                ):
                    max_score -= 1
                    warnings.warn(
                        f"start codon of {pro.id} ({aa} {aa_num}) does not "
                        f"correspond to {nucl.id} ({this_codon})",
                        BiopythonWarning,
                    )
                if max_score == 0:
                    raise RuntimeError(
                        f"max_score reached for {nucl.id}! Please raise up "
                        "the tolerance to get an alignment in anyway"
                    )
                codon_seq += this_codon
                aa_num += 1
            else:
                this_codon = nucl_seq[span[0] + 3 * aa_num : span[0] + 3 * (aa_num + 1)]
                if this_codon.upper().translate(table=codon_table) != aa:
                    max_score -= 1
                    warnings.warn(
                        "%s(%s %d) does not correspond to %s(%s)"
                        % (pro.id, aa, aa_num, nucl.id, this_codon),
                        BiopythonWarning,
                    )
                if max_score == 0:
                    raise RuntimeError(
                        f"max_score reached for {nucl.id}! Please raise up "
                        "the tolerance to get an alignment in anyway"
                    )
                codon_seq += this_codon
                aa_num += 1
        return SeqRecord(codon_seq, id=nucl.id)
    elif mode == 2:
        from collections import deque

        shift_pos = deque([])
        shift_start = []
        match = span_mode[2]
        m_groupdict = list(match.groupdict().keys())
        # span and start of every named frameshift group; upper-case
        # names mark backward shifts, lower-case names forward shifts
        for i in m_groupdict:
            shift_pos.append(match.span(i))
            shift_start.append(match.start(i))
        # rf_table holds the start position of every codon
        rf_table = []
        i = match.start()
        while True:
            rf_table.append(i)
            i += 3
            if i in shift_start and m_groupdict[shift_start.index(i)].isupper():
                # backward shift: the group spans two codons that
                # overlap by shift_val nucleotides
                shift_index = shift_start.index(i)
                shift_val = 6 - (shift_pos[shift_index][1] - shift_pos[shift_index][0])
                rf_table.append(i)
                rf_table.append(i + 3 - shift_val)
                i = shift_pos[shift_index][1]
            elif i in shift_start and m_groupdict[shift_start.index(i)].islower():
                # forward shift: skip the inserted nucleotides
                i = shift_pos[shift_start.index(i)][1]
            if i >= match.end():
                break
        codon_seq = CodonSeq()
        aa_num = 0
        # index of the last non-gap residue (loop invariant)
        last_aa_index = len(pro.seq.replace(gap_char, "")) - 1
        for aa in pro.seq:
            if aa == gap_char:
                codon_seq += "---"
            elif complete_protein and aa_num == 0:
                this_codon = nucl_seq[rf_table[0] : rf_table[0] + 3]
                if not re.search(
                    _codons2re(codon_table.start_codons), str(this_codon.upper())
                ):
                    max_score -= 1
                    warnings.warn(
                        f"start codon of {pro.id}({aa} {aa_num}) does not "
                        f"correspond to {nucl.id}({this_codon})",
                        BiopythonWarning,
                    )
                codon_seq += this_codon
                aa_num += 1
            else:
                if (
                    aa_num < last_aa_index
                    and rf_table[aa_num + 1] - rf_table[aa_num] - 3 < 0
                ):
                    # The next codon starts before this one ends
                    # (backward shift): keep the nucleotides up to the
                    # next codon and pad with gaps.  The gap count is
                    # taken from rf_table for this very codon rather
                    # than from the last shift seen while building the
                    # table, so several shifts of different size work.
                    max_score -= 1
                    start = rf_table[aa_num]
                    end = rf_table[aa_num + 1]
                    ngap = 3 - (end - start)
                    this_codon = nucl_seq[start:end] + "-" * ngap
                elif rf_table[aa_num] - rf_table[aa_num - 1] - 3 > 0:
                    # extra nucleotides sit between the previous codon
                    # and this one (forward shift): emit them padded
                    # with gaps, followed by the codon itself
                    max_score -= 1
                    start = rf_table[aa_num - 1] + 3
                    end = rf_table[aa_num]
                    ngap = 3 - (rf_table[aa_num] - rf_table[aa_num - 1] - 3)
                    this_codon = (
                        nucl_seq[start:end]
                        + "-" * ngap
                        + nucl_seq[rf_table[aa_num] : rf_table[aa_num] + 3]
                    )
                else:
                    start = rf_table[aa_num]
                    end = start + 3
                    this_codon = nucl_seq[start:end]
                    if this_codon.upper().translate(table=codon_table) != aa:
                        max_score -= 1
                        warnings.warn(
                            f"Codon of {pro.id}({aa} {aa_num}) does not "
                            f"correspond to {nucl.id}({this_codon})",
                            BiopythonWarning,
                        )
                if max_score == 0:
                    raise RuntimeError(
                        f"max_score reached for {nucl.id}! Please raise up "
                        "the tolerance to get an alignment in anyway"
                    )
                codon_seq += this_codon
                aa_num += 1
        codon_seq.rf_table = rf_table
        return SeqRecord(codon_seq, id=nucl.id)
def _align_shift_recs(recs):
    """Build alignment according to the frameshift detected by _check_corr (PRIVATE).

    Argument:
     - recs - a list of SeqRecords containing a CodonSeq dictated
       by a rf_table (with frameshift in some of them).

    The sequences of ``recs`` are replaced in place by gap-padded
    CodonSeq objects and the same list is returned.

    Raises RuntimeError if the records do not all have the same number
    of alignable codons.
    """

    def find_next_int(k, lst):
        # Starting at the position of k in lst, scan forward to the
        # first int entry; return that entry and its offset from k.
        idx = lst.index(k)
        p = 0
        while True:
            if isinstance(lst[idx + p], int):
                return lst[idx + p], p
            p += 1

    full_rf_table_lst = [rec.seq.get_full_rf_table() for rec in recs]
    # Per record, count the int entries plus the non-int entries that
    # point at a "---" codon.
    # NOTE(review): float entries presumably mark positions that are not
    # regular in-frame codons - confirm against
    # CodonSeq.get_full_rf_table.
    rf_num = [0] * len(recs)
    for k, rec in enumerate(recs):
        for i in rec.seq.get_full_rf_table():
            if isinstance(i, int):
                rf_num[k] += 1
            # isinstance(i, float) should be True
            elif rec.seq[int(i) : int(i) + 3] == "---":
                rf_num[k] += 1
    if len(set(rf_num)) != 1:
        raise RuntimeError("Number of alignable codons unequal in given records")
    # i is the column (index into every full rf table) being processed
    i = 0
    rec_num = len(recs)
    while True:
        # (record index, position) of the records that need gaps
        # inserted in this column
        add_lst = []
        try:
            col_rf_lst = [k[i] for k in full_rf_table_lst]
        except IndexError:
            # we probably reached the last codon
            break
        for j, k in enumerate(col_rf_lst):
            add_lst.append((j, int(k)))
            # a float entry that does not point at a gap codon
            if isinstance(k, float) and recs[j].seq[int(k) : int(k) + 3] != "---":
                m, p = find_next_int(k, full_rf_table_lst[j])
                # gaps needed to make the distance to the next int
                # entry a multiple of three
                if (m - k) % 3 != 0:
                    gap_num = 3 - (m - k) % 3
                else:
                    gap_num = 0
                if gap_num != 0:
                    gaps = "-" * int(gap_num)
                    seq = CodonSeq(rf_table=recs[j].seq.rf_table)
                    seq += recs[j].seq[: int(k)] + gaps + recs[j].seq[int(k) :]
                    full_rf_table = full_rf_table_lst[j]
                    bp = full_rf_table.index(k)
                    # drop entry k and move every later entry by the
                    # number of inserted gaps
                    full_rf_table = full_rf_table[:bp] + [
                        v + int(gap_num) for v in full_rf_table[bp + 1 :]
                    ]
                    full_rf_table_lst[j] = full_rf_table
                    recs[j].seq = seq
                # this record is not padded again below
                add_lst.pop()
                gap_num += m - k
                i += p - 1
        # add_lst is shorter than rec_num only if a record was popped
        # above, in which case gap_num has been assigned
        if len(add_lst) != rec_num:
            for j, k in add_lst:
                seq = CodonSeq(rf_table=recs[j].seq.rf_table)
                gaps = "-" * int(gap_num)
                seq += recs[j].seq[: int(k)] + gaps + recs[j].seq[int(k) :]
                full_rf_table = full_rf_table_lst[j]
                bp = full_rf_table.index(k)
                # float entries, one per three inserted gap characters
                inter_rf = []
                for t in range(0, len(gaps), 3):
                    inter_rf.append(k + t + 3.0)
                full_rf_table = (
                    full_rf_table[:bp]
                    + inter_rf
                    + [v + int(gap_num) for v in full_rf_table[bp:]]
                )
                full_rf_table_lst[j] = full_rf_table
                recs[j].seq = seq
        i += 1
    return recs
# Script entry point: calls Biopython's run_doctest helper (presumably
# runs the doctests embedded in this module's docstrings).
if __name__ == "__main__":
    from Bio._utils import run_doctest

    run_doctest()