Spaces:

aakash0017
/

DrVai-Rag-Testing

No application file

File size: 30,013 Bytes

b7731cd

# Copyright 2013 by Zheng Ruan ([email protected]).
# All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

"""Code for dealing with Codon Alignments."""

import copy
from collections.abc import Mapping, Iterable

from Bio import BiopythonWarning
from Bio import BiopythonExperimentalWarning

from Bio.SeqRecord import SeqRecord
from Bio.Data import CodonTable

from Bio.codonalign.codonseq import CodonSeq
from Bio.codonalign.codonalignment import CodonAlignment, mktest

import warnings

# Module-level statement: executed whenever this module is run/imported, so
# users are told that the Bio.codonalign API is still experimental.
warnings.warn(
    "Bio.codonalign is an experimental module which may undergo "
    "significant changes prior to its future official release.",
    BiopythonExperimentalWarning,
)


def build(
    pro_align,
    nucl_seqs,
    corr_dict=None,
    gap_char="-",
    unknown="X",
    codon_table=None,
    complete_protein=False,
    anchor_len=10,
    max_score=10,
):
    """Build a codon alignment from protein alignment and corresponding nucleotides.

    Arguments:
     - pro_align  - a protein MultipleSeqAlignment object
     - nucl_seqs - an object returned by SeqIO.parse or SeqIO.index
       or a collection of SeqRecord.
     - corr_dict  - a dict that maps protein id to nucleotide id
     - gap_char - gap character used in the protein alignment
     - unknown - accepted for API compatibility; not used by this function
     - codon_table - CodonTable used for the protein/nucleotide comparison;
       defaults to CodonTable.generic_by_id[1]
     - complete_protein - whether the sequence begins with a start
       codon
     - anchor_len - anchor length forwarded to the anchor based matching
       in _check_corr
     - max_score - mismatch tolerance forwarded to _get_codon_rec

    Return a CodonAlignment object.

    Raises TypeError if pro_align, nucl_seqs or corr_dict have an unexpected
    type, ValueError if records cannot be paired up or do not match, and
    RuntimeError if corr_dict has fewer entries than there are proteins.

    How proteins are paired with nucleotide records:
     - corr_dict is None and nucl_seqs is a Mapping: by identical id
     - corr_dict is None and nucl_seqs is any other iterable: by order
     - corr_dict is given: protein id -> corr_dict -> nucleotide id

    The example below answers this Biostars question: https://www.biostars.org/p/89741/

    >>> from Bio.Seq import Seq
    >>> from Bio.SeqRecord import SeqRecord
    >>> from Bio.Align import MultipleSeqAlignment
    >>> from Bio.codonalign import build
    >>> seq1 = SeqRecord(Seq('ATGTCTCGT'), id='pro1')
    >>> seq2 = SeqRecord(Seq('ATGCGT'), id='pro2')
    >>> pro1 = SeqRecord(Seq('MSR'), id='pro1')
    >>> pro2 = SeqRecord(Seq('M-R'), id='pro2')
    >>> aln = MultipleSeqAlignment([pro1, pro2])
    >>> codon_aln = build(aln, [seq1, seq2])
    >>> print(codon_aln)
    CodonAlignment with 2 rows and 9 columns (3 codons)
    ATGTCTCGT pro1
    ATG---CGT pro2

    """
    # TODO
    # add an option to allow the user to specify the returned object?

    from Bio.Align import MultipleSeqAlignment

    # check the type of object of pro_align
    if not isinstance(pro_align, MultipleSeqAlignment):
        raise TypeError("the first argument should be a MultipleSeqAlignment object")
    # check whether the number of seqs in pro_align and nucl_seqs is
    # the same
    pro_num = len(pro_align)
    if corr_dict is None:
        try:
            nucl_num = len(nucl_seqs)
        except TypeError:
            # nucl_seqs will be an iterator if returned by SeqIO.parse()
            nucl_seqs = tuple(nucl_seqs)
            nucl_num = len(nucl_seqs)
        if pro_num > nucl_num:
            raise ValueError(
                f"Higher Number of SeqRecords in Protein Alignment ({pro_num}) "
                f"than the Number of Nucleotide SeqRecords ({nucl_num}) are found!"
            )

        # Determine the protein sequences and nucl sequences
        # correspondence. If nucl_seqs is a list, tuple or read by
        # SeqIO.parse(), we assume the order of sequences in pro_align
        # and nucl_seqs are the same. If nucl_seqs is a dict or read by
        # SeqIO.index(), we match seqs in pro_align and those in
        # nucl_seq by their id.
        if isinstance(nucl_seqs, Mapping):
            corr_method = 1
        elif isinstance(nucl_seqs, Iterable):
            corr_method = 0
        else:
            raise TypeError(
                "Nucl Sequences Error, Unknown type to assign correspondence method"
            )
    else:
        if not isinstance(corr_dict, dict):
            raise TypeError(
                "corr_dict should be a dict that corresponds "
                "protein id to nucleotide id!"
            )
        if len(corr_dict) < pro_num:
            raise RuntimeError(
                f"Number of items in corr_dict ({len(corr_dict)}) "
                f"is less than number of protein records ({pro_num})"
            )
        if not isinstance(nucl_seqs, Mapping):
            # Index the records by id so they can be looked up through
            # corr_dict below.
            d = {}
            for record in nucl_seqs:
                key = record.id
                if key in d:
                    raise ValueError(f"Duplicate key '{key}'")
                d[key] = record
            nucl_seqs = d
        corr_method = 2

    # set up pro-nucl correspondence based on corr_method
    # corr_method = 0, consecutive pairing
    if corr_method == 0:
        pro_nucl_pair = zip(pro_align, nucl_seqs)
    # corr_method = 1, keyword pairing
    elif corr_method == 1:
        nucl_id = set(nucl_seqs.keys())
        pro_id = {i.id for i in pro_align}
        # check if there is pro_id that does not have a nucleotide match
        diff = pro_id - nucl_id
        if diff:
            raise ValueError(
                f"Protein Record {', '.join(diff)} cannot find a "
                "nucleotide sequence match, please check the id"
            )
        pro_nucl_pair = [(pro_rec, nucl_seqs[pro_rec.id]) for pro_rec in pro_align]
    # corr_method = 2, dict pairing
    elif corr_method == 2:
        pro_nucl_pair = []
        for pro_rec in pro_align:
            try:
                nucl_id = corr_dict[pro_rec.id]
            except KeyError:
                # Library code must not print and terminate the interpreter;
                # report the problem to the caller instead.
                raise ValueError(
                    f"Protein record ({pro_rec.id}) is not in corr_dict!"
                ) from None
            pro_nucl_pair.append((pro_rec, nucl_seqs[nucl_id]))

    if codon_table is None:
        codon_table = CodonTable.generic_by_id[1]

    codon_aln = []
    # Set to True as soon as one record needed frameshift handling (mode 2).
    shift = False
    for pro_rec, nucl_rec in pro_nucl_pair:
        # Beware that the following span corresponds to an ungapped
        # nucleotide sequence.
        corr_span = _check_corr(
            pro_rec,
            nucl_rec,
            gap_char=gap_char,
            codon_table=codon_table,
            complete_protein=complete_protein,
            anchor_len=anchor_len,
        )
        if not corr_span:
            raise ValueError(
                f"Protein Record {pro_rec.id} and "
                f"Nucleotide Record {nucl_rec.id} do not match!"
            )
        codon_rec = _get_codon_rec(
            pro_rec,
            nucl_rec,
            corr_span,
            gap_char=gap_char,
            complete_protein=complete_protein,
            codon_table=codon_table,
            max_score=max_score,
        )
        codon_aln.append(codon_rec)
        if corr_span[1] == 2:
            shift = True
    if shift:
        # Records with frameshifts need extra gaps to line up with the rest.
        return CodonAlignment(_align_shift_recs(codon_aln))
    return CodonAlignment(codon_aln)


def _codons2re(codons):
    """Generate regular expression based on a given list of codons (PRIVATE).

    The codons are compared position by position. A position at which all
    codons agree contributes that single letter; any other position
    contributes a character class listing the observed letters in sorted
    order, so the returned pattern is the same on every run.

    An empty list of codons gives an empty pattern.
    """
    pieces = []
    for column in zip(*codons):
        letters = sorted(set(column))
        if len(letters) == 1:
            pieces.append(letters[0])
        else:
            pieces.append("[" + "".join(letters) + "]")
    return "".join(pieces)


def _get_aa_regex(codon_table, stop="*", unknown="X"):
    """Map every amino acid of a CodonTable to a codon regular expression (PRIVATE).

    The returned dict has one entry per amino acid of the table's
    forward_table, one entry for the stop symbol (built from the table's
    stop_codons) and one entry for the unknown symbol, which matches any
    three characters.

    Raises TypeError if codon_table is not a Bio.Data.CodonTable.CodonTable.

    >>> from Bio.Data.CodonTable import generic_by_id
    >>> p = generic_by_id[1]
    >>> t = _get_aa_regex(p)
    >>> print(t['A'][0])
    G
    >>> print(t['A'][1])
    C
    >>> print(sorted(list(t['A'][2:])))
    ['A', 'C', 'G', 'T', 'U', '[', ']']
    >>> print(sorted(list(t['L'][:5])))
    ['C', 'T', 'U', '[', ']']
    >>> print(sorted(list(t['L'][5:9])))
    ['T', 'U', '[', ']']
    >>> print(sorted(list(t['L'][9:])))
    ['A', 'C', 'G', 'T', 'U', '[', ']']

    """
    from Bio.Data.CodonTable import CodonTable

    if not isinstance(codon_table, CodonTable):
        raise TypeError("Input table is not a instance of Bio.Data.CodonTable object")

    # Group the codons by the amino acid they encode.
    codons_by_aa = {}
    for codon, residue in codon_table.forward_table.items():
        if residue not in codons_by_aa:
            codons_by_aa[residue] = []
        codons_by_aa[residue].append(codon)

    # Collapse every codon group into a single regular expression.
    regex_by_aa = {
        residue: _codons2re(codon_group)
        for residue, codon_group in codons_by_aa.items()
    }
    regex_by_aa[stop] = _codons2re(codon_table.stop_codons)
    regex_by_aa[unknown] = "..."
    return regex_by_aa


def _check_corr(
    pro, nucl, gap_char, codon_table, complete_protein=False, anchor_len=10
):
    """Check if the nucleotide can be translated into the protein (PRIVATE).

    Expects two SeqRecord objects.

    Arguments:
     - pro - protein SeqRecord; gap_char characters are skipped
     - nucl - nucleotide SeqRecord; upper-cased and stripped of gap_char
       before it is searched
     - gap_char - gap character removed from both sequences
     - codon_table - CodonTable used to build the per-residue codon
       regular expressions
     - complete_protein - only used by the anchor based fallback, where it
       makes the first anchor match the table's start codons
     - anchor_len - number of residues per anchor in the fallback search

    Returns a tuple whose first item is the span of the match in the
    ungapped nucleotide sequence and whose second item is the mode:
     - (span, 0) - the whole protein regex matched directly
     - (span, 1) - matched after unmatched anchors were replaced by
       wildcards (mismatches, no indels)
     - (span, 2, match) - matched after frameshift patterns were inserted;
       the re match object is returned as third item

    Raises TypeError if pro or nucl is not a SeqRecord and RuntimeError if
    no match is found even with frameshift patterns.
    """
    import re

    if not isinstance(pro, SeqRecord) or not isinstance(nucl, SeqRecord):
        raise TypeError(
            "_check_corr accepts two SeqRecord object. Please check your input."
        )

    # Build one regular expression for the whole (ungapped) protein by
    # concatenating the codon regex of every residue.
    aa2re = _get_aa_regex(codon_table)
    pro_re = ""
    for aa in pro.seq:
        if aa != gap_char:
            pro_re += aa2re[aa]

    nucl_seq = str(nucl.seq.upper().replace(gap_char, ""))
    match = re.search(pro_re, nucl_seq)
    if match:
        # mode = 0, direct match
        return (match.span(), 0)
    else:
        # The failure might be caused by mismatches or a frameshift; retry
        # using anchors (fixed-size chunks of the protein).
        # anchor_len = 10 # adjust this value to test performance
        pro_seq = str(pro.seq).replace(gap_char, "")
        anchors = [
            pro_seq[i : (i + anchor_len)] for i in range(0, len(pro_seq), anchor_len)
        ]
        # if the last anchor is less than the specified anchor
        # size, we combine the penultimate and the last anchor
        # together as the last one.
        # TODO: modify this to deal with short sequence with only
        # one anchor.
        # NOTE(review): with a single short anchor anchors[-2] raises
        # IndexError, and with an empty protein anchors[-1] does.
        if len(anchors[-1]) < anchor_len:
            anchors[-1] = anchors[-2] + anchors[-1]

        # From here on pro_re is a list holding one regex per anchor.
        pro_re = []
        anchor_distance = 0  # never read again in this function
        # (start, end, anchor index) for every anchor that matched on its own
        anchor_pos = []
        for i, anchor in enumerate(anchors):
            this_anchor_len = len(anchor)
            # qcodon: exact regex for this anchor
            # fncodon: wildcard regex of the same length, used on failure
            qcodon = ""
            fncodon = ""
            # dirty code to deal with the last anchor
            # as the last anchor is combined in the steps
            # above, we need to get the true last anchor to
            # pro_re
            if this_anchor_len == anchor_len:
                for aa in anchor:
                    # NOTE(review): i is the anchor index, so this applies
                    # the start codon regex to every residue of the first
                    # anchor, not just to the first residue - confirm intent.
                    if complete_protein and i == 0:
                        qcodon += _codons2re(codon_table.start_codons)
                        fncodon += aa2re["X"]
                        continue
                    qcodon += aa2re[aa]
                    fncodon += aa2re["X"]
                match = re.search(qcodon, nucl_seq)
            elif this_anchor_len > anchor_len:
                # Combined last anchor: only the residues after the first
                # anchor_len ones (the true remainder) are searched for.
                last_qcodon = ""
                last_fcodon = ""
                for j in range(anchor_len, len(anchor)):
                    last_qcodon += aa2re[anchor[j]]
                    last_fcodon += aa2re["X"]
                match = re.search(last_qcodon, nucl_seq)
            # build full_pro_re from anchors
            if match:
                anchor_pos.append((match.start(), match.end(), i))
                if this_anchor_len == anchor_len:
                    pro_re.append(qcodon)
                else:
                    pro_re.append(last_qcodon)
            else:
                # Unmatched anchor: fall back to wildcards of equal length.
                if this_anchor_len == anchor_len:
                    pro_re.append(fncodon)
                else:
                    pro_re.append(last_fcodon)
        full_pro_re = "".join(pro_re)
        match = re.search(full_pro_re, nucl_seq)
        if match:
            # mode = 1, mismatch
            return (match.span(), 1)
        else:
            # check frames of anchors
            # ten frameshift events are allowed in a sequence
            first_anchor = True
            # index of the next unused group name in _get_shift_anchor_re
            shift_id_pos = 0
            # check the first anchor
            # anchor_pos[0][2] != 0 means anchor 0 itself did not match.
            # NOTE(review): anchor_pos is empty when no anchor matched at
            # all, in which case anchor_pos[0] raises IndexError.
            if first_anchor and anchor_pos[0][2] != 0:
                # Candidate shifts to try in order; the trailing 0 is a
                # sentinel meaning "give up without a shift pattern".
                shift_val_lst = [1, 2, 3 * anchor_len - 2, 3 * anchor_len - 1, 0]
                sh_anc = anchors[0]
                for shift_val in shift_val_lst:
                    if shift_val == 0:
                        qcodon = None
                        break
                    if shift_val in (1, 2):
                        sh_nuc_len = anchor_len * 3 + shift_val
                    elif shift_val in (3 * anchor_len - 2, 3 * anchor_len - 1):
                        sh_nuc_len = anchor_len * 3 - (3 * anchor_len - shift_val)
                    # Take the nucleotides right before the first matched
                    # anchor as the candidate region for anchor 0.
                    if anchor_pos[0][0] >= sh_nuc_len:
                        sh_nuc = nucl_seq[
                            anchor_pos[0][0] - sh_nuc_len : anchor_pos[0][0]
                        ]
                    else:
                        # this is unlikely to produce the correct output
                        sh_nuc = nucl_seq[: anchor_pos[0][0]]
                    qcodon, shift_id_pos = _get_shift_anchor_re(
                        sh_anc, sh_nuc, shift_val, aa2re, anchor_len, shift_id_pos
                    )
                    if qcodon is not None and qcodon != -1:
                        # pro_re[0] should be '.'*anchor_len, therefore I
                        # replace it.
                        pro_re[0] = qcodon
                        break
                if qcodon == -1:
                    warnings.warn(
                        f"first frameshift detection failed for {nucl.id}",
                        BiopythonWarning,
                    )
            # check anchors in the middle
            for i in range(len(anchor_pos) - 1):
                # A distance between consecutive matched anchors that is not
                # a multiple of 3 * anchor_len is treated as a frameshift.
                shift_val = (anchor_pos[i + 1][0] - anchor_pos[i][0]) % (3 * anchor_len)
                sh_anc = "".join(anchors[anchor_pos[i][2] : anchor_pos[i + 1][2]])
                sh_nuc = nucl_seq[anchor_pos[i][0] : anchor_pos[i + 1][0]]
                qcodon = None
                if shift_val != 0:
                    qcodon, shift_id_pos = _get_shift_anchor_re(
                        sh_anc, sh_nuc, shift_val, aa2re, anchor_len, shift_id_pos
                    )
                if qcodon is not None and qcodon != -1:
                    # Replace the regexes of all anchors in between by the
                    # single shifted regex.
                    # NOTE(review): this can shrink pro_re, while later
                    # iterations still index it with the original anchor
                    # indices - verify for multiple frameshifts.
                    pro_re[anchor_pos[i][2] : anchor_pos[i + 1][2]] = [qcodon]
                    qcodon = None
                elif qcodon == -1:
                    warnings.warn(
                        f"middle frameshift detection failed for {nucl.id}",
                        BiopythonWarning,
                    )
            # check the last anchor
            # True when the last matched anchor is the penultimate one,
            # i.e. only the final anchor failed to match.
            if anchor_pos[-1][2] + 1 == len(anchors) - 1:
                sh_anc = anchors[-1]
                this_anchor_len = len(sh_anc)
                shift_val_lst = [
                    1,
                    2,
                    3 * this_anchor_len - 2,
                    3 * this_anchor_len - 1,
                    0,
                ]
                for shift_val in shift_val_lst:
                    if shift_val == 0:
                        qcodon = None
                        break
                    if shift_val in (1, 2):
                        sh_nuc_len = this_anchor_len * 3 + shift_val
                    elif shift_val in (
                        3 * this_anchor_len - 2,
                        3 * this_anchor_len - 1,
                    ):
                        sh_nuc_len = this_anchor_len * 3 - (
                            3 * this_anchor_len - shift_val
                        )
                    # Candidate region starts where the last matched anchor
                    # starts.
                    if len(nucl_seq) - anchor_pos[-1][0] >= sh_nuc_len:
                        sh_nuc = nucl_seq[
                            anchor_pos[-1][0] : anchor_pos[-1][0] + sh_nuc_len
                        ]
                    else:
                        # this is unlikely to produce the correct output
                        sh_nuc = nucl_seq[anchor_pos[-1][0] :]
                    qcodon, shift_id_pos = _get_shift_anchor_re(
                        sh_anc, sh_nuc, shift_val, aa2re, this_anchor_len, shift_id_pos
                    )
                    if qcodon is not None and qcodon != -1:
                        # Drop the last entry and overwrite the one before
                        # it with the shifted regex (presumably because
                        # sh_anc is the combined last anchor, which also
                        # covers the penultimate one - confirm).
                        pro_re.pop()
                        pro_re[-1] = qcodon
                        break
                if qcodon == -1:
                    warnings.warn(
                        f"last frameshift detection failed for {nucl.id}",
                        BiopythonWarning,
                    )
            # try global match
            full_pro_re = "".join(pro_re)
            match = re.search(full_pro_re, nucl_seq)
            if match:
                # mode = 2, frameshift; the match object carries the named
                # groups marking the shifted regions.
                return (match.span(), 2, match)
            else:
                raise RuntimeError(
                    f"Protein SeqRecord ({pro.id}) and "
                    f"Nucleotide SeqRecord ({nucl.id}) do not match!"
                )


def _get_shift_anchor_re(sh_anc, sh_nuc, shift_val, aa2re, anchor_len, shift_id_pos):
    """Find a regular expression matching a potentially shifted anchor (PRIVATE).

    Arguments:
     - sh_anc    - shifted anchor sequence
     - sh_nuc    - potentially corresponding nucleotide sequence
       of sh_anc
     - shift_val - 1 or 2 indicates forward frame shift, whereas
       3*anchor_len-1 or 3*anchor_len-2 indicates
       backward shift
     - aa2re     - aa to codon re dict
     - anchor_len - length of the anchor
     - shift_id_pos - specify current shift name we are at

    Returns a tuple (regex, shift_id_pos):
     - on success the regex (without ^/$ anchors) that matches all of
       sh_nuc, together with shift_id_pos + 1; forward shifts are captured
       in a lower case named group, backward shifts in an upper case one
     - (-1, shift_id_pos) if no position of the shift gives a match, which
       includes anchors too short to offer any candidate position
     - (None, shift_id_pos) if shift_val describes neither a forward nor a
       backward shift

    Only ten group names ("a" to "j") are available, so shift_id_pos must
    stay below 10; beyond that an IndexError is raised.
    """
    import re

    shift_id = [chr(i) for i in range(97, 107)]
    if 0 < shift_val < 3 * anchor_len - 2:
        # Forward shift: try inserting the extra nucleotides after each
        # residue in turn until the whole region matches.
        for j in range(len(sh_anc)):
            parts = []
            for k, aa in enumerate(sh_anc):
                parts.append(aa2re[aa])
                if k == j:
                    parts.append("(?P<" + shift_id[shift_id_pos] + ">..*)")
            body = "".join(parts)
            # The anchors are added only for the search, so that the
            # returned regex is exactly the body that was built.
            if re.search("^" + body + "$", sh_nuc):
                return body, shift_id_pos + 1
        # failed to find a match (frameshift)
        return -1, shift_id_pos
    if shift_val in (3 * anchor_len - 1, 3 * anchor_len - 2):
        # Backward shift: two neighbouring codons share 1 or 2 nucleotides.
        overlap = 3 * anchor_len - shift_val
        # Try merging each pair of neighbouring residues (j - 1, j) in turn.
        for j in range(1, len(sh_anc)):
            parts = []
            for k, aa in enumerate(sh_anc):
                if k == j - 1:
                    # merged with residue j below
                    continue
                if k == j:
                    parts.append(
                        _merge_aa2re(
                            sh_anc[j - 1],
                            sh_anc[j],
                            overlap,
                            aa2re,
                            shift_id[shift_id_pos].upper(),
                        )
                    )
                else:
                    parts.append(aa2re[aa])
            body = "".join(parts)
            if re.search("^" + body + "$", sh_nuc):
                return body, shift_id_pos + 1
        # failed to find a match (frameshift)
        return -1, shift_id_pos
    # Not a shift this function knows how to handle.
    return None, shift_id_pos


def _merge_aa2re(aa1, aa2, shift_val, aa2re, reid):
    """Merge two amino acids based on detected frame shift value (PRIVATE).

    Arguments:
     - aa1, aa2 - the two neighbouring residues whose codons overlap
     - shift_val - number of nucleotides shared by both codons (1 or 2)
     - aa2re - aa to codon re dict
     - reid - name of the regex group wrapping the merged codons

    Returns a named group matching the 5 (shift_val 1) or 4 (shift_val 2)
    nucleotides covered by both codons. At a shared position only letters
    allowed by both codons are accepted; if there is no such letter the
    returned pattern can never match.

    Raises ValueError if shift_val is neither 1 nor 2.

    NOTE(review): a "." taken from the unknown residue regex ("..." in
    _get_aa_regex) ends up inside a character class, where it only matches
    a literal dot - confirm whether unknown residues need support here.
    """

    def get_aa_from_codonre(re_aa):
        # Split a codon regex into one string of allowed letters per
        # position, e.g. "GC[ACGT]" -> ["G", "C", "ACGT"].
        aas = []
        in_class = False
        for char in re_aa:
            if char == "[":
                in_class = True
                aas.append("")
            elif char == "]":
                in_class = False
            elif in_class:
                aas[-1] += char
            else:
                aas.append(char)
        return aas

    def shared_class(letters1, letters2):
        # Character class of the letters allowed by both codons; sorted so
        # that the result does not depend on set ordering.
        common = sorted(set(letters1) & set(letters2))
        if not common:
            # "[]" would not be an empty class but the start of a class
            # containing "]"; use a pattern that never matches instead.
            return "(?!)"
        return "[" + "".join(common) + "]"

    first, second = map(get_aa_from_codonre, (aa2re[aa1], aa2re[aa2]))
    if shift_val == 1:
        # last position of the first codon == first of the second
        body = (
            "["
            + first[0]
            + "]"
            + "["
            + first[1]
            + "]"
            + shared_class(first[2], second[0])
            + "["
            + second[1]
            + "]"
            + "["
            + second[2]
            + "]"
        )
    elif shift_val == 2:
        # last two positions of the first codon == first two of the second
        body = (
            "["
            + first[0]
            + "]"
            + shared_class(first[1], second[0])
            + shared_class(first[2], second[1])
            + "["
            + second[2]
            + "]"
        )
    else:
        raise ValueError(f"shift_val should be 1 or 2, got {shift_val!r}")
    return "(?P<" + reid + ">" + body + ")"


def _get_codon_rec(
    pro, nucl, span_mode, gap_char, codon_table, complete_protein=False, max_score=10
):
    """Generate codon alignment based on regular re match (PRIVATE).

    span_mode is a tuple returned by _check_corr. The first element
    is the span of a re search, and the second element is the mode
    for the match.

    mode
     - 0: direct match
     - 1: mismatch (no indels)
     - 2: frameshift

    For mode 2 the tuple holds the re match object as third element; its
    named groups mark the frameshifts (lower case name: forward shift,
    upper case name: backward shift).

    Arguments:
     - pro - protein SeqRecord, may contain gap_char
     - nucl - nucleotide SeqRecord
     - gap_char - gap character of the protein; it is also removed from the
       nucleotide sequence. Gaps are always written as "---" in the result.
     - codon_table - CodonTable used to check each codon against its residue
     - complete_protein - if True the first codon is checked against the
       table's start codons instead of being translated
     - max_score - decreased by one for every mismatch or frameshift; a
       RuntimeError is raised when it reaches 0

    Returns a SeqRecord holding a CodonSeq, with the id of nucl. For mode 2
    the CodonSeq gets the reading frame table that was derived from the
    match. For any other mode value None is returned.

    Raises ValueError if, for mode 0 or 1, the matched span is not three
    times as long as the ungapped protein.
    """
    import re

    nucl_seq = nucl.seq.replace(gap_char, "")
    span = span_mode[0]
    mode = span_mode[1]
    # The result is not used below; the call is kept because it raises
    # TypeError for anything that is not a CodonTable.
    aa2re = _get_aa_regex(codon_table)
    if mode in (0, 1):
        if len(pro.seq.replace(gap_char, "")) * 3 != (span[1] - span[0]):
            raise ValueError(
                f"Protein Record {pro.id} and "
                f"Nucleotide Record {nucl.id} do not match!"
            )
        aa_num = 0
        codon_seq = CodonSeq()
        for aa in pro.seq:
            if aa == gap_char:
                codon_seq += "---"
            elif complete_protein and aa_num == 0:
                this_codon = nucl_seq[span[0] : span[0] + 3]
                if not re.search(
                    _codons2re(codon_table.start_codons), str(this_codon.upper())
                ):
                    max_score -= 1
                    warnings.warn(
                        f"start codon of {pro.id} ({aa} {aa_num}) does not "
                        f"correspond to {nucl.id} ({this_codon})",
                        BiopythonWarning,
                    )
                if max_score == 0:
                    raise RuntimeError(
                        f"max_score reached for {nucl.id}! Please raise up "
                        "the tolerance to get an alignment in anyway"
                    )
                codon_seq += this_codon
                aa_num += 1
            else:
                this_codon = nucl_seq[span[0] + 3 * aa_num : span[0] + 3 * (aa_num + 1)]
                if this_codon.upper().translate(table=codon_table) != aa:
                    max_score -= 1
                    warnings.warn(
                        "%s(%s %d) does not correspond to %s(%s)"
                        % (pro.id, aa, aa_num, nucl.id, this_codon),
                        BiopythonWarning,
                    )
                if max_score == 0:
                    raise RuntimeError(
                        f"max_score reached for {nucl.id}! Please raise up "
                        "the tolerance to get an alignment in anyway"
                    )
                codon_seq += this_codon
                aa_num += 1
        return SeqRecord(codon_seq, id=nucl.id)
    elif mode == 2:
        from collections import deque

        shift_pos = deque([])
        shift_start = []
        match = span_mode[2]
        m_groupdict = list(match.groupdict().keys())
        # Collect span and start of every named frameshift group; all three
        # sequences are indexed in parallel.
        for i in m_groupdict:
            shift_pos.append(match.span(i))
            shift_start.append(match.start(i))
        # Reading frame table: start position of every codon in nucl_seq.
        rf_table = []
        i = match.start()
        while True:
            rf_table.append(i)
            i += 3
            if i in shift_start:
                shift_index = shift_start.index(i)
                group_name = m_groupdict[shift_index]
                if group_name.isupper():
                    # backward frameshift: the group spans two overlapping
                    # codons (6 nucleotides minus the shared ones)
                    shift_val = 6 - (
                        shift_pos[shift_index][1] - shift_pos[shift_index][0]
                    )
                    rf_table.append(i)
                    rf_table.append(i + 3 - shift_val)
                    i = shift_pos[shift_index][1]
                elif group_name.islower():
                    # forward frameshift: skip the inserted nucleotides
                    i = shift_pos[shift_index][1]
            if i >= match.end():
                break
        codon_seq = CodonSeq()
        aa_num = 0
        # Index of the last residue of the ungapped protein (loop invariant).
        last_aa_index = len(pro.seq.replace(gap_char, "")) - 1
        for aa in pro.seq:
            if aa == gap_char:
                codon_seq += "---"
            elif complete_protein and aa_num == 0:
                this_codon = nucl_seq[rf_table[0] : rf_table[0] + 3]
                if not re.search(
                    _codons2re(codon_table.start_codons), str(this_codon.upper())
                ):
                    max_score -= 1
                    warnings.warn(
                        f"start codon of {pro.id}({aa} {aa_num}) does not "
                        f"correspond to {nucl.id}({this_codon})",
                        BiopythonWarning,
                    )
                if max_score == 0:
                    raise RuntimeError(
                        f"max_score reached for {nucl.id}! Please raise up "
                        "the tolerance to get an alignment in anyway"
                    )
                # The start codon is part of the alignment whether or not it
                # triggered the warning above, exactly as for modes 0 and 1.
                codon_seq += this_codon
                aa_num += 1
            else:
                if (
                    aa_num < last_aa_index
                    and rf_table[aa_num + 1] - rf_table[aa_num] - 3 < 0
                ):
                    # The next codon starts inside this one (backward
                    # shift): keep the unshared part and pad with gaps.
                    # NOTE(review): shift_val is the value left over from
                    # the reading frame loop above, i.e. that of the last
                    # backward shift - confirm for several backward shifts.
                    max_score -= 1
                    start = rf_table[aa_num]
                    end = start + (3 - shift_val)
                    ngap = shift_val
                    this_codon = nucl_seq[start:end] + "-" * ngap
                elif rf_table[aa_num] - rf_table[aa_num - 1] - 3 > 0:
                    # Extra nucleotides before this codon (forward shift):
                    # emit them padded with gaps, followed by the codon.
                    max_score -= 1
                    start = rf_table[aa_num - 1] + 3
                    end = rf_table[aa_num]
                    ngap = 3 - (rf_table[aa_num] - rf_table[aa_num - 1] - 3)
                    this_codon = (
                        nucl_seq[start:end]
                        + "-" * ngap
                        + nucl_seq[rf_table[aa_num] : rf_table[aa_num] + 3]
                    )
                else:
                    start = rf_table[aa_num]
                    end = start + 3
                    this_codon = nucl_seq[start:end]
                    if this_codon.upper().translate(table=codon_table) != aa:
                        max_score -= 1
                        warnings.warn(
                            f"Codon of {pro.id}({aa} {aa_num}) does not "
                            f"correspond to {nucl.id}({this_codon})",
                            BiopythonWarning,
                        )
                if max_score == 0:
                    raise RuntimeError(
                        f"max_score reached for {nucl.id}! Please raise up "
                        "the tolerance to get an alignment in anyway"
                    )
                codon_seq += this_codon
                aa_num += 1
        codon_seq.rf_table = rf_table
        return SeqRecord(codon_seq, id=nucl.id)


def _align_shift_recs(recs):
    """Build alignment according to the frameshift detected by _check_corr (PRIVATE).

    Argument:
     - recs - a list of SeqRecords containing a CodonSeq dictated
       by a rf_table (with frameshift in some of them).

    The sequences of the given records are replaced in place by padded
    copies and the same list is returned. RuntimeError is raised if the
    records do not all hold the same number of alignable codons.

    NOTE(review): the meaning of the entries returned by
    CodonSeq.get_full_rf_table() is not visible here; the code below treats
    int entries as regular codon starts and float entries as positions of
    gaps or frameshift regions - confirm against CodonSeq.
    """

    def find_next_int(k, lst):
        # Starting at the position of k in lst, scan forward and return the
        # first int entry together with its offset p from that position.
        # There is no bounds check: IndexError if no int entry follows.
        idx = lst.index(k)
        p = 0
        while True:
            if isinstance(lst[idx + p], int):
                return lst[idx + p], p
            p += 1

    full_rf_table_lst = [rec.seq.get_full_rf_table() for rec in recs]
    # Count the alignable codons of every record: int entries plus the
    # non-int entries that point at a "---" gap codon.
    rf_num = [0] * len(recs)
    for k, rec in enumerate(recs):
        for i in rec.seq.get_full_rf_table():
            if isinstance(i, int):
                rf_num[k] += 1
            # isinstance(i, float) should be True
            elif rec.seq[int(i) : int(i) + 3] == "---":
                rf_num[k] += 1
    if len(set(rf_num)) != 1:
        raise RuntimeError("Number of alignable codons unequal in given records")
    # i is the current column, i.e. the index into every record's table.
    i = 0
    rec_num = len(recs)
    while True:
        # (record index, position) of the records that did not have a
        # frameshift in this column
        add_lst = []
        try:
            col_rf_lst = [k[i] for k in full_rf_table_lst]
        except IndexError:
            # we probably reached the last codon
            break
        for j, k in enumerate(col_rf_lst):
            add_lst.append((j, int(k)))
            # A float entry that is not a gap codon is handled as a
            # frameshift region of record j.
            if isinstance(k, float) and recs[j].seq[int(k) : int(k) + 3] != "---":
                m, p = find_next_int(k, full_rf_table_lst[j])
                # Number of gaps needed to pad the region between k and the
                # next regular codon start m to a multiple of three.
                if (m - k) % 3 != 0:
                    gap_num = 3 - (m - k) % 3
                else:
                    gap_num = 0
                if gap_num != 0:
                    # Insert the gaps at position k of record j.
                    gaps = "-" * int(gap_num)
                    seq = CodonSeq(rf_table=recs[j].seq.rf_table)
                    seq += recs[j].seq[: int(k)] + gaps + recs[j].seq[int(k) :]
                    # Drop the entry at k and move all later entries by the
                    # number of inserted gaps.
                    full_rf_table = full_rf_table_lst[j]
                    bp = full_rf_table.index(k)
                    full_rf_table = full_rf_table[:bp] + [
                        v + int(gap_num) for v in full_rf_table[bp + 1 :]
                    ]
                    full_rf_table_lst[j] = full_rf_table
                    recs[j].seq = seq
                # This record has been dealt with; only the others still
                # need padding below.
                add_lst.pop()
                # gap_num now is the padded length of the region, which is
                # what the other records get as gaps.
                gap_num += m - k
                # NOTE(review): the column index is changed while the
                # records of the current column are still being processed.
                i += p - 1
        # Fewer entries than records means at least one record had a
        # frameshift in this column: pad all other records with gaps.
        if len(add_lst) != rec_num:
            for j, k in add_lst:
                seq = CodonSeq(rf_table=recs[j].seq.rf_table)
                # gap_num is the value left by the last shifted record above.
                gaps = "-" * int(gap_num)
                seq += recs[j].seq[: int(k)] + gaps + recs[j].seq[int(k) :]
                full_rf_table = full_rf_table_lst[j]
                bp = full_rf_table.index(k)
                # One extra float entry per inserted gap codon.
                inter_rf = []
                for t in range(0, len(gaps), 3):
                    inter_rf.append(k + t + 3.0)
                full_rf_table = (
                    full_rf_table[:bp]
                    + inter_rf
                    + [v + int(gap_num) for v in full_rf_table[bp:]]
                )
                full_rf_table_lst[j] = full_rf_table
                recs[j].seq = seq
        i += 1
    return recs


if __name__ == "__main__":
    # Only when executed as a script: call Biopython's run_doctest helper
    # (presumably runs the doctests embedded in this module's docstrings).
    from Bio._utils import run_doctest

    run_doctest()