# Spaces:
#
# HUBioDataLab
# /
#
# ASCARIS
#
# Running
#
# File size: 20,620 Bytes
#
# c2a02c6

from Bio import Align
from Bio.Align import substitution_matrices
from pathlib import Path
import streamlit as st
from Bio.pairwise2 import format_alignment
from Bio import pairwise2
from Bio import pairwise2
from Bio.SubsMat import MatrixInfo as matlist


    
"""
def do_alignment(identifier, uniprotSequence, pdbSequence, alignment_path):
    aligner = Align.PairwiseAligner()
    #print(f'Aligning Datapoint: {identifier}')
    if len(pdbSequence) >= 1:
        f = open(Path(alignment_path / f'{identifier}_alignment.txt'), "w")
        aligner.mode = 'local'
        aligner.substitution_matrix = substitution_matrices.load("BLOSUM62")
        aligner.open_gap_score = -11
        aligner.extend_gap_score = -1
        alignments = aligner.align(uniprotSequence, pdbSequence)
        alignments = (list(alignments))

        merge_in_threes = str(alignments[0]).split('\n')
        K = 3
        res = ["".join(str(alignments[0]).split('\n')[idx: idx + K]) for idx in range(len(str(alignments[0]).split('\n')) - K + 1)]
        slice_val = slice(0,len(res),4)
        writtenlist = res[slice_val]

        new_alignment = []
        for i in writtenlist:
            cont1 = list(filter(None, i.split('target')))
            cont2 = cont1[0].split('query')
            target_pos = (list(filter(None,cont2[0].split(' '))))[0]
            target = (list(filter(None,cont2[0].split(' '))))[1]
            alg_pos = (list(filter(None,cont2[0].split(' '))))[2]
            alg = (list(filter(None,cont2[0].split(' '))))[3]
            query_pos = (list(filter(None,cont2[1].split(' '))))[0]
            query = (list(filter(None,cont2[1].split(' '))))[1]
            if int(target_pos)>0:
                new_target = int(target_pos) * 'X' + target 
            else:
                new_target = int(target_pos) * ' ' + target 
                
            if int(alg_pos)>0:
                new_alg = int(target_pos) * 'X' + target 
            else:
                new_alg = int(target_pos) * ' ' + alg 
            
            if int(query_pos)>0:
                new_query = int(target_pos) * 'X' + target 
            else:
                new_query = int(target_pos) * ' ' + target 
                
            new_alignment.append(new_target+'\n' +new_alg +'\n' +new_query)
        alignment_list = []
        k = 0
        for alignment in new_alignment:
            k += 1
            st.write('COUNT', k)
            st.write('alignment')
            st.write(alignment)
            f.write(str(alignment))
            f.write('\n')
            f.write('\n')
            alignment = (str(alignment).strip().split('\n'))
            alignment = [''.join(['.' if m == ' ' else m for m in x]) for x in alignment]
            st.write('alignment_updated')
            st.write(alignment)
            alignment_list.append(alignment)
    return alignment_list

"""
def do_alignment(identifier, uniprotSequence, pdbSequence, alignment_path):
    """Locally align a UniProt sequence against a PDB sequence and save the result.

    The aligner runs in local mode with the BLOSUM62 substitution matrix,
    gap-open score -11 and gap-extend score -1. Every alignment it yields is
    written to ``<alignment_path>/<identifier>_alignment.txt`` (two newlines
    after each one) and also returned in a row-wise form.

    Parameters:
        identifier: Prefix used for the output file name.
        uniprotSequence: Target sequence passed to the aligner.
        pdbSequence: Query sequence passed to the aligner.
        alignment_path: Directory for the output file; combined with ``/``,
            so it is expected to be a ``pathlib.Path``.

    Returns:
        A list with one entry per alignment. Each entry is the list of text
        rows of ``str(alignment)`` with every space replaced by ``'.'``.
        An empty list is returned (and no file is written) when
        ``pdbSequence`` is empty.
    """
    alignment_list = []
    # Nothing to align: return the empty result instead of falling through to
    # a return of a name that was never bound.
    if len(pdbSequence) < 1:
        return alignment_list

    aligner = Align.PairwiseAligner()
    aligner.mode = 'local'
    aligner.substitution_matrix = substitution_matrices.load("BLOSUM62")
    aligner.open_gap_score = -11
    aligner.extend_gap_score = -1
    alignments = aligner.align(uniprotSequence, pdbSequence)

    # The context manager guarantees the file is flushed and closed even if
    # formatting one of the alignments raises.
    with open(Path(alignment_path / f'{identifier}_alignment.txt'), "w") as f:
        for alignment in alignments:
            alignment_text = str(alignment)
            f.write(alignment_text)
            f.write('\n')
            f.write('\n')
            rows = alignment_text.strip().split('\n')
            # Spaces are normalised to '.' so downstream code can treat
            # '.' and '-' uniformly as "no residue here".
            rows = [''.join('.' if m == ' ' else m for m in row) for row in rows]
            alignment_list.append(rows)
    return alignment_list

def mutation_position_on_pdb(alignment_list, pos):
    """Locate a UniProt residue position on the PDB row of an alignment.

    Alignments are tried in order. For each one, row ``[0]`` (UniProt) is
    walked until ``pos`` non-gap characters have been seen, and the marker
    row ``[1]`` is inspected at that column. The first alignment whose marker
    is ``'|'`` or ``'X'`` wins and stops the search.

    Parameters:
        alignment_list: List of alignments, each an indexable of three
            strings: ``[0]`` UniProt row, ``[1]`` marker row, ``[2]`` PDB row.
            ``'.'`` and ``'-'`` are treated as gap characters.
        pos: 1-based residue position on the UniProt sequence (anything
            accepted by ``int()``).

    Returns:
        A 4-tuple ``(pdb_alignStatus, mutationPositionOnPDB, startGap,
        alignment)``:

        * ``pdb_alignStatus`` - ``'aligned'`` (marker matched and residues
          are equal), ``'aligned*'`` (marker matched, residues differ) or
          ``'not_aligned'``.
        * ``mutationPositionOnPDB`` - position on the PDB row as a string,
          or ``'nan'``.
        * ``startGap`` - number of leading gap characters on the UniProt row
          of the last alignment examined.
        * ``alignment`` - the last alignment examined. For an empty
          ``alignment_list`` this is a placeholder of three empty rows.

    NOTE(review): if ``pos`` exceeds the number of residues on the UniProt
    row, the last column of the row is inspected; confirm this is intended.
    """
    gap_chars = ('.', '-')

    # Defaults so the return statement never sees an unbound name when no
    # branch below assigns a value.
    pdb_alignStatus = 'not_aligned'
    mutationPositionOnPDB = 'nan'
    startGap = 0
    if not alignment_list:
        # Three empty rows keep the [0]/[1]/[2] shape callers index into.
        return (pdb_alignStatus, mutationPositionOnPDB, startGap, ['', '', ''])

    target_pos = int(pos)
    which_alignment_to_go = 0
    for alignment in alignment_list:
        which_alignment_to_go += 1
        alignment_uniprot = alignment[0]
        alignment_pdb = alignment[2]

        # Count the leading gap columns of the UniProt row.
        startGap = 0
        for k in alignment_uniprot:
            if k in gap_chars:
                startGap += 1
            else:
                break

        countGap = startGap
        countResidue = 0
        canonicalRes = ' '
        pdbRes = ' '
        for j in alignment_uniprot[startGap:]:
            if j in gap_chars:
                countGap += 1
            else:
                countResidue += 1

            if countResidue == target_pos:
                canonicalRes = alignment_uniprot[countResidue + countGap - 1]
                try:
                    pdbRes = alignment_pdb[countResidue + countGap - 1]
                except IndexError:
                    # PDB row is shorter than the UniProt row at this column.
                    pdbRes = 'nan'
                break

        # 0-based column of the requested residue within the alignment rows.
        column = countResidue + countGap - 1
        marker = alignment[1][column]

        if marker in ('|', 'X'):
            pdb_alignStatus = 'aligned' if canonicalRes == pdbRes else 'aligned*'
            if alignment_pdb[column] in gap_chars:
                mutationPositionOnPDB = 'nan'
            else:
                # Gaps on the PDB row before the column do not consume PDB
                # residue numbers, so they are subtracted.
                countGap_pdb = sum(1 for j in alignment_pdb[0:column] if j in gap_chars)
                mutationPositionOnPDB = str(countResidue + countGap - countGap_pdb)
            break
        elif marker in gap_chars:
            # Covers both the equal-residue and different-residue cases; the
            # original spelled them out separately with an undefined name in
            # one of them.
            pdb_alignStatus = 'not_aligned'
            mutationPositionOnPDB = 'nan'
        elif alignment_pdb[column] in gap_chars:
            mutationPositionOnPDB = 'nan'

    return (pdb_alignStatus, mutationPositionOnPDB, startGap, alignment_list[which_alignment_to_go - 1])


def find_position_on_pdb_for_range_annotations(posAnnotation, startGap, alignment_to_use):
    """Map a 'start-end' annotation range onto alignment columns and PDB positions.

    Parameters:
        posAnnotation: String of the form 'start-end'; both parts are parsed
            with int().
        startGap: Number of leading columns of alignment_to_use[0] to skip
            before counting residues; also the initial gap count.
        alignment_to_use: Indexable of three strings. Row [0] is walked to
            count residues and gaps, row [1] is the marker row ('|' / 'X'
            are treated as matched, '.' / '-' as unmatched) and row [2] is
            scanned for '.' / '-' gap characters.

    Returns:
        A 4-tuple (annotation_on_up_start, annotation_on_up_end,
        annotation_on_pdb_start, annotation_on_pdb_end). The first two are
        the (possibly adjusted) 1-based columns in the alignment; the last
        two are ints or the string 'nan'. If the two PDB values end up
        equal, both are returned as 'nan'.

    NOTE(review): both try blocks use a bare ``except:`` whose body is only
    the expression ``IndexError``; this swallows every exception type, not
    just IndexError.
    """
    annotation_on_pdb_start = 'nan'
    annotation_on_pdb_end = 'nan'
    pos1 = int(posAnnotation.split('-')[0])
    count_gap = startGap
    count_residue = 0
    # Walk row [0] until pos1 residues have been seen, counting gap columns.
    for j in alignment_to_use[0][startGap:]:
        if j == '.' or j == '-':
            count_gap += 1
        else:
            count_residue += 1
        if int(count_residue) == int(pos1):  # count gaps until the first position
            break
    # 1-based alignment column of the range start.
    annotation_on_up_start = int(pos1) + int(count_gap)

    pos2 = int(posAnnotation.split('-')[1])
    count_gap = startGap
    count_residue = 0
    # Same walk, restarted from startGap, for the range end.
    for j in alignment_to_use[0][startGap:]:
        if j == '.' or j == '-':
            count_gap += 1
        else:
            count_residue += 1
        if int(count_residue) == int(pos2):  # count gaps until the second position
            break

    # 1-based alignment column of the range end.
    annotation_on_up_end = int(pos2) + int(count_gap)
    try:
        pdb_residue_start = alignment_to_use[2][annotation_on_up_start - 1].strip()
        # Start column is a gap on row [2]: move the start forward to the
        # first column inside the range that has a residue on row [2] and a
        # '|' / 'X' marker.
        if (pdb_residue_start == '.') or (pdb_residue_start == '-'):
            for ran in range(len(alignment_to_use[2][(annotation_on_up_start - 1):annotation_on_up_end])):
                if (alignment_to_use[2][(annotation_on_up_start - 1):annotation_on_up_end][ran] != '.') and \
                        (alignment_to_use[2][(annotation_on_up_start - 1):annotation_on_up_end][ran] != '-') and \
                        ((alignment_to_use[1][(annotation_on_up_start - 1):annotation_on_up_end][ran] == '|') or
                         (alignment_to_use[1][(annotation_on_up_start - 1):annotation_on_up_end][ran] == 'X')):
                    annotation_on_up_start += ran
                    break
        # Start column has a residue on row [2] but an unmatched marker: move
        # the start forward to the first '|' / 'X' marker inside the range.
        elif (pdb_residue_start != '.') and (pdb_residue_start != '-') and \
                ((alignment_to_use[1][annotation_on_up_start - 1] == '.') or (
                        alignment_to_use[1][annotation_on_up_start - 1] == '-')):
            for ran in range(len(alignment_to_use[2][(annotation_on_up_start - 1):annotation_on_up_end])):
                if ((alignment_to_use[1][(annotation_on_up_start - 1):annotation_on_up_end][ran] == '|') or
                        (alignment_to_use[1][(annotation_on_up_start - 1):annotation_on_up_end][ran] == 'X')):
                    annotation_on_up_start += ran
                    break
        count_gap_pdb = 0
        # NOTE(review): annotation_on_up_start is an int at this point, so
        # this comparison with 'nan' is always true and the else branch
        # below looks unreachable.
        if annotation_on_up_start != 'nan':
            # Gaps on row [2] before the start column are subtracted to get
            # the position on the PDB sequence.
            for q in alignment_to_use[2][0:annotation_on_up_start - 1]:
                if q == '.' or q == '-':
                    count_gap_pdb += 1
            # NOTE(review): the marker row is indexed at annotation_on_up_start
            # here (the column AFTER the start), whereas the end branch below
            # uses annotation_on_up_end - 1 - confirm whether this is intended.
            if alignment_to_use[1][annotation_on_up_start] == '-' or alignment_to_use[1][annotation_on_up_start] == '.':
                annotation_on_pdb_start = 'nan'
            else:
                annotation_on_pdb_start = int(annotation_on_up_start) - count_gap_pdb
        else:
            annotation_on_pdb_start = 'nan'
    except:
        # Any exception leaves annotation_on_pdb_start at its current value.
        IndexError
    try:
        pdb_residue_end = alignment_to_use[2][annotation_on_up_end - 1].strip()
        # End column is a gap on row [2]: look for the first unmatched marker
        # from the start column and collapse start and end onto the column
        # just before it.
        # NOTE(review): the second marker slice has no upper bound, unlike
        # the first - confirm.
        if pdb_residue_end == '.' or pdb_residue_end == '-':
            for ran in range(len(alignment_to_use[2][(annotation_on_up_start - 1):annotation_on_up_end])):
                if ((alignment_to_use[1][annotation_on_up_start - 1:annotation_on_up_end][ran] == '.') or
                        (alignment_to_use[1][(annotation_on_up_start - 1):][ran] == '-')):
                    annotation_on_up_start += (ran - 1)
                    annotation_on_up_end = annotation_on_up_start
                    break
        # End column has a residue on row [2] but an unmatched marker: same
        # adjustment as above.
        elif (pdb_residue_end != '.') and (pdb_residue_end != '-') and \
                ((alignment_to_use[1][annotation_on_up_end - 1] == '.') or (
                        alignment_to_use[1][annotation_on_up_end - 1] == '-')):
            for ran in range(len(alignment_to_use[2][(annotation_on_up_start - 1):annotation_on_up_end])):
                if ((alignment_to_use[1][annotation_on_up_start - 1:annotation_on_up_end][ran] == '.') or
                        (alignment_to_use[1][(annotation_on_up_start - 1):][ran] == '-')):
                    annotation_on_up_start += (ran - 1)
                    annotation_on_up_end = annotation_on_up_start
                    break
        count_gap_pdb = 0
        # NOTE(review): annotation_on_up_end is an int here, so this check is
        # always true.
        if annotation_on_up_end != 'nan':
            for q in alignment_to_use[2][0:annotation_on_up_end - 1]:
                if q == '.' or q == '-':
                    count_gap_pdb += 1
            # NOTE(review): `and` binds tighter than `or`, so these conditions
            # read as `marker == '-' or (marker == '.' and ...)`. The elif and
            # else branches assign the same value, so the end is 'nan' only
            # when the marker is '-' or (marker is '.' and the start is 'nan').
            if alignment_to_use[1][annotation_on_up_end - 1] == '-' or alignment_to_use[1][
                annotation_on_up_end - 1] == '.' and annotation_on_pdb_start == 'nan':
                annotation_on_pdb_end = 'nan'
            elif alignment_to_use[1][annotation_on_up_end - 1] == '-' or alignment_to_use[1][
                annotation_on_up_end - 1] == '.' and annotation_on_pdb_start != 'nan':
                annotation_on_pdb_end = int(annotation_on_up_end) - count_gap_pdb
            else:
                annotation_on_pdb_end = int(annotation_on_up_end) - count_gap_pdb
        else:
            annotation_on_pdb_end = 'nan'
    except:
        IndexError  # Say isoform 2 is matched with the length 100, but canonical is 150 aa long. If there is an annotation at 105. position, for the isoform it throws an index error.

    # An end was found but no start: derive the start from the (possibly
    # adjusted) start column, using the gap count computed for the end.
    if annotation_on_pdb_start == 'nan' and annotation_on_pdb_end != 'nan':
        annotation_on_pdb_start = annotation_on_up_start - count_gap_pdb
    # Start and end that coincide (including both 'nan') are reported as 'nan'.
    if annotation_on_pdb_start == annotation_on_pdb_end:
        annotation_on_pdb_start = 'nan'
        annotation_on_pdb_end = 'nan'
    return annotation_on_up_start, annotation_on_up_end, annotation_on_pdb_start, annotation_on_pdb_end


def annotation_pos_on_pdb(annot_positions, startGap, alignment_to_use, identifier):
    """Translate annotation positions from UniProt numbering to PDB numbering.

    Parameters:
        annot_positions: The positions of one annotation type. It is
            stringified, stripped of quotes and square brackets and split on
            commas, so a list or its string form (e.g. "['12', '30-45']")
            both work. The string 'nan' yields an empty result.
        startGap: Number of leading columns of alignment_to_use[0] to skip
            before counting residues; also the initial gap count.
        alignment_to_use: Indexable of three strings: [0] the row walked to
            count residues, [1] the marker row, [2] the row scanned for
            '.' / '-' gap characters.
        identifier: Unused; kept so existing callers keep working.

    Returns:
        A list with an int for every single position that could be mapped
        and a 'start-end' string for every range entry (either side may be
        'nan'). Single positions that cannot be mapped, or cannot be parsed
        as an integer, are omitted.
    """
    gap_chars = ('.', '-')
    newpos = []
    if annot_positions != 'nan':
        cleaned = str(annot_positions).replace("'", '').replace('[', '').replace(']', '')
        entries = [entry.strip() for entry in cleaned.split(',')]

        for entry in entries:
            if entry == 'nan':
                continue

            if '-' in entry:
                # Reset for every range so a failed mapping reports 'nan-nan'
                # instead of repeating the previous range's coordinates.
                position_start_on_pdb = 'nan'
                position_end_on_pdb = 'nan'
                try:
                    mapped = find_position_on_pdb_for_range_annotations(entry, startGap, alignment_to_use)
                    position_start_on_pdb = mapped[2]
                    position_end_on_pdb = mapped[3]
                except (ValueError, IndexError):
                    # Malformed range such as '5-' or '?-10'.
                    pass
                newpos.append(str(position_start_on_pdb) + '-' + str(position_end_on_pdb))
                continue

            if entry in ('', '?'):
                continue
            try:
                target = int(entry)
            except ValueError:
                # Non-numeric position (e.g. '?5'): skip it like '?'.
                continue

            # Walk row [0] until `target` residues have been seen, counting
            # the gap columns passed on the way.
            count_gap = startGap
            count_residue = 0
            for j in alignment_to_use[0][startGap:]:
                if j in gap_chars:
                    count_gap += 1
                else:
                    count_residue += 1
                if count_residue == target:
                    break

            # 1-based alignment column of the annotated residue.
            annotation_on_up = target + count_gap
            try:
                pdb_char = alignment_to_use[2][annotation_on_up - 1]
            except IndexError:
                # Position lies beyond the aligned region.
                continue

            count_gap_pdb = sum(
                1 for q in alignment_to_use[2][0:annotation_on_up - 1] if q in gap_chars
            )
            try:
                # NOTE(review): the marker row is read at annotation_on_up
                # (the column after the residue), as in the original code -
                # confirm whether annotation_on_up - 1 was intended.
                marker = alignment_to_use[1][annotation_on_up]
            except IndexError:
                continue

            if marker in gap_chars or count_gap_pdb == annotation_on_up or pdb_char in gap_chars:
                # Not mappable: unmatched marker or a gap on row [2].
                continue
            newpos.append(annotation_on_up - count_gap_pdb)
    return newpos


def final_stage(df, annotation_list, alignment_path):
    """Align every row of ``df`` and map its mutation, annotations and domain to the PDB.

    For each row the UniProt and PDB sequences are aligned, then the columns
    ``pdb_alignStatus``, ``mutationPositionOnPDB``, every column named in
    ``annotation_list``, and ``domainStartonPDB`` / ``domainEndonPDB`` are
    filled in.

    Parameters:
        df: DataFrame with at least the columns ``uniprotID``, ``pdbID``,
            ``chain``, ``uniprotSequence``, ``pdbSequence``, ``pos``,
            ``domStart``, ``domEnd`` and every name in ``annotation_list``.
            It is modified in place while being processed.
        annotation_list: Names of the annotation columns to translate.
        alignment_path: Directory passed on to ``do_alignment`` for the
            alignment text files.

    Returns:
        The DataFrame converted with ``astype(str)``.
    """
    # Domain boundaries with any of these string forms count as "no domain".
    missing_domain_values = ('nan', '-1', '-1.0')

    for i in df.index:
        identifier = df.at[i, 'uniprotID'] + '_' + df.at[i, 'pdbID'] + '_' + df.at[i, 'chain'] + '_'
        alignment_list = do_alignment(identifier, df.at[i, 'uniprotSequence'], df.at[i, 'pdbSequence'], alignment_path)

        # One call yields all four values; the original recomputed the same
        # result four times per row.
        align_status, mutation_pos, startGap, alignment_to_use = mutation_position_on_pdb(
            alignment_list, df.at[i, 'pos'])
        df.at[i, 'pdb_alignStatus'] = align_status
        df.at[i, 'mutationPositionOnPDB'] = mutation_pos

        for annot in annotation_list:
            df.at[i, annot] = annotation_pos_on_pdb(df.at[i, annot], startGap, alignment_to_use, identifier)

        dom_start = str(df.at[i, 'domStart'])
        dom_end = str(df.at[i, 'domEnd'])
        if dom_start not in missing_domain_values and dom_end not in missing_domain_values:
            # Drop a trailing '.0' so float-typed boundaries parse as ints.
            domainLoc = dom_start.split('.')[0] + '-' + dom_end.split('.')[0]
            domain_pos = find_position_on_pdb_for_range_annotations(domainLoc, startGap, alignment_to_use)
            df.at[i, 'domainStartonPDB'] = domain_pos[2]
            df.at[i, 'domainEndonPDB'] = domain_pos[3]
        else:
            # The original elif condition was always true, so this is a
            # plain else.
            df.at[i, 'domainStartonPDB'] = 'nan'
            df.at[i, 'domainEndonPDB'] = 'nan'

    df = df.astype(str)
    return df

def alignment(dataframe_to_align, annotation_list, alignment_path):
    """Entry point of the alignment step; delegates to ``final_stage``.

    Parameters:
        dataframe_to_align: DataFrame forwarded to ``final_stage``.
        annotation_list: Names of the annotation columns to translate.
        alignment_path: Directory for the alignment text files.

    Returns:
        Whatever ``final_stage`` returns for the given arguments.
    """
    return final_stage(dataframe_to_align, annotation_list, alignment_path)
#