diff --git "a/code/pdb_featureVector.py" "b/code/pdb_featureVector.py" new file mode 100644--- /dev/null +++ "b/code/pdb_featureVector.py" @@ -0,0 +1,1716 @@ +# IMPORT NECESSARY MODULES AND LIBRARIES +from timeit import default_timer as timer +import xml.etree.ElementTree as ET +from collections import Counter +from bs4 import BeautifulSoup +from io import StringIO +from decimal import * +import pandas as pd +import requests +import os.path as op +import subprocess +import shutil +import ssbio.utils +import warnings +import sys +import pathlib +from pathlib import Path +import os, glob +import math +import ssbio +import ssl +from Bio.Align import substitution_matrices +from Bio.PDB.Polypeptide import * +from Bio.PDB import PDBList +from Bio import Align +from Bio import SeqIO +from Bio.PDB import * +import streamlit as st +from urllib.error import HTTPError +import Bio + +warnings.filterwarnings("ignore") +start = timer() + +# FUNCTIONS + + +# FUNCTIONS +from calc_pc_property import * +from add_domains import * +from add_annotations import * +from add_sequence import * +from add_structure import * +from add_alignment import * +from manage_files import * +from add_3Dalignment import * +from add_sasa import * +from standard import * +from add_interface_pos import * +from standard import * +from uniprotSequenceMatch import uniprotSequenceMatch +from process_input import clean_data + + +def pdb(input_set, mode, impute): + aligner = Align.PairwiseAligner() + """ + STEP 1 + Get input data as a console input. + Add datapoint identifier and remove non-standard input. + """ + data = clean_data(input_set) + path_to_input_files, path_to_output_files, path_to_domains, fisher_path, path_to_interfaces, buffer = manage_files( + mode) + print('Creating directories...') + + annotation_list = ['disulfide', 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite', + 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis', 'strand', + 'helix', 'turn', 'metalBinding', 'repeat', 'topologicalDomain', 'caBinding', 'bindingSite', + 'region', + 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', 'peptide', + 'transitPeptide', 'glycosylation', 'propeptide'] + + print('Feature vector generation started...\n') + cont = True + try: + if cont == False: + print('Feature vectore generation terminated.') + else: + """ + STEP 2 + Add physicochemical properties. + """ + print('Adding physicochemical properties...\n') + + data = add_physicochemical(data) + + """ + STEP 3 + Add domain-related information. + """ + print('Adding domains\n') + + data = add_domains(data, path_to_domains) + data = data.astype(str) + data = data.replace({'NaN': 'nan'}) + data.domain = data.domain.replace({'nan': '-1'}) + data.domStart = data.domStart.replace({'nan': '-1'}) + data.domEnd = data.domEnd.replace({'nan': '-1'}) + data.distance = data.distance.replace({'nan': '-1'}) + + """ + STEP 4 + Retrieve canonical and isoform UniProt sequences. + Add to the data frame. + """ + print('Retrieving UniProt sequences...\n') + + canonical_fasta = pd.DataFrame(columns=['uniprotID', 'uniprotSequence']) + up_list = list(set(data['uniprotID'].to_list())) + for i in range(len(up_list)): + canonical_fasta.at[i, 'uniprotSequence'] = get_uniprot_seq(up_list[i]) + canonical_fasta.at[i, 'uniprotID'] = up_list[i] + canonical_fasta = canonical_fasta.drop_duplicates() + isoform_fasta = pd.DataFrame(columns=['uniprotID', 'isoformSequence']) + iso_dict = [] + for i in range(len(up_list)): + iso_dict.append(get_isoforms(up_list[i])) + + index = 0 + for i in iso_dict: + for key, val in i.items(): + isoform_fasta.at[index, 'uniprotID'] = key + isoform_fasta.at[index, 'isoformSequence'] = val + index += 1 + isoform_fasta = isoform_fasta.drop_duplicates() + for i in isoform_fasta.index: + isoform_fasta.at[i, 'whichIsoform'] = isoform_fasta.at[i, 'uniprotID'][7:10].strip() + isoform_fasta.at[i, 'uniprotID'] = isoform_fasta.at[i, 'uniprotID'][0:6] + print('Sequence files created...\n') + + data = data.merge(canonical_fasta, on='uniprotID', how='left') + data = data.astype(str) + data['whichIsoform'] = 'nan' + data.replace({'': 'nan'}, inplace=True) + data['wt_sequence_match'] = '' + for i in data.index: + if len(data.at[i, 'uniprotSequence']) >= int(data.at[i, 'pos']): + wt = data.at[i, 'wt'] + can = str(data.at[i, 'uniprotSequence'])[int(data.at[i, 'pos']) - 1] + if wt == can: + data.at[i, 'wt_sequence_match'] = 'm' + elif wt != can: + isoList = isoform_fasta[ + isoform_fasta['uniprotID'] == data.at[i, 'uniprotID']].isoformSequence.to_list() + for k in isoList: + if len(k) >= int(data.at[i, 'pos']): + resInIso = k[int(int(data.at[i, 'pos']) - 1)] + if wt == resInIso: + whichIsoform = \ + isoform_fasta[isoform_fasta.isoformSequence == k].whichIsoform.to_list()[0] + data.at[i, 'wt_sequence_match'] = 'i' + data.at[i, 'whichIsoform'] = whichIsoform + break + + elif len(data.at[i, 'uniprotSequence']) < int(data.at[i, 'pos']): + isoList = isoform_fasta[ + isoform_fasta['uniprotID'] == data.at[i, 'uniprotID']].isoformSequence.to_list() + for k in isoList: + if len(k) >= int(data.at[i, 'pos']): + resInIso = k[int(int(data.at[i, 'pos']) - 1)] + wt = data.at[i, 'wt'] + if wt == resInIso: + whichIsoform = isoform_fasta[isoform_fasta.isoformSequence == k].whichIsoform.to_list()[ + 0] + data.at[i, 'wt_sequence_match'] = 'i' + data.at[i, 'whichIsoform'] = whichIsoform + break + + data.wt_sequence_match = data.wt_sequence_match.astype('str') + data.replace({'': 'nan'}, inplace=True) + data_size = len(data.drop_duplicates(['datapoint'])) + not_match_in_uniprot = data[(data.uniprotSequence == 'nan') | (data.wt_sequence_match == 'nan')] + uniprot_matched = data[(data.uniprotSequence != 'nan') & (data.wt_sequence_match != 'nan')] + data = None + + print( + 'You have %d data points that failed to match a UniProt Sequence\nProceeding with %d remaining...\n' + % (len(not_match_in_uniprot.drop_duplicates(['datapoint'])), + len(uniprot_matched.drop_duplicates(['datapoint'])))) + + """ + STEP 5 + Retrieve related PDB sequences, extract their sequences. + Add to the data frame. + """ + + pdb_fasta = pd.DataFrame(columns=['pdbID', 'chain', 'pdbSequence']) + pdb_info = pd.DataFrame(columns=['uniprotID', 'pdbID', 'chain', 'resolution']) + + print('Retrieving PDB structures...\n') + pdbs = [] + protein = uniprot_matched.uniprotID.to_list() + protein = list(set(protein)) + # pdbs = get_pdb_ids(protein) + + for prot in protein: + pdbs.append(get_pdb_ids(prot)) + + pdbs = [item for sublist in pdbs for item in sublist] + print('Processing PDB structures...\n') + if pdbs == []: + print('No PDB structure found for the query. ') + + print('Starting PDB structures download...\n') + pdbs = list(filter(None, pdbs)) + pdbs = (set(pdbs)) + pdbs = [i.lower() for i in pdbs] + pdbl = PDBList() + parser = PDBParser() + index = 0 + + try: + shutil.rmtree('obsolete') + except OSError as e: + pass + + existing_pdb = list(Path(path_to_output_files / 'pdb_structures').glob("*")) + existing_pdb = [str(i) for i in existing_pdb] + existing_pdb = [i.split('/')[-1].split('.')[0].lower() for i in existing_pdb] + + cnt = 0 + for search in pdbs: + try: + if search.lower() not in existing_pdb: + + # Specify the URL of the PDB file you want to download + pdb_url = f"https://files.rcsb.org/download/{search}.pdb" + # Set the path within your Hugging Face space where you want to store the PDB files + pdb_folder_path = Path(path_to_output_files / 'pdb_structures') + # Extract the PDB filename from the URL + pdb_filename = pdb_url.split("/")[-1] + + # Set the path for the downloaded file + pdb_file_path = os.path.join(pdb_folder_path, pdb_filename) + + # Send a GET request to download the PDB file + response = requests.get(pdb_url) + if response.status_code == 200: + # Save the file to the specified path + with open(pdb_file_path, "wb") as file: + file.write(response.content) + print("PDB file downloaded successfully!") + else: + print("Failed to download the PDB file.") + + else: + print('PDB structure file exists..') + for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")): + filename_replace_ext = filename.with_suffix(".pdb") + filename.rename(filename_replace_ext) + + file = Path(path_to_output_files / 'pdb_structures' / f'{search}.pdb') + + base = os.path.splitext(str(file))[0] + base = '/'.join(base.split('/')[0:-1]) + '/pdb' + base.split('/')[-1] + os.rename(file, base + ".ent") + file = base + '.ent' + + # Parse the PDB file + structure = parser.get_structure("structure", file) + # Get the resolution from the Structure object + resolution = structure.header["resolution"] + + for record in SeqIO.parse(file, "pdb-seqres"): + if record.dbxrefs[0].split(':')[0] == 'UNP': + pdb_fasta.at[index, 'pdbID'] = record.id.split(':')[0] + pdb_fasta.at[index, 'chain'] = record.id.split(':')[1] + pdb_fasta.at[index, 'pdbSequence'] = str(record.seq) + pdb_info.at[index, 'uniprotID'] = record.dbxrefs[0].split(':')[1] + pdb_info.at[index, 'pdbID'] = record.id.split(':')[0] + pdb_info.at[index, 'chain'] = record.annotations["chain"] + pdb_info.at[index, 'resolution'] = resolution + index += 1 + + except: + IndexError + pdb_info.at[index, 'uniprotID'] = 'nan' + pdb_info.at[index, 'pdbID'] = 'nan' + pdb_info.at[index, 'chain'] = 'nan' + pdb_info.at[index, 'resolution'] = 'nan' + index += 1 + cnt += 1 + + print('PDB file processing finished..') + for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")): + try: + filename_replace_ext = filename.with_suffix(".pdb") + filename.rename(filename_replace_ext) + except: + FileNotFoundError + + for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")): + try: + if filename.stem.startswith("pdb"): + filename_replace_ext = filename.with_name(filename.stem[3:]) + filename.rename(filename_replace_ext.with_suffix('.pdb')) + except: + FileNotFoundError + + uniprot_matched = pd.merge(uniprot_matched, pdb_info, on='uniprotID', how='left') + uniprot_matched = uniprot_matched.astype(str) + uniprot_matched = uniprot_matched.drop_duplicates() + + uniprot_matched = uniprot_matched.merge(pdb_fasta, on=['pdbID', 'chain'], how='left') + uniprot_matched = uniprot_matched.astype(str) + + with_pdb = uniprot_matched[(uniprot_matched.pdbID != 'nan') & ( + (uniprot_matched.resolution != 'nan') & (uniprot_matched.resolution != 'OT') & ( + uniprot_matched.resolution != 'None'))].drop_duplicates() + no_pdb = uniprot_matched[(uniprot_matched.pdbID == 'nan') | ( + (uniprot_matched.resolution == 'nan') | (uniprot_matched.resolution == 'OT') | ( + uniprot_matched.resolution == 'None'))] + no_pdb = no_pdb[~no_pdb.datapoint.isin(with_pdb.datapoint.to_list())] + no_pdb.drop(columns=['chain', 'pdbID', 'pdbSequence', 'resolution'], inplace=True) + + print( + 'PDB Information successfully added...\nPDB structures are found for %d of %d.\n%d of %d failed to match with PDB structure.\n' + % (len(with_pdb.drop_duplicates(['datapoint'])), len(uniprot_matched.drop_duplicates(['datapoint'])), + len(no_pdb.drop_duplicates(['datapoint'])), len(uniprot_matched.drop_duplicates(['datapoint'])))) + + with_pdb = with_pdb.sort_values(['uniprotID', 'resolution'], axis=0, ascending=True) + with_pdb = with_pdb.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'pdbSequence'], keep='first') + with_pdb.replace({'': 'nan'}, inplace=True) + + if len(with_pdb) == 0: + with_pdb['pdbInfo'] = '' + else: + for i in with_pdb.index: + try: + res = str(with_pdb.at[i, 'resolution']) + chain = with_pdb.at[i, 'chain'] + new = with_pdb.at[i, 'pdbID'] + ':' + chain + ':' + res + with_pdb.at[i, 'pdbInfo'] = new + except: + TypeError + with_pdb.at[i, 'pdbInfo'] = 'nan' + + with_pdb = with_pdb[['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore', + 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', 'pdbSequence', + 'wt_sequence_match', + 'whichIsoform', 'pdbID', 'resolution', 'chain', 'pdbInfo', 'datapoint']] + + # If the query data points are found in no_match_in_uniprot data frame, it will not give any results. + # If the query data points are found in no_pdb data frame, it will be searched in the modbase and swiss_model steps. + # If the query data points are found in with_pdb data frame, it will be searched in the following steps. + + """ + STEP 6 + Retrieve sequence annotations. + Add to the data frame. + """ + + if len(with_pdb) > 0: + with_pdb = add_annotations(with_pdb) + else: + new_cols = with_pdb.columns.to_list() + ['disulfide', 'intMet', 'intramembrane', 'naturalVariant', + 'dnaBinding', + 'activeSite', + 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', + 'crosslink', 'mutagenesis', 'strand', + 'helix', 'turn', 'metalBinding', 'repeat', 'topologicalDomain', + 'caBinding', 'bindingSite', 'region', + 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', + 'coiledCoil', 'peptide', + 'transitPeptide', 'glycosylation', 'propeptide', + 'disulfideBinary', + 'intMetBinary', 'intramembraneBinary', + 'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary', + 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary', + 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary', + 'strandBinary', 'helixBinary', 'turnBinary', + 'metalBindingBinary', + 'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary', + 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary', + 'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary', + 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary', + 'glycosylationBinary', 'propeptideBinary'] + with_pdb = pd.DataFrame(columns=new_cols) + try: + with_pdb.whichIsoform = with_pdb.whichIsoform.astype('str') + except: + AttributeError + with_pdb['whichIsoform'] = '' + + with_pdb = with_pdb.astype(str) + with_pdb = with_pdb.replace({'NaN': 'nan'}) + with_pdb.replace({'[]': 'nan'}, inplace=True) + with_pdb.replace({'nan-nan': 'nan'}, inplace=True) + with_pdb.replace({'': 'nan'}, inplace=True) + + """ + STEP 7 + Do alignment for PDB + """ + # Canonical matches, i.e. labelled as m, canonical sequences will be aligned with PDB sequences. + # Isoform matches, i.e. labelled as i, isoform sequences will be aligned with PDB sequences. + with_pdb['uniprotSequence'] = with_pdb['uniprotSequence'].str.replace('U', 'C') + with_pdb['pdbSequence'] = with_pdb['pdbSequence'].str.replace('U', 'C') + + dfM = with_pdb[with_pdb.wt_sequence_match == 'm'] + dfM = dfM.sort_values(['uniprotID', 'resolution'], axis=0, ascending=True) + dfM = dfM.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'pdbSequence'], keep='first') + + dfNM = with_pdb[with_pdb.wt_sequence_match == 'i'] + dfNM = dfNM.sort_values(['uniprotID', 'resolution'], axis=0, ascending=True) + dfNM = dfNM.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'pdbSequence'], keep='first') + dfNM.rename(columns={'isoformSequence': 'uniprotSequence'}, inplace=True) + + dfM = dfM.astype(str) + dfNM = dfNM.astype(str) + + dfM.reset_index(inplace=True) + dfM.drop(['index'], axis=1, inplace=True) + dfNM.reset_index(inplace=True) + dfNM.drop(['index'], axis=1, inplace=True) + + uniprot_matched_size = len(uniprot_matched.drop_duplicates(['datapoint'])) + uniprot_matched = None + pdb_fasta = None + pdb_info = None + pdbs = None + existing_pdb = None + with_pdb_size = len(with_pdb.drop_duplicates(['datapoint'])) + with_pdb = None + + print('Aligning sequences...\n') + + aligned_m = final_stage(dfM, annotation_list, Path(path_to_output_files / 'alignment_files')) + aligned_nm = final_stage(dfNM, annotation_list, Path(path_to_output_files / 'alignment_files')) + # When PDB sequence is nan, it is wrongly aligned to the UniProt sequence. Fix them. + for i in aligned_m.index: + if aligned_m.at[i, 'pdbSequence'] == 'nan': + aligned_m.at[i, 'mutationPositionOnPDB'] = 'nan' + aligned_m.at[i, 'domainStartonPDB'] = 'nan' + aligned_m.at[i, 'domainEndonPDB'] = 'nan' + aligned_m.at[i, 'pdb_alignStatus'] = 'nan' + + for i in aligned_nm.index: + if aligned_nm.at[i, 'pdbSequence'] == 'nan': + aligned_nm.at[i, 'mutationPositionOnPDB'] = 'nan' + aligned_nm.at[i, 'domainStartonPDB'] = 'nan' + aligned_nm.at[i, 'domainEndonPDB'] = 'nan' + aligned_nm.at[i, 'pdb_alignStatus'] = 'nan' + + # Check if they the same column name before merging. + aligned_m = aligned_m.astype(str) + aligned_nm = aligned_nm.astype(str) + + frames = [aligned_m, aligned_nm] + after_up_pdb_alignment = pd.concat(frames, sort=False) + if len(after_up_pdb_alignment) == 0: + after_up_pdb_alignment['pdb_alignStatus'] = '' + after_up_pdb_alignment['mutationPositionOnPDB'] = '' + after_up_pdb_alignment['domainStartonPDB'] = '' + after_up_pdb_alignment['domainEndonPDB'] = '' + + after_up_pdb_alignment = after_up_pdb_alignment.sort_values( + by=['uniprotID', 'wt', 'mut', 'pos', 'pdb_alignStatus', 'resolution', 'chain'], + ascending=[True, True, True, True, True, True, True]) + + after_up_pdb_alignment = after_up_pdb_alignment.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos'], + keep='first') + + after_up_pdb_alignment = after_up_pdb_alignment.astype('str') + + pdb_aligned = after_up_pdb_alignment[ + (after_up_pdb_alignment.pdbID != 'nan') & (after_up_pdb_alignment.mutationPositionOnPDB != 'nan')] + yes_pdb_no_match = after_up_pdb_alignment[ + (after_up_pdb_alignment.pdbID != 'nan') & (after_up_pdb_alignment.mutationPositionOnPDB == 'nan')] + no_pdb = no_pdb.copy() + + print('PDB matching is completed...\n') + print('SUMMARY') + print('-------') + print('%d data points that failed to match a UniProt Sequence are discarded.' % len( + not_match_in_uniprot.drop_duplicates(['datapoint']))) + print('Of the remaining %d:' % uniprot_matched_size) + print('--%d of %d successfully aligned with PDB structures.' % ( + len(pdb_aligned.drop_duplicates(['datapoint'])), with_pdb_size)) + print('--%d of %d not found on the covered area by the structure.' % ( + len(yes_pdb_no_match.drop_duplicates(['datapoint'])), with_pdb_size)) + print('--PDB structures not found for %d datapoints.' % len(no_pdb.drop_duplicates(['datapoint']))) + print('--%d will be searched in Swiss-Model database.\n' % ( + len(yes_pdb_no_match.drop_duplicates(['datapoint'])) + len(no_pdb.drop_duplicates(['datapoint'])))) + + dfM = None + dfNM = None + aligned_nm = None + aligned_m = None + after_up_pdb_alignment = None + + print('Proceeding to SwissModel search...') + print('------------------------------------\n') + + # At this point we have 4 dataframes + # 1. after_up_pdb_alignment --- This is after PDB sequence alignment. There may be mutations that wasnt found matching to after the alignment. Will be searched in other databases as well. + # 1a. aligned --- we are done with this. + # 1b. yes_pdb_no_match --- They have PDB structures but not matched, so will be searched in the other databases. + # 2. not_match_in_uniprot --- This wont be aligned with anything because these proteins dont have a uniprot ID. Only basic info is present. + # 3. no_pdb --- No PDB structures were found for them. Will be searched in other databases. + + """ + Step 8 + Neutralize data points that are to be searched in Swiss-Model + # One point is that yes_pdb_no_match's annotations are the adjusted according to the PDBs they are matched before. + # They need to be converted to their old original UniProt annotation positions. + """ + yes_pdb_no_match.drop(['disulfide', 'intMet', + 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite', + 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', + 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat', + 'caBinding', 'topologicalDomain', 'bindingSite', 'region', + 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', + 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'disulfideBinary', + 'intMetBinary', 'intramembraneBinary', + 'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary', + 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary', + 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary', + 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary', + 'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary', + 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary', + 'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary', + 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary', + 'glycosylationBinary', 'propeptideBinary', 'pdbSequence', 'pdbInfo', 'pdbID', + 'chain', 'resolution', 'pdb_alignStatus', 'mutationPositionOnPDB', + 'domainStartonPDB', 'domainEndonPDB'], axis=1, inplace=True) + + to_swiss = pd.concat( + [yes_pdb_no_match.drop_duplicates(['datapoint']), no_pdb.drop_duplicates(['datapoint'])]) + no_pdb = None + to_swiss.reset_index(inplace=True) + to_swiss.drop(['index'], axis=1, inplace=True) + to_swiss = to_swiss.astype('str') + to_swiss = to_swiss.replace({'NaN': 'nan'}) + # Create model summary dataframe. + if len(to_swiss) != 0: + # import zipfile + # with zipfile.ZipFile(Path(path_to_input_files / 'swissmodel_structures.txt.zip'),"r") as zip_ref: + # zip_ref.extractall(Path(path_to_input_files)) + + print('Generating SwissModel file...\n') + + swiss_model = pd.read_csv(Path(path_to_input_files / 'swissmodel_structures.txt'), sep='\t', + dtype=str, header=None, skiprows=1, + names=['UniProtKB_ac', 'iso_id', 'uniprot_seq_length', 'uniprot_seq_md5', + 'coordinate_id', 'provider', 'from', 'to', 'template', 'qmean', + 'qmean_norm', 'seqid', 'url']) + + else: + swiss_model = pd.DataFrame( + columns=['UniProtKB_ac', 'iso_id', 'uniprot_seq_length', 'uniprot_seq_md5', 'coordinate_id', + 'provider', 'from', 'to', 'template', 'qmean', 'qmean_norm', 'seqid', 'url', + 'whichIsoform']) + swiss_model = swiss_model.astype('str') + try: + swiss_model.iso_id = swiss_model.iso_id.astype('str') + except: + AttributeError + swiss_model['iso_id'] = 'nan' + swiss_model = swiss_model[swiss_model.UniProtKB_ac != 'nan'] + for ind in swiss_model.index: + swiss_model.at[ind, 'UniProtKB_ac'] = swiss_model.at[ind, 'UniProtKB_ac'].split('-')[0] + if swiss_model.at[ind, 'iso_id'] != 'nan': + + swiss_model.at[ind, 'whichIsoform'] = swiss_model.at[ind, 'iso_id'].split('-')[1] + else: + swiss_model.at[ind, 'whichIsoform'] = 'nan' + # swiss_model.drop(['input'], axis=1, inplace=True) + swiss_model = swiss_model[swiss_model.provider == 'SWISSMODEL'] + print('Index File Processed...\n') + + # Get relevant columns + swiss_model = swiss_model[ + ['UniProtKB_ac', 'from', 'to', 'template', 'qmean_norm', 'seqid', 'url', 'whichIsoform']] + # Sort models on qmean score and identity. Some proteins have more than one models, we will pick one. + swiss_model = swiss_model.sort_values(by=['UniProtKB_ac', 'qmean_norm', 'seqid'], ascending=False) + swiss_model.reset_index(inplace=True) + swiss_model.drop(['index'], axis=1, inplace=True) + + # Get protein IDs for which there exist models. + swiss_model_ids = set(swiss_model.UniProtKB_ac.to_list()) + to_swiss = to_swiss.astype(str) + no_swiss_models = pd.DataFrame() + for i in to_swiss.index: + if to_swiss.at[i, 'uniprotID'] not in swiss_model_ids: + k = pd.Series(to_swiss.iloc[i]) + no_swiss_models = no_swiss_models.append(k, ignore_index=True) + + no_swiss_models = no_swiss_models.astype(str) + if len(no_swiss_models) == 0: + no_swiss_models = pd.DataFrame(columns=to_swiss.columns) + else: + no_swiss_models = no_swiss_models[to_swiss.columns] + no_swiss_models.reset_index(inplace=True) + no_swiss_models.drop('index', axis=1, inplace=True) + + with_swiss_models = pd.concat([to_swiss, no_swiss_models]).drop_duplicates(['datapoint'], keep=False) + with_swiss_models = with_swiss_models[to_swiss.columns] + + # Add model info. + + with_swiss_models = with_swiss_models.astype(str) + swiss_model = swiss_model.astype(str) + swiss_models_with_data = pd.merge(with_swiss_models, swiss_model, left_on=['uniprotID', 'whichIsoform'], + right_on=['UniProtKB_ac', 'whichIsoform'], + how='left') + swiss_models_with_data = swiss_models_with_data.astype(str) + swiss_models_with_data = swiss_models_with_data.sort_values( + by=['uniprotID', 'wt', 'mut', 'pos', 'qmean_norm'], + ascending=False) + swiss_models_with_data = swiss_models_with_data.drop_duplicates() + swiss_models_with_data = swiss_models_with_data.drop(['UniProtKB_ac', 'seqid'], axis=1) + swiss_models_with_data.pos = swiss_models_with_data.pos.astype('int') + swiss_models_with_data = swiss_models_with_data.astype(str) + + # Get the ones in the list but without model url and add to the list to go to modbase. + url_nan = swiss_models_with_data[swiss_models_with_data.url == 'nan'] + + # Add this nan's to no_model. These will be searched in MODBASE because here they dont have urls. + url_nan = url_nan.drop(['from', 'qmean_norm', 'template', 'to', 'url'], axis=1) + + no_swiss_models_2 = pd.concat([no_swiss_models, url_nan]) + swiss_models_with_data = swiss_models_with_data[swiss_models_with_data.url != 'nan'] + for i in swiss_models_with_data.index: + try: + swiss_models_with_data.at[i, 'chain'] = swiss_models_with_data.at[i, 'template'].split('.')[2] + swiss_models_with_data.at[i, 'template'] = swiss_models_with_data.at[i, 'template'].split('.')[0] + except: + IndexError + if len(swiss_models_with_data) == 0: + swiss_models_with_data['chain'] = '' + swiss_models_with_data['template'] = '' + + swiss_models_with_data.qmean_norm = swiss_models_with_data.qmean_norm.astype('str') + swiss_models_with_data.chain = swiss_models_with_data.chain.astype('str') + swiss_models_with_data['qmean_norm'] = swiss_models_with_data.qmean_norm.apply(lambda x: round(float(x), 2)) + swiss_models_with_data = swiss_models_with_data.astype(str) + + # swiss_models_with_data: These data points will be aligned with their corresponding model sequences. + # Add sequences + + no_swiss_models_2.reset_index(inplace=True) + no_swiss_models_2.drop('index', axis=1, inplace=True) + + swiss_models_with_data.reset_index(inplace=True) + swiss_models_with_data.drop('index', axis=1, inplace=True) + + swiss_model_ids = None + with_swiss_models = None + swiss_model = None + no_swiss_models = None + url_nan = None + + # At this point we have: + # pdb_aligned --- Align in the PDB phase + # not_match_in_uniprot --- This wont be aligned with anything because these proteins dont have a uniprot ID. Only basic info is present. + # to_swiss (no_pdb + yes_pdb_no_match) --- to be searched in SwissModel database + # to_swiss (with_swiss_models & no_swiss_models) + # swiss_models_with_data --- We found swiss models for them. + # no_swiss_models_2 (no_swiss_models + url_nan)--- to be searched in modbase (the ones having swissmodels but not matching with the boundaries & broken_swiss will be added here) + + """ + STEP 9 + Associated model IDs are added. + Download model files. + """ + print('Beginning SwissModel files download...') + existing_swiss = list(Path(path_to_output_files / 'swissmodel_structures').glob("*")) + existing_swiss = [str(i) for i in existing_swiss] + existing_swiss = ['.'.join(i.split('/')[-1].split('.')[:-1]) for i in existing_swiss] + swissmodels_fasta = pd.DataFrame() + + for i in swiss_models_with_data.index: + protein = swiss_models_with_data.at[i, 'uniprotID'] + template = swiss_models_with_data.at[i, 'template'].split('.')[0] + qmean_norm = str(round(float(swiss_models_with_data.at[i, 'qmean_norm']), 2)) + if protein + '_' + template + '_' + qmean_norm not in existing_swiss: + url = swiss_models_with_data.at[i, 'url'].strip('\"').strip('}').replace('\\', '').strip( + '\"').replace( + 'https', + 'https:') + req = requests.get(url) + name = Path( + path_to_output_files / 'swissmodel_structures' / f'{protein}_{template}_{qmean_norm}.txt') + print('Downloading for Protein:', protein + ' Model: ' + template) + with open(name, 'wb') as f: + f.write(req.content) + else: + print('Model exists.') + name = Path( + path_to_output_files / 'swissmodel_structures' / f'{protein}_{template}_{qmean_norm}.txt') + with open(name, encoding="utf8") as f: + fasta = '' + lines = f.readlines() + chain = '' + for row in lines: + if row[0:4] == 'ATOM' and row[13:15] == 'CA': + chain = row[20:22].strip() + fasta += threeToOne(row[17:20]) + if row[0:3] == 'TER': + k = pd.Series([protein, template, qmean_norm, chain.upper(), fasta]) + swissmodels_fasta = swissmodels_fasta.append(k, ignore_index=True) + fasta = '' + + if len(swissmodels_fasta) == 0: + swissmodels_fasta = pd.DataFrame(columns=['uniprotID', 'template', 'qmean_norm', 'chain', 'fasta']) + else: + swissmodels_fasta.columns = ['uniprotID', 'template', 'qmean_norm', 'chain', 'fasta'] + + swissmodels_fasta = swissmodels_fasta.astype(str) + + swiss_models_with_data.qmean_norm = swiss_models_with_data.qmean_norm.astype(float) + swissmodels_fasta.qmean_norm = swissmodels_fasta.qmean_norm.astype(float) + + swissmodels_fasta = swissmodels_fasta.sort_values(['uniprotID', 'template', 'qmean_norm', 'chain'], + axis=0) # example = 3gdh + swissmodels_fasta.reset_index(inplace=True) + swissmodels_fasta.drop(['index'], axis=1, inplace=True) + swissmodels_fasta = swissmodels_fasta.drop_duplicates(['uniprotID', 'template', 'qmean_norm', 'chain']) + swissmodels_fasta = swissmodels_fasta.drop_duplicates(['uniprotID', 'template', 'chain', 'fasta']) + swissmodels_fasta = swissmodels_fasta.drop_duplicates(['uniprotID', 'template', 'fasta']) + # Some files were broken, thus their PDBs couldnt be recorded. + swissmodels_fasta = swissmodels_fasta.drop_duplicates() + swissmodels_fasta = swissmodels_fasta.astype(str) + + swiss_models_with_data = swiss_models_with_data.astype(str) + swissmodels_fasta = swissmodels_fasta.astype(str) + swiss_models_with_data1 = swiss_models_with_data.merge(swissmodels_fasta, + on=['uniprotID', 'template', 'qmean_norm', 'chain']) + + swiss_models_with_data1 = swiss_models_with_data1.sort_values(['datapoint', 'fasta'], axis=0, + ascending=[True, False]) + swiss_models_with_data1 = swiss_models_with_data1.drop_duplicates(['datapoint', 'template']) + + swiss_models_with_data1_dp = list(set(swiss_models_with_data1.datapoint.to_list())) + swiss_models_with_data.reset_index(inplace=True) + swiss_models_with_data.drop(['index'], axis=1, inplace=True) + broken_swiss = pd.DataFrame() + c = 0 + for i in swiss_models_with_data.index: # en baştaki dfde var ama model gelende yok. + if swiss_models_with_data.at[i, 'datapoint'] not in swiss_models_with_data1_dp: + k = pd.Series(swiss_models_with_data.iloc[i]) + broken_swiss = broken_swiss.append(k, ignore_index=True) + c += 1 + + if len(broken_swiss) == 0: + broken_swiss = pd.DataFrame(columns=swiss_models_with_data.columns.to_list()) + + swiss_models_with_data = swiss_models_with_data1.copy() + + swiss_models_with_data.qmean_norm = swiss_models_with_data.qmean_norm.astype('float') + swiss_models_with_data = swiss_models_with_data.sort_values(['uniprotID', 'wt', 'mut', 'qmean_norm'], + axis=0, ascending=[True, True, True, False]) + + # Delete the same model sequence with lower quality + swiss_models_with_data = swiss_models_with_data.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'fasta'], + keep='first') + swiss_models_with_data.uniprotSequence = swiss_models_with_data.uniprotSequence.astype('str') + swiss_models_with_data.pos = swiss_models_with_data.pos.astype('int') + len(swiss_models_with_data.drop_duplicates(['datapoint'])) + len( + broken_swiss.drop_duplicates(['datapoint'])) + len( + no_swiss_models_2.drop_duplicates(['datapoint'])) == len(to_swiss.drop_duplicates(['datapoint'])) + # This printed data here includes all possible models with different qualities, + # because we may get a hit in either of them. + swiss_models_with_data.rename({'fasta': 'pdbSequence'}, axis=1, inplace=True) # for convenience. + + # NOW DO ALIGNMENT HERE + + swiss_models_with_data = swiss_models_with_data.replace({'[\'?\']': 'nan'}) + swiss_models_with_data = swiss_models_with_data.replace({'[]': 'nan'}) + swiss_models_with_data.rename({'template': 'pdbID'}, axis=1, + inplace=True) # Only to be able use the alignment code above. + swiss_models_with_data = swiss_models_with_data.astype(str) + swiss_models_with_data.pdbSequence = swiss_models_with_data.pdbSequence.astype('str') + swiss_models_with_data = add_annotations(swiss_models_with_data) + swiss_models_with_data = swiss_models_with_data.astype(str) + swiss_models_with_data.replace({'NaN': 'nan'}, inplace=True) + swiss_models_with_data_copy = swiss_models_with_data.copy() + swiss_models_with_data1_dp = None + swiss_models_with_data1 = None + existing_swiss = None + swissmodels_fasta = None + + print('Aligning sequences...\n') + + swiss_models_with_data['uniprotSequence'] = swiss_models_with_data['uniprotSequence'].str.replace('U', 'C') + swiss_models_with_data['pdbSequence'] = swiss_models_with_data['pdbSequence'].str.replace('U', 'C') + swiss_model_aligned = alignment(swiss_models_with_data, annotation_list, + path_to_output_files / 'alignment_files') + swiss_models_with_data = None + + if len(swiss_model_aligned) == 0: + swiss_model_aligned = pd.DataFrame(columns=pdb_aligned.columns) + swiss_model_aligned['qmean_norm'] = 'nan' + else: + swiss_model_aligned = swiss_model_aligned.astype(str) + swiss_model_aligned.replace({'NaN': 'nan'}, inplace=True) + + # Some datapoints appear in both nan and not_nan. If not_nan we take it only once. + nan = swiss_model_aligned[swiss_model_aligned.mutationPositionOnPDB == 'nan'] + not_nan = swiss_model_aligned[swiss_model_aligned.mutationPositionOnPDB != 'nan'] + not_nan.qmean_norm = not_nan.qmean_norm.astype('float') + not_nan.sort_values(['datapoint', 'pdb_alignStatus', 'qmean_norm'], ascending=[True, True, False], + inplace=True) + + which_ones_are_match = pd.concat([not_nan, nan]).drop_duplicates(['datapoint'], keep='first') + swiss_match = which_ones_are_match[which_ones_are_match.mutationPositionOnPDB != 'nan'] + swiss_not_match = which_ones_are_match[which_ones_are_match.mutationPositionOnPDB == 'nan'] + + swiss_match.qmean_norm = swiss_match.qmean_norm.astype('float') + swiss_match.sort_values(['uniprotID', 'wt', 'pos', 'mut', 'pdb_alignStatus', 'qmean_norm'], + ascending=[True, True, True, True, True, False], inplace=True) + swiss_match.drop_duplicates(['uniprotID', 'wt', 'pos', 'mut'], keep='first', inplace=True) + swiss_not_match = swiss_not_match[no_swiss_models_2.columns] + broken_swiss = broken_swiss[no_swiss_models_2.columns] + swiss_not_match = swiss_not_match.drop_duplicates(['datapoint']) + broken_swiss = broken_swiss.drop_duplicates(['datapoint']) + + to_modbase = pd.concat([no_swiss_models_2, broken_swiss]).drop_duplicates() + to_modbase = pd.concat([to_modbase, swiss_not_match]).drop_duplicates() + to_modbase = to_modbase.astype(str) + to_swiss_columns = to_swiss.columns + to_swiss_size = len(to_swiss.drop_duplicates(['datapoint'])) + to_swiss = None + + # CONTROL + + """ + # This should be the whole data. + len(swiss_match.drop_duplicates(['datapoint'])) + len(aligned.drop_duplicates(['datapoint'])) + len(to_modbase.drop_duplicates(['datapoint'])) + len(not_match_in_uniprot.drop_duplicates(['datapoint'])) ,len(data) + len(aligned.drop_duplicates(['datapoint'])) + len(not_match_in_uniprot.drop_duplicates(['datapoint'])) +len(to_swiss.drop_duplicates(['datapoint']))== len(data) + """ + print('SwissModel matching is completed...\n') + print('SUMMARY') + print('-------') + print('%d data points that failed to match a UniProt Sequence are discarded.' % len( + not_match_in_uniprot.drop_duplicates(['datapoint']))) + print('Of the remaining %d:' % uniprot_matched_size) + print('--%d of %d successfully aligned with PDB structures.' % ( + len(pdb_aligned.drop_duplicates(['datapoint'])), with_pdb_size)) + print('--%d of %d successfully aligned with SwissModels structures.' % ( + len(swiss_match.drop_duplicates(['datapoint'])), to_swiss_size)) + print('--%d will be searched in ModBase database.\n' % len(to_modbase.drop_duplicates(['datapoint']))) + + print('Proceeding to ModBase search...') + print('------------------------------------\n') + no_swiss_models_2 = None + broken_swiss = None + swiss_model_aligned = None + nan = None + not_nan = None + which_ones_are_match = None + swiss_not_match = None + + # STEP : GO TO MODBASE + # Should not include anything related to prev models. + if len(to_modbase) != 0: + to_modbase = to_modbase.astype(str) + + # GET MODBASE MODELS + + # Get IDs from data to retrieve only their models from MODBASE + to_modbase.reset_index(inplace=True) + to_modbase.drop(['index'], axis=1, inplace=True) + + existing_modbase_models = list(Path(path_to_output_files / 'modbase_structures').glob("*")) + existing_modbase_models = [str(i) for i in existing_modbase_models] + existing_modbase_models = [i.split('/')[-1].split('.')[0] for i in existing_modbase_models] + + existing_modbase_models_ind = list( + Path(path_to_output_files / 'modbase_structures_individual').glob("*")) + existing_modbase_models_ind = [str(i) for i in existing_modbase_models_ind] + existing_modbase_models_ind = [i.split('/')[-1].split('.')[0] for i in existing_modbase_models_ind] + + modbase_reduced = pd.DataFrame() + modbase_fasta = pd.DataFrame() + + print('Retrieving ModBase models...\n') + # Get model files associated with each UniProtID + for protein in list(set(to_modbase.uniprotID.to_list())): + if protein not in existing_modbase_models: + print('Downloading Modbase models for ', protein) + url = 'https://salilab.org/modbase/retrieve/modbase/?databaseID=' + protein + print(url) + req = requests.get(url) + name = path_to_output_files / 'modbase_structures' / f'{protein}.txt' + with open(name, 'wb') as f: + f.write(req.content) + else: + print('Model exists for', protein) + name = Path(path_to_output_files / 'modbase_structures' / f'{protein}.txt') + with open(name, encoding="utf8") as f: + a = open(name, 'r').read() + soup = BeautifulSoup(a, 'lxml') + for pdb in soup.findAll('pdbfile'): + model_id = str(pdb.contents[1])[10:-11] + if model_id not in existing_modbase_models_ind: + with open(path_to_output_files / 'modbase_structures_individual' / f'{model_id}.txt', + 'w', + encoding="utf8") as individual: + individual.write(str('UniProt ID: ' + protein)) + individual.write('\n') + individual.write(str(pdb.contents[3])[10:-11].strip()) + with open(path_to_output_files / 'modbase_structures_individual' / f'{model_id}.txt', + encoding="utf8") as f: + fasta = '' + chain = '' + template_chain = '' + score = -999 + for ind_line in f.readlines(): + if ind_line[0:10] == 'UniProt ID': + uniprot_id = ind_line.split(':')[1].strip() + if ind_line[0:23] == 'REMARK 220 TARGET BEGIN': + target_begin = ind_line[40:43].strip() + if ind_line[0:21] == 'REMARK 220 TARGET END': + target_end = ind_line[40:43].strip() + if ind_line[0:25] == 'REMARK 220 TEMPLATE BEGIN': + pdb_begin = ind_line[40:43].strip() + if ind_line[0:23] == 'REMARK 220 TEMPLATE END': + pdb_end = ind_line[40:43].strip() + if ind_line[0:23] == 'REMARK 220 TEMPLATE PDB': + pdb_code = ind_line[40:43].strip() + if ind_line[0:25] == 'REMARK 220 TEMPLATE CHAIN': + pdb_chain = ind_line[40:43].strip() + if ind_line[0:32] == 'REMARK 220 ModPipe Quality Score': + quality_score = ind_line[40:].strip() + if ind_line[0:27] == 'REMARK 220 MODPIPE MODEL ID': + model_id = ind_line[40:].strip() + if ind_line[0:25] == 'REMARK 220 TEMPLATE CHAIN': + template_chain = ind_line[40:42].strip() + if ind_line[0:4] == 'ATOM' and ind_line[13:15] == 'CA': + fasta += threeToOne(ind_line[17:20]) + if ind_line[0:32] == 'REMARK 220 ModPipe Quality Score': + try: + score = ind_line[40:].strip() + except (ValueError): + score = -999 + if ind_line[0:3] == 'TER' or ind_line[0:3] == 'END': + k = pd.Series([uniprot_id, model_id, str(score), template_chain, fasta]) + modbase_fasta = modbase_fasta.append(k, ignore_index=True) + fasta = '' + try: + k = pd.Series( + [uniprot_id, target_begin, target_end, pdb_code, pdb_chain, pdb_begin, pdb_end, + quality_score, + model_id]) + modbase_reduced = modbase_reduced.append(k, ignore_index=True) + except: + NameError + print('This file doesnt have Quality Score. Replacer: -999', model_id) + quality_score = -999 + + print() + if len(modbase_fasta) != 0: + modbase_fasta.columns = ['uniprotID', 'template', 'score', 'chain', 'fasta'] + else: + modbase_fasta = pd.DataFrame(columns=['uniprotID', 'template', 'score', 'chain', 'fasta']) + modbase_fasta = modbase_fasta.astype(str) + modbase_fasta = modbase_fasta.replace({'': 'nan'}) + modbase_fasta = modbase_fasta.replace({'NaN': 'nan'}) + modbase_fasta = modbase_fasta[modbase_fasta.fasta != 'nan'] + + print('Modbase model frame constructed.\n') + if len(modbase_reduced) != 0: + modbase_reduced.columns = ['UniprotID', 'TargetBeg', 'TargetEnd', 'PDBCode', 'PDBChain', 'PDBBegin', + 'PDBEnd', + 'ModPipeQualityScore', 'ModelID'] + else: + modbase_reduced = pd.DataFrame( + columns=['UniprotID', 'TargetBeg', 'TargetEnd', 'PDBCode', 'PDBChain', 'PDBBegin', 'PDBEnd', + 'ModPipeQualityScore', 'ModelID']) + + to_modbase = add_annotations(to_modbase) + + to_modbase = to_modbase.astype(str) + to_modbase.fillna('nan', inplace=True) + to_modbase = to_modbase.replace({'NaN': 'nan'}) + to_modbase.replace({'[]': 'nan'}, inplace=True) + to_modbase.replace({'nan-nan': 'nan'}, inplace=True) + to_modbase.replace({'': 'nan'}, inplace=True) + model_info_added = to_modbase.merge(modbase_reduced, right_on='UniprotID', left_on='uniprotID', + how='left') + modbase_reduced = None + existing_modbase_models = None + existing_modbase_models_ind = None + + model_info_added = model_info_added.drop(['UniprotID'], axis=1) + model_info_added = model_info_added.rename(columns={'TargetBeg': 'from', 'TargetEnd': 'to', + 'PDBCode': 'template', 'PDBChain': 'chain', + 'ModPipeQualityScore': 'score', + 'ModelID': 'pdbID'}) + model_info_added.drop(['PDBEnd', 'PDBBegin'], axis=1, inplace=True) + model_info_added.score = model_info_added.score.astype(float) + model_info_added = model_info_added.sort_values(by=['datapoint', 'score'], + ascending=False) + model_info_added.reset_index(inplace=True) + model_info_added.drop(['index'], axis=1, inplace=True) + model_info_added = model_info_added.drop_duplicates() + + model_info_added = model_info_added.astype(str) + model_info_added = model_info_added.replace({'NaN': 'nan'}) + no_info = model_info_added[model_info_added.pdbID == 'nan'] + with_modbase_info = model_info_added[model_info_added.pdbID != 'nan'] + model_info_added = None + + len(no_info.drop_duplicates(['datapoint'])), len(with_modbase_info.drop_duplicates(['datapoint'])) + len(no_info.drop_duplicates(['datapoint'])) + len( + with_modbase_info.drop_duplicates(['datapoint'])) == len( + to_modbase.drop_duplicates(['datapoint'])) + + # Add no_info to the rest down below! + no_info = no_info[to_swiss_columns] + + with_modbase_info.score = with_modbase_info.score.astype(float) + modbase_fasta.score = modbase_fasta.score.astype(float) + + modbase_fasta = modbase_fasta.sort_values(['uniprotID', 'score', 'template', 'chain'], + ascending=[True, False, True, True], axis=0) # example = 3gdh + + # I added this newly downloaded ones to the main model file. + + modbase_fasta = modbase_fasta.rename(columns={'template': 'pdbID'}) + with_modbase_info.pos = with_modbase_info.pos.astype('int') + with_modbase_info.score = with_modbase_info.score.astype(float) + with_modbase_info.score = with_modbase_info.score.apply(lambda x: round(x, 2)) + modbase_fasta.score = modbase_fasta.score.astype(float) + modbase_fasta.score = modbase_fasta.score.apply(lambda x: round(x, 2)) + + with_modbase_info = with_modbase_info.merge(modbase_fasta, on='pdbID', how='left') + + with_modbase_info.drop(['score_y'], axis=1, inplace=True) + with_modbase_info.rename(columns={'score_x': 'score'}, inplace=True) + with_modbase_info.drop(['uniprotID_y', 'chain_y'], axis=1, inplace=True) + with_modbase_info.rename(columns={'uniprotID_x': 'uniprotID', 'chain_x': 'chain'}, inplace=True) + + with_modbase_info.score = with_modbase_info.score.astype('float') + with_modbase_info = with_modbase_info.sort_values( + ['uniprotID', 'wt', 'mut', 'pos', 'score', 'from', 'to'], + axis=0, + ascending=[True, True, True, True, False, True, False]) + with_modbase_info = with_modbase_info.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'fasta'], + keep='first') + + with_modbase_info = with_modbase_info.replace({'[\'?\']': 'nan'}) + with_modbase_info = with_modbase_info.replace({'[]': 'nan'}) + with_modbase_info = with_modbase_info.replace({'\'?\', ': ''}) + with_modbase_info = with_modbase_info.replace({', \'?\'': ''}) + with_modbase_info = with_modbase_info.replace({'(': ''}) + with_modbase_info = with_modbase_info.replace( + {')': ''}) + with_modbase_info = with_modbase_info.astype(str) + with_modbase_info.fasta = with_modbase_info.fasta.astype('str') + with_modbase_info.reset_index(inplace=True) + with_modbase_info.drop('index', axis=1, inplace=True) + + align = with_modbase_info[ + with_modbase_info.fasta != 'nan'] + yes_pdb_no_match = with_modbase_info[ + with_modbase_info.fasta == 'nan'] + yes_pdb_no_match = yes_pdb_no_match[~yes_pdb_no_match.datapoint.isin(align.datapoint.to_list())] + + align.rename(columns={'fasta': 'pdbSequence'}, inplace=True) + align['uniprotSequence'] = align['uniprotSequence'].str.replace('U', 'C') + align['pdbSequence'] = align['pdbSequence'].str.replace('U', 'C') + + to_modbase_size = len(to_modbase.drop_duplicates(['datapoint'])) + modbase_fasta = None + to_modbase = None + print('Aligning sequences...\n') + modbase_aligned = alignment(align, annotation_list, path_to_output_files / 'alignment_files') + modbase_aligned = modbase_aligned.astype(str) + modbase_aligned = modbase_aligned.replace({'NaN': 'nan'}) + + # Get the ones whose models couldn't be found. Add to no_modbase (yani hiçbir şey de eşleşmemiş artık.) + if len(with_modbase_info) != 0: + not_in_aligned = pd.concat([modbase_aligned.drop_duplicates(['datapoint']), + with_modbase_info.drop_duplicates(['datapoint'])]).drop_duplicates( + ['datapoint'], + keep=False) + else: + not_in_aligned = pd.DataFrame( + columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore', + 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', + 'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', + 'intMet', + 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite', + 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', + 'crosslink', + 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat', + 'topologicalDomain', 'caBinding', 'bindingSite', 'region', + 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', + 'coiledCoil', + 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', + 'disulfide', + 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', + 'activeSite', + 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', + 'crosslink', + 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat', + 'topologicalDomain', 'caBinding', 'bindingSite', 'region', + 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', + 'coiledCoil', + 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'from', + 'to', 'template', 'chain', 'score', 'pdbID', 'pdbSequence', 'fasta']) + with_modbase_info = None + if len(not_in_aligned) != 0: + not_models = pd.concat([yes_pdb_no_match.drop_duplicates(['datapoint']), + not_in_aligned.drop_duplicates(['datapoint'])]).drop_duplicates( + ['datapoint'], + keep='first') + # Retain the best model among the aligned ones. + else: + not_models = pd.DataFrame(columns=not_in_aligned.columns) + + yes_pdb_no_match = None + # # Some datapoints appear in both nan and not_nan. If not_nan we take it only once. + modbase_aligned = modbase_aligned.astype(str) + if len(modbase_aligned) != 0: + nan = modbase_aligned[modbase_aligned.mutationPositionOnPDB == 'nan'] + not_nan = modbase_aligned[modbase_aligned.mutationPositionOnPDB != 'nan'] + not_nan.score = not_nan.score.astype(float) + not_nan.sort_values(['datapoint', 'pdb_alignStatus', 'score'], ascending=[True, True, False], + inplace=True) + + not_nan = not_nan.sort_values(['datapoint', 'mutationPositionOnPDB', 'score'], + ascending=[True, True, False]) + not_nan = not_nan.drop_duplicates(['datapoint'], keep='first') + else: + nan = pd.DataFrame(columns=modbase_aligned.columns) + not_nan = pd.DataFrame(columns=modbase_aligned.columns) + modbase_aligned = None + which_ones_are_match = pd.concat([not_nan, nan]).drop_duplicates(['datapoint'], keep='first') + if len(which_ones_are_match) == 0: + which_ones_are_match = pd.DataFrame( + columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore', + 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', + 'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet', + 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite', + 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', + 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat', + 'topologicalDomain', 'caBinding', 'bindingSite', 'region', + 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', + 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', + 'disulfideBinary', 'intMetBinary', 'intramembraneBinary', + 'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary', + 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary', + 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary', + 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary', + 'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary', + 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary', + 'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary', + 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary', + 'glycosylationBinary', 'propeptideBinary', 'from', 'to', 'template', + 'chain', 'score', 'pdbID', 'pdbSequence', 'pdb_alignStatus', + 'mutationPositionOnPDB', 'domainStartonPDB', 'domainEndonPDB']) + modbase_match = which_ones_are_match[which_ones_are_match.mutationPositionOnPDB != 'nan'] + modbase_not_match = which_ones_are_match[which_ones_are_match.mutationPositionOnPDB == 'nan'] + + else: + modbase_match = which_ones_are_match[which_ones_are_match.mutationPositionOnPDB != 'nan'] + modbase_not_match = which_ones_are_match[which_ones_are_match.mutationPositionOnPDB == 'nan'] + + which_ones_are_match = None + modbase_match.score = modbase_match.score.astype('float') + modbase_match = modbase_match.sort_values(['datapoint', 'mutationPositionOnPDB', 'score'], + ascending=[True, True, False]) + modbase_match.drop_duplicates(['datapoint'], keep='first', inplace=True) + not_nan = None + nan = None + + # merge not_in_align and modbase_not_match as they were both excluded from modbase match. + + # No model + no_info = no_info[to_swiss_columns] + no_info = no_info.drop_duplicates() + + # Model present, no sequence + not_models = not_models[to_swiss_columns] + not_models = not_models.drop_duplicates() + + # Modbase model and sequence present, no match in PDB + modbase_not_match = modbase_not_match[to_swiss_columns] + modbase_not_match = modbase_not_match.drop_duplicates() + if len(not_in_aligned) != 0 and len(modbase_not_match) != 0 and len(no_info) != 0: + rest = pd.concat([not_in_aligned, modbase_not_match, no_info]) + elif len(not_in_aligned) != 0 and len(modbase_not_match) != 0 and len(no_info) == 0: + rest = pd.concat([not_in_aligned, modbase_not_match]) + elif len(not_in_aligned) == 0 and len(modbase_not_match) != 0 and len(no_info) != 0: + rest = pd.concat([modbase_not_match, no_info]) + elif len(not_in_aligned) != 0 and len(modbase_not_match) == 0 and len(no_info) != 0: + rest = pd.concat([not_in_aligned, no_info]) + elif len(not_in_aligned) != 0 and len(modbase_not_match) == 0 and len(no_info) == 0: + rest = not_in_aligned + elif len(not_in_aligned) == 0 and len(modbase_not_match) != 0 and len(no_info) == 0: + rest = modbase_not_match + elif len(not_in_aligned) == 0 and len(modbase_not_match) == 0 and len(no_info) != 0: + rest = no_info + else: + rest = pd.DataFrame( + columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore', + 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', + 'wt_sequence_match', 'whichIsoform', 'datapoint']) + + rest = rest[to_swiss_columns] + rest = rest.drop_duplicates() + + rest.reset_index(inplace=True) + rest.drop(['index'], axis=1, inplace=True) + rest = rest.astype('str') + + + else: + + modbase_match = pd.DataFrame( + columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore', + 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', + 'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet', + 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite', + 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', + 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat', + 'topologicalDomain', 'caBinding', 'bindingSite', 'region', + 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', + 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', + 'disulfideBinary', 'intMetBinary', 'intramembraneBinary', + 'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary', + 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary', + 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary', + 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary', + 'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary', + 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary', + 'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary', + 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary', + 'glycosylationBinary', 'propeptideBinary', 'from', 'to', 'template', + 'chain', 'score', 'pdbID', 'pdbSequence', 'pdb_alignStatus', + 'mutationPositionOnPDB', 'domainStartonPDB', 'domainEndonPDB']) + not_in_aligned = pd.DataFrame( + columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore', + 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', + 'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet', + 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite', + 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', + 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat', + 'topologicalDomain', 'caBinding', 'bindingSite', 'region', + 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', + 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'disulfide', + 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite', + 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', + 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat', + 'topologicalDomain', 'caBinding', 'bindingSite', 'region', + 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', + 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'from', + 'to', 'template', 'chain', 'score', 'pdbID', 'pdbSequence', 'fasta']) + no_info = pd.DataFrame( + columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore', + 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', + 'wt_sequence_match', 'whichIsoform', 'datapoint']) + rest = pd.DataFrame( + columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore', + 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', + 'wt_sequence_match', 'whichIsoform', 'datapoint']) + + rest = rest[to_swiss_columns] + rest = rest.drop_duplicates() + + rest.reset_index(inplace=True) + rest.drop(['index'], axis=1, inplace=True) + rest = rest.astype('str') + to_modbase_size = 0 + + print('Modbase matching is completed...\n') + print('SUMMARY') + print('-------') + print('%d data points that failed to match a UniProt Sequence are discarded.' % len( + not_match_in_uniprot.drop_duplicates(['datapoint']))) + print('Of the remaining %d:' % uniprot_matched_size) + print('--%d of %d successfully aligned with PDB structures.' % ( + len(pdb_aligned.drop_duplicates(['datapoint'])), with_pdb_size)) + print('--%d of %d successfully aligned with SwissModels structures.' % ( + len(swiss_match.drop_duplicates(['datapoint'])), to_swiss_size)) + print('--%d of %d successfully aligned with Modbase structures.\n' % ( + len(modbase_match.drop_duplicates(['datapoint'])), to_modbase_size)) + print('--Remaining %d not found to match any models.' % len(rest.drop_duplicates(['datapoint']))) + print('--A total of %d datapoints will not be evaluated.\n' % ( + len(rest.drop_duplicates(['datapoint'])) + len( + not_match_in_uniprot.drop_duplicates(['datapoint'])))) + + print('FOR CHECKING : ', + len(rest.drop_duplicates(['datapoint'])) + len( + not_match_in_uniprot.drop_duplicates(['datapoint'])) + len( + pdb_aligned.drop_duplicates(['datapoint'])) + len( + swiss_match.drop_duplicates(['datapoint'])) + len( + modbase_match.drop_duplicates(['datapoint'])) == data_size) + no_info = None + align = None + not_in_aligned = None + not_models = None + modbase_not_match = None + + # Final corrections + + # Now 3D alignment. + pdb = pdb_aligned.copy() + swiss = swiss_match.copy() + modbase = modbase_match.copy() + + pdb_aligned = None + swiss_match = None + modbase_match = None + + """ + WHAT DO WE HAVE NOW? + - uniprot sequence not found + - pdb aligned + - swiss aligned + - modbase aligned + - not aligned with anything (rest) + """ + + # Fix the axes and merge all data. + + pdb.drop(['pdbInfo'], axis=1, inplace=True) + pdb.rename(columns={'resolution': 'score'}, inplace=True) + swiss.rename(columns={'qmean_norm': 'score'}, inplace=True) + modbase.rename(columns={'qmean_norm': 'score'}, inplace=True) + + swiss = swiss[pdb.columns] + modbase = modbase[pdb.columns] + pdb['source'] = 'PDB' + swiss['source'] = 'SWISSMODEL' + modbase['source'] = 'MODBASE' + data = pd.concat([swiss, modbase, pdb]) + + data.reset_index(inplace=True) + data.drop(['index'], axis=1, inplace=True) + data = data.astype('str') + data_spare = pd.concat([not_match_in_uniprot, rest]) + not_match_in_uniprot = None + pdb = None + swiss = None + modbase = None + rest = None + + print('Generating FreeSASA files...') + print('------------------------------------\n') + # Folder to calculated RSA values. + + existing_free_sasa = list(Path(path_to_output_files / 'freesasa_files').glob("*")) + + existing_free_sasa = [str(i) for i in existing_free_sasa] + existing_free_sasa = [i.split('/')[-1].split('.')[0] for i in existing_free_sasa] + + print('Calculation RSA for PDB Structure Files...\n') + + pdb_only = data[data.source == 'PDB'] + for pdbID in pdb_only.pdbID.to_list(): + if pdbID not in existing_free_sasa: + (run_freesasa(Path(path_to_output_files / 'pdb_structures' / f'{pdbID.lower()}.pdb'), + Path(path_to_output_files / 'freesasa_files' / f'{pdbID.lower()}.txt'), + include_hetatms=True, + outdir=None, force_rerun=False, file_type='pdb')) + + print('Calculation RSA for SwissModel Files...\n') + swiss_only = data[data.source == 'SWISSMODEL'] + swiss_dp = [] + for i in swiss_only.index: + swiss_dp.append(swiss_only.at[i, 'uniprotID'] + '_' + swiss_only.at[i, 'pdbID'].lower() + '_' + str( + round(float(swiss_only.at[i, 'score']), 2))) + for pdbID in swiss_dp: + if pdbID not in existing_free_sasa: + (run_freesasa(Path(path_to_output_files / 'swissmodel_structures' / f'{pdbID}.txt'), + Path(path_to_output_files / 'freesasa_files' / f'{pdbID}.txt'), include_hetatms=True, + outdir=None, force_rerun=False, file_type='pdb')) + + print('Calculation RSA for Modbase Model Files...\n') + modbase_only = data[data.source == 'MODBASE'] + for pdbID in modbase_only.pdbID.to_list(): + if pdbID not in existing_free_sasa: + (run_freesasa(Path(path_to_output_files / 'modbase_structures_individual' / f'{pdbID.lower()}.txt'), + Path(path_to_output_files / 'freesasa_files' / f'{pdbID.lower()}.txt'), + include_hetatms=True, + outdir=None, force_rerun=False, file_type='pdb')) + + # This annotation list is different than the prev one, keep it. + + annotation_list += ['domainStartonPDB', 'domainEndonPDB'] + + folder_path = path_to_output_files / 'freesasa_files' + + aligner = Align.PairwiseAligner() + print('Proceeding to 3D distance calculation...\n') + + data.domainEndonPDB = data.domainEndonPDB.astype(str) + data.domainStartonPDB = data.domainStartonPDB.astype(str) + + existing_free_sasa = None + swiss_dp = None + pdb_only = None + swiss_only = None + modbase_only = None + data['uniprotSequence'] = data['uniprotSequence'].str.replace('U', 'C') + data['pdbSequence'] = data['pdbSequence'].str.replace('U', 'C') + for i in data.index: + id_ = data.at[i, 'pdbID'].lower() + up_id_ = data.at[i, 'uniprotID'] + score_ = str(data.at[i, 'score']) + if data.at[i, 'source'] == 'PDB': + pdb_path = Path(path_to_output_files / 'pdb_structures' / f'{id_}.pdb') + elif data.at[i, 'source'] == 'MODBASE': + pdb_path = Path(path_to_output_files / 'modbase_structures_individual' / f'{id_}.txt') + elif data.at[i, 'source'] == 'SWISSMODEL': + pdb_path = Path(path_to_output_files / 'swissmodel_structures' / f'{up_id_}_{id_}_{score_}.txt') + + pdbSequence = data.at[i, 'pdbSequence'] + source = data.at[i, 'source'] + chain = data.at[i, 'chain'] + uniprotID = data.at[i, 'uniprotID'] + pdbID = data.at[i, 'pdbID'] + alignments = get_alignments_3D(uniprotID, 'nan', pdb_path, pdbSequence, source, chain, pdbID, mode, + Path(path_to_output_files / '3D_alignment'), file_format='gzip') + mutPos = data.at[i, 'mutationPositionOnPDB'] + try: + coordMut = get_coords(mutPos, alignments, 'nan', 'nan', mode)[0] + except: + ValueError + coordMut = 'nan' + try: + sasa_pos = get_coords(mutPos, alignments, 'nan', 'nan', mode)[2] + data.at[i, 'sasa'] = sasa(data.at[i, 'source'], data.at[i, 'pdbID'], data.at[i, 'uniprotID'], + sasa_pos, data.at[i, 'wt'], mode, path_to_output_files, file_type='pdb') + except: + ValueError + data.at[i, 'sasa'] = 'nan' # mutation position is nan + for annot in annotation_list: + annotx = [] + try: + positions_of_annotations = data.at[i, annot].split(',') + for pos in positions_of_annotations: + pos = pos.strip().strip('\'').strip('[\'').strip('\']') + try: + if '-' not in pos: + pos = int(float(pos)) + coordAnnot = get_coords(pos, alignments, 'nan', 'nan', mode)[0] + try: + annotx.append(find_distance(coordMut, coordAnnot)) + except: + ValueError + + else: + for r in range(int(pos.split('-')[0]), int(pos.split('-')[1]) + 1): + coordAnnot = get_coords(r, alignments, 'nan', 'nan', mode)[0] + annotx.append(find_distance(coordMut, coordAnnot)) + except: + ValueError + try: + data.at[i, annot] = min([float(i) for i in annotx]) + except: + ValueError + data.at[i, annot] = 'nan' + + except: + ValueError + + if (str(data.at[i, 'domainStartonPDB']) == 'NaN' or str(data.at[i, 'domainStartonPDB']) == 'nan') and ( + str(data.at[i, 'domainEndonPDB']) != 'NaN' and str(data.at[i, 'domainEndonPDB']) != 'nan'): + data.at[i, 'domainStartonPDB'] = 100000 + elif (str(data.at[i, 'domainEndonPDB']) == 'NaN' or str(data.at[i, 'domainEndonPDB']) == 'nan') and ( + str(data.at[i, 'domainStartonPDB']) != 'NaN' and str(data.at[i, 'domainStartonPDB']) != 'nan'): + data.at[i, 'domainEndonPDB'] = 100000 + elif (str(data.at[i, 'domainStartonPDB']) == 'NaN' and str(data.at[i, 'domainEndonPDB']) == 'nan'): + data.at[i, 'domaindistance3D'] = 'nan' + + data.at[i, 'domaindistance3D'] = min(float(data.at[i, 'domainStartonPDB']), + float(data.at[i, 'domainEndonPDB'])) + data.at[i, 'domaindistance3D'] = min(float(data.at[i, 'domainStartonPDB']), + float(data.at[i, 'domainEndonPDB'])) + + data = data.astype(str) + data.replace({'NaN': 'nan'}, inplace=True) + + # Now unify all 3 separate data. We have with_pdb. The ones that have pdb structyres, swiss, modbase, the ones didnt match with ant and the ones didnt have wt seq match. + + # Get interface positions from ECLAIR. Download HQ human + print() + print('Assigning surface regions...') + print('------------------------------------\n') + + print('Extracting interface residues...\n') + data_interface = pd.read_csv(path_to_interfaces, sep='\t') + + positions = get_interface_positions(data_interface, 'P1', 'P2') + + interface_dataframe = pd.DataFrame() + + for key, val in positions.items(): + k = pd.Series((key, str(list(set(val))))) + interface_dataframe = interface_dataframe.append(k, ignore_index=True) + interface_dataframe.columns = ['uniprotID', 'positions'] + + if len(data) == 0: + data = pd.DataFrame( + columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore', + 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', + 'pdbSequence', 'wt_sequence_match', 'whichIsoform', 'pdbID', 'score', + 'chain', 'datapoint', 'disulfide', 'intMet', 'intramembrane', + 'naturalVariant', 'dnaBinding', 'activeSite', 'nucleotideBinding', + 'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis', + 'strand', 'helix', 'turn', 'metalBinding', 'repeat', + 'topologicalDomain', 'caBinding', 'bindingSite', 'region', + 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', + 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', + 'disulfideBinary', 'intMetBinary', 'intramembraneBinary', + 'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary', + 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary', + 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary', + 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary', + 'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary', + 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary', + 'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary', + 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary', + 'glycosylationBinary', 'propeptideBinary', 'pdb_alignStatus', + 'mutationPositionOnPDB', 'domainStartonPDB', 'domainEndonPDB', + 'source', 'sasa', 'domaindistance3D', 'threeState_trsh4_HQ', 'domain_fisher']) + else: + data.sasa = data.sasa.astype('str') + + for i in data.index: + if '*' in data.at[i, 'sasa']: + data.at[i, 'sasa'] = data.at[i, 'sasa'].split('*')[0] + + data.sasa = data.sasa.replace({'N/A': 'nan'}) + data.sasa = data.sasa.replace({'None': 'nan'}) + data.replace({' N/A': 'nan'}, inplace=True) + data.replace({'None': 'nan'}, inplace=True) + data.sasa = data.sasa.astype(float) + data = data.astype(str) + for i in data.index: + if float(data.at[i, 'sasa']) < 5: + data.at[i, 'trsh4'] = 'core' + elif float(data.at[i, 'sasa']) >= 5: + data.at[i, 'trsh4'] = 'surface' + elif data.at[i, 'sasa'] == 'nan': + data.at[i, 'trsh4'] = 'nan' + + data = data.merge(interface_dataframe, on='uniprotID', how='left') + data.positions = data.positions.astype('str') + for i in data.index: + if (str(data.at[i, 'pos']) in data.at[i, 'positions']) and data.at[i, 'trsh4'] == 'surface': + print((str(data.at[i, 'pos']) in data.at[i, 'positions'])) + data.at[i, 'threeState_trsh4_HQ'] = 'interface' + elif (str(data.at[i, 'pos']) not in data.at[i, 'positions']) and data.at[i, 'trsh4'] == 'surface': + data.at[i, 'threeState_trsh4_HQ'] = 'surface' + elif (str(data.at[i, 'pos']) not in data.at[i, 'positions']) and data.at[i, 'trsh4'] == 'core': + data.at[i, 'threeState_trsh4_HQ'] = 'core' + elif (str(data.at[i, 'pos']) in data.at[i, 'positions']) and data.at[i, 'trsh4'] == 'core': + data.at[i, 'threeState_trsh4_HQ'] = 'conflict' + elif data.at[i, 'trsh4'] == 'nan': + data.at[i, 'threeState_trsh4_HQ'] = 'nan' + + data.drop(['positions'], axis=1, inplace=True) + + # OPTIONAL + # DOMAIN SELECTION + # Next step: Delete all other domains with 'NULL.' R is capable of handling 53 categories. We will keep 52 most + # significant domains and 53th category will be NULL. + + fisherResult = pd.read_csv(fisher_path, sep='\t') + + significant_domains = fisherResult.domain.to_list() + for i in data.index: + if data.at[i, 'domain'] in significant_domains: + data.at[i, 'domain_fisher'] = data.at[i, 'domain'] + else: + data.at[i, 'domain_fisher'] = 'NULL' + + # Change the numbering for binary annotations and create 3 classes: + # nan--> 0, 0 -->1 and 1 -->2 + + print('Final adjustments are being done...\n') + binaryCols = ['disulfideBinary', 'intMetBinary', 'intramembraneBinary', 'naturalVariantBinary', + 'dnaBindingBinary', + 'activeSiteBinary', 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary', + 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary', + 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary', + 'repeatBinary', 'caBindingBinary', 'topologicalDomainBinary', + 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary', + 'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary', + 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary', + 'glycosylationBinary', 'propeptideBinary'] + data = data.astype(str) + data.replace({'NaN': 'nan'}, inplace=True) + for i in data.index: + for j in binaryCols: + data[j] = data[j].astype('str') + if (data.at[i, j] == '0') or (data.at[i, j] == '0.0'): + data.at[i, j] = '1' + elif data.at[i, j] == 'nan': + data.at[i, j] = '0' + elif (data.at[i, j] == '1') or (data.at[i, j] == '1.0'): + data.at[i, j] = '2' + + annotCols = ['disulfide', 'intMet', 'intramembrane', + 'naturalVariant', 'dnaBinding', 'activeSite', 'nucleotideBinding', + 'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis', + 'strand', 'helix', 'turn', 'metalBinding', 'repeat', 'caBinding', + 'topologicalDomain', 'bindingSite', 'region', 'signalPeptide', + 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', 'peptide', + 'transitPeptide', 'glycosylation', 'propeptide'] + + for i in data.index: + for annot in annotCols: + binaryName = str(annot) + 'Binary' + if data.at[i, binaryName] == '2': + data.at[i, annot] = '0.0' + data.replace({'100000': 'nan'}, inplace=True) + data = add_physicochemical(data) + data.rename( + columns={'uniprotID': 'prot_uniprotAcc', 'wt': 'wt_residue', 'pos': 'position', 'mut': 'mut_residue', + 'datapoint': 'meta_merged', 'datapoint_disease': 'meta-lab_merged', 'label': 'source_db', + 'family': 'prot_family', 'domain': 'domains_all', 'domain_fisher': 'domains_sig', + 'domaindistance3D': 'domains_3Ddist', 'threeState_trsh4_HQ': 'location_3state', + 'disulfideBinary': 'disulfide_bin', 'intMetBinary': 'intMet_bin', + 'intramembraneBinary': 'intramembrane_bin', + 'naturalVariantBinary': 'naturalVariant_bin', 'dnaBindingBinary': 'dnaBinding_bin', + 'activeSiteBinary': 'activeSite_bin', + 'nucleotideBindingBinary': 'nucleotideBinding_bin', 'lipidationBinary': 'lipidation_bin', + 'siteBinary': 'site_bin', + 'transmembraneBinary': 'transmembrane_bin', 'crosslinkBinary': 'crosslink_bin', + 'mutagenesisBinary': 'mutagenesis_bin', + 'strandBinary': 'strand_bin', 'helixBinary': 'helix_bin', 'turnBinary': 'turn_bin', + 'metalBindingBinary': 'metalBinding_bin', + 'repeatBinary': 'repeat_bin', 'topologicalDomainBinary': 'topologicalDomain_bin', + 'caBindingBinary': 'caBinding_bin', + 'bindingSiteBinary': 'bindingSite_bin', 'regionBinary': 'region_bin', + 'signalPeptideBinary': 'signalPeptide_bin', + 'modifiedResidueBinary': 'modifiedResidue_bin', 'zincFingerBinary': 'zincFinger_bin', + 'motifBinary': 'motif_bin', + 'coiledCoilBinary': 'coiledCoil_bin', 'peptideBinary': 'peptide_bin', + 'transitPeptideBinary': 'transitPeptide_bin', + 'glycosylationBinary': 'glycosylation_bin', 'propeptideBinary': 'propeptide_bin', + 'disulfide': 'disulfide_dist', 'intMet': 'intMet_dist', + 'intramembrane': 'intramembrane_dist', 'naturalVariant': 'naturalVariant_dist', + 'dnaBinding': 'dnaBinding_dist', 'activeSite': 'activeSite_dist', + 'nucleotideBinding': 'nucleotideBinding_dist', 'lipidation': 'lipidation_dist', + 'site': 'site_dist', + 'transmembrane': 'transmembrane_dist', 'crosslink': 'crosslink_dist', + 'mutagenesis': 'mutagenesis_dist', 'strand': 'strand_dist', 'helix': 'helix_dist', + 'turn': 'turn_dist', + 'metalBinding': 'metalBinding_dist', 'repeat': 'repeat_dist', + 'topologicalDomain': 'topologicalDomain_dist', 'caBinding': 'caBinding_dist', + 'bindingSite': 'bindingSite_dist', 'region': 'region_dist', + 'signalPeptide': 'signalPeptide_dist', 'modifiedResidue': 'modifiedResidue_dist', + 'zincFinger': 'zincFinger_dist', 'motif': 'motif_dist', 'coiledCoil': 'coiledCoil_dist', + 'peptide': 'peptide_dist', 'transitPeptide': 'transitPeptide_dist', + 'glycosylation': 'glycosylation_dist', 'propeptide': 'propeptide_dist'}, inplace=True) + + data = data[ + ['prot_uniprotAcc', 'wt_residue', 'mut_residue', 'position', 'meta_merged', 'composition', 'polarity', + 'volume', + 'granthamScore', 'domains_all', + 'domains_sig', 'domains_3Ddist', 'sasa', 'location_3state', 'disulfide_bin', 'intMet_bin', + 'intramembrane_bin', 'naturalVariant_bin', 'dnaBinding_bin', + 'activeSite_bin', 'nucleotideBinding_bin', 'lipidation_bin', 'site_bin', + 'transmembrane_bin', 'crosslink_bin', 'mutagenesis_bin', 'strand_bin', + 'helix_bin', 'turn_bin', 'metalBinding_bin', 'repeat_bin', + 'caBinding_bin', 'topologicalDomain_bin', 'bindingSite_bin', + 'region_bin', 'signalPeptide_bin', 'modifiedResidue_bin', + 'zincFinger_bin', 'motif_bin', 'coiledCoil_bin', 'peptide_bin', + 'transitPeptide_bin', 'glycosylation_bin', 'propeptide_bin', 'disulfide_dist', 'intMet_dist', + 'intramembrane_dist', + 'naturalVariant_dist', 'dnaBinding_dist', 'activeSite_dist', + 'nucleotideBinding_dist', 'lipidation_dist', 'site_dist', + 'transmembrane_dist', 'crosslink_dist', 'mutagenesis_dist', + 'strand_dist', 'helix_dist', 'turn_dist', 'metalBinding_dist', + 'repeat_dist', 'caBinding_dist', 'topologicalDomain_dist', + 'bindingSite_dist', 'region_dist', 'signalPeptide_dist', + 'modifiedResidue_dist', 'zincFinger_dist', 'motif_dist', + 'coiledCoil_dist', 'peptide_dist', 'transitPeptide_dist', + 'glycosylation_dist', 'propeptide_dist']] + + ready = data.copy() + # Imputation + if (impute == 'True') or (impute == 'true') or (impute == True): + filler = [17.84, 30.8, 24.96, 13.12, 23.62, 18.97, 20.87, 29.59, 20.7, 12.7, 22.85, 17.21, 9.8, 9, + 15.99, 16.82, + 20.46, 24.58, 9.99, 17.43, 20.08, 30.91, 20.86, 22.14, 21.91, 28.45, 17.81, 25.12, 20.33, + 22.36] + col_index = 0 + for col_ in ready.columns[-30:]: + ready[col_] = ready[col_].fillna(filler[col_index]) + ready[col_] = ready[col_].replace({'nan': filler[col_index]}) + col_index += 1 + ready['domains_3Ddist'] = ready['domains_3Ddist'].fillna(24.5) + ready['sasa'] = ready['sasa'].fillna(29.5) + ready['location_3state'] = ready['location_3state'].fillna('unknown') + elif (impute == 'False') or (impute == 'false') or (impute == False): + pass + ready = ready.replace({'nan': np.NaN}) + ready = ready.astype(str) + ready.to_csv(path_to_output_files / 'featurevector_pdb.txt', sep='\t', index=False) + if len(ready) == 0: + print( + 'No feature vector could be produced for input data. Please check the presence of a structure for the input proteins.') + #st.write(ready) + print('Feature vector successfully created...') + end = timer() + hours, rem = divmod(end - start, 3600) + minutes, seconds = divmod(rem, 60) + print("Time passed: {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds)) + + return ready + + except: + AttributeError + +