diff --git "a/code/pdb_featureVector.py" "b/code/pdb_featureVector.py" --- "a/code/pdb_featureVector.py" +++ "b/code/pdb_featureVector.py" @@ -19,22 +19,20 @@ import os, glob import math import ssbio import ssl +import numpy as np from Bio.Align import substitution_matrices from Bio.PDB.Polypeptide import * from Bio.PDB import PDBList from Bio import Align from Bio import SeqIO from Bio.PDB import * -from Bio.PDB import PDBParser, PPBuilder warnings.filterwarnings("ignore") start = timer() -import streamlit as st -# FUNCTIONS - # FUNCTIONS from calc_pc_property import * from add_domains import * +from retrieveUniprotSequences import * from add_annotations import * from add_sequence import * from add_structure import * @@ -45,1647 +43,420 @@ from add_sasa import * from standard import * from add_interface_pos import * from standard import * +from utils import * +from pdbMapping import * from uniprotSequenceMatch import uniprotSequenceMatch from process_input import clean_data +from urllib.error import HTTPError +from swissModelAdd import * +from modbaseModelAdd import * +import streamlit as st def pdb(input_set, mode, impute): - aligner = Align.PairwiseAligner() - """ - STEP 1 - Get input data as a console input. - Add datapoint identifier and remove non-standard input. - """ - data = clean_data(input_set) - path_to_input_files, path_to_output_files, path_to_domains, fisher_path, path_to_interfaces, buffer = manage_files( - mode) - out_path = path_to_output_files / 'log.txt' - print('Creating directories...') - annotation_list = ['disulfide', 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite', - 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis', 'strand', - 'helix', 'turn', 'metalBinding', 'repeat', 'topologicalDomain', 'caBinding', 'bindingSite', - 'region', - 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', 'peptide', - 'transitPeptide', 'glycosylation', 'propeptide'] - - print('Feature vector generation started...\n') + # Fill empty dataframes with SIMPLE_COLS + SIMPLE_COLS = ['uniprotID', 'wt', 'pos', 'mut', 'datapoint', 'composition', 'polarity', + 'volume', 'granthamScore', 'domain', 'domStart', 'domEnd', 'distance', + 'intMet', 'naturalVariant', 'activeSite', 'crosslink', 'mutagenesis', + 'strand', 'helix', 'turn', 'region', 'modifiedResidue', 'motif', + 'metalBinding', 'lipidation', 'glycosylation', 'topologicalDomain', + 'nucleotideBinding', 'bindingSite', 'transmembrane', 'transitPeptide', + 'repeat', 'site', 'peptide', 'signalPeptide', 'disulfide', 'coiledCoil', + 'intramembrane', 'zincFinger', 'caBinding', 'propeptide', 'dnaBinding', + 'disulfideBinary', 'intMetBinary', 'intramembraneBinary', + 'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary', + 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary', + 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary', + 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary', + 'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary', + 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary', + 'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary', + 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary', + 'glycosylationBinary', 'propeptideBinary'] + + UNIPROT_ANNOTATION_COLS = SIMPLE_COLS[-60:] + + + path_to_input_files, path_to_output_files, path_to_domains, fisher_path, path_to_interfaces, buffer = manage_files(mode) + out_path = path_to_output_files / 'log.txt' + #sys.stdout = open(out_path, 'w') + data = clean_data(input_set) + data = add_uniprot_sequence(data) + match = data[(data.wt_sequence_match == 'm')] + iso = data[(data.wt_sequence_match == 'i')] + noMatch = data[(data.wt_sequence_match != 'm') & (data.wt_sequence_match != 'i')] if len(data) == 0: - print('Feature vectore generation terminated.') + st.write('Feature vectore generation terminated. Please enter a query.') else: - """ - STEP 2 - Add physicochemical properties. - """ - print('Adding physicochemical properties...\n') - - data = add_physicochemical(data) - - """ - STEP 3 - Add domain-related information. - """ - print('Adding domains\n') - - data = add_domains(data, path_to_domains) - - data = data.astype(str) - data = data.replace({'NaN': 'nan'}) - data.domain = data.domain.replace({'nan': '-1'}) - data.domStart = data.domStart.replace({'nan': '-1'}) - data.domEnd = data.domEnd.replace({'nan': '-1'}) - data.distance = data.distance.replace({'nan': '-1'}) - """ - STEP 4 - Retrieve canonical and isoform UniProt sequences. - Add to the data frame. - """ - print('Retrieving UniProt sequences...\n') - - canonical_fasta = pd.DataFrame(columns=['uniprotID', 'uniprotSequence']) - up_list = list(set(data['uniprotID'].to_list())) - for i in range(len(up_list)): - canonical_fasta.at[i, 'uniprotSequence'] = get_uniprot_seq(up_list[i]) - canonical_fasta.at[i, 'uniprotID'] = up_list[i] - - canonical_fasta = canonical_fasta.drop_duplicates() - isoform_fasta = pd.DataFrame(columns=['uniprotID', 'isoformSequence']) - iso_dict = [] - for i in range(len(up_list)): - iso_dict.append(get_isoforms(up_list[i])) - - index = 0 - for i in iso_dict: - for key, val in i.items(): - isoform_fasta.at[index, 'uniprotID'] = key - isoform_fasta.at[index, 'isoformSequence'] = val - index += 1 - isoform_fasta = isoform_fasta.drop_duplicates() - - for i in isoform_fasta.index: - isoform_fasta.at[i, 'whichIsoform'] = isoform_fasta.at[i, 'uniprotID'][7:10].strip() - isoform_fasta.at[i, 'uniprotID'] = isoform_fasta.at[i, 'uniprotID'][0:6] - print('Sequence files created...\n') - - data = data.merge(canonical_fasta, on='uniprotID', how='left') - data = data.astype(str) - data['whichIsoform'] = 'nan' - data.replace({'': 'nan'}, inplace=True) - data['wt_sequence_match'] = '' - for i in data.index: - if len(data.at[i, 'uniprotSequence']) >= int(data.at[i, 'pos']): - wt = data.at[i, 'wt'] - can = str(data.at[i, 'uniprotSequence'])[int(data.at[i, 'pos']) - 1] - if wt == can: - data.at[i, 'wt_sequence_match'] = 'm' - elif wt != can: - isoList = isoform_fasta[ - isoform_fasta['uniprotID'] == data.at[i, 'uniprotID']].isoformSequence.to_list() - for k in isoList: - if len(k) >= int(data.at[i, 'pos']): - resInIso = k[int(int(data.at[i, 'pos']) - 1)] - if wt == resInIso: - whichIsoform = isoform_fasta[isoform_fasta.isoformSequence == k].whichIsoform.to_list()[ - 0] - data.at[i, 'wt_sequence_match'] = 'i' - data.at[i, 'whichIsoform'] = whichIsoform - break - - elif len(data.at[i, 'uniprotSequence']) < int(data.at[i, 'pos']): - isoList = isoform_fasta[isoform_fasta['uniprotID'] == data.at[i, 'uniprotID']].isoformSequence.to_list() - for k in isoList: - if len(k) >= int(data.at[i, 'pos']): - resInIso = k[int(int(data.at[i, 'pos']) - 1)] - wt = data.at[i, 'wt'] - if wt == resInIso: - whichIsoform = isoform_fasta[isoform_fasta.isoformSequence == k].whichIsoform.to_list()[0] - data.at[i, 'wt_sequence_match'] = 'i' - data.at[i, 'whichIsoform'] = whichIsoform - break - - data.wt_sequence_match = data.wt_sequence_match.astype('str') - data.replace({'': 'nan'}, inplace=True) - - data_size = len(data.drop_duplicates(['datapoint'])) - not_match_in_uniprot = data[(data.uniprotSequence == 'nan') | (data.wt_sequence_match == 'nan')] - uniprot_matched = data[(data.uniprotSequence != 'nan') & (data.wt_sequence_match != 'nan')] - - for dp in data.index: - if (data.at[dp, 'wt_sequence_match'] == 'nan') & (data.at[dp, 'whichIsoform'] == 'nan'): - up_ = data.at[dp, 'uniprotID'] - pos_ = data.at[dp, 'pos'] - st.write(f'Wrong input aminoacid for {up_} at position {pos_}') - data = None - - print('You have %d data points that failed to match a UniProt Sequence\nProceeding with %d remaining...\n' - % (len(not_match_in_uniprot.drop_duplicates(['datapoint'])), - len(uniprot_matched.drop_duplicates(['datapoint'])))) - - """ - STEP 5 - Retrieve related PDB sequences, extract their sequences. - Add to the data frame. - """ - from urllib.error import HTTPError - pdb_fasta = pd.DataFrame(columns=['pdbID', 'chain', 'pdbSequence']) - pdb_info = pd.DataFrame(columns=['uniprotID', 'pdbID', 'chain', 'resolution']) - - print('Retrieving PDB structures...\n') - pdbs = [] - protein = uniprot_matched.uniprotID.to_list() - protein = list(set(protein)) - - for prot in protein: - pdbs.append(get_pdb_ids(prot)) - if len(pdbs) >= 1: - pdbs = [item for sublist in pdbs for item in sublist] - + if len(noMatch) == len(data) : + st.write('Aminoacid at the position could not be mapped to canonical or isoform sequence. Please check the input amino acid.') + elif len(noMatch) > 0: + st.write( + f'{len(noMatch)} of {len(data)} datapoints has not been mapped to any sequence. These datapoints are omitted.') + if len(iso) > 0: + st.write(f'{len(iso)} of {len(data)} datapoints has been mapped to isoform sequences. These datapoints are omitted.') + if len(match) == 0: + st.write('Feature generation terminated due to failed mapping of input amino acid to UniProt sequence.') else: - pdbs = [] - print('Processing PDB structures...\n') - if pdbs == []: - print('No PDB structure found for the query. ') - print('Starting PDB structures download...\n') - pdbs = list(filter(None, pdbs)) - pdbs = (set(pdbs)) - pdbs = [i.lower() for i in pdbs] - pdbl = PDBList() - parser = PDBParser() - index = 0 - - try: - shutil.rmtree('obsolete') - except OSError as e: - pass - - cnt = 0 - pdbs = [i.upper() for i in pdbs] - def fetch_uniprot_ids(pdb_code): - response = requests.get(f"https://www.ebi.ac.uk/pdbe/api/mappings/uniprot/{pdb_code}") + st.write(f'{len(match)} of {len(data)} datapoints has been mapped to canonical sequences. Proceeding with these datapoins.') + if (len(iso) != 0) | (len(noMatch) != 0): + st.write('Omitted datapoints are:', noMatch.datapoint.to_list() + iso.datapoint.to_list()) + st.write('\n') + st.write('Check log file for updates.') - response.raise_for_status() # Check for a successful response - data = response.json() - - return list(list(list(data.values())[0].values())[0].keys()) - for search in pdbs: - # Step 1: Fetch the PDB file - pdb_url = f"https://files.rcsb.org/download/{search}.pdb" - - try: - response = requests.get(pdb_url) - response.raise_for_status() # Check for a successful response - except : - continue # Skip to the next PDB code if fetching fails + data = match[['uniprotID', 'wt', 'pos', 'mut', 'datapoint']] + print('>> Feature vector generation started...\n') + print('\n>> Creating directories...') + print('\n>> Adding physicochemical properties...\n') + data = add_physicochemical(data) + print('\n>> Adding domains\n') + data = add_domains(data, path_to_domains) + print('\n>> Adding sequence annotations...\n') + data = add_annotations(data) + print('\n>> Retrieving PDB structure information...\n') + pdb_info = addPDBinfo(data, path_to_output_files) + if len(pdb_info) != 0: + data = pd.merge(data, pdb_info, on='uniprotID', how='left') + # Spare datapoint if there is no associated PDB. + no_pdb = data[data.pdbID.isna()].drop_duplicates() + pdb = data[~data.pdbID.isna()].drop_duplicates() + # Spare datapoint if associated PDB does not cover mutated area. + pdb.pos = pdb.pos.apply(lambda x:int(x)) + pdb.start = pdb.start.apply(lambda x: int(x)) + pdb.end = pdb.end.apply(lambda x: int(x)) + no_pdb_add = pdb[~((pdb.pos > pdb.start) & (pdb.pos < pdb.end))] + + pdb = pdb[(pdb.pos > pdb.start) & (pdb.pos < pdb.end)] # do not change order + + pdb.reset_index(drop=True, inplace=True) + # Delete spared datapoint from no_pdb list if it has any other PDB that spans the mutated area. + no_pdb_add = no_pdb_add[~no_pdb_add.datapoint.isin(pdb.datapoint.to_list())] + # Final collection of datapoints without PDB associaton. + no_pdb = pd.concat([no_pdb, no_pdb_add]) + no_pdb = no_pdb[SIMPLE_COLS] + no_pdb = no_pdb.drop_duplicates() + + pdb = pdb.sort_values(['uniprotID', 'resolution'], axis=0, ascending=True) + pdb.reset_index(drop=True, inplace=True) + pdb.fillna(np.NaN, inplace=True) + # Get position mapping from added structures + print('\n>> Adding structure residue positions...\n') + if len(pdb) > 0: # there are mapped structures, and some of them span the mutated area. + pdb.replace({'[]': np.NaN, 'nan-nan': np.NaN, '': np.NaN}, inplace=True) + pdb = pdbMapping(pdb, Path(path_to_output_files / 'pdb_structures')) + pdb.reset_index(drop=True, inplace=True) + pdb = pdb.fillna(np.NaN) + no_pdb_add_ = pdb[pdb.AAonPDB.isna()] + no_pdb_add = pdb[pdb.MATCHDICT.isna()] + no_pdb = pd.concat([no_pdb_add_, no_pdb, no_pdb_add]) + no_pdb.reset_index(inplace=True, drop=True) + pdb = pdb[~(pdb.MATCHDICT.isna())] + pdb = pdb[~(pdb.AAonPDB.isna())] + if len(pdb) > 0: + print('\n>> Mapping to PDB residues...\n') + pdb = changeUPtoPDB(pdb) + pdb.reset_index(drop=True, inplace=True) + print('\n>> Calculating 3D distances for PDB structures...\n') + pdb = isZeroDistance(pdb) + pdb = processFile(pdb, path_to_output_files) + pdb = match3D(pdb) + pdb = selectMaxAnnot(pdb) + pdb = pdb.sort_values(by=['datapoint', 'resolution', 'annotTotal'], ascending=[True, True, True]) + pdb = pdb.drop_duplicates(['datapoint']) + pdb.replace({'[]': np.NaN, 'hit':0.0}, inplace=True) + print('\n>> PDB matching is completed...\n') + else: + # There was no residue match in the associated PDB. So we cannot use PDB data. + pdb = pdb[SIMPLE_COLS] + print('\n>>> No PDB structure could be matched.') - # Step 2: Parse the PDB file from memory - pdb_data = response.text - pdb_parser = PDBParser(QUIET=True) # QUIET=True suppresses warnings - pdb_file_content = StringIO(pdb_data) - structure = pdb_parser.get_structure(search, pdb_file_content) - ppb = PPBuilder() - pdb_data_list = pdb_data.split('\n') - pdb_data_list_sequence = [i for i in pdb_data_list if i.startswith('SEQRES')] - pdb_data_list_sequence = [ list(filter(None,i.split(' '))) for i in pdb_data_list_sequence] - seqs = {} - for i in pdb_data_list_sequence: - if i[2] in seqs.keys(): - seqs[i[2]] += i[4:] else: - seqs[i[2]] = i[4:] - - for key, val in seqs.items(): - seqs[key] = ''.join([threeToOne(i) for i in val]) - pdb_data_list = [i for i in pdb_data_list if i.startswith('DBREF')] - pdb_data_list = [[list(filter(None,i.split(' '))) for j in i.split(' ') if j == 'UNP'] for i in pdb_data_list] - pdb_data_list = [i for i in pdb_data_list if i != []] - pdb_data_list_uniprot = [[j[6] for j in i] for i in pdb_data_list] - - - #pdb_data_list = [[list(filter(None,j)) for j in i] for i in pdb_data_list] - pdb_data_list = [[j[2] for j in i] for i in pdb_data_list] - pdb_data_list = [i[0] for i in pdb_data_list] - for model in structure: - for pp in ppb.build_peptides(model): - sequence = pp.get_sequence() - - for chain, up in zip(model,pdb_data_list_uniprot ): - chain_id = chain.get_id() - # Extract UniProt ID if available in the chain's annotations - uniprot_ids = fetch_uniprot_ids(search) - # Get the resolution from the PDB header - header = structure.header - resolution = header.get('resolution', 'N/A') - if chain_id in pdb_data_list: - # Print UniProt IDs, chain ID, and resolution for the current model - chain_id = chain.get_id() - - pdb_fasta.at[index, 'pdbID'] = search - pdb_fasta.at[index, 'chain'] = chain_id - pdb_fasta.at[index, 'pdbSequence'] = str(seqs[chain_id]) - pdb_info.at[index, 'uniprotID'] = ', '.join(up) - pdb_info.at[index, 'pdbID'] = search - pdb_info.at[index, 'chain'] = chain_id - pdb_info.at[index, 'resolution'] = resolution - index += 1 - - print('PDB file processing finished..') - for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")): - try: - filename_replace_ext = filename.with_suffix(".pdb") - filename.rename(filename_replace_ext) - except: - FileNotFoundError - - for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")): - try: - if filename.stem.startswith("pdb"): - filename_replace_ext = filename.with_name(filename.stem[3:]) - filename.rename(filename_replace_ext.with_suffix('.pdb')) - except: - FileNotFoundError - uniprot_matched = pd.merge(uniprot_matched, pdb_info, on='uniprotID', how='left') - uniprot_matched = uniprot_matched.astype(str) - uniprot_matched = uniprot_matched.drop_duplicates() - uniprot_matched = uniprot_matched.merge(pdb_fasta, on=['pdbID', 'chain'], how='left') - uniprot_matched = uniprot_matched.astype(str) - - with_pdb = uniprot_matched[(uniprot_matched.pdbID != 'nan') & ( - (uniprot_matched.resolution != 'nan') & (uniprot_matched.resolution != 'OT') & ( - uniprot_matched.resolution != 'None'))].drop_duplicates() - no_pdb = uniprot_matched[(uniprot_matched.pdbID == 'nan') | ( - (uniprot_matched.resolution == 'nan') | (uniprot_matched.resolution == 'OT') | ( - uniprot_matched.resolution == 'None'))] - no_pdb = no_pdb[~no_pdb.datapoint.isin(with_pdb.datapoint.to_list())] - no_pdb.drop(columns=['chain', 'pdbID', 'pdbSequence', 'resolution'], inplace=True) - print( - 'PDB Information successfully added...\nPDB structures are found for %d of %d.\n%d of %d failed to match with PDB structure.\n' - % (len(with_pdb.drop_duplicates(['datapoint'])), len(uniprot_matched.drop_duplicates(['datapoint'])), - len(no_pdb.drop_duplicates(['datapoint'])), len(uniprot_matched.drop_duplicates(['datapoint'])))) - - with_pdb = with_pdb.sort_values(['uniprotID', 'resolution'], axis=0, ascending=True) - with_pdb = with_pdb.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'pdbSequence'], keep='first') - with_pdb.replace({'': 'nan'}, inplace=True) - - if len(with_pdb) == 0: - with_pdb['pdbInfo'] = '' - else: - for i in with_pdb.index: - try: - res = str(with_pdb.at[i, 'resolution']) - chain = with_pdb.at[i, 'chain'] - new = with_pdb.at[i, 'pdbID'] + ':' + chain + ':' + res - with_pdb.at[i, 'pdbInfo'] = new - except: - TypeError - with_pdb.at[i, 'pdbInfo'] = 'nan' - - with_pdb = with_pdb[['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore', - 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', 'pdbSequence', - 'wt_sequence_match', - 'whichIsoform', 'pdbID', 'resolution', 'chain', 'pdbInfo', 'datapoint']] - - # If the query data points are found in no_match_in_uniprot data frame, it will not give any results. - # If the query data points are found in no_pdb data frame, it will be searched in the modbase and swiss_model steps. - # If the query data points are found in with_pdb data frame, it will be searched in the following steps. - - """ - STEP 6 - Retrieve sequence annotations. - Add to the data frame. - """ - - if len(with_pdb) > 0: - with_pdb = add_annotations(with_pdb) - else: - new_cols = with_pdb.columns.to_list() + ['disulfide', 'intMet', 'intramembrane', 'naturalVariant', - 'dnaBinding', - 'activeSite', - 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', - 'crosslink', 'mutagenesis', 'strand', - 'helix', 'turn', 'metalBinding', 'repeat', 'topologicalDomain', - 'caBinding', 'bindingSite', 'region', - 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', - 'coiledCoil', 'peptide', - 'transitPeptide', 'glycosylation', 'propeptide', 'disulfideBinary', - 'intMetBinary', 'intramembraneBinary', - 'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary', - 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary', - 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary', - 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary', - 'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary', - 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary', - 'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary', - 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary', - 'glycosylationBinary', 'propeptideBinary'] - with_pdb = pd.DataFrame(columns=new_cols) - try: - with_pdb.whichIsoform = with_pdb.whichIsoform.astype('str') - except: - AttributeError - with_pdb['whichIsoform'] = '' - - with_pdb = with_pdb.astype(str) - with_pdb = with_pdb.replace({'NaN': 'nan'}) - with_pdb.replace({'[]': 'nan'}, inplace=True) - with_pdb.replace({'nan-nan': 'nan'}, inplace=True) - with_pdb.replace({'': 'nan'}, inplace=True) - - """ - STEP 7 - Do alignment for PDB - """ - # Canonical matches, i.e. labelled as m, canonical sequences will be aligned with PDB sequences. - # Isoform matches, i.e. labelled as i, isoform sequences will be aligned with PDB sequences. - with_pdb['uniprotSequence'] = with_pdb['uniprotSequence'].str.replace('U', 'C') - with_pdb['pdbSequence'] = with_pdb['pdbSequence'].str.replace('U', 'C') - - dfM = with_pdb[with_pdb.wt_sequence_match == 'm'] - dfM = dfM.sort_values(['uniprotID', 'resolution'], axis=0, ascending=True) - dfM = dfM.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'pdbSequence'], keep='first') - - dfNM = with_pdb[with_pdb.wt_sequence_match == 'i'] - dfNM = dfNM.sort_values(['uniprotID', 'resolution'], axis=0, ascending=True) - dfNM = dfNM.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'pdbSequence'], keep='first') - dfNM.rename(columns={'isoformSequence': 'uniprotSequence'}, inplace=True) - - dfM = dfM.astype(str) - dfNM = dfNM.astype(str) - - dfM.reset_index(inplace=True) - dfM.drop(['index'], axis=1, inplace=True) - dfNM.reset_index(inplace=True) - dfNM.drop(['index'], axis=1, inplace=True) - - uniprot_matched_size = len(uniprot_matched.drop_duplicates(['datapoint'])) - uniprot_matched = None - pdb_fasta = None - pdb_info = None - pdbs = None - - existing_pdb = None - with_pdb_size = len(with_pdb.drop_duplicates(['datapoint'])) - with_pdb = None - + pdb = pdb[SIMPLE_COLS] + print('\n>>> No PDB structure could be matched.') + - print('Aligning sequences...\n') - aligned_m = final_stage(dfM, annotation_list, Path(path_to_output_files / 'alignment_files')) - - aligned_nm = final_stage(dfNM, annotation_list, Path(path_to_output_files / 'alignment_files')) - - - - # When PDB sequence is nan, it is wrongly aligned to the UniProt sequence. Fix them. - for i in aligned_m.index: - if aligned_m.at[i, 'pdbSequence'] == 'nan': - aligned_m.at[i, 'mutationPositionOnPDB'] = 'nan' - aligned_m.at[i, 'domainStartonPDB'] = 'nan' - aligned_m.at[i, 'domainEndonPDB'] = 'nan' - aligned_m.at[i, 'pdb_alignStatus'] = 'nan' - - for i in aligned_nm.index: - if aligned_nm.at[i, 'pdbSequence'] == 'nan': - aligned_nm.at[i, 'mutationPositionOnPDB'] = 'nan' - aligned_nm.at[i, 'domainStartonPDB'] = 'nan' - aligned_nm.at[i, 'domainEndonPDB'] = 'nan' - aligned_nm.at[i, 'pdb_alignStatus'] = 'nan' - - # Check if they the same column name before merging. - aligned_m = aligned_m.astype(str) - aligned_nm = aligned_nm.astype(str) - - frames = [aligned_m, aligned_nm] - after_up_pdb_alignment = pd.concat(frames, sort=False) - if len(after_up_pdb_alignment) == 0: - after_up_pdb_alignment['pdb_alignStatus'] = '' - after_up_pdb_alignment['mutationPositionOnPDB'] = '' - after_up_pdb_alignment['domainStartonPDB'] = '' - after_up_pdb_alignment['domainEndonPDB'] = '' - - after_up_pdb_alignment = after_up_pdb_alignment.sort_values( - by=['uniprotID', 'wt', 'mut', 'pos', 'pdb_alignStatus', 'resolution', 'chain'], - ascending=[True, True, True, True, True, True, True]) - - after_up_pdb_alignment = after_up_pdb_alignment.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos'], keep='first') - - after_up_pdb_alignment = after_up_pdb_alignment.astype('str') - - pdb_aligned = after_up_pdb_alignment[ - (after_up_pdb_alignment.pdbID != 'nan') & (after_up_pdb_alignment.mutationPositionOnPDB != 'nan')] - yes_pdb_no_match = after_up_pdb_alignment[ - (after_up_pdb_alignment.pdbID != 'nan') & (after_up_pdb_alignment.mutationPositionOnPDB == 'nan')] - no_pdb = no_pdb.copy() - - - print('PDB matching is completed...\n') - print('SUMMARY') - print('-------') - print('%d data points that failed to match a UniProt Sequence are discarded.' % len( - not_match_in_uniprot.drop_duplicates(['datapoint']))) - print('Of the remaining %d:' % uniprot_matched_size) - print('--%d of %d successfully aligned with PDB structures.' % ( - len(pdb_aligned.drop_duplicates(['datapoint'])), with_pdb_size)) - print('--%d of %d not found on the covered area by the structure.' % ( - len(yes_pdb_no_match.drop_duplicates(['datapoint'])), with_pdb_size)) - print('--PDB structures not found for %d datapoints.' % len(no_pdb.drop_duplicates(['datapoint']))) - print('--%d will be searched in Swiss-Model database.\n' % ( - len(yes_pdb_no_match.drop_duplicates(['datapoint'])) + len(no_pdb.drop_duplicates(['datapoint'])))) - - dfM = None - dfNM = None - aligned_nm = None - aligned_m = None - after_up_pdb_alignment = None - - print('Proceeding to SwissModel search...') - print('------------------------------------\n') - # At this point we have 4 dataframes - # 1. after_up_pdb_alignment --- This is after PDB sequence alignment. There may be mutations that wasnt found matching to after the alignment. Will be searched in other databases as well. - # 1a. aligned --- we are done with this. - # 1b. yes_pdb_no_match --- They have PDB structures but not matched, so will be searched in the other databases. - # 2. not_match_in_uniprot --- This wont be aligned with anything because these proteins dont have a uniprot ID. Only basic info is present. - # 3. no_pdb --- No PDB structures were found for them. Will be searched in other databases. - - """ - Step 8 - Neutralize data points that are to be searched in Swiss-Model - # One point is that yes_pdb_no_match's annotations are the adjusted according to the PDBs they are matched before. - # They need to be converted to their old original UniProt annotation positions. - """ - yes_pdb_no_match.drop(['disulfide', 'intMet', - 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite', - 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', - 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat', - 'caBinding', 'topologicalDomain', 'bindingSite', 'region', - 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', - 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'disulfideBinary', - 'intMetBinary', 'intramembraneBinary', - 'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary', - 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary', - 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary', - 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary', - 'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary', - 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary', - 'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary', - 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary', - 'glycosylationBinary', 'propeptideBinary', 'pdbSequence', 'pdbInfo', 'pdbID', - 'chain', 'resolution', 'pdb_alignStatus', 'mutationPositionOnPDB', - 'domainStartonPDB', 'domainEndonPDB'], axis=1, inplace=True) - - to_swiss = pd.concat([yes_pdb_no_match.drop_duplicates(['datapoint']), no_pdb.drop_duplicates(['datapoint'])]) - no_pdb = None - to_swiss.reset_index(inplace=True) - to_swiss.drop(['index'], axis=1, inplace=True) - to_swiss = to_swiss.astype('str') - to_swiss = to_swiss.replace({'NaN': 'nan'}) - # Create model summary dataframe. - if len(to_swiss) != 0: - print('Generating SwissModel file...\n') - - swiss_model = pd.read_csv(Path(path_to_input_files / 'swissmodel_structures.txt'), sep='\t', - dtype=str, header=None, skiprows=1, - names=['UniProtKB_ac', 'iso_id', 'uniprot_seq_length', 'uniprot_seq_md5', - 'coordinate_id', 'provider', 'from', 'to', 'template', 'qmean', - 'qmean_norm', 'seqid', 'url']) - - else: - swiss_model = pd.DataFrame( - columns=['UniProtKB_ac', 'iso_id', 'uniprot_seq_length', 'uniprot_seq_md5', 'coordinate_id', - 'provider', 'from', 'to', 'template', 'qmean', 'qmean_norm', 'seqid', 'url', 'whichIsoform']) - swiss_model = swiss_model.astype('str') - try: - swiss_model.iso_id = swiss_model.iso_id.astype('str') - except: - AttributeError - swiss_model['iso_id'] = 'nan' - swiss_model = swiss_model[swiss_model.UniProtKB_ac != 'nan'] - for ind in swiss_model.index: - swiss_model.at[ind, 'UniProtKB_ac'] = swiss_model.at[ind, 'UniProtKB_ac'].split('-')[0] - if swiss_model.at[ind, 'iso_id'] != 'nan': - - swiss_model.at[ind, 'whichIsoform'] = swiss_model.at[ind, 'iso_id'].split('-')[1] - else: - swiss_model.at[ind, 'whichIsoform'] = 'nan' - # swiss_model.drop(['input'], axis=1, inplace=True) - swiss_model = swiss_model[swiss_model.provider == 'SWISSMODEL'] - print('Index File Processed...\n') - - # Get relevant columns - swiss_model = swiss_model[ - ['UniProtKB_ac', 'from', 'to', 'template', 'qmean_norm', 'seqid', 'url', 'whichIsoform']] - # Sort models on qmean score and identity. Some proteins have more than one models, we will pick one. - swiss_model = swiss_model.sort_values(by=['UniProtKB_ac', 'qmean_norm', 'seqid'], ascending=False) - swiss_model.reset_index(inplace=True) - swiss_model.drop(['index'], axis=1, inplace=True) - - # Get protein IDs for which there exist models. - swiss_model_ids = set(swiss_model.UniProtKB_ac.to_list()) - to_swiss = to_swiss.astype(str) - no_swiss_models = pd.DataFrame() - for i in to_swiss.index: - if to_swiss.at[i, 'uniprotID'] not in swiss_model_ids: - k = pd.Series(to_swiss.iloc[i]) - no_swiss_models = no_swiss_models.append(k, ignore_index=True) - - no_swiss_models = no_swiss_models.astype(str) - if len(no_swiss_models) == 0: - no_swiss_models = pd.DataFrame(columns=to_swiss.columns) - else: - no_swiss_models = no_swiss_models[to_swiss.columns] - no_swiss_models.reset_index(inplace=True) - no_swiss_models.drop('index', axis=1, inplace=True) - - with_swiss_models = pd.concat([to_swiss, no_swiss_models]).drop_duplicates(['datapoint'], keep=False) - with_swiss_models = with_swiss_models[to_swiss.columns] - # Add model info. - - with_swiss_models = with_swiss_models.astype(str) - swiss_model = swiss_model.astype(str) - swiss_models_with_data = pd.merge(with_swiss_models, swiss_model, left_on=['uniprotID', 'whichIsoform'], - right_on=['UniProtKB_ac', 'whichIsoform'], - how='left') - swiss_models_with_data = swiss_models_with_data.astype(str) - swiss_models_with_data = swiss_models_with_data.sort_values(by=['uniprotID', 'wt', 'mut', 'pos', 'qmean_norm'], - ascending=False) - swiss_models_with_data = swiss_models_with_data.drop_duplicates() - swiss_models_with_data = swiss_models_with_data.drop(['UniProtKB_ac', 'seqid'], axis=1) - swiss_models_with_data.pos = swiss_models_with_data.pos.astype('int') - swiss_models_with_data = swiss_models_with_data.astype(str) - - # Get the ones in the list but without model url and add to the list to go to modbase. - url_nan = swiss_models_with_data[swiss_models_with_data.url == 'nan'] - - # Add this nan's to no_model. These will be searched in MODBASE because here they dont have urls. - url_nan = url_nan.drop(['from', 'qmean_norm', 'template', 'to', 'url'], axis=1) - - no_swiss_models_2 = pd.concat([no_swiss_models, url_nan]) - swiss_models_with_data = swiss_models_with_data[swiss_models_with_data.url != 'nan'] - for i in swiss_models_with_data.index: - try: - swiss_models_with_data.at[i, 'chain'] = swiss_models_with_data.at[i, 'template'].split('.')[2] - swiss_models_with_data.at[i, 'template'] = swiss_models_with_data.at[i, 'template'].split('.')[0] - except: - IndexError - if len(swiss_models_with_data) == 0: - swiss_models_with_data['chain'] = '' - swiss_models_with_data['template'] = '' - - swiss_models_with_data.qmean_norm = swiss_models_with_data.qmean_norm.astype('str') - swiss_models_with_data.chain = swiss_models_with_data.chain.astype('str') - swiss_models_with_data['qmean_norm'] = swiss_models_with_data.qmean_norm.apply(lambda x: round(float(x), 2)) - swiss_models_with_data = swiss_models_with_data.astype(str) - - # swiss_models_with_data: These data points will be aligned with their corresponding model sequences. - # Add sequences - - no_swiss_models_2.reset_index(inplace=True) - no_swiss_models_2.drop('index', axis=1, inplace=True) - - swiss_models_with_data.reset_index(inplace=True) - swiss_models_with_data.drop('index', axis=1, inplace=True) - - swiss_model_ids = None - with_swiss_models = None - swiss_model = None - no_swiss_models = None - url_nan = None - - # At this point we have: - # pdb_aligned --- Align in the PDB phase - # not_match_in_uniprot --- This wont be aligned with anything because these proteins dont have a uniprot ID. Only basic info is present. - # to_swiss (no_pdb + yes_pdb_no_match) --- to be searched in SwissModel database - # to_swiss (with_swiss_models & no_swiss_models) - # swiss_models_with_data --- We found swiss models for them. - # no_swiss_models_2 (no_swiss_models + url_nan)--- to be searched in modbase (the ones having swissmodels but not matching with the boundaries & broken_swiss will be added here) - - """ - STEP 9 - Associated model IDs are added. - Download model files. - """ - print('Beginning SwissModel files download...') - existing_swiss = list(Path(path_to_output_files / 'swissmodel_structures').glob("*")) - existing_swiss = [str(i) for i in existing_swiss] - existing_swiss = ['.'.join(i.split('/')[-1].split('.')[:-1]) for i in existing_swiss] - swissmodels_fasta = pd.DataFrame() - - for i in swiss_models_with_data.index: - protein = swiss_models_with_data.at[i, 'uniprotID'] - template = swiss_models_with_data.at[i, 'template'].split('.')[0] - qmean_norm = str(round(float(swiss_models_with_data.at[i, 'qmean_norm']), 2)) - if protein + '_' + template + '_' + qmean_norm not in existing_swiss: - url = swiss_models_with_data.at[i, 'url'].strip('\"').strip('}').replace('\\', '').strip('\"').replace( - 'https', - 'https:') - req = requests.get(url) - name = Path(path_to_output_files / 'swissmodel_structures' / f'{protein}_{template}_{qmean_norm}.txt') - print('Downloading for Protein:', protein + ' Model: ' + template) - with open(name, 'wb') as f: - f.write(req.content) else: - print('Model exists.') - name = Path(path_to_output_files / 'swissmodel_structures' / f'{protein}_{template}_{qmean_norm}.txt') - with open(name, encoding="utf8") as f: - fasta = '' - lines = f.readlines() - chain = '' - for row in lines: - if row[0:4] == 'ATOM' and row[13:15] == 'CA': - chain = row[20:22].strip() - fasta += threeToOne(row[17:20]) - if row[0:3] == 'TER': - k = pd.Series([protein, template, qmean_norm, chain.upper(), fasta]) - swissmodels_fasta = swissmodels_fasta.append(k, ignore_index=True) - fasta = '' - - if len(swissmodels_fasta) == 0: - swissmodels_fasta = pd.DataFrame(columns=['uniprotID', 'template', 'qmean_norm', 'chain', 'fasta']) - else: - swissmodels_fasta.columns = ['uniprotID', 'template', 'qmean_norm', 'chain', 'fasta'] - swissmodels_fasta = swissmodels_fasta.astype(str) - - swiss_models_with_data.qmean_norm = swiss_models_with_data.qmean_norm.astype(float) - swissmodels_fasta.qmean_norm = swissmodels_fasta.qmean_norm.astype(float) - - swissmodels_fasta = swissmodels_fasta.sort_values(['uniprotID', 'template', 'qmean_norm', 'chain'], - axis=0) # example = 3gdh - swissmodels_fasta.reset_index(inplace=True) - swissmodels_fasta.drop(['index'], axis=1, inplace=True) - swissmodels_fasta = swissmodels_fasta.drop_duplicates(['uniprotID', 'template', 'qmean_norm', 'chain']) - swissmodels_fasta = swissmodels_fasta.drop_duplicates(['uniprotID', 'template', 'chain', 'fasta']) - swissmodels_fasta = swissmodels_fasta.drop_duplicates(['uniprotID', 'template', 'fasta']) - # Some files were broken, thus their PDBs couldnt be recorded. - swissmodels_fasta = swissmodels_fasta.drop_duplicates() - swissmodels_fasta = swissmodels_fasta.astype(str) - - swiss_models_with_data = swiss_models_with_data.astype(str) - swissmodels_fasta = swissmodels_fasta.astype(str) - swiss_models_with_data1 = swiss_models_with_data.merge(swissmodels_fasta, - on=['uniprotID', 'template', 'qmean_norm', 'chain']) - - swiss_models_with_data1 = swiss_models_with_data1.sort_values(['datapoint', 'fasta'], axis=0, - ascending=[True, False]) - swiss_models_with_data1 = swiss_models_with_data1.drop_duplicates(['datapoint', 'template']) - - swiss_models_with_data1_dp = list(set(swiss_models_with_data1.datapoint.to_list())) - swiss_models_with_data.reset_index(inplace=True) - swiss_models_with_data.drop(['index'], axis=1, inplace=True) - broken_swiss = pd.DataFrame() - c = 0 - for i in swiss_models_with_data.index: # en baştaki dfde var ama model gelende yok. - if swiss_models_with_data.at[i, 'datapoint'] not in swiss_models_with_data1_dp: - k = pd.Series(swiss_models_with_data.iloc[i]) - broken_swiss = broken_swiss.append(k, ignore_index=True) - c += 1 - - if len(broken_swiss) == 0: - broken_swiss = pd.DataFrame(columns=swiss_models_with_data.columns.to_list()) - - swiss_models_with_data = swiss_models_with_data1.copy() - - swiss_models_with_data.qmean_norm = swiss_models_with_data.qmean_norm.astype('float') - swiss_models_with_data = swiss_models_with_data.sort_values(['uniprotID', 'wt', 'mut', 'qmean_norm'], - axis=0, ascending=[True, True, True, False]) - - # Delete the same model sequence with lower quality - swiss_models_with_data = swiss_models_with_data.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'fasta'], - keep='first') - swiss_models_with_data.uniprotSequence = swiss_models_with_data.uniprotSequence.astype('str') - swiss_models_with_data.pos = swiss_models_with_data.pos.astype('int') - len(swiss_models_with_data.drop_duplicates(['datapoint'])) + len( - broken_swiss.drop_duplicates(['datapoint'])) + len( - no_swiss_models_2.drop_duplicates(['datapoint'])) == len(to_swiss.drop_duplicates(['datapoint'])) - # This printed data here includes all possible models with different qualities, - # because we may get a hit in either of them. - swiss_models_with_data.rename({'fasta': 'pdbSequence'}, axis=1, inplace=True) # for convenience. - - # NOW DO ALIGNMENT HERE - - swiss_models_with_data = swiss_models_with_data.replace({'[\'?\']': 'nan'}) - swiss_models_with_data = swiss_models_with_data.replace({'[]': 'nan'}) - swiss_models_with_data.rename({'template': 'pdbID'}, axis=1, - inplace=True) # Only to be able use the alignment code above. - swiss_models_with_data = swiss_models_with_data.astype(str) - swiss_models_with_data.pdbSequence = swiss_models_with_data.pdbSequence.astype('str') - swiss_models_with_data = add_annotations(swiss_models_with_data) - swiss_models_with_data = swiss_models_with_data.astype(str) - swiss_models_with_data.replace({'NaN': 'nan'}, inplace=True) - swiss_models_with_data_copy = swiss_models_with_data.copy() - swiss_models_with_data1_dp = None - swiss_models_with_data1 = None - existing_swiss = None - swissmodels_fasta = None - - print('Aligning sequences...\n') - - swiss_models_with_data['uniprotSequence'] = swiss_models_with_data['uniprotSequence'].str.replace('U', 'C') - swiss_models_with_data['pdbSequence'] = swiss_models_with_data['pdbSequence'].str.replace('U', 'C') - swiss_model_aligned = alignment(swiss_models_with_data, annotation_list, - path_to_output_files / 'alignment_files') - swiss_models_with_data = None - - if len(swiss_model_aligned) == 0: - swiss_model_aligned = pd.DataFrame(columns=pdb_aligned.columns) - swiss_model_aligned['qmean_norm'] = 'nan' - else: - swiss_model_aligned = swiss_model_aligned.astype(str) - swiss_model_aligned.replace({'NaN': 'nan'}, inplace=True) - - # Some datapoints appear in both nan and not_nan. If not_nan we take it only once. - nan = swiss_model_aligned[swiss_model_aligned.mutationPositionOnPDB == 'nan'] - not_nan = swiss_model_aligned[swiss_model_aligned.mutationPositionOnPDB != 'nan'] - not_nan.qmean_norm = not_nan.qmean_norm.astype('float') - not_nan.sort_values(['datapoint', 'pdb_alignStatus', 'qmean_norm'], ascending=[True, True, False], inplace=True) - - which_ones_are_match = pd.concat([not_nan, nan]).drop_duplicates(['datapoint'], keep='first') - swiss_match = which_ones_are_match[which_ones_are_match.mutationPositionOnPDB != 'nan'] - swiss_not_match = which_ones_are_match[which_ones_are_match.mutationPositionOnPDB == 'nan'] - - swiss_match.qmean_norm = swiss_match.qmean_norm.astype('float') - swiss_match.sort_values(['uniprotID', 'wt', 'pos', 'mut', 'pdb_alignStatus', 'qmean_norm'], - ascending=[True, True, True, True, True, False], inplace=True) - swiss_match.drop_duplicates(['uniprotID', 'wt', 'pos', 'mut'], keep='first', inplace=True) - swiss_not_match = swiss_not_match[no_swiss_models_2.columns] - broken_swiss = broken_swiss[no_swiss_models_2.columns] - swiss_not_match = swiss_not_match.drop_duplicates(['datapoint']) - broken_swiss = broken_swiss.drop_duplicates(['datapoint']) - - to_modbase = pd.concat([no_swiss_models_2, broken_swiss]).drop_duplicates() - to_modbase = pd.concat([to_modbase, swiss_not_match]).drop_duplicates() - to_modbase = to_modbase.astype(str) - to_swiss_columns = to_swiss.columns - to_swiss_size = len(to_swiss.drop_duplicates(['datapoint'])) - to_swiss = None - # CONTROL - - """ - # This should be the whole data. - len(swiss_match.drop_duplicates(['datapoint'])) + len(aligned.drop_duplicates(['datapoint'])) + len(to_modbase.drop_duplicates(['datapoint'])) + len(not_match_in_uniprot.drop_duplicates(['datapoint'])) ,len(data) - len(aligned.drop_duplicates(['datapoint'])) + len(not_match_in_uniprot.drop_duplicates(['datapoint'])) +len(to_swiss.drop_duplicates(['datapoint']))== len(data) - """ - print('SwissModel matching is completed...\n') - print('SUMMARY') - print('-------') - print('%d data points that failed to match a UniProt Sequence are discarded.' % len( - not_match_in_uniprot.drop_duplicates(['datapoint']))) - print('Of the remaining %d:' % uniprot_matched_size) - print('--%d of %d successfully aligned with PDB structures.' % ( - len(pdb_aligned.drop_duplicates(['datapoint'])), with_pdb_size)) - print('--%d of %d successfully aligned with SwissModels structures.' % ( - len(swiss_match.drop_duplicates(['datapoint'])), to_swiss_size)) - print('--%d will be searched in ModBase database.\n' % len(to_modbase.drop_duplicates(['datapoint']))) - - print('Proceeding to ModBase search...') - print('------------------------------------\n') - no_swiss_models_2 = None - broken_swiss = None - swiss_model_aligned = None - nan = None - not_nan = None - which_ones_are_match = None - swiss_not_match = None - - # STEP : GO TO MODBASE - # Should not include anything related to prev models. - if len(to_modbase) != 0: - to_modbase = to_modbase.astype(str) - - # GET MODBASE MODELS - - # Get IDs from data to retrieve only their models from MODBASE - to_modbase.reset_index(inplace=True) - to_modbase.drop(['index'], axis=1, inplace=True) - - existing_modbase_models = list(Path(path_to_output_files / 'modbase_structures').glob("*")) - existing_modbase_models = [str(i) for i in existing_modbase_models] - existing_modbase_models = [i.split('/')[-1].split('.')[0] for i in existing_modbase_models] - - existing_modbase_models_ind = list(Path(path_to_output_files / 'modbase_structures_individual').glob("*")) - existing_modbase_models_ind = [str(i) for i in existing_modbase_models_ind] - existing_modbase_models_ind = [i.split('/')[-1].split('.')[0] for i in existing_modbase_models_ind] - - modbase_reduced = pd.DataFrame() - modbase_fasta = pd.DataFrame() - - print('Retrieving ModBase models...\n') - # Get model files associated with each UniProtID - for protein in list(set(to_modbase.uniprotID.to_list())): - if protein not in existing_modbase_models: - print('Downloading Modbase models for ', protein) - url = 'https://salilab.org/modbase/retrieve/modbase/?databaseID=' + protein - req = requests.get(url) - name = path_to_output_files / 'modbase_structures' / f'{protein}.txt' - with open(name, 'wb') as f: - f.write(req.content) + pdb = pd.DataFrame(columns = SIMPLE_COLS) + print('\n>>> No PDB structure could be matched.') + no_pdb = data.copy() + no_pdb = no_pdb[SIMPLE_COLS] + + print( + 'PDB phase is finished...\nPDB structures are found for %d of %d.\n%d of %d failed to match with PDB structure.\n' + % (len(pdb.drop_duplicates(['datapoint'])), len(data.drop_duplicates(['datapoint'])), + len(no_pdb.drop_duplicates(['datapoint'])), len(data.drop_duplicates(['datapoint'])))) + + + + print('\n>>> Proceeding to SwissModel search...') + print('------------------------------------\n') + swiss = no_pdb.copy() + if len(swiss) > 0: + print('\n>> Adding SwissModel residue positions...\n') + swiss.replace({'[]': np.NaN, 'nan-nan': np.NaN, '': np.NaN}, inplace=True) + swiss = swiss.fillna(np.NaN) + swiss, no_swiss_models= addSwissModels(swiss, path_to_input_files, path_to_output_files) + print('\n>> Mapping to SwissModels...\n') + if len(swiss) > 0: + swiss.reset_index(drop=True, inplace=True) + swiss = changeUPtoModels(swiss) + swiss.reset_index(drop=True, inplace=True) + print('\n>> Calculating 3D distances for SwissModels...\n') + swiss = isZeroDistance(swiss) + swiss = match3DModels(swiss) + swiss = selectMaxAnnot(swiss) + swiss = swiss.sort_values(by=['datapoint', 'qmean_norm', 'distance', 'hitTotal', 'annotTotal'], ascending=[True, True, True, True, True]) + swiss = swiss.drop_duplicates(['datapoint']) + swiss.replace({'[]': np.NaN, 'hit': 0.0}, inplace=True) else: - print('Model exists for', protein) - name = Path(path_to_output_files / 'modbase_structures' / f'{protein}.txt') - with open(name, encoding="utf8") as f: - a = open(name, 'r').read() - soup = BeautifulSoup(a, 'lxml') - for pdb in soup.findAll('pdbfile'): - model_id = str(pdb.contents[1])[10:-11] - if model_id not in existing_modbase_models_ind: - with open(path_to_output_files / 'modbase_structures_individual' / f'{model_id}.txt', 'w', - encoding="utf8") as individual: - individual.write(str('UniProt ID: ' + protein)) - individual.write('\n') - individual.write(str(pdb.contents[3])[10:-11].strip()) - with open(path_to_output_files / 'modbase_structures_individual' / f'{model_id}.txt', - encoding="utf8") as f: - fasta = '' - chain = '' - template_chain = '' - score = -999 - for ind_line in f.readlines(): - if ind_line[0:10] == 'UniProt ID': - uniprot_id = ind_line.split(':')[1].strip() - if ind_line[0:23] == 'REMARK 220 TARGET BEGIN': - target_begin = ind_line[40:43].strip() - if ind_line[0:21] == 'REMARK 220 TARGET END': - target_end = ind_line[40:43].strip() - if ind_line[0:25] == 'REMARK 220 TEMPLATE BEGIN': - pdb_begin = ind_line[40:43].strip() - if ind_line[0:23] == 'REMARK 220 TEMPLATE END': - pdb_end = ind_line[40:43].strip() - if ind_line[0:23] == 'REMARK 220 TEMPLATE PDB': - pdb_code = ind_line[40:43].strip() - if ind_line[0:25] == 'REMARK 220 TEMPLATE CHAIN': - pdb_chain = ind_line[40:43].strip() - if ind_line[0:32] == 'REMARK 220 ModPipe Quality Score': - quality_score = ind_line[40:].strip() - if ind_line[0:27] == 'REMARK 220 MODPIPE MODEL ID': - model_id = ind_line[40:].strip() - if ind_line[0:25] == 'REMARK 220 TEMPLATE CHAIN': - template_chain = ind_line[40:42].strip() - if ind_line[0:4] == 'ATOM' and ind_line[13:15] == 'CA': - fasta += threeToOne(ind_line[17:20]) - if ind_line[0:32] == 'REMARK 220 ModPipe Quality Score': - try: - score = ind_line[40:].strip() - except (ValueError): - score = -999 - if ind_line[0:3] == 'TER' or ind_line[0:3] == 'END': - k = pd.Series([uniprot_id, model_id, str(score), template_chain, fasta]) - modbase_fasta = modbase_fasta.append(k, ignore_index=True) - fasta = '' - try: - k = pd.Series( - [uniprot_id, target_begin, target_end, pdb_code, pdb_chain, pdb_begin, pdb_end, - quality_score, - model_id]) - modbase_reduced = modbase_reduced.append(k, ignore_index=True) - except: - NameError - print('This file doesnt have Quality Score. Replacer: -999', model_id) - quality_score = -999 - - print() - if len(modbase_fasta) != 0: - modbase_fasta.columns = ['uniprotID', 'template', 'score', 'chain', 'fasta'] - else: - modbase_fasta = pd.DataFrame(columns=['uniprotID', 'template', 'score', 'chain', 'fasta']) - modbase_fasta = modbase_fasta.astype(str) - modbase_fasta = modbase_fasta.replace({'': 'nan'}) - modbase_fasta = modbase_fasta.replace({'NaN': 'nan'}) - modbase_fasta = modbase_fasta[modbase_fasta.fasta != 'nan'] - - print('Modbase model frame constructed.\n') - if len(modbase_reduced) != 0: - modbase_reduced.columns = ['UniprotID', 'TargetBeg', 'TargetEnd', 'PDBCode', 'PDBChain', 'PDBBegin', - 'PDBEnd', - 'ModPipeQualityScore', 'ModelID'] - else: - modbase_reduced = pd.DataFrame( - columns=['UniprotID', 'TargetBeg', 'TargetEnd', 'PDBCode', 'PDBChain', 'PDBBegin', 'PDBEnd', - 'ModPipeQualityScore', 'ModelID']) - - to_modbase = add_annotations(to_modbase) - - to_modbase = to_modbase.astype(str) - to_modbase.fillna('nan', inplace=True) - to_modbase = to_modbase.replace({'NaN': 'nan'}) - to_modbase.replace({'[]': 'nan'}, inplace=True) - to_modbase.replace({'nan-nan': 'nan'}, inplace=True) - to_modbase.replace({'': 'nan'}, inplace=True) - model_info_added = to_modbase.merge(modbase_reduced, right_on='UniprotID', left_on='uniprotID', - how='left') - modbase_reduced = None - existing_modbase_models = None - existing_modbase_models_ind = None - - model_info_added = model_info_added.drop(['UniprotID'], axis=1) - model_info_added = model_info_added.rename(columns={'TargetBeg': 'from', 'TargetEnd': 'to', - 'PDBCode': 'template', 'PDBChain': 'chain', - 'ModPipeQualityScore': 'score', - 'ModelID': 'pdbID'}) - model_info_added.drop(['PDBEnd', 'PDBBegin'], axis=1, inplace=True) - model_info_added.score = model_info_added.score.astype(float) - model_info_added = model_info_added.sort_values(by=['datapoint', 'score'], - ascending=False) - model_info_added.reset_index(inplace=True) - model_info_added.drop(['index'], axis=1, inplace=True) - model_info_added = model_info_added.drop_duplicates() - - model_info_added = model_info_added.astype(str) - model_info_added = model_info_added.replace({'NaN': 'nan'}) - no_info = model_info_added[model_info_added.pdbID == 'nan'] - with_modbase_info = model_info_added[model_info_added.pdbID != 'nan'] - model_info_added = None - - len(no_info.drop_duplicates(['datapoint'])), len(with_modbase_info.drop_duplicates(['datapoint'])) - len(no_info.drop_duplicates(['datapoint'])) + len(with_modbase_info.drop_duplicates(['datapoint'])) == len( - to_modbase.drop_duplicates(['datapoint'])) - - # Add no_info to the rest down below! - no_info = no_info[to_swiss_columns] - - with_modbase_info.score = with_modbase_info.score.astype(float) - modbase_fasta.score = modbase_fasta.score.astype(float) - - modbase_fasta = modbase_fasta.sort_values(['uniprotID', 'score', 'template', 'chain'], - ascending=[True, False, True, True], axis=0) # example = 3gdh - - # I added this newly downloaded ones to the main model file. - - modbase_fasta = modbase_fasta.rename(columns={'template': 'pdbID'}) - with_modbase_info.pos = with_modbase_info.pos.astype('int') - with_modbase_info.score = with_modbase_info.score.astype(float) - with_modbase_info.score = with_modbase_info.score.apply(lambda x: round(x, 2)) - modbase_fasta.score = modbase_fasta.score.astype(float) - modbase_fasta.score = modbase_fasta.score.apply(lambda x: round(x, 2)) - - with_modbase_info = with_modbase_info.merge(modbase_fasta, on='pdbID', how='left') - - with_modbase_info.drop(['score_y'], axis=1, inplace=True) - with_modbase_info.rename(columns={'score_x': 'score'}, inplace=True) - with_modbase_info.drop(['uniprotID_y', 'chain_y'], axis=1, inplace=True) - with_modbase_info.rename(columns={'uniprotID_x': 'uniprotID', 'chain_x': 'chain'}, inplace=True) - - with_modbase_info.score = with_modbase_info.score.astype('float') - with_modbase_info = with_modbase_info.sort_values(['uniprotID', 'wt', 'mut', 'pos', 'score', 'from', 'to'], - axis=0, - ascending=[True, True, True, True, False, True, False]) - with_modbase_info = with_modbase_info.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'fasta'], - keep='first') - - with_modbase_info = with_modbase_info.replace({'[\'?\']': 'nan'}) - with_modbase_info = with_modbase_info.replace({'[]': 'nan'}) - with_modbase_info = with_modbase_info.replace({'\'?\', ': ''}) - with_modbase_info = with_modbase_info.replace({', \'?\'': ''}) - with_modbase_info = with_modbase_info.replace({'(': ''}) - with_modbase_info = with_modbase_info.replace( - {')': ''}) - with_modbase_info = with_modbase_info.astype(str) - with_modbase_info.fasta = with_modbase_info.fasta.astype('str') - with_modbase_info.reset_index(inplace=True) - with_modbase_info.drop('index', axis=1, inplace=True) - - align = with_modbase_info[ - with_modbase_info.fasta != 'nan'] - yes_pdb_no_match = with_modbase_info[ - with_modbase_info.fasta == 'nan'] - yes_pdb_no_match = yes_pdb_no_match[~yes_pdb_no_match.datapoint.isin(align.datapoint.to_list())] - - align.rename(columns={'fasta': 'pdbSequence'}, inplace=True) - align['uniprotSequence'] = align['uniprotSequence'].str.replace('U', 'C') - align['pdbSequence'] = align['pdbSequence'].str.replace('U', 'C') - - to_modbase_size = len(to_modbase.drop_duplicates(['datapoint'])) - modbase_fasta = None - to_modbase = None - print('Aligning sequences...\n') - modbase_aligned = alignment(align, annotation_list, path_to_output_files / 'alignment_files') - modbase_aligned = modbase_aligned.astype(str) - modbase_aligned = modbase_aligned.replace({'NaN': 'nan'}) - - # Get the ones whose models couldn't be found. Add to no_modbase (yani hiçbir şey de eşleşmemiş artık.) - if len(with_modbase_info) != 0: - not_in_aligned = pd.concat([modbase_aligned.drop_duplicates(['datapoint']), - with_modbase_info.drop_duplicates(['datapoint'])]).drop_duplicates( - ['datapoint'], - keep=False) - else: - not_in_aligned = pd.DataFrame( - columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore', - 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', - 'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', - 'intMet', - 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite', - 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', - 'crosslink', - 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat', - 'topologicalDomain', 'caBinding', 'bindingSite', 'region', - 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', - 'coiledCoil', - 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', - 'disulfide', - 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', - 'activeSite', - 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', - 'crosslink', - 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat', - 'topologicalDomain', 'caBinding', 'bindingSite', 'region', - 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', - 'coiledCoil', - 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'from', - 'to', 'template', 'chain', 'score', 'pdbID', 'pdbSequence', 'fasta']) - with_modbase_info = None - if len(not_in_aligned) != 0: - not_models = pd.concat([yes_pdb_no_match.drop_duplicates(['datapoint']), - not_in_aligned.drop_duplicates(['datapoint'])]).drop_duplicates(['datapoint'], - keep='first') - # Retain the best model among the aligned ones. + swiss = swiss[SIMPLE_COLS] + + if len(no_swiss_models) > 0: + no_swiss_models = no_swiss_models[SIMPLE_COLS] + no_swiss_models.reset_index(inplace=True, drop=True) + else: - not_models = pd.DataFrame(columns=not_in_aligned.columns) - - yes_pdb_no_match = None - # # Some datapoints appear in both nan and not_nan. If not_nan we take it only once. - modbase_aligned = modbase_aligned.astype(str) - if len(modbase_aligned) != 0: - nan = modbase_aligned[modbase_aligned.mutationPositionOnPDB == 'nan'] - not_nan = modbase_aligned[modbase_aligned.mutationPositionOnPDB != 'nan'] - not_nan.score = not_nan.score.astype(float) - not_nan.sort_values(['datapoint', 'pdb_alignStatus', 'score'], ascending=[True, True, False], - inplace=True) + swiss = swiss[SIMPLE_COLS] + no_swiss_models = no_pdb.copy() + - not_nan = not_nan.sort_values(['datapoint', 'mutationPositionOnPDB', 'score'], - ascending=[True, True, False]) - not_nan = not_nan.drop_duplicates(['datapoint'], keep='first') + + print('\n>>> Proceeding to Modbase search...') + print('------------------------------------\n') + if len(no_swiss_models) >0: + modbase = no_swiss_models.copy() + print('Proceeding to Modbase search...') + print('------------------------------------\n') + if len(modbase) > 0: + modbase = modbase[SIMPLE_COLS] + modbase.replace({'[]': np.NaN, 'nan-nan': np.NaN, '': np.NaN}, inplace=True) + modbase = modbase.fillna(np.NaN) + print('\n>> Adding Modbase residue positions...\n') + modbase_simple = modbase[['uniprotID', 'wt', 'pos', 'datapoint']] + modbase_simple = modbase_simple.drop_duplicates(['uniprotID', 'wt', 'pos' ,'datapoint']) + modbaseOut, no_modbase_models_updated = addModbaseModels(modbase_simple, path_to_input_files, path_to_output_files) + modbase = modbase.merge(modbaseOut, on = ['uniprotID', 'wt', 'pos', 'datapoint'], how = 'left') + no_modbase_models_updated['sasa'] = np.NaN + modbase.reset_index(inplace=True, drop=True) + no_modbase_add = modbase[pd.isna(modbase.coordinates)] + modbase = modbase[~pd.isna(modbase.coordinates)] + no_modbase_models_updated = pd.concat([no_modbase_models_updated, no_modbase_add]) + print('\n>> Mapping to Modbase models...\n') + modbase = changeUPtoModels(modbase) + print('\n>> Calculating 3D distances for Modbase models...\n') + modbase = isZeroDistance(modbase) + modbase = match3DModels(modbase) + modbase = selectMaxAnnot(modbase) + modbase = modbase.sort_values(by=['datapoint', 'quality_score', 'distance','hitTotal', 'annotTotal'], ascending=[True, True, True, True, True]) + modbase = modbase.drop_duplicates(['datapoint']) + modbase.replace({'[]': np.NaN, 'hit': 0.0}, inplace=True) + else: + modbase = modbase[SIMPLE_COLS] + else: - nan = pd.DataFrame(columns=modbase_aligned.columns) - not_nan = pd.DataFrame(columns=modbase_aligned.columns) - modbase_aligned = None - which_ones_are_match = pd.concat([not_nan, nan]).drop_duplicates(['datapoint'], keep='first') - if len(which_ones_are_match) == 0: - which_ones_are_match = pd.DataFrame( - columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore', - 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', - 'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet', - 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite', - 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', - 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat', - 'topologicalDomain', 'caBinding', 'bindingSite', 'region', - 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', - 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', - 'disulfideBinary', 'intMetBinary', 'intramembraneBinary', - 'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary', - 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary', - 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary', - 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary', - 'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary', - 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary', - 'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary', - 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary', - 'glycosylationBinary', 'propeptideBinary', 'from', 'to', 'template', - 'chain', 'score', 'pdbID', 'pdbSequence', 'pdb_alignStatus', - 'mutationPositionOnPDB', 'domainStartonPDB', 'domainEndonPDB']) - modbase_match = which_ones_are_match[which_ones_are_match.mutationPositionOnPDB != 'nan'] - modbase_not_match = which_ones_are_match[which_ones_are_match.mutationPositionOnPDB == 'nan'] - + no_modbase_models_updated = pd.DataFrame() + modbase= pd.DataFrame() + + COLS = ['uniprotID', 'wt', 'pos', 'mut', 'datapoint', 'composition', 'polarity', 'volume', 'granthamScore', 'domain', 'domStart', 'domEnd', 'distance', + 'region', 'crosslink', 'peptide', 'disulfide', 'signalPeptide', 'propeptide', 'naturalVariant', 'nucleotideBinding', 'modifiedResidue', 'site', + 'caBinding', 'turn', 'transmembrane', 'repeat', 'glycosylation', 'intramembrane', 'metalBinding', 'bindingSite', 'dnaBinding', 'activeSite', + 'coiledCoil', 'helix', 'mutagenesis', 'zincFinger', 'transitPeptide', 'intMet', 'strand', 'lipidation', 'motif', 'topologicalDomain', + 'disulfideBinary', 'intMetBinary', 'intramembraneBinary', 'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary', 'nucleotideBindingBinary', + 'lipidationBinary', 'siteBinary', 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary', 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary', + 'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary', 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary', 'modifiedResidueBinary', 'zincFingerBinary', + 'motifBinary', 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary', 'glycosylationBinary', 'propeptideBinary', 'sasa'] + + if len(pdb)>0: + pdb = pdb[COLS] + pdb['Source'] = 'PDB' else: - modbase_match = which_ones_are_match[which_ones_are_match.mutationPositionOnPDB != 'nan'] - modbase_not_match = which_ones_are_match[which_ones_are_match.mutationPositionOnPDB == 'nan'] - - which_ones_are_match = None - modbase_match.score = modbase_match.score.astype('float') - modbase_match = modbase_match.sort_values(['datapoint', 'mutationPositionOnPDB', 'score'], - ascending=[True, True, False]) - modbase_match.drop_duplicates(['datapoint'], keep='first', inplace=True) - not_nan = None - nan = None - - # merge not_in_align and modbase_not_match as they were both excluded from modbase match. - - # No model - no_info = no_info[to_swiss_columns] - no_info = no_info.drop_duplicates() - - # Model present, no sequence - not_models = not_models[to_swiss_columns] - not_models = not_models.drop_duplicates() - - # Modbase model and sequence present, no match in PDB - modbase_not_match = modbase_not_match[to_swiss_columns] - modbase_not_match = modbase_not_match.drop_duplicates() - if len(not_in_aligned) != 0 and len(modbase_not_match) != 0 and len(no_info) != 0: - rest = pd.concat([not_in_aligned, modbase_not_match, no_info]) - elif len(not_in_aligned) != 0 and len(modbase_not_match) != 0 and len(no_info) == 0: - rest = pd.concat([not_in_aligned, modbase_not_match]) - elif len(not_in_aligned) == 0 and len(modbase_not_match) != 0 and len(no_info) != 0: - rest = pd.concat([modbase_not_match, no_info]) - elif len(not_in_aligned) != 0 and len(modbase_not_match) == 0 and len(no_info) != 0: - rest = pd.concat([not_in_aligned, no_info]) - elif len(not_in_aligned) != 0 and len(modbase_not_match) == 0 and len(no_info) == 0: - rest = not_in_aligned - elif len(not_in_aligned) == 0 and len(modbase_not_match) != 0 and len(no_info) == 0: - rest = modbase_not_match - elif len(not_in_aligned) == 0 and len(modbase_not_match) == 0 and len(no_info) != 0: - rest = no_info + pdb = pd.DataFrame() + if len(swiss) > 0: + swiss = swiss[COLS] + swiss['Source'] = 'SWISS-Model' else: - rest = pd.DataFrame( - columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore', - 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', - 'wt_sequence_match', 'whichIsoform', 'datapoint']) - - rest = rest[to_swiss_columns] - rest = rest.drop_duplicates() - - rest.reset_index(inplace=True) - rest.drop(['index'], axis=1, inplace=True) - rest = rest.astype('str') - - - else: - - modbase_match = pd.DataFrame( - columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore', - 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', - 'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet', - 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite', - 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', - 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat', - 'topologicalDomain', 'caBinding', 'bindingSite', 'region', - 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', - 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', - 'disulfideBinary', 'intMetBinary', 'intramembraneBinary', - 'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary', - 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary', - 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary', - 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary', - 'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary', - 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary', - 'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary', - 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary', - 'glycosylationBinary', 'propeptideBinary', 'from', 'to', 'template', - 'chain', 'score', 'pdbID', 'pdbSequence', 'pdb_alignStatus', - 'mutationPositionOnPDB', 'domainStartonPDB', 'domainEndonPDB']) - not_in_aligned = pd.DataFrame( - columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore', - 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', - 'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet', - 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite', - 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', - 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat', - 'topologicalDomain', 'caBinding', 'bindingSite', 'region', - 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', - 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'disulfide', - 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite', - 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', - 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat', - 'topologicalDomain', 'caBinding', 'bindingSite', 'region', - 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', - 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'from', - 'to', 'template', 'chain', 'score', 'pdbID', 'pdbSequence', 'fasta']) - no_info = pd.DataFrame( - columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore', - 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', - 'wt_sequence_match', 'whichIsoform', 'datapoint']) - rest = pd.DataFrame( - columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore', - 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', - 'wt_sequence_match', 'whichIsoform', 'datapoint']) - - rest = rest[to_swiss_columns] - rest = rest.drop_duplicates() - - rest.reset_index(inplace=True) - rest.drop(['index'], axis=1, inplace=True) - rest = rest.astype('str') - to_modbase_size = 0 - - print('Modbase matching is completed...\n') - print('SUMMARY') - print('-------') - print('%d data points that failed to match a UniProt Sequence are discarded.' % len( - not_match_in_uniprot.drop_duplicates(['datapoint']))) - print('Of the remaining %d:' % uniprot_matched_size) - print('--%d of %d successfully aligned with PDB structures.' % ( - len(pdb_aligned.drop_duplicates(['datapoint'])), with_pdb_size)) - print('--%d of %d successfully aligned with SwissModels structures.' % ( - len(swiss_match.drop_duplicates(['datapoint'])), to_swiss_size)) - print('--%d of %d successfully aligned with Modbase structures.\n' % ( - len(modbase_match.drop_duplicates(['datapoint'])), to_modbase_size)) - print('--Remaining %d not found to match any models.' % len(rest.drop_duplicates(['datapoint']))) - print('--A total of %d datapoints will not be evaluated.\n' % ( - len(rest.drop_duplicates(['datapoint'])) + len(not_match_in_uniprot.drop_duplicates(['datapoint'])))) - - print('FOR CHECKING : ', - len(rest.drop_duplicates(['datapoint'])) + len(not_match_in_uniprot.drop_duplicates(['datapoint'])) + len( - pdb_aligned.drop_duplicates(['datapoint'])) + len(swiss_match.drop_duplicates(['datapoint'])) + len( - modbase_match.drop_duplicates(['datapoint'])) == data_size) - no_info = None - align = None - not_in_aligned = None - not_models = None - modbase_not_match = None - - # Final corrections - - # Now 3D alignment. - pdb = pdb_aligned.copy() - swiss = swiss_match.copy() - modbase = modbase_match.copy() - pdb_aligned = None - swiss_match = None - modbase_match = None - - """ - WHAT DO WE HAVE NOW? - - uniprot sequence not found - - pdb aligned - - swiss aligned - - modbase aligned - - not aligned with anything (rest) - """ - - # Fix the axes and merge all data. - - pdb.drop(['pdbInfo'], axis=1, inplace=True) - pdb.rename(columns={'resolution': 'score'}, inplace=True) - swiss.rename(columns={'qmean_norm': 'score'}, inplace=True) - modbase.rename(columns={'qmean_norm': 'score'}, inplace=True) - - swiss = swiss[pdb.columns] - modbase = modbase[pdb.columns] - pdb['source'] = 'PDB' - swiss['source'] = 'SWISSMODEL' - modbase['source'] = 'MODBASE' - data = pd.concat([swiss, modbase, pdb]) - data.reset_index(inplace=True) - data.drop(['index'], axis=1, inplace=True) - data = data.astype('str') - data_spare = pd.concat([not_match_in_uniprot, rest]) - not_match_in_uniprot = None - pdb = None - swiss = None - modbase = None - rest = None - - print('Generating FreeSASA files...') - print('------------------------------------\n') - # Folder to calculated RSA values. - - existing_free_sasa = list(Path(path_to_output_files / 'freesasa_files').glob("*")) - existing_free_sasa = [str(i) for i in existing_free_sasa] - existing_free_sasa = [i.split('/')[-1].split('.')[0] for i in existing_free_sasa] - print('Calculation RSA for PDB Structure Files...\n') - pdb_only = data[data.source == 'PDB'] - - - for pdbID in pdb_only.pdbID.to_list(): - if pdbID not in existing_free_sasa: - (run_freesasa(Path(path_to_output_files / 'pdb_structures' / f'{pdbID.lower()}.pdb'), - Path(path_to_output_files / 'freesasa_files' / f'{pdbID.lower()}.txt'), - include_hetatms=True, - outdir=None, force_rerun=False, file_type='pdb')) - - print('Calculation RSA for SwissModel Files...\n') - swiss_only = data[data.source == 'SWISSMODEL'] - swiss_dp = [] - for i in swiss_only.index: - swiss_dp.append(swiss_only.at[i, 'uniprotID'] + '_' + swiss_only.at[i, 'pdbID'].lower() + '_' + str( - round(float(swiss_only.at[i, 'score']), 2))) - for pdbID in swiss_dp: - if pdbID not in existing_free_sasa: - (run_freesasa(Path(path_to_output_files / 'swissmodel_structures' / f'{pdbID}.txt'), - Path(path_to_output_files / 'freesasa_files' / f'{pdbID}.txt'), include_hetatms=True, - outdir=None, force_rerun=False, file_type='pdb')) - - - print('Calculation RSA for Modbase Model Files...\n') - modbase_only = data[data.source == 'MODBASE'] - for pdbID in modbase_only.pdbID.to_list(): - if pdbID not in existing_free_sasa: - (run_freesasa(Path(path_to_output_files / 'modbase_structures_individual' / f'{pdbID.lower()}.txt'), - Path(path_to_output_files / 'freesasa_files' / f'{pdbID.lower()}.txt'), - include_hetatms=True, - outdir=None, force_rerun=False, file_type='pdb')) - - # This annotation list is different than the prev one, keep it. - existing_free_sasa = list(Path(path_to_output_files / 'freesasa_files').glob("*")) - existing_free_sasa = [str(i) for i in existing_free_sasa] - existing_free_sasa = [i.split('/')[-1].split('.')[0] for i in existing_free_sasa] - annotation_list += ['domainStartonPDB', 'domainEndonPDB'] - - folder_path = path_to_output_files / 'freesasa_files' - aligner = Align.PairwiseAligner() - print('Proceeding to 3D distance calculation...\n') - data.domainEndonPDB = data.domainEndonPDB.astype(str) - data.domainStartonPDB = data.domainStartonPDB.astype(str) - - existing_free_sasa = None - swiss_dp = None - pdb_only = None - swiss_only = None - modbase_only = None - data['uniprotSequence'] = data['uniprotSequence'].str.replace('U', 'C') - data['pdbSequence'] = data['pdbSequence'].str.replace('U', 'C') - for i in data.index: - id_ = data.at[i, 'pdbID'].lower() - up_id_ = data.at[i, 'uniprotID'] - - score_ = str(data.at[i, 'score']) - if data.at[i, 'source'] == 'PDB': - pdb_path = Path(path_to_output_files / 'pdb_structures' / f'{id_}.pdb') - elif data.at[i, 'source'] == 'MODBASE': - pdb_path = Path(path_to_output_files / 'modbase_structures_individual' / f'{id_}.txt') - elif data.at[i, 'source'] == 'SWISSMODEL': - pdb_path = Path(path_to_output_files / 'swissmodel_structures' / f'{up_id_}_{id_}_{score_}.txt') - - pdbSequence = data.at[i, 'pdbSequence'] - source = data.at[i, 'source'] - chain = data.at[i, 'chain'] - uniprotID = data.at[i, 'uniprotID'] - pdbID = data.at[i, 'pdbID'] - - - alignments = get_alignments_3D(uniprotID, 'nan', pdb_path, pdbSequence, source, chain, pdbID, mode, Path(path_to_output_files / '3D_alignment'), file_format = 'gzip') - mutPos = data.at[i, 'mutationPositionOnPDB'] - try: - coordMut = get_coords(mutPos, alignments, 'nan', 'nan', mode)[0] - except: - ValueError - coordMut = 'nan' - try: - sasa_pos = get_coords(mutPos, alignments, 'nan', 'nan', mode)[2] - data.at[i, 'sasa'] = sasa(data.at[i, 'source'], data.at[i, 'pdbID'], data.at[i, 'uniprotID'], sasa_pos, - data.at[i, 'wt'], mode, path_to_output_files, file_type='pdb') - except: - ValueError - data.at[i, 'sasa'] = 'nan' # mutation position is nan - - for annot in annotation_list: - annotx = [] - try: - positions_of_annotations = data.at[i, annot].split(',') - for pos in positions_of_annotations: - pos = pos.strip().strip('\'').strip('[\'').strip('\']') - try: - if '-' not in pos: - pos = int(float(pos)) - coordAnnot = get_coords(pos, alignments, 'nan', 'nan', mode)[0] - try: - annotx.append(find_distance(coordMut, coordAnnot)) - except: - ValueError - - else: - for r in range(int(pos.split('-')[0]), int(pos.split('-')[1]) + 1): - coordAnnot = get_coords(r, alignments, 'nan', 'nan', mode)[0] - annotx.append(find_distance(coordMut, coordAnnot)) - except: - ValueError - try: - data.at[i, annot] = min([float(i) for i in annotx]) - except: - ValueError - data.at[i, annot] = 'nan' - - except: - ValueError - - if (str(data.at[i, 'domainStartonPDB']) == 'NaN' or str(data.at[i, 'domainStartonPDB']) == 'nan') and ( - str(data.at[i, 'domainEndonPDB']) != 'NaN' and str(data.at[i, 'domainEndonPDB']) != 'nan'): - data.at[i, 'domainStartonPDB'] = 100000 - elif (str(data.at[i, 'domainEndonPDB']) == 'NaN' or str(data.at[i, 'domainEndonPDB']) == 'nan') and ( - str(data.at[i, 'domainStartonPDB']) != 'NaN' and str(data.at[i, 'domainStartonPDB']) != 'nan'): - data.at[i, 'domainEndonPDB'] = 100000 - elif (str(data.at[i, 'domainStartonPDB']) == 'NaN' and str(data.at[i, 'domainEndonPDB']) == 'nan'): - data.at[i, 'domaindistance3D'] = 'nan' - - data.at[i, 'domaindistance3D'] = min(float(data.at[i, 'domainStartonPDB']), - float(data.at[i, 'domainEndonPDB'])) - data.at[i, 'domaindistance3D'] = min(float(data.at[i, 'domainStartonPDB']), - float(data.at[i, 'domainEndonPDB'])) - - data = data.astype(str) - data.replace({'NaN': 'nan'}, inplace=True) - - # Now unify all 3 separate data. We have with_pdb. The ones that have pdb structyres, swiss, modbase, the ones didnt match with ant and the ones didnt have wt seq match. - - # Get interface positions from ECLAIR. Download HQ human - print() - print('Assigning surface regions...') - print('------------------------------------\n') - - print('Extracting interface residues...\n') - data_interface = pd.read_csv(path_to_interfaces, sep='\t') - - positions = get_interface_positions(data_interface, 'P1', 'P2') - - interface_dataframe = pd.DataFrame() - - for key, val in positions.items(): - k = pd.Series((key, str(list(set(val))))) - interface_dataframe = interface_dataframe.append(k, ignore_index=True) - interface_dataframe.columns = ['uniprotID', 'positions'] - - if len(data) == 0: - data = pd.DataFrame( - columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore', - 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', - 'pdbSequence', 'wt_sequence_match', 'whichIsoform', 'pdbID', 'score', - 'chain', 'datapoint', 'disulfide', 'intMet', 'intramembrane', - 'naturalVariant', 'dnaBinding', 'activeSite', 'nucleotideBinding', - 'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis', - 'strand', 'helix', 'turn', 'metalBinding', 'repeat', - 'topologicalDomain', 'caBinding', 'bindingSite', 'region', - 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', - 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', - 'disulfideBinary', 'intMetBinary', 'intramembraneBinary', - 'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary', - 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary', - 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary', - 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary', - 'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary', - 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary', - 'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary', - 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary', - 'glycosylationBinary', 'propeptideBinary', 'pdb_alignStatus', - 'mutationPositionOnPDB', 'domainStartonPDB', 'domainEndonPDB', - 'source', 'sasa', 'domaindistance3D', 'threeState_trsh4_HQ', 'domain_fisher']) - else: - data.sasa = data.sasa.astype('str') - - for i in data.index: - if '*' in data.at[i, 'sasa']: - data.at[i, 'sasa'] = data.at[i, 'sasa'].split('*')[0] - - data.sasa = data.sasa.replace({'N/A': 'nan'}) - data.sasa = data.sasa.replace({'None': 'nan'}) - data.replace({' N/A': 'nan'}, inplace=True) - data.replace({'None': 'nan'}, inplace=True) - data.sasa = data.sasa.astype(float) - data = data.astype(str) - for i in data.index: - if float(data.at[i, 'sasa']) < 5: - data.at[i, 'trsh4'] = 'core' - elif float(data.at[i, 'sasa']) >= 5: - data.at[i, 'trsh4'] = 'surface' - elif data.at[i, 'sasa'] == 'nan': - data.at[i, 'trsh4'] = 'nan' - - data = data.merge(interface_dataframe, on='uniprotID', how='left') - data.positions = data.positions.astype('str') - for i in data.index: - if (str(data.at[i, 'pos']) in data.at[i, 'positions']) and data.at[i, 'trsh4'] == 'surface': - data.at[i, 'threeState_trsh4_HQ'] = 'interface' - elif (str(data.at[i, 'pos']) not in data.at[i, 'positions']) and data.at[i, 'trsh4'] == 'surface': - data.at[i, 'threeState_trsh4_HQ'] = 'surface' - elif (str(data.at[i, 'pos']) not in data.at[i, 'positions']) and data.at[i, 'trsh4'] == 'core': - data.at[i, 'threeState_trsh4_HQ'] = 'core' - elif (str(data.at[i, 'pos']) in data.at[i, 'positions']) and data.at[i, 'trsh4'] == 'core': - data.at[i, 'threeState_trsh4_HQ'] = 'conflict' - elif data.at[i, 'trsh4'] == 'nan': - data.at[i, 'threeState_trsh4_HQ'] = 'nan' - - data.drop(['positions'], axis=1, inplace=True) - - # OPTIONAL - # DOMAIN SELECTION - # Next step: Delete all other domains with 'NULL.' R is capable of handling 53 categories. We will keep 52 most - # significant domains and 53th category will be NULL. - - fisherResult = pd.read_csv(fisher_path, sep='\t') - - significant_domains = fisherResult.domain.to_list() - for i in data.index: - if data.at[i, 'domain'] in significant_domains: - data.at[i, 'domain_fisher'] = data.at[i, 'domain'] + swiss = pd.DataFrame() + if len(modbase) > 0: + modbase = modbase[COLS] + modbase['Source'] = 'Modbase' else: - data.at[i, 'domain_fisher'] = 'NULL' - - # Change the numbering for binary annotations and create 3 classes: - # nan--> 0, 0 -->1 and 1 -->2 - - print('Final adjustments are being done...\n') - binaryCols = ['disulfideBinary', 'intMetBinary', 'intramembraneBinary', 'naturalVariantBinary', - 'dnaBindingBinary', - 'activeSiteBinary', 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary', - 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary', - 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary', - 'repeatBinary', 'caBindingBinary', 'topologicalDomainBinary', - 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary', - 'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary', - 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary', - 'glycosylationBinary', 'propeptideBinary'] - data = data.astype(str) - data.replace({'NaN': 'nan'}, inplace=True) - for i in data.index: - for j in binaryCols: - data[j] = data[j].astype('str') - if (data.at[i, j] == '0') or (data.at[i, j] == '0.0'): - data.at[i, j] = '1' - elif data.at[i, j] == 'nan': - data.at[i, j] = '0' - elif (data.at[i, j] == '1') or (data.at[i, j] == '1.0'): - data.at[i, j] = '2' - - annotCols = ['disulfide', 'intMet', 'intramembrane', - 'naturalVariant', 'dnaBinding', 'activeSite', 'nucleotideBinding', - 'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis', - 'strand', 'helix', 'turn', 'metalBinding', 'repeat', 'caBinding', - 'topologicalDomain', 'bindingSite', 'region', 'signalPeptide', - 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', 'peptide', - 'transitPeptide', 'glycosylation', 'propeptide'] - - for i in data.index: - for annot in annotCols: - binaryName = str(annot) + 'Binary' - if data.at[i, binaryName] == '2': - data.at[i, annot] = '0.0' - data.replace({'100000': 'nan'}, inplace=True) - data = add_physicochemical(data) - data.rename( - columns={'uniprotID': 'prot_uniprotAcc', 'wt': 'wt_residue', 'pos': 'position', 'mut': 'mut_residue', - 'datapoint': 'meta_merged', 'datapoint_disease': 'meta-lab_merged', 'label': 'source_db', - 'family': 'prot_family', 'domain': 'domains_all', 'domain_fisher': 'domains_sig', - 'domaindistance3D': 'domains_3Ddist', 'threeState_trsh4_HQ': 'location_3state', - 'disulfideBinary': 'disulfide_bin', 'intMetBinary': 'intMet_bin', - 'intramembraneBinary': 'intramembrane_bin', - 'naturalVariantBinary': 'naturalVariant_bin', 'dnaBindingBinary': 'dnaBinding_bin', - 'activeSiteBinary': 'activeSite_bin', - 'nucleotideBindingBinary': 'nucleotideBinding_bin', 'lipidationBinary': 'lipidation_bin', - 'siteBinary': 'site_bin', - 'transmembraneBinary': 'transmembrane_bin', 'crosslinkBinary': 'crosslink_bin', - 'mutagenesisBinary': 'mutagenesis_bin', - 'strandBinary': 'strand_bin', 'helixBinary': 'helix_bin', 'turnBinary': 'turn_bin', - 'metalBindingBinary': 'metalBinding_bin', - 'repeatBinary': 'repeat_bin', 'topologicalDomainBinary': 'topologicalDomain_bin', - 'caBindingBinary': 'caBinding_bin', - 'bindingSiteBinary': 'bindingSite_bin', 'regionBinary': 'region_bin', - 'signalPeptideBinary': 'signalPeptide_bin', - 'modifiedResidueBinary': 'modifiedResidue_bin', 'zincFingerBinary': 'zincFinger_bin', - 'motifBinary': 'motif_bin', - 'coiledCoilBinary': 'coiledCoil_bin', 'peptideBinary': 'peptide_bin', - 'transitPeptideBinary': 'transitPeptide_bin', - 'glycosylationBinary': 'glycosylation_bin', 'propeptideBinary': 'propeptide_bin', - 'disulfide': 'disulfide_dist', 'intMet': 'intMet_dist', - 'intramembrane': 'intramembrane_dist', 'naturalVariant': 'naturalVariant_dist', - 'dnaBinding': 'dnaBinding_dist', 'activeSite': 'activeSite_dist', - 'nucleotideBinding': 'nucleotideBinding_dist', 'lipidation': 'lipidation_dist', - 'site': 'site_dist', - 'transmembrane': 'transmembrane_dist', 'crosslink': 'crosslink_dist', - 'mutagenesis': 'mutagenesis_dist', 'strand': 'strand_dist', 'helix': 'helix_dist', - 'turn': 'turn_dist', - 'metalBinding': 'metalBinding_dist', 'repeat': 'repeat_dist', - 'topologicalDomain': 'topologicalDomain_dist', 'caBinding': 'caBinding_dist', - 'bindingSite': 'bindingSite_dist', 'region': 'region_dist', - 'signalPeptide': 'signalPeptide_dist', 'modifiedResidue': 'modifiedResidue_dist', - 'zincFinger': 'zincFinger_dist', 'motif': 'motif_dist', 'coiledCoil': 'coiledCoil_dist', - 'peptide': 'peptide_dist', 'transitPeptide': 'transitPeptide_dist', - 'glycosylation': 'glycosylation_dist', 'propeptide': 'propeptide_dist'}, inplace=True) - - data = data[ - ['prot_uniprotAcc', 'wt_residue', 'mut_residue', 'position', 'meta_merged', 'composition', 'polarity', - 'volume', - 'granthamScore', 'domains_all', - 'domains_sig', 'domains_3Ddist', 'sasa', 'location_3state', 'disulfide_bin', 'intMet_bin', - 'intramembrane_bin', 'naturalVariant_bin', 'dnaBinding_bin', - 'activeSite_bin', 'nucleotideBinding_bin', 'lipidation_bin', 'site_bin', - 'transmembrane_bin', 'crosslink_bin', 'mutagenesis_bin', 'strand_bin', - 'helix_bin', 'turn_bin', 'metalBinding_bin', 'repeat_bin', - 'caBinding_bin', 'topologicalDomain_bin', 'bindingSite_bin', - 'region_bin', 'signalPeptide_bin', 'modifiedResidue_bin', - 'zincFinger_bin', 'motif_bin', 'coiledCoil_bin', 'peptide_bin', - 'transitPeptide_bin', 'glycosylation_bin', 'propeptide_bin', 'disulfide_dist', 'intMet_dist', - 'intramembrane_dist', - 'naturalVariant_dist', 'dnaBinding_dist', 'activeSite_dist', - 'nucleotideBinding_dist', 'lipidation_dist', 'site_dist', - 'transmembrane_dist', 'crosslink_dist', 'mutagenesis_dist', - 'strand_dist', 'helix_dist', 'turn_dist', 'metalBinding_dist', - 'repeat_dist', 'caBinding_dist', 'topologicalDomain_dist', - 'bindingSite_dist', 'region_dist', 'signalPeptide_dist', - 'modifiedResidue_dist', 'zincFinger_dist', 'motif_dist', - 'coiledCoil_dist', 'peptide_dist', 'transitPeptide_dist', - 'glycosylation_dist', 'propeptide_dist']] - ready = data.copy() - # Imputation - if (impute == 'True') or (impute == 'true') or (impute == True): - filler = [17.84, 30.8, 24.96, 13.12, 23.62, 18.97, 20.87, 29.59, 20.7, 12.7, 22.85, 17.21, 9.8, 9, 15.99, - 16.82, - 20.46, 24.58, 9.99, 17.43, 20.08, 30.91, 20.86, 22.14, 21.91, 28.45, 17.81, 25.12, 20.33, 22.36] - col_index = 0 - for col_ in ready.columns[-30:]: - ready[col_] = ready[col_].fillna(filler[col_index]) - ready[col_] = ready[col_].replace({'nan': filler[col_index]}) - col_index += 1 - ready['domains_3Ddist'] = ready['domains_3Ddist'].fillna(24.5) - ready['sasa'] = ready['sasa'].fillna(29.5) - ready['location_3state'] = ready['location_3state'].fillna('unknown') - elif (impute == 'False') or (impute == 'false') or (impute == False): - pass - ready = ready.replace({'nan': np.NaN}) - ready.to_csv(path_to_output_files / 'featurevector_pdb.txt', sep='\t', index=False) - if len(ready) == 0: - print( - 'No feature vector could be produced for input data. Please check the presence of a structure for the input proteins.') - print(ready) - print('Feature vector successfully created...') - return ready - - end = timer() - hours, rem = divmod(end - start, 3600) - minutes, seconds = divmod(rem, 60) - print("Time passed: {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds)) - return ready \ No newline at end of file + modbase = pd.DataFrame() + no_modbase_models_updated = pd.DataFrame() + + # st.write('======PDB==========') + # st.write(pdb.to_string()) + # st.write('======SWISS==========') + # st.write(swiss.to_string()) + # st.write('======MODBASE==========') + # st.write(modbase.to_string()) + + + + allData = pd.concat([pdb, swiss, modbase]) + allData.reset_index(inplace=True, drop=True) + allData.replace({np.NaN: ''}, inplace=True) + # print('======ALL DATA==========') + # print(allData.to_string()) + if len(allData)>0: + allData.distance.replace({-1000: ''}, inplace=True) + + + # Get interface positions from ECLAIR. Download HQ human + print() + print('Assigning surface regions...') + print('------------------------------------\n') + + print('Extracting interface residues...\n') + data_interface = pd.read_csv(path_to_interfaces, sep='\t') + + positions = get_interface_positions(data_interface, 'P1', 'P2') + + interface_dataframe = pd.DataFrame() + + for key, val in positions.items(): + k = pd.Series((key, str(list(set(val))))) + interface_dataframe = interface_dataframe.append(k, ignore_index=True) + interface_dataframe.columns = ['uniprotID', 'positions'] + data = finalTouch(allData) + data = data.merge(interface_dataframe, on='uniprotID', how='left') + data.positions = data.positions.astype('str') + for i in data.index: + if (str(data.at[i, 'pos']) in data.at[i, 'positions']) and data.at[i, 'trsh4'] == 'surface': + data.at[i, 'threeState_trsh4_HQ'] = 'interface' + elif (str(data.at[i, 'pos']) not in data.at[i, 'positions']) and data.at[i, 'trsh4'] == 'surface': + data.at[i, 'threeState_trsh4_HQ'] = 'surface' + elif (str(data.at[i, 'pos']) not in data.at[i, 'positions']) and data.at[i, 'trsh4'] == 'core': + data.at[i, 'threeState_trsh4_HQ'] = 'core' + elif (str(data.at[i, 'pos']) in data.at[i, 'positions']) and data.at[i, 'trsh4'] == 'core': + data.at[i, 'threeState_trsh4_HQ'] = 'conflict' + elif data.at[i, 'trsh4'] == 'nan': + data.at[i, 'threeState_trsh4_HQ'] = 'nan' + + data.drop(['positions'], axis=1, inplace=True) + + fisherResult = pd.read_csv(fisher_path, sep='\t') + significant_domains = fisherResult.domain.to_list() + for i in data.index: + if data.at[i, 'domain'] in significant_domains: + data.at[i, 'domain_fisher'] = data.at[i, 'domain'] + else: + data.at[i, 'domain_fisher'] = 'NULL' + print('Final adjustments are being done...\n') + binaryCols = UNIPROT_ANNOTATION_COLS[-30:] + data = data.astype(str) + data.replace({'NaN': 'nan'}, inplace=True) + for i in data.index: + for j in binaryCols: + data[j] = data[j].astype('str') + if (data.at[i, j] == '0') or (data.at[i, j] == '0.0'): + data.at[i, j] = '1' + elif data.at[i, j] == 'nan': + data.at[i, j] = '0' + elif (data.at[i, j] == '1') or (data.at[i, j] == '1.0'): + data.at[i, j] = '2' + + annotCols = UNIPROT_ANNOTATION_COLS[:30] + + for i in data.index: + for annot in annotCols: + binaryName = str(annot) + 'Binary' + if data.at[i, binaryName] == '2': + data.at[i, annot] = '0.0' + data.rename( + columns={'uniprotID': 'prot_uniprotAcc', 'wt': 'wt_residue', 'pos': 'position', 'mut': 'mut_residue', + 'datapoint': 'meta_merged', 'datapoint_disease': 'meta-lab_merged', 'label': 'source_db', + 'family': 'prot_family', 'domain': 'domains_all', 'domain_fisher': 'domains_sig', + 'distance': 'domains_3Ddist', 'threeState_trsh4_HQ': 'location_3state', + 'disulfideBinary': 'disulfide_bin', 'intMetBinary': 'intMet_bin', + 'intramembraneBinary': 'intramembrane_bin', + 'naturalVariantBinary': 'naturalVariant_bin', 'dnaBindingBinary': 'dnaBinding_bin', + 'activeSiteBinary': 'activeSite_bin', + 'nucleotideBindingBinary': 'nucleotideBinding_bin', 'lipidationBinary': 'lipidation_bin', + 'siteBinary': 'site_bin', + 'transmembraneBinary': 'transmembrane_bin', 'crosslinkBinary': 'crosslink_bin', + 'mutagenesisBinary': 'mutagenesis_bin', + 'strandBinary': 'strand_bin', 'helixBinary': 'helix_bin', 'turnBinary': 'turn_bin', + 'metalBindingBinary': 'metalBinding_bin', + 'repeatBinary': 'repeat_bin', 'topologicalDomainBinary': 'topologicalDomain_bin', + 'caBindingBinary': 'caBinding_bin', + 'bindingSiteBinary': 'bindingSite_bin', 'regionBinary': 'region_bin', + 'signalPeptideBinary': 'signalPeptide_bin', + 'modifiedResidueBinary': 'modifiedResidue_bin', 'zincFingerBinary': 'zincFinger_bin', + 'motifBinary': 'motif_bin', + 'coiledCoilBinary': 'coiledCoil_bin', 'peptideBinary': 'peptide_bin', + 'transitPeptideBinary': 'transitPeptide_bin', + 'glycosylationBinary': 'glycosylation_bin', 'propeptideBinary': 'propeptide_bin', + 'disulfide': 'disulfide_dist', 'intMet': 'intMet_dist', + 'intramembrane': 'intramembrane_dist', 'naturalVariant': 'naturalVariant_dist', + 'dnaBinding': 'dnaBinding_dist', 'activeSite': 'activeSite_dist', + 'nucleotideBinding': 'nucleotideBinding_dist', 'lipidation': 'lipidation_dist', + 'site': 'site_dist', + 'transmembrane': 'transmembrane_dist', 'crosslink': 'crosslink_dist', + 'mutagenesis': 'mutagenesis_dist', 'strand': 'strand_dist', 'helix': 'helix_dist', + 'turn': 'turn_dist', + 'metalBinding': 'metalBinding_dist', 'repeat': 'repeat_dist', + 'topologicalDomain': 'topologicalDomain_dist', 'caBinding': 'caBinding_dist', + 'bindingSite': 'bindingSite_dist', 'region': 'region_dist', + 'signalPeptide': 'signalPeptide_dist', 'modifiedResidue': 'modifiedResidue_dist', + 'zincFinger': 'zincFinger_dist', 'motif': 'motif_dist', 'coiledCoil': 'coiledCoil_dist', + 'peptide': 'peptide_dist', 'transitPeptide': 'transitPeptide_dist', + 'glycosylation': 'glycosylation_dist', 'propeptide': 'propeptide_dist'}, inplace=True) + + data = data[ + ['prot_uniprotAcc', 'wt_residue', 'mut_residue', 'position','Source', 'meta_merged', 'composition', 'polarity', + 'volume', + 'granthamScore', 'domains_all', + 'domains_sig', 'domains_3Ddist', 'sasa', 'location_3state', 'disulfide_bin', 'intMet_bin', + 'intramembrane_bin', 'naturalVariant_bin', 'dnaBinding_bin', + 'activeSite_bin', 'nucleotideBinding_bin', 'lipidation_bin', 'site_bin', + 'transmembrane_bin', 'crosslink_bin', 'mutagenesis_bin', 'strand_bin', + 'helix_bin', 'turn_bin', 'metalBinding_bin', 'repeat_bin', + 'caBinding_bin', 'topologicalDomain_bin', 'bindingSite_bin', + 'region_bin', 'signalPeptide_bin', 'modifiedResidue_bin', + 'zincFinger_bin', 'motif_bin', 'coiledCoil_bin', 'peptide_bin', + 'transitPeptide_bin', 'glycosylation_bin', 'propeptide_bin', 'disulfide_dist', 'intMet_dist', + 'intramembrane_dist', + 'naturalVariant_dist', 'dnaBinding_dist', 'activeSite_dist', + 'nucleotideBinding_dist', 'lipidation_dist', 'site_dist', + 'transmembrane_dist', 'crosslink_dist', 'mutagenesis_dist', + 'strand_dist', 'helix_dist', 'turn_dist', 'metalBinding_dist', + 'repeat_dist', 'caBinding_dist', 'topologicalDomain_dist', + 'bindingSite_dist', 'region_dist', 'signalPeptide_dist', + 'modifiedResidue_dist', 'zincFinger_dist', 'motif_dist', + 'coiledCoil_dist', 'peptide_dist', 'transitPeptide_dist', + 'glycosylation_dist', 'propeptide_dist']] + # Imputation + if (impute == 'True') or (impute == 'true') or (impute == True): + filler = [17.84, 30.8, 24.96, 13.12, 23.62, 18.97, 20.87, 29.59, 20.7, 12.7, 22.85, 17.21, 9.8, 9, 15.99, + 16.82, + 20.46, 24.58, 9.99, 17.43, 20.08, 30.91, 20.86, 22.14, 21.91, 28.45, 17.81, 25.12, 20.33, 22.36] + col_index = 0 + for col_ in data.columns[-30:]: + data[col_] = data[col_].fillna(filler[col_index]) + data[col_] = data[col_].replace({'nan': filler[col_index]}) + col_index += 1 + data['domains_3Ddist'] = data['domains_3Ddist'].fillna(24.5) + data['sasa'] = data['sasa'].fillna(29.5) + data['location_3state'] = data['location_3state'].fillna('unknown') + elif (impute == 'False') or (impute == 'false'): + pass + data = data.replace({'nan': np.NaN}) + data.domains_all = data.domains_all.replace({-1: 'NULL'}) + + # ready.to_csv(path_to_output_files / 'featurevector_pdb.txt', sep='\t', index=False) + if len(data) == 0: + print( + 'No feature vector could be produced for input data. Please check the presence of a structure for the input proteins.') + + data.to_csv(path_to_output_files / 'featurevector_pdb.txt', sep='\t', index=False) + + print('Feature vector successfully created...') + return data + + end = timer() + hours, rem = divmod(end - start, 3600) + minutes, seconds = divmod(rem, 60) + print("Time passed: {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds)) + #sys.stdout.close() + return data \ No newline at end of file