Spaces:

HUBioDataLab
/

ASCARIS

Running

App Files Files Community

fatmacankara committed on Aug 23, 2023

Commit

8d9c11e

1 Parent(s): debd6c0

Update code/pdb_featureVector.py

Browse files

Files changed (1) hide show

code/pdb_featureVector.py +209 -167

code/pdb_featureVector.py CHANGED Viewed

@@ -1,3 +1,53 @@
 # IMPORT NECESSARY MODULES AND LIBRARIES
 from timeit import default_timer as timer
 import xml.etree.ElementTree as ET
@@ -25,13 +75,13 @@ from Bio.PDB import PDBList
 from Bio import Align
 from Bio import SeqIO
 from Bio.PDB import *
 warnings.filterwarnings("ignore")
 start = timer()
 import streamlit as st
 # FUNCTIONS
 # FUNCTIONS
 from calc_pc_property import *
 from add_domains import *
@@ -57,14 +107,16 @@ def pdb(input_set, mode, impute):
     Add datapoint identifier and remove non-standard input.
     """
     data = clean_data(input_set)
-    path_to_input_files, path_to_output_files, path_to_domains, fisher_path, path_to_interfaces, buffer =  manage_files(mode)
     out_path = path_to_output_files / 'log.txt'
     sys.stdout = open(out_path, 'w')
     print('Creating directories...')
     annotation_list = ['disulfide', 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
                        'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis', 'strand',
-                       'helix', 'turn', 'metalBinding', 'repeat', 'topologicalDomain', 'caBinding', 'bindingSite', 'region',
                        'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', 'peptide',
                        'transitPeptide', 'glycosylation', 'propeptide']
@@ -139,12 +191,14 @@ def pdb(input_set, mode, impute):
                 if wt == can:
                     data.at[i, 'wt_sequence_match'] = 'm'
                 elif wt != can:
-                    isoList = isoform_fasta[isoform_fasta['uniprotID'] == data.at[i, 'uniprotID']].isoformSequence.to_list()
                     for k in isoList:
                         if len(k) >= int(data.at[i, 'pos']):
                             resInIso = k[int(int(data.at[i, 'pos']) - 1)]
                             if wt == resInIso:
-                                whichIsoform = isoform_fasta[isoform_fasta.isoformSequence == k].whichIsoform.to_list()[0]
                                 data.at[i, 'wt_sequence_match'] = 'i'
                                 data.at[i, 'whichIsoform'] = whichIsoform
                                 break
@@ -189,13 +243,13 @@ def pdb(input_set, mode, impute):
         for prot in protein:
             pdbs.append(get_pdb_ids(prot))
         print('PDBs', pdbs)
-        if len(pdbs)>=1:
             print('pdbs not empty')
             pdbs = [item for sublist in pdbs for item in sublist]
             print('NEW', pdbs)
         else:
             print('pdbs empty')
-            pdbs =[]
         print('Processing PDB structures...\n')
         if pdbs == []:
             print('No PDB structure found for the query. ')
@@ -218,8 +272,8 @@ def pdb(input_set, mode, impute):
         try:
             shutil.rmtree('obsolete')
         except OSError as e:
-            pass
-        existing_pdb = list(Path(path_to_output_files/'pdb_structures').glob("*"))
         st.write('existing_pdb')
         st.write(existing_pdb)
         existing_pdb = [str(i) for i in existing_pdb]
@@ -229,28 +283,15 @@ def pdb(input_set, mode, impute):
         for search in pdbs:
             st.write('searching for pdb:', search)
             try:
-                if search.lower() not in existing_pdb:
-                    path_pdb = 'out_files/pdb/pdb_structures'
-                    st.write('path for pdb: ',path_pdb)
-                    file = pdbl.retrieve_pdb_file(search, pdir=path_pdb, file_format="pdb")
-                    st.write('file: ',file)
-                    existing_pdb =  list(Path(path_to_output_files/'pdb_structures').glob("*"))
-                    st.write('after download:', existing_pdb)
-                    st.write(Path(path_to_output_files/'pdb_structures') == path_pdb)
-                    existing_pdb = list(path_pdb.glob("*"))
-                    st.write('after download:', existing_pdb)
-                else:
-                    print('PDB structure file exists..')
-                    for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")):
-                        filename_replace_ext = filename.with_suffix(".pdb")
-                        filename.rename(filename_replace_ext)
-                    file = Path(path_to_output_files / 'pdb_structures' / f'{search}.pdb')
-                    base = os.path.splitext(str(file))[0]
-                    base = '/'.join(base.split('/')[0:-1]) + '/pdb' + base.split('/')[-1]
-                    os.rename(file, base + ".ent")
-                    file = base + '.ent'
                 resolution_method = parser.get_structure(search, file)
                 for record in SeqIO.parse(file, "pdb-seqres"):
@@ -269,7 +310,7 @@ def pdb(input_set, mode, impute):
                 pdb_info.at[index, 'pdbID'] = 'nan'
                 pdb_info.at[index, 'chain'] = 'nan'
                 pdb_info.at[index, 'resolution'] = 'nan'
-            cnt +=1
         print()
         print('PDB file processing finished..')
         for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")):
@@ -323,13 +364,11 @@ def pdb(input_set, mode, impute):
                     TypeError
                     with_pdb.at[i, 'pdbInfo'] = 'nan'
-        with_pdb = with_pdb[['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore',
                              'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', 'pdbSequence',
                              'wt_sequence_match',
                              'whichIsoform', 'pdbID', 'resolution', 'chain', 'pdbInfo', 'datapoint']]
         # If the query data points are found in no_match_in_uniprot data frame, it will not give any results.
         # If the query data points are found in no_pdb data frame, it will be searched in the modbase and swiss_model steps.
         # If the query data points are found in with_pdb data frame, it will be searched in the following steps.
@@ -343,7 +382,8 @@ def pdb(input_set, mode, impute):
         if len(with_pdb) > 0:
             with_pdb = add_annotations(with_pdb)
         else:
-            new_cols = with_pdb.columns.to_list() + ['disulfide', 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding',
                                                      'activeSite',
                                                      'nucleotideBinding', 'lipidation', 'site', 'transmembrane',
                                                      'crosslink', 'mutagenesis', 'strand',
@@ -362,7 +402,7 @@ def pdb(input_set, mode, impute):
                                                      'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
                                                      'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
                                                      'glycosylationBinary', 'propeptideBinary']
-            with_pdb = pd.DataFrame(columns = new_cols)
         try:
             with_pdb.whichIsoform = with_pdb.whichIsoform.astype('str')
         except:
@@ -374,7 +414,7 @@ def pdb(input_set, mode, impute):
         with_pdb.replace({'[]': 'nan'}, inplace=True)
         with_pdb.replace({'nan-nan': 'nan'}, inplace=True)
         with_pdb.replace({'': 'nan'}, inplace=True)
         """
         STEP 7
         Do alignment for PDB
@@ -409,8 +449,7 @@ def pdb(input_set, mode, impute):
         existing_pdb = None
         with_pdb_size = len(with_pdb.drop_duplicates(['datapoint']))
         with_pdb = None
         print('Aligning sequences...\n')
         aligned_m = final_stage(dfM, annotation_list, Path(path_to_output_files / 'alignment_files'))
         aligned_nm = final_stage(dfNM, annotation_list, Path(path_to_output_files / 'alignment_files'))
@@ -433,7 +472,6 @@ def pdb(input_set, mode, impute):
         aligned_m = aligned_m.astype(str)
         aligned_nm = aligned_nm.astype(str)
         frames = [aligned_m, aligned_nm]
         after_up_pdb_alignment = pd.concat(frames, sort=False)
         if len(after_up_pdb_alignment) == 0:
@@ -456,7 +494,6 @@ def pdb(input_set, mode, impute):
             (after_up_pdb_alignment.pdbID != 'nan') & (after_up_pdb_alignment.mutationPositionOnPDB == 'nan')]
         no_pdb = no_pdb.copy()
         print('PDB matching is completed...\n')
         print('SUMMARY')
         print('-------')
@@ -471,7 +508,6 @@ def pdb(input_set, mode, impute):
         print('--%d will be searched in Swiss-Model database.\n' % (
                 len(yes_pdb_no_match.drop_duplicates(['datapoint'])) + len(no_pdb.drop_duplicates(['datapoint']))))
         dfM = None
         dfNM = None
         aligned_nm = None
@@ -527,7 +563,8 @@ def pdb(input_set, mode, impute):
             swiss_model = pd.read_csv(Path(path_to_input_files / 'swissmodel_structures.txt'), sep='\t',
                                       dtype=str, header=None, skiprows=1,
                                       names=['UniProtKB_ac', 'iso_id', 'uniprot_seq_length', 'uniprot_seq_md5',
-                                             'coordinate_id', 'provider', 'from', 'to', 'template', 'qmean', 'qmean_norm','seqid', 'url'])
         else:
             swiss_model = pd.DataFrame(
@@ -547,13 +584,13 @@ def pdb(input_set, mode, impute):
                 swiss_model.at[ind, 'whichIsoform'] = swiss_model.at[ind, 'iso_id'].split('-')[1]
             else:
                 swiss_model.at[ind, 'whichIsoform'] = 'nan'
-#        swiss_model.drop(['input'], axis=1, inplace=True)
         swiss_model = swiss_model[swiss_model.provider == 'SWISSMODEL']
         print('Index File Processed...\n')
         # Get relevant columns
-        swiss_model = swiss_model[['UniProtKB_ac', 'from', 'to', 'template', 'qmean_norm', 'seqid', 'url', 'whichIsoform']]
         # Sort models on qmean score and identity. Some proteins have more than one models, we will pick one.
         swiss_model = swiss_model.sort_values(by=['UniProtKB_ac', 'qmean_norm', 'seqid'], ascending=False)
         swiss_model.reset_index(inplace=True)
@@ -710,7 +747,6 @@ def pdb(input_set, mode, impute):
                                                                       ascending=[True, False])
         swiss_models_with_data1 = swiss_models_with_data1.drop_duplicates(['datapoint', 'template'])
         swiss_models_with_data1_dp = list(set(swiss_models_with_data1.datapoint.to_list()))
         swiss_models_with_data.reset_index(inplace=True)
         swiss_models_with_data.drop(['index'], axis=1, inplace=True)
@@ -727,7 +763,6 @@ def pdb(input_set, mode, impute):
         swiss_models_with_data = swiss_models_with_data1.copy()
         swiss_models_with_data.qmean_norm = swiss_models_with_data.qmean_norm.astype('float')
         swiss_models_with_data = swiss_models_with_data.sort_values(['uniprotID', 'wt', 'mut', 'qmean_norm'],
                                                                     axis=0, ascending=[True, True, True, False])
@@ -737,7 +772,8 @@ def pdb(input_set, mode, impute):
                                                                         keep='first')
         swiss_models_with_data.uniprotSequence = swiss_models_with_data.uniprotSequence.astype('str')
         swiss_models_with_data.pos = swiss_models_with_data.pos.astype('int')
-        len(swiss_models_with_data.drop_duplicates(['datapoint'])) + len(broken_swiss.drop_duplicates(['datapoint'])) + len(
             no_swiss_models_2.drop_duplicates(['datapoint'])) == len(to_swiss.drop_duplicates(['datapoint']))
         # This printed data here includes all possible models with different qualities,
         # because we may get a hit in either of them.
@@ -764,10 +800,10 @@ def pdb(input_set, mode, impute):
         swiss_models_with_data['uniprotSequence'] = swiss_models_with_data['uniprotSequence'].str.replace('U', 'C')
         swiss_models_with_data['pdbSequence'] = swiss_models_with_data['pdbSequence'].str.replace('U', 'C')
-        swiss_model_aligned = alignment(swiss_models_with_data, annotation_list, path_to_output_files / 'alignment_files')
         swiss_models_with_data = None
         if len(swiss_model_aligned) == 0:
             swiss_model_aligned = pd.DataFrame(columns=pdb_aligned.columns)
             swiss_model_aligned['qmean_norm'] = 'nan'
@@ -860,7 +896,7 @@ def pdb(input_set, mode, impute):
                     url = 'https://salilab.org/modbase/retrieve/modbase/?databaseID=' + protein
                     print(url)
                     req = requests.get(url)
-                    name = path_to_output_files / 'modbase_structures' /  f'{protein}.txt'
                     with open(name, 'wb') as f:
                         f.write(req.content)
                 else:
@@ -877,7 +913,7 @@ def pdb(input_set, mode, impute):
                                 individual.write(str('UniProt ID: ' + protein))
                                 individual.write('\n')
                                 individual.write(str(pdb.contents[3])[10:-11].strip())
-                        with open(path_to_output_files / 'modbase_structures_individual'/ f'{model_id}.txt',
                                   encoding="utf8") as f:
                             fasta = ''
                             chain = ''
@@ -960,7 +996,6 @@ def pdb(input_set, mode, impute):
             existing_modbase_models = None
             existing_modbase_models_ind = None
             model_info_added = model_info_added.drop(['UniprotID'], axis=1)
             model_info_added = model_info_added.rename(columns={'TargetBeg': 'from', 'TargetEnd': 'to',
                                                                 'PDBCode': 'template', 'PDBChain': 'chain',
@@ -1013,7 +1048,8 @@ def pdb(input_set, mode, impute):
             with_modbase_info = with_modbase_info.sort_values(['uniprotID', 'wt', 'mut', 'pos', 'score', 'from', 'to'],
                                                               axis=0,
                                                               ascending=[True, True, True, True, False, True, False])
-            with_modbase_info = with_modbase_info.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'fasta'], keep='first')
             with_modbase_info = with_modbase_info.replace({'[\'?\']': 'nan'})
             with_modbase_info = with_modbase_info.replace({'[]': 'nan'})
@@ -1027,7 +1063,6 @@ def pdb(input_set, mode, impute):
             with_modbase_info.reset_index(inplace=True)
             with_modbase_info.drop('index', axis=1, inplace=True)
             align = with_modbase_info[
                 with_modbase_info.fasta != 'nan']
             yes_pdb_no_match = with_modbase_info[
@@ -1046,7 +1081,6 @@ def pdb(input_set, mode, impute):
             modbase_aligned = modbase_aligned.astype(str)
             modbase_aligned = modbase_aligned.replace({'NaN': 'nan'})
             # Get the ones whose models couldn't be found. Add to no_modbase (i.e., nothing matched at all at this point).
             if len(with_modbase_info) != 0:
                 not_in_aligned = pd.concat([modbase_aligned.drop_duplicates(['datapoint']),
@@ -1054,29 +1088,30 @@ def pdb(input_set, mode, impute):
                     ['datapoint'],
                     keep=False)
             else:
-                not_in_aligned = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore',
-                                                       'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
-                                                       'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide',
-                                                       'intMet',
-                                                       'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
-                                                       'nucleotideBinding', 'lipidation', 'site', 'transmembrane',
-                                                       'crosslink',
-                                                       'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
-                                                       'topologicalDomain', 'caBinding', 'bindingSite', 'region',
-                                                       'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif',
-                                                       'coiledCoil',
-                                                       'peptide', 'transitPeptide', 'glycosylation', 'propeptide',
-                                                       'disulfide',
-                                                       'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding',
-                                                       'activeSite',
-                                                       'nucleotideBinding', 'lipidation', 'site', 'transmembrane',
-                                                       'crosslink',
-                                                       'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
-                                                       'topologicalDomain', 'caBinding', 'bindingSite', 'region',
-                                                       'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif',
-                                                       'coiledCoil',
-                                                       'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'from',
-                                                       'to', 'template', 'chain', 'score', 'pdbID', 'pdbSequence', 'fasta'])
             with_modbase_info = None
             if len(not_in_aligned) != 0:
                 not_models = pd.concat([yes_pdb_no_match.drop_duplicates(['datapoint']),
@@ -1093,7 +1128,8 @@ def pdb(input_set, mode, impute):
                 nan = modbase_aligned[modbase_aligned.mutationPositionOnPDB == 'nan']
                 not_nan = modbase_aligned[modbase_aligned.mutationPositionOnPDB != 'nan']
                 not_nan.score = not_nan.score.astype(float)
-                not_nan.sort_values(['datapoint', 'pdb_alignStatus', 'score'], ascending=[True, True, False], inplace=True)
                 not_nan = not_nan.sort_values(['datapoint', 'mutationPositionOnPDB', 'score'],
                                               ascending=[True, True, False])
@@ -1105,7 +1141,7 @@ def pdb(input_set, mode, impute):
             which_ones_are_match = pd.concat([not_nan, nan]).drop_duplicates(['datapoint'], keep='first')
             if len(which_ones_are_match) == 0:
                 which_ones_are_match = pd.DataFrame(
-                    columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore',
                              'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
                              'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet',
                              'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
@@ -1141,7 +1177,6 @@ def pdb(input_set, mode, impute):
             not_nan = None
             nan = None
             # merge not_in_align and modbase_not_match as they were both excluded from modbase match.
             # No model
@@ -1170,9 +1205,10 @@ def pdb(input_set, mode, impute):
             elif len(not_in_aligned) == 0 and len(modbase_not_match) == 0 and len(no_info) != 0:
                 rest = no_info
             else:
-                rest = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore',
-                                             'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
-                                             'wt_sequence_match', 'whichIsoform', 'datapoint'])
             rest = rest[to_swiss_columns]
             rest = rest.drop_duplicates()
@@ -1184,49 +1220,53 @@ def pdb(input_set, mode, impute):
         else:
-            modbase_match = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore',
-                                                  'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
-                                                  'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet',
-                                                  'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
-                                                  'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink',
-                                                  'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
-                                                  'topologicalDomain', 'caBinding', 'bindingSite', 'region',
-                                                  'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
-                                                  'peptide', 'transitPeptide', 'glycosylation', 'propeptide',
-                                                  'disulfideBinary', 'intMetBinary', 'intramembraneBinary',
-                                                  'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary',
-                                                  'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
-                                                  'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
-                                                  'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
-                                                  'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary',
-                                                  'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary',
-                                                  'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
-                                                  'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
-                                                  'glycosylationBinary', 'propeptideBinary', 'from', 'to', 'template',
-                                                  'chain', 'score', 'pdbID', 'pdbSequence', 'pdb_alignStatus',
-                                                  'mutationPositionOnPDB', 'domainStartonPDB', 'domainEndonPDB'])
-            not_in_aligned = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
-                                                   'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
-                                                   'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet',
-                                                   'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
-                                                   'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink',
-                                                   'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
-                                                   'topologicalDomain', 'caBinding', 'bindingSite', 'region',
-                                                   'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
-                                                   'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'disulfide',
-                                                   'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
-                                                   'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink',
-                                                   'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
-                                                   'topologicalDomain', 'caBinding', 'bindingSite', 'region',
-                                                   'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
-                                                   'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'from',
-                                                   'to', 'template', 'chain', 'score', 'pdbID', 'pdbSequence', 'fasta'])
-            no_info = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore',
-                                            'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
-                                            'wt_sequence_match', 'whichIsoform', 'datapoint'])
-            rest = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
-                                         'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
-                                         'wt_sequence_match', 'whichIsoform', 'datapoint'])
             rest = rest[to_swiss_columns]
             rest = rest.drop_duplicates()
@@ -1262,7 +1302,6 @@ def pdb(input_set, mode, impute):
         not_models = None
         modbase_not_match = None
         # Final corrections
         # Now 3D alignment.
@@ -1284,7 +1323,6 @@ def pdb(input_set, mode, impute):
         # Fix the axes and  merge all data.
         pdb.drop(['pdbInfo'], axis=1, inplace=True)
         pdb.rename(columns={'resolution': 'score'}, inplace=True)
         swiss.rename(columns={'qmean_norm': 'score'}, inplace=True)
@@ -1297,7 +1335,6 @@ def pdb(input_set, mode, impute):
         modbase['source'] = 'MODBASE'
         data = pd.concat([swiss, modbase, pdb])
         data.reset_index(inplace=True)
         data.drop(['index'], axis=1, inplace=True)
         data = data.astype('str')
@@ -1321,10 +1358,10 @@ def pdb(input_set, mode, impute):
         for pdbID in pdb_only.pdbID.to_list():
             if pdbID not in existing_free_sasa:
                 (run_freesasa(Path(path_to_output_files / 'pdb_structures' / f'{pdbID.lower()}.pdb'),
-                              Path(path_to_output_files / 'freesasa_files' / f'{pdbID.lower()}.txt'), include_hetatms=True,
                               outdir=None, force_rerun=False, file_type='pdb'))
         print('Calculation RSA for SwissModel Files...\n')
         swiss_only = data[data.source == 'SWISSMODEL']
         swiss_dp = []
@@ -1342,7 +1379,8 @@ def pdb(input_set, mode, impute):
         for pdbID in modbase_only.pdbID.to_list():
             if pdbID not in existing_free_sasa:
                 (run_freesasa(Path(path_to_output_files / 'modbase_structures_individual' / f'{pdbID.lower()}.txt'),
-                              Path(path_to_output_files / 'freesasa_files' / f'{pdbID.lower()}.txt'), include_hetatms=True,
                               outdir=None, force_rerun=False, file_type='pdb'))
         # This annotation list is different than the prev one, keep it.
@@ -1380,16 +1418,18 @@ def pdb(input_set, mode, impute):
             chain = data.at[i, 'chain']
             uniprotID = data.at[i, 'uniprotID']
             pdbID = data.at[i, 'pdbID']
-            alignments = get_alignments_3D(uniprotID, 'nan', pdb_path, pdbSequence, source, chain, pdbID, mode, Path(path_to_output_files / '3D_alignment'), file_format = 'gzip')
             mutPos = data.at[i, 'mutationPositionOnPDB']
             try:
-                coordMut = get_coords(mutPos, alignments , 'nan', 'nan', mode)[0]
             except:
                 ValueError
                 coordMut = 'nan'
             try:
                 sasa_pos = get_coords(mutPos, alignments, 'nan', 'nan', mode)[2]
-                data.at[i, 'sasa'] = sasa(data.at[i, 'source'], data.at[i, 'pdbID'], data.at[i, 'uniprotID'], sasa_pos, data.at[i, 'wt'], mode, path_to_output_files,file_type = 'pdb')
             except:
                 ValueError
                 data.at[i, 'sasa'] = 'nan'  # mutation position is nan
@@ -1437,11 +1477,9 @@ def pdb(input_set, mode, impute):
             data.at[i, 'domaindistance3D'] = min(float(data.at[i, 'domainStartonPDB']),
                                                  float(data.at[i, 'domainEndonPDB']))
         data = data.astype(str)
         data.replace({'NaN': 'nan'}, inplace=True)
         # Now unify all 3 separate data. We have with_pdb. The ones that have pdb structures, swiss, modbase, the ones that didn't match with any and the ones that didn't have wt seq match.
         # Get interface positions from ECLAIR. Download HQ human
@@ -1462,28 +1500,29 @@ def pdb(input_set, mode, impute):
         interface_dataframe.columns = ['uniprotID', 'positions']
         if len(data) == 0:
-            data = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore',
-                                         'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
-                                         'pdbSequence', 'wt_sequence_match', 'whichIsoform', 'pdbID', 'score',
-                                         'chain', 'datapoint', 'disulfide', 'intMet', 'intramembrane',
-                                         'naturalVariant', 'dnaBinding', 'activeSite', 'nucleotideBinding',
-                                         'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis',
-                                         'strand', 'helix', 'turn', 'metalBinding', 'repeat',
-                                         'topologicalDomain', 'caBinding', 'bindingSite', 'region',
-                                         'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
-                                         'peptide', 'transitPeptide', 'glycosylation', 'propeptide',
-                                         'disulfideBinary', 'intMetBinary', 'intramembraneBinary',
-                                         'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary',
-                                         'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
-                                         'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
-                                         'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
-                                         'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary',
-                                         'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary',
-                                         'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
-                                         'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
-                                         'glycosylationBinary', 'propeptideBinary', 'pdb_alignStatus',
-                                         'mutationPositionOnPDB', 'domainStartonPDB', 'domainEndonPDB',
-                                         'source', 'sasa', 'domaindistance3D', 'threeState_trsh4_HQ', 'domain_fisher'])
         else:
             data.sasa = data.sasa.astype('str')
@@ -1522,7 +1561,6 @@ def pdb(input_set, mode, impute):
         data.drop(['positions'], axis=1, inplace=True)
         # OPTIONAL
         # DOMAIN SELECTION
         # Next step: Delete all other domains with 'NULL.' R is capable of handling 53 categories. We will keep 52 most
@@ -1541,7 +1579,8 @@ def pdb(input_set, mode, impute):
         # nan--> 0, 0 -->1 and 1 -->2
         print('Final adjustments are being done...\n')
-        binaryCols = ['disulfideBinary', 'intMetBinary', 'intramembraneBinary', 'naturalVariantBinary', 'dnaBindingBinary',
                       'activeSiteBinary', 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
                       'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
                       'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
@@ -1643,7 +1682,8 @@ def pdb(input_set, mode, impute):
         ready = data.copy()
         # Imputation
         if (impute == 'True') or (impute == 'true') or (impute == True):
-            filler = [17.84, 30.8, 24.96, 13.12, 23.62, 18.97, 20.87, 29.59, 20.7, 12.7, 22.85, 17.21, 9.8, 9, 15.99, 16.82,
                       20.46, 24.58, 9.99, 17.43, 20.08, 30.91, 20.86, 22.14, 21.91, 28.45, 17.81, 25.12, 20.33, 22.36]
             col_index = 0
             for col_ in ready.columns[-30:]:
@@ -1658,7 +1698,8 @@ def pdb(input_set, mode, impute):
         ready = ready.replace({'nan': np.NaN})
         ready.to_csv(path_to_output_files / 'featurevector_pdb.txt', sep='\t', index=False)
         if len(ready) == 0:
-            print('No feature vector could be produced for input data. Please check the presence of a structure for the input proteins.')
         print(ready)
         print('Feature vector successfully created...')
         return ready
@@ -1669,3 +1710,4 @@ def pdb(input_set, mode, impute):
     print("Time passed: {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
     sys.stdout.close()
     return ready

+Hugging
+Face
+'s logo
+Hugging
+Face
+Search
+models, datasets, users...
+Models
+Datasets
+Spaces
+Docs
+Solutions
+Pricing
+Spaces:
+HUBioDataLab
+/
+ASCARIS
+like
+0
+App
+Files
+Community
+Settings
+ASCARIS
+/
+code
+/
+pdb_featureVector.py
+fatmacankara
+'s picture
+fatmacankara
+Update
+code / pdb_featureVector.py
+debd6c0
+less
+than
+a
+minute
+ago
+raw
+history
+blame
+edit
+delete
+96
+kB
 # IMPORT NECESSARY MODULES AND LIBRARIES
 from timeit import default_timer as timer
 import xml.etree.ElementTree as ET
 from Bio import Align
 from Bio import SeqIO
 from Bio.PDB import *
 warnings.filterwarnings("ignore")
 start = timer()
 import streamlit as st
 # FUNCTIONS
 # FUNCTIONS
 from calc_pc_property import *
 from add_domains import *
     Add datapoint identifier and remove non-standard input.
     """
     data = clean_data(input_set)
+    path_to_input_files, path_to_output_files, path_to_domains, fisher_path, path_to_interfaces, buffer = manage_files(
+        mode)
     out_path = path_to_output_files / 'log.txt'
     sys.stdout = open(out_path, 'w')
     print('Creating directories...')
     annotation_list = ['disulfide', 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
                        'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis', 'strand',
+                       'helix', 'turn', 'metalBinding', 'repeat', 'topologicalDomain', 'caBinding', 'bindingSite',
+                       'region',
                        'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', 'peptide',
                        'transitPeptide', 'glycosylation', 'propeptide']
                 if wt == can:
                     data.at[i, 'wt_sequence_match'] = 'm'
                 elif wt != can:
+                    isoList = isoform_fasta[
+                        isoform_fasta['uniprotID'] == data.at[i, 'uniprotID']].isoformSequence.to_list()
                     for k in isoList:
                         if len(k) >= int(data.at[i, 'pos']):
                             resInIso = k[int(int(data.at[i, 'pos']) - 1)]
                             if wt == resInIso:
+                                whichIsoform = isoform_fasta[isoform_fasta.isoformSequence == k].whichIsoform.to_list()[
+                                    0]
                                 data.at[i, 'wt_sequence_match'] = 'i'
                                 data.at[i, 'whichIsoform'] = whichIsoform
                                 break
         for prot in protein:
             pdbs.append(get_pdb_ids(prot))
         print('PDBs', pdbs)
+        if len(pdbs) >= 1:
             print('pdbs not empty')
             pdbs = [item for sublist in pdbs for item in sublist]
             print('NEW', pdbs)
         else:
             print('pdbs empty')
+            pdbs = []
         print('Processing PDB structures...\n')
         if pdbs == []:
             print('No PDB structure found for the query. ')
         try:
             shutil.rmtree('obsolete')
         except OSError as e:
+            pass
+        existing_pdb = list(Path(path_to_output_files / 'pdb_structures').glob("*"))
         st.write('existing_pdb')
         st.write(existing_pdb)
         existing_pdb = [str(i) for i in existing_pdb]
         for search in pdbs:
             st.write('searching for pdb:', search)
             try:
+                path_pdb = 'out_files/pdb/pdb_structures'
+                st.write('path for pdb: ', path_pdb)
+                file = pdbl.retrieve_pdb_file(search, pdir=path_pdb, file_format="pdb")
+                st.write('file: ', file)
+                existing_pdb = list(Path(path_to_output_files / 'pdb_structures').glob("*"))
+                st.write('after download:', existing_pdb)
+                st.write(Path(path_to_output_files / 'pdb_structures') , path_pdb)
+                existing_pdb = list(path_pdb.glob("*"))
+                st.write('after download:', existing_pdb)
                 resolution_method = parser.get_structure(search, file)
                 for record in SeqIO.parse(file, "pdb-seqres"):
                 pdb_info.at[index, 'pdbID'] = 'nan'
                 pdb_info.at[index, 'chain'] = 'nan'
                 pdb_info.at[index, 'resolution'] = 'nan'
+            cnt += 1
         print()
         print('PDB file processing finished..')
         for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")):
                     TypeError
                     with_pdb.at[i, 'pdbInfo'] = 'nan'
+        with_pdb = with_pdb[['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
                              'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', 'pdbSequence',
                              'wt_sequence_match',
                              'whichIsoform', 'pdbID', 'resolution', 'chain', 'pdbInfo', 'datapoint']]
         # If the query data points are found in no_match_in_uniprot data frame, it will not give any results.
         # If the query data points are found in no_pdb data frame, it will be searched in the modbase and swiss_model steps.
         # If the query data points are found in with_pdb data frame, it will be searched in the following steps.
         if len(with_pdb) > 0:
             with_pdb = add_annotations(with_pdb)
         else:
+            new_cols = with_pdb.columns.to_list() + ['disulfide', 'intMet', 'intramembrane', 'naturalVariant',
+                                                     'dnaBinding',
                                                      'activeSite',
                                                      'nucleotideBinding', 'lipidation', 'site', 'transmembrane',
                                                      'crosslink', 'mutagenesis', 'strand',
                                                      'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
                                                      'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
                                                      'glycosylationBinary', 'propeptideBinary']
+            with_pdb = pd.DataFrame(columns=new_cols)
         try:
             with_pdb.whichIsoform = with_pdb.whichIsoform.astype('str')
         except:
         with_pdb.replace({'[]': 'nan'}, inplace=True)
         with_pdb.replace({'nan-nan': 'nan'}, inplace=True)
         with_pdb.replace({'': 'nan'}, inplace=True)
         """
         STEP 7
         Do alignment for PDB
         existing_pdb = None
         with_pdb_size = len(with_pdb.drop_duplicates(['datapoint']))
         with_pdb = None
         print('Aligning sequences...\n')
         aligned_m = final_stage(dfM, annotation_list, Path(path_to_output_files / 'alignment_files'))
         aligned_nm = final_stage(dfNM, annotation_list, Path(path_to_output_files / 'alignment_files'))
         aligned_m = aligned_m.astype(str)
         aligned_nm = aligned_nm.astype(str)
         frames = [aligned_m, aligned_nm]
         after_up_pdb_alignment = pd.concat(frames, sort=False)
         if len(after_up_pdb_alignment) == 0:
             (after_up_pdb_alignment.pdbID != 'nan') & (after_up_pdb_alignment.mutationPositionOnPDB == 'nan')]
         no_pdb = no_pdb.copy()
         print('PDB matching is completed...\n')
         print('SUMMARY')
         print('-------')
         print('--%d will be searched in Swiss-Model database.\n' % (
                 len(yes_pdb_no_match.drop_duplicates(['datapoint'])) + len(no_pdb.drop_duplicates(['datapoint']))))
         dfM = None
         dfNM = None
         aligned_nm = None
             swiss_model = pd.read_csv(Path(path_to_input_files / 'swissmodel_structures.txt'), sep='\t',
                                       dtype=str, header=None, skiprows=1,
                                       names=['UniProtKB_ac', 'iso_id', 'uniprot_seq_length', 'uniprot_seq_md5',
+                                             'coordinate_id', 'provider', 'from', 'to', 'template', 'qmean',
+                                             'qmean_norm', 'seqid', 'url'])
         else:
             swiss_model = pd.DataFrame(
                 swiss_model.at[ind, 'whichIsoform'] = swiss_model.at[ind, 'iso_id'].split('-')[1]
             else:
                 swiss_model.at[ind, 'whichIsoform'] = 'nan'
+        #        swiss_model.drop(['input'], axis=1, inplace=True)
         swiss_model = swiss_model[swiss_model.provider == 'SWISSMODEL']
         print('Index File Processed...\n')
         # Get relevant columns
+        swiss_model = swiss_model[
+            ['UniProtKB_ac', 'from', 'to', 'template', 'qmean_norm', 'seqid', 'url', 'whichIsoform']]
         # Sort models on qmean score and identity. Some proteins have more than one model; we will pick one.
         swiss_model = swiss_model.sort_values(by=['UniProtKB_ac', 'qmean_norm', 'seqid'], ascending=False)
         swiss_model.reset_index(inplace=True)
                                                                       ascending=[True, False])
         swiss_models_with_data1 = swiss_models_with_data1.drop_duplicates(['datapoint', 'template'])
         swiss_models_with_data1_dp = list(set(swiss_models_with_data1.datapoint.to_list()))
         swiss_models_with_data.reset_index(inplace=True)
         swiss_models_with_data.drop(['index'], axis=1, inplace=True)
         swiss_models_with_data = swiss_models_with_data1.copy()
         swiss_models_with_data.qmean_norm = swiss_models_with_data.qmean_norm.astype('float')
         swiss_models_with_data = swiss_models_with_data.sort_values(['uniprotID', 'wt', 'mut', 'qmean_norm'],
                                                                     axis=0, ascending=[True, True, True, False])
                                                                         keep='first')
         swiss_models_with_data.uniprotSequence = swiss_models_with_data.uniprotSequence.astype('str')
         swiss_models_with_data.pos = swiss_models_with_data.pos.astype('int')
+        len(swiss_models_with_data.drop_duplicates(['datapoint'])) + len(
+            broken_swiss.drop_duplicates(['datapoint'])) + len(
             no_swiss_models_2.drop_duplicates(['datapoint'])) == len(to_swiss.drop_duplicates(['datapoint']))
         # This printed data here includes all possible models with different qualities,
         # because we may get a hit in either of them.
         swiss_models_with_data['uniprotSequence'] = swiss_models_with_data['uniprotSequence'].str.replace('U', 'C')
         swiss_models_with_data['pdbSequence'] = swiss_models_with_data['pdbSequence'].str.replace('U', 'C')
+        swiss_model_aligned = alignment(swiss_models_with_data, annotation_list,
+                                        path_to_output_files / 'alignment_files')
         swiss_models_with_data = None
         if len(swiss_model_aligned) == 0:
             swiss_model_aligned = pd.DataFrame(columns=pdb_aligned.columns)
             swiss_model_aligned['qmean_norm'] = 'nan'
                     url = 'https://salilab.org/modbase/retrieve/modbase/?databaseID=' + protein
                     print(url)
                     req = requests.get(url)
+                    name = path_to_output_files / 'modbase_structures' / f'{protein}.txt'
                     with open(name, 'wb') as f:
                         f.write(req.content)
                 else:
                                 individual.write(str('UniProt ID: ' + protein))
                                 individual.write('\n')
                                 individual.write(str(pdb.contents[3])[10:-11].strip())
+                        with open(path_to_output_files / 'modbase_structures_individual' / f'{model_id}.txt',
                                   encoding="utf8") as f:
                             fasta = ''
                             chain = ''
             existing_modbase_models = None
             existing_modbase_models_ind = None
             model_info_added = model_info_added.drop(['UniprotID'], axis=1)
             model_info_added = model_info_added.rename(columns={'TargetBeg': 'from', 'TargetEnd': 'to',
                                                                 'PDBCode': 'template', 'PDBChain': 'chain',
             with_modbase_info = with_modbase_info.sort_values(['uniprotID', 'wt', 'mut', 'pos', 'score', 'from', 'to'],
                                                               axis=0,
                                                               ascending=[True, True, True, True, False, True, False])
+            with_modbase_info = with_modbase_info.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'fasta'],
+                                                                  keep='first')
             with_modbase_info = with_modbase_info.replace({'[\'?\']': 'nan'})
             with_modbase_info = with_modbase_info.replace({'[]': 'nan'})
             with_modbase_info.reset_index(inplace=True)
             with_modbase_info.drop('index', axis=1, inplace=True)
             align = with_modbase_info[
                 with_modbase_info.fasta != 'nan']
             yes_pdb_no_match = with_modbase_info[
             modbase_aligned = modbase_aligned.astype(str)
             modbase_aligned = modbase_aligned.replace({'NaN': 'nan'})
             # Get the ones whose models couldn't be found. Add to no_modbase (i.e., nothing has matched at this point.)
             if len(with_modbase_info) != 0:
                 not_in_aligned = pd.concat([modbase_aligned.drop_duplicates(['datapoint']),
                     ['datapoint'],
                     keep=False)
             else:
+                not_in_aligned = pd.DataFrame(
+                    columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
+                             'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
+                             'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide',
+                             'intMet',
+                             'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
+                             'nucleotideBinding', 'lipidation', 'site', 'transmembrane',
+                             'crosslink',
+                             'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
+                             'topologicalDomain', 'caBinding', 'bindingSite', 'region',
+                             'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif',
+                             'coiledCoil',
+                             'peptide', 'transitPeptide', 'glycosylation', 'propeptide',
+                             'disulfide',
+                             'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding',
+                             'activeSite',
+                             'nucleotideBinding', 'lipidation', 'site', 'transmembrane',
+                             'crosslink',
+                             'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
+                             'topologicalDomain', 'caBinding', 'bindingSite', 'region',
+                             'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif',
+                             'coiledCoil',
+                             'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'from',
+                             'to', 'template', 'chain', 'score', 'pdbID', 'pdbSequence', 'fasta'])
             with_modbase_info = None
             if len(not_in_aligned) != 0:
                 not_models = pd.concat([yes_pdb_no_match.drop_duplicates(['datapoint']),
                 nan = modbase_aligned[modbase_aligned.mutationPositionOnPDB == 'nan']
                 not_nan = modbase_aligned[modbase_aligned.mutationPositionOnPDB != 'nan']
                 not_nan.score = not_nan.score.astype(float)
+                not_nan.sort_values(['datapoint', 'pdb_alignStatus', 'score'], ascending=[True, True, False],
+                                    inplace=True)
                 not_nan = not_nan.sort_values(['datapoint', 'mutationPositionOnPDB', 'score'],
                                               ascending=[True, True, False])
             which_ones_are_match = pd.concat([not_nan, nan]).drop_duplicates(['datapoint'], keep='first')
             if len(which_ones_are_match) == 0:
                 which_ones_are_match = pd.DataFrame(
+                    columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
                              'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
                              'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet',
                              'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
             not_nan = None
             nan = None
             # merge not_in_align and modbase_not_match as they were both excluded from modbase match.
             # No model
             elif len(not_in_aligned) == 0 and len(modbase_not_match) == 0 and len(no_info) != 0:
                 rest = no_info
             else:
+                rest = pd.DataFrame(
+                    columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
+                             'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
+                             'wt_sequence_match', 'whichIsoform', 'datapoint'])
             rest = rest[to_swiss_columns]
             rest = rest.drop_duplicates()
         else:
+            modbase_match = pd.DataFrame(
+                columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
+                         'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
+                         'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet',
+                         'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
+                         'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink',
+                         'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
+                         'topologicalDomain', 'caBinding', 'bindingSite', 'region',
+                         'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
+                         'peptide', 'transitPeptide', 'glycosylation', 'propeptide',
+                         'disulfideBinary', 'intMetBinary', 'intramembraneBinary',
+                         'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary',
+                         'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
+                         'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
+                         'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
+                         'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary',
+                         'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary',
+                         'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
+                         'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
+                         'glycosylationBinary', 'propeptideBinary', 'from', 'to', 'template',
+                         'chain', 'score', 'pdbID', 'pdbSequence', 'pdb_alignStatus',
+                         'mutationPositionOnPDB', 'domainStartonPDB', 'domainEndonPDB'])
+            not_in_aligned = pd.DataFrame(
+                columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
+                         'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
+                         'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet',
+                         'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
+                         'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink',
+                         'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
+                         'topologicalDomain', 'caBinding', 'bindingSite', 'region',
+                         'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
+                         'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'disulfide',
+                         'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
+                         'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink',
+                         'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
+                         'topologicalDomain', 'caBinding', 'bindingSite', 'region',
+                         'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
+                         'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'from',
+                         'to', 'template', 'chain', 'score', 'pdbID', 'pdbSequence', 'fasta'])
+            no_info = pd.DataFrame(
+                columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
+                         'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
+                         'wt_sequence_match', 'whichIsoform', 'datapoint'])
+            rest = pd.DataFrame(
+                columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
+                         'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
+                         'wt_sequence_match', 'whichIsoform', 'datapoint'])
             rest = rest[to_swiss_columns]
             rest = rest.drop_duplicates()
         not_models = None
         modbase_not_match = None
         # Final corrections
         # Now 3D alignment.
         # Fix the axes and  merge all data.
         pdb.drop(['pdbInfo'], axis=1, inplace=True)
         pdb.rename(columns={'resolution': 'score'}, inplace=True)
         swiss.rename(columns={'qmean_norm': 'score'}, inplace=True)
         modbase['source'] = 'MODBASE'
         data = pd.concat([swiss, modbase, pdb])
         data.reset_index(inplace=True)
         data.drop(['index'], axis=1, inplace=True)
         data = data.astype('str')
         for pdbID in pdb_only.pdbID.to_list():
             if pdbID not in existing_free_sasa:
                 (run_freesasa(Path(path_to_output_files / 'pdb_structures' / f'{pdbID.lower()}.pdb'),
+                              Path(path_to_output_files / 'freesasa_files' / f'{pdbID.lower()}.txt'),
+                              include_hetatms=True,
                               outdir=None, force_rerun=False, file_type='pdb'))
         print('Calculation RSA for SwissModel Files...\n')
         swiss_only = data[data.source == 'SWISSMODEL']
         swiss_dp = []
         for pdbID in modbase_only.pdbID.to_list():
             if pdbID not in existing_free_sasa:
                 (run_freesasa(Path(path_to_output_files / 'modbase_structures_individual' / f'{pdbID.lower()}.txt'),
+                              Path(path_to_output_files / 'freesasa_files' / f'{pdbID.lower()}.txt'),
+                              include_hetatms=True,
                               outdir=None, force_rerun=False, file_type='pdb'))
         # This annotation list is different than the prev one, keep it.
             chain = data.at[i, 'chain']
             uniprotID = data.at[i, 'uniprotID']
             pdbID = data.at[i, 'pdbID']
+            alignments = get_alignments_3D(uniprotID, 'nan', pdb_path, pdbSequence, source, chain, pdbID, mode,
+                                           Path(path_to_output_files / '3D_alignment'), file_format='gzip')
             mutPos = data.at[i, 'mutationPositionOnPDB']
             try:
+                coordMut = get_coords(mutPos, alignments, 'nan', 'nan', mode)[0]
             except:
                 ValueError
                 coordMut = 'nan'
             try:
                 sasa_pos = get_coords(mutPos, alignments, 'nan', 'nan', mode)[2]
+                data.at[i, 'sasa'] = sasa(data.at[i, 'source'], data.at[i, 'pdbID'], data.at[i, 'uniprotID'], sasa_pos,
+                                          data.at[i, 'wt'], mode, path_to_output_files, file_type='pdb')
             except:
                 ValueError
                 data.at[i, 'sasa'] = 'nan'  # mutation position is nan
             data.at[i, 'domaindistance3D'] = min(float(data.at[i, 'domainStartonPDB']),
                                                  float(data.at[i, 'domainEndonPDB']))
         data = data.astype(str)
         data.replace({'NaN': 'nan'}, inplace=True)
         # Now unify all 3 separate data. We have with_pdb. The ones that have pdb structures, swiss, modbase, the ones that didn't match with any and the ones that didn't have wt seq match.
         # Get interface positions from ECLAIR. Download HQ human
         interface_dataframe.columns = ['uniprotID', 'positions']
         if len(data) == 0:
+            data = pd.DataFrame(
+                columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
+                         'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
+                         'pdbSequence', 'wt_sequence_match', 'whichIsoform', 'pdbID', 'score',
+                         'chain', 'datapoint', 'disulfide', 'intMet', 'intramembrane',
+                         'naturalVariant', 'dnaBinding', 'activeSite', 'nucleotideBinding',
+                         'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis',
+                         'strand', 'helix', 'turn', 'metalBinding', 'repeat',
+                         'topologicalDomain', 'caBinding', 'bindingSite', 'region',
+                         'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
+                         'peptide', 'transitPeptide', 'glycosylation', 'propeptide',
+                         'disulfideBinary', 'intMetBinary', 'intramembraneBinary',
+                         'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary',
+                         'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
+                         'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
+                         'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
+                         'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary',
+                         'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary',
+                         'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
+                         'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
+                         'glycosylationBinary', 'propeptideBinary', 'pdb_alignStatus',
+                         'mutationPositionOnPDB', 'domainStartonPDB', 'domainEndonPDB',
+                         'source', 'sasa', 'domaindistance3D', 'threeState_trsh4_HQ', 'domain_fisher'])
         else:
             data.sasa = data.sasa.astype('str')
         data.drop(['positions'], axis=1, inplace=True)
         # OPTIONAL
         # DOMAIN SELECTION
         # Next step: Delete all other domains with 'NULL.' R is capable of handling 53 categories. We will keep 52 most
         # nan--> 0, 0 -->1 and 1 -->2
         print('Final adjustments are being done...\n')
+        binaryCols = ['disulfideBinary', 'intMetBinary', 'intramembraneBinary', 'naturalVariantBinary',
+                      'dnaBindingBinary',
                       'activeSiteBinary', 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
                       'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
                       'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
         ready = data.copy()
         # Imputation
         if (impute == 'True') or (impute == 'true') or (impute == True):
+            filler = [17.84, 30.8, 24.96, 13.12, 23.62, 18.97, 20.87, 29.59, 20.7, 12.7, 22.85, 17.21, 9.8, 9, 15.99,
+                      16.82,
                       20.46, 24.58, 9.99, 17.43, 20.08, 30.91, 20.86, 22.14, 21.91, 28.45, 17.81, 25.12, 20.33, 22.36]
             col_index = 0
             for col_ in ready.columns[-30:]:
         ready = ready.replace({'nan': np.NaN})
         ready.to_csv(path_to_output_files / 'featurevector_pdb.txt', sep='\t', index=False)
         if len(ready) == 0:
+            print(
+                'No feature vector could be produced for input data. Please check the presence of a structure for the input proteins.')
         print(ready)
         print('Feature vector successfully created...')
         return ready
     print("Time passed: {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
     sys.stdout.close()
     return ready