fatmacankara committed on
Commit
8d9c11e
·
1 Parent(s): debd6c0

Update code/pdb_featureVector.py

Browse files
Files changed (1) hide show
  1. code/pdb_featureVector.py +209 -167
code/pdb_featureVector.py CHANGED
@@ -1,3 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # IMPORT NECESSARY MODULES AND LIBRARIES
2
  from timeit import default_timer as timer
3
  import xml.etree.ElementTree as ET
@@ -25,13 +75,13 @@ from Bio.PDB import PDBList
25
  from Bio import Align
26
  from Bio import SeqIO
27
  from Bio.PDB import *
 
28
  warnings.filterwarnings("ignore")
29
  start = timer()
30
  import streamlit as st
31
  # FUNCTIONS
32
 
33
 
34
-
35
  # FUNCTIONS
36
  from calc_pc_property import *
37
  from add_domains import *
@@ -57,14 +107,16 @@ def pdb(input_set, mode, impute):
57
  Add datapoint identifier and remove non-standard input.
58
  """
59
  data = clean_data(input_set)
60
- path_to_input_files, path_to_output_files, path_to_domains, fisher_path, path_to_interfaces, buffer = manage_files(mode)
 
61
  out_path = path_to_output_files / 'log.txt'
62
  sys.stdout = open(out_path, 'w')
63
  print('Creating directories...')
64
 
65
  annotation_list = ['disulfide', 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
66
  'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis', 'strand',
67
- 'helix', 'turn', 'metalBinding', 'repeat', 'topologicalDomain', 'caBinding', 'bindingSite', 'region',
 
68
  'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', 'peptide',
69
  'transitPeptide', 'glycosylation', 'propeptide']
70
 
@@ -139,12 +191,14 @@ def pdb(input_set, mode, impute):
139
  if wt == can:
140
  data.at[i, 'wt_sequence_match'] = 'm'
141
  elif wt != can:
142
- isoList = isoform_fasta[isoform_fasta['uniprotID'] == data.at[i, 'uniprotID']].isoformSequence.to_list()
 
143
  for k in isoList:
144
  if len(k) >= int(data.at[i, 'pos']):
145
  resInIso = k[int(int(data.at[i, 'pos']) - 1)]
146
  if wt == resInIso:
147
- whichIsoform = isoform_fasta[isoform_fasta.isoformSequence == k].whichIsoform.to_list()[0]
 
148
  data.at[i, 'wt_sequence_match'] = 'i'
149
  data.at[i, 'whichIsoform'] = whichIsoform
150
  break
@@ -189,13 +243,13 @@ def pdb(input_set, mode, impute):
189
  for prot in protein:
190
  pdbs.append(get_pdb_ids(prot))
191
  print('PDBs', pdbs)
192
- if len(pdbs)>=1:
193
  print('pdbs not empty')
194
  pdbs = [item for sublist in pdbs for item in sublist]
195
  print('NEW', pdbs)
196
  else:
197
  print('pdbs empty')
198
- pdbs =[]
199
  print('Processing PDB structures...\n')
200
  if pdbs == []:
201
  print('No PDB structure found for the query. ')
@@ -218,8 +272,8 @@ def pdb(input_set, mode, impute):
218
  try:
219
  shutil.rmtree('obsolete')
220
  except OSError as e:
221
- pass
222
- existing_pdb = list(Path(path_to_output_files/'pdb_structures').glob("*"))
223
  st.write('existing_pdb')
224
  st.write(existing_pdb)
225
  existing_pdb = [str(i) for i in existing_pdb]
@@ -229,28 +283,15 @@ def pdb(input_set, mode, impute):
229
  for search in pdbs:
230
  st.write('searching for pdb:', search)
231
  try:
232
- if search.lower() not in existing_pdb:
233
- path_pdb = 'out_files/pdb/pdb_structures'
234
- st.write('path for pdb: ',path_pdb)
235
- file = pdbl.retrieve_pdb_file(search, pdir=path_pdb, file_format="pdb")
236
- st.write('file: ',file)
237
- existing_pdb = list(Path(path_to_output_files/'pdb_structures').glob("*"))
238
- st.write('after download:', existing_pdb)
239
- st.write(Path(path_to_output_files/'pdb_structures') == path_pdb)
240
- existing_pdb = list(path_pdb.glob("*"))
241
- st.write('after download:', existing_pdb)
242
- else:
243
- print('PDB structure file exists..')
244
- for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")):
245
- filename_replace_ext = filename.with_suffix(".pdb")
246
- filename.rename(filename_replace_ext)
247
-
248
- file = Path(path_to_output_files / 'pdb_structures' / f'{search}.pdb')
249
-
250
- base = os.path.splitext(str(file))[0]
251
- base = '/'.join(base.split('/')[0:-1]) + '/pdb' + base.split('/')[-1]
252
- os.rename(file, base + ".ent")
253
- file = base + '.ent'
254
 
255
  resolution_method = parser.get_structure(search, file)
256
  for record in SeqIO.parse(file, "pdb-seqres"):
@@ -269,7 +310,7 @@ def pdb(input_set, mode, impute):
269
  pdb_info.at[index, 'pdbID'] = 'nan'
270
  pdb_info.at[index, 'chain'] = 'nan'
271
  pdb_info.at[index, 'resolution'] = 'nan'
272
- cnt +=1
273
  print()
274
  print('PDB file processing finished..')
275
  for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")):
@@ -323,13 +364,11 @@ def pdb(input_set, mode, impute):
323
  TypeError
324
  with_pdb.at[i, 'pdbInfo'] = 'nan'
325
 
326
- with_pdb = with_pdb[['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore',
327
  'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', 'pdbSequence',
328
  'wt_sequence_match',
329
  'whichIsoform', 'pdbID', 'resolution', 'chain', 'pdbInfo', 'datapoint']]
330
 
331
-
332
-
333
  # If the query data points are found in no_match_in_uniprot data frame, it will not give any results.
334
  # If the query data points are found in no_pdb data frame, it will be searched in the modbase and swiss_model steps.
335
  # If the query data points are found in with_pdb data frame, it will be searched in the following steps.
@@ -343,7 +382,8 @@ def pdb(input_set, mode, impute):
343
  if len(with_pdb) > 0:
344
  with_pdb = add_annotations(with_pdb)
345
  else:
346
- new_cols = with_pdb.columns.to_list() + ['disulfide', 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding',
 
347
  'activeSite',
348
  'nucleotideBinding', 'lipidation', 'site', 'transmembrane',
349
  'crosslink', 'mutagenesis', 'strand',
@@ -362,7 +402,7 @@ def pdb(input_set, mode, impute):
362
  'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
363
  'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
364
  'glycosylationBinary', 'propeptideBinary']
365
- with_pdb = pd.DataFrame(columns = new_cols)
366
  try:
367
  with_pdb.whichIsoform = with_pdb.whichIsoform.astype('str')
368
  except:
@@ -374,7 +414,7 @@ def pdb(input_set, mode, impute):
374
  with_pdb.replace({'[]': 'nan'}, inplace=True)
375
  with_pdb.replace({'nan-nan': 'nan'}, inplace=True)
376
  with_pdb.replace({'': 'nan'}, inplace=True)
377
-
378
  """
379
  STEP 7
380
  Do alignment for PDB
@@ -409,8 +449,7 @@ def pdb(input_set, mode, impute):
409
  existing_pdb = None
410
  with_pdb_size = len(with_pdb.drop_duplicates(['datapoint']))
411
  with_pdb = None
412
-
413
-
414
  print('Aligning sequences...\n')
415
  aligned_m = final_stage(dfM, annotation_list, Path(path_to_output_files / 'alignment_files'))
416
  aligned_nm = final_stage(dfNM, annotation_list, Path(path_to_output_files / 'alignment_files'))
@@ -433,7 +472,6 @@ def pdb(input_set, mode, impute):
433
  aligned_m = aligned_m.astype(str)
434
  aligned_nm = aligned_nm.astype(str)
435
 
436
-
437
  frames = [aligned_m, aligned_nm]
438
  after_up_pdb_alignment = pd.concat(frames, sort=False)
439
  if len(after_up_pdb_alignment) == 0:
@@ -456,7 +494,6 @@ def pdb(input_set, mode, impute):
456
  (after_up_pdb_alignment.pdbID != 'nan') & (after_up_pdb_alignment.mutationPositionOnPDB == 'nan')]
457
  no_pdb = no_pdb.copy()
458
 
459
-
460
  print('PDB matching is completed...\n')
461
  print('SUMMARY')
462
  print('-------')
@@ -471,7 +508,6 @@ def pdb(input_set, mode, impute):
471
  print('--%d will be searched in Swiss-Model database.\n' % (
472
  len(yes_pdb_no_match.drop_duplicates(['datapoint'])) + len(no_pdb.drop_duplicates(['datapoint']))))
473
 
474
-
475
  dfM = None
476
  dfNM = None
477
  aligned_nm = None
@@ -527,7 +563,8 @@ def pdb(input_set, mode, impute):
527
  swiss_model = pd.read_csv(Path(path_to_input_files / 'swissmodel_structures.txt'), sep='\t',
528
  dtype=str, header=None, skiprows=1,
529
  names=['UniProtKB_ac', 'iso_id', 'uniprot_seq_length', 'uniprot_seq_md5',
530
- 'coordinate_id', 'provider', 'from', 'to', 'template', 'qmean', 'qmean_norm','seqid', 'url'])
 
531
 
532
  else:
533
  swiss_model = pd.DataFrame(
@@ -547,13 +584,13 @@ def pdb(input_set, mode, impute):
547
  swiss_model.at[ind, 'whichIsoform'] = swiss_model.at[ind, 'iso_id'].split('-')[1]
548
  else:
549
  swiss_model.at[ind, 'whichIsoform'] = 'nan'
550
- # swiss_model.drop(['input'], axis=1, inplace=True)
551
  swiss_model = swiss_model[swiss_model.provider == 'SWISSMODEL']
552
  print('Index File Processed...\n')
553
 
554
-
555
  # Get relevant columns
556
- swiss_model = swiss_model[['UniProtKB_ac', 'from', 'to', 'template', 'qmean_norm', 'seqid', 'url', 'whichIsoform']]
 
557
  # Sort models on qmean score and identity. Some proteins have more than one model; we will pick one.
558
  swiss_model = swiss_model.sort_values(by=['UniProtKB_ac', 'qmean_norm', 'seqid'], ascending=False)
559
  swiss_model.reset_index(inplace=True)
@@ -710,7 +747,6 @@ def pdb(input_set, mode, impute):
710
  ascending=[True, False])
711
  swiss_models_with_data1 = swiss_models_with_data1.drop_duplicates(['datapoint', 'template'])
712
 
713
-
714
  swiss_models_with_data1_dp = list(set(swiss_models_with_data1.datapoint.to_list()))
715
  swiss_models_with_data.reset_index(inplace=True)
716
  swiss_models_with_data.drop(['index'], axis=1, inplace=True)
@@ -727,7 +763,6 @@ def pdb(input_set, mode, impute):
727
 
728
  swiss_models_with_data = swiss_models_with_data1.copy()
729
 
730
-
731
  swiss_models_with_data.qmean_norm = swiss_models_with_data.qmean_norm.astype('float')
732
  swiss_models_with_data = swiss_models_with_data.sort_values(['uniprotID', 'wt', 'mut', 'qmean_norm'],
733
  axis=0, ascending=[True, True, True, False])
@@ -737,7 +772,8 @@ def pdb(input_set, mode, impute):
737
  keep='first')
738
  swiss_models_with_data.uniprotSequence = swiss_models_with_data.uniprotSequence.astype('str')
739
  swiss_models_with_data.pos = swiss_models_with_data.pos.astype('int')
740
- len(swiss_models_with_data.drop_duplicates(['datapoint'])) + len(broken_swiss.drop_duplicates(['datapoint'])) + len(
 
741
  no_swiss_models_2.drop_duplicates(['datapoint'])) == len(to_swiss.drop_duplicates(['datapoint']))
742
  # This printed data here includes all possible models with different qualities,
743
  # because we may get a hit in either of them.
@@ -764,10 +800,10 @@ def pdb(input_set, mode, impute):
764
 
765
  swiss_models_with_data['uniprotSequence'] = swiss_models_with_data['uniprotSequence'].str.replace('U', 'C')
766
  swiss_models_with_data['pdbSequence'] = swiss_models_with_data['pdbSequence'].str.replace('U', 'C')
767
- swiss_model_aligned = alignment(swiss_models_with_data, annotation_list, path_to_output_files / 'alignment_files')
 
768
  swiss_models_with_data = None
769
 
770
-
771
  if len(swiss_model_aligned) == 0:
772
  swiss_model_aligned = pd.DataFrame(columns=pdb_aligned.columns)
773
  swiss_model_aligned['qmean_norm'] = 'nan'
@@ -860,7 +896,7 @@ def pdb(input_set, mode, impute):
860
  url = 'https://salilab.org/modbase/retrieve/modbase/?databaseID=' + protein
861
  print(url)
862
  req = requests.get(url)
863
- name = path_to_output_files / 'modbase_structures' / f'{protein}.txt'
864
  with open(name, 'wb') as f:
865
  f.write(req.content)
866
  else:
@@ -877,7 +913,7 @@ def pdb(input_set, mode, impute):
877
  individual.write(str('UniProt ID: ' + protein))
878
  individual.write('\n')
879
  individual.write(str(pdb.contents[3])[10:-11].strip())
880
- with open(path_to_output_files / 'modbase_structures_individual'/ f'{model_id}.txt',
881
  encoding="utf8") as f:
882
  fasta = ''
883
  chain = ''
@@ -960,7 +996,6 @@ def pdb(input_set, mode, impute):
960
  existing_modbase_models = None
961
  existing_modbase_models_ind = None
962
 
963
-
964
  model_info_added = model_info_added.drop(['UniprotID'], axis=1)
965
  model_info_added = model_info_added.rename(columns={'TargetBeg': 'from', 'TargetEnd': 'to',
966
  'PDBCode': 'template', 'PDBChain': 'chain',
@@ -1013,7 +1048,8 @@ def pdb(input_set, mode, impute):
1013
  with_modbase_info = with_modbase_info.sort_values(['uniprotID', 'wt', 'mut', 'pos', 'score', 'from', 'to'],
1014
  axis=0,
1015
  ascending=[True, True, True, True, False, True, False])
1016
- with_modbase_info = with_modbase_info.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'fasta'], keep='first')
 
1017
 
1018
  with_modbase_info = with_modbase_info.replace({'[\'?\']': 'nan'})
1019
  with_modbase_info = with_modbase_info.replace({'[]': 'nan'})
@@ -1027,7 +1063,6 @@ def pdb(input_set, mode, impute):
1027
  with_modbase_info.reset_index(inplace=True)
1028
  with_modbase_info.drop('index', axis=1, inplace=True)
1029
 
1030
-
1031
  align = with_modbase_info[
1032
  with_modbase_info.fasta != 'nan']
1033
  yes_pdb_no_match = with_modbase_info[
@@ -1046,7 +1081,6 @@ def pdb(input_set, mode, impute):
1046
  modbase_aligned = modbase_aligned.astype(str)
1047
  modbase_aligned = modbase_aligned.replace({'NaN': 'nan'})
1048
 
1049
-
1050
  # Get the ones whose models couldn't be found. Add to no_modbase (i.e., nothing has matched at this point).
1051
  if len(with_modbase_info) != 0:
1052
  not_in_aligned = pd.concat([modbase_aligned.drop_duplicates(['datapoint']),
@@ -1054,29 +1088,30 @@ def pdb(input_set, mode, impute):
1054
  ['datapoint'],
1055
  keep=False)
1056
  else:
1057
- not_in_aligned = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore',
1058
- 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
1059
- 'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide',
1060
- 'intMet',
1061
- 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
1062
- 'nucleotideBinding', 'lipidation', 'site', 'transmembrane',
1063
- 'crosslink',
1064
- 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
1065
- 'topologicalDomain', 'caBinding', 'bindingSite', 'region',
1066
- 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif',
1067
- 'coiledCoil',
1068
- 'peptide', 'transitPeptide', 'glycosylation', 'propeptide',
1069
- 'disulfide',
1070
- 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding',
1071
- 'activeSite',
1072
- 'nucleotideBinding', 'lipidation', 'site', 'transmembrane',
1073
- 'crosslink',
1074
- 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
1075
- 'topologicalDomain', 'caBinding', 'bindingSite', 'region',
1076
- 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif',
1077
- 'coiledCoil',
1078
- 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'from',
1079
- 'to', 'template', 'chain', 'score', 'pdbID', 'pdbSequence', 'fasta'])
 
1080
  with_modbase_info = None
1081
  if len(not_in_aligned) != 0:
1082
  not_models = pd.concat([yes_pdb_no_match.drop_duplicates(['datapoint']),
@@ -1093,7 +1128,8 @@ def pdb(input_set, mode, impute):
1093
  nan = modbase_aligned[modbase_aligned.mutationPositionOnPDB == 'nan']
1094
  not_nan = modbase_aligned[modbase_aligned.mutationPositionOnPDB != 'nan']
1095
  not_nan.score = not_nan.score.astype(float)
1096
- not_nan.sort_values(['datapoint', 'pdb_alignStatus', 'score'], ascending=[True, True, False], inplace=True)
 
1097
 
1098
  not_nan = not_nan.sort_values(['datapoint', 'mutationPositionOnPDB', 'score'],
1099
  ascending=[True, True, False])
@@ -1105,7 +1141,7 @@ def pdb(input_set, mode, impute):
1105
  which_ones_are_match = pd.concat([not_nan, nan]).drop_duplicates(['datapoint'], keep='first')
1106
  if len(which_ones_are_match) == 0:
1107
  which_ones_are_match = pd.DataFrame(
1108
- columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore',
1109
  'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
1110
  'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet',
1111
  'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
@@ -1141,7 +1177,6 @@ def pdb(input_set, mode, impute):
1141
  not_nan = None
1142
  nan = None
1143
 
1144
-
1145
  # merge not_in_align and modbase_not_match as they were both excluded from modbase match.
1146
 
1147
  # No model
@@ -1170,9 +1205,10 @@ def pdb(input_set, mode, impute):
1170
  elif len(not_in_aligned) == 0 and len(modbase_not_match) == 0 and len(no_info) != 0:
1171
  rest = no_info
1172
  else:
1173
- rest = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore',
1174
- 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
1175
- 'wt_sequence_match', 'whichIsoform', 'datapoint'])
 
1176
 
1177
  rest = rest[to_swiss_columns]
1178
  rest = rest.drop_duplicates()
@@ -1184,49 +1220,53 @@ def pdb(input_set, mode, impute):
1184
 
1185
  else:
1186
 
1187
- modbase_match = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore',
1188
- 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
1189
- 'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet',
1190
- 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
1191
- 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink',
1192
- 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
1193
- 'topologicalDomain', 'caBinding', 'bindingSite', 'region',
1194
- 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
1195
- 'peptide', 'transitPeptide', 'glycosylation', 'propeptide',
1196
- 'disulfideBinary', 'intMetBinary', 'intramembraneBinary',
1197
- 'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary',
1198
- 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
1199
- 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
1200
- 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
1201
- 'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary',
1202
- 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary',
1203
- 'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
1204
- 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
1205
- 'glycosylationBinary', 'propeptideBinary', 'from', 'to', 'template',
1206
- 'chain', 'score', 'pdbID', 'pdbSequence', 'pdb_alignStatus',
1207
- 'mutationPositionOnPDB', 'domainStartonPDB', 'domainEndonPDB'])
1208
- not_in_aligned = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
1209
- 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
1210
- 'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet',
1211
- 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
1212
- 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink',
1213
- 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
1214
- 'topologicalDomain', 'caBinding', 'bindingSite', 'region',
1215
- 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
1216
- 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'disulfide',
1217
- 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
1218
- 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink',
1219
- 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
1220
- 'topologicalDomain', 'caBinding', 'bindingSite', 'region',
1221
- 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
1222
- 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'from',
1223
- 'to', 'template', 'chain', 'score', 'pdbID', 'pdbSequence', 'fasta'])
1224
- no_info = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore',
1225
- 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
1226
- 'wt_sequence_match', 'whichIsoform', 'datapoint'])
1227
- rest = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
1228
- 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
1229
- 'wt_sequence_match', 'whichIsoform', 'datapoint'])
 
 
 
 
1230
 
1231
  rest = rest[to_swiss_columns]
1232
  rest = rest.drop_duplicates()
@@ -1262,7 +1302,6 @@ def pdb(input_set, mode, impute):
1262
  not_models = None
1263
  modbase_not_match = None
1264
 
1265
-
1266
  # Final corrections
1267
 
1268
  # Now 3D alignment.
@@ -1284,7 +1323,6 @@ def pdb(input_set, mode, impute):
1284
 
1285
  # Fix the axes and merge all data.
1286
 
1287
-
1288
  pdb.drop(['pdbInfo'], axis=1, inplace=True)
1289
  pdb.rename(columns={'resolution': 'score'}, inplace=True)
1290
  swiss.rename(columns={'qmean_norm': 'score'}, inplace=True)
@@ -1297,7 +1335,6 @@ def pdb(input_set, mode, impute):
1297
  modbase['source'] = 'MODBASE'
1298
  data = pd.concat([swiss, modbase, pdb])
1299
 
1300
-
1301
  data.reset_index(inplace=True)
1302
  data.drop(['index'], axis=1, inplace=True)
1303
  data = data.astype('str')
@@ -1321,10 +1358,10 @@ def pdb(input_set, mode, impute):
1321
  for pdbID in pdb_only.pdbID.to_list():
1322
  if pdbID not in existing_free_sasa:
1323
  (run_freesasa(Path(path_to_output_files / 'pdb_structures' / f'{pdbID.lower()}.pdb'),
1324
- Path(path_to_output_files / 'freesasa_files' / f'{pdbID.lower()}.txt'), include_hetatms=True,
 
1325
  outdir=None, force_rerun=False, file_type='pdb'))
1326
 
1327
-
1328
  print('Calculation RSA for SwissModel Files...\n')
1329
  swiss_only = data[data.source == 'SWISSMODEL']
1330
  swiss_dp = []
@@ -1342,7 +1379,8 @@ def pdb(input_set, mode, impute):
1342
  for pdbID in modbase_only.pdbID.to_list():
1343
  if pdbID not in existing_free_sasa:
1344
  (run_freesasa(Path(path_to_output_files / 'modbase_structures_individual' / f'{pdbID.lower()}.txt'),
1345
- Path(path_to_output_files / 'freesasa_files' / f'{pdbID.lower()}.txt'), include_hetatms=True,
 
1346
  outdir=None, force_rerun=False, file_type='pdb'))
1347
 
1348
  # This annotation list is different from the previous one; keep it.
@@ -1380,16 +1418,18 @@ def pdb(input_set, mode, impute):
1380
  chain = data.at[i, 'chain']
1381
  uniprotID = data.at[i, 'uniprotID']
1382
  pdbID = data.at[i, 'pdbID']
1383
- alignments = get_alignments_3D(uniprotID, 'nan', pdb_path, pdbSequence, source, chain, pdbID, mode, Path(path_to_output_files / '3D_alignment'), file_format = 'gzip')
 
1384
  mutPos = data.at[i, 'mutationPositionOnPDB']
1385
  try:
1386
- coordMut = get_coords(mutPos, alignments , 'nan', 'nan', mode)[0]
1387
  except:
1388
  ValueError
1389
  coordMut = 'nan'
1390
  try:
1391
  sasa_pos = get_coords(mutPos, alignments, 'nan', 'nan', mode)[2]
1392
- data.at[i, 'sasa'] = sasa(data.at[i, 'source'], data.at[i, 'pdbID'], data.at[i, 'uniprotID'], sasa_pos, data.at[i, 'wt'], mode, path_to_output_files,file_type = 'pdb')
 
1393
  except:
1394
  ValueError
1395
  data.at[i, 'sasa'] = 'nan' # mutation position is nan
@@ -1437,11 +1477,9 @@ def pdb(input_set, mode, impute):
1437
  data.at[i, 'domaindistance3D'] = min(float(data.at[i, 'domainStartonPDB']),
1438
  float(data.at[i, 'domainEndonPDB']))
1439
 
1440
-
1441
  data = data.astype(str)
1442
  data.replace({'NaN': 'nan'}, inplace=True)
1443
 
1444
-
1445
  # Now unify all 3 separate data. We have with_pdb (the ones that have PDB structures), swiss, modbase, the ones that didn't match with any, and the ones that didn't have a wt sequence match.
1446
 
1447
  # Get interface positions from ECLAIR. Download HQ human
@@ -1462,28 +1500,29 @@ def pdb(input_set, mode, impute):
1462
  interface_dataframe.columns = ['uniprotID', 'positions']
1463
 
1464
  if len(data) == 0:
1465
- data = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore',
1466
- 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
1467
- 'pdbSequence', 'wt_sequence_match', 'whichIsoform', 'pdbID', 'score',
1468
- 'chain', 'datapoint', 'disulfide', 'intMet', 'intramembrane',
1469
- 'naturalVariant', 'dnaBinding', 'activeSite', 'nucleotideBinding',
1470
- 'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis',
1471
- 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
1472
- 'topologicalDomain', 'caBinding', 'bindingSite', 'region',
1473
- 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
1474
- 'peptide', 'transitPeptide', 'glycosylation', 'propeptide',
1475
- 'disulfideBinary', 'intMetBinary', 'intramembraneBinary',
1476
- 'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary',
1477
- 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
1478
- 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
1479
- 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
1480
- 'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary',
1481
- 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary',
1482
- 'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
1483
- 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
1484
- 'glycosylationBinary', 'propeptideBinary', 'pdb_alignStatus',
1485
- 'mutationPositionOnPDB', 'domainStartonPDB', 'domainEndonPDB',
1486
- 'source', 'sasa', 'domaindistance3D', 'threeState_trsh4_HQ', 'domain_fisher'])
 
1487
  else:
1488
  data.sasa = data.sasa.astype('str')
1489
 
@@ -1522,7 +1561,6 @@ def pdb(input_set, mode, impute):
1522
 
1523
  data.drop(['positions'], axis=1, inplace=True)
1524
 
1525
-
1526
  # OPTIONAL
1527
  # DOMAIN SELECTION
1528
  # Next step: Delete all other domains with 'NULL.' R is capable of handling 53 categories. We will keep 52 most
@@ -1541,7 +1579,8 @@ def pdb(input_set, mode, impute):
1541
  # nan--> 0, 0 -->1 and 1 -->2
1542
 
1543
  print('Final adjustments are being done...\n')
1544
- binaryCols = ['disulfideBinary', 'intMetBinary', 'intramembraneBinary', 'naturalVariantBinary', 'dnaBindingBinary',
 
1545
  'activeSiteBinary', 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
1546
  'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
1547
  'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
@@ -1643,7 +1682,8 @@ def pdb(input_set, mode, impute):
1643
  ready = data.copy()
1644
  # Imputation
1645
  if (impute == 'True') or (impute == 'true') or (impute == True):
1646
- filler = [17.84, 30.8, 24.96, 13.12, 23.62, 18.97, 20.87, 29.59, 20.7, 12.7, 22.85, 17.21, 9.8, 9, 15.99, 16.82,
 
1647
  20.46, 24.58, 9.99, 17.43, 20.08, 30.91, 20.86, 22.14, 21.91, 28.45, 17.81, 25.12, 20.33, 22.36]
1648
  col_index = 0
1649
  for col_ in ready.columns[-30:]:
@@ -1658,7 +1698,8 @@ def pdb(input_set, mode, impute):
1658
  ready = ready.replace({'nan': np.NaN})
1659
  ready.to_csv(path_to_output_files / 'featurevector_pdb.txt', sep='\t', index=False)
1660
  if len(ready) == 0:
1661
- print('No feature vector could be produced for input data. Please check the presence of a structure for the input proteins.')
 
1662
  print(ready)
1663
  print('Feature vector successfully created...')
1664
  return ready
@@ -1669,3 +1710,4 @@ def pdb(input_set, mode, impute):
1669
  print("Time passed: {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
1670
  sys.stdout.close()
1671
  return ready
 
 
1
+ Hugging
2
+ Face
3
+ 's logo
4
+ Hugging
5
+ Face
6
+ Search
7
+ models, datasets, users...
8
+ Models
9
+ Datasets
10
+ Spaces
11
+ Docs
12
+ Solutions
13
+ Pricing
14
+
15
+ Spaces:
16
+
17
+ HUBioDataLab
18
+ /
19
+ ASCARIS
20
+
21
+ like
22
+ 0
23
+
24
+ App
25
+ Files
26
+ Community
27
+ Settings
28
+ ASCARIS
29
+ /
30
+ code
31
+ /
32
+ pdb_featureVector.py
33
+ fatmacankara
34
+ 's picture
35
+ fatmacankara
36
+ Update
37
+ code / pdb_featureVector.py
38
+ debd6c0
39
+ less
40
+ than
41
+ a
42
+ minute
43
+ ago
44
+ raw
45
+ history
46
+ blame
47
+ edit
48
+ delete
49
+ 96
50
+ kB
51
  # IMPORT NECESSARY MODULES AND LIBRARIES
52
  from timeit import default_timer as timer
53
  import xml.etree.ElementTree as ET
 
75
  from Bio import Align
76
  from Bio import SeqIO
77
  from Bio.PDB import *
78
+
79
  warnings.filterwarnings("ignore")
80
  start = timer()
81
  import streamlit as st
82
  # FUNCTIONS
83
 
84
 
 
85
  # FUNCTIONS
86
  from calc_pc_property import *
87
  from add_domains import *
 
107
  Add datapoint identifier and remove non-standard input.
108
  """
109
  data = clean_data(input_set)
110
+ path_to_input_files, path_to_output_files, path_to_domains, fisher_path, path_to_interfaces, buffer = manage_files(
111
+ mode)
112
  out_path = path_to_output_files / 'log.txt'
113
  sys.stdout = open(out_path, 'w')
114
  print('Creating directories...')
115
 
116
  annotation_list = ['disulfide', 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
117
  'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis', 'strand',
118
+ 'helix', 'turn', 'metalBinding', 'repeat', 'topologicalDomain', 'caBinding', 'bindingSite',
119
+ 'region',
120
  'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', 'peptide',
121
  'transitPeptide', 'glycosylation', 'propeptide']
122
 
 
191
  if wt == can:
192
  data.at[i, 'wt_sequence_match'] = 'm'
193
  elif wt != can:
194
+ isoList = isoform_fasta[
195
+ isoform_fasta['uniprotID'] == data.at[i, 'uniprotID']].isoformSequence.to_list()
196
  for k in isoList:
197
  if len(k) >= int(data.at[i, 'pos']):
198
  resInIso = k[int(int(data.at[i, 'pos']) - 1)]
199
  if wt == resInIso:
200
+ whichIsoform = isoform_fasta[isoform_fasta.isoformSequence == k].whichIsoform.to_list()[
201
+ 0]
202
  data.at[i, 'wt_sequence_match'] = 'i'
203
  data.at[i, 'whichIsoform'] = whichIsoform
204
  break
 
243
  for prot in protein:
244
  pdbs.append(get_pdb_ids(prot))
245
  print('PDBs', pdbs)
246
+ if len(pdbs) >= 1:
247
  print('pdbs not empty')
248
  pdbs = [item for sublist in pdbs for item in sublist]
249
  print('NEW', pdbs)
250
  else:
251
  print('pdbs empty')
252
+ pdbs = []
253
  print('Processing PDB structures...\n')
254
  if pdbs == []:
255
  print('No PDB structure found for the query. ')
 
272
  try:
273
  shutil.rmtree('obsolete')
274
  except OSError as e:
275
+ pass
276
+ existing_pdb = list(Path(path_to_output_files / 'pdb_structures').glob("*"))
277
  st.write('existing_pdb')
278
  st.write(existing_pdb)
279
  existing_pdb = [str(i) for i in existing_pdb]
 
283
  for search in pdbs:
284
  st.write('searching for pdb:', search)
285
  try:
286
+ path_pdb = 'out_files/pdb/pdb_structures'
287
+ st.write('path for pdb: ', path_pdb)
288
+ file = pdbl.retrieve_pdb_file(search, pdir=path_pdb, file_format="pdb")
289
+ st.write('file: ', file)
290
+ existing_pdb = list(Path(path_to_output_files / 'pdb_structures').glob("*"))
291
+ st.write('after download:', existing_pdb)
292
+ st.write(Path(path_to_output_files / 'pdb_structures') , path_pdb)
293
+ existing_pdb = list(path_pdb.glob("*"))
294
+ st.write('after download:', existing_pdb)
 
 
 
 
 
 
 
 
 
 
 
 
 
295
 
296
  resolution_method = parser.get_structure(search, file)
297
  for record in SeqIO.parse(file, "pdb-seqres"):
 
310
  pdb_info.at[index, 'pdbID'] = 'nan'
311
  pdb_info.at[index, 'chain'] = 'nan'
312
  pdb_info.at[index, 'resolution'] = 'nan'
313
+ cnt += 1
314
  print()
315
  print('PDB file processing finished..')
316
  for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")):
 
364
  TypeError
365
  with_pdb.at[i, 'pdbInfo'] = 'nan'
366
 
367
+ with_pdb = with_pdb[['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
368
  'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', 'pdbSequence',
369
  'wt_sequence_match',
370
  'whichIsoform', 'pdbID', 'resolution', 'chain', 'pdbInfo', 'datapoint']]
371
 
 
 
372
  # If the query data points are found in no_match_in_uniprot data frame, it will not give any results.
373
  # If the query data points are found in no_pdb data frame, it will be searched in the modbase and swiss_model steps.
374
  # If the query data points are found in with_pdb data frame, it will be searched in the following steps.
 
382
  if len(with_pdb) > 0:
383
  with_pdb = add_annotations(with_pdb)
384
  else:
385
+ new_cols = with_pdb.columns.to_list() + ['disulfide', 'intMet', 'intramembrane', 'naturalVariant',
386
+ 'dnaBinding',
387
  'activeSite',
388
  'nucleotideBinding', 'lipidation', 'site', 'transmembrane',
389
  'crosslink', 'mutagenesis', 'strand',
 
402
  'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
403
  'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
404
  'glycosylationBinary', 'propeptideBinary']
405
+ with_pdb = pd.DataFrame(columns=new_cols)
406
  try:
407
  with_pdb.whichIsoform = with_pdb.whichIsoform.astype('str')
408
  except:
 
414
  with_pdb.replace({'[]': 'nan'}, inplace=True)
415
  with_pdb.replace({'nan-nan': 'nan'}, inplace=True)
416
  with_pdb.replace({'': 'nan'}, inplace=True)
417
+
418
  """
419
  STEP 7
420
  Do alignment for PDB
 
449
  existing_pdb = None
450
  with_pdb_size = len(with_pdb.drop_duplicates(['datapoint']))
451
  with_pdb = None
452
+
 
453
  print('Aligning sequences...\n')
454
  aligned_m = final_stage(dfM, annotation_list, Path(path_to_output_files / 'alignment_files'))
455
  aligned_nm = final_stage(dfNM, annotation_list, Path(path_to_output_files / 'alignment_files'))
 
472
  aligned_m = aligned_m.astype(str)
473
  aligned_nm = aligned_nm.astype(str)
474
 
 
475
  frames = [aligned_m, aligned_nm]
476
  after_up_pdb_alignment = pd.concat(frames, sort=False)
477
  if len(after_up_pdb_alignment) == 0:
 
494
  (after_up_pdb_alignment.pdbID != 'nan') & (after_up_pdb_alignment.mutationPositionOnPDB == 'nan')]
495
  no_pdb = no_pdb.copy()
496
 
 
497
  print('PDB matching is completed...\n')
498
  print('SUMMARY')
499
  print('-------')
 
508
  print('--%d will be searched in Swiss-Model database.\n' % (
509
  len(yes_pdb_no_match.drop_duplicates(['datapoint'])) + len(no_pdb.drop_duplicates(['datapoint']))))
510
 
 
511
  dfM = None
512
  dfNM = None
513
  aligned_nm = None
 
563
  swiss_model = pd.read_csv(Path(path_to_input_files / 'swissmodel_structures.txt'), sep='\t',
564
  dtype=str, header=None, skiprows=1,
565
  names=['UniProtKB_ac', 'iso_id', 'uniprot_seq_length', 'uniprot_seq_md5',
566
+ 'coordinate_id', 'provider', 'from', 'to', 'template', 'qmean',
567
+ 'qmean_norm', 'seqid', 'url'])
568
 
569
  else:
570
  swiss_model = pd.DataFrame(
 
584
  swiss_model.at[ind, 'whichIsoform'] = swiss_model.at[ind, 'iso_id'].split('-')[1]
585
  else:
586
  swiss_model.at[ind, 'whichIsoform'] = 'nan'
587
+ # swiss_model.drop(['input'], axis=1, inplace=True)
588
  swiss_model = swiss_model[swiss_model.provider == 'SWISSMODEL']
589
  print('Index File Processed...\n')
590
 
 
591
  # Get relevant columns
592
+ swiss_model = swiss_model[
593
+ ['UniProtKB_ac', 'from', 'to', 'template', 'qmean_norm', 'seqid', 'url', 'whichIsoform']]
594
  # Sort models on qmean score and identity. Some proteins have more than one model; we will pick one.
595
  swiss_model = swiss_model.sort_values(by=['UniProtKB_ac', 'qmean_norm', 'seqid'], ascending=False)
596
  swiss_model.reset_index(inplace=True)
 
747
  ascending=[True, False])
748
  swiss_models_with_data1 = swiss_models_with_data1.drop_duplicates(['datapoint', 'template'])
749
 
 
750
  swiss_models_with_data1_dp = list(set(swiss_models_with_data1.datapoint.to_list()))
751
  swiss_models_with_data.reset_index(inplace=True)
752
  swiss_models_with_data.drop(['index'], axis=1, inplace=True)
 
763
 
764
  swiss_models_with_data = swiss_models_with_data1.copy()
765
 
 
766
  swiss_models_with_data.qmean_norm = swiss_models_with_data.qmean_norm.astype('float')
767
  swiss_models_with_data = swiss_models_with_data.sort_values(['uniprotID', 'wt', 'mut', 'qmean_norm'],
768
  axis=0, ascending=[True, True, True, False])
 
772
  keep='first')
773
  swiss_models_with_data.uniprotSequence = swiss_models_with_data.uniprotSequence.astype('str')
774
  swiss_models_with_data.pos = swiss_models_with_data.pos.astype('int')
775
+ len(swiss_models_with_data.drop_duplicates(['datapoint'])) + len(
776
+ broken_swiss.drop_duplicates(['datapoint'])) + len(
777
  no_swiss_models_2.drop_duplicates(['datapoint'])) == len(to_swiss.drop_duplicates(['datapoint']))
778
  # This printed data here includes all possible models with different qualities,
779
  # because we may get a hit in either of them.
 
800
 
801
  swiss_models_with_data['uniprotSequence'] = swiss_models_with_data['uniprotSequence'].str.replace('U', 'C')
802
  swiss_models_with_data['pdbSequence'] = swiss_models_with_data['pdbSequence'].str.replace('U', 'C')
803
+ swiss_model_aligned = alignment(swiss_models_with_data, annotation_list,
804
+ path_to_output_files / 'alignment_files')
805
  swiss_models_with_data = None
806
 
 
807
  if len(swiss_model_aligned) == 0:
808
  swiss_model_aligned = pd.DataFrame(columns=pdb_aligned.columns)
809
  swiss_model_aligned['qmean_norm'] = 'nan'
 
896
  url = 'https://salilab.org/modbase/retrieve/modbase/?databaseID=' + protein
897
  print(url)
898
  req = requests.get(url)
899
+ name = path_to_output_files / 'modbase_structures' / f'{protein}.txt'
900
  with open(name, 'wb') as f:
901
  f.write(req.content)
902
  else:
 
913
  individual.write(str('UniProt ID: ' + protein))
914
  individual.write('\n')
915
  individual.write(str(pdb.contents[3])[10:-11].strip())
916
+ with open(path_to_output_files / 'modbase_structures_individual' / f'{model_id}.txt',
917
  encoding="utf8") as f:
918
  fasta = ''
919
  chain = ''
 
996
  existing_modbase_models = None
997
  existing_modbase_models_ind = None
998
 
 
999
  model_info_added = model_info_added.drop(['UniprotID'], axis=1)
1000
  model_info_added = model_info_added.rename(columns={'TargetBeg': 'from', 'TargetEnd': 'to',
1001
  'PDBCode': 'template', 'PDBChain': 'chain',
 
1048
  with_modbase_info = with_modbase_info.sort_values(['uniprotID', 'wt', 'mut', 'pos', 'score', 'from', 'to'],
1049
  axis=0,
1050
  ascending=[True, True, True, True, False, True, False])
1051
+ with_modbase_info = with_modbase_info.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'fasta'],
1052
+ keep='first')
1053
 
1054
  with_modbase_info = with_modbase_info.replace({'[\'?\']': 'nan'})
1055
  with_modbase_info = with_modbase_info.replace({'[]': 'nan'})
 
1063
  with_modbase_info.reset_index(inplace=True)
1064
  with_modbase_info.drop('index', axis=1, inplace=True)
1065
 
 
1066
  align = with_modbase_info[
1067
  with_modbase_info.fasta != 'nan']
1068
  yes_pdb_no_match = with_modbase_info[
 
1081
  modbase_aligned = modbase_aligned.astype(str)
1082
  modbase_aligned = modbase_aligned.replace({'NaN': 'nan'})
1083
 
 
1084
  # Get the ones whose models couldn't be found. Add to no_modbase (i.e., nothing has matched at this point).
1085
  if len(with_modbase_info) != 0:
1086
  not_in_aligned = pd.concat([modbase_aligned.drop_duplicates(['datapoint']),
 
1088
  ['datapoint'],
1089
  keep=False)
1090
  else:
1091
+ not_in_aligned = pd.DataFrame(
1092
+ columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
1093
+ 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
1094
+ 'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide',
1095
+ 'intMet',
1096
+ 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
1097
+ 'nucleotideBinding', 'lipidation', 'site', 'transmembrane',
1098
+ 'crosslink',
1099
+ 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
1100
+ 'topologicalDomain', 'caBinding', 'bindingSite', 'region',
1101
+ 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif',
1102
+ 'coiledCoil',
1103
+ 'peptide', 'transitPeptide', 'glycosylation', 'propeptide',
1104
+ 'disulfide',
1105
+ 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding',
1106
+ 'activeSite',
1107
+ 'nucleotideBinding', 'lipidation', 'site', 'transmembrane',
1108
+ 'crosslink',
1109
+ 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
1110
+ 'topologicalDomain', 'caBinding', 'bindingSite', 'region',
1111
+ 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif',
1112
+ 'coiledCoil',
1113
+ 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'from',
1114
+ 'to', 'template', 'chain', 'score', 'pdbID', 'pdbSequence', 'fasta'])
1115
  with_modbase_info = None
1116
  if len(not_in_aligned) != 0:
1117
  not_models = pd.concat([yes_pdb_no_match.drop_duplicates(['datapoint']),
 
1128
  nan = modbase_aligned[modbase_aligned.mutationPositionOnPDB == 'nan']
1129
  not_nan = modbase_aligned[modbase_aligned.mutationPositionOnPDB != 'nan']
1130
  not_nan.score = not_nan.score.astype(float)
1131
+ not_nan.sort_values(['datapoint', 'pdb_alignStatus', 'score'], ascending=[True, True, False],
1132
+ inplace=True)
1133
 
1134
  not_nan = not_nan.sort_values(['datapoint', 'mutationPositionOnPDB', 'score'],
1135
  ascending=[True, True, False])
 
1141
  which_ones_are_match = pd.concat([not_nan, nan]).drop_duplicates(['datapoint'], keep='first')
1142
  if len(which_ones_are_match) == 0:
1143
  which_ones_are_match = pd.DataFrame(
1144
+ columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
1145
  'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
1146
  'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet',
1147
  'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
 
1177
  not_nan = None
1178
  nan = None
1179
 
 
1180
  # merge not_in_align and modbase_not_match as they were both excluded from modbase match.
1181
 
1182
  # No model
 
1205
  elif len(not_in_aligned) == 0 and len(modbase_not_match) == 0 and len(no_info) != 0:
1206
  rest = no_info
1207
  else:
1208
+ rest = pd.DataFrame(
1209
+ columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
1210
+ 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
1211
+ 'wt_sequence_match', 'whichIsoform', 'datapoint'])
1212
 
1213
  rest = rest[to_swiss_columns]
1214
  rest = rest.drop_duplicates()
 
1220
 
1221
  else:
1222
 
1223
+ modbase_match = pd.DataFrame(
1224
+ columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
1225
+ 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
1226
+ 'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet',
1227
+ 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
1228
+ 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink',
1229
+ 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
1230
+ 'topologicalDomain', 'caBinding', 'bindingSite', 'region',
1231
+ 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
1232
+ 'peptide', 'transitPeptide', 'glycosylation', 'propeptide',
1233
+ 'disulfideBinary', 'intMetBinary', 'intramembraneBinary',
1234
+ 'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary',
1235
+ 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
1236
+ 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
1237
+ 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
1238
+ 'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary',
1239
+ 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary',
1240
+ 'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
1241
+ 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
1242
+ 'glycosylationBinary', 'propeptideBinary', 'from', 'to', 'template',
1243
+ 'chain', 'score', 'pdbID', 'pdbSequence', 'pdb_alignStatus',
1244
+ 'mutationPositionOnPDB', 'domainStartonPDB', 'domainEndonPDB'])
1245
+ not_in_aligned = pd.DataFrame(
1246
+ columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
1247
+ 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
1248
+ 'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet',
1249
+ 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
1250
+ 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink',
1251
+ 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
1252
+ 'topologicalDomain', 'caBinding', 'bindingSite', 'region',
1253
+ 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
1254
+ 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'disulfide',
1255
+ 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
1256
+ 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink',
1257
+ 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
1258
+ 'topologicalDomain', 'caBinding', 'bindingSite', 'region',
1259
+ 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
1260
+ 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'from',
1261
+ 'to', 'template', 'chain', 'score', 'pdbID', 'pdbSequence', 'fasta'])
1262
+ no_info = pd.DataFrame(
1263
+ columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
1264
+ 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
1265
+ 'wt_sequence_match', 'whichIsoform', 'datapoint'])
1266
+ rest = pd.DataFrame(
1267
+ columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
1268
+ 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
1269
+ 'wt_sequence_match', 'whichIsoform', 'datapoint'])
1270
 
1271
  rest = rest[to_swiss_columns]
1272
  rest = rest.drop_duplicates()
 
1302
  not_models = None
1303
  modbase_not_match = None
1304
 
 
1305
  # Final corrections
1306
 
1307
  # Now 3D alignment.
 
1323
 
1324
  # Fix the axes and merge all data.
1325
 
 
1326
  pdb.drop(['pdbInfo'], axis=1, inplace=True)
1327
  pdb.rename(columns={'resolution': 'score'}, inplace=True)
1328
  swiss.rename(columns={'qmean_norm': 'score'}, inplace=True)
 
1335
  modbase['source'] = 'MODBASE'
1336
  data = pd.concat([swiss, modbase, pdb])
1337
 
 
1338
  data.reset_index(inplace=True)
1339
  data.drop(['index'], axis=1, inplace=True)
1340
  data = data.astype('str')
 
1358
  for pdbID in pdb_only.pdbID.to_list():
1359
  if pdbID not in existing_free_sasa:
1360
  (run_freesasa(Path(path_to_output_files / 'pdb_structures' / f'{pdbID.lower()}.pdb'),
1361
+ Path(path_to_output_files / 'freesasa_files' / f'{pdbID.lower()}.txt'),
1362
+ include_hetatms=True,
1363
  outdir=None, force_rerun=False, file_type='pdb'))
1364
 
 
1365
  print('Calculation RSA for SwissModel Files...\n')
1366
  swiss_only = data[data.source == 'SWISSMODEL']
1367
  swiss_dp = []
 
1379
  for pdbID in modbase_only.pdbID.to_list():
1380
  if pdbID not in existing_free_sasa:
1381
  (run_freesasa(Path(path_to_output_files / 'modbase_structures_individual' / f'{pdbID.lower()}.txt'),
1382
+ Path(path_to_output_files / 'freesasa_files' / f'{pdbID.lower()}.txt'),
1383
+ include_hetatms=True,
1384
  outdir=None, force_rerun=False, file_type='pdb'))
1385
 
1386
  # This annotation list is different than the prev one, keep it.
 
1418
  chain = data.at[i, 'chain']
1419
  uniprotID = data.at[i, 'uniprotID']
1420
  pdbID = data.at[i, 'pdbID']
1421
+ alignments = get_alignments_3D(uniprotID, 'nan', pdb_path, pdbSequence, source, chain, pdbID, mode,
1422
+ Path(path_to_output_files / '3D_alignment'), file_format='gzip')
1423
  mutPos = data.at[i, 'mutationPositionOnPDB']
1424
  try:
1425
+ coordMut = get_coords(mutPos, alignments, 'nan', 'nan', mode)[0]
1426
  except:
1427
  ValueError
1428
  coordMut = 'nan'
1429
  try:
1430
  sasa_pos = get_coords(mutPos, alignments, 'nan', 'nan', mode)[2]
1431
+ data.at[i, 'sasa'] = sasa(data.at[i, 'source'], data.at[i, 'pdbID'], data.at[i, 'uniprotID'], sasa_pos,
1432
+ data.at[i, 'wt'], mode, path_to_output_files, file_type='pdb')
1433
  except:
1434
  ValueError
1435
  data.at[i, 'sasa'] = 'nan' # mutation position is nan
 
1477
  data.at[i, 'domaindistance3D'] = min(float(data.at[i, 'domainStartonPDB']),
1478
  float(data.at[i, 'domainEndonPDB']))
1479
 
 
1480
  data = data.astype(str)
1481
  data.replace({'NaN': 'nan'}, inplace=True)
1482
 
 
1483
  # Now unify all 3 separate data sets. We have with_pdb: the ones that have PDB structures, Swiss-Model or ModBase models, the ones that didn't match with any, and the ones that didn't have a wt sequence match.
1484
 
1485
  # Get interface positions from ECLAIR. Download HQ human
 
1500
  interface_dataframe.columns = ['uniprotID', 'positions']
1501
 
1502
  if len(data) == 0:
1503
+ data = pd.DataFrame(
1504
+ columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
1505
+ 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
1506
+ 'pdbSequence', 'wt_sequence_match', 'whichIsoform', 'pdbID', 'score',
1507
+ 'chain', 'datapoint', 'disulfide', 'intMet', 'intramembrane',
1508
+ 'naturalVariant', 'dnaBinding', 'activeSite', 'nucleotideBinding',
1509
+ 'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis',
1510
+ 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
1511
+ 'topologicalDomain', 'caBinding', 'bindingSite', 'region',
1512
+ 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
1513
+ 'peptide', 'transitPeptide', 'glycosylation', 'propeptide',
1514
+ 'disulfideBinary', 'intMetBinary', 'intramembraneBinary',
1515
+ 'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary',
1516
+ 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
1517
+ 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
1518
+ 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
1519
+ 'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary',
1520
+ 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary',
1521
+ 'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
1522
+ 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
1523
+ 'glycosylationBinary', 'propeptideBinary', 'pdb_alignStatus',
1524
+ 'mutationPositionOnPDB', 'domainStartonPDB', 'domainEndonPDB',
1525
+ 'source', 'sasa', 'domaindistance3D', 'threeState_trsh4_HQ', 'domain_fisher'])
1526
  else:
1527
  data.sasa = data.sasa.astype('str')
1528
 
 
1561
 
1562
  data.drop(['positions'], axis=1, inplace=True)
1563
 
 
1564
  # OPTIONAL
1565
  # DOMAIN SELECTION
1566
  # Next step: Delete all other domains with 'NULL.' R is capable of handling 53 categories. We will keep 52 most
 
1579
  # nan--> 0, 0 -->1 and 1 -->2
1580
 
1581
  print('Final adjustments are being done...\n')
1582
+ binaryCols = ['disulfideBinary', 'intMetBinary', 'intramembraneBinary', 'naturalVariantBinary',
1583
+ 'dnaBindingBinary',
1584
  'activeSiteBinary', 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
1585
  'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
1586
  'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
 
1682
  ready = data.copy()
1683
  # Imputation
1684
  if (impute == 'True') or (impute == 'true') or (impute == True):
1685
+ filler = [17.84, 30.8, 24.96, 13.12, 23.62, 18.97, 20.87, 29.59, 20.7, 12.7, 22.85, 17.21, 9.8, 9, 15.99,
1686
+ 16.82,
1687
  20.46, 24.58, 9.99, 17.43, 20.08, 30.91, 20.86, 22.14, 21.91, 28.45, 17.81, 25.12, 20.33, 22.36]
1688
  col_index = 0
1689
  for col_ in ready.columns[-30:]:
 
1698
  ready = ready.replace({'nan': np.NaN})
1699
  ready.to_csv(path_to_output_files / 'featurevector_pdb.txt', sep='\t', index=False)
1700
  if len(ready) == 0:
1701
+ print(
1702
+ 'No feature vector could be produced for input data. Please check the presence of a structure for the input proteins.')
1703
  print(ready)
1704
  print('Feature vector successfully created...')
1705
  return ready
 
1710
  print("Time passed: {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
1711
  sys.stdout.close()
1712
  return ready
1713
+