fatmacankara committed on
Commit
f44aa18
·
1 Parent(s): 84bc25a

Update code/pdb_featureVector.py

Browse files
Files changed (1) hide show
  1. code/pdb_featureVector.py +86 -79
code/pdb_featureVector.py CHANGED
@@ -82,8 +82,10 @@ def pdb(input_set, mode, impute):
82
  out_path = path_to_output_files / 'log.txt'
83
  #sys.stdout = open(out_path, 'w')
84
  data = clean_data(input_set)
 
85
  data = add_uniprot_sequence(data)
86
  match = data[(data.wt_sequence_match == 'm')]
 
87
  iso = data[(data.wt_sequence_match == 'i')]
88
  noMatch = data[(data.wt_sequence_match != 'm') & (data.wt_sequence_match != 'i')]
89
  if len(data) == 0:
@@ -233,24 +235,28 @@ def pdb(input_set, mode, impute):
233
  modbase.replace({'[]': np.NaN, 'nan-nan': np.NaN, '': np.NaN}, inplace=True)
234
  modbase = modbase.fillna(np.NaN)
235
  print('\n>> Adding Modbase residue positions...\n')
236
- modbase_simple = modbase[['uniprotID', 'wt', 'pos', 'datapoint']]
237
- modbase_simple = modbase_simple.drop_duplicates(['uniprotID', 'wt', 'pos' ,'datapoint'])
238
  modbaseOut, no_modbase_models_updated = addModbaseModels(modbase_simple, path_to_input_files, path_to_output_files)
239
- modbase = modbase.merge(modbaseOut, on = ['uniprotID', 'wt', 'pos', 'datapoint'], how = 'left')
240
- no_modbase_models_updated['sasa'] = np.NaN
241
- modbase.reset_index(inplace=True, drop=True)
242
- no_modbase_add = modbase[pd.isna(modbase.coordinates)]
243
- modbase = modbase[~pd.isna(modbase.coordinates)]
244
- no_modbase_models_updated = pd.concat([no_modbase_models_updated, no_modbase_add])
245
- print('\n>> Mapping to Modbase models...\n')
246
- modbase = changeUPtoModels(modbase)
247
- print('\n>> Calculating 3D distances for Modbase models...\n')
248
- modbase = isZeroDistance(modbase)
249
- modbase = match3DModels(modbase)
250
- modbase = selectMaxAnnot(modbase)
251
- modbase = modbase.sort_values(by=['datapoint', 'quality_score', 'distance','hitTotal', 'annotTotal'], ascending=[True, True, True, True, True])
252
- modbase = modbase.drop_duplicates(['datapoint'])
253
- modbase.replace({'[]': np.NaN, 'hit': 0.0}, inplace=True)
 
 
 
 
254
  else:
255
  modbase = modbase[SIMPLE_COLS]
256
 
@@ -266,7 +272,7 @@ def pdb(input_set, mode, impute):
266
  'lipidationBinary', 'siteBinary', 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary', 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
267
  'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary', 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary', 'modifiedResidueBinary', 'zincFingerBinary',
268
  'motifBinary', 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary', 'glycosylationBinary', 'propeptideBinary', 'sasa']
269
-
270
  if len(pdb)>0:
271
  pdb = pdb[COLS]
272
  pdb['Source'] = 'PDB'
@@ -282,7 +288,8 @@ def pdb(input_set, mode, impute):
282
  modbase['Source'] = 'Modbase'
283
  else:
284
  modbase = pd.DataFrame()
285
- no_modbase_models_updated = pd.DataFrame()
 
286
 
287
  # st.write('======PDB==========')
288
  # st.write(pdb.to_string())
@@ -291,13 +298,13 @@ def pdb(input_set, mode, impute):
291
  # st.write('======MODBASE==========')
292
  # st.write(modbase.to_string())
293
 
294
-
295
 
296
  allData = pd.concat([pdb, swiss, modbase])
297
  allData.reset_index(inplace=True, drop=True)
298
  allData.replace({np.NaN: ''}, inplace=True)
299
- # print('======ALL DATA==========')
300
- # print(allData.to_string())
301
  if len(allData)>0:
302
  allData.distance.replace({-1000: ''}, inplace=True)
303
 
@@ -318,52 +325,52 @@ def pdb(input_set, mode, impute):
318
  k = pd.Series((key, str(list(set(val)))))
319
  interface_dataframe = interface_dataframe.append(k, ignore_index=True)
320
  interface_dataframe.columns = ['uniprotID', 'positions']
321
- data = finalTouch(allData)
322
- data = data.merge(interface_dataframe, on='uniprotID', how='left')
323
- data.positions = data.positions.astype('str')
324
- for i in data.index:
325
- if (str(data.at[i, 'pos']) in data.at[i, 'positions']) and data.at[i, 'trsh4'] == 'surface':
326
- data.at[i, 'threeState_trsh4_HQ'] = 'interface'
327
- elif (str(data.at[i, 'pos']) not in data.at[i, 'positions']) and data.at[i, 'trsh4'] == 'surface':
328
- data.at[i, 'threeState_trsh4_HQ'] = 'surface'
329
- elif (str(data.at[i, 'pos']) not in data.at[i, 'positions']) and data.at[i, 'trsh4'] == 'core':
330
- data.at[i, 'threeState_trsh4_HQ'] = 'core'
331
- elif (str(data.at[i, 'pos']) in data.at[i, 'positions']) and data.at[i, 'trsh4'] == 'core':
332
- data.at[i, 'threeState_trsh4_HQ'] = 'conflict'
333
- elif data.at[i, 'trsh4'] == 'nan':
334
- data.at[i, 'threeState_trsh4_HQ'] = 'nan'
335
-
336
- data.drop(['positions'], axis=1, inplace=True)
337
 
338
  fisherResult = pd.read_csv(fisher_path, sep='\t')
339
  significant_domains = fisherResult.domain.to_list()
340
- for i in data.index:
341
- if data.at[i, 'domain'] in significant_domains:
342
- data.at[i, 'domain_fisher'] = data.at[i, 'domain']
343
  else:
344
- data.at[i, 'domain_fisher'] = 'NULL'
345
  print('Final adjustments are being done...\n')
346
  binaryCols = UNIPROT_ANNOTATION_COLS[-30:]
347
- data = data.astype(str)
348
- data.replace({'NaN': 'nan'}, inplace=True)
349
- for i in data.index:
350
  for j in binaryCols:
351
- data[j] = data[j].astype('str')
352
- if (data.at[i, j] == '0') or (data.at[i, j] == '0.0'):
353
- data.at[i, j] = '1'
354
- elif data.at[i, j] == 'nan':
355
- data.at[i, j] = '0'
356
- elif (data.at[i, j] == '1') or (data.at[i, j] == '1.0'):
357
- data.at[i, j] = '2'
358
 
359
  annotCols = UNIPROT_ANNOTATION_COLS[:30]
360
 
361
- for i in data.index:
362
  for annot in annotCols:
363
  binaryName = str(annot) + 'Binary'
364
- if data.at[i, binaryName] == '2':
365
- data.at[i, annot] = '0.0'
366
- data.rename(
367
  columns={'uniprotID': 'prot_uniprotAcc', 'wt': 'wt_residue', 'pos': 'position', 'mut': 'mut_residue',
368
  'datapoint': 'meta_merged', 'datapoint_disease': 'meta-lab_merged', 'label': 'source_db',
369
  'family': 'prot_family', 'domain': 'domains_all', 'domain_fisher': 'domains_sig',
@@ -403,7 +410,7 @@ def pdb(input_set, mode, impute):
403
  'peptide': 'peptide_dist', 'transitPeptide': 'transitPeptide_dist',
404
  'glycosylation': 'glycosylation_dist', 'propeptide': 'propeptide_dist'}, inplace=True)
405
 
406
- data = data[
407
  ['prot_uniprotAcc', 'wt_residue', 'mut_residue', 'position','Source', 'meta_merged', 'composition', 'polarity',
408
  'volume',
409
  'granthamScore', 'domains_all',
@@ -432,33 +439,33 @@ def pdb(input_set, mode, impute):
432
  16.82,
433
  20.46, 24.58, 9.99, 17.43, 20.08, 30.91, 20.86, 22.14, 21.91, 28.45, 17.81, 25.12, 20.33, 22.36]
434
  col_index = 0
435
- for col_ in data.columns[-30:]:
436
- data[col_] = data[col_].fillna(filler[col_index])
437
- data[col_] = data[col_].replace({'nan': filler[col_index]})
438
  col_index += 1
439
- data['domains_3Ddist'] = data['domains_3Ddist'].fillna(24.5)
440
- data['sasa'] = data['sasa'].fillna(29.5)
441
- data['location_3state'] = data['location_3state'].fillna('unknown')
442
  elif (impute == 'False') or (impute == 'false'):
443
  pass
444
- data = data.replace({'nan': np.NaN})
445
- data.domains_all = data.domains_all.replace({-1: 'NULL'})
446
 
447
  # ready.to_csv(path_to_output_files / 'featurevector_pdb.txt', sep='\t', index=False)
448
- if len(data) == 0:
449
  print(
450
  'No feature vector could be produced for input data. Please check the presence of a structure for the input proteins.')
451
-
452
- data.to_csv(path_to_output_files / 'featurevector_pdb.txt', sep='\t', index=False)
453
-
454
- print('Feature vector successfully created...')
455
- return data
456
-
457
- end = timer()
458
- hours, rem = divmod(end - start, 3600)
459
- minutes, seconds = divmod(rem, 60)
460
- print("Time passed: {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
461
- #sys.stdout.close()
462
- return data
463
-
464
 
 
 
 
 
 
 
 
82
  out_path = path_to_output_files / 'log.txt'
83
  #sys.stdout = open(out_path, 'w')
84
  data = clean_data(input_set)
85
+
86
  data = add_uniprot_sequence(data)
87
  match = data[(data.wt_sequence_match == 'm')]
88
+ org_len = len(match)
89
  iso = data[(data.wt_sequence_match == 'i')]
90
  noMatch = data[(data.wt_sequence_match != 'm') & (data.wt_sequence_match != 'i')]
91
  if len(data) == 0:
 
235
  modbase.replace({'[]': np.NaN, 'nan-nan': np.NaN, '': np.NaN}, inplace=True)
236
  modbase = modbase.fillna(np.NaN)
237
  print('\n>> Adding Modbase residue positions...\n')
238
+ modbase_simple = modbase[['uniprotID', 'wt', 'pos', 'mut','datapoint']]
239
+ modbase_simple = modbase_simple.drop_duplicates(['uniprotID', 'wt', 'pos' ,'mut','datapoint'])
240
  modbaseOut, no_modbase_models_updated = addModbaseModels(modbase_simple, path_to_input_files, path_to_output_files)
241
+
242
+ if len(modbaseOut) > 0:
243
+ modbase = modbase.merge(modbaseOut, on = ['uniprotID', 'wt', 'pos', 'mut','datapoint'], how = 'left')
244
+ no_modbase_models_updated['sasa'] = np.NaN
245
+ modbase.reset_index(inplace=True, drop=True)
246
+ no_modbase_add = modbase[pd.isna(modbase.coordinates)]
247
+ modbase = modbase[~pd.isna(modbase.coordinates)]
248
+ no_modbase_models_updated = pd.concat([no_modbase_models_updated, no_modbase_add])
249
+ print('\n>> Mapping to Modbase models...\n')
250
+ modbase = changeUPtoModels(modbase)
251
+ print('\n>> Calculating 3D distances for Modbase models...\n')
252
+ modbase = isZeroDistance(modbase)
253
+ modbase = match3DModels(modbase)
254
+ modbase = selectMaxAnnot(modbase)
255
+ modbase = modbase.sort_values(by=['datapoint', 'quality_score', 'distance','hitTotal', 'annotTotal'], ascending=[True, True, True, True, True])
256
+ modbase = modbase.drop_duplicates(['datapoint'])
257
+ modbase.replace({'[]': np.NaN, 'hit': 0.0}, inplace=True)
258
+ else:
259
+ modbase = pd.DataFrame(columns = SIMPLE_COLS)
260
  else:
261
  modbase = modbase[SIMPLE_COLS]
262
 
 
272
  'lipidationBinary', 'siteBinary', 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary', 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
273
  'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary', 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary', 'modifiedResidueBinary', 'zincFingerBinary',
274
  'motifBinary', 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary', 'glycosylationBinary', 'propeptideBinary', 'sasa']
275
+ no_modbase_models_updated = no_modbase_models_updated.drop_duplicates()
276
  if len(pdb)>0:
277
  pdb = pdb[COLS]
278
  pdb['Source'] = 'PDB'
 
288
  modbase['Source'] = 'Modbase'
289
  else:
290
  modbase = pd.DataFrame()
291
+ if len(no_modbase_models_updated) == 0:
292
+ no_modbase_models_updated = pd.DataFrame()
293
 
294
  # st.write('======PDB==========')
295
  # st.write(pdb.to_string())
 
298
  # st.write('======MODBASE==========')
299
  # st.write(modbase.to_string())
300
 
301
+
302
 
303
  allData = pd.concat([pdb, swiss, modbase])
304
  allData.reset_index(inplace=True, drop=True)
305
  allData.replace({np.NaN: ''}, inplace=True)
306
+ # st.write('======ALL DATA==========')
307
+ # st.write(allData.to_string())
308
  if len(allData)>0:
309
  allData.distance.replace({-1000: ''}, inplace=True)
310
 
 
325
  k = pd.Series((key, str(list(set(val)))))
326
  interface_dataframe = interface_dataframe.append(k, ignore_index=True)
327
  interface_dataframe.columns = ['uniprotID', 'positions']
328
+ final_data = finalTouch(allData)
329
+ final_data = final_data.merge(interface_dataframe, on='uniprotID', how='left')
330
+ final_data.positions = final_data.positions.astype('str')
331
+ for i in final_data.index:
332
+ if (str(final_data.at[i, 'pos']) in final_data.at[i, 'positions']) and final_data.at[i, 'trsh4'] == 'surface':
333
+ final_data.at[i, 'threeState_trsh4_HQ'] = 'interface'
334
+ elif (str(final_data.at[i, 'pos']) not in final_data.at[i, 'positions']) and final_data.at[i, 'trsh4'] == 'surface':
335
+ final_data.at[i, 'threeState_trsh4_HQ'] = 'surface'
336
+ elif (str(final_data.at[i, 'pos']) not in final_data.at[i, 'positions']) and final_data.at[i, 'trsh4'] == 'core':
337
+ final_data.at[i, 'threeState_trsh4_HQ'] = 'core'
338
+ elif (str(final_data.at[i, 'pos']) in final_data.at[i, 'positions']) and final_data.at[i, 'trsh4'] == 'core':
339
+ final_data.at[i, 'threeState_trsh4_HQ'] = 'conflict'
340
+ elif final_data.at[i, 'trsh4'] == 'nan':
341
+ final_data.at[i, 'threeState_trsh4_HQ'] = 'nan'
342
+
343
+ final_data.drop(['positions'], axis=1, inplace=True)
344
 
345
  fisherResult = pd.read_csv(fisher_path, sep='\t')
346
  significant_domains = fisherResult.domain.to_list()
347
+ for i in final_data.index:
348
+ if final_data.at[i, 'domain'] in significant_domains:
349
+ final_data.at[i, 'domain_fisher'] = final_data.at[i, 'domain']
350
  else:
351
+ final_data.at[i, 'domain_fisher'] = 'NULL'
352
  print('Final adjustments are being done...\n')
353
  binaryCols = UNIPROT_ANNOTATION_COLS[-30:]
354
+ final_data = final_data.astype(str)
355
+ final_data.replace({'NaN': 'nan'}, inplace=True)
356
+ for i in final_data.index:
357
  for j in binaryCols:
358
+ final_data[j] = final_data[j].astype('str')
359
+ if (final_data.at[i, j] == '0') or (final_data.at[i, j] == '0.0'):
360
+ final_data.at[i, j] = '1'
361
+ elif final_data.at[i, j] == 'nan':
362
+ final_data.at[i, j] = '0'
363
+ elif (final_data.at[i, j] == '1') or (final_data.at[i, j] == '1.0'):
364
+ final_data.at[i, j] = '2'
365
 
366
  annotCols = UNIPROT_ANNOTATION_COLS[:30]
367
 
368
+ for i in final_data.index:
369
  for annot in annotCols:
370
  binaryName = str(annot) + 'Binary'
371
+ if final_data.at[i, binaryName] == '2':
372
+ final_data.at[i, annot] = '0.0'
373
+ final_data.rename(
374
  columns={'uniprotID': 'prot_uniprotAcc', 'wt': 'wt_residue', 'pos': 'position', 'mut': 'mut_residue',
375
  'datapoint': 'meta_merged', 'datapoint_disease': 'meta-lab_merged', 'label': 'source_db',
376
  'family': 'prot_family', 'domain': 'domains_all', 'domain_fisher': 'domains_sig',
 
410
  'peptide': 'peptide_dist', 'transitPeptide': 'transitPeptide_dist',
411
  'glycosylation': 'glycosylation_dist', 'propeptide': 'propeptide_dist'}, inplace=True)
412
 
413
+ final_data = final_data[
414
  ['prot_uniprotAcc', 'wt_residue', 'mut_residue', 'position','Source', 'meta_merged', 'composition', 'polarity',
415
  'volume',
416
  'granthamScore', 'domains_all',
 
439
  16.82,
440
  20.46, 24.58, 9.99, 17.43, 20.08, 30.91, 20.86, 22.14, 21.91, 28.45, 17.81, 25.12, 20.33, 22.36]
441
  col_index = 0
442
+ for col_ in final_data.columns[-30:]:
443
+ final_data[col_] = final_data[col_].fillna(filler[col_index])
444
+ final_data[col_] = final_data[col_].replace({'nan': filler[col_index]})
445
  col_index += 1
446
+ final_data['domains_3Ddist'] = final_data['domains_3Ddist'].fillna(24.5)
447
+ final_data['sasa'] = final_data['sasa'].fillna(29.5)
448
+ final_data['location_3state'] = final_data['location_3state'].fillna('unknown')
449
  elif (impute == 'False') or (impute == 'false'):
450
  pass
451
+ final_data = final_data.replace({'nan': np.NaN})
452
+ final_data.domains_all = final_data.domains_all.replace({-1: 'NULL'})
453
 
454
  # ready.to_csv(path_to_output_files / 'featurevector_pdb.txt', sep='\t', index=False)
455
+ if len(final_data) == 0:
456
  print(
457
  'No feature vector could be produced for input data. Please check the presence of a structure for the input proteins.')
458
+ final_data.to_csv(path_to_output_files / 'featurevector_pdb.txt', sep='\t', index=False)
459
+
460
+ print('Feature vector successfully created...')
461
+ end = timer()
462
+ hours, rem = divmod(end - start, 3600)
463
+ minutes, seconds = divmod(rem, 60)
464
+ print("Time passed: {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
 
 
 
 
 
 
465
 
466
+ return final_data
467
+ elif len(no_modbase_models_updated) >0 and (len(no_modbase_models_updated) !=org_len):
468
+ st.write(f'{len(no_modbase_models_updated)} of {org_len} datapoins could not be mapped to any structures.')
469
+ st.write(f'{org_len-len(no_modbase_models_updated)} of {org_len} datapoins could not be mapped to any structures.')
470
+ elif len(no_modbase_models_updated) == org_len:
471
+ st.write(f'0 of {org_len} datapoins could not be mapped to any structures. Feature vector could not be created.')