fatmacankara committed on
Commit
1744db1
·
1 Parent(s): 108cc29

Update code/pdb_featureVector.py

Browse files
Files changed (1) hide show
  1. code/pdb_featureVector.py +5 -28
code/pdb_featureVector.py CHANGED
@@ -95,8 +95,6 @@ def pdb(input_set, mode, impute):
95
  data.domStart = data.domStart.replace({'nan': '-1'})
96
  data.domEnd = data.domEnd.replace({'nan': '-1'})
97
  data.distance = data.distance.replace({'nan': '-1'})
98
- st.write('1')
99
- st.write(data)
100
  """
101
  STEP 4
102
  Retrieve canonical and isoform UniProt sequences.
@@ -202,7 +200,6 @@ def pdb(input_set, mode, impute):
202
  else:
203
  pdbs = []
204
  print('Processing PDB structures...\n')
205
- st.write('2')
206
  if pdbs == []:
207
  print('No PDB structure found for the query. ')
208
  print('Starting PDB structures download...\n')
@@ -303,7 +300,6 @@ def pdb(input_set, mode, impute):
303
  filename.rename(filename_replace_ext.with_suffix('.pdb'))
304
  except:
305
  FileNotFoundError
306
- st.write('3')
307
  uniprot_matched = pd.merge(uniprot_matched, pdb_info, on='uniprotID', how='left')
308
  uniprot_matched = uniprot_matched.astype(str)
309
  uniprot_matched = uniprot_matched.drop_duplicates()
@@ -408,9 +404,7 @@ def pdb(input_set, mode, impute):
408
  dfNM = dfNM.sort_values(['uniprotID', 'resolution'], axis=0, ascending=True)
409
  dfNM = dfNM.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'pdbSequence'], keep='first')
410
  dfNM.rename(columns={'isoformSequence': 'uniprotSequence'}, inplace=True)
411
- st.write('4')
412
- st.write(dfM)
413
- st.write(dfNM)
414
  dfM = dfM.astype(str)
415
  dfNM = dfNM.astype(str)
416
 
@@ -432,12 +426,8 @@ def pdb(input_set, mode, impute):
432
 
433
  print('Aligning sequences...\n')
434
  aligned_m = final_stage(dfM, annotation_list, Path(path_to_output_files / 'alignment_files'))
435
- st.write('aligned_m')
436
- st.write(aligned_m)
437
- aligned_nm = final_stage(dfNM, annotation_list, Path(path_to_output_files / 'alignment_files'))
438
- st.write(aligned_nm)
439
 
440
- st.write('WOTE')
441
 
442
 
443
 
@@ -508,7 +498,6 @@ def pdb(input_set, mode, impute):
508
 
509
  print('Proceeding to SwissModel search...')
510
  print('------------------------------------\n')
511
- st.write('5')
512
  # At this point we have 4 dataframes
513
  # 1. after_up_pdb_alignment --- This is after PDB sequence alignment. There may be mutations that weren't matched after the alignment. These will be searched in other databases as well.
514
  # 1a. aligned --- we are done with this.
@@ -607,7 +596,6 @@ def pdb(input_set, mode, impute):
607
 
608
  with_swiss_models = pd.concat([to_swiss, no_swiss_models]).drop_duplicates(['datapoint'], keep=False)
609
  with_swiss_models = with_swiss_models[to_swiss.columns]
610
- st.write('6')
611
  # Add model info.
612
 
613
  with_swiss_models = with_swiss_models.astype(str)
@@ -713,7 +701,6 @@ def pdb(input_set, mode, impute):
713
  swissmodels_fasta = pd.DataFrame(columns=['uniprotID', 'template', 'qmean_norm', 'chain', 'fasta'])
714
  else:
715
  swissmodels_fasta.columns = ['uniprotID', 'template', 'qmean_norm', 'chain', 'fasta']
716
- st.write('7')
717
  swissmodels_fasta = swissmodels_fasta.astype(str)
718
 
719
  swiss_models_with_data.qmean_norm = swiss_models_with_data.qmean_norm.astype(float)
@@ -828,7 +815,6 @@ def pdb(input_set, mode, impute):
828
  to_swiss_columns = to_swiss.columns
829
  to_swiss_size = len(to_swiss.drop_duplicates(['datapoint']))
830
  to_swiss = None
831
- st.write('8')
832
  # CONTROL
833
 
834
  """
@@ -1325,7 +1311,6 @@ def pdb(input_set, mode, impute):
1325
  swiss['source'] = 'SWISSMODEL'
1326
  modbase['source'] = 'MODBASE'
1327
  data = pd.concat([swiss, modbase, pdb])
1328
- st.write(data)
1329
  data.reset_index(inplace=True)
1330
  data.drop(['index'], axis=1, inplace=True)
1331
  data = data.astype('str')
@@ -1344,7 +1329,6 @@ def pdb(input_set, mode, impute):
1344
  existing_free_sasa = [str(i) for i in existing_free_sasa]
1345
  existing_free_sasa = [i.split('/')[-1].split('.')[0] for i in existing_free_sasa]
1346
  print('Calculation RSA for PDB Structure Files...\n')
1347
- st.write(existing_free_sasa)
1348
  pdb_only = data[data.source == 'PDB']
1349
 
1350
 
@@ -1381,7 +1365,6 @@ def pdb(input_set, mode, impute):
1381
  existing_free_sasa = list(Path(path_to_output_files / 'freesasa_files').glob("*"))
1382
  existing_free_sasa = [str(i) for i in existing_free_sasa]
1383
  existing_free_sasa = [i.split('/')[-1].split('.')[0] for i in existing_free_sasa]
1384
- st.write(existing_free_sasa)
1385
  annotation_list += ['domainStartonPDB', 'domainEndonPDB']
1386
 
1387
  folder_path = path_to_output_files / 'freesasa_files'
@@ -1397,8 +1380,6 @@ def pdb(input_set, mode, impute):
1397
  modbase_only = None
1398
  data['uniprotSequence'] = data['uniprotSequence'].str.replace('U', 'C')
1399
  data['pdbSequence'] = data['pdbSequence'].str.replace('U', 'C')
1400
- st.write('after')
1401
- st.write(data)
1402
  for i in data.index:
1403
  id_ = data.at[i, 'pdbID'].lower()
1404
  up_id_ = data.at[i, 'uniprotID']
@@ -1420,13 +1401,11 @@ def pdb(input_set, mode, impute):
1420
 
1421
  alignments = get_alignments_3D(uniprotID, 'nan', pdb_path, pdbSequence, source, chain, pdbID, mode, Path(path_to_output_files / '3D_alignment'), file_format = 'gzip')
1422
  mutPos = data.at[i, 'mutationPositionOnPDB']
1423
- st.write('mutpos', mutPos)
1424
  try:
1425
  coordMut = get_coords(mutPos, alignments, 'nan', 'nan', mode)[0]
1426
  except:
1427
  ValueError
1428
  coordMut = 'nan'
1429
- st.write('coordMut', coordMut)
1430
  try:
1431
  sasa_pos = get_coords(mutPos, alignments, 'nan', 'nan', mode)[2]
1432
  data.at[i, 'sasa'] = sasa(data.at[i, 'source'], data.at[i, 'pdbID'], data.at[i, 'uniprotID'], sasa_pos,
@@ -1434,8 +1413,7 @@ def pdb(input_set, mode, impute):
1434
  except:
1435
  ValueError
1436
  data.at[i, 'sasa'] = 'nan' # mutation position is nan
1437
- st.write('data')
1438
- st.write(data)
1439
  for annot in annotation_list:
1440
  annotx = []
1441
  try:
@@ -1501,8 +1479,7 @@ def pdb(input_set, mode, impute):
1501
  k = pd.Series((key, str(list(set(val)))))
1502
  interface_dataframe = interface_dataframe.append(k, ignore_index=True)
1503
  interface_dataframe.columns = ['uniprotID', 'positions']
1504
- st.write('sasa')
1505
- st.write(data)
1506
  if len(data) == 0:
1507
  data = pd.DataFrame(
1508
  columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
@@ -1711,4 +1688,4 @@ def pdb(input_set, mode, impute):
1711
  hours, rem = divmod(end - start, 3600)
1712
  minutes, seconds = divmod(rem, 60)
1713
  print("Time passed: {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
1714
- return ready#
 
95
  data.domStart = data.domStart.replace({'nan': '-1'})
96
  data.domEnd = data.domEnd.replace({'nan': '-1'})
97
  data.distance = data.distance.replace({'nan': '-1'})
 
 
98
  """
99
  STEP 4
100
  Retrieve canonical and isoform UniProt sequences.
 
200
  else:
201
  pdbs = []
202
  print('Processing PDB structures...\n')
 
203
  if pdbs == []:
204
  print('No PDB structure found for the query. ')
205
  print('Starting PDB structures download...\n')
 
300
  filename.rename(filename_replace_ext.with_suffix('.pdb'))
301
  except:
302
  FileNotFoundError
 
303
  uniprot_matched = pd.merge(uniprot_matched, pdb_info, on='uniprotID', how='left')
304
  uniprot_matched = uniprot_matched.astype(str)
305
  uniprot_matched = uniprot_matched.drop_duplicates()
 
404
  dfNM = dfNM.sort_values(['uniprotID', 'resolution'], axis=0, ascending=True)
405
  dfNM = dfNM.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'pdbSequence'], keep='first')
406
  dfNM.rename(columns={'isoformSequence': 'uniprotSequence'}, inplace=True)
407
+
 
 
408
  dfM = dfM.astype(str)
409
  dfNM = dfNM.astype(str)
410
 
 
426
 
427
  print('Aligning sequences...\n')
428
  aligned_m = final_stage(dfM, annotation_list, Path(path_to_output_files / 'alignment_files'))
 
 
 
 
429
 
430
+ aligned_nm = final_stage(dfNM, annotation_list, Path(path_to_output_files / 'alignment_files'))
431
 
432
 
433
 
 
498
 
499
  print('Proceeding to SwissModel search...')
500
  print('------------------------------------\n')
 
501
  # At this point we have 4 dataframes
502
  # 1. after_up_pdb_alignment --- This is after PDB sequence alignment. There may be mutations that weren't matched after the alignment. These will be searched in other databases as well.
503
  # 1a. aligned --- we are done with this.
 
596
 
597
  with_swiss_models = pd.concat([to_swiss, no_swiss_models]).drop_duplicates(['datapoint'], keep=False)
598
  with_swiss_models = with_swiss_models[to_swiss.columns]
 
599
  # Add model info.
600
 
601
  with_swiss_models = with_swiss_models.astype(str)
 
701
  swissmodels_fasta = pd.DataFrame(columns=['uniprotID', 'template', 'qmean_norm', 'chain', 'fasta'])
702
  else:
703
  swissmodels_fasta.columns = ['uniprotID', 'template', 'qmean_norm', 'chain', 'fasta']
 
704
  swissmodels_fasta = swissmodels_fasta.astype(str)
705
 
706
  swiss_models_with_data.qmean_norm = swiss_models_with_data.qmean_norm.astype(float)
 
815
  to_swiss_columns = to_swiss.columns
816
  to_swiss_size = len(to_swiss.drop_duplicates(['datapoint']))
817
  to_swiss = None
 
818
  # CONTROL
819
 
820
  """
 
1311
  swiss['source'] = 'SWISSMODEL'
1312
  modbase['source'] = 'MODBASE'
1313
  data = pd.concat([swiss, modbase, pdb])
 
1314
  data.reset_index(inplace=True)
1315
  data.drop(['index'], axis=1, inplace=True)
1316
  data = data.astype('str')
 
1329
  existing_free_sasa = [str(i) for i in existing_free_sasa]
1330
  existing_free_sasa = [i.split('/')[-1].split('.')[0] for i in existing_free_sasa]
1331
  print('Calculation RSA for PDB Structure Files...\n')
 
1332
  pdb_only = data[data.source == 'PDB']
1333
 
1334
 
 
1365
  existing_free_sasa = list(Path(path_to_output_files / 'freesasa_files').glob("*"))
1366
  existing_free_sasa = [str(i) for i in existing_free_sasa]
1367
  existing_free_sasa = [i.split('/')[-1].split('.')[0] for i in existing_free_sasa]
 
1368
  annotation_list += ['domainStartonPDB', 'domainEndonPDB']
1369
 
1370
  folder_path = path_to_output_files / 'freesasa_files'
 
1380
  modbase_only = None
1381
  data['uniprotSequence'] = data['uniprotSequence'].str.replace('U', 'C')
1382
  data['pdbSequence'] = data['pdbSequence'].str.replace('U', 'C')
 
 
1383
  for i in data.index:
1384
  id_ = data.at[i, 'pdbID'].lower()
1385
  up_id_ = data.at[i, 'uniprotID']
 
1401
 
1402
  alignments = get_alignments_3D(uniprotID, 'nan', pdb_path, pdbSequence, source, chain, pdbID, mode, Path(path_to_output_files / '3D_alignment'), file_format = 'gzip')
1403
  mutPos = data.at[i, 'mutationPositionOnPDB']
 
1404
  try:
1405
  coordMut = get_coords(mutPos, alignments, 'nan', 'nan', mode)[0]
1406
  except:
1407
  ValueError
1408
  coordMut = 'nan'
 
1409
  try:
1410
  sasa_pos = get_coords(mutPos, alignments, 'nan', 'nan', mode)[2]
1411
  data.at[i, 'sasa'] = sasa(data.at[i, 'source'], data.at[i, 'pdbID'], data.at[i, 'uniprotID'], sasa_pos,
 
1413
  except:
1414
  ValueError
1415
  data.at[i, 'sasa'] = 'nan' # mutation position is nan
1416
+
 
1417
  for annot in annotation_list:
1418
  annotx = []
1419
  try:
 
1479
  k = pd.Series((key, str(list(set(val)))))
1480
  interface_dataframe = interface_dataframe.append(k, ignore_index=True)
1481
  interface_dataframe.columns = ['uniprotID', 'positions']
1482
+
 
1483
  if len(data) == 0:
1484
  data = pd.DataFrame(
1485
  columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
 
1688
  hours, rem = divmod(end - start, 3600)
1689
  minutes, seconds = divmod(rem, 60)
1690
  print("Time passed: {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
1691
+ return ready