Spaces:
Running
Running
Commit
·
1744db1
1
Parent(s):
108cc29
Update code/pdb_featureVector.py
Browse files- code/pdb_featureVector.py +5 -28
code/pdb_featureVector.py
CHANGED
@@ -95,8 +95,6 @@ def pdb(input_set, mode, impute):
|
|
95 |
data.domStart = data.domStart.replace({'nan': '-1'})
|
96 |
data.domEnd = data.domEnd.replace({'nan': '-1'})
|
97 |
data.distance = data.distance.replace({'nan': '-1'})
|
98 |
-
st.write('1')
|
99 |
-
st.write(data)
|
100 |
"""
|
101 |
STEP 4
|
102 |
Retrieve canonical and isoform UniProt sequences.
|
@@ -202,7 +200,6 @@ def pdb(input_set, mode, impute):
|
|
202 |
else:
|
203 |
pdbs = []
|
204 |
print('Processing PDB structures...\n')
|
205 |
-
st.write('2')
|
206 |
if pdbs == []:
|
207 |
print('No PDB structure found for the query. ')
|
208 |
print('Starting PDB structures download...\n')
|
@@ -303,7 +300,6 @@ def pdb(input_set, mode, impute):
|
|
303 |
filename.rename(filename_replace_ext.with_suffix('.pdb'))
|
304 |
except:
|
305 |
FileNotFoundError
|
306 |
-
st.write('3')
|
307 |
uniprot_matched = pd.merge(uniprot_matched, pdb_info, on='uniprotID', how='left')
|
308 |
uniprot_matched = uniprot_matched.astype(str)
|
309 |
uniprot_matched = uniprot_matched.drop_duplicates()
|
@@ -408,9 +404,7 @@ def pdb(input_set, mode, impute):
|
|
408 |
dfNM = dfNM.sort_values(['uniprotID', 'resolution'], axis=0, ascending=True)
|
409 |
dfNM = dfNM.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'pdbSequence'], keep='first')
|
410 |
dfNM.rename(columns={'isoformSequence': 'uniprotSequence'}, inplace=True)
|
411 |
-
|
412 |
-
st.write(dfM)
|
413 |
-
st.write(dfNM)
|
414 |
dfM = dfM.astype(str)
|
415 |
dfNM = dfNM.astype(str)
|
416 |
|
@@ -432,12 +426,8 @@ def pdb(input_set, mode, impute):
|
|
432 |
|
433 |
print('Aligning sequences...\n')
|
434 |
aligned_m = final_stage(dfM, annotation_list, Path(path_to_output_files / 'alignment_files'))
|
435 |
-
st.write('aligned_m')
|
436 |
-
st.write(aligned_m)
|
437 |
-
aligned_nm = final_stage(dfNM, annotation_list, Path(path_to_output_files / 'alignment_files'))
|
438 |
-
st.write(aligned_nm)
|
439 |
|
440 |
-
|
441 |
|
442 |
|
443 |
|
@@ -508,7 +498,6 @@ def pdb(input_set, mode, impute):
|
|
508 |
|
509 |
print('Proceeding to SwissModel search...')
|
510 |
print('------------------------------------\n')
|
511 |
-
st.write('5')
|
512 |
# At this point we have 4 dataframes
|
513 |
# 1. after_up_pdb_alignment --- This is after PDB sequence alignment. There may be mutations that wasnt found matching to after the alignment. Will be searched in other databases as well.
|
514 |
# 1a. aligned --- we are done with this.
|
@@ -607,7 +596,6 @@ def pdb(input_set, mode, impute):
|
|
607 |
|
608 |
with_swiss_models = pd.concat([to_swiss, no_swiss_models]).drop_duplicates(['datapoint'], keep=False)
|
609 |
with_swiss_models = with_swiss_models[to_swiss.columns]
|
610 |
-
st.write('6')
|
611 |
# Add model info.
|
612 |
|
613 |
with_swiss_models = with_swiss_models.astype(str)
|
@@ -713,7 +701,6 @@ def pdb(input_set, mode, impute):
|
|
713 |
swissmodels_fasta = pd.DataFrame(columns=['uniprotID', 'template', 'qmean_norm', 'chain', 'fasta'])
|
714 |
else:
|
715 |
swissmodels_fasta.columns = ['uniprotID', 'template', 'qmean_norm', 'chain', 'fasta']
|
716 |
-
st.write('7')
|
717 |
swissmodels_fasta = swissmodels_fasta.astype(str)
|
718 |
|
719 |
swiss_models_with_data.qmean_norm = swiss_models_with_data.qmean_norm.astype(float)
|
@@ -828,7 +815,6 @@ def pdb(input_set, mode, impute):
|
|
828 |
to_swiss_columns = to_swiss.columns
|
829 |
to_swiss_size = len(to_swiss.drop_duplicates(['datapoint']))
|
830 |
to_swiss = None
|
831 |
-
st.write('8')
|
832 |
# CONTROL
|
833 |
|
834 |
"""
|
@@ -1325,7 +1311,6 @@ def pdb(input_set, mode, impute):
|
|
1325 |
swiss['source'] = 'SWISSMODEL'
|
1326 |
modbase['source'] = 'MODBASE'
|
1327 |
data = pd.concat([swiss, modbase, pdb])
|
1328 |
-
st.write(data)
|
1329 |
data.reset_index(inplace=True)
|
1330 |
data.drop(['index'], axis=1, inplace=True)
|
1331 |
data = data.astype('str')
|
@@ -1344,7 +1329,6 @@ def pdb(input_set, mode, impute):
|
|
1344 |
existing_free_sasa = [str(i) for i in existing_free_sasa]
|
1345 |
existing_free_sasa = [i.split('/')[-1].split('.')[0] for i in existing_free_sasa]
|
1346 |
print('Calculation RSA for PDB Structure Files...\n')
|
1347 |
-
st.write(existing_free_sasa)
|
1348 |
pdb_only = data[data.source == 'PDB']
|
1349 |
|
1350 |
|
@@ -1381,7 +1365,6 @@ def pdb(input_set, mode, impute):
|
|
1381 |
existing_free_sasa = list(Path(path_to_output_files / 'freesasa_files').glob("*"))
|
1382 |
existing_free_sasa = [str(i) for i in existing_free_sasa]
|
1383 |
existing_free_sasa = [i.split('/')[-1].split('.')[0] for i in existing_free_sasa]
|
1384 |
-
st.write(existing_free_sasa)
|
1385 |
annotation_list += ['domainStartonPDB', 'domainEndonPDB']
|
1386 |
|
1387 |
folder_path = path_to_output_files / 'freesasa_files'
|
@@ -1397,8 +1380,6 @@ def pdb(input_set, mode, impute):
|
|
1397 |
modbase_only = None
|
1398 |
data['uniprotSequence'] = data['uniprotSequence'].str.replace('U', 'C')
|
1399 |
data['pdbSequence'] = data['pdbSequence'].str.replace('U', 'C')
|
1400 |
-
st.write('after')
|
1401 |
-
st.write(data)
|
1402 |
for i in data.index:
|
1403 |
id_ = data.at[i, 'pdbID'].lower()
|
1404 |
up_id_ = data.at[i, 'uniprotID']
|
@@ -1420,13 +1401,11 @@ def pdb(input_set, mode, impute):
|
|
1420 |
|
1421 |
alignments = get_alignments_3D(uniprotID, 'nan', pdb_path, pdbSequence, source, chain, pdbID, mode, Path(path_to_output_files / '3D_alignment'), file_format = 'gzip')
|
1422 |
mutPos = data.at[i, 'mutationPositionOnPDB']
|
1423 |
-
st.write('mutpos', mutPos)
|
1424 |
try:
|
1425 |
coordMut = get_coords(mutPos, alignments, 'nan', 'nan', mode)[0]
|
1426 |
except:
|
1427 |
ValueError
|
1428 |
coordMut = 'nan'
|
1429 |
-
st.write('coordMut', coordMut)
|
1430 |
try:
|
1431 |
sasa_pos = get_coords(mutPos, alignments, 'nan', 'nan', mode)[2]
|
1432 |
data.at[i, 'sasa'] = sasa(data.at[i, 'source'], data.at[i, 'pdbID'], data.at[i, 'uniprotID'], sasa_pos,
|
@@ -1434,8 +1413,7 @@ def pdb(input_set, mode, impute):
|
|
1434 |
except:
|
1435 |
ValueError
|
1436 |
data.at[i, 'sasa'] = 'nan' # mutation position is nan
|
1437 |
-
|
1438 |
-
st.write(data)
|
1439 |
for annot in annotation_list:
|
1440 |
annotx = []
|
1441 |
try:
|
@@ -1501,8 +1479,7 @@ def pdb(input_set, mode, impute):
|
|
1501 |
k = pd.Series((key, str(list(set(val)))))
|
1502 |
interface_dataframe = interface_dataframe.append(k, ignore_index=True)
|
1503 |
interface_dataframe.columns = ['uniprotID', 'positions']
|
1504 |
-
|
1505 |
-
st.write(data)
|
1506 |
if len(data) == 0:
|
1507 |
data = pd.DataFrame(
|
1508 |
columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
|
@@ -1711,4 +1688,4 @@ def pdb(input_set, mode, impute):
|
|
1711 |
hours, rem = divmod(end - start, 3600)
|
1712 |
minutes, seconds = divmod(rem, 60)
|
1713 |
print("Time passed: {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
|
1714 |
-
return ready
|
|
|
95 |
data.domStart = data.domStart.replace({'nan': '-1'})
|
96 |
data.domEnd = data.domEnd.replace({'nan': '-1'})
|
97 |
data.distance = data.distance.replace({'nan': '-1'})
|
|
|
|
|
98 |
"""
|
99 |
STEP 4
|
100 |
Retrieve canonical and isoform UniProt sequences.
|
|
|
200 |
else:
|
201 |
pdbs = []
|
202 |
print('Processing PDB structures...\n')
|
|
|
203 |
if pdbs == []:
|
204 |
print('No PDB structure found for the query. ')
|
205 |
print('Starting PDB structures download...\n')
|
|
|
300 |
filename.rename(filename_replace_ext.with_suffix('.pdb'))
|
301 |
except:
|
302 |
FileNotFoundError
|
|
|
303 |
uniprot_matched = pd.merge(uniprot_matched, pdb_info, on='uniprotID', how='left')
|
304 |
uniprot_matched = uniprot_matched.astype(str)
|
305 |
uniprot_matched = uniprot_matched.drop_duplicates()
|
|
|
404 |
dfNM = dfNM.sort_values(['uniprotID', 'resolution'], axis=0, ascending=True)
|
405 |
dfNM = dfNM.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'pdbSequence'], keep='first')
|
406 |
dfNM.rename(columns={'isoformSequence': 'uniprotSequence'}, inplace=True)
|
407 |
+
|
|
|
|
|
408 |
dfM = dfM.astype(str)
|
409 |
dfNM = dfNM.astype(str)
|
410 |
|
|
|
426 |
|
427 |
print('Aligning sequences...\n')
|
428 |
aligned_m = final_stage(dfM, annotation_list, Path(path_to_output_files / 'alignment_files'))
|
|
|
|
|
|
|
|
|
429 |
|
430 |
+
aligned_nm = final_stage(dfNM, annotation_list, Path(path_to_output_files / 'alignment_files'))
|
431 |
|
432 |
|
433 |
|
|
|
498 |
|
499 |
print('Proceeding to SwissModel search...')
|
500 |
print('------------------------------------\n')
|
|
|
501 |
# At this point we have 4 dataframes
|
502 |
# 1. after_up_pdb_alignment --- This is after PDB sequence alignment. There may be mutations that wasnt found matching to after the alignment. Will be searched in other databases as well.
|
503 |
# 1a. aligned --- we are done with this.
|
|
|
596 |
|
597 |
with_swiss_models = pd.concat([to_swiss, no_swiss_models]).drop_duplicates(['datapoint'], keep=False)
|
598 |
with_swiss_models = with_swiss_models[to_swiss.columns]
|
|
|
599 |
# Add model info.
|
600 |
|
601 |
with_swiss_models = with_swiss_models.astype(str)
|
|
|
701 |
swissmodels_fasta = pd.DataFrame(columns=['uniprotID', 'template', 'qmean_norm', 'chain', 'fasta'])
|
702 |
else:
|
703 |
swissmodels_fasta.columns = ['uniprotID', 'template', 'qmean_norm', 'chain', 'fasta']
|
|
|
704 |
swissmodels_fasta = swissmodels_fasta.astype(str)
|
705 |
|
706 |
swiss_models_with_data.qmean_norm = swiss_models_with_data.qmean_norm.astype(float)
|
|
|
815 |
to_swiss_columns = to_swiss.columns
|
816 |
to_swiss_size = len(to_swiss.drop_duplicates(['datapoint']))
|
817 |
to_swiss = None
|
|
|
818 |
# CONTROL
|
819 |
|
820 |
"""
|
|
|
1311 |
swiss['source'] = 'SWISSMODEL'
|
1312 |
modbase['source'] = 'MODBASE'
|
1313 |
data = pd.concat([swiss, modbase, pdb])
|
|
|
1314 |
data.reset_index(inplace=True)
|
1315 |
data.drop(['index'], axis=1, inplace=True)
|
1316 |
data = data.astype('str')
|
|
|
1329 |
existing_free_sasa = [str(i) for i in existing_free_sasa]
|
1330 |
existing_free_sasa = [i.split('/')[-1].split('.')[0] for i in existing_free_sasa]
|
1331 |
print('Calculation RSA for PDB Structure Files...\n')
|
|
|
1332 |
pdb_only = data[data.source == 'PDB']
|
1333 |
|
1334 |
|
|
|
1365 |
existing_free_sasa = list(Path(path_to_output_files / 'freesasa_files').glob("*"))
|
1366 |
existing_free_sasa = [str(i) for i in existing_free_sasa]
|
1367 |
existing_free_sasa = [i.split('/')[-1].split('.')[0] for i in existing_free_sasa]
|
|
|
1368 |
annotation_list += ['domainStartonPDB', 'domainEndonPDB']
|
1369 |
|
1370 |
folder_path = path_to_output_files / 'freesasa_files'
|
|
|
1380 |
modbase_only = None
|
1381 |
data['uniprotSequence'] = data['uniprotSequence'].str.replace('U', 'C')
|
1382 |
data['pdbSequence'] = data['pdbSequence'].str.replace('U', 'C')
|
|
|
|
|
1383 |
for i in data.index:
|
1384 |
id_ = data.at[i, 'pdbID'].lower()
|
1385 |
up_id_ = data.at[i, 'uniprotID']
|
|
|
1401 |
|
1402 |
alignments = get_alignments_3D(uniprotID, 'nan', pdb_path, pdbSequence, source, chain, pdbID, mode, Path(path_to_output_files / '3D_alignment'), file_format = 'gzip')
|
1403 |
mutPos = data.at[i, 'mutationPositionOnPDB']
|
|
|
1404 |
try:
|
1405 |
coordMut = get_coords(mutPos, alignments, 'nan', 'nan', mode)[0]
|
1406 |
except:
|
1407 |
ValueError
|
1408 |
coordMut = 'nan'
|
|
|
1409 |
try:
|
1410 |
sasa_pos = get_coords(mutPos, alignments, 'nan', 'nan', mode)[2]
|
1411 |
data.at[i, 'sasa'] = sasa(data.at[i, 'source'], data.at[i, 'pdbID'], data.at[i, 'uniprotID'], sasa_pos,
|
|
|
1413 |
except:
|
1414 |
ValueError
|
1415 |
data.at[i, 'sasa'] = 'nan' # mutation position is nan
|
1416 |
+
|
|
|
1417 |
for annot in annotation_list:
|
1418 |
annotx = []
|
1419 |
try:
|
|
|
1479 |
k = pd.Series((key, str(list(set(val)))))
|
1480 |
interface_dataframe = interface_dataframe.append(k, ignore_index=True)
|
1481 |
interface_dataframe.columns = ['uniprotID', 'positions']
|
1482 |
+
|
|
|
1483 |
if len(data) == 0:
|
1484 |
data = pd.DataFrame(
|
1485 |
columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
|
|
|
1688 |
hours, rem = divmod(end - start, 3600)
|
1689 |
minutes, seconds = divmod(rem, 60)
|
1690 |
print("Time passed: {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
|
1691 |
+
return ready
|