Spaces:
Running
Running
Commit
·
f44aa18
1
Parent(s):
84bc25a
Update code/pdb_featureVector.py
Browse files- code/pdb_featureVector.py +86 -79
code/pdb_featureVector.py
CHANGED
@@ -82,8 +82,10 @@ def pdb(input_set, mode, impute):
|
|
82 |
out_path = path_to_output_files / 'log.txt'
|
83 |
#sys.stdout = open(out_path, 'w')
|
84 |
data = clean_data(input_set)
|
|
|
85 |
data = add_uniprot_sequence(data)
|
86 |
match = data[(data.wt_sequence_match == 'm')]
|
|
|
87 |
iso = data[(data.wt_sequence_match == 'i')]
|
88 |
noMatch = data[(data.wt_sequence_match != 'm') & (data.wt_sequence_match != 'i')]
|
89 |
if len(data) == 0:
|
@@ -233,24 +235,28 @@ def pdb(input_set, mode, impute):
|
|
233 |
modbase.replace({'[]': np.NaN, 'nan-nan': np.NaN, '': np.NaN}, inplace=True)
|
234 |
modbase = modbase.fillna(np.NaN)
|
235 |
print('\n>> Adding Modbase residue positions...\n')
|
236 |
-
modbase_simple = modbase[['uniprotID', 'wt', 'pos', 'datapoint']]
|
237 |
-
modbase_simple = modbase_simple.drop_duplicates(['uniprotID', 'wt', 'pos' ,'datapoint'])
|
238 |
modbaseOut, no_modbase_models_updated = addModbaseModels(modbase_simple, path_to_input_files, path_to_output_files)
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
-
|
253 |
-
|
|
|
|
|
|
|
|
|
254 |
else:
|
255 |
modbase = modbase[SIMPLE_COLS]
|
256 |
|
@@ -266,7 +272,7 @@ def pdb(input_set, mode, impute):
|
|
266 |
'lipidationBinary', 'siteBinary', 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary', 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
|
267 |
'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary', 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary', 'modifiedResidueBinary', 'zincFingerBinary',
|
268 |
'motifBinary', 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary', 'glycosylationBinary', 'propeptideBinary', 'sasa']
|
269 |
-
|
270 |
if len(pdb)>0:
|
271 |
pdb = pdb[COLS]
|
272 |
pdb['Source'] = 'PDB'
|
@@ -282,7 +288,8 @@ def pdb(input_set, mode, impute):
|
|
282 |
modbase['Source'] = 'Modbase'
|
283 |
else:
|
284 |
modbase = pd.DataFrame()
|
285 |
-
no_modbase_models_updated
|
|
|
286 |
|
287 |
# st.write('======PDB==========')
|
288 |
# st.write(pdb.to_string())
|
@@ -291,13 +298,13 @@ def pdb(input_set, mode, impute):
|
|
291 |
# st.write('======MODBASE==========')
|
292 |
# st.write(modbase.to_string())
|
293 |
|
294 |
-
|
295 |
|
296 |
allData = pd.concat([pdb, swiss, modbase])
|
297 |
allData.reset_index(inplace=True, drop=True)
|
298 |
allData.replace({np.NaN: ''}, inplace=True)
|
299 |
-
#
|
300 |
-
#
|
301 |
if len(allData)>0:
|
302 |
allData.distance.replace({-1000: ''}, inplace=True)
|
303 |
|
@@ -318,52 +325,52 @@ def pdb(input_set, mode, impute):
|
|
318 |
k = pd.Series((key, str(list(set(val)))))
|
319 |
interface_dataframe = interface_dataframe.append(k, ignore_index=True)
|
320 |
interface_dataframe.columns = ['uniprotID', 'positions']
|
321 |
-
|
322 |
-
|
323 |
-
|
324 |
-
for i in
|
325 |
-
if (str(
|
326 |
-
|
327 |
-
elif (str(
|
328 |
-
|
329 |
-
elif (str(
|
330 |
-
|
331 |
-
elif (str(
|
332 |
-
|
333 |
-
elif
|
334 |
-
|
335 |
-
|
336 |
-
|
337 |
|
338 |
fisherResult = pd.read_csv(fisher_path, sep='\t')
|
339 |
significant_domains = fisherResult.domain.to_list()
|
340 |
-
for i in
|
341 |
-
if
|
342 |
-
|
343 |
else:
|
344 |
-
|
345 |
print('Final adjustments are being done...\n')
|
346 |
binaryCols = UNIPROT_ANNOTATION_COLS[-30:]
|
347 |
-
|
348 |
-
|
349 |
-
for i in
|
350 |
for j in binaryCols:
|
351 |
-
|
352 |
-
if (
|
353 |
-
|
354 |
-
elif
|
355 |
-
|
356 |
-
elif (
|
357 |
-
|
358 |
|
359 |
annotCols = UNIPROT_ANNOTATION_COLS[:30]
|
360 |
|
361 |
-
for i in
|
362 |
for annot in annotCols:
|
363 |
binaryName = str(annot) + 'Binary'
|
364 |
-
if
|
365 |
-
|
366 |
-
|
367 |
columns={'uniprotID': 'prot_uniprotAcc', 'wt': 'wt_residue', 'pos': 'position', 'mut': 'mut_residue',
|
368 |
'datapoint': 'meta_merged', 'datapoint_disease': 'meta-lab_merged', 'label': 'source_db',
|
369 |
'family': 'prot_family', 'domain': 'domains_all', 'domain_fisher': 'domains_sig',
|
@@ -403,7 +410,7 @@ def pdb(input_set, mode, impute):
|
|
403 |
'peptide': 'peptide_dist', 'transitPeptide': 'transitPeptide_dist',
|
404 |
'glycosylation': 'glycosylation_dist', 'propeptide': 'propeptide_dist'}, inplace=True)
|
405 |
|
406 |
-
|
407 |
['prot_uniprotAcc', 'wt_residue', 'mut_residue', 'position','Source', 'meta_merged', 'composition', 'polarity',
|
408 |
'volume',
|
409 |
'granthamScore', 'domains_all',
|
@@ -432,33 +439,33 @@ def pdb(input_set, mode, impute):
|
|
432 |
16.82,
|
433 |
20.46, 24.58, 9.99, 17.43, 20.08, 30.91, 20.86, 22.14, 21.91, 28.45, 17.81, 25.12, 20.33, 22.36]
|
434 |
col_index = 0
|
435 |
-
for col_ in
|
436 |
-
|
437 |
-
|
438 |
col_index += 1
|
439 |
-
|
440 |
-
|
441 |
-
|
442 |
elif (impute == 'False') or (impute == 'false'):
|
443 |
pass
|
444 |
-
|
445 |
-
|
446 |
|
447 |
# ready.to_csv(path_to_output_files / 'featurevector_pdb.txt', sep='\t', index=False)
|
448 |
-
if len(
|
449 |
print(
|
450 |
'No feature vector could be produced for input data. Please check the presence of a structure for the input proteins.')
|
451 |
-
|
452 |
-
|
453 |
-
|
454 |
-
|
455 |
-
|
456 |
-
|
457 |
-
|
458 |
-
hours, rem = divmod(end - start, 3600)
|
459 |
-
minutes, seconds = divmod(rem, 60)
|
460 |
-
print("Time passed: {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
|
461 |
-
#sys.stdout.close()
|
462 |
-
return data
|
463 |
-
|
464 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
82 |
out_path = path_to_output_files / 'log.txt'
|
83 |
#sys.stdout = open(out_path, 'w')
|
84 |
data = clean_data(input_set)
|
85 |
+
|
86 |
data = add_uniprot_sequence(data)
|
87 |
match = data[(data.wt_sequence_match == 'm')]
|
88 |
+
org_len = len(match)
|
89 |
iso = data[(data.wt_sequence_match == 'i')]
|
90 |
noMatch = data[(data.wt_sequence_match != 'm') & (data.wt_sequence_match != 'i')]
|
91 |
if len(data) == 0:
|
|
|
235 |
modbase.replace({'[]': np.NaN, 'nan-nan': np.NaN, '': np.NaN}, inplace=True)
|
236 |
modbase = modbase.fillna(np.NaN)
|
237 |
print('\n>> Adding Modbase residue positions...\n')
|
238 |
+
modbase_simple = modbase[['uniprotID', 'wt', 'pos', 'mut','datapoint']]
|
239 |
+
modbase_simple = modbase_simple.drop_duplicates(['uniprotID', 'wt', 'pos' ,'mut','datapoint'])
|
240 |
modbaseOut, no_modbase_models_updated = addModbaseModels(modbase_simple, path_to_input_files, path_to_output_files)
|
241 |
+
|
242 |
+
if len(modbaseOut) > 0:
|
243 |
+
modbase = modbase.merge(modbaseOut, on = ['uniprotID', 'wt', 'pos', 'mut','datapoint'], how = 'left')
|
244 |
+
no_modbase_models_updated['sasa'] = np.NaN
|
245 |
+
modbase.reset_index(inplace=True, drop=True)
|
246 |
+
no_modbase_add = modbase[pd.isna(modbase.coordinates)]
|
247 |
+
modbase = modbase[~pd.isna(modbase.coordinates)]
|
248 |
+
no_modbase_models_updated = pd.concat([no_modbase_models_updated, no_modbase_add])
|
249 |
+
print('\n>> Mapping to Modbase models...\n')
|
250 |
+
modbase = changeUPtoModels(modbase)
|
251 |
+
print('\n>> Calculating 3D distances for Modbase models...\n')
|
252 |
+
modbase = isZeroDistance(modbase)
|
253 |
+
modbase = match3DModels(modbase)
|
254 |
+
modbase = selectMaxAnnot(modbase)
|
255 |
+
modbase = modbase.sort_values(by=['datapoint', 'quality_score', 'distance','hitTotal', 'annotTotal'], ascending=[True, True, True, True, True])
|
256 |
+
modbase = modbase.drop_duplicates(['datapoint'])
|
257 |
+
modbase.replace({'[]': np.NaN, 'hit': 0.0}, inplace=True)
|
258 |
+
else:
|
259 |
+
modbase = pd.DataFrame(columns = SIMPLE_COLS)
|
260 |
else:
|
261 |
modbase = modbase[SIMPLE_COLS]
|
262 |
|
|
|
272 |
'lipidationBinary', 'siteBinary', 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary', 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
|
273 |
'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary', 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary', 'modifiedResidueBinary', 'zincFingerBinary',
|
274 |
'motifBinary', 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary', 'glycosylationBinary', 'propeptideBinary', 'sasa']
|
275 |
+
no_modbase_models_updated = no_modbase_models_updated.drop_duplicates()
|
276 |
if len(pdb)>0:
|
277 |
pdb = pdb[COLS]
|
278 |
pdb['Source'] = 'PDB'
|
|
|
288 |
modbase['Source'] = 'Modbase'
|
289 |
else:
|
290 |
modbase = pd.DataFrame()
|
291 |
+
if len(no_modbase_models_updated) == 0:
|
292 |
+
no_modbase_models_updated = pd.DataFrame()
|
293 |
|
294 |
# st.write('======PDB==========')
|
295 |
# st.write(pdb.to_string())
|
|
|
298 |
# st.write('======MODBASE==========')
|
299 |
# st.write(modbase.to_string())
|
300 |
|
301 |
+
|
302 |
|
303 |
allData = pd.concat([pdb, swiss, modbase])
|
304 |
allData.reset_index(inplace=True, drop=True)
|
305 |
allData.replace({np.NaN: ''}, inplace=True)
|
306 |
+
# st.write('======ALL DATA==========')
|
307 |
+
# st.write(allData.to_string())
|
308 |
if len(allData)>0:
|
309 |
allData.distance.replace({-1000: ''}, inplace=True)
|
310 |
|
|
|
325 |
k = pd.Series((key, str(list(set(val)))))
|
326 |
interface_dataframe = interface_dataframe.append(k, ignore_index=True)
|
327 |
interface_dataframe.columns = ['uniprotID', 'positions']
|
328 |
+
final_data = finalTouch(allData)
|
329 |
+
final_data = final_data.merge(interface_dataframe, on='uniprotID', how='left')
|
330 |
+
final_data.positions = final_data.positions.astype('str')
|
331 |
+
for i in final_data.index:
|
332 |
+
if (str(final_data.at[i, 'pos']) in final_data.at[i, 'positions']) and final_data.at[i, 'trsh4'] == 'surface':
|
333 |
+
final_data.at[i, 'threeState_trsh4_HQ'] = 'interface'
|
334 |
+
elif (str(final_data.at[i, 'pos']) not in final_data.at[i, 'positions']) and final_data.at[i, 'trsh4'] == 'surface':
|
335 |
+
final_data.at[i, 'threeState_trsh4_HQ'] = 'surface'
|
336 |
+
elif (str(final_data.at[i, 'pos']) not in final_data.at[i, 'positions']) and final_data.at[i, 'trsh4'] == 'core':
|
337 |
+
final_data.at[i, 'threeState_trsh4_HQ'] = 'core'
|
338 |
+
elif (str(final_data.at[i, 'pos']) in final_data.at[i, 'positions']) and final_data.at[i, 'trsh4'] == 'core':
|
339 |
+
final_data.at[i, 'threeState_trsh4_HQ'] = 'conflict'
|
340 |
+
elif final_data.at[i, 'trsh4'] == 'nan':
|
341 |
+
final_data.at[i, 'threeState_trsh4_HQ'] = 'nan'
|
342 |
+
|
343 |
+
final_data.drop(['positions'], axis=1, inplace=True)
|
344 |
|
345 |
fisherResult = pd.read_csv(fisher_path, sep='\t')
|
346 |
significant_domains = fisherResult.domain.to_list()
|
347 |
+
for i in final_data.index:
|
348 |
+
if final_data.at[i, 'domain'] in significant_domains:
|
349 |
+
final_data.at[i, 'domain_fisher'] = final_data.at[i, 'domain']
|
350 |
else:
|
351 |
+
final_data.at[i, 'domain_fisher'] = 'NULL'
|
352 |
print('Final adjustments are being done...\n')
|
353 |
binaryCols = UNIPROT_ANNOTATION_COLS[-30:]
|
354 |
+
final_data = final_data.astype(str)
|
355 |
+
final_data.replace({'NaN': 'nan'}, inplace=True)
|
356 |
+
for i in final_data.index:
|
357 |
for j in binaryCols:
|
358 |
+
final_data[j] = final_data[j].astype('str')
|
359 |
+
if (final_data.at[i, j] == '0') or (final_data.at[i, j] == '0.0'):
|
360 |
+
final_data.at[i, j] = '1'
|
361 |
+
elif final_data.at[i, j] == 'nan':
|
362 |
+
final_data.at[i, j] = '0'
|
363 |
+
elif (final_data.at[i, j] == '1') or (final_data.at[i, j] == '1.0'):
|
364 |
+
final_data.at[i, j] = '2'
|
365 |
|
366 |
annotCols = UNIPROT_ANNOTATION_COLS[:30]
|
367 |
|
368 |
+
for i in final_data.index:
|
369 |
for annot in annotCols:
|
370 |
binaryName = str(annot) + 'Binary'
|
371 |
+
if final_data.at[i, binaryName] == '2':
|
372 |
+
final_data.at[i, annot] = '0.0'
|
373 |
+
final_data.rename(
|
374 |
columns={'uniprotID': 'prot_uniprotAcc', 'wt': 'wt_residue', 'pos': 'position', 'mut': 'mut_residue',
|
375 |
'datapoint': 'meta_merged', 'datapoint_disease': 'meta-lab_merged', 'label': 'source_db',
|
376 |
'family': 'prot_family', 'domain': 'domains_all', 'domain_fisher': 'domains_sig',
|
|
|
410 |
'peptide': 'peptide_dist', 'transitPeptide': 'transitPeptide_dist',
|
411 |
'glycosylation': 'glycosylation_dist', 'propeptide': 'propeptide_dist'}, inplace=True)
|
412 |
|
413 |
+
final_data = final_data[
|
414 |
['prot_uniprotAcc', 'wt_residue', 'mut_residue', 'position','Source', 'meta_merged', 'composition', 'polarity',
|
415 |
'volume',
|
416 |
'granthamScore', 'domains_all',
|
|
|
439 |
16.82,
|
440 |
20.46, 24.58, 9.99, 17.43, 20.08, 30.91, 20.86, 22.14, 21.91, 28.45, 17.81, 25.12, 20.33, 22.36]
|
441 |
col_index = 0
|
442 |
+
for col_ in final_data.columns[-30:]:
|
443 |
+
final_data[col_] = final_data[col_].fillna(filler[col_index])
|
444 |
+
final_data[col_] = final_data[col_].replace({'nan': filler[col_index]})
|
445 |
col_index += 1
|
446 |
+
final_data['domains_3Ddist'] = final_data['domains_3Ddist'].fillna(24.5)
|
447 |
+
final_data['sasa'] = final_data['sasa'].fillna(29.5)
|
448 |
+
final_data['location_3state'] = final_data['location_3state'].fillna('unknown')
|
449 |
elif (impute == 'False') or (impute == 'false'):
|
450 |
pass
|
451 |
+
final_data = final_data.replace({'nan': np.NaN})
|
452 |
+
final_data.domains_all = final_data.domains_all.replace({-1: 'NULL'})
|
453 |
|
454 |
# ready.to_csv(path_to_output_files / 'featurevector_pdb.txt', sep='\t', index=False)
|
455 |
+
if len(final_data) == 0:
|
456 |
print(
|
457 |
'No feature vector could be produced for input data. Please check the presence of a structure for the input proteins.')
|
458 |
+
final_data.to_csv(path_to_output_files / 'featurevector_pdb.txt', sep='\t', index=False)
|
459 |
+
|
460 |
+
print('Feature vector successfully created...')
|
461 |
+
end = timer()
|
462 |
+
hours, rem = divmod(end - start, 3600)
|
463 |
+
minutes, seconds = divmod(rem, 60)
|
464 |
+
print("Time passed: {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
|
|
|
|
|
|
|
|
|
|
|
|
|
465 |
|
466 |
+
return final_data
|
467 |
+
elif len(no_modbase_models_updated) >0 and (len(no_modbase_models_updated) !=org_len):
|
468 |
+
st.write(f'{len(no_modbase_models_updated)} of {org_len} datapoins could not be mapped to any structures.')
|
469 |
+
st.write(f'{org_len-len(no_modbase_models_updated)} of {org_len} datapoins could not be mapped to any structures.')
|
470 |
+
elif len(no_modbase_models_updated) == org_len:
|
471 |
+
st.write(f'0 of {org_len} datapoins could not be mapped to any structures. Feature vector could not be created.')
|