fatmacankara committed on
Commit
b24bdaf
·
1 Parent(s): 8b0bef9

Create modbaseModelAdd.py

Browse files
Files changed (1) hide show
  1. code/modbaseModelAdd.py +131 -0
code/modbaseModelAdd.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import numpy as np
3
+ import pandas as pd
4
+ from utils import *
5
+ from pathlib import Path
6
+ from bs4 import BeautifulSoup
7
+ from add_sasa import *
8
# Columns of the per-model table assembled from the ModBase files.
_MODBASE_COLUMNS = ['uniprotID', 'target_begin', 'target_end', 'quality_score',
                    'model_id', 'coordinates', 'AAonPDB', 'coordVAR']


def _cached_stems(directory):
    """Return the names of the entries in *directory*, each cut at its first dot."""
    # Path.name instead of str.split('/') so this also works with Windows separators.
    return {entry.name.split('.')[0] for entry in Path(directory).glob("*")}


def _parse_modbase_model(lines, var_pos, model_id):
    """Build one model row from the lines of an individual ModBase model file.

    Args:
        lines: Lines of the file written by ``addModbaseModels`` (a
            ``UniProt ID: ...`` line followed by the PDB-style model text).
        var_pos: Variant position as an ``int``.
        model_id: Model identifier to use when the header scan does not
            provide one.

    Returns:
        A dict keyed by ``_MODBASE_COLUMNS``, or ``None`` when the UniProt ID
        or the target begin/end header lines are missing.
    """
    uniprot_id = target_begin = target_end = None
    for line in lines:
        if line[0:10] == 'UniProt ID':
            uniprot_id = line.split(':')[1].strip()
        elif line[0:23] == 'REMARK 220 TARGET BEGIN':
            # Read to end of line (like MODEL ID / MPQS below); a fixed
            # three-character slice truncates positions >= 1000.
            target_begin = line[40:].strip()
        elif line[0:21] == 'REMARK 220 TARGET END':
            target_end = line[40:].strip()

    # Explicit check instead of relying on NameError, which could silently
    # reuse values left over from a previously parsed model.
    if uniprot_id is None or target_begin is None or target_end is None:
        return None

    quality_score = -999
    coordinates, aa_on_pdb, coord_var = {}, np.nan, np.nan

    # NOTE(review): the bounds are exclusive, so a variant located exactly on
    # the first or last modelled residue gets no coordinates - confirm intent.
    if int(target_begin) < var_pos < int(target_end):
        for line in lines:
            if line[0:27] == 'REMARK 220 MODPIPE MODEL ID':
                model_id = line[40:].strip()
            if line[0:15].strip() == 'REMARK 220 MPQS':
                quality_score = line[40:].strip()
            if line[0:4] == 'ATOM' and line[13:15] == 'CA':
                position = int(line[22:26].strip())
                aminoacid = threeToOne(line[17:20])
                # PDB ATOM records store x/y/z in columns 31-38, 39-46 and
                # 47-54 (1-based), i.e. these 0-based slices. Starting one
                # character later drops the sign/leading digit of wide values.
                coords = [line[30:38].strip(), line[38:46].strip(), line[46:54].strip()]
                coordinates[position] = coords
                if position == var_pos:
                    aa_on_pdb = aminoacid
                    coord_var = str(coords)
            if line[0:3] == 'TER':
                # Only the records before the first TER are read.
                break

    return {'uniprotID': uniprot_id, 'target_begin': target_begin,
            'target_end': target_end, 'quality_score': quality_score,
            'model_id': model_id, 'coordinates': coordinates,
            'AAonPDB': aa_on_pdb, 'coordVAR': coord_var}


def addModbaseModels(dataframe, path_to_input_files, path_to_output_files):
    """Attach ModBase model information to every row of *dataframe*.

    For each row the ModBase retrieval result of its UniProt ID is downloaded
    (unless a file for it already exists under ``modbase_structures``), every
    ``pdbfile`` entry is written to ``modbase_structures_individual``,
    ``run_freesasa``/``sasa`` are called for it and the model header and CA
    atoms are parsed.

    Args:
        dataframe: Table with at least the columns ``uniprotID``, ``pos``,
            ``wt`` and ``datapoint``. It is modified in place: its index is
            reset and a ``sasa`` column is written.
        path_to_input_files: Unused; kept for interface compatibility.
        path_to_output_files: Directory containing ``modbase_structures``,
            ``modbase_structures_individual`` and ``freesasa_files``.

    Returns:
        ``(modbase, no_modbase)``: ``modbase`` holds the merged rows that have
        a ``coordVAR`` value; ``no_modbase`` holds the rows for which no model
        was found plus the merged rows without ``coordVAR``. Two empty frames
        are returned for an empty *dataframe*.
    """
    if len(dataframe) == 0:
        # Nothing to look up; give callers the same two-frame shape as usual.
        return pd.DataFrame(columns=_MODBASE_COLUMNS), pd.DataFrame(columns=_MODBASE_COLUMNS)

    out_dir = Path(path_to_output_files)
    models_dir = out_dir / 'modbase_structures'
    individual_dir = out_dir / 'modbase_structures_individual'
    sasa_dir = out_dir / 'freesasa_files'

    # Get IDs from data to retrieve only their models from MODBASE
    dataframe.reset_index(inplace=True, drop=True)

    # Sets give O(1) membership tests and are updated while running, so an ID
    # that occurs in several rows is downloaded/extracted only once.
    downloaded = _cached_stems(models_dir)
    extracted = _cached_stems(individual_dir)

    print('Retrieving ModBase models...\n')
    model_rows = []           # one dict per successfully parsed model
    rows_without_models = []  # index labels of rows whose ID has no model

    # Get model files associated with each UniProtID
    for i in dataframe.index:
        protein = dataframe.at[i, 'uniprotID']
        varPos = int(dataframe.at[i, 'pos'])
        wt = dataframe.at[i, 'wt']
        name = models_dir / f'{protein}.txt'

        if protein not in downloaded:
            print('Downloading Modbase models for ', protein)
            url = 'https://salilab.org/modbase/retrieve/modbase/?databaseID=' + protein
            # NOTE(review): no timeout and no HTTP status check; a failed
            # response body is cached like a valid one - consider handling.
            req = requests.get(url)
            with open(name, 'wb') as f:
                f.write(req.content)
            downloaded.add(protein)
        else:
            print('Model exists for', protein)

        with open(name, encoding="utf8") as f:
            soup = BeautifulSoup(f.read(), 'lxml')

        pdb_entries = soup.find_all('pdbfile')
        if not pdb_entries:
            rows_without_models.append(i)
            continue

        for pdb in pdb_entries:
            model_id = str(pdb.contents[1])[10:-11]
            individual_path = individual_dir / f'{model_id}.txt'
            if model_id not in extracted:
                with open(individual_path, 'w', encoding="utf8") as individual:
                    individual.write(str('UniProt ID: ' + protein))
                    individual.write('\n')
                    individual.write(str(pdb.contents[3])[10:-11].strip())
                extracted.add(model_id)

            # NOTE(review): the model is written as '{model_id}.txt' but read
            # here as '{model_id.lower()}.txt'; these only agree when the ID
            # is already lower case - confirm.
            sasa_file = sasa_dir / f'{model_id.lower()}.txt'
            run_freesasa(
                individual_dir / f'{model_id.lower()}.txt',
                sasa_file,
                include_hetatms=True,
                outdir=None, force_rerun=False, file_type='pdb')
            # Overwritten for every model of this row, so the last one wins.
            dataframe.loc[i, 'sasa'] = sasa(protein, varPos, wt, 1, sasa_file,
                                            path_to_output_files, file_type='pdb')

            with open(individual_path, encoding="utf8") as m:
                lines = m.readlines()

            row = _parse_modbase_model(lines, varPos, model_id)
            if row is None:
                # Message text kept unchanged for log compatibility.
                print('This file doesnt have Quality Score. Replacer: -999', model_id)
                continue
            model_rows.append(row)

    # Rows whose UniProt ID returned no model at all.
    no_modbase = pd.DataFrame(columns=_MODBASE_COLUMNS)
    if rows_without_models:
        missing = dataframe.loc[rows_without_models]
        extra_columns = [c for c in missing.columns if c not in _MODBASE_COLUMNS]
        no_modbase = missing.reindex(columns=_MODBASE_COLUMNS + extra_columns)
        no_modbase = no_modbase.reset_index(drop=True)

    # Build and merge the model table once, instead of once per model.
    modbase = pd.DataFrame(columns=_MODBASE_COLUMNS)
    if model_rows:
        modbase_reduced = pd.DataFrame(
            model_rows,
            columns=['uniprotID', 'quality_score', 'model_id', 'coordinates', 'AAonPDB', 'coordVAR'])
        modbase = dataframe.merge(modbase_reduced, on='uniprotID', how='left')
        modbase['quality_score'] = modbase['quality_score'].astype(float)
        modbase = modbase.sort_values(by=['datapoint', 'quality_score'], ascending=False)
        modbase = modbase.reset_index(drop=True)
        modbase = modbase.fillna(np.nan)
        modbase = modbase.replace({'\'?\', ': '',
                                   ', \'?\'': '',
                                   '(': '', ')': '',
                                   '[\'?\']': np.nan,
                                   '[]': np.nan,
                                   'nan-nan': np.nan,
                                   '': np.nan})

    # Merged rows without coordinates for the variant count as "no model".
    no_modbase_no_Coord = modbase[pd.isna(modbase['coordVAR'])]
    no_modbase = pd.concat([no_modbase, no_modbase_no_Coord])
    modbase = modbase[~pd.isna(modbase['coordVAR'])]

    return modbase, no_modbase