oucgc1996 committed on
Commit
4627e6f
1 Parent(s): 18ffc8d

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +3 -53
utils.py CHANGED
@@ -1,17 +1,3 @@
1
- # Copyright 2021 Gabriele Orlando
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
-
15
  import os,torch
16
  from pyuul.sources.globalVariables import *
17
  from pyuul.sources import hashings
@@ -30,20 +16,16 @@ setup_seed(100)
30
  def parseSDF(SDFFile):
31
  """
32
  function to parse pdb files. It can be used to parse a single file or all the pdb files in a folder. In case a folder is given, the coordinates will be padded
33
-
34
  Parameters
35
  ----------
36
  SDFFile : str
37
  path of the PDB file or of the folder containing multiple PDB files
38
-
39
  Returns
40
  -------
41
  coords : torch.Tensor
42
  coordinates of the atoms in the pdb file(s). Shape ( batch, numberOfAtoms, 3)
43
-
44
  atomNames : list
45
  a list of the atom identifier. It encodes atom type, residue type, residue position and chain
46
-
47
  """
48
  if not os.path.isdir(SDFFile):
49
  fil = SDFFile
@@ -98,7 +80,6 @@ def parsePDB(PDBFile,keep_only_chains=None,keep_hetatm=True,bb_only=False):
98
 
99
  """
100
  function to parse pdb files. It can be used to parse a single file or all the pdb files in a folder. In case a folder is given, the coordinates will be padded
101
-
102
  Parameters
103
  ----------
104
  PDBFile : str
@@ -113,10 +94,8 @@ def parsePDB(PDBFile,keep_only_chains=None,keep_hetatm=True,bb_only=False):
113
  -------
114
  coords : torch.Tensor
115
  coordinates of the atoms in the pdb file(s). Shape ( batch, numberOfAtoms, 3)
116
-
117
  atomNames : list
118
  a list of the atom identifier. It encodes atom type, residue type, residue position and chain
119
-
120
  """
121
 
122
  bbatoms = ["N", "CA", "C"]
@@ -200,31 +179,26 @@ def parsePDB(PDBFile,keep_only_chains=None,keep_hetatm=True,bb_only=False):
200
  atomNamesTMP += ["HET_"+str(resnum)+"_"+atnameHet+"_"+line[21]]
201
  coords+=[torch.tensor(coordsTMP)]
202
  atomNames += [atomNamesTMP]
203
- print(atomNames)
204
- print(pdbname)
205
- print(pdb_num)
206
  return torch.torch.nn.utils.rnn.pad_sequence(coords, batch_first=True, padding_value=PADDING_INDEX), atomNames, pdbname, pdb_num
207
 
208
 
209
  def atomlistToChannels(atomNames, hashing="Element_Hashing", device="cpu"):
210
  """
211
  function to get channels from atom names (obtained parsing the pdb files with the parsePDB function)
212
-
213
  Parameters
214
  ----------
215
  atomNames : list
216
  atom names obtained parsing the pdb files with the parsePDB function
217
-
218
  hashing : "TPL_Hashing" or "Element_Hashing" or dict
219
  define which atoms are grouped together. You can use two default hashings or build your own hashing:
220
-
221
  TPL_Hashing: uses the hashing of torch protein library (https://github.com/lupoglaz/TorchProteinLibrary)
222
  Element_Hashing: groups atoms in accordance with the element only: C -> 0, N -> 1, O -> 2, P -> 3, S -> 4, H -> 5, everything else -> 6
223
-
224
  Alternatively, if you are not happy with the default hashings, you can build a dictionary of dictionaries that defines the channel of every atom type in the pdb.
225
  the first dictionary has the residue tag (three letters amino acid code) as key (3 letters compound name for hetero atoms, as written in the PDB file)
226
  every residue key is associated with a dictionary, which has the atom tags (as written in the PDB files) as keys and the channel (int) as value
227
-
228
  for example, you can define the channels just based on the atom element as following:
229
  {
230
  'CYS': {'N': 1, 'O': 2, 'C': 0, 'SG': 3, 'CB': 0, 'CA': 0}, # channels for cysteine atoms
@@ -233,21 +207,16 @@ def atomlistToChannels(atomNames, hashing="Element_Hashing", device="cpu"):
233
  'GOL': {'O1':2,'O2':2,'O3':2,'C1':0,'C2':0,'C3':0}, # channels for glycerol atom
234
  ...
235
  }
236
-
237
  The default encoding is the one that assigns a different channel to each element
238
-
239
  other encodings can be found in sources/hashings.py
240
-
241
  device : torch.device
242
  The device on which the model should run. E.g. torch.device("cuda") or torch.device("cpu:0")
243
  Returns
244
  -------
245
  coords : torch.Tensor
246
  coordinates of the atoms in the pdb file(s). Shape ( batch, numberOfAtoms, 3)
247
-
248
  channels : torch.tensor
249
  the channel of every atom. Shape (batch,numberOfAtoms)
250
-
251
  """
252
  if hashing == "TPL_Hashing":
253
  hashing = hashings.TPLatom_hash
@@ -295,20 +264,15 @@ def atomlistToChannels(atomNames, hashing="Element_Hashing", device="cpu"):
295
  def atomlistToRadius(atomList, hashing="FoldX_radius", device="cpu"):
296
  """
297
  function to get radius from atom names (obtained parsing the pdb files with the parsePDB function)
298
-
299
-
300
-
301
  Parameters
302
  ----------
303
  atomNames : list
304
  atom names obtained parsing the pdb files with the parsePDB function
305
  hashing : FoldX_radius or dict
306
  "FoldX_radius" provides the radius used by the FoldX force field
307
-
308
  Alternatively, if you are not happy with the foldX radius, you can build a dictionary of dictionaries that defines the radius of every atom type in the pdb.
309
  The first dictionary has the residue tag (three letters amino acid code) as key (3 letters compound name for hetero atoms, as written in the PDB file)
310
  every residue key is associated with a dictionary, which has the atom tags (as written in the PDB files) as keys and the radius (float) as value
311
-
312
  for example, you can define the radius as following:
313
  {
314
  'CYS': {'N': 1.45, 'O': 1.37, 'C': 1.7, 'SG': 1.7, 'CB': 1.7, 'CA': 1.7}, # radius for cysteine atoms
@@ -317,21 +281,16 @@ def atomlistToRadius(atomList, hashing="FoldX_radius", device="cpu"):
317
  'GOL': {'O1':1.37,'O2':1.37,'O3':1.37,'C1':1.7,'C2':1.7,'C3':1.7}, # radius for glycerol atoms
318
  ...
319
  }
320
-
321
  The default radius are the ones defined in FoldX
322
-
323
  Radius default dictionary can be found in sources/hashings.py
324
-
325
  device : torch.device
326
  The device on which the model should run. E.g. torch.device("cuda") or torch.device("cpu:0")
327
  Returns
328
  -------
329
  coords : torch.Tensor
330
  coordinates of the atoms in the pdb file(s). Shape ( batch, numberOfAtoms, 3)
331
-
332
  radius : torch.tensor
333
  The radius of every atom. Shape (batch,numberOfAtoms)
334
-
335
  """
336
  if hashing == "FoldX_radius":
337
  hashing = hashings.radius
@@ -359,45 +318,36 @@ def atomlistToRadius(atomList, hashing="FoldX_radius", device="cpu"):
359
 
360
  '''
361
  def write_pdb(batchedCoords, atomNames , name=None, output_folder="outpdb/"): #I need to add the chain id
362
-
363
  if name is None:
364
  name = range(len(batchedCoords))
365
-
366
  for struct in range(len(name)):
367
  f = open(output_folder + str(name[struct]) + ".pdb", "w")
368
-
369
  coords=batchedCoords[struct].data.numpy()
370
  atname=atomNames[struct]
371
  for i in range(len(coords)):
372
-
373
  rnName = atname[i].split("_")[0]#hashings.resi_hash_inverse[resi_list[i]]
374
  atName = atname[i].split("_")[2]#hashings.atom_hash_inverse[resi_list[i]][atom_list[i]]
375
  pos = atname[i].split("_")[1]
376
  chain = "A"
377
-
378
  num = " " * (5 - len(str(i))) + str(i)
379
  a_name = atName + " " * (4 - len(atName))
380
  numres = " " * (4 - len(str(pos))) + str(pos)
381
-
382
  x = round(float(coords[i][0]), 3)
383
  sx = str(x)
384
  while len(sx.split(".")[1]) < 3:
385
  sx += "0"
386
  x = " " * (8 - len(sx)) + sx
387
-
388
  y = round(float(coords[i][1]), 3)
389
  sy = str(y)
390
  while len(sy.split(".")[1]) < 3:
391
  sy += "0"
392
  y = " " * (8 - len(sy)) + sy
393
-
394
  z = round(float(coords[i][2]), 3)
395
  sz = str(z)
396
  while len(sz.split(".")[1]) < 3:
397
  sz += "0"
398
  z = " " * (8 - len(sz)) + sz
399
  chain = " " * (2 - len(chain)) + chain
400
-
401
  if rnName !="HET":
402
  f.write("ATOM " + num + " " + a_name + "" + rnName + chain + numres + " " + x + y + z + " 1.00 64.10 " + atName[0] + "\n")
403
  else:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os,torch
2
  from pyuul.sources.globalVariables import *
3
  from pyuul.sources import hashings
 
16
  def parseSDF(SDFFile):
17
  """
18
  function to parse pdb files. It can be used to parse a single file or all the pdb files in a folder. In case a folder is given, the coordinates will be padded
 
19
  Parameters
20
  ----------
21
  SDFFile : str
22
  path of the PDB file or of the folder containing multiple PDB files
 
23
  Returns
24
  -------
25
  coords : torch.Tensor
26
  coordinates of the atoms in the pdb file(s). Shape ( batch, numberOfAtoms, 3)
 
27
  atomNames : list
28
  a list of the atom identifier. It encodes atom type, residue type, residue position and chain
 
29
  """
30
  if not os.path.isdir(SDFFile):
31
  fil = SDFFile
 
80
 
81
  """
82
  function to parse pdb files. It can be used to parse a single file or all the pdb files in a folder. In case a folder is given, the coordinates will be padded
 
83
  Parameters
84
  ----------
85
  PDBFile : str
 
94
  -------
95
  coords : torch.Tensor
96
  coordinates of the atoms in the pdb file(s). Shape ( batch, numberOfAtoms, 3)
 
97
  atomNames : list
98
  a list of the atom identifier. It encodes atom type, residue type, residue position and chain
 
99
  """
100
 
101
  bbatoms = ["N", "CA", "C"]
 
179
  atomNamesTMP += ["HET_"+str(resnum)+"_"+atnameHet+"_"+line[21]]
180
  coords+=[torch.tensor(coordsTMP)]
181
  atomNames += [atomNamesTMP]
182
+ print(atomNames)
183
+ print(pdbname)
184
+ print(pdb_num)
185
  return torch.torch.nn.utils.rnn.pad_sequence(coords, batch_first=True, padding_value=PADDING_INDEX), atomNames, pdbname, pdb_num
186
 
187
 
188
  def atomlistToChannels(atomNames, hashing="Element_Hashing", device="cpu"):
189
  """
190
  function to get channels from atom names (obtained parsing the pdb files with the parsePDB function)
 
191
  Parameters
192
  ----------
193
  atomNames : list
194
  atom names obtained parsing the pdb files with the parsePDB function
 
195
  hashing : "TPL_Hashing" or "Element_Hashing" or dict
196
  define which atoms are grouped together. You can use two default hashings or build your own hashing:
 
197
  TPL_Hashing: uses the hashing of torch protein library (https://github.com/lupoglaz/TorchProteinLibrary)
198
  Element_Hashing: groups atoms in accordance with the element only: C -> 0, N -> 1, O -> 2, P -> 3, S -> 4, H -> 5, everything else -> 6
 
199
  Alternatively, if you are not happy with the default hashings, you can build a dictionary of dictionaries that defines the channel of every atom type in the pdb.
200
  the first dictionary has the residue tag (three letters amino acid code) as key (3 letters compound name for hetero atoms, as written in the PDB file)
201
  every residue key is associated with a dictionary, which has the atom tags (as written in the PDB files) as keys and the channel (int) as value
 
202
  for example, you can define the channels just based on the atom element as following:
203
  {
204
  'CYS': {'N': 1, 'O': 2, 'C': 0, 'SG': 3, 'CB': 0, 'CA': 0}, # channels for cysteine atoms
 
207
  'GOL': {'O1':2,'O2':2,'O3':2,'C1':0,'C2':0,'C3':0}, # channels for glycerol atom
208
  ...
209
  }
 
210
  The default encoding is the one that assigns a different channel to each element
 
211
  other encodings can be found in sources/hashings.py
 
212
  device : torch.device
213
  The device on which the model should run. E.g. torch.device("cuda") or torch.device("cpu:0")
214
  Returns
215
  -------
216
  coords : torch.Tensor
217
  coordinates of the atoms in the pdb file(s). Shape ( batch, numberOfAtoms, 3)
 
218
  channels : torch.tensor
219
  the channel of every atom. Shape (batch,numberOfAtoms)
 
220
  """
221
  if hashing == "TPL_Hashing":
222
  hashing = hashings.TPLatom_hash
 
264
  def atomlistToRadius(atomList, hashing="FoldX_radius", device="cpu"):
265
  """
266
  function to get radius from atom names (obtained parsing the pdb files with the parsePDB function)
 
 
 
267
  Parameters
268
  ----------
269
  atomNames : list
270
  atom names obtained parsing the pdb files with the parsePDB function
271
  hashing : FoldX_radius or dict
272
  "FoldX_radius" provides the radius used by the FoldX force field
 
273
  Alternatively, if you are not happy with the foldX radius, you can build a dictionary of dictionaries that defines the radius of every atom type in the pdb.
274
  The first dictionary has the residue tag (three letters amino acid code) as key (3 letters compound name for hetero atoms, as written in the PDB file)
275
  every residue key is associated with a dictionary, which has the atom tags (as written in the PDB files) as keys and the radius (float) as value
 
276
  for example, you can define the radius as following:
277
  {
278
  'CYS': {'N': 1.45, 'O': 1.37, 'C': 1.7, 'SG': 1.7, 'CB': 1.7, 'CA': 1.7}, # radius for cysteine atoms
 
281
  'GOL': {'O1':1.37,'O2':1.37,'O3':1.37,'C1':1.7,'C2':1.7,'C3':1.7}, # radius for glycerol atoms
282
  ...
283
  }
 
284
  The default radius are the ones defined in FoldX
 
285
  Radius default dictionary can be found in sources/hashings.py
 
286
  device : torch.device
287
  The device on which the model should run. E.g. torch.device("cuda") or torch.device("cpu:0")
288
  Returns
289
  -------
290
  coords : torch.Tensor
291
  coordinates of the atoms in the pdb file(s). Shape ( batch, numberOfAtoms, 3)
 
292
  radius : torch.tensor
293
  The radius of every atom. Shape (batch,numberOfAtoms)
 
294
  """
295
  if hashing == "FoldX_radius":
296
  hashing = hashings.radius
 
318
 
319
  '''
320
  def write_pdb(batchedCoords, atomNames , name=None, output_folder="outpdb/"): #I need to add the chain id
 
321
  if name is None:
322
  name = range(len(batchedCoords))
 
323
  for struct in range(len(name)):
324
  f = open(output_folder + str(name[struct]) + ".pdb", "w")
 
325
  coords=batchedCoords[struct].data.numpy()
326
  atname=atomNames[struct]
327
  for i in range(len(coords)):
 
328
  rnName = atname[i].split("_")[0]#hashings.resi_hash_inverse[resi_list[i]]
329
  atName = atname[i].split("_")[2]#hashings.atom_hash_inverse[resi_list[i]][atom_list[i]]
330
  pos = atname[i].split("_")[1]
331
  chain = "A"
 
332
  num = " " * (5 - len(str(i))) + str(i)
333
  a_name = atName + " " * (4 - len(atName))
334
  numres = " " * (4 - len(str(pos))) + str(pos)
 
335
  x = round(float(coords[i][0]), 3)
336
  sx = str(x)
337
  while len(sx.split(".")[1]) < 3:
338
  sx += "0"
339
  x = " " * (8 - len(sx)) + sx
 
340
  y = round(float(coords[i][1]), 3)
341
  sy = str(y)
342
  while len(sy.split(".")[1]) < 3:
343
  sy += "0"
344
  y = " " * (8 - len(sy)) + sy
 
345
  z = round(float(coords[i][2]), 3)
346
  sz = str(z)
347
  while len(sz.split(".")[1]) < 3:
348
  sz += "0"
349
  z = " " * (8 - len(sz)) + sz
350
  chain = " " * (2 - len(chain)) + chain
 
351
  if rnName !="HET":
352
  f.write("ATOM " + num + " " + a_name + "" + rnName + chain + numres + " " + x + y + z + " 1.00 64.10 " + atName[0] + "\n")
353
  else: