fatmacankara committed on
Commit
dbf4494
·
1 Parent(s): 9a8f002

Create add_3Dalignment.py

Browse files
Files changed (1) hide show
  1. code/add_3Dalignment.py +284 -0
code/add_3Dalignment.py ADDED
@@ -0,0 +1,284 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This code file produces alignments between the structure and the sequence for a given protein.
3
+
4
+ """
5
+
6
+ import math
7
+ import glob
8
+ import numpy as np
9
+ from Bio import Align
10
+ import gzip
11
+ from pathlib import Path
12
+ from Bio.Align import substitution_matrices
13
+ # Module-level pairwise aligner shared by this file; get_alignments_3D sets its
+ # mode, substitution matrix and gap scores each time it is called.
+ aligner = Align.PairwiseAligner()
14
def convert_non_standard_amino_acids(sequence):
    """
    Replace ambiguous or non-standard one-letter residue codes with standard ones.

    B -> D (Asx), Z -> E (Glx), X -> A (unknown), U -> C (selenocysteine),
    J -> L (Leu/Ile), O -> K (pyrrolysine). The stop symbol '*' is removed.
    Any other character is passed through unchanged.
    """
    # Pair every non-standard code with its stand-in; '*' maps to the empty
    # string so that stop symbols disappear from the result.
    stand_ins = dict(zip('BZXUJO*', ('D', 'E', 'A', 'C', 'L', 'K', '')))
    return ''.join(stand_ins.get(residue, residue) for residue in sequence)
35
def distance(x1, y1, z1, x2, y2, z2):
    """Return the Euclidean distance between (x1, y1, z1) and (x2, y2, z2)."""
    # Squared differences per axis, summed in x, y, z order.
    dx_sq = math.pow(x2 - x1, 2)
    dy_sq = math.pow(y2 - y1, 2)
    dz_sq = math.pow(z2 - z1, 2) * 1.0
    return math.sqrt(dx_sq + dy_sq + dz_sq)
40
+
41
+
42
def find_distance(coordMut, coordAnnot):
    """
    Return the distance between two 3D coordinates, formatted with two decimals.

    Parameters
    ----------
    coordMut, coordAnnot :
        Indexable objects whose first three items can be converted with
        ``float()`` (x, y, z).

    Returns
    -------
    str
        The distance formatted as ``"%.2f"``, or the string ``'nan'`` when
        either argument cannot be read as three numbers (for example when it
        is a float NaN or the placeholder string ``'nan'``).
    """
    # The previous guard `coordMut != np.NaN` was always true (NaN compares
    # unequal to everything), so unusable input always reached the exception
    # handler and produced 'nan'. It also raised AttributeError on NumPy 2.0,
    # where the `np.NaN` alias no longer exists. The guard is dropped and the
    # observable result ('nan') is kept.
    try:
        dist = distance(float(coordMut[0]), float(coordMut[1]), float(coordMut[2]),
                        float(coordAnnot[0]), float(coordAnnot[1]), float(coordAnnot[2]))
    except (TypeError, ValueError, IndexError, KeyError, OverflowError):
        return 'nan'
    return "%.2f" % dist
54
+
55
+
56
# Three-letter residue names and their one-letter codes. Besides the twenty
# standard amino acids this covers the ambiguous/rare codes that
# convert_non_standard_amino_acids knows how to normalise afterwards.
_THREE_TO_ONE = {
    "ALA": "A", "ARG": "R", "VAL": "V", "GLU": "E", "PRO": "P",
    "LEU": "L", "GLY": "G", "ASN": "N", "SER": "S", "GLN": "Q",
    "THR": "T", "MET": "M", "LYS": "K", "ASP": "D", "ILE": "I",
    "PHE": "F", "TRP": "W", "TYR": "Y", "HIS": "H", "CYS": "C",
    "UNK": "X",  # unknown residue
    "ASX": "B",  # Asp or Asn (was wrongly mapped to 'O', i.e. pyrrolysine)
    "GLX": "Z",  # Glu or Gln
    "SEC": "U",  # selenocysteine
    "PYL": "O",  # pyrrolysine
    "XLE": "J",  # Leu or Ile
}


def threeToOne(variant):
    """
    Convert an upper-case three-letter residue name to its one-letter code.

    Names that are not in the lookup table are returned unchanged, as before.
    """
    return _THREE_TO_ONE.get(variant, variant)
102
+
103
+
104
_ALIGNMENT_GAP_CHARS = ('.', '-')


def _locate_in_alignment(alignment_lines, annot):
    """
    Find which structure residue is paired with sequence position ``annot``.

    ``alignment_lines`` is the printed alignment split into lines; line 0 is
    read as the aligned sequence and line 2 as the aligned structure sequence.

    Returns ``(posAtom, usable)``: ``posAtom`` is the 1-based index into the
    structure's residue list, and ``usable`` is False when the alignment
    column is outside the structure row, is a '-' gap there, or lies before
    the first aligned structure residue.
    """
    seq_row = alignment_lines[0]
    struct_row = alignment_lines[2]
    target = float(annot)

    # Leading gap run of the sequence row (only pre-counted when it starts with '.').
    startGap = 0
    if seq_row.startswith('.'):
        for symbol in seq_row:
            if symbol in _ALIGNMENT_GAP_CHARS:
                startGap += 1
            else:
                break

    # Walk the sequence row until the annot-th residue has been seen.
    # NOTE(review): if annot exceeds the number of residues in the row, the
    # loop simply ends on the last column and that column is used - confirm
    # this is intended.
    countGap = startGap
    countResidue = 0
    for symbol in seq_row[startGap:]:
        if symbol in _ALIGNMENT_GAP_CHARS:
            countGap += 1
        else:
            countResidue += 1
        if countResidue == target:
            break

    column = countResidue + countGap - 1

    # Gaps in the structure row before that column do not consume a residue.
    countGap_pdb = sum(1 for symbol in struct_row[0:column] if symbol in _ALIGNMENT_GAP_CHARS)
    posAtom = countResidue + countGap - countGap_pdb

    # Number of gap columns before the first aligned structure residue.
    realpdbStart = 0
    for symbol in struct_row:
        if symbol in _ALIGNMENT_GAP_CHARS:
            realpdbStart += 1
        else:
            break

    usable = (len(struct_row) > column
              and struct_row[column] != '-'
              and target >= float(realpdbStart) + 1)
    return posAtom, usable


def _coords_for_position(alignment, annot, coordinates, residue_numbers):
    """Return ``(coordinate, posAtom, residue_number)`` for one alignment; 'nan' marks a miss."""
    alignment_lines = str(alignment).strip().split('\n')
    posAtom, usable = _locate_in_alignment(alignment_lines, annot)
    coordWeWant = 'nan'
    residue_number_we_want = 'nan'
    if usable:
        try:
            coordWeWant = coordinates[posAtom - 1]
            residue_number_we_want = residue_numbers[posAtom - 1]
        except (IndexError, KeyError, TypeError):
            # Either lookup failing invalidates both values.
            coordWeWant = 'nan'
            residue_number_we_want = 'nan'
    return coordWeWant, posAtom, residue_number_we_want


def get_coords(annot, alignments, coords, resnums_for_sasa, mode):
    """
    Map a sequence position onto the structure through a sequence/structure alignment.

    Parameters
    ----------
    annot :
        1-based position in the protein sequence (number or numeric string).
        In mode 2 the string ``'nan'`` means "no position".
    alignments :
        Mode 1: an indexable ``(alignment_list, coordinates, residue_numbers)``;
        only the first alignment in the list is used.
        Mode 2: a single alignment object (or its printed text).
    coords, resnums_for_sasa :
        Coordinates and residue numbers of the structure residues; used in
        mode 2 only.
    mode :
        1 or 2, selecting the input layout described above.

    Returns
    -------
    tuple
        ``(coordinate, posAtom, residue_number)``. ``coordinate`` and
        ``residue_number`` are the string ``'nan'`` when the position cannot
        be mapped. ``(nan, nan, nan)`` (floats) is returned when there is
        nothing to look up: mode 2 with ``annot == 'nan'`` or ``annot > 1400``,
        mode 1 with an empty alignment list, or an unknown mode.

    Notes
    -----
    Assumes ``str(alignment)`` renders three lines: aligned sequence, match
    line, aligned structure sequence - TODO confirm against the installed
    Biopython version, whose printed layout has changed between releases.

    Fixes over the previous version: mode 1 no longer raises
    UnboundLocalError when the position cannot be mapped, mode 1 gets the
    same bounds check as mode 2, and ``np.nan`` replaces the ``np.NaN`` alias
    that NumPy 2.0 removed.
    """
    if mode == 1:
        for alignment in alignments[0]:
            # Only the first alignment is consulted.
            return _coords_for_position(alignment, annot, alignments[1], alignments[2])
        return np.nan, np.nan, np.nan

    if mode == 2:
        if annot == 'nan':
            return np.nan, np.nan, np.nan
        # Positions above 1400 are not looked up.
        # NOTE(review): presumably tied to the length of the structure models
        # used in this mode - confirm against the caller.
        if int(annot) > 1400:
            return np.nan, np.nan, np.nan
        return _coords_for_position(alignments, annot, coords, resnums_for_sasa)

    return np.nan, np.nan, np.nan
206
+
207
+
208
def _read_ca_atoms(lines, chain=None):
    """
    Collect sequence, coordinates and residue numbers from the CA ATOM records in ``lines``.

    When ``chain`` is None every chain is accepted; otherwise a record is kept
    only if its chain identifier (column 22) equals ``chain`` ignoring case,
    or is blank.

    Returns ``(sequence, coords, resnums)`` where ``coords`` holds
    ``[x, y, z]`` string triples and ``resnums`` the residue-number strings.
    """
    wanted_chain = None if chain is None else chain.upper()
    letters = []
    coords = []
    resnums = []
    for line in lines:
        if line[0:4].strip() != 'ATOM' or line[13:15].strip() != 'CA':
            continue
        if wanted_chain is not None and line[21].upper() != wanted_chain and line[21] != ' ':
            continue
        letters.append(threeToOne(line[17:20].strip()))
        # PDB fixed columns: x = 31-38, y = 39-46, z = 47-54 (1-based), i.e.
        # slices [30:38], [38:46], [46:54]. The earlier slices started one
        # column late and cut the sign/leading digit off wide values such as
        # "-123.456".
        coords.append([line[30:38].strip(), line[38:46].strip(), line[46:54].strip()])
        resnums.append(line[22:26].strip())
    return ''.join(letters), coords, resnums


def _align_and_write(pdbSequence, atomSequence, out_path):
    """Locally align the two sequences, write every alignment to ``out_path`` and return them as a list."""
    aligner.mode = 'local'
    aligner.substitution_matrix = substitution_matrices.load("BLOSUM62")
    aligner.open_gap_score = -11
    aligner.extend_gap_score = -1
    alignments = list(aligner.align(pdbSequence, atomSequence))
    # The context manager guarantees the report is flushed and closed.
    with open(out_path, "w") as report:
        for alignment in alignments:
            report.write(str(alignment))
            report.write('\n')
            report.write('\n')
    return alignments


def get_alignments_3D(identifier, model_num, pdb_path, pdbSequence, source, chain, pdbID, mode, path_3D_alignment, file_format='gzip'):
    """
    Align a protein sequence with the residue sequence read from a structure file.

    The CA atoms of the structure's ATOM records give the structure sequence,
    coordinates and residue numbers. Both sequences are normalised with
    ``convert_non_standard_amino_acids`` and locally aligned (BLOSUM62, gap
    open -11, gap extend -1). Every alignment is written to a text file in
    ``path_3D_alignment``.

    Parameters
    ----------
    identifier, model_num, pdbID :
        Used only to build the output file name
        (``{identifier}_{pdbID}_{chain}_alignment.txt`` in mode 1,
        ``{identifier}_{model_num}_3Dalignment.txt`` in mode 2).
    pdb_path :
        Structure file to read: plain text in mode 1; in mode 2 plain text
        when ``file_format == 'txt'`` or gzip-compressed when ``'gzip'``.
    pdbSequence :
        The protein sequence to align against the structure.
    source :
        Mode 1 only. ``'MODBASE'`` accepts CA atoms from every chain; any
        other value restricts them to ``chain`` (or a blank chain field).
    chain :
        Chain identifier used for filtering (mode 1) and in the file name.
    mode :
        1 or 2 as described above; any other value returns None.
    path_3D_alignment :
        Directory for the alignment report (``str`` or ``Path``).
    file_format :
        ``'txt'`` or ``'gzip'``; mode 2 only.

    Returns
    -------
    tuple or None
        ``(alignments, coords, resnums_for_sasa)``.

    Notes
    -----
    Fixes over the previous version: the ``'txt'`` branch opened an undefined
    name instead of ``pdb_path``; the report file was never closed;
    coordinate columns were sliced one character late; and the MODBASE
    record test ``line[0:7]`` skipped atoms whose serial number has five
    digits.
    """
    pdbSequence = convert_non_standard_amino_acids(pdbSequence)

    if mode == 1:
        chain_filter = None if source == 'MODBASE' else chain
        with open(pdb_path, encoding="utf8") as structure:
            atomSequence, coords, resnums_for_sasa = _read_ca_atoms(structure, chain_filter)
        out_path = Path(path_3D_alignment) / f'{identifier}_{pdbID}_{str(chain)}_alignment.txt'
    elif mode == 2:
        atomSequence, coords, resnums_for_sasa = '', [], []
        if file_format == 'txt':
            with open(pdb_path, encoding="utf8") as structure:
                atomSequence, coords, resnums_for_sasa = _read_ca_atoms(structure)
        elif file_format == 'gzip':
            with gzip.open(pdb_path, mode='rt', encoding="utf8") as structure:
                atomSequence, coords, resnums_for_sasa = _read_ca_atoms(structure)
        out_path = Path(path_3D_alignment) / f'{identifier}_{str(model_num)}_3Dalignment.txt'
    else:
        return None

    atomSequence = convert_non_standard_amino_acids(atomSequence)
    alignments = _align_and_write(pdbSequence, atomSequence, out_path)
    return alignments, coords, resnums_for_sasa