import ast
import json
import math
from io import StringIO
from pathlib import Path

import numpy as np
import pandas as pd
import requests
import unipressed
from Bio.PDB import PDBList, PDBParser
from unipressed import IdMappingClient

from add_sasa import *
from utils import *
# The first 30 entries are the positional UniProt annotation columns; the second 30 are
# their presence/absence ('Binary') counterparts, in the same order. Downstream code
# relies on this ordering via UNIPROT_ANNOTATION_COLS[0:30].
_ANNOTATION_TYPES = [
    'disulfide', 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
    'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis',
    'strand', 'helix', 'turn', 'metalBinding', 'repeat', 'topologicalDomain', 'caBinding',
    'bindingSite', 'region', 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif',
    'coiledCoil', 'peptide', 'transitPeptide', 'glycosylation', 'propeptide']
UNIPROT_ANNOTATION_COLS = _ANNOTATION_TYPES + [f'{c}Binary' for c in _ANNOTATION_TYPES]
def get_pdb_ids(protein_id):
    """Map a UniProt accession to its associated PDB IDs via the UniProt ID-mapping service."""
    try:
        request = IdMappingClient.submit(
            source="UniProtKB_AC-ID", dest="PDB", ids={protein_id})
        return [i['to'] for i in request.each_result()]
    except requests.exceptions.HTTPError:
        return []
    except unipressed.id_mapping.core.IdMappingError:
        print('IdMappingError caused by UniProt API service, please try again later.')
        return []
    except KeyError:
        return []
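
# Hedged example (queries the live UniProt ID-mapping service, so results vary over time):
#   get_pdb_ids('P04637')  # TP53 -> a list of PDB IDs such as '1TUP'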


def fix_filename(filename):
    """Normalise PDB file names downloaded by Biopython (e.g. 'pdb1tup.ent' -> '1tup.pdb').

    Returns the (possibly renamed) path, or None if the file does not exist.
    """
    try:
        filename = Path(filename)
        if filename.suffix == '.pdb':
            return filename
        if filename.stem.startswith("pdb") or filename.stem.endswith("ent"):
            # Biopython saves entries as pdbXXXX.ent; strip the prefix and fix the suffix.
            fixed = filename.with_name(filename.stem[3:]).with_suffix('.pdb')
        else:
            fixed = filename.with_suffix('.pdb')
        filename.rename(fixed)
        return fixed
    except FileNotFoundError:
        return None


def fetch_uniprot_ids(pdb_code):
    """Return the UniProt accessions mapped to a PDB entry by the PDBe SIFTS service."""
    response = requests.get(f"https://www.ebi.ac.uk/pdbe/api/mappings/uniprot/{pdb_code}")
    response.raise_for_status()
    resp = response.json()
    # Response shape: {pdb_code: {database: {accession: {...}, ...}}}; pull the accession keys.
    return list(list(list(resp.values())[0].values())[0].keys())
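
# Hedged example (queries the live PDBe SIFTS service):
#   fetch_uniprot_ids('1tup')  # -> ['P04637'] for the TP53 core-domain structure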

def addPDBinfo(data, path_to_output_files):
    """Collect PDB structures matching the query UniProt IDs, with chain, resolution and DBREF ranges."""
    pdb_info = pd.DataFrame(columns=['uniprotID', 'pdbID', 'chain', 'resolution'])
    print('Retrieving PDB structures...\n')
    up_list = data.uniprotID.to_list()
    pdbs = [get_pdb_ids(i) for i in up_list]

    # Flatten, drop empties, deduplicate and lower-case the PDB IDs.
    pdbs = {i.lower() for sublist in pdbs for i in sublist if i}
    if not pdbs:
        print('No PDB structure found for the query.')

    print('\n>>Starting PDB structures download...\n')
    print('\n>>Processing PDB structures...\n')
    pdb_parser = PDBParser(QUIET=True)  # QUIET=True suppresses parser warnings

    index = 0
    for search in pdbs:
        print(f'Searching for {search.upper()}')
        try:
            pdb_url = f"https://files.rcsb.org/download/{search}.pdb"
            response = requests.get(pdb_url)
            response.raise_for_status()  # Check for a successful response
            pdb_data = response.text
            structure = pdb_parser.get_structure(search, StringIO(pdb_data))
            header = structure.header
            # Keep only DBREF records that point at UniProt (UNP) entries.
            dbref_records = [list(filter(None, line.split(' ')))
                             for line in pdb_data.split('\n') if line.startswith('DBREF')]
            dbref_records = [fields for fields in dbref_records
                             if len(fields) > 9 and fields[5] == 'UNP']
            for fields in dbref_records:
                if fields[6].split('-')[0] in up_list:
                    pdb_info.at[index, 'uniprotID'] = fields[6].split('-')[0]
                    pdb_info.at[index, 'pdbID'] = fields[1].upper()
                    pdb_info.at[index, 'chain'] = fields[2].upper()
                    pdb_info.at[index, 'resolution'] = header.get('resolution', 'N/A')
                    pdb_info.at[index, 'start'] = fields[8]
                    pdb_info.at[index, 'end'] = fields[9]
                    index += 1
        except Exception:
            continue
    pdb_info.replace({'None': np.nan}, inplace=True)
    print('PDB file processing finished..')

    return pdb_info
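
# Hedged usage sketch (assuming a query frame with a 'uniprotID' column):
#   query = pd.DataFrame({'uniprotID': ['P04637']})
#   pdb_info = addPDBinfo(query, Path('output'))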

def downloadPDB(pdbID, path_to_output_files):
    """Download a PDB structure (if not already cached) and compute its FreeSASA output."""
    pdbl = PDBList()
    pdb_dir = Path(path_to_output_files) / 'pdb_structures'
    existing_pdb = [p.stem.lower() for p in pdb_dir.glob("*")]
    if pdbID not in existing_pdb:
        downloaded = pdbl.retrieve_pdb_file(pdbID, pdir=pdb_dir, file_format="pdb")
        fix_filename(downloaded)
    else:
        print(f'PDB file for {pdbID.upper()} exists..')
    file = pdb_dir / f'{pdbID}.pdb'

    existing_free_sasa = [p.stem for p in (Path(path_to_output_files) / 'freesasa_files').glob("*")]
    if pdbID not in existing_free_sasa:
        run_freesasa(file, Path(path_to_output_files) / 'freesasa_files' / f'{pdbID}.txt',
                     include_hetatms=True, outdir=None, force_rerun=False, file_type='pdb')

    return file
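
# Hedged usage sketch, assuming 'pdb_structures' and 'freesasa_files' folders exist
# under the output path:
#   pdb_path = downloadPDB('1tup', Path('output'))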

def processFile(data, path_to_output_files):
    """Download each mutation's PDB file, compute SASA, and record CA-atom coordinates per residue."""
    for i in data.index:
        protein = data.at[i, 'uniprotID']
        pdbID = data.at[i, 'pdbID'].lower()
        chain = data.at[i, 'chain']
        pos = int(data.at[i, 'pos'])
        wt = data.at[i, 'wt']

        url = f'https://files.rcsb.org/download/{pdbID}.pdb'
        response = requests.get(url)
        if response.status_code == 200:
            with open(f'{path_to_output_files}/pdb_structures/{pdbID}.pdb', 'w') as f:
                f.write(response.text)
            print(f"Downloaded {pdbID}.pdb successfully.")
        else:
            print(f"Failed to download {pdbID}.pdb. Status code: {response.status_code}")
        file = Path(path_to_output_files) / 'pdb_structures' / f'{pdbID}.pdb'

        run_freesasa(file, Path(path_to_output_files) / 'freesasa_files' / f'{pdbID}.txt',
                     include_hetatms=True, outdir=None, force_rerun=False, file_type='pdb')
        filename = Path(path_to_output_files) / 'freesasa_files' / f'{pdbID}.txt'
        data.loc[i, 'sasa'] = sasa(protein, pos, wt, 1, filename, path_to_output_files, file_type='pdb')

        # Map residue numbers to CA coordinates for the target chain, reading the
        # fixed-width PDB ATOM columns; a blank chain ID is accepted as a match.
        newCol = {}
        with open(file, encoding="utf8") as f:
            for line in f.readlines():
                if line[0:4].strip() == 'ATOM' and line[13:15].strip() == 'CA' and \
                        (line[21].upper() == chain.upper() or line[21] == ' '):
                    coords = [line[31:38].strip(), line[39:46].strip(), line[47:54].strip()]
                    resnums_for_sasa = line[22:26].strip()
                    newCol[resnums_for_sasa] = coords
        data.at[i, 'coordinates'] = json.dumps(newCol)
    return data
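
# After processFile, each row's 'coordinates' cell holds a JSON string mapping residue
# numbers to CA coordinates, e.g. '{"1": ["11.104", "6.134", "-6.504"], ...}' (values kept
# as the strings parsed from the fixed-width ATOM columns).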

def distance(x1, y1, z1, x2, y2, z2):
    """Euclidean distance between two 3D points."""
    return math.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2 + (z2 - z1) ** 2)
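
# Quick check (3-4-5 right triangle): distance(0, 0, 0, 3, 4, 0) == 5.0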


def find_distance(coordMut, coordAnnot):
    """Distance between a mutation's CA coordinates and an annotation's CA coordinates."""
    # NaN never compares equal to anything, so the original `coordMut != np.nan` test was
    # always True; check for a usable coordinate triple instead.
    if isinstance(coordMut, (list, tuple)) and len(coordMut) == 3:
        try:
            dist = distance(float(coordMut[0]), float(coordMut[1]), float(coordMut[2]),
                            float(coordAnnot[0]), float(coordAnnot[1]), float(coordAnnot[2]))
            return "%.2f" % dist
        except (ValueError, TypeError, IndexError):
            return 'nan'
    return np.nan
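
# Example with string coordinates as parsed from ATOM records:
#   find_distance(['0.0', '0.0', '0.0'], ['3.0', '4.0', '0.0'])  # -> '5.00'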

def domainDistance(domStart, domEnd, coordinates, mutationPosition, matchList, posOnPDB):
    """Minimum 3D distance between the mutation and any resolved residue of the domain."""
    domainDistanceList = []
    match_dict = ast.literal_eval(matchList)
    for i in range(int(domStart), int(domEnd)):
        try:
            domainPos = match_dict[str(i)]
            coordMut = coordinates[str(posOnPDB)]
            coordDomain = coordinates[str(domainPos)]
        except KeyError:
            # Residue not resolved in the structure; skip it rather than aborting the scan.
            continue
        d = find_distance(coordMut, coordDomain)
        if d != 'nan' and d is not np.nan:
            # find_distance returns formatted strings; compare numerically, not lexicographically.
            domainDistanceList.append(float(d))
    return min(domainDistanceList) if domainDistanceList else np.nan



def match3D(data):
    """Convert positional annotations into 3D distances from the mutation site."""
    for i in data.index:
        coordinates = ast.literal_eval(data.at[i, 'coordinates'])
        pos = str(data.at[i, 'pos'])
        matchList = data.at[i, 'MATCHDICT']
        try:
            posOnPDB = ast.literal_eval(matchList)[pos]
            coordMut = coordinates[str(posOnPDB)]
            if data.at[i, 'distance'] == -1000:
                domStart = data.at[i, 'domStart']
                domEnd = data.at[i, 'domEnd']
                data.at[i, 'distance'] = domainDistance(domStart, domEnd, coordinates, pos, matchList, posOnPDB)
        except (KeyError, ValueError):
            # Mutation position missing from the alignment or the structure.
            coordMut = np.nan
            data.at[i, 'distance'] = np.nan

        for col in UNIPROT_ANNOTATION_COLS[0:30]:
            val = data.at[i, col]
            # `val != np.nan` is always True (NaN never equals anything); at this stage the
            # usable cells hold stringified position lists, so test the type instead.
            if not (isinstance(val, str) and val not in ('hit', '[]', '')):
                continue
            allDist = []
            annotation_list = ast.literal_eval(val)
            integer_list = [int(element) for element in annotation_list if element != 'null']
            for annotPosition in integer_list:
                try:
                    coordAnnot = coordinates[str(annotPosition)]
                except KeyError:
                    continue
                allDist.append(find_distance(coordMut, coordAnnot))
            allDist = [float(d) for d in allDist if d != 'nan' and d is not np.nan]
            if allDist:
                data.at[i, col] = min(allDist)
    return data
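
# Hedged sketch of the expected inputs (column names as used above): 'coordinates' is the
# JSON string written by processFile, 'MATCHDICT' maps UniProt positions to PDB residue
# numbers, and 'distance' uses -1000 as a "compute domain distance" sentinel.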


def domainDistanceModels(domStart, domEnd, coordinates, mutationPosition):
    """Minimum 3D distance between the mutation and any domain residue (model structures)."""
    domainDistanceList = []
    for i in range(int(domStart), int(domEnd)):
        try:
            coordMut = coordinates[mutationPosition]
            coordDomain = coordinates[i]
        except KeyError:
            # Residue missing from the model; skip it rather than aborting the scan.
            continue
        d = find_distance(coordMut, coordDomain)
        if d != 'nan' and d is not np.nan:
            domainDistanceList.append(float(d))
    return min(domainDistanceList) if domainDistanceList else np.nan


def match3DModels(data):
    """match3D counterpart for model structures, where coordinate keys are integer residue numbers."""
    for i in data.index:
        pos = int(data.at[i, 'pos'])
        coords = data.at[i, 'coordinates']
        coordinates = coords if isinstance(coords, dict) else ast.literal_eval(coords)
        coordMut = coordinates.get(pos, np.nan)
        if data.at[i, 'distance'] == -1000:
            domStart = data.at[i, 'domStart']
            domEnd = data.at[i, 'domEnd']
            data.at[i, 'distance'] = domainDistanceModels(domStart, domEnd, coordinates, pos)
        for col in UNIPROT_ANNOTATION_COLS[0:30]:
            val = data.at[i, col]
            # NaN-safe check, as in match3D: only stringified position lists are processed.
            if not (isinstance(val, str) and val not in ('hit', '[]', '')):
                continue
            allDist = []
            annotation_list = ast.literal_eval(val)
            integer_list = [int(element) for element in annotation_list if element != 'null']
            for annotPosition in integer_list:
                coordAnnot = coordinates.get(annotPosition, [])
                allDist.append(find_distance(coordMut, coordAnnot))
            allDist = [float(d) for d in allDist if d != 'nan' and d is not np.nan]
            if allDist:
                data.at[i, col] = min(allDist)
    return data


def selectMaxAnnot(data):
    """Sum the per-annotation distances (annotTotal) and count 'hit' annotations (hitTotal) per row."""
    if len(data) > 0:
        for i in data.index:
            total = 0
            nanCounter = 0
            hitCounter = 0
            for col in UNIPROT_ANNOTATION_COLS[0:30]:
                val = data.at[i, col]
                if (str(val) != 'nan') and (val != '[]') and (val != 'hit') and (val != ''):
                    total += float(val)
                elif (str(val) == 'nan') or (val == '[]') or (val == ''):
                    # Count empty/missing cells so all-NaN rows get annotTotal = NaN below.
                    nanCounter += 1
                if val == 'hit':
                    hitCounter += 1

            data.at[i, 'hitTotal'] = hitCounter if hitCounter > 0 else np.nan
            data.at[i, 'annotTotal'] = total if nanCounter != 30 else np.nan
    else:
        data['hitTotal'] = np.nan
        data['annotTotal'] = np.nan

    return data
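
# End-to-end sketch (hypothetical paths and input frames, assuming the utils/add_sasa
# helpers are on the path):
#   pdb_info = addPDBinfo(query_df, Path('output'))
#   enriched = processFile(merged_df, Path('output'))
#   enriched = match3D(enriched)
#   enriched = selectMaxAnnot(enriched)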