File size: 9,336 Bytes
b24bdaf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ad9add7
b24bdaf
 
 
 
 
ad9add7
 
 
b24bdaf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36da03c
b24bdaf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36da03c
b24bdaf
 
 
36da03c
b24bdaf
36da03c
b24bdaf
 
 
 
 
 
 
 
 
 
 
 
 
 
ad9add7
 
 
 
 
b24bdaf
ad9add7
 
 
 
 
b24bdaf
 
 
 
ad9add7
36da03c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import requests
import numpy as np
import pandas as pd
from utils import *
from pathlib import Path
from bs4 import BeautifulSoup
from add_sasa import *
# Columns of the (empty) ModBase frame returned when no model covers any variant.
_MODBASE_COLUMNS = ['uniprotID', 'target_begin', 'target_end', 'quality_score',
                    'model_id', 'coordinates', 'AAonPDB', 'coordVAR']


def _existing_stems(directory):
    """Return the file names in *directory*, each cut at its first '.'."""
    return {entry.name.split('.')[0] for entry in Path(directory).glob('*')}


def _remark_value(line):
    """Return the first token of a 'REMARK 220' value field (column 40 on), or None."""
    tokens = line[40:].split()
    return tokens[0] if tokens else None


def _parse_modbase_model(lines, var_pos, fallback_uniprot_id, fallback_model_id):
    """Extract the model record for *var_pos* from the lines of one model file.

    Returns a dict with the keys 'uniprotID', 'target_begin', 'target_end',
    'quality_score', 'model_id', 'coordinates', 'AAonPDB' and 'coordVAR', or
    None when the file states no target range or *var_pos* is not strictly
    inside it.
    """
    uniprot_id = fallback_uniprot_id
    target_begin = target_end = None
    for line in lines:
        if line.startswith('UniProt ID'):
            uniprot_id = line.split(':')[1].strip()
        elif line.startswith('REMARK 220 TARGET BEGIN'):
            # Read the whole field: a fixed 3-character slice truncates
            # residue numbers >= 1000.
            target_begin = _remark_value(line)
        elif line.startswith('REMARK 220 TARGET END'):
            target_end = _remark_value(line)

    if target_begin is None or target_end is None:
        return None
    # NOTE(review): the comparison is strict, so a variant sitting exactly on
    # the first or last modelled residue is treated as not covered. Kept as
    # in the original -- confirm this is intended.
    if not (int(target_begin) < var_pos < int(target_end)):
        return None

    model_id = fallback_model_id
    quality_score = -999  # placeholder when the file carries no MPQS remark
    coordinates = {}
    aa_on_pdb, coord_var = np.nan, np.nan
    for line in lines:
        if line.startswith('REMARK 220 MODPIPE MODEL ID'):
            model_id = line[40:].strip()
        if line[0:15].strip() == 'REMARK 220 MPQS':
            quality_score = line[40:].strip()
        if line.startswith('TER'):
            # Stop at the end of the first chain.
            break
        if line[0:4] == 'ATOM' and line[13:15] == 'CA':
            position = int(line[22:26].strip())
            aminoacid = threeToOne(line[17:20])
            # PDB x/y/z occupy columns 31-38, 39-46 and 47-54 (1-based);
            # slicing the full 8-character fields keeps the sign of values
            # such as -123.456.
            coords = [line[30:38].strip(), line[38:46].strip(), line[46:54].strip()]
            coordinates[position] = coords
            if position == var_pos:
                aa_on_pdb = aminoacid
                coord_var = str(coords)

    return {'uniprotID': uniprot_id, 'target_begin': target_begin,
            'target_end': target_end, 'quality_score': quality_score,
            'model_id': model_id, 'coordinates': coordinates,
            'AAonPDB': aa_on_pdb, 'coordVAR': coord_var}


def addModbaseModels(dataframe, path_to_input_files, path_to_output_files):
    """Attach ModBase model information to every variant in *dataframe*.

    For each row the ModBase bundle of the row's UniProt ID is downloaded
    (unless a file for it already exists under 'modbase_structures'), every
    '<pdbfile>' entry is written to 'modbase_structures_individual', passed to
    ``run_freesasa`` and parsed for the C-alpha coordinates of the variant
    position.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must provide the columns 'uniprotID', 'pos', 'wt', 'mut' and
        'datapoint'. Its index is reset in place when it is not empty.
    path_to_input_files : path-like
        Not used by this function; kept for interface compatibility.
    path_to_output_files : str or pathlib.Path
        Directory holding the 'modbase_structures',
        'modbase_structures_individual' and 'freesasa_files' sub-directories.

    Returns
    -------
    (modbase, no_modbase) : tuple of pandas.DataFrame
        ``modbase`` holds the rows for which a coordinate of the variant
        position was found; ``no_modbase`` holds the remaining variants,
        restricted to the columns of the input frame. Both are empty when the
        input frame is empty.

    NOTE(review): model rows are merged back on 'uniprotID' only, so every
    variant of a protein is paired with the model rows recorded for all
    variants of that protein. This mirrors the original behaviour -- confirm
    that downstream code expects it.
    """
    if len(dataframe) == 0:
        # Nothing to look up; previously this path failed on unbound locals.
        return (pd.DataFrame(columns=_MODBASE_COLUMNS),
                pd.DataFrame(columns=dataframe.columns))

    out_dir = Path(path_to_output_files)
    bundle_dir = out_dir / 'modbase_structures'
    model_dir = out_dir / 'modbase_structures_individual'
    sasa_dir = out_dir / 'freesasa_files'

    dataframe.reset_index(inplace=True, drop=True)
    keep_cols = dataframe.columns

    # Sets give O(1) membership tests and are updated as files are created,
    # so a protein with several variants is downloaded and extracted once.
    downloaded = _existing_stems(bundle_dir)
    extracted = _existing_stems(model_dir)

    print('Retrieving ModBase models...\n')
    model_rows = []
    missing_rows = []
    for i in dataframe.index:
        protein = dataframe.at[i, 'uniprotID']
        varPos = int(dataframe.at[i, 'pos'])
        wt = dataframe.at[i, 'wt']
        mut = dataframe.at[i, 'mut']
        datapoint = dataframe.at[i, 'datapoint']
        unmodelled = {'uniprotID': protein, 'wt': wt,
                      'pos': varPos, 'mut': mut, 'datapoint': datapoint}

        bundle_path = bundle_dir / f'{protein}.txt'
        if protein not in downloaded:
            print('Downloading Modbase models for ', protein)
            url = 'https://salilab.org/modbase/retrieve/modbase/?databaseID=' + protein
            req = requests.get(url)
            with open(bundle_path, 'wb') as bundle:
                bundle.write(req.content)
            downloaded.add(protein)
        else:
            print('Model exists for', protein)

        with open(bundle_path, encoding="utf8") as bundle:
            soup = BeautifulSoup(bundle.read(), 'lxml')

        pdb_entries = soup.find_all('pdbfile')
        if not pdb_entries:
            missing_rows.append(unmodelled)
            continue

        for pdb in pdb_entries:
            # Strip the surrounding XML tags from the serialised element.
            model_id = str(pdb.contents[1])[10:-11]
            model_path = model_dir / f'{model_id}.txt'
            sasa_path = sasa_dir / f'{model_id.lower()}.txt'
            if model_id not in extracted:
                with open(model_path, 'w', encoding="utf8") as individual:
                    individual.write(f'UniProt ID: {protein}\n')
                    individual.write(str(pdb.contents[3])[10:-11].strip())
                run_freesasa(
                    model_dir / f'{model_id.lower()}.txt',
                    sasa_path,
                    include_hetatms=True,
                    outdir=None, force_rerun=False, file_type='pdb')
                extracted.add(model_id)

            sasa_val = sasa(protein, varPos, wt, 1, sasa_path,
                            path_to_output_files, file_type='pdb')

            with open(model_path, encoding="utf8") as m:
                lines = m.readlines()

            record = _parse_modbase_model(lines, varPos, protein, model_id)
            if record is None:
                # One entry per model that does not cover the variant.
                missing_rows.append(dict(unmodelled))
            else:
                record['sasa'] = sasa_val
                model_rows.append(record)

    if model_rows:
        # Build the frame once instead of appending and re-merging per model.
        modbase_reduced = pd.DataFrame(model_rows)[
            ['uniprotID', 'quality_score', 'model_id', 'coordinates',
             'AAonPDB', 'coordVAR', 'sasa']]
        modbase = dataframe.merge(modbase_reduced, on='uniprotID', how='left')
        modbase['quality_score'] = modbase['quality_score'].astype(float)
        modbase = modbase.sort_values(by=['datapoint', 'quality_score'], ascending=False)
        modbase.reset_index(inplace=True, drop=True)
        modbase.fillna(np.nan, inplace=True)
        modbase.replace({'\'?\', ': '',
                         ', \'?\'': '',
                         '(': '', ')': '',
                         '[\'?\']': np.nan,
                         '[]': np.nan,
                         'nan-nan': np.nan,
                         '': np.nan}, inplace=True)
    else:
        modbase = pd.DataFrame(columns=_MODBASE_COLUMNS)

    no_modbase = pd.DataFrame(missing_rows)
    no_modbase_no_Coord = modbase[pd.isna(modbase['coordVAR'])]
    no_modbase = pd.concat([no_modbase, no_modbase_no_Coord])
    modbase = modbase[~pd.isna(modbase['coordVAR'])]
    # reindex instead of [] so input columns that were never copied into
    # no_modbase come back as NaN rather than raising KeyError.
    no_modbase = no_modbase.reindex(columns=keep_cols)
    return modbase, no_modbase