Spaces:
Sleeping
Sleeping
File size: 10,525 Bytes
02bf24d 6124876 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 |
import ssl
import requests as r
from decimal import *
import numpy as np
import pandas as pd
import json
import ast
# Names of the per-annotation columns, in the order the rest of this module
# relies on.
annotation_list = [
    'disulfide', 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding',
    'activeSite', 'nucleotideBinding', 'lipidation', 'site', 'transmembrane',
    'crosslink', 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding',
    'repeat', 'topologicalDomain', 'caBinding', 'bindingSite', 'region',
    'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
    'peptide', 'transitPeptide', 'glycosylation', 'propeptide',
]

# Every annotation column followed by its "<name>Binary" companion column,
# both halves in the same order. This is a separate list object from
# ``annotation_list``, so mutating one never affects the other.
UNIPROT_ANNOTATION_COLS = annotation_list + [
    column + 'Binary' for column in annotation_list
]
def _uniprot_feature_positions(entry_text, feature_keys):
    """Collect the position token of every wanted 'FT' line of an entry.

    A line is used when it starts with ``FT``, its second whitespace-separated
    field is exactly one of ``feature_keys`` and a third field (the position)
    exists. Matching the key field exactly, instead of searching the whole
    line for a substring, keeps ``SITE`` from also picking up ``ACT_SITE``
    lines and keeps free text from creating stray feature names.

    Returns a dict mapping feature key -> list of position tokens in file
    order, with ``'..'`` rewritten to ``'-'`` (e.g. ``'10..20'`` -> ``'10-20'``).
    Keys with no usable line are absent from the dict.
    """
    found = {}
    for line in entry_text.split('\n'):
        if not line.startswith('FT'):
            continue
        # Lines mentioning these words are skipped, as they always were.
        if 'evidence' in line or 'ECO' in line or 'note' in line:
            continue
        fields = line.split()
        if len(fields) < 3 or fields[1] not in feature_keys:
            continue
        found.setdefault(fields[1], []).append(fields[2].replace('..', '-'))
    return found


def _residue_bounds(token):
    """Return ``(start, end)`` for ``'12'`` or ``'10-20'``, else ``None``.

    ``None`` is returned for anything that is not made of plain integers
    (for example ``'?'`` or ``'<1-20'``) so callers can skip such tokens.
    """
    first, dash, last = token.strip().partition('-')
    try:
        start = int(first)
        end = int(last) if dash else start
    except ValueError:
        return None
    return start, end


def add_annotations(dataframe, fetch_entry=None):
    """Add UniProt sequence-annotation columns to ``dataframe``.

    For every distinct value of the ``uniprotID`` column the entry text is
    fetched and its ``FT`` lines are read. Three groups of columns are then
    appended, in this order:

    * one column per annotation (``disulfide`` ... ``propeptide``) holding
      ``str(list_of_position_tokens)``, e.g. ``"['10-20', '33']"``, or the
      string ``'nan'`` when the entry has no such feature. Disulfide bonds
      are flattened to their individual residues (``"['30', '40']"``);
    * ``POSITIONS``: ``str`` of the sorted, de-duplicated list of every
      annotated residue (ranges expanded inclusively) plus the row's ``pos``,
      ``domStart`` and ``domEnd``;
    * one ``<annotation>Binary`` column per annotation: ``'1'`` when ``pos``
      lies on the annotation, ``'0'`` when it does not, NaN when the
      annotation is absent or has no numeric position.

    Finally the exact cell values ``"['?']"``, ``'[]'`` and ``''`` are
    replaced by NaN across the whole frame.

    Differences from the previous implementation (all bug fixes):

    * range membership for the binary flag is inclusive (``start <= pos <=
      end``), consistent with how ``POSITIONS`` expands the same ranges;
    * ``ACT_SITE`` lines are no longer counted a second time under ``SITE``;
    * disulfide residues no longer carry a leading space;
    * non-numeric position tokens are skipped instead of aborting the run or
      hiding the remaining positions of the row;
    * columns are created in a fixed order, and also for an empty frame.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain ``uniprotID``, ``pos``, ``domStart`` and ``domEnd``.
        ``pos``/``domStart``/``domEnd`` must be convertible with ``int()``.
    fetch_entry : callable, optional
        ``fetch_entry(accession) -> str`` returning the entry text. Defaults
        to downloading ``http://www.uniprot.org/uniprot/<accession>.txt``.
        Supplying it allows cached or offline use.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index; the input is not modified.

    Raises
    ------
    ValueError, TypeError
        If ``pos``, ``domStart`` or ``domEnd`` cannot be converted to ``int``.
    """
    print('Downloading UniProt sequence annotations...\n')
    # NOTE(review): this switches off certificate verification for every
    # default HTTPS context in the process. It is kept only because code
    # outside this function may depend on it; consider removing it.
    ssl._create_default_https_context = ssl._create_unverified_context
    original_annot_name = ['DISULFID', 'INIT_MET', 'INTRAMEM', 'VARIANT', 'DNA_BIND', 'ACT_SITE', 'NP_BIND', 'LIPID',
                           'SITE', 'TRANSMEM', 'CROSSLNK', 'MUTAGEN', 'STRAND', 'HELIX', 'TURN', 'METAL', 'REPEAT',
                           'TOPO_DOM', 'CA_BIND', 'BINDING', 'REGION', 'SIGNAL', 'MOD_RES', 'ZN_FING', 'MOTIF',
                           'COILED', 'PEPTIDE', 'TRANSIT', 'CARBOHYD', 'PROPEP']
    annotation_list = ['disulfide', 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
                       'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis',
                       'strand', 'helix', 'turn', 'metalBinding', 'repeat', 'topologicalDomain', 'caBinding',
                       'bindingSite', 'region', 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif',
                       'coiledCoil', 'peptide', 'transitPeptide', 'glycosylation', 'propeptide']
    column_for_feature = dict(zip(original_annot_name, annotation_list))

    if fetch_entry is None:
        def fetch_entry(accession):
            return r.get("http://www.uniprot.org/uniprot/" + accession + ".txt").text

    # drop=True discards the old index directly, so a named index or an
    # existing 'index' column no longer makes this step fail.
    dataframe = dataframe.reset_index(drop=True)

    # Fetch and parse each distinct protein once (first-seen order).
    tokens_by_protein = {}
    for protein in dict.fromkeys(dataframe['uniprotID'].tolist()):
        print('Retieving annotations for ' + protein)
        by_feature = _uniprot_feature_positions(fetch_entry(protein), column_for_feature)
        by_column = {}
        for feature, column in column_for_feature.items():
            tokens = by_feature.get(feature)
            if tokens is not None and column == 'disulfide':
                # A bond 'a-b' marks two residues, not the span between them.
                tokens = [end for bond in tokens for end in bond.split('-') if end]
            by_column[column] = tokens
        tokens_by_protein[protein] = by_column

    print('Processing positions...\n')
    print('Adding binary annotations...\n')
    annotation_cells = {column: [] for column in annotation_list}
    binary_cells = {column: [] for column in annotation_list}
    positions_cells = []
    rows = zip(dataframe['uniprotID'].tolist(), dataframe['pos'].tolist(),
               dataframe['domEnd'].tolist(), dataframe['domStart'].tolist())
    for protein, pos, dom_end, dom_start in rows:
        variant_pos = int(pos)
        residues = {variant_pos, int(dom_end), int(dom_start)}
        for column in annotation_list:
            tokens = tokens_by_protein[protein][column]
            if tokens is None:
                # Missing annotations have always been stored as the text 'nan'.
                annotation_cells[column].append('nan')
                binary_cells[column].append(np.nan)
                continue
            annotation_cells[column].append(str(tokens))
            bounds = [b for b in (_residue_bounds(t) for t in tokens) if b is not None]
            for start, end in bounds:
                residues.update(range(start, end + 1))
            if not bounds:
                binary_cells[column].append(np.nan)
            elif any(start <= variant_pos <= end for start, end in bounds):
                binary_cells[column].append('1')
            else:
                binary_cells[column].append('0')
        positions_cells.append(str(sorted(residues)))

    # Whole-column object-dtype assignment avoids writing strings cell by
    # cell into columns that pandas would otherwise have typed as float.
    for column in annotation_list:
        dataframe[column] = pd.Series(annotation_cells[column], index=dataframe.index, dtype=object)
    dataframe['POSITIONS'] = pd.Series(positions_cells, index=dataframe.index, dtype=object)
    for column in annotation_list:
        dataframe[column + 'Binary'] = pd.Series(binary_cells[column], index=dataframe.index, dtype=object)

    # Final corrections
    dataframe = dataframe.replace({'[\'?\']': np.nan})
    dataframe = dataframe.replace({'[]': np.nan})
    dataframe = dataframe.replace({'': np.nan})
    dataframe = dataframe.fillna(np.nan)
    return dataframe
def _pdb_annotation_tokens(cell):
    """Split one annotation cell into clean position tokens.

    Accepts the ``str(list)`` text form (``"['10-20', '33']"``) or a real
    list. Quotes and surrounding whitespace are removed from both sides of
    the quote, so ``"' 30'"`` yields ``'30'``. Empty tokens, the text
    ``'nan'`` and any other cell type give an empty list.
    """
    if isinstance(cell, str):
        if cell == 'nan':
            return []
        pieces = cell[1:-1].split(',')
    elif isinstance(cell, list):
        pieces = cell
    else:
        return []
    tokens = (str(piece).strip().strip('\'').strip() for piece in pieces)
    return [token for token in tokens if token]


def _pdb_residue_mapping(raw):
    """Turn one ``MATCHDICT`` cell into a dict, or ``{}`` if that fails.

    ``ast.literal_eval`` only evaluates Python literals, so parsing the text
    cannot execute code.
    """
    if isinstance(raw, dict):
        return raw
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError):
        return {}
    return parsed if isinstance(parsed, dict) else {}


def changeUPtoPDB(dataframe, columns=None):
    """Rewrite annotation positions through each row's ``MATCHDICT``.

    Every position in the given annotation columns (ranges ``'a-b'`` are
    expanded inclusively) is looked up, as a string key, in the row's
    ``MATCHDICT``; the values found are collected in order and the cell is
    replaced by ``str`` of that list. Positions without an entry are dropped,
    so a cell with nothing to map becomes ``'[]'``.

    Fixes over the previous version: tokens such as ``"' 30'"`` are now
    matched as ``'30'`` instead of being lost; ``MATCHDICT`` is parsed once
    per row rather than once per residue; a missing or unparsable
    ``MATCHDICT`` and non-numeric range bounds no longer raise.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Modified in place. ``MATCHDICT`` cells are dicts or their text form;
        presumably they map sequence positions to structure positions --
        confirm against the code that builds them.
    columns : iterable of str, optional
        Columns to convert. Defaults to the module-level ``annotation_list``.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object.
    """
    if columns is None:
        columns = annotation_list
    if 'MATCHDICT' in dataframe.columns:
        mappings = [_pdb_residue_mapping(raw) for raw in dataframe['MATCHDICT'].tolist()]
    else:
        # The old code silently produced empty lists in this case too.
        mappings = [{}] * len(dataframe)

    for col in columns:
        converted = []
        for cell, mapping in zip(dataframe[col].tolist(), mappings):
            mapped = []
            for token in _pdb_annotation_tokens(cell):
                if '-' in token:
                    first, second = token.split('-')[:2]
                    try:
                        keys = [str(n) for n in range(int(first), int(second) + 1)]
                    except ValueError:
                        continue  # e.g. '?-25': no numeric span to expand
                else:
                    keys = [token]
                mapped.extend(mapping[key] for key in keys if key in mapping)
            converted.append(str(mapped))
        dataframe[col] = pd.Series(converted, index=dataframe.index, dtype=object)
    return dataframe
def _model_annotation_tokens(cell):
    """Split one annotation cell into clean position tokens.

    Accepts the ``str(list)`` text form (``"['10-20', '33']"``) or a real
    list. Quotes and whitespace are stripped on both sides of the quote.
    Empty tokens, the text ``'nan'`` and any other cell type (such as NaN)
    give an empty list.
    """
    if isinstance(cell, str):
        if cell == 'nan':
            return []
        pieces = cell[1:-1].split(',')
    elif isinstance(cell, list):
        pieces = cell
    else:
        return []
    tokens = (str(piece).strip().strip('\'').strip() for piece in pieces)
    return [token for token in tokens if token]


def changeUPtoModels(dataframe, columns=None):
    """Expand annotation ranges into explicit residue lists.

    Each cell of the given annotation columns is replaced by ``str`` of a
    list of position strings: a range ``'a-b'`` becomes every integer from
    ``a`` to ``b`` inclusive, a single position is kept as is. Cells that are
    NaN, the text ``'nan'`` or empty become ``'[]'``.

    Fixes over the previous version: an empty ``'[]'`` cell no longer turns
    into ``"['']"``; tokens such as ``"' 30'"`` are emitted as ``'30'``;
    ranges with non-numeric bounds are skipped instead of raising; the
    always-true ``type(...) != 'float'`` test and the unused assignments are
    gone.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Modified in place (``fillna`` is applied to the whole frame first,
        as before).
    columns : iterable of str, optional
        Columns to convert. Defaults to the module-level ``annotation_list``.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object.
    """
    if columns is None:
        columns = annotation_list
    dataframe.fillna(np.nan, inplace=True)

    for col in columns:
        expanded_cells = []
        for cell in dataframe[col].tolist():
            expanded = []
            for token in _model_annotation_tokens(cell):
                if '-' not in token:
                    expanded.append(token)
                    continue
                first, second = token.split('-')[:2]
                try:
                    span = range(int(first), int(second) + 1)
                except ValueError:
                    continue  # e.g. '?-25': no numeric span to expand
                expanded.extend(str(residue) for residue in span)
            expanded_cells.append(str(expanded))
        dataframe[col] = pd.Series(expanded_cells, index=dataframe.index, dtype=object)
    return dataframe
def _zero_distance_residues(cell):
    """Return the set of integer residues listed in one annotation cell.

    ``cell`` may be the text form of a list (parsed with
    ``ast.literal_eval``, which only evaluates literals) or an actual
    list/tuple/set/dict (a dict contributes its keys). Entries equal to
    ``'null'`` and entries that are not plain integers are ignored. Any
    other cell -- NaN, or text that is not a literal such as ``'hit'`` --
    gives an empty set.
    """
    if isinstance(cell, str):
        try:
            parsed = ast.literal_eval(cell)
        except (ValueError, SyntaxError):
            return set()
    else:
        parsed = cell
    if not isinstance(parsed, (list, tuple, set, dict)):
        return set()
    residues = set()
    for item in parsed:
        if item == 'null':
            continue
        try:
            residues.add(int(str(item).strip()))
        except ValueError:
            continue  # e.g. '52A' or '?'
    return residues


def isZeroDistance(data, columns=None):
    """Mark annotation cells that contain the row's own position.

    For every given annotation column, a cell whose residue list contains
    ``int(pos)`` of the same row is overwritten with the text ``'hit'``; all
    other cells are left as they are.

    Fixes over the previous version: NaN cells, cells already set to
    ``'hit'``, real list cells and non-integer entries no longer raise (the
    old ``type(...) != 'dict'`` and ``!= np.NaN`` tests were always true, so
    every cell went through ``ast.literal_eval``); the column list is
    computed once instead of once per row.

    Parameters
    ----------
    data : pandas.DataFrame
        Modified in place (``fillna`` is applied to the whole frame first,
        as before). Must contain ``pos``.
    columns : iterable of str, optional
        Columns to check. Defaults to the first 30 entries of the
        module-level ``UNIPROT_ANNOTATION_COLS``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object.
    """
    if columns is None:
        columns = UNIPROT_ANNOTATION_COLS[0:30]
    data.fillna(np.nan, inplace=True)

    variant_positions = data['pos'].tolist()
    for col in columns:
        updated = []
        for cell, pos in zip(data[col].tolist(), variant_positions):
            residues = _zero_distance_residues(cell)
            if residues and int(pos) in residues:
                updated.append('hit')
            else:
                updated.append(cell)
        data[col] = pd.Series(updated, index=data.index, dtype=object)
    return data
|