fatmacankara committed on
Commit
50482d6
·
1 Parent(s): e837258

Create retrieveUniprotSequences.py

Browse files
Files changed (1) hide show
  1. code/retrieveUniprotSequences.py +82 -0
code/retrieveUniprotSequences.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from add_sequence import *
4
+
5
+
6
def create_isoforms(uniprot_id, isoform_fasta):
    """Return ``isoform_fasta`` extended with the isoforms of ``uniprot_id``.

    If ``uniprot_id`` already appears in the ``uniprotID`` column, the table
    is returned untouched. Otherwise ``get_isoforms(uniprot_id)`` is called,
    its ``.items()`` are turned into rows and appended below the existing
    ones.

    Args:
        uniprot_id: Accession whose isoforms should be present in the table.
        isoform_fasta: DataFrame with at least a ``uniprotID`` column.

    Returns:
        The same DataFrame object when nothing had to be added, otherwise a
        new DataFrame with ``uniprotID``, ``isoformSequence`` and
        ``whichIsoform`` columns for the appended rows.
    """
    already_loaded = isoform_fasta.uniprotID.to_list()
    if uniprot_id in already_loaded:
        return isoform_fasta

    fetched = pd.DataFrame(get_isoforms(uniprot_id).items(), columns=['uniprotID', 'isoformSequence'])
    raw_keys = fetched['uniprotID']
    # NOTE(review): the fixed slices assume keys shaped like a 6-character
    # accession followed by a separator and the isoform number; longer
    # accessions would be truncated -- confirm against get_isoforms.
    fetched['whichIsoform'] = raw_keys.apply(lambda key: key[7:10].strip())
    fetched['uniprotID'] = raw_keys.apply(lambda key: key[0:6])
    return pd.concat([isoform_fasta, fetched], axis=0)
13
+
14
+
15
def add_isoform(isoform_fasta, uniprot_id, variation_position, wild_type):
    """Find the first isoform of a protein that carries the wild-type residue.

    Only rows of ``isoform_fasta`` whose ``uniprotID`` equals ``uniprot_id``
    are considered, in table order. An isoform matches when its sequence is
    long enough and the residue at the 1-based ``variation_position`` equals
    ``wild_type``.

    Args:
        isoform_fasta: DataFrame with ``uniprotID``, ``isoformSequence`` and
            ``whichIsoform`` columns (may be empty).
        uniprot_id: Accession of the protein being checked.
        variation_position: 1-based residue position; anything ``int()``
            accepts.
        wild_type: Expected residue at that position.

    Returns:
        ``(whichIsoform, 'i')`` for the first matching isoform, or
        ``(nan, nan)`` when the table is empty, the protein has no isoform
        rows, the position is not a valid 1-based index, or no isoform
        carries ``wild_type`` at that position.
    """
    # np.nan (lower case) exists in every NumPy release; the np.NaN alias
    # was removed in NumPy 2.0.
    no_match = (np.nan, np.nan)

    if len(isoform_fasta) == 0:
        return no_match

    position = int(variation_position)
    if position < 1:
        # A position below 1 would otherwise index from the end of the string.
        return no_match

    candidates = isoform_fasta[isoform_fasta['uniprotID'] == uniprot_id]
    for offset, sequence in enumerate(candidates['isoformSequence'].tolist()):
        if not isinstance(sequence, str) or len(sequence) < position:
            continue
        if sequence[position - 1] == wild_type:
            # Read the label from the same row (positionally, because the
            # table index may contain duplicates after concatenation) so an
            # identical sequence belonging to another protein cannot leak in.
            return candidates['whichIsoform'].iloc[offset], 'i'

    return no_match
32
+
33
+
34
+ def add_uniprot_sequence(DATAFRAME):
35
+ CANONICAL_FASTA = pd.DataFrame(columns=['uniprotID', 'uniprotSequence'])
36
+ ISOFORM_FASTA = pd.DataFrame(columns=['uniprotID', 'isoformSequence'])
37
+
38
+ UNIPROT_ID_LIST = list(set(DATAFRAME['uniprotID'].to_list()))
39
+ for i in range(len(UNIPROT_ID_LIST)):
40
+ CANONICAL_FASTA.at[i, 'uniprotSequence'] = get_uniprot_seq(UNIPROT_ID_LIST[i])
41
+ CANONICAL_FASTA.at[i, 'uniprotID'] = UNIPROT_ID_LIST[i]
42
+
43
+ canonical_fasta = CANONICAL_FASTA.drop_duplicates()
44
+ DATAFRAME = DATAFRAME.merge(canonical_fasta, on='uniprotID', how='left')
45
+ DATAFRAME['uniprotSequence'].replace({'': np.NaN}, inplace=True)
46
+
47
+ for i in DATAFRAME.index:
48
+ UNIPROT_ID = DATAFRAME.at[i, 'uniprotID']
49
+ VARIATION_POSITION = DATAFRAME.at[i, 'pos']
50
+ WILDTYPE = DATAFRAME.at[i, 'wt']
51
+
52
+ if len(DATAFRAME.loc[i, 'uniprotSequence']) >= int(VARIATION_POSITION):
53
+ can = str(DATAFRAME.at[i, 'uniprotSequence'])[int(VARIATION_POSITION) - 1]
54
+ if WILDTYPE == can:
55
+ DATAFRAME.loc[i, 'wt_sequence_match'] = 'm'
56
+ elif WILDTYPE != can:
57
+ ISOFORM_FASTA = create_isoforms(UNIPROT_ID, ISOFORM_FASTA)
58
+ ISOFORM_NUM, MATCH_STAT = add_isoform(ISOFORM_FASTA, UNIPROT_ID, VARIATION_POSITION, WILDTYPE)
59
+ ISOFORM_SEQUENCE = ISOFORM_FASTA[(ISOFORM_FASTA['uniprotID'] == UNIPROT_ID) &
60
+ (ISOFORM_FASTA['whichIsoform'] == ISOFORM_NUM)].isoformSequence.values
61
+
62
+
63
+ DATAFRAME.at[i,'whichIsoform'] = ISOFORM_NUM
64
+ DATAFRAME.at[i, 'wt_sequence_match'] = MATCH_STAT
65
+ DATAFRAME.at[i, 'uniprotSequence'] = ISOFORM_SEQUENCE
66
+
67
+ elif len(DATAFRAME.at[i, 'uniprotSequence']) < int(VARIATION_POSITION):
68
+ ISOFORM_FASTA = create_isoforms(ISOFORM_FASTA, UNIPROT_ID)
69
+ ISOFORM_NUM, MATCH_STAT = add_isoform(ISOFORM_FASTA, UNIPROT_ID, VARIATION_POSITION, WILDTYPE)
70
+ ISOFORM_SEQUENCE = ISOFORM_FASTA[(ISOFORM_FASTA['uniprotID'] == UNIPROT_ID) &
71
+ ( ISOFORM_FASTA['whichIsoform'] == ISOFORM_NUM)].isoformSequence.values
72
+
73
+ DATAFRAME.at[i, 'whichIsoform'] = ISOFORM_NUM
74
+ DATAFRAME.at[i, 'wt_sequence_match'] = MATCH_STAT
75
+ DATAFRAME.at[i, 'uniprotSequence'] = ISOFORM_SEQUENCE
76
+
77
+ DATAFRAME['uniprotSequence'] = DATAFRAME['uniprotSequence'].apply(lambda x: x[0] if (type(x) != str and len(x)>0) else x)
78
+ DATAFRAME['uniprotSequence'] = DATAFRAME['uniprotSequence'].apply(lambda x: np.NaN if (type(x) != str and len(x) == 0) else x)
79
+
80
+ print('>> Sequence files created...\n')
81
+ return DATAFRAME
82
+