Spaces:
Sleeping
Sleeping
Commit
·
50482d6
1
Parent(s):
e837258
Create retrieveUniprotSequences.py
Browse files
code/retrieveUniprotSequences.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
from add_sequence import *
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def create_isoforms(uniprot_id, isoform_fasta):
    """Make sure the isoform table holds the isoforms of ``uniprot_id``.

    If ``uniprot_id`` is already listed in ``isoform_fasta['uniprotID']`` the
    table is returned untouched. Otherwise the isoforms are fetched with
    ``get_isoforms`` and appended as new rows.

    Args:
        uniprot_id: Accession whose isoforms are needed.
        isoform_fasta: DataFrame with at least a ``uniprotID`` column; rows
            added here carry ``uniprotID``, ``isoformSequence`` and
            ``whichIsoform``.

    Returns:
        The isoform DataFrame, extended with the fetched rows when a fetch
        was necessary. The input frame itself is not modified.
    """
    # Already fetched for this accession: avoid a second lookup.
    if uniprot_id in isoform_fasta['uniprotID'].to_list():
        return isoform_fasta

    def split_key(key):
        # NOTE(review): keys returned by get_isoforms are presumably of the
        # form '<accession>-<isoform number>' -- confirm against add_sequence.
        # Splitting on '-' instead of slicing fixed columns keeps 10-character
        # accessions intact; the fixed-width slicing only works for
        # 6-character ones.
        accession, dash, number = key.partition('-')
        if dash:
            return accession, number.strip()
        # No separator found: fall back to the historical fixed-width layout.
        return key[0:6], key[7:10].strip()

    fetched = pd.DataFrame(
        list(get_isoforms(uniprot_id).items()),
        columns=['uniprotID', 'isoformSequence'],
    )
    parts = [split_key(key) for key in fetched['uniprotID']]
    fetched['whichIsoform'] = pd.Series(
        [number for _, number in parts], index=fetched.index, dtype=object
    )
    fetched['uniprotID'] = pd.Series(
        [accession for accession, _ in parts], index=fetched.index, dtype=object
    )

    return pd.concat([isoform_fasta, fetched], axis=0)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def add_isoform(isoform_fasta, uniprot_id, variation_position, wild_type):
    """Find the first isoform of ``uniprot_id`` carrying the wild-type residue.

    The isoforms of ``uniprot_id`` are scanned in table order; the first one
    that is long enough and has ``wild_type`` at ``variation_position``
    (1-based) wins.

    Args:
        isoform_fasta: DataFrame with ``uniprotID``, ``isoformSequence`` and
            ``whichIsoform`` columns (may be empty).
        uniprot_id: Accession whose isoforms are searched.
        variation_position: 1-based residue position; anything ``int()``
            accepts.
        wild_type: Residue expected at that position.

    Returns:
        ``(whichIsoform, 'i')`` for the first matching isoform, otherwise
        ``(nan, nan)``.
    """
    no_match = (np.nan, np.nan)
    if len(isoform_fasta) == 0:
        return no_match

    position = int(variation_position)
    if position < 1:
        # A non-positive position would index from the end of the sequence.
        return no_match

    # Restrict to this protein so the isoform number is read from the same
    # row as the sequence; identical sequences can occur under other
    # accessions.
    candidates = isoform_fasta[isoform_fasta['uniprotID'] == uniprot_id]
    for sequence, isoform_number in zip(candidates['isoformSequence'],
                                        candidates['whichIsoform']):
        if not isinstance(sequence, str) or len(sequence) < position:
            continue
        if sequence[position - 1] == wild_type:
            return isoform_number, 'i'

    # Covers "no isoforms for this protein" and "no isoform matched"; both
    # previously could leave the result variables unassigned.
    return no_match
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def add_uniprot_sequence(DATAFRAME):
|
| 35 |
+
CANONICAL_FASTA = pd.DataFrame(columns=['uniprotID', 'uniprotSequence'])
|
| 36 |
+
ISOFORM_FASTA = pd.DataFrame(columns=['uniprotID', 'isoformSequence'])
|
| 37 |
+
|
| 38 |
+
UNIPROT_ID_LIST = list(set(DATAFRAME['uniprotID'].to_list()))
|
| 39 |
+
for i in range(len(UNIPROT_ID_LIST)):
|
| 40 |
+
CANONICAL_FASTA.at[i, 'uniprotSequence'] = get_uniprot_seq(UNIPROT_ID_LIST[i])
|
| 41 |
+
CANONICAL_FASTA.at[i, 'uniprotID'] = UNIPROT_ID_LIST[i]
|
| 42 |
+
|
| 43 |
+
canonical_fasta = CANONICAL_FASTA.drop_duplicates()
|
| 44 |
+
DATAFRAME = DATAFRAME.merge(canonical_fasta, on='uniprotID', how='left')
|
| 45 |
+
DATAFRAME['uniprotSequence'].replace({'': np.NaN}, inplace=True)
|
| 46 |
+
|
| 47 |
+
for i in DATAFRAME.index:
|
| 48 |
+
UNIPROT_ID = DATAFRAME.at[i, 'uniprotID']
|
| 49 |
+
VARIATION_POSITION = DATAFRAME.at[i, 'pos']
|
| 50 |
+
WILDTYPE = DATAFRAME.at[i, 'wt']
|
| 51 |
+
|
| 52 |
+
if len(DATAFRAME.loc[i, 'uniprotSequence']) >= int(VARIATION_POSITION):
|
| 53 |
+
can = str(DATAFRAME.at[i, 'uniprotSequence'])[int(VARIATION_POSITION) - 1]
|
| 54 |
+
if WILDTYPE == can:
|
| 55 |
+
DATAFRAME.loc[i, 'wt_sequence_match'] = 'm'
|
| 56 |
+
elif WILDTYPE != can:
|
| 57 |
+
ISOFORM_FASTA = create_isoforms(UNIPROT_ID, ISOFORM_FASTA)
|
| 58 |
+
ISOFORM_NUM, MATCH_STAT = add_isoform(ISOFORM_FASTA, UNIPROT_ID, VARIATION_POSITION, WILDTYPE)
|
| 59 |
+
ISOFORM_SEQUENCE = ISOFORM_FASTA[(ISOFORM_FASTA['uniprotID'] == UNIPROT_ID) &
|
| 60 |
+
(ISOFORM_FASTA['whichIsoform'] == ISOFORM_NUM)].isoformSequence.values
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
DATAFRAME.at[i,'whichIsoform'] = ISOFORM_NUM
|
| 64 |
+
DATAFRAME.at[i, 'wt_sequence_match'] = MATCH_STAT
|
| 65 |
+
DATAFRAME.at[i, 'uniprotSequence'] = ISOFORM_SEQUENCE
|
| 66 |
+
|
| 67 |
+
elif len(DATAFRAME.at[i, 'uniprotSequence']) < int(VARIATION_POSITION):
|
| 68 |
+
ISOFORM_FASTA = create_isoforms(ISOFORM_FASTA, UNIPROT_ID)
|
| 69 |
+
ISOFORM_NUM, MATCH_STAT = add_isoform(ISOFORM_FASTA, UNIPROT_ID, VARIATION_POSITION, WILDTYPE)
|
| 70 |
+
ISOFORM_SEQUENCE = ISOFORM_FASTA[(ISOFORM_FASTA['uniprotID'] == UNIPROT_ID) &
|
| 71 |
+
( ISOFORM_FASTA['whichIsoform'] == ISOFORM_NUM)].isoformSequence.values
|
| 72 |
+
|
| 73 |
+
DATAFRAME.at[i, 'whichIsoform'] = ISOFORM_NUM
|
| 74 |
+
DATAFRAME.at[i, 'wt_sequence_match'] = MATCH_STAT
|
| 75 |
+
DATAFRAME.at[i, 'uniprotSequence'] = ISOFORM_SEQUENCE
|
| 76 |
+
|
| 77 |
+
DATAFRAME['uniprotSequence'] = DATAFRAME['uniprotSequence'].apply(lambda x: x[0] if (type(x) != str and len(x)>0) else x)
|
| 78 |
+
DATAFRAME['uniprotSequence'] = DATAFRAME['uniprotSequence'].apply(lambda x: np.NaN if (type(x) != str and len(x) == 0) else x)
|
| 79 |
+
|
| 80 |
+
print('>> Sequence files created...\n')
|
| 81 |
+
return DATAFRAME
|
| 82 |
+
|