Spaces:

HUBioDataLab
/

ASCARIS

Sleeping

App Files Files Community

fatmacankara commited on Dec 11, 2023

Commit

50482d6

1 Parent(s): e837258

Create retrieveUniprotSequences.py

Browse files

Files changed (1) hide show

code/retrieveUniprotSequences.py +82 -0

code/retrieveUniprotSequences.py ADDED Viewed

	@@ -0,0 +1,82 @@

+import pandas as pd
+import numpy as np
+from add_sequence import *
+def create_isoforms(uniprot_id, isoform_fasta):
+    if uniprot_id not in isoform_fasta.uniprotID.to_list():
+        isoform_current = pd.DataFrame(get_isoforms(uniprot_id).items(), columns=['uniprotID', 'isoformSequence'])
+        isoform_current['whichIsoform'] = isoform_current['uniprotID'].apply(lambda x: x[7:10].strip())
+        isoform_current['uniprotID'] = isoform_current['uniprotID'].apply(lambda x: x[0:6])
+        isoform_fasta = pd.concat([isoform_fasta, isoform_current], axis=0)
+    return isoform_fasta
+def add_isoform(isoform_fasta, uniprot_id, variation_position, wild_type):
+    if len(isoform_fasta) != 0:
+        isoList = isoform_fasta[isoform_fasta['uniprotID'] == uniprot_id].isoformSequence.to_list()
+        for k in isoList:
+            if len(k) >= int(variation_position):
+                resInIso = k[int(int(variation_position) - 1)]
+                if wild_type == resInIso:
+                    whichIsoform_ = isoform_fasta[isoform_fasta.isoformSequence == k].whichIsoform.to_list()[0]
+                    wt_sequence_match = 'i'
+                    break
+            else:
+                whichIsoform_ = np.NaN
+                wt_sequence_match = np.NaN
+    else:
+        whichIsoform_ = np.NaN
+        wt_sequence_match = np.NaN
+    return whichIsoform_, wt_sequence_match
+def add_uniprot_sequence(DATAFRAME):
+    CANONICAL_FASTA = pd.DataFrame(columns=['uniprotID', 'uniprotSequence'])
+    ISOFORM_FASTA = pd.DataFrame(columns=['uniprotID', 'isoformSequence'])
+    UNIPROT_ID_LIST = list(set(DATAFRAME['uniprotID'].to_list()))
+    for i in range(len(UNIPROT_ID_LIST)):
+        CANONICAL_FASTA.at[i, 'uniprotSequence'] = get_uniprot_seq(UNIPROT_ID_LIST[i])
+        CANONICAL_FASTA.at[i, 'uniprotID'] = UNIPROT_ID_LIST[i]
+    canonical_fasta = CANONICAL_FASTA.drop_duplicates()
+    DATAFRAME = DATAFRAME.merge(canonical_fasta, on='uniprotID', how='left')
+    DATAFRAME['uniprotSequence'].replace({'': np.NaN}, inplace=True)
+    for i in DATAFRAME.index:
+        UNIPROT_ID = DATAFRAME.at[i, 'uniprotID']
+        VARIATION_POSITION = DATAFRAME.at[i, 'pos']
+        WILDTYPE = DATAFRAME.at[i, 'wt']
+        if len(DATAFRAME.loc[i, 'uniprotSequence']) >= int(VARIATION_POSITION):
+            can = str(DATAFRAME.at[i, 'uniprotSequence'])[int(VARIATION_POSITION) - 1]
+            if WILDTYPE == can:
+                DATAFRAME.loc[i, 'wt_sequence_match'] = 'm'
+            elif WILDTYPE != can:
+                ISOFORM_FASTA = create_isoforms(UNIPROT_ID, ISOFORM_FASTA)
+                ISOFORM_NUM, MATCH_STAT = add_isoform(ISOFORM_FASTA, UNIPROT_ID, VARIATION_POSITION, WILDTYPE)
+                ISOFORM_SEQUENCE = ISOFORM_FASTA[(ISOFORM_FASTA['uniprotID'] == UNIPROT_ID) &
+                                                 (ISOFORM_FASTA['whichIsoform'] == ISOFORM_NUM)].isoformSequence.values
+                DATAFRAME.at[i,'whichIsoform'] = ISOFORM_NUM
+                DATAFRAME.at[i, 'wt_sequence_match'] = MATCH_STAT
+                DATAFRAME.at[i, 'uniprotSequence'] = ISOFORM_SEQUENCE
+        elif len(DATAFRAME.at[i, 'uniprotSequence']) < int(VARIATION_POSITION):
+            ISOFORM_FASTA = create_isoforms(ISOFORM_FASTA, UNIPROT_ID)
+            ISOFORM_NUM, MATCH_STAT = add_isoform(ISOFORM_FASTA, UNIPROT_ID, VARIATION_POSITION, WILDTYPE)
+            ISOFORM_SEQUENCE = ISOFORM_FASTA[(ISOFORM_FASTA['uniprotID'] == UNIPROT_ID) &
+                                             ( ISOFORM_FASTA['whichIsoform'] == ISOFORM_NUM)].isoformSequence.values
+            DATAFRAME.at[i, 'whichIsoform'] = ISOFORM_NUM
+            DATAFRAME.at[i, 'wt_sequence_match'] = MATCH_STAT
+            DATAFRAME.at[i, 'uniprotSequence'] = ISOFORM_SEQUENCE
+    DATAFRAME['uniprotSequence'] = DATAFRAME['uniprotSequence'].apply(lambda x: x[0] if (type(x) != str and len(x)>0) else x)
+    DATAFRAME['uniprotSequence'] = DATAFRAME['uniprotSequence'].apply(lambda x: np.NaN if (type(x) != str and len(x) == 0) else x)
+    print('>> Sequence files created...\n')
+    return DATAFRAME