File size: 6,964 Bytes
d4a5429
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
#==================================================================================================#

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from Bio.SeqUtils import ProtParam
import streamlit as st
from Bio.SeqUtils import IsoelectricPoint
# Module-level protein sequence read by returnValues(); set it with giveValues() before building features.
seq = ""
def giveValues(seq1: str) -> None:
    """
    Store a protein sequence in the module-level ``seq`` variable.

    Parameters:
    - seq1 (str): Protein sequence to remember for later calls that read ``seq``.

    Returns:
    - None
    """
    global seq  # rebinding the module-level name, not creating a local
    seq = seq1

def structure(seq):
    """
    Estimate the secondary-structure fractions of a protein sequence.

    Parameters:
    - seq (str): Protein sequence.

    Returns:
    - tuple: (alpha_helix, beta_sheet, turn) fractions, in that order.
    """
    # Biopython reports the fractions as (helix, turn, sheet). Unpack in that
    # order and re-order on return so each value ends up under the right name.
    # NOTE(review): this function previously returned the turn and sheet values
    # swapped; anything fitted on the old columns should be re-checked.
    alpha_helix, turn, beta_sheet = ProtParam.ProteinAnalysis(seq).secondary_structure_fraction()
    return alpha_helix, beta_sheet, turn

def calculate_net_charge(protein_sequence):
    """
    Estimate the net charge of a protein sequence from its residue letters.

    Each 'R' or 'K' adds +1, each 'D' or 'E' adds -1, and every other symbol
    (including unrecognised or lowercase letters) contributes 0.

    Parameters:
    - protein_sequence (str): Protein sequence.

    Returns:
    - int: Net charge.
    """
    positive_residues = {'R', 'K'}
    negative_residues = {'D', 'E'}

    total = 0
    for residue in protein_sequence:
        if residue in positive_residues:
            total += 1
        elif residue in negative_residues:
            total -= 1
    return total

def calculate_electric_potential(protein_sequence, pH=7.0):
    """
    Sum a pH-dependent term over the residues of a protein sequence.

    For every residue found in the pKa table the term
    10**(pH - pKa) / (1 + 10**(pH - pKa)) is added; symbols that are not in
    the table contribute nothing.

    Parameters:
    - protein_sequence (str): Protein sequence.
    - pH (float): pH at which the terms are evaluated (default 7.0).

    Returns:
    - float: Sum of the per-residue terms (0.0 for an empty sequence).
    """
    pka_by_residue = {
        'A': 2.35, 'C': 8.3, 'D': 3.9, 'E': 4.2, 'F': 5.5,
        'G': 2.3, 'H': 6.0, 'I': 6.0, 'K': 10.5, 'L': 6.0,
        'M': 5.7, 'N': 7.0, 'P': 6.3, 'Q': 9.1, 'R': 12.0,
        'S': 2.2, 'T': 2.6, 'V': 6.0, 'W': 5.9, 'Y': 5.7,
    }

    total = 0.0
    # Accumulate with a plain loop (not sum()) so the floating-point result
    # stays exactly what sequential addition produces.
    for residue in protein_sequence:
        pka = pka_by_residue.get(residue)
        if pka is None:
            continue
        ratio = 10 ** (pH - pka)
        total += ratio / (1 + ratio)
    return total

def molWeight(seq):
    """
    Return the molecular weight Biopython computes for a protein sequence.

    Parameters:
    - seq (str): Protein sequence.

    Returns:
    - float: Molecular weight as reported by ProteinAnalysis.molecular_weight().
    """
    analysis = ProtParam.ProteinAnalysis(seq)
    return analysis.molecular_weight()

def hydrophobicityValues(seq):
    """
    Add up the per-residue hydrophobicity values of a protein sequence.

    The result is a plain sum over all residues, not an average.

    Parameters:
    - seq (str): Protein sequence made of the 20 standard one-letter codes.

    Returns:
    - float: Total hydrophobicity (0 for an empty sequence).

    Raises:
    - KeyError: If the sequence contains a symbol missing from the table.
    """
    hydropathy_by_residue = {
        'A': 1.8, 'C': 2.5, 'D': -3.5, 'E': -3.5, 'F': 2.8,
        'G': -0.4, 'H': -3.2, 'I': 4.5, 'K': -3.9, 'L': 3.8,
        'M': 1.9, 'N': -3.5, 'P': -1.6, 'Q': -3.5, 'R': -4.5,
        'S': -0.8, 'T': -0.7, 'V': 4.2, 'W': -0.9, 'Y': -1.3,
    }
    # Direct item lookup keeps the KeyError for unknown residues.
    return sum(map(hydropathy_by_residue.__getitem__, seq))




def helix_propensity_score(sequence):
    """
    Add up the per-residue helix propensity values of a protein sequence.

    Parameters:
    - sequence (str): Protein sequence made of the 20 standard one-letter codes.

    Returns:
    - float: Sum of the helix propensity values (0 for an empty sequence).

    Raises:
    - KeyError: If the sequence contains a symbol missing from the table.
    """
    propensity_by_residue = {
        'A': 1.2, 'C': 0.8, 'D': 0.7, 'E': 0.7, 'F': 1.1,
        'G': 0.9, 'H': 1.0, 'I': 1.1, 'K': 1.3, 'L': 1.0,
        'M': 1.2, 'N': 0.9, 'P': 0.7, 'Q': 1.0, 'R': 1.2,
        'S': 0.8, 'T': 0.9, 'V': 1.1, 'W': 1.3, 'Y': 1.1,
    }
    # Direct item lookup keeps the KeyError for unknown residues.
    return sum(map(propensity_by_residue.__getitem__, sequence))

def abundance_dispersion(sequence):
    """
    Count the E/F/Y/W residues in a sequence and measure how spread out they are.

    Abundance is the number of occurrences of 'E', 'F', 'Y' and 'W'.
    Dispersion is the sample standard deviation (n - 1 denominator) of the
    0-based positions of those residues; it is 0 when fewer than two occur.

    Parameters:
    - sequence (str): Protein sequence.

    Returns:
    - tuple: (abundance, dispersion).
    """
    tracked_residues = ('E', 'F', 'Y', 'W')
    abundance = sum(map(sequence.count, tracked_residues))

    # A spread needs at least two positions.
    if abundance <= 1:
        return abundance, 0

    hits = [index for index, residue in enumerate(sequence) if residue in tracked_residues]
    centre = sum(hits) / abundance
    squared_deviation = sum((index - centre) ** 2 for index in hits)
    variance = squared_deviation / (abundance - 1)
    return abundance, variance ** 0.5

def BSpropensity_score(sequence):
    """
    Add up the per-residue beta strand propensity values of a protein sequence.

    Parameters:
    - sequence (str): Protein sequence made of the 20 standard one-letter codes.

    Returns:
    - float: Sum of the beta strand propensity values (0 for an empty sequence).

    Raises:
    - KeyError: If the sequence contains a symbol missing from the table.
    """
    propensity_by_residue = {
        'A': 0.7, 'C': 1.0, 'D': 0.9, 'E': 1.1, 'F': 1.1,
        'G': 0.5, 'H': 0.8, 'I': 0.6, 'K': 1.2, 'L': 0.7,
        'M': 0.5, 'N': 1.1, 'P': 1.0, 'Q': 0.9, 'R': 0.8,
        'S': 0.6, 'T': 0.7, 'V': 0.8, 'W': 1.3, 'Y': 1.2,
    }
    # Direct item lookup keeps the KeyError for unknown residues.
    return sum(map(propensity_by_residue.__getitem__, sequence))

def getamino_acid(seq):
    """
    Return the amino-acid composition Biopython computes for a protein sequence.

    Parameters:
    - seq (str): Protein sequence.

    Returns:
    - dict: Result of ProteinAnalysis.get_amino_acids_percent(), keyed by
      one-letter amino-acid code.
    """
    analysis = ProtParam.ProteinAnalysis(seq)
    return analysis.get_amino_acids_percent()


def returnValues(seqLen, seq_unce):
    """
    Build a one-row feature table for the module-level protein sequence.

    The sequence analysed is the global ``seq`` (set it with giveValues()
    first); it is not taken from the arguments.

    Parameters:
    - seqLen: Currently unused; kept so existing callers keep working.
    - seq_unce: Value stored as-is in the "Sequence" column.

    Returns:
    - pd.DataFrame: Single-row frame with one column per feature.

    Raises:
    - ValueError: If no sequence has been set (``seq`` is None or empty).
    """
    # Validate before computing anything: the feature helpers cannot work on a
    # missing or empty sequence, and this error message says how to fix it.
    if not seq:
        raise ValueError("Protein sequence 'seq' is not provided. Use giveValues() to set the sequence.")

    alpha_helix, beta_sheet, turn = structure(seq)
    amino_acid_percent = getamino_acid(seq)
    # Computed once and reused for both the "Abundance" and "Dispersion" columns.
    abundance, dispersion = abundance_dispersion(seq)

    features = {
        "Sequence": [seq_unce],
        "net_charge": [calculate_net_charge(seq)],
        "isoelectric point": [IsoelectricPoint.IsoelectricPoint(seq).pi()],
        'alpha_helix': [alpha_helix],
        'beta_sheet': [beta_sheet],
        'turn': [turn],
        "Molecular Weight": [molWeight(seq)],
    }

    # Look each amino acid up by its letter rather than relying on the order
    # in which the composition dict happens to yield its values.
    for amino_acid in "ACDEFGHIKLMNPQRSTVWY":
        features["percent_" + amino_acid] = [amino_acid_percent[amino_acid]]

    features["Hydrophobicity"] = [hydrophobicityValues(seq)]
    features["Electrical Potential"] = [calculate_electric_potential(seq)]
    features["Abundance"] = [abundance]
    features["Dispersion"] = [dispersion]

    return pd.DataFrame(features)

# Example of how to use the functions:
# giveValues("YOUR_PROTEIN_SEQUENCE_HERE")
# data = returnValues(seqLen, seq_unce)  # both arguments are required; seq_unce fills the "Sequence" column
# print(data)