Spaces:
Runtime error
Runtime error
File size: 6,964 Bytes
d4a5429 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 |
#==================================================================================================#
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from Bio.SeqUtils import ProtParam
import streamlit as st
from Bio.SeqUtils import IsoelectricPoint
# Module-level protein sequence: written by giveValues() and read by
# returnValues(). It stays empty until giveValues() is called with a
# valid protein sequence.
seq = ""
def giveValues(seq1):
    """
    Store a protein sequence in the module-level ``seq`` variable.
    Parameters:
    - seq1 (str): Protein sequence to remember for later feature extraction.
    Returns:
    - None
    """
    global seq
    seq = seq1
def structure(seq):
    """
    Estimate the secondary-structure composition of a protein sequence.
    Parameters:
    - seq (str): Protein sequence.
    Returns:
    - tuple: (alpha_helix, beta_sheet, turn) fractions.
    """
    # Biopython reports the fractions in the order (helix, turn, sheet).
    # Unpack in that order first, then rearrange to this function's
    # (helix, sheet, turn) return order so sheet and turn are not swapped.
    alpha_helix, turn, beta_sheet = ProtParam.ProteinAnalysis(seq).secondary_structure_fraction()
    return alpha_helix, beta_sheet, turn
def calculate_net_charge(protein_sequence):
    """
    Add up the integer side-chain charges of a protein sequence.
    Parameters:
    - protein_sequence (str): Protein sequence.
    Returns:
    - int: +1 for every R or K, -1 for every D or E; any other symbol counts as 0.
    """
    # Only the charged residues need an entry: everything else is neutral.
    side_chain_charge = {'R': 1, 'K': 1, 'D': -1, 'E': -1}
    total = 0
    for residue in protein_sequence:
        total += side_chain_charge.get(residue, 0)
    return total
def calculate_electric_potential(protein_sequence, pH=7.0):
    """
    Accumulate 10**(pH - pKa) / (1 + 10**(pH - pKa)) over every residue.
    Parameters:
    - protein_sequence (str): Protein sequence.
    - pH (float): pH at which each residue's term is evaluated (default 7.0).
    Returns:
    - float: Sum of the per-residue terms; symbols without a pKa entry add nothing.
    """
    pka_by_residue = {
        'A': 2.35, 'R': 12.0, 'N': 7.0, 'D': 3.9, 'C': 8.3,
        'Q': 9.1, 'E': 4.2, 'G': 2.3, 'H': 6.0, 'I': 6.0,
        'L': 6.0, 'K': 10.5, 'M': 5.7, 'F': 5.5, 'P': 6.3,
        'S': 2.2, 'T': 2.6, 'W': 5.9, 'Y': 5.7, 'V': 6.0,
    }
    total = 0.0
    for residue in protein_sequence:
        pka = pka_by_residue.get(residue)
        if pka is None:
            # Unknown symbol: contributes 0 to the running total.
            continue
        ratio = 10 ** (pH - pka)
        total += ratio / (1 + ratio)
    return total
def molWeight(seq):
    """
    Return the molecular weight reported by Biopython's ProteinAnalysis.
    Parameters:
    - seq (str): Protein sequence.
    Returns:
    - float: Molecular weight.
    """
    analysis = ProtParam.ProteinAnalysis(seq)
    return analysis.molecular_weight()
def hydrophobicityValues(seq):
    """
    Sum the per-residue hydrophobicity values of a protein sequence.
    Parameters:
    - seq (str): Protein sequence.
    Returns:
    - float: Total of the per-residue values (0 for an empty sequence).
    Raises:
    - KeyError: If the sequence contains a symbol missing from the table.
    """
    scale = {
        'A': 1.8, 'R': -4.5, 'N': -3.5, 'D': -3.5, 'C': 2.5,
        'Q': -3.5, 'E': -3.5, 'G': -0.4, 'H': -3.2, 'I': 4.5,
        'L': 3.8, 'K': -3.9, 'M': 1.9, 'F': 2.8, 'P': -1.6,
        'S': -0.8, 'T': -0.7, 'W': -0.9, 'Y': -1.3, 'V': 4.2,
    }
    contributions = [scale[residue] for residue in seq]
    # Keep the built-in sum() so the floating-point result is unchanged.
    return sum(contributions)
def helix_propensity_score(sequence):
    """
    Sum the per-residue helix propensity values of a protein sequence.
    Parameters:
    - sequence (str): Protein sequence.
    Returns:
    - float: Total helix propensity (0 for an empty sequence).
    Raises:
    - KeyError: If the sequence contains a symbol missing from the table.
    """
    propensity = {
        'A': 1.2, 'C': 0.8, 'E': 0.7, 'G': 0.9, 'H': 1.0,
        'I': 1.1, 'K': 1.3, 'L': 1.0, 'M': 1.2, 'N': 0.9,
        'Q': 1.0, 'R': 1.2, 'S': 0.8, 'T': 0.9, 'V': 1.1,
        'W': 1.3, 'Y': 1.1, 'P': 0.7, 'F': 1.1, 'D': 0.7,
    }
    per_residue = [propensity[residue] for residue in sequence]
    # Keep the built-in sum() so the floating-point result is unchanged.
    return sum(per_residue)
def abundance_dispersion(sequence):
    """
    Count the E/F/Y/W residues in a sequence and measure how spread out they are.
    Parameters:
    - sequence (str): Protein sequence.
    Returns:
    - tuple: (abundance, dispersion). Abundance is the number of E, F, Y and W
      residues; dispersion is the sample standard deviation of their 0-based
      positions, or 0 when fewer than two such residues are present.
    """
    strand_residues = ('E', 'F', 'Y', 'W')
    abundance = sum(map(sequence.count, strand_residues))
    if abundance <= 1:
        # A spread needs at least two positions.
        return abundance, 0
    indices = [idx for idx, residue in enumerate(sequence) if residue in strand_residues]
    centre = sum(indices) / abundance
    # Sample variance (n - 1 in the denominator), then its square root.
    variance = sum((idx - centre) ** 2 for idx in indices) / (abundance - 1)
    return abundance, variance ** 0.5
def BSpropensity_score(sequence):
    """
    Sum the per-residue beta strand propensity values of a protein sequence.
    Parameters:
    - sequence (str): Protein sequence.
    Returns:
    - float: Total beta strand propensity (0 for an empty sequence).
    Raises:
    - KeyError: If the sequence contains a symbol missing from the table.
    """
    propensity = {
        'A': 0.7, 'C': 1.0, 'E': 1.1, 'G': 0.5, 'H': 0.8,
        'I': 0.6, 'K': 1.2, 'L': 0.7, 'M': 0.5, 'N': 1.1,
        'Q': 0.9, 'R': 0.8, 'S': 0.6, 'T': 0.7, 'V': 0.8,
        'W': 1.3, 'Y': 1.2, 'P': 1.0, 'F': 1.1, 'D': 0.9,
    }
    per_residue = [propensity[residue] for residue in sequence]
    # Keep the built-in sum() so the floating-point result is unchanged.
    return sum(per_residue)
def getamino_acid(seq):
    """
    Return the amino-acid composition reported by Biopython's ProteinAnalysis.
    Parameters:
    - seq (str): Protein sequence.
    Returns:
    - dict: Result of ``get_amino_acids_percent()``, keyed by residue letter.
    """
    return ProtParam.ProteinAnalysis(seq).get_amino_acids_percent()
def returnValues(seqLen, seq_unce):
    """
    Return labeled data based on protein sequence features.

    All features are computed from the module-level ``seq`` (set through
    giveValues()); ``seq_unce`` is only stored in the "Sequence" column.
    Parameters:
    - seqLen: Currently unused; kept so existing callers keep working.
    - seq_unce: Value placed as-is in the "Sequence" column.
    Returns:
    - pd.DataFrame: Single-row frame with one column per feature.
    Raises:
    - ValueError: If no sequence has been set (``seq`` is None or empty).
    """
    # Validate before any analysis runs. The module default is "" rather
    # than None, and the check previously came after structure() had already
    # been called, so it could never report a missing sequence.
    if not seq:
        raise ValueError("Protein sequence 'seq' is not provided. Use giveValues() to set the sequence.")

    alpha_helix, beta_sheet, turn = structure(seq)
    # Look percentages up by residue letter instead of depending on the
    # iteration order of the returned mapping.
    percent = getamino_acid(seq)
    # Compute both values in one call rather than analysing the sequence twice.
    abundance, dispersion = abundance_dispersion(seq)

    # Column order matters to downstream consumers, so the row is built in
    # the same order as before. Sequence length and the helix / beta strand
    # propensity scores are currently left out of the feature set.
    row = {
        "Sequence": seq_unce,
        "net_charge": calculate_net_charge(seq),
        "isoelectric point": IsoelectricPoint.IsoelectricPoint(seq).pi(),
        "alpha_helix": alpha_helix,
        "beta_sheet": beta_sheet,
        "turn": turn,
        "Molecular Weight": molWeight(seq),
    }
    for letter in "ACDEFGHIKLMNPQRSTVWY":
        row["percent_" + letter] = percent[letter]
    row["Hydrophobicity"] = hydrophobicityValues(seq)
    row["Electrical Potential"] = calculate_electric_potential(seq)
    row["Abundance"] = abundance
    row["Dispersion"] = dispersion

    # Wrap every value in a one-element list so each column has length 1.
    return pd.DataFrame({column: [value] for column, value in row.items()})
# Example of how to use the functions:
# giveValues("YOUR_PROTEIN_SEQUENCE_HERE")
# data = returnValues(len("YOUR_PROTEIN_SEQUENCE_HERE"), "YOUR_PROTEIN_SEQUENCE_HERE")
# print(data)
|