AllergenePrediction / backend /labelingInput.py
SyedSyab's picture
model added
d4a5429
#==================================================================================================#
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from Bio.SeqUtils import ProtParam
import streamlit as st
from Bio.SeqUtils import IsoelectricPoint
# Module-level holder for the protein sequence currently being analysed.
# It starts out empty; giveValues() overwrites it and returnValues() reads it.
seq = ""


def giveValues(seq1):
    """Store ``seq1`` as the module-wide sequence.

    Parameters:
    - seq1: Protein sequence to remember; it is stored as given, without
      any validation or normalisation.
    """
    # Writing through globals() rebinds the module-level name ``seq``.
    globals()["seq"] = seq1
def structure(seq):
    """Return the three secondary-structure fractions Biopython reports.

    The values are handed back as a 3-tuple in exactly the order produced by
    ``ProteinAnalysis.secondary_structure_fraction()``.

    NOTE(review): Biopython documents that order as (helix, turn, sheet).
    Code that unpacks the result as (alpha_helix, beta_sheet, turn) therefore
    has the last two names swapped relative to the values - confirm against
    the data any downstream model was trained on before changing the order.

    Parameters:
    - seq (str): Protein sequence.
    Returns:
    - tuple: Three fractions, in Biopython's order.
    """
    analysis = ProtParam.ProteinAnalysis(seq)
    first, second, third = analysis.secondary_structure_fraction()
    return first, second, third
def calculate_net_charge(protein_sequence):
    """Return the integer net charge of a protein sequence.

    Each 'R' or 'K' contributes +1 and each 'D' or 'E' contributes -1.
    Every other symbol (including 'H', lowercase letters and unknown
    characters) contributes 0.

    Parameters:
    - protein_sequence (str): Protein sequence.
    Returns:
    - int: Count of R/K residues minus count of D/E residues.
    """
    positive = ("R", "K")
    negative = ("D", "E")

    total = 0
    for residue in protein_sequence:
        if residue in positive:
            total += 1
        elif residue in negative:
            total -= 1
    return total
def calculate_electric_potential(protein_sequence, pH=7.0):
    """Sum a per-residue pH/pKa term over a protein sequence.

    For every residue with an entry in the pKa table, the term
    ``10**(pH - pKa) / (1 + 10**(pH - pKa))`` is added to the total.
    Symbols without an entry add nothing.

    Parameters:
    - protein_sequence (str): Protein sequence.
    - pH (float): pH at which the terms are evaluated (default 7.0).
    Returns:
    - float: Sum of the per-residue terms; 0.0 for an empty sequence.
    """
    pka_by_residue = {
        'A': 2.35, 'R': 12.0, 'N': 7.0, 'D': 3.9, 'C': 8.3,
        'Q': 9.1, 'E': 4.2, 'G': 2.3, 'H': 6.0, 'I': 6.0,
        'L': 6.0, 'K': 10.5, 'M': 5.7, 'F': 5.5, 'P': 6.3,
        'S': 2.2, 'T': 2.6, 'W': 5.9, 'Y': 5.7, 'V': 6.0,
    }

    total = 0.0
    for residue in protein_sequence:
        pka = pka_by_residue.get(residue)
        if pka is None:
            # Unknown symbols do not contribute.
            continue
        ratio = 10 ** (pH - pka)
        # Plain accumulation (not sum()) keeps the float result unchanged.
        total += ratio / (1 + ratio)
    return total
def molWeight(seq):
    """
    Return the molecular weight Biopython computes for a protein sequence.

    Parameters:
    - seq (str): Protein sequence.
    Returns:
    - float: Result of ``ProteinAnalysis(seq).molecular_weight()``.
    """
    return ProtParam.ProteinAnalysis(seq).molecular_weight()
def hydrophobicityValues(seq):
    """
    Return the summed per-residue hydrophobicity of a protein sequence.

    The result is a plain sum over all residues; it is not divided by the
    sequence length. The table values appear to be the Kyte-Doolittle
    hydropathy scale - confirm before relying on that.

    Parameters:
    - seq (str): Protein sequence made of the 20 standard one-letter codes.
    Returns:
    - float: Sum of the table values (0 for an empty sequence).
    Raises:
    - KeyError: If the sequence contains a symbol missing from the table.
    """
    scale = {
        'A': 1.8, 'R': -4.5, 'N': -3.5, 'D': -3.5, 'C': 2.5,
        'Q': -3.5, 'E': -3.5, 'G': -0.4, 'H': -3.2, 'I': 4.5,
        'L': 3.8, 'K': -3.9, 'M': 1.9, 'F': 2.8, 'P': -1.6,
        'S': -0.8, 'T': -0.7, 'W': -0.9, 'Y': -1.3, 'V': 4.2,
    }
    # __getitem__ (rather than .get) keeps the KeyError for unknown symbols.
    return sum(map(scale.__getitem__, seq))
def helix_propensity_score(sequence):
    """
    Return the summed helix-propensity value of a protein sequence.

    Parameters:
    - sequence (str): Protein sequence made of the 20 standard one-letter codes.
    Returns:
    - float: Sum of the per-residue table values (0 for an empty sequence).
    Raises:
    - KeyError: If the sequence contains a symbol missing from the table.
    """
    propensity = {
        'A': 1.2, 'C': 0.8, 'E': 0.7, 'G': 0.9, 'H': 1.0,
        'I': 1.1, 'K': 1.3, 'L': 1.0, 'M': 1.2, 'N': 0.9,
        'Q': 1.0, 'R': 1.2, 'S': 0.8, 'T': 0.9, 'V': 1.1,
        'W': 1.3, 'Y': 1.1, 'P': 0.7, 'F': 1.1, 'D': 0.7,
    }
    per_residue = [propensity[residue] for residue in sequence]
    return sum(per_residue)
def abundance_dispersion(sequence):
    """
    Count the E/F/Y/W residues in a sequence and measure how spread out they are.

    Abundance is the number of residues that are 'E', 'F', 'Y' or 'W'.
    Dispersion is the sample standard deviation (n - 1 denominator) of their
    zero-based positions; it is 0 when fewer than two such residues exist.

    Parameters:
    - sequence (str): Protein sequence.
    Returns:
    - tuple: (abundance, dispersion).
    """
    tracked = ('E', 'F', 'Y', 'W')
    hits = [index for index, residue in enumerate(sequence) if residue in tracked]
    abundance = len(hits)

    # A spread is undefined for zero or one position; report 0 in that case.
    if abundance <= 1:
        return abundance, 0

    centre = sum(hits) / abundance
    variance = sum((index - centre) ** 2 for index in hits) / (abundance - 1)
    return abundance, variance ** 0.5
def BSpropensity_score(sequence):
    """
    Return the summed beta-strand propensity value of a protein sequence.

    Parameters:
    - sequence (str): Protein sequence made of the 20 standard one-letter codes.
    Returns:
    - float: Sum of the per-residue table values (0 for an empty sequence).
    Raises:
    - KeyError: If the sequence contains a symbol missing from the table.
    """
    strand_propensity = {
        'A': 0.7, 'C': 1.0, 'E': 1.1, 'G': 0.5, 'H': 0.8,
        'I': 0.6, 'K': 1.2, 'L': 0.7, 'M': 0.5, 'N': 1.1,
        'Q': 0.9, 'R': 0.8, 'S': 0.6, 'T': 0.7, 'V': 0.8,
        'W': 1.3, 'Y': 1.2, 'P': 1.0, 'F': 1.1, 'D': 0.9,
    }
    return sum(map(strand_propensity.__getitem__, sequence))
def getamino_acid(seq):
    """Return Biopython's per-amino-acid composition mapping for ``seq``.

    Parameters:
    - seq (str): Protein sequence.
    Returns:
    - The value of ``ProteinAnalysis(seq).get_amino_acids_percent()``,
      a mapping keyed by one-letter amino-acid code.
    """
    return ProtParam.ProteinAnalysis(seq).get_amino_acids_percent()
def returnValues(seqLen, seq_unce):
    """
    Build a one-row feature table for the module-level protein sequence.

    The sequence is read from the module global ``seq`` (set it with
    ``giveValues``). All numeric features are computed from that global;
    ``seq_unce`` is only stored verbatim in the "Sequence" column.

    Parameters:
    - seqLen: Currently unused; accepted so existing callers keep working.
    - seq_unce: Value placed in the "Sequence" column.
    Returns:
    - pd.DataFrame: A single row with the columns Sequence, net_charge,
      isoelectric point, alpha_helix, beta_sheet, turn, Molecular Weight,
      percent_A ... percent_Y, Hydrophobicity, Electrical Potential,
      Abundance and Dispersion, in that order.
    Raises:
    - ValueError: If no sequence has been set (``seq`` is None or empty).
    """
    sequence = seq
    # Validate before any feature is computed, so a missing sequence produces
    # this message instead of an unrelated error from a helper.
    if not sequence:
        raise ValueError("Protein sequence 'seq' is not provided. Use giveValues() to set the sequence.")

    # NOTE(review): Biopython documents secondary_structure_fraction() as
    # (helix, turn, sheet). The labels below are kept as they were so the
    # column contents do not change for existing consumers - confirm which
    # labelling the downstream model was trained with.
    alpha_helix, beta_sheet, turn = structure(sequence)

    # Look residues up by key rather than relying on the mapping's order.
    composition = getamino_acid(sequence)

    # Compute once; both columns come from the same call.
    abundance, dispersion = abundance_dispersion(sequence)

    row = {
        "Sequence": [seq_unce],
        "net_charge": [calculate_net_charge(sequence)],
        "isoelectric point": [IsoelectricPoint.IsoelectricPoint(sequence).pi()],
        'alpha_helix': [alpha_helix],
        'beta_sheet': [beta_sheet],
        'turn': [turn],
        "Molecular Weight": [molWeight(sequence)],
    }
    for residue in "ACDEFGHIKLMNPQRSTVWY":
        row["percent_" + residue] = [composition[residue]]
    row["Hydrophobicity"] = [hydrophobicityValues(sequence)]
    row["Electrical Potential"] = [calculate_electric_potential(sequence)]
    row["Abundance"] = [abundance]
    row["Dispersion"] = [dispersion]
    # The helix and beta-strand propensity scores are not part of the table.

    return pd.DataFrame(row)
# Example of how to use the functions:
# giveValues("YOUR_PROTEIN_SEQUENCE_HERE")
# data = returnValues(len("YOUR_PROTEIN_SEQUENCE_HERE"), "YOUR_PROTEIN_SEQUENCE_HERE")
# print(data)