Spaces:

SyedSyab
/

AllergenePrediction

Runtime error

App Files Files Community

AllergenePrediction / backend /labelingInput.py

SyedSyab

model added

d4a5429 over 1 year ago

raw

history blame contribute delete

6.96 kB

	#==================================================================================================#

	import pandas as pd
	from sklearn.preprocessing import LabelEncoder
	from Bio.SeqUtils import ProtParam
	import streamlit as st
	from Bio.SeqUtils import IsoelectricPoint
	# Module-level holder for the protein sequence the feature functions work on.
	# It starts out as an empty string; call giveValues() to set it.
	seq = ""


	def giveValues(seq1):
	    """
	    Store a protein sequence in the module-level ``seq`` variable.

	    Parameters:
	    - seq1 (str): Protein sequence to remember for later calls.

	    Returns:
	    - None
	    """
	    # Rebind the module global rather than creating a function-local name.
	    global seq
	    seq = seq1

	def structure(seq):
	    """
	    Return the three secondary-structure fractions of a protein sequence.

	    Parameters:
	    - seq (str): Protein sequence.

	    Returns:
	    - tuple: The three values produced by
	      ``ProteinAnalysis.secondary_structure_fraction()``, in the order the
	      library returns them.
	    """
	    # NOTE(review): the caller labels these values alpha_helix, beta_sheet,
	    # turn (in that order). Biopython's documentation appears to list the
	    # order as (helix, turn, sheet) -- confirm before relying on the labels.
	    analysis = ProtParam.ProteinAnalysis(seq)
	    first, second, third = analysis.secondary_structure_fraction()
	    return first, second, third

	def calculate_net_charge(protein_sequence):
	    """
	    Calculate the integer net charge of a protein sequence.

	    Each 'R' or 'K' counts as +1 and each 'D' or 'E' counts as -1; every
	    other symbol (including unknown ones) counts as 0.

	    Parameters:
	    - protein_sequence (str): Protein sequence.

	    Returns:
	    - int: Number of positive residues minus number of negative residues.
	    """
	    positively_charged = frozenset(('R', 'K'))
	    negatively_charged = frozenset(('D', 'E'))

	    # bool - bool yields -1, 0 or +1, i.e. the per-residue charge.
	    return sum(
	        (residue in positively_charged) - (residue in negatively_charged)
	        for residue in protein_sequence
	    )

	def calculate_electric_potential(protein_sequence, pH=7.0):
	    """
	    Sum a pKa-based per-residue term over a protein sequence.

	    For every residue present in the pKa table the function adds
	    ``10 ** (pH - pKa) / (1 + 10 ** (pH - pKa))`` to the total. Residues
	    that are not in the table contribute 0.

	    Parameters:
	    - protein_sequence (str): Protein sequence.
	    - pH (float): pH used in the per-residue term. Defaults to 7.0.

	    Returns:
	    - float: Sum of the per-residue terms (0.0 for an empty sequence).
	    """
	    # Define the pKa values of amino acids
	    amino_acid_pKa = {
	        'A': 2.35, 'R': 12.0, 'N': 7.0, 'D': 3.9, 'C': 8.3,
	        'Q': 9.1, 'E': 4.2, 'G': 2.3, 'H': 6.0, 'I': 6.0,
	        'L': 6.0, 'K': 10.5, 'M': 5.7, 'F': 5.5, 'P': 6.3,
	        'S': 2.2, 'T': 2.6, 'W': 5.9, 'Y': 5.7, 'V': 6.0,
	    }

	    electric_potential = 0.0
	    for aa in protein_sequence:
	        pKa = amino_acid_pKa.get(aa)
	        if pKa is None:
	            # Unknown symbols add nothing to the total.
	            continue
	        # The exponentiation operator is required here: writing
	        # ``10 (pH - pKa)`` would try to call the integer 10 and raise
	        # TypeError.
	        ratio = 10 ** (pH - pKa)
	        electric_potential += ratio / (1 + ratio)

	    return electric_potential

	def molWeight(seq):
	    """
	    Return the molecular weight reported by Biopython for a sequence.

	    Parameters:
	    - seq (str): Protein sequence.

	    Returns:
	    - float: Result of ``ProteinAnalysis(seq).molecular_weight()``.
	    """
	    analysis = ProtParam.ProteinAnalysis(seq)
	    return analysis.molecular_weight()

	def hydrophobicityValues(seq):
	    """
	    Add up the per-residue hydrophobicity values of a protein sequence.

	    Parameters:
	    - seq (str): Protein sequence made of the 20 one-letter codes below.

	    Returns:
	    - float: Total hydrophobicity (the integer 0 for an empty sequence).

	    Raises:
	    - KeyError: If the sequence contains a symbol missing from the table.
	    """
	    per_residue = {
	        'A': 1.8, 'C': 2.5, 'D': -3.5, 'E': -3.5,
	        'F': 2.8, 'G': -0.4, 'H': -3.2, 'I': 4.5,
	        'K': -3.9, 'L': 3.8, 'M': 1.9, 'N': -3.5,
	        'P': -1.6, 'Q': -3.5, 'R': -4.5, 'S': -0.8,
	        'T': -0.7, 'V': 4.2, 'W': -0.9, 'Y': -1.3,
	    }
	    # Direct item lookup so an unknown residue still raises KeyError.
	    return sum(map(per_residue.__getitem__, seq))




	def helix_propensity_score(sequence):
	    """
	    Add up the per-residue helix propensity values of a protein sequence.

	    Parameters:
	    - sequence (str): Protein sequence made of the 20 one-letter codes below.

	    Returns:
	    - float: Total helix propensity (the integer 0 for an empty sequence).

	    Raises:
	    - KeyError: If the sequence contains a symbol missing from the table.
	    """
	    propensity_by_residue = {
	        'A': 1.2, 'C': 0.8, 'D': 0.7, 'E': 0.7,
	        'F': 1.1, 'G': 0.9, 'H': 1.0, 'I': 1.1,
	        'K': 1.3, 'L': 1.0, 'M': 1.2, 'N': 0.9,
	        'P': 0.7, 'Q': 1.0, 'R': 1.2, 'S': 0.8,
	        'T': 0.9, 'V': 1.1, 'W': 1.3, 'Y': 1.1,
	    }
	    # The result gets its own name so it does not shadow this function.
	    total = sum(map(propensity_by_residue.__getitem__, sequence))
	    return total

	def abundance_dispersion(sequence):
	    """
	    Count the residues 'E', 'F', 'Y' and 'W' and measure how spread out they are.

	    Parameters:
	    - sequence (str): Protein sequence.

	    Returns:
	    - tuple: ``(abundance, dispersion)`` where abundance is the number of
	      matching residues and dispersion is the sample standard deviation
	      (n - 1 denominator) of their zero-based positions. Dispersion is the
	      integer 0 when fewer than two residues match.
	    """
	    tracked_residues = ('E', 'F', 'Y', 'W')
	    hit_indices = [
	        index for index, residue in enumerate(sequence)
	        if residue in tracked_residues
	    ]
	    hit_count = len(hit_indices)

	    # A spread needs at least two positions.
	    if hit_count <= 1:
	        return hit_count, 0

	    centre = sum(hit_indices) / hit_count
	    variance = sum((index - centre) ** 2 for index in hit_indices) / (hit_count - 1)
	    return hit_count, variance ** 0.5

	def BSpropensity_score(sequence):
	    """
	    Add up the per-residue beta strand propensity values of a protein sequence.

	    Parameters:
	    - sequence (str): Protein sequence made of the 20 one-letter codes below.

	    Returns:
	    - float: Total beta strand propensity (the integer 0 for an empty sequence).

	    Raises:
	    - KeyError: If the sequence contains a symbol missing from the table.
	    """
	    propensity_by_residue = {
	        'A': 0.7, 'C': 1.0, 'D': 0.9, 'E': 1.1,
	        'F': 1.1, 'G': 0.5, 'H': 0.8, 'I': 0.6,
	        'K': 1.2, 'L': 0.7, 'M': 0.5, 'N': 1.1,
	        'P': 1.0, 'Q': 0.9, 'R': 0.8, 'S': 0.6,
	        'T': 0.7, 'V': 0.8, 'W': 1.3, 'Y': 1.2,
	    }
	    total = sum(map(propensity_by_residue.__getitem__, sequence))
	    return total

	def getamino_acid(seq):
	    """
	    Return Biopython's per-amino-acid composition for a sequence.

	    Parameters:
	    - seq (str): Protein sequence.

	    Returns:
	    - The object returned by ``ProteinAnalysis(seq).get_amino_acids_percent()``
	      (the caller in this file treats it as a mapping and reads ``.values()``).
	    """
	    analysis = ProtParam.ProteinAnalysis(seq)
	    return analysis.get_amino_acids_percent()


	def returnValues(seqLen, seq_unce):
	    """
	    Build a one-row DataFrame of features for the module-level sequence.

	    The sequence analysed is the module-level ``seq`` (set via giveValues()),
	    not an argument of this function.

	    Parameters:
	    - seqLen: Not used by this function; kept so existing callers keep working.
	    - seq_unce: Value stored as-is in the "Sequence" column.

	    Returns:
	    - pd.DataFrame: A single row with the columns "Sequence", "net_charge",
	      "isoelectric point", 'alpha_helix', 'beta_sheet', 'turn',
	      "Molecular Weight", 'percent_A' ... 'percent_Y', "Hydrophobicity",
	      "Electrical Potential", "Abundance" and "Dispersion", in that order.

	    Raises:
	    - ValueError: If no sequence has been set (``seq`` is empty or None).
	    """
	    # Validate before any analysis runs. ``seq`` starts as "" (not None), so
	    # the emptiness test is what catches a missing giveValues() call.
	    if not seq:
	        raise ValueError("Protein sequence 'seq' is not provided. Use giveValues() to set the sequence.")

	    alpha_helix, beta_sheet, turn = structure(seq)
	    amino_acid_percent = getamino_acid(seq)
	    # Compute once; both output columns come from the same call.
	    abundance, dispersion = abundance_dispersion(seq)

	    features = {
	        "Sequence": seq_unce,
	        "net_charge": calculate_net_charge(seq),
	        "isoelectric point": IsoelectricPoint.IsoelectricPoint(seq).pi(),
	        'alpha_helix': alpha_helix,
	        'beta_sheet': beta_sheet,
	        'turn': turn,
	        "Molecular Weight": molWeight(seq),
	    }
	    # Look each percentage up by its letter so the columns do not depend on
	    # the iteration order of the mapping returned by getamino_acid().
	    for letter in "ACDEFGHIKLMNPQRSTVWY":
	        features['percent_' + letter] = amino_acid_percent[letter]
	    features["Hydrophobicity"] = hydrophobicityValues(seq)
	    features["Electrical Potential"] = calculate_electric_potential(seq)
	    features["Abundance"] = abundance
	    features["Dispersion"] = dispersion

	    # Wrap every value in a one-element list so the frame has exactly one
	    # row, whatever the value types are.
	    data = pd.DataFrame({column: [value] for column, value in features.items()})

	    return data

	# Example of how to use the functions:
	# giveValues("YOUR_PROTEIN_SEQUENCE_HERE")
	# data = returnValues(len("YOUR_PROTEIN_SEQUENCE_HERE"), "YOUR_PROTEIN_SEQUENCE_HERE")
	# print(data)