Spaces:
Runtime error
Runtime error
#==================================================================================================# | |
import pandas as pd | |
from sklearn.preprocessing import LabelEncoder | |
from Bio.SeqUtils import ProtParam | |
import streamlit as st | |
from Bio.SeqUtils import IsoelectricPoint | |
# Module-level protein sequence shared by the functions below. It starts empty; call giveValues() with a valid protein sequence before calling returnValues().
seq = "" | |
def giveValues(seq1):
    """
    Store a protein sequence in the module-level ``seq`` variable.
    Parameters:
    - seq1 (str): Protein sequence to remember for later calls.
    Returns:
    - None
    """
    # Rebind the module-level name so later calls that read ``seq`` see it.
    globals()["seq"] = seq1
def structure(seq):
    """
    Return the three secondary-structure fractions reported by Biopython.
    Parameters:
    - seq (str): Protein sequence.
    Returns:
    - tuple: The three values from
      ProteinAnalysis.secondary_structure_fraction(), in library order.
    """
    analysis = ProtParam.ProteinAnalysis(seq)
    # NOTE(review): Biopython documents this tuple as (helix, turn, sheet),
    # while callers in this file label it (alpha_helix, beta_sheet, turn).
    # Confirm the intended labelling before changing the order here.
    first_fraction, second_fraction, third_fraction = (
        analysis.secondary_structure_fraction()
    )
    return first_fraction, second_fraction, third_fraction
def calculate_net_charge(protein_sequence):
    """
    Count the net charge of a protein sequence from its charged residues.
    Parameters:
    - protein_sequence (str): Protein sequence (one-letter codes).
    Returns:
    - int: +1 for every R or K, -1 for every D or E; any other symbol
      (including unknown or lowercase letters) contributes 0.
    """
    positively_charged = frozenset("RK")
    negatively_charged = frozenset("DE")

    total = 0
    for residue in protein_sequence:
        if residue in positively_charged:
            total += 1
        elif residue in negatively_charged:
            total -= 1
    return total
def calculate_electric_potential(protein_sequence, pH=7.0):
    """
    Sum a per-residue pH/pKa term over a protein sequence.
    Parameters:
    - protein_sequence (str): Protein sequence (one-letter codes).
    - pH (float): pH at which the term is evaluated (default 7.0).
    Returns:
    - float: Sum over residues of 10**(pH - pKa) / (1 + 10**(pH - pKa)),
      using the pKa table below; symbols missing from the table add 0.
    """
    pka_by_residue = {
        'A': 2.35, 'R': 12.0, 'N': 7.0, 'D': 3.9, 'C': 8.3,
        'Q': 9.1, 'E': 4.2, 'G': 2.3, 'H': 6.0, 'I': 6.0,
        'L': 6.0, 'K': 10.5, 'M': 5.7, 'F': 5.5, 'P': 6.3,
        'S': 2.2, 'T': 2.6, 'W': 5.9, 'Y': 5.7, 'V': 6.0,
    }

    # Accumulate left to right in a plain loop so the floating-point result
    # matches a naive running sum exactly.
    total = 0.0
    for residue in protein_sequence:
        pka = pka_by_residue.get(residue)
        if pka is None:
            continue  # unknown symbols contribute nothing
        ratio = 10 ** (pH - pka)
        total += ratio / (1 + ratio)
    return total
def molWeight(seq):
    """
    Return the molecular weight Biopython computes for a protein sequence.
    Parameters:
    - seq (str): Protein sequence.
    Returns:
    - float: Result of ProteinAnalysis(seq).molecular_weight().
    """
    analysis = ProtParam.ProteinAnalysis(seq)
    return analysis.molecular_weight()
def hydrophobicityValues(seq):
    """
    Sum per-residue hydrophobicity values over a protein sequence.
    Parameters:
    - seq (str): Protein sequence (uppercase one-letter codes).
    Returns:
    - float: Total of the table values for every residue (0 for an empty
      sequence). This is a sum, not an average.
    Raises:
    - KeyError: If the sequence contains a symbol missing from the table.
    """
    # The values appear to match the Kyte-Doolittle hydropathy scale.
    scale = {
        'A': 1.8, 'R': -4.5, 'N': -3.5, 'D': -3.5, 'C': 2.5,
        'Q': -3.5, 'E': -3.5, 'G': -0.4, 'H': -3.2, 'I': 4.5,
        'L': 3.8, 'K': -3.9, 'M': 1.9, 'F': 2.8, 'P': -1.6,
        'S': -0.8, 'T': -0.7, 'W': -0.9, 'Y': -1.3, 'V': 4.2,
    }
    return sum(map(scale.__getitem__, seq))
def helix_propensity_score(sequence):
    """
    Sum per-residue helix propensity values over a protein sequence.
    Parameters:
    - sequence (str): Protein sequence (uppercase one-letter codes).
    Returns:
    - float: Total of the table values for every residue (0 for an empty
      sequence).
    Raises:
    - KeyError: If the sequence contains a symbol missing from the table.
    """
    propensity_by_residue = {
        'A': 1.2, 'C': 0.8, 'E': 0.7, 'G': 0.9, 'H': 1.0,
        'I': 1.1, 'K': 1.3, 'L': 1.0, 'M': 1.2, 'N': 0.9,
        'Q': 1.0, 'R': 1.2, 'S': 0.8, 'T': 0.9, 'V': 1.1,
        'W': 1.3, 'Y': 1.1, 'P': 0.7, 'F': 1.1, 'D': 0.7,
    }
    # The local total has its own name so it does not shadow this function.
    total = sum(map(propensity_by_residue.__getitem__, sequence))
    return total
def abundance_dispersion(sequence):
    """
    Count the tracked residues (E, F, Y, W) and measure how spread out they are.
    Parameters:
    - sequence (str): Protein sequence.
    Returns:
    - tuple: (abundance, dispersion). Abundance is the number of tracked
      residues. Dispersion is the sample standard deviation (n - 1
      denominator) of their 0-based positions, or 0 when fewer than two
      tracked residues are present.
    """
    tracked_residues = ['E', 'F', 'Y', 'W']
    hit_positions = [
        index for index, residue in enumerate(sequence)
        if residue in tracked_residues
    ]
    hit_count = len(hit_positions)

    # A spread is undefined for zero or one occurrence; report 0.
    if hit_count <= 1:
        return hit_count, 0

    centre = sum(hit_positions) / hit_count
    squared_deviation = sum((index - centre) ** 2 for index in hit_positions)
    variance = squared_deviation / (hit_count - 1)
    return hit_count, variance ** 0.5
def BSpropensity_score(sequence):
    """
    Sum per-residue beta strand propensity values over a protein sequence.
    Parameters:
    - sequence (str): Protein sequence (uppercase one-letter codes).
    Returns:
    - float: Total of the table values for every residue (0 for an empty
      sequence).
    Raises:
    - KeyError: If the sequence contains a symbol missing from the table.
    """
    strand_propensity = {
        'A': 0.7, 'C': 1.0, 'E': 1.1, 'G': 0.5, 'H': 0.8,
        'I': 0.6, 'K': 1.2, 'L': 0.7, 'M': 0.5, 'N': 1.1,
        'Q': 0.9, 'R': 0.8, 'S': 0.6, 'T': 0.7, 'V': 0.8,
        'W': 1.3, 'Y': 1.2, 'P': 1.0, 'F': 1.1, 'D': 0.9,
    }
    return sum(map(strand_propensity.__getitem__, sequence))
def getamino_acid(seq):
    """
    Return Biopython's per-amino-acid composition for a protein sequence.
    Parameters:
    - seq (str): Protein sequence.
    Returns:
    - dict: Result of ProteinAnalysis(seq).get_amino_acids_percent().
    """
    # NOTE(review): newer Biopython releases may deprecate this method in
    # favour of an attribute; confirm against the pinned version.
    analysis = ProtParam.ProteinAnalysis(seq)
    return analysis.get_amino_acids_percent()
def returnValues(seqLen, seq_unce):
    """
    Build a one-row DataFrame of features for the stored protein sequence.

    The sequence analysed is the module-level ``seq`` (set it with
    giveValues() first), not one of the arguments.

    Parameters:
    - seqLen: Currently unused; kept so existing callers keep working.
    - seq_unce: Value stored verbatim in the "Sequence" column.
    Returns:
    - pd.DataFrame: Single-row frame with the columns, in order: Sequence,
      net_charge, isoelectric point, alpha_helix, beta_sheet, turn,
      Molecular Weight, percent_A ... percent_Y, Hydrophobicity,
      Electrical Potential, Abundance, Dispersion.
    Raises:
    - ValueError: If no sequence has been stored (``seq`` is None or empty).
    """
    # Validate before any computation. The module default is "", so an
    # empty sequence also means "not provided"; failing here gives callers
    # the intended message instead of an error from deep inside a helper.
    if not seq:
        raise ValueError("Protein sequence 'seq' is not provided. Use giveValues() to set the sequence.")

    # NOTE(review): the helix/sheet/turn labels follow the order returned
    # by structure(); confirm it matches Biopython's documented tuple order.
    alpha_helix, beta_sheet, turn = structure(seq)

    # Look percentages up by residue letter rather than by position so the
    # column labels cannot drift if the mapping's ordering ever changes.
    percent_by_residue = getamino_acid(seq)

    # Compute once; both output columns come from the same call.
    abundance, dispersion = abundance_dispersion(seq)

    # Insertion order defines the DataFrame column order; keep it stable.
    features = {
        "Sequence": [seq_unce],
        # "Sequence Length": [seqLen],
        "net_charge": calculate_net_charge(seq),
        "isoelectric point": [IsoelectricPoint.IsoelectricPoint(seq).pi()],
        'alpha_helix': [alpha_helix],
        'beta_sheet': [beta_sheet],
        'turn': [turn],
        "Molecular Weight": molWeight(seq),
    }
    for residue in "ACDEFGHIKLMNPQRSTVWY":
        features[f"percent_{residue}"] = [percent_by_residue[residue]]
    features["Hydrophobicity"] = hydrophobicityValues(seq)
    features["Electrical Potential"] = calculate_electric_potential(seq)
    features["Abundance"] = abundance
    features["Dispersion"] = dispersion
    # "Helix Propensity Score" and "Beta strand propensity values" are
    # intentionally not emitted (disabled in the original feature set).

    return pd.DataFrame(features)
# Example of how to use the functions:
# sequence = "YOUR_PROTEIN_SEQUENCE_HERE"
# giveValues(sequence)
# data = returnValues(len(sequence), sequence)
# print(data)