#==================================================================================================#
"""Sequence-derived feature extraction for a single protein sequence.

The module keeps the current sequence in the module-level variable ``seq``
(set through :func:`giveValues`) and exposes small helpers that each compute
one feature from a sequence string.  :func:`returnValues` gathers the features
into a one-row ``pandas.DataFrame``.
"""
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from Bio.SeqUtils import ProtParam
import streamlit as st
from Bio.SeqUtils import IsoelectricPoint

# Sequence used by returnValues(). It starts out empty and must be set with
# giveValues() before returnValues() is called.
seq = ""

# One-letter codes in the order used for the ``percent_<letter>`` columns.
_AMINO_ACID_ORDER = "ACDEFGHIKLMNPQRSTVWY"

# Integer side-chain charge per residue: R/K = +1, D/E = -1, everything else 0.
_AMINO_ACID_CHARGES = {
    'A': 0, 'R': 1, 'N': 0, 'D': -1, 'C': 0,
    'Q': 0, 'E': -1, 'G': 0, 'H': 0, 'I': 0,
    'L': 0, 'K': 1, 'M': 0, 'F': 0, 'P': 0,
    'S': 0, 'T': 0, 'W': 0, 'Y': 0, 'V': 0,
}

# Per-residue pKa table used by calculate_electric_potential().
# NOTE(review): every residue is given a pKa here, including residues whose
# side chains are not ionizable, so the derived value is a heuristic feature
# rather than a physical quantity -- confirm the intended source of the table.
_AMINO_ACID_PKA = {
    'A': 2.35, 'R': 12.0, 'N': 7.0, 'D': 3.9, 'C': 8.3,
    'Q': 9.1, 'E': 4.2, 'G': 2.3, 'H': 6.0, 'I': 6.0,
    'L': 6.0, 'K': 10.5, 'M': 5.7, 'F': 5.5, 'P': 6.3,
    'S': 2.2, 'T': 2.6, 'W': 5.9, 'Y': 5.7, 'V': 6.0,
}

# Per-residue hydrophobicity values (they appear to match the Kyte-Doolittle
# hydropathy scale).
_HYDROPHOBICITY = {
    'A': 1.8, 'R': -4.5, 'N': -3.5, 'D': -3.5, 'C': 2.5,
    'Q': -3.5, 'E': -3.5, 'G': -0.4, 'H': -3.2, 'I': 4.5,
    'L': 3.8, 'K': -3.9, 'M': 1.9, 'F': 2.8, 'P': -1.6,
    'S': -0.8, 'T': -0.7, 'W': -0.9, 'Y': -1.3, 'V': 4.2,
}

# Per-residue helix propensity values used by helix_propensity_score().
_HELIX_PROPENSITY = {
    'A': 1.2, 'C': 0.8, 'E': 0.7, 'G': 0.9, 'H': 1.0,
    'I': 1.1, 'K': 1.3, 'L': 1.0, 'M': 1.2, 'N': 0.9,
    'Q': 1.0, 'R': 1.2, 'S': 0.8, 'T': 0.9, 'V': 1.1,
    'W': 1.3, 'Y': 1.1, 'P': 0.7, 'F': 1.1, 'D': 0.7,
}

# Per-residue beta strand propensity values used by BSpropensity_score().
_BETA_STRAND_PROPENSITY = {
    'A': 0.7, 'C': 1.0, 'E': 1.1, 'G': 0.5, 'H': 0.8,
    'I': 0.6, 'K': 1.2, 'L': 0.7, 'M': 0.5, 'N': 1.1,
    'Q': 0.9, 'R': 0.8, 'S': 0.6, 'T': 0.7, 'V': 0.8,
    'W': 1.3, 'Y': 1.2, 'P': 1.0, 'F': 1.1, 'D': 0.9,
}

# Residues that abundance_dispersion() counts as "beta strand amino acids".
_BETA_STRAND_AMINO_ACIDS = ('E', 'F', 'Y', 'W')


def giveValues(seq1):
    """Store ``seq1`` as the module-level sequence used by returnValues().

    Parameters:
    - seq1 (str): Protein sequence.
    """
    global seq
    seq = seq1


def structure(seq):
    """Return the secondary structure fractions of a protein sequence.

    Parameters:
    - seq (str): Protein sequence.

    Returns:
    - tuple: ``(alpha_helix, beta_sheet, turn)`` fractions.
    """
    # Biopython documents the return order as (Helix, Turn, Sheet).  The
    # previous code unpacked it as (helix, sheet, turn), which swapped the
    # last two values under their labels.
    # NOTE(review): anything fitted on the old (swapped) output needs to be
    # regenerated or have its 'beta_sheet' / 'turn' columns swapped.
    alpha_helix, turn, beta_sheet = (
        ProtParam.ProteinAnalysis(seq).secondary_structure_fraction()
    )
    return alpha_helix, beta_sheet, turn


def calculate_net_charge(protein_sequence):
    """Return the sum of integer residue charges of a protein sequence.

    R and K count as +1, D and E as -1; every other character (including
    characters missing from the table) counts as 0.

    Parameters:
    - protein_sequence (str): Protein sequence.

    Returns:
    - int: Net charge.
    """
    return sum(_AMINO_ACID_CHARGES.get(aa, 0) for aa in protein_sequence)


def calculate_electric_potential(protein_sequence, pH=7.0):
    """Return the pKa-based "electric potential" feature of a sequence.

    For every residue found in the pKa table the term
    ``10**(pH - pKa) / (1 + 10**(pH - pKa))`` is added to the total; residues
    missing from the table contribute 0.

    Parameters:
    - protein_sequence (str): Protein sequence.
    - pH (float): pH at which the terms are evaluated (default 7.0).

    Returns:
    - float: Sum of the per-residue terms.
    """
    # A plain accumulation loop is kept on purpose so the floating-point
    # summation order is exactly one addition per residue.
    electric_potential = 0.0
    for aa in protein_sequence:
        pKa = _AMINO_ACID_PKA.get(aa)
        if pKa is None:
            continue
        ratio = 10 ** (pH - pKa)
        electric_potential += ratio / (1 + ratio)
    return electric_potential


def molWeight(seq):
    """
    Calculate the molecular weight of a protein sequence.

    Parameters:
    - seq (str): Protein sequence.

    Returns:
    - float: Molecular weight.
    """
    return ProtParam.ProteinAnalysis(seq).molecular_weight()


def hydrophobicityValues(seq):
    """
    Calculate the hydrophobicity index of a protein sequence.

    The result is the plain sum of the per-residue values; it is not
    normalized by sequence length.

    Parameters:
    - seq (str): Protein sequence.

    Returns:
    - float: Hydrophobicity index.

    Raises:
    - KeyError: If the sequence contains a character missing from the table.
    """
    return sum(_HYDROPHOBICITY[aa] for aa in seq)


def helix_propensity_score(sequence):
    """
    Calculate the helix propensity score of a protein sequence.

    Parameters:
    - sequence (str): Protein sequence.

    Returns:
    - float: Helix propensity score (sum of the per-residue values).

    Raises:
    - KeyError: If the sequence contains a character missing from the table.
    """
    return sum(_HELIX_PROPENSITY[aa] for aa in sequence)


def abundance_dispersion(sequence):
    """
    Calculate the abundance and dispersion of beta strand amino acids in a protein sequence.

    Abundance is the number of E, F, Y and W residues.  Dispersion is the
    sample standard deviation (n - 1 denominator) of their 0-based positions,
    or 0 when fewer than two such residues are present.

    Parameters:
    - sequence (str): Protein sequence.

    Returns:
    - tuple: Abundance and dispersion.
    """
    positions = [
        index for index, aa in enumerate(sequence)
        if aa in _BETA_STRAND_AMINO_ACIDS
    ]
    abundance = len(positions)

    # A spread is undefined for fewer than two positions.
    if abundance <= 1:
        return abundance, 0

    mean_position = sum(positions) / abundance
    variance = sum((pos - mean_position) ** 2 for pos in positions) / (abundance - 1)
    return abundance, variance ** 0.5


def BSpropensity_score(sequence):
    """
    Calculate the beta strand propensity score of a protein sequence.

    Parameters:
    - sequence (str): Protein sequence.

    Returns:
    - float: Beta strand propensity score (sum of the per-residue values).

    Raises:
    - KeyError: If the sequence contains a character missing from the table.
    """
    return sum(_BETA_STRAND_PROPENSITY[aa] for aa in sequence)


def getamino_acid(seq):
    """Return the per-residue composition reported by Biopython.

    Parameters:
    - seq (str): Protein sequence.

    Returns:
    - dict: Mapping of one-letter amino acid code to its share of the sequence.
    """
    return ProtParam.ProteinAnalysis(seq).get_amino_acids_percent()


def returnValues(seqLen, seq_unce):
    """
    Return labeled data based on protein sequence features.

    The features are computed from the module-level ``seq`` set through
    giveValues(), not from the arguments.

    Parameters:
    - seqLen: Currently unused; the "Sequence Length" column is disabled.
    - seq_unce: Value stored verbatim in the "Sequence" column.

    Returns:
    - pd.DataFrame: Labeled data (a single row).

    Raises:
    - ValueError: If no sequence has been set (``seq`` is empty or None).
    """
    # Validate before touching Biopython: an empty sequence would otherwise
    # fail inside the composition calculation instead of raising this error.
    if not seq:
        raise ValueError("Protein sequence 'seq' is not provided. Use giveValues() to set the sequence.")

    alpha_helix, beta_sheet, turn = structure(seq)
    percents = getamino_acid(seq)

    row = {
        "Sequence": seq_unce,
        # "Sequence Length": seqLen,
        "net_charge": calculate_net_charge(seq),
        "isoelectric point": IsoelectricPoint.IsoelectricPoint(seq).pi(),
        'alpha_helix': alpha_helix,
        'beta_sheet': beta_sheet,
        'turn': turn,
        "Molecular Weight": molWeight(seq),
    }

    # Look each share up by residue letter instead of relying on the
    # iteration order of the dictionary returned by Biopython.
    for aa in _AMINO_ACID_ORDER:
        row[f"percent_{aa}"] = percents[aa]

    row["Hydrophobicity"] = hydrophobicityValues(seq)
    row["Electrical Potential"] = calculate_electric_potential(seq)

    # Computed once; both columns come from the same call.
    abundance, dispersion = abundance_dispersion(seq)
    row["Abundance"] = abundance
    row["Dispersion"] = dispersion
    # "Helix Propensity Score": helix_propensity_score(seq),
    # "Beta strand propensity values": BSpropensity_score(seq),

    return pd.DataFrame({column: [value] for column, value in row.items()})


# Example of how to use the functions:
# giveValues("YOUR_PROTEIN_SEQUENCE_HERE")
# data = returnValues(seqLen, seq_unce)  # both arguments are required
# print(data)