#==================================================================================================#

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from Bio.SeqUtils import ProtParam
import streamlit as st
from Bio.SeqUtils import IsoelectricPoint
# Module-level sequence read by returnValues(). It starts out empty and is
# expected to be replaced through giveValues() before features are computed.
seq = ""
def giveValues(seq1):
    """
    Store a value in the module-level ``seq`` variable.

    Parameters:
    - seq1: Value to store; presumably a protein sequence string
      (no validation is performed here).

    Returns:
    - None
    """
    # Rebind the module global so that later calls see the new sequence.
    global seq
    seq = seq1

def structure(seq):
    """
    Estimate the secondary-structure fractions of a protein sequence.

    Parameters:
    - seq (str): Protein sequence.

    Returns:
    - tuple: (alpha_helix, beta_sheet, turn) fractions, in that order.
    """
    # Biopython's secondary_structure_fraction() returns its values in the
    # order (helix, turn, sheet). Unpack in that order and reorder on return,
    # so the value handed back as "beta_sheet" really is the sheet fraction.
    # NOTE(review): previously the turn and sheet fractions were swapped; any
    # downstream data generated with the old ordering should be regenerated.
    alpha_helix, turn, beta_sheet = ProtParam.ProteinAnalysis(seq).secondary_structure_fraction()
    return alpha_helix, beta_sheet, turn

def calculate_net_charge(protein_sequence):
    """
    Count the net formal charge of a protein sequence.

    R and K each contribute +1, D and E each contribute -1; every other
    symbol (including unknown ones) contributes 0.

    Parameters:
    - protein_sequence (str): Protein sequence (one-letter codes).

    Returns:
    - int: Number of positive residues minus number of negative residues.
    """
    positive_residues = frozenset("RK")
    negative_residues = frozenset("DE")

    balance = 0
    for residue in protein_sequence:
        if residue in positive_residues:
            balance += 1
        elif residue in negative_residues:
            balance -= 1
    return balance

def calculate_electric_potential(protein_sequence, pH=7.0):
    """
    Sum a pH-dependent term over the residues of a protein sequence.

    Each residue found in the pKa table contributes
    ``10**(pH - pKa) / (1 + 10**(pH - pKa))``; residues that are not in the
    table contribute nothing.

    Parameters:
    - protein_sequence (str): Protein sequence (one-letter codes).
    - pH (float): pH at which the terms are evaluated (default 7.0).

    Returns:
    - float: Sum of the per-residue terms (0.0 for an empty sequence).
    """
    # Per-residue pKa values used by the formula above.
    pka_by_residue = {
        'A': 2.35, 'C': 8.3, 'D': 3.9, 'E': 4.2,
        'F': 5.5, 'G': 2.3, 'H': 6.0, 'I': 6.0,
        'K': 10.5, 'L': 6.0, 'M': 5.7, 'N': 7.0,
        'P': 6.3, 'Q': 9.1, 'R': 12.0, 'S': 2.2,
        'T': 2.6, 'V': 6.0, 'W': 5.9, 'Y': 5.7,
    }

    # Accumulate with a plain running total so the floating-point result is
    # built up residue by residue, in sequence order.
    total = 0.0
    for residue in protein_sequence:
        pka = pka_by_residue.get(residue)
        if pka is None:
            continue
        ratio = 10 ** (pH - pka)
        total += ratio / (1 + ratio)
    return total

def molWeight(seq):
    """
    Return the molecular weight of a protein sequence.

    The value is computed by Biopython's ``ProteinAnalysis.molecular_weight``.

    Parameters:
    - seq (str): Protein sequence.

    Returns:
    - float: Molecular weight as reported by Biopython.
    """
    analysis = ProtParam.ProteinAnalysis(seq)
    return analysis.molecular_weight()

def hydrophobicityValues(seq):
    """
    Sum the per-residue hydrophobicity values of a protein sequence.

    The result is a total over all residues, not an average. A residue that
    is missing from the table raises ``KeyError``.

    Parameters:
    - seq (str): Protein sequence (upper-case one-letter codes).

    Returns:
    - float: Summed hydrophobicity (0 for an empty sequence).
    """
    # Per-residue values; these match the Kyte-Doolittle hydropathy scale.
    scale = {
        'A': 1.8, 'C': 2.5, 'D': -3.5, 'E': -3.5,
        'F': 2.8, 'G': -0.4, 'H': -3.2, 'I': 4.5,
        'K': -3.9, 'L': 3.8, 'M': 1.9, 'N': -3.5,
        'P': -1.6, 'Q': -3.5, 'R': -4.5, 'S': -0.8,
        'T': -0.7, 'V': 4.2, 'W': -0.9, 'Y': -1.3,
    }
    per_residue = (scale[residue] for residue in seq)
    return sum(per_residue)


def helix_propensity_score(sequence):
    """
    Sum the per-residue helix propensity weights of a protein sequence.

    A residue that is missing from the weight table raises ``KeyError``.

    Parameters:
    - sequence (str): Protein sequence (upper-case one-letter codes).

    Returns:
    - float: Summed helix propensity (0 for an empty sequence).
    """
    weights = {
        'A': 1.2, 'C': 0.8, 'D': 0.7, 'E': 0.7,
        'F': 1.1, 'G': 0.9, 'H': 1.0, 'I': 1.1,
        'K': 1.3, 'L': 1.0, 'M': 1.2, 'N': 0.9,
        'P': 0.7, 'Q': 1.0, 'R': 1.2, 'S': 0.8,
        'T': 0.9, 'V': 1.1, 'W': 1.3, 'Y': 1.1,
    }
    per_residue = (weights[residue] for residue in sequence)
    return sum(per_residue)

def abundance_dispersion(sequence):
    """
    Measure how many E/F/Y/W residues a sequence has and how spread out they are.

    Abundance is the number of E, F, Y and W residues (treated here as the
    beta-strand residues). Dispersion is the sample standard deviation
    (n - 1 denominator) of their 0-based positions; it is 0 when fewer than
    two such residues are present.

    Parameters:
    - sequence (str): Protein sequence.

    Returns:
    - tuple: (abundance, dispersion).
    """
    tracked = ('E', 'F', 'Y', 'W')
    hits = [index for index, residue in enumerate(sequence) if residue in tracked]
    count = len(hits)

    # A spread needs at least two positions.
    if count <= 1:
        return count, 0

    centre = sum(hits) / count
    squared_deviation = sum((index - centre) ** 2 for index in hits)
    return count, (squared_deviation / (count - 1)) ** 0.5

def BSpropensity_score(sequence):
    """
    Sum the per-residue beta-strand propensity weights of a protein sequence.

    A residue that is missing from the weight table raises ``KeyError``.

    Parameters:
    - sequence (str): Protein sequence (upper-case one-letter codes).

    Returns:
    - float: Summed beta-strand propensity (0 for an empty sequence).
    """
    weights = {
        'A': 0.7, 'C': 1.0, 'D': 0.9, 'E': 1.1,
        'F': 1.1, 'G': 0.5, 'H': 0.8, 'I': 0.6,
        'K': 1.2, 'L': 0.7, 'M': 0.5, 'N': 1.1,
        'P': 1.0, 'Q': 0.9, 'R': 0.8, 'S': 0.6,
        'T': 0.7, 'V': 0.8, 'W': 1.3, 'Y': 1.2,
    }
    per_residue = (weights[residue] for residue in sequence)
    return sum(per_residue)

def getamino_acid(seq):
    """
    Return the amino-acid composition of a protein sequence.

    The value is whatever Biopython's
    ``ProteinAnalysis.get_amino_acids_percent`` returns for ``seq``
    (a mapping keyed by one-letter amino-acid code).

    Parameters:
    - seq (str): Protein sequence.

    Returns:
    - dict: Per-residue composition values.
    """
    analysis = ProtParam.ProteinAnalysis(seq)
    return analysis.get_amino_acids_percent()


def returnValues(seqLen, seq_unce):
    """
    Build a one-row feature table for the module-level protein sequence.

    The sequence that is analysed is the global ``seq`` (set through
    giveValues()); ``seq_unce`` is only copied into the "Sequence" column.

    Parameters:
    - seqLen: Currently unused; kept so existing callers keep working.
    - seq_unce: Value stored verbatim in the "Sequence" column.

    Returns:
    - pd.DataFrame: Single-row frame with the columns "Sequence",
      "net_charge", "isoelectric point", "alpha_helix", "beta_sheet", "turn",
      "Molecular Weight", "percent_A" ... "percent_Y", "Hydrophobicity",
      "Electrical Potential", "Abundance" and "Dispersion".

    Raises:
    - ValueError: If no sequence has been set (``seq`` is None or empty).
    """
    # Validate before doing any work: the module default is an empty string,
    # and the analysis helpers fail with unrelated errors on empty/None input.
    if not seq:
        raise ValueError("Protein sequence 'seq' is not provided. Use giveValues() to set the sequence.")

    alpha_helix, beta_sheet, turn = structure(seq)
    composition = getamino_acid(seq)
    # Compute once; both output columns come from the same call.
    abundance, dispersion = abundance_dispersion(seq)

    columns = {
        "Sequence": [seq_unce],
        "net_charge": [calculate_net_charge(seq)],
        "isoelectric point": [IsoelectricPoint.IsoelectricPoint(seq).pi()],
        'alpha_helix': [alpha_helix],
        'beta_sheet': [beta_sheet],
        'turn': [turn],
        "Molecular Weight": [molWeight(seq)],
    }

    # Look each residue up by key instead of relying on the iteration order
    # of the composition mapping; the column order stays alphabetical.
    for letter in "ACDEFGHIKLMNPQRSTVWY":
        columns["percent_" + letter] = [composition[letter]]

    columns["Hydrophobicity"] = [hydrophobicityValues(seq)]
    columns["Electrical Potential"] = [calculate_electric_potential(seq)]
    columns["Abundance"] = [abundance]
    columns["Dispersion"] = [dispersion]
    # NOTE: "Sequence Length", "Helix Propensity Score" and
    # "Beta strand propensity values" are not emitted as columns.

    return pd.DataFrame(columns)

# Example of how to use the functions:
# giveValues("YOUR_PROTEIN_SEQUENCE_HERE")
# data = returnValues(seqLen, seq_unce)  # both arguments are required; seq_unce fills the "Sequence" column
# print(data)