PiKaHa committed
Commit 2a2a7a4 · 1 Parent(s): d61adc7

Add saved models and requirements

KC.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9985473ee19c6be7ba5777b12bc1babe60746298345e64e0426ab54f8e8e92d0
+ size 188721
KC/saved_model.pb ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3c35ad708d8b793c7d4b9867ee4225445cd9c2648a3589067a7814da7a660f28
+ size 1209629
KC/variables/variables.data-00000-of-00001 ADDED
Binary file (676 Bytes).
 
KC/variables/variables.index ADDED
Binary file (387 Bytes).
 
Specificity.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1c7508f5fec50afa81a43d85917b947b367f20da122c6fc9635d6cd202bc0b51
+ size 271953
Specificity/saved_model.pb ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8bef26d451689448d1469d1865bf91b9f342c9258d059dbc8deff006690a3189
+ size 2207910
Specificity/variables/variables.data-00000-of-00001 ADDED
Binary file (676 Bytes).
 
Specificity/variables/variables.index ADDED
Binary file (387 Bytes).
 
app.py ADDED
@@ -0,0 +1,171 @@
+ import gradio as gr
+ import joblib
+ from concurrent.futures import ThreadPoolExecutor
+ from transformers import AutoTokenizer, AutoModel, EsmModel
+ import torch
+ import numpy as np
+ import random
+ import tensorflow as tf
+ import os
+ from keras.layers import TFSMLayer
+ import pandas as pd
+
+ base_dir = "."
+
+ # Set random seed
+ SEED = 42
+ np.random.seed(SEED)
+ random.seed(SEED)
+ torch.manual_seed(SEED)
+ if torch.cuda.is_available():
+     torch.cuda.manual_seed(SEED)
+     torch.cuda.manual_seed_all(SEED)
+
+ # Ensure deterministic behavior
+ torch.backends.cudnn.deterministic = True
+ torch.backends.cudnn.benchmark = False
+
+
+ def load_model(model_path):
+     print(f"Loading model from {model_path}...")
+     return tf.saved_model.load(model_path)
+
+
+ print("Loading models...")
+ plant_models = {
+     "Specificity": {"model": joblib.load("Specificity.pkl"), "esm_model": "facebook/esm1b_t33_650M_UR50S", "layer": 6},
+     "kcatC": {"model": joblib.load("kcatC.pkl"), "esm_model": "facebook/esm2_t36_3B_UR50D", "layer": 11},
+     "KC": {"model": joblib.load("KC.pkl"), "esm_model": "facebook/esm1b_t33_650M_UR50S", "layer": 4},
+ }
+
+ general_models = {
+     "Specificity": {"model": load_model("Specificity"), "esm_model": "facebook/esm2_t33_650M_UR50D", "layer": 33},
+     "kcatC": {"model": load_model("kcatC"), "esm_model": "facebook/esm2_t12_35M_UR50D", "layer": 7},
+     "KC": {"model": load_model("KC"), "esm_model": "facebook/esm2_t30_150M_UR50D", "layer": 26},
+ }
+
+
+ # Function to generate embeddings
+ def get_embedding(sequence, esm_model_name, layer):
+     print(f"Generating embeddings using {esm_model_name}, Layer {layer}...")
+     tokenizer = AutoTokenizer.from_pretrained(esm_model_name)
+     model = EsmModel.from_pretrained(esm_model_name, output_hidden_states=True)
+
+     # Tokenize the sequence
+     inputs = tokenizer(sequence, return_tensors="pt", truncation=True, max_length=1024)
+
+     # Generate embeddings
+     with torch.no_grad():
+         outputs = model(**inputs)
+         hidden_states = outputs.hidden_states  # Retrieve all hidden states
+         embedding = hidden_states[layer].mean(dim=1).numpy()  # Average pooling over the sequence dimension
+
+     # Convert to DataFrame with named columns
+     feature_columns = {f"D{i+1}": embedding[0, i] for i in range(embedding.shape[1])}
+     embedding_df = pd.DataFrame([feature_columns])
+     print(embedding_df)
+     return embedding_df.values, embedding_df
+
+
+ def predict_with_gpflow(model, X):
+     print(model.signatures)
+     # Convert input to a TensorFlow tensor
+     X_tensor = tf.convert_to_tensor(X, dtype=tf.float64)
+     print(X_tensor.shape)
+     # Get predictions
+     # predict_fn = model.predict_f_compiled
+     predict_fn = model.signatures["serving_default"]
+     result = predict_fn(Xnew=X_tensor)  # Pass Xnew explicitly
+     # mean, variance = predict_fn(Xnew=X_tensor)
+     mean = result["output_0"].numpy()  # Adjust output key names if needed
+     variance = result["output_1"].numpy()
+
+     # Return mean and variance as flat numpy arrays
+     # return mean.numpy().flatten(), variance.numpy().flatten()
+     return mean.flatten(), variance.flatten()
+
+
+
+ def process_target(target, selected_models, sequence, prediction_type):
+     """
+     Process a single target for prediction using transformer embeddings and the specified model.
+     """
+     # Get model and embedding details
+     esm_model_name = selected_models[target]["esm_model"]
+     layer = selected_models[target]["layer"]
+     model = selected_models[target]["model"]
+
+     # Generate embeddings in the required format
+     embedding, _ = get_embedding(sequence, esm_model_name, layer)
+     embedding = embedding.astype(np.float64)
+     np.save(f"hf_embedding_{target}.npy", embedding)
+     if prediction_type == "Plant-Specific":
+         # Random Forest prediction
+         y_pred = model.predict(embedding)[0]
+         return target, round(y_pred, 2)
+     else:
+         # GPflow prediction
+         print(esm_model_name)
+         print(layer)
+         print(model)
+         y_pred, y_uncertainty = predict_with_gpflow(model, embedding)
+         return target, round(y_pred[0], 2), round(y_uncertainty[0], 2)
+
+
+ def predict(sequence, prediction_type):
+     """
+     Predicts Specificity, kcatC, and KC for the given sequence and prediction type.
+     """
+     # Select the appropriate model set
+     selected_models = plant_models if prediction_type == "Plant-Specific" else general_models
+
+     # Predict for all targets in parallel
+     with ThreadPoolExecutor() as executor:
+         results = list(
+             executor.map(
+                 lambda target: process_target(target, selected_models, sequence, prediction_type),
+                 selected_models.keys()
+             )
+         )
+
+     # Format results
+     if prediction_type == "Plant-Specific":
+         formatted_results = [
+             ["Specificity", results[0][1]],
+             ["kcat\u1d9c", results[1][1]],
+             ["K\u1d9c", results[2][1]],
+         ]
+     else:
+         formatted_results = [
+             ["Specificity", results[0][1], results[0][2]],
+             ["kcat\u1d9c", results[1][1], results[1][2]],
+             ["K\u1d9c", results[2][1], results[2][2]],
+         ]
+
+     return formatted_results
+
+
+ # Define Gradio interface
+ print("Creating Gradio interface...")
+ interface = gr.Interface(
+     fn=predict,
+     inputs=[
+         gr.Textbox(label="Input Protein Sequence",
+             value="MSPQTETKASVGFKAGVKEYKLTYYTPEYETKDTDILAAFRVTPQPGVPPEEAGAAVAAESSTGTWTTVWTDGLTSLDRYKGRCYHIEPVPGEETQFIAYVAYPLDLFEEGSVTNMFTSIVGNVFGFKALAALRLEDLRIPPAYTKTFQGPPHGIQVERDKLNKYGRPLLGCTIKPKLGLSAKNYGRAVYECLRGGLDFTKDDENVNSQPFMRWRDRFLFCAEAIYKSQAETGEIKGHYLNATAGTCEEMIKRAVFARELGVPIVMHDYLTGGFTANTSLSHYCRDNGLLLHIHRAMHAVIDRQKNHGMHFRVLAKALRLSGGDHIHAGTVVGKLEGDRESTLGFVDLLRDDYVEKDRSRGIFFTQDWVSLPGVLPVASGGIHVWHMPALTEIFGDDSVLQFGGGTLGHPWGNAPGAVANRVALEACVQARNEGRDLAVEGNEIIREACKWSPELAAACEVWKEITFNFPTIDKLDGQE",
+             lines=10,
+         ),  # Input: text box for the protein sequence
+         gr.Radio(choices=["Plant-Specific", "General"], label="Prediction Type", value="Plant-Specific"),  # Radio buttons for the prediction type
+     ],
+     outputs=gr.Dataframe(
+         headers=["Target", "Prediction", "Uncertainty (for General)"],
+         type="array"
+     ),  # Output: table of predictions
+     title="Rubisco Kinetics Prediction",
+     description=(
+         "Enter a protein sequence to predict Rubisco kinetics properties (Specificity, kcat\u1d9c, and K\u1d9c). "
+         "Choose between 'Plant-Specific' (Random Forest) or 'General' (GPflow) predictions."
+     ),
+ )
+
+ if __name__ == "__main__":
+     interface.launch()
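
For context, once this app.py runs as a Space, the interface above could be queried programmatically. The sketch below uses gradio_client; the Space identifier and the input sequence are placeholders for illustration, not values taken from this commit.

from gradio_client import Client

# Connect to the running Space (identifier below is a hypothetical placeholder).
client = Client("PiKaHa/rubisco-kinetics")

# The interface takes a protein sequence and a prediction type and returns
# rows of [Target, Prediction] or [Target, Prediction, Uncertainty].
result = client.predict(
    "MSPQTETKASVG",     # placeholder sequence for illustration
    "Plant-Specific",   # or "General"
    api_name="/predict",
)
print(result)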
kcatC.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:33e1131e59808c8c23f910502730c40569f71946322fbc6f6c9f0236c11a8c6a
+ size 181809
kcatC/saved_model.pb ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1a3fedc1e7fc41a15fa41e5fa04efe015fcadd2395e1ac07da60fb5328a8a401
+ size 1003709
kcatC/variables/variables.data-00000-of-00001 ADDED
Binary file (676 Bytes).
 
kcatC/variables/variables.index ADDED
Binary file (387 Bytes).
 
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ transformers
+ torch
+ gradio
+ joblib
+ numpy
+ scikit-learn
+ gpflow